From 07c34b2d004f16798c17ed479679a511c6bd2f29 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 29 Sep 2018 16:51:57 +0200 Subject: Add documentation for -recursive and -irc --- README.rst | 35 +++++++++++++++++++++++++++++++++++ crocoite/cli.py | 7 ++++++- crocoite/controller.py | 5 +++-- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index b1fce2c..61c5b04 100644 --- a/README.rst +++ b/README.rst @@ -17,10 +17,12 @@ The following dependencies must be present to run crocoite: - pychrome_ - warcio_ - html5lib_ +- bottom_ (IRC client) .. _pychrome: https://github.com/fate0/pychrome .. _warcio: https://github.com/webrecorder/warcio .. _html5lib: https://github.com/html5lib/html5lib-python +.. _bottom: https://github.com/numberoverzero/bottom It is recommended to prepare a virtualenv and let pip handle the dependency resolution for Python packages instead: @@ -119,6 +121,39 @@ does not work any more. Secondly it also saves a screenshot of the full page, so even if future browsers cannot render and display the stored HTML a fully rendered version of the website can be replayed instead. +Advanced usage +-------------- + +crocoite is built with the Unix philosophy (“do one thing and do it well”) in +mind. Thus ``crocoite-grab`` can only save a single page. If you want recursion +use ``crocoite-recursive``, which follows hyperlinks according to ``--policy``. +It can either recurse a maximum number of levels or grab all pages with the +same prefix as the start URL: + +.. code:: bash + + crocoite-recursive --policy prefix http://www.example.com/dir/ output + +will save all pages in ``/dir/`` and below to individual files in the output +directory ``output``. You can customize the command used to grab individual +pages by appending it after ``output``. This way distributed grabs (ssh to a +different machine and execute the job there, queue the command with Slurm, …) +are possible. 
+ +IRC bot +^^^^^^^ + +A simple IRC bot (“chromebot”) is provided with the command ``crocoite-irc``. +It reads its configuration from a config file like the example provided in +``contrib/chromebot.ini`` and supports the following commands: + +a <url> -j <concurrency> -r <policy> + Archive <url> with <concurrency> processes according to recursion <policy> +s <uuid> + Get job status for <uuid> +r <uuid> + Revoke or abort running job with <uuid> + Related projects ---------------- diff --git a/crocoite/cli.py b/crocoite/cli.py index 8473a0d..63199c9 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -111,8 +111,13 @@ def irc (): from configparser import ConfigParser from .irc import Bot + parser = argparse.ArgumentParser(description='IRC bot.') + parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.ini') + + args = parser.parse_args () + config = ConfigParser () - config.read ('chromebot.ini') + config.read (args.config) s = config['irc'] bot = Bot ( diff --git a/crocoite/controller.py b/crocoite/controller.py index 9c153b8..81f0638 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -312,8 +312,9 @@ class RecursiveController: async def fetch (self, url): """ - Overrideable fetch action for URLs. Defaults to sequential - SinglePageController. + Fetch a single URL using an external command + + command is usually crocoite-grab """ def formatCommand (e): -- cgit v1.2.3