diff options
| -rw-r--r-- | README.rst | 35 | ||||
| -rw-r--r-- | crocoite/cli.py | 7 | ||||
| -rw-r--r-- | crocoite/controller.py | 5 | 
3 files changed, 44 insertions, 3 deletions
| @@ -17,10 +17,12 @@ The following dependencies must be present to run crocoite:  - pychrome_   - warcio_  - html5lib_ +- bottom_ (IRC client)  .. _pychrome: https://github.com/fate0/pychrome  .. _warcio: https://github.com/webrecorder/warcio  .. _html5lib: https://github.com/html5lib/html5lib-python +.. _bottom: https://github.com/numberoverzero/bottom  It is recommended to prepare a virtualenv and let pip handle the dependency  resolution for Python packages instead: @@ -119,6 +121,39 @@ does not work any more. Secondly it also saves a screenshot of the full page,  so even if future browsers cannot render and display the stored HTML a fully  rendered version of the website can be replayed instead. +Advanced usage +-------------- + +crocoite is built with the Unix philosophy (“do one thing and do it well”) in +mind. Thus ``crocoite-grab`` can only save a single page. If you want recursion +use ``crocoite-recursive``, which follows hyperlinks according to ``--policy``. +It can either recurse a maximum number of levels or grab all pages with the +same prefix as the start URL: + +.. code:: bash + +   crocoite-recursive --policy prefix http://www.example.com/dir/ output + +will save all pages in ``/dir/`` and below to individual files in the output +directory ``output``. You can customize the command used to grab individual +pages by appending it after ``output``. This way distributed grabs (ssh to a +different machine and execute the job there, queue the command with Slurm, …) +are possible. + +IRC bot +^^^^^^^ + +A simple IRC bot (“chromebot”) is provided with the command ``crocoite-irc``. +It reads its configuration from a config file like the example provided in +``contrib/chromebot.ini`` and supports the following commands: + +a <url> -j <concurrency> -r <policy> +    Archive <url> with <concurrency> processes according to recursion <policy> +s <uuid> +    Get job status for <uuid> +r <uuid> +    Revoke or abort running job with <uuid> +  Related projects  ---------------- diff --git a/crocoite/cli.py b/crocoite/cli.py index 8473a0d..63199c9 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -111,8 +111,13 @@ def irc ():      from configparser import ConfigParser      from .irc import Bot +    parser = argparse.ArgumentParser(description='IRC bot.') +    parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.ini') + +    args = parser.parse_args () +      config = ConfigParser () -    config.read ('chromebot.ini') +    config.read (args.config)      s = config['irc']      bot = Bot ( diff --git a/crocoite/controller.py b/crocoite/controller.py index 9c153b8..81f0638 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -312,8 +312,9 @@ class RecursiveController:      async def fetch (self, url):          """ -        Overrideable fetch action for URLs. Defaults to sequential -        SinglePageController. +        Fetch a single URL using an external command + +        command is usually crocoite-grab          """          def formatCommand (e): | 
