diff options
Diffstat (limited to 'doc')
-rw-r--r-- | doc/_ext/clicklist.py | 45 | ||||
-rw-r--r-- | doc/conf.py | 44 | ||||
-rw-r--r-- | doc/develop.rst | 39 | ||||
-rw-r--r-- | doc/index.rst | 36 | ||||
-rw-r--r-- | doc/plugins.rst | 16 | ||||
-rw-r--r-- | doc/rationale.rst | 76 | ||||
-rw-r--r-- | doc/related.rst | 14 | ||||
-rw-r--r-- | doc/usage.rst | 162 |
8 files changed, 432 insertions, 0 deletions
diff --git a/doc/_ext/clicklist.py b/doc/_ext/clicklist.py new file mode 100644 index 0000000..a69452c --- /dev/null +++ b/doc/_ext/clicklist.py @@ -0,0 +1,45 @@ +""" +Render click.yaml config file into human-readable list of supported sites +""" + +import pkg_resources, yaml +from docutils import nodes +from docutils.parsers.rst import Directive +from yarl import URL + +class ClickList (Directive): + def run(self): + # XXX: do this once only + fd = pkg_resources.resource_stream ('crocoite', 'data/click.yaml') + config = list (yaml.safe_load_all (fd)) + + l = nodes.definition_list () + for site in config: + urls = set () + v = nodes.definition () + vl = nodes.bullet_list () + v += vl + for s in site['selector']: + i = nodes.list_item () + i += nodes.paragraph (text=s['description']) + vl += i + urls.update (map (lambda x: URL(x).with_path ('/'), s.get ('urls', []))) + + item = nodes.definition_list_item () + term = ', '.join (map (lambda x: x.host, urls)) if urls else site['match'] + k = nodes.term (text=term) + item += k + + item += v + l += item + return [l] + +def setup(app): + app.add_directive ("clicklist", ClickList) + + return { + 'version': '0.1', + 'parallel_read_safe': True, + 'parallel_write_safe': True, + } + diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..8336c27 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +import os, sys + +# -- Project information ----------------------------------------------------- + +project = 'crocoite' +copyright = '2019 crocoite contributors' +author = 'crocoite contributors' + +# -- General configuration --------------------------------------------------- + +sys.path.append(os.path.abspath("./_ext")) +extensions = [ + 'sphinx.ext.viewcode', + 'sphinx.ext.autodoc', + 'clicklist', +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + +source_suffix = '.rst' +master_doc = 'index' +language = 'en' +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +pygments_style = 'tango' + +# -- Options for HTML output ------------------------------------------------- + +html_theme = 'alabaster' +html_theme_options = { + "description": "Preservation for the modern web", + "github_user": "PromyLOPh", + "github_repo": "crocoite", + "travis_button": True, + "github_button": True, + "codecov_button": True, + "fixed_sidebar": True, +} +#html_static_path = ['_static'] +html_sidebars = { + '**': ['about.html', 'navigation.html', 'searchbox.html'], +} + diff --git a/doc/develop.rst b/doc/develop.rst new file mode 100644 index 0000000..801ab21 --- /dev/null +++ b/doc/develop.rst @@ -0,0 +1,39 @@ +Development +----------- + +Generally crocoite provides reasonable defaults for Google Chrome via +:py:mod:`crocoite.devtools`. When debugging this software it might be necessary +to open a non-headless instance of the browser by running + +.. code:: bash + + google-chrome-stable --remote-debugging-port=9222 --auto-open-devtools-for-tabs + +and then passing the option :option:`--browser=http://localhost:9222` to +:program:`crocoite-single`. This allows human intervention through the +browser’s builtin console. + +Release guide +^^^^^^^^^^^^^ + +crocoite uses `semantic versioning`_. To create a new release, bump the version +number in ``setup.py`` according to the linked guide, create distribution +packages:: + + python setup.py sdist bdist_wheel + +Verify them:: + + twine check dist/* + +Try to install and use them in a separate sandbox. And finally sign and upload +a new version to pypi_:: + + gpg --detach-sign --armor dist/*.tar.gz + twine upload dist/* + +Then update the documentation using :program:`sphing-doc` and upload it as well. + +.. _semantic versioning: https://semver.org/spec/v2.0.0.html +.. 
_pypi: https://pypi.org + diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..53f5f77 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,36 @@ +crocoite +======== + +Preservation for the modern web, powered by `headless Google +Chrome`_. + +.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome + +.. toctree:: + :maxdepth: 1 + :hidden: + + usage.rst + plugins.rst + rationale.rst + develop.rst + related.rst + +Features +-------- + +Google Chrome-powered + HTML renderer, JavaScript engine and network stack, supporting modern web + technologies and protocols +WARC output + Includes all network requests made by the browser +Site interaction + :ref:`Auto-expand on-click content <click>`, infinite-scrolling +DOM snapshot + Contains the page’s state, renderable without JavaScript +Image screenshot + Entire page +Machine-readable interface + Easy integration into custom tools/scripts + + diff --git a/doc/plugins.rst b/doc/plugins.rst new file mode 100644 index 0000000..062e1bf --- /dev/null +++ b/doc/plugins.rst @@ -0,0 +1,16 @@ +Plugins +======= + +crocoite comes with plug-ins that modify loaded sites’ or interact with them. + +.. _click: + +click +----- + +The following sites are currently supported. Note this is an ongoing +battle against layout changes and thus older software versions will stop +working very soon. + +.. clicklist:: + diff --git a/doc/rationale.rst b/doc/rationale.rst new file mode 100644 index 0000000..f37db7c --- /dev/null +++ b/doc/rationale.rst @@ -0,0 +1,76 @@ +Rationale +--------- + +Most modern websites depend heavily on executing code, usually JavaScript, on +the user’s machine. They also make use of new and emerging Web technologies +like HTML5, WebSockets, service workers and more. Even worse from the +preservation point of view, they also require some form of user interaction to +dynamically load more content (infinite scrolling, dynamic comment loading, +etc). 
The naive approach of fetching an HTML page, parsing it and extracting
This may be undesirable from
+ diff --git a/doc/related.rst b/doc/related.rst new file mode 100644 index 0000000..62e2569 --- /dev/null +++ b/doc/related.rst @@ -0,0 +1,14 @@ +Related projects +---------------- + +brozzler_ + Uses Google Chrome as well, but intercepts traffic using a proxy. Supports + distributed crawling and immediate playback. +Squidwarc_ + Communicates with headless Google Chrome and uses the Network API to + retrieve requests like crocoite. Supports recursive crawls and page + scrolling, but neither custom JavaScript nor distributed crawling. + +.. _brozzler: https://github.com/internetarchive/brozzler +.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc + diff --git a/doc/usage.rst b/doc/usage.rst new file mode 100644 index 0000000..34a3e7b --- /dev/null +++ b/doc/usage.rst @@ -0,0 +1,162 @@ +Usage +----- + +Quick start using pywb_, expects Google Chrome to be installed already: + +.. code:: bash + + pip install crocoite pywb + crocoite http://example.com/ example.com.warc.gz + wb-manager init test && wb-manager add test example.com.warc.gz + wayback & + $BROWSER http://localhost:8080 + +.. _pywb: https://github.com/ikreymer/pywb + +It is recommended to install at least Micrsoft’s Corefonts_ as well as DejaVu_, +Liberation_ or a similar font family covering a wide range of character sets. +Otherwise page screenshots may be unusable due to missing glyphs. + +.. _Corefonts: http://corefonts.sourceforge.net/ +.. _DejaVu: https://dejavu-fonts.github.io/ +.. _Liberation: https://pagure.io/liberation-fonts + +Recursion +^^^^^^^^^ + +.. program:: crocoite + +By default crocoite will only retrieve the URL specified on the command line. +However it can follow links as well. There’s currently two recursion strategies +available, depth- and prefix-based. + +.. code:: bash + + crocoite -r 1 https://example.com/ example.com.warc.gz + +will retrieve ``example.com`` and all pages directly refered to by it. 
+Increasing the number increases the depth, so a value of :samp:`2` would first grab +``example.com``, queue all pages linked there as well as every reference on +each of those pages. + +On the other hand + +.. code:: bash + + crocoite -r prefix https://example.com/dir/ example.com.warc.gz + +will retrieve the URL specified and all pages referenced which have the same +URL prefix. There trailing slash is significant. Without it crocoite would also +grab ``/dir-something`` or ``/dir.html`` for example. + +If an output file template is used each page is written to an individual file. For example + +.. code:: bash + + crocoite -r prefix https://example.com/ '{host}-{date}-{seqnum}.warc.gz' + +will write one file page page to files like +:file:`example.com-2019-09-09T15:15:15+02:00-1.warc.gz`. ``seqnum`` is unique to +each page of a single job and should always be used. + +When running a recursive job, increasing the concurrency (i.e. how many pages +are fetched at the same time) can speed up the process. For example you can +pass :option:`-j` :samp:`4` to retrieve four pages at the same time. Keep in mind +that each process starts a full browser that requires a lot of resources (one +to two GB of RAM and one or two CPU cores). + +Customizing +^^^^^^^^^^^ + +.. program:: crocoite-single + +Under the hood :program:`crocoite` starts one instance of +:program:`crocoite-single` to fetch each page. You can customize its options by +appending a command template like this: + +.. code:: bash + + crocoite -r prefix https://example.com example.com.warc.gz -- \ + crocoite-single --timeout 5 -k '{url}' '{dest}' + +This reduces the global timeout to 5 seconds and ignores TLS errors. If an +option is prefixed with an exclamation mark (``!``) it will not be expanded. +This is useful for passing :option:`--warcinfo`, which expects JSON-encoded data. + +Command line options +^^^^^^^^^^^^^^^^^^^^ + +Below is a list of all command line arguments available: + +.. 
program:: crocoite + +crocoite +++++++++ + +Front-end with recursion support and simple job management. + +.. option:: -j N, --concurrency N + + Maximum number of concurrent fetch jobs. + +.. option:: -r POLICY, --recursion POLICY + + Enables recursion based on POLICY, which can be a positive integer + (recursion depth) or the string :kbd:`prefix`. + +.. option:: --tempdir DIR + + Directory for temporary WARC files. + +.. program:: crocoite-single + +crocoite-single ++++++++++++++++ + +Back-end to fetch a single page. + +.. option:: -b SET-COOKIE, --cookie SET-COOKIE + + Add cookie to browser’s cookie jar. This option always *appends* cookies, + replacing those provided by :option:`-c`. + + .. versionadded:: 1.1 + +.. option:: -c FILE, --cookie-jar FILE + + Load cookies from FILE. :program:`crocoite` provides a default cookie file, + which contains cookies to, for example, circumvent age restrictions. This + option *replaces* that default file. + + .. versionadded:: 1.1 + +.. option:: --idle-timeout SEC + + Time after which a page is considered “idle”. + +.. option:: -k, --insecure + + Allow insecure connections, i.e. self-signed ore expired HTTPS certificates. + +.. option:: --timeout SEC + + Global archiving timeout. + + +.. option:: --warcinfo JSON + + Inject additional JSON-encoded information into the resulting WARC. + +IRC bot +^^^^^^^ + +A simple IRC bot (“chromebot”) is provided with the command :program:`crocoite-irc`. +It reads its configuration from a config file like the example provided in +:file:`contrib/chromebot.json` and supports the following commands: + +a <url> -j <concurrency> -r <policy> -k -b <set-cookie> + Archive <url> with <concurrency> processes according to recursion <policy> +s <uuid> + Get job status for <uuid> +r <uuid> + Revoke or abort running job with <uuid> |