Diffstat (limited to 'doc')
-rw-r--r--  doc/_ext/clicklist.py    45
-rw-r--r--  doc/conf.py              44
-rw-r--r--  doc/develop.rst          39
-rw-r--r--  doc/index.rst            36
-rw-r--r--  doc/plugins.rst          16
-rw-r--r--  doc/rationale.rst        76
-rw-r--r--  doc/related.rst          14
-rw-r--r--  doc/usage.rst           162
8 files changed, 432 insertions, 0 deletions
diff --git a/doc/_ext/clicklist.py b/doc/_ext/clicklist.py
new file mode 100644
index 0000000..a69452c
--- /dev/null
+++ b/doc/_ext/clicklist.py
@@ -0,0 +1,45 @@
+"""
+Render click.yaml config file into human-readable list of supported sites
+"""
+
+import pkg_resources, yaml
+from docutils import nodes
+from docutils.parsers.rst import Directive
+from yarl import URL
+
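+# A hypothetical sketch of the click.yaml shape this directive consumes: each
+# YAML document provides a ``match`` pattern and a list of ``selector``
+# entries, each carrying a ``description`` and optionally some ``urls``.
+# Illustrative example (the values are made up):
+#
+#   match: example\.com
+#   selector:
+#     - description: Close the cookie banner
+#       urls: ["https://www.example.com/"]
+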
+class ClickList (Directive):
+    def run(self):
+        # XXX: do this once only
+        fd = pkg_resources.resource_stream ('crocoite', 'data/click.yaml')
+        config = list (yaml.safe_load_all (fd))
+
+        l = nodes.definition_list ()
+        for site in config:
+            urls = set ()
+            v = nodes.definition ()
+            vl = nodes.bullet_list ()
+            v += vl
+            for s in site['selector']:
+                i = nodes.list_item ()
+                i += nodes.paragraph (text=s['description'])
+                vl += i
+                urls.update (map (lambda x: URL(x).with_path ('/'), s.get ('urls', [])))
+
+            item = nodes.definition_list_item ()
+            term = ', '.join (map (lambda x: x.host, urls)) if urls else site['match']
+            k = nodes.term (text=term)
+            item += k
+
+            item += v
+            l += item
+        return [l]
+
+def setup(app):
+    app.add_directive ("clicklist", ClickList)
+
+    return {
+        'version': '0.1',
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+    }
+
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..8336c27
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+import os, sys
+
+# -- Project information -----------------------------------------------------
+
+project = 'crocoite'
+copyright = '2019 crocoite contributors'
+author = 'crocoite contributors'
+
+# -- General configuration ---------------------------------------------------
+
+sys.path.append(os.path.abspath("./_ext"))
+extensions = [
+    'sphinx.ext.viewcode',
+    'sphinx.ext.autodoc',
+    'clicklist',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+source_suffix = '.rst'
+master_doc = 'index'
+language = 'en'
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+pygments_style = 'tango'
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = 'alabaster'
+html_theme_options = {
+    "description": "Preservation for the modern web",
+    "github_user": "PromyLOPh",
+    "github_repo": "crocoite",
+    "travis_button": True,
+    "github_button": True,
+    "codecov_button": True,
+    "fixed_sidebar": True,
+}
+#html_static_path = ['_static']
+html_sidebars = {
+    '**': ['about.html', 'navigation.html', 'searchbox.html'],
+}
+
diff --git a/doc/develop.rst b/doc/develop.rst
new file mode 100644
index 0000000..801ab21
--- /dev/null
+++ b/doc/develop.rst
@@ -0,0 +1,39 @@
+Development
+-----------
+
+Generally crocoite provides reasonable defaults for Google Chrome via
+:py:mod:`crocoite.devtools`. When debugging this software it might be necessary
+to open a non-headless instance of the browser by running
+
+.. code:: bash
+
+   google-chrome-stable --remote-debugging-port=9222 --auto-open-devtools-for-tabs
+
+and then passing the option :option:`--browser=http://localhost:9222` to
+:program:`crocoite-single`. This allows human intervention through the
+browser’s built-in console.
+
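+Whether the debugging endpoint is actually reachable can be verified
+beforehand through Chrome’s DevTools HTTP API, for example:
+
+.. code:: bash
+
+   # List debuggable targets; each entry contains a webSocketDebuggerUrl
+   curl http://localhost:9222/json
+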
+Release guide
+^^^^^^^^^^^^^
+
+crocoite uses `semantic versioning`_. To create a new release, bump the version
+number in ``setup.py`` according to the linked guide and create distribution
+packages::
+
+    python setup.py sdist bdist_wheel
+
+Verify them::
+
+    twine check dist/*
+
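+Try to install and use them in a separate sandbox first, for instance a
+throwaway virtual environment (a sketch, the paths are illustrative):
+
+.. code:: bash
+
+   python -m venv /tmp/crocoite-test
+   /tmp/crocoite-test/bin/pip install dist/*.whl
+   /tmp/crocoite-test/bin/crocoite --help
+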
+Finally sign and upload the new version to pypi_::
+
+    gpg --detach-sign --armor dist/*.tar.gz
+    twine upload dist/*
+
+Then update the documentation using :program:`sphinx-build` and upload it as well.
+
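+For example (the output directory is just an illustration)::
+
+    sphinx-build -b html doc doc/_build/html
+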
+.. _semantic versioning: https://semver.org/spec/v2.0.0.html
+.. _pypi: https://pypi.org
+
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..53f5f77
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,36 @@
+crocoite
+========
+
+Preservation for the modern web, powered by `headless Google
+Chrome`_.
+
+.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   usage.rst
+   plugins.rst
+   rationale.rst
+   develop.rst
+   related.rst
+
+Features
+--------
+
+Google Chrome-powered
+   HTML renderer, JavaScript engine and network stack, supporting modern web
+   technologies and protocols
+WARC output
+   Includes all network requests made by the browser
+Site interaction
+   :ref:`Auto-expand on-click content <click>`, infinite-scrolling
+DOM snapshot
+   Contains the page’s state, renderable without JavaScript
+Image screenshot
+   Entire page
+Machine-readable interface
+   Easy integration into custom tools/scripts
+
+
diff --git a/doc/plugins.rst b/doc/plugins.rst
new file mode 100644
index 0000000..062e1bf
--- /dev/null
+++ b/doc/plugins.rst
@@ -0,0 +1,16 @@
+Plugins
+=======
+
+crocoite comes with plug-ins that modify loaded sites or interact with them.
+
+.. _click:
+
+click
+-----
+
+The following sites are currently supported. Note that keeping up with layout
+changes is an ongoing battle, so older versions of this software will stop
+working rather quickly.
+
+.. clicklist::
+
diff --git a/doc/rationale.rst b/doc/rationale.rst
new file mode 100644
index 0000000..f37db7c
--- /dev/null
+++ b/doc/rationale.rst
@@ -0,0 +1,76 @@
+Rationale
+---------
+
+Most modern websites depend heavily on executing code, usually JavaScript, on
+the user’s machine. They also make use of new and emerging Web technologies
+like HTML5, WebSockets, service workers and more. Even worse from the
+preservation point of view, they also require some form of user interaction to
+dynamically load more content (infinite scrolling, dynamic comment loading,
+etc).
+
+The naive approach of fetching an HTML page, parsing it and extracting
+links to referenced resources therefore is not sufficient to create a faithful
+snapshot of these web applications. A full browser, capable of running scripts
+and providing modern Web APIs, is absolutely required for this task. Thankfully
+Google Chrome runs without a display (headless mode) and can be controlled by
+external programs, allowing them to navigate and extract or inject data.
+This section describes the solutions crocoite offers and explains design
+decisions taken.
+
+crocoite captures resources by listening to Chrome’s `network events`_ and
+requesting the response body using `Network.getResponseBody`_. This approach
+has caveats: The original HTTP requests and responses, as sent over the wire,
+are not available. They are reconstructed from parsed data. The character
+encoding for text documents is changed to UTF-8. And the content body of HTTP
+redirects cannot be retrieved due to a race condition.
+
+.. _network events: https://chromedevtools.github.io/devtools-protocol/1-3/Network
+.. _Network.getResponseBody: https://chromedevtools.github.io/devtools-protocol/1-3/Network#method-getResponseBody
+
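+For illustration, this capture flow can be reproduced over a raw DevTools
+websocket connection. The following is a heavily simplified sketch, not
+crocoite’s actual implementation; it assumes the third-party ``websockets``
+package and a tab’s ``webSocketDebuggerUrl`` obtained from
+``http://localhost:9222/json``:
+
+.. code:: python
+
+   import asyncio, json
+   import websockets  # third-party package, an assumed dependency of this sketch
+
+   async def capture (wsurl):
+       async with websockets.connect (wsurl, max_size=None) as ws:
+           await ws.send (json.dumps ({'id': 1, 'method': 'Network.enable'}))
+           await ws.send (json.dumps ({'id': 2, 'method': 'Page.navigate',
+                   'params': {'url': 'http://example.com/'}}))
+           nextid = 3
+           while True:
+               msg = json.loads (await ws.recv ())
+               if msg.get ('method') == 'Network.responseReceived':
+                   # Real code must wait for Network.loadingFinished first,
+                   # otherwise the body may not be available yet.
+                   await ws.send (json.dumps ({'id': nextid,
+                           'method': 'Network.getResponseBody',
+                           'params': {'requestId': msg['params']['requestId']}}))
+                   nextid += 1
+               elif 'body' in msg.get ('result', {}):
+                   print (len (msg['result']['body']), 'bytes captured')
+
+   # Placeholder target id; take the real URL from localhost:9222/json
+   asyncio.run (capture ('ws://localhost:9222/devtools/page/<target id>'))
+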
+But at the same time it allows crocoite to rely on Chrome’s well-tested network
+stack and HTTP parser. Thus it supports HTTP version 1 and 2 as well as
+transport protocols like SSL and QUIC. Depending on Chrome also eliminates the
+need for a man-in-the-middle proxy, like warcprox_, which has to decrypt SSL
+traffic and present a fake certificate to the browser in order to store the
+transmitted content.
+
+.. _warcprox: https://github.com/internetarchive/warcprox
+
+WARC records generated by crocoite therefore are an abstract view on the
+resource they represent and not necessarily the data sent over the wire. A URL
+fetched with HTTP/2 for example will still result in an HTTP/1.1
+request/response pair in the WARC file. This may be undesirable from
+an archivist’s point of view (“save the data exactly like we received it”). But
+this level of abstraction is inevitable when dealing with more than one
+protocol.
+
+crocoite also interacts with and therefore alters the grabbed websites. It does
+so by injecting `behavior scripts`_ into the site. Typically these are written
+in JavaScript, because interacting with a page is easier this way. These
+scripts then perform different tasks: Extracting targets from visible
+hyperlinks, clicking buttons or scrolling the website to load more content,
+as well as taking a static screenshot of ``<canvas>`` elements for the DOM
+snapshot (see below).
+
+.. _behavior scripts: https://github.com/PromyLOPh/crocoite/tree/master/crocoite/data
+
+Replaying archived WARCs can be quite challenging and might not be possible
+with current technology (or even at all):
+
+- Some sites request assets based on screen resolution, pixel ratio and
+  supported image formats (webp). Replaying those with different parameters
+  won’t work, since assets for those are missing. Example: missguided.com.
+- Some fetch different scripts based on user agent. Example: youtube.com.
+- Requests containing randomly generated JavaScript callback function names
+  won’t work. Example: weather.com.
+- Range requests (``Range: bytes=1-100``) are captured as-is, making playback
+  difficult.
+
+crocoite offers two methods to work around these issues. Firstly it can save a
+DOM snapshot to the WARC file. It contains the entire DOM in HTML format minus
+``<script>`` tags after the site has been fully loaded and thus can be
+displayed without executing scripts. Obviously JavaScript-based navigation
+does not work any more. Secondly it also saves a screenshot of the full page,
+so even if future browsers cannot render and display the stored HTML a fully
+rendered version of the website can be replayed instead.
+
diff --git a/doc/related.rst b/doc/related.rst
new file mode 100644
index 0000000..62e2569
--- /dev/null
+++ b/doc/related.rst
@@ -0,0 +1,14 @@
+Related projects
+----------------
+
+brozzler_
+   Uses Google Chrome as well, but intercepts traffic using a proxy. Supports
+   distributed crawling and immediate playback.
+Squidwarc_
+   Communicates with headless Google Chrome and uses the Network API to
+   retrieve requests like crocoite. Supports recursive crawls and page
+   scrolling, but neither custom JavaScript nor distributed crawling.
+
+.. _brozzler: https://github.com/internetarchive/brozzler
+.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc
+
diff --git a/doc/usage.rst b/doc/usage.rst
new file mode 100644
index 0000000..34a3e7b
--- /dev/null
+++ b/doc/usage.rst
@@ -0,0 +1,162 @@
+Usage
+-----
+
+Quick start using pywb_ (expects Google Chrome to be installed already):
+
+.. code:: bash
+
+   pip install crocoite pywb
+   crocoite http://example.com/ example.com.warc.gz
+   wb-manager init test && wb-manager add test example.com.warc.gz
+   wayback &
+   $BROWSER http://localhost:8080
+
+.. _pywb: https://github.com/ikreymer/pywb
+
+It is recommended to install at least Microsoft’s Corefonts_ as well as DejaVu_,
+Liberation_ or a similar font family covering a wide range of character sets.
+Otherwise page screenshots may be unusable due to missing glyphs.
+
+.. _Corefonts: http://corefonts.sourceforge.net/
+.. _DejaVu: https://dejavu-fonts.github.io/
+.. _Liberation: https://pagure.io/liberation-fonts
+
+Recursion
+^^^^^^^^^
+
+.. program:: crocoite
+
+By default crocoite will only retrieve the URL specified on the command line.
+However it can follow links as well. There are currently two recursion
+strategies available: depth- and prefix-based.
+
+.. code:: bash
+
+   crocoite -r 1 https://example.com/ example.com.warc.gz
+
+will retrieve ``example.com`` and all pages directly referred to by it.
+Increasing the number increases the depth: a value of :samp:`2` would first
+grab ``example.com``, then queue all pages linked there as well as every
+reference on each of those pages.
+
+On the other hand
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/dir/ example.com.warc.gz
+
+will retrieve the URL specified and all referenced pages which share the same
+URL prefix. The trailing slash is significant: without it crocoite would also
+grab ``/dir-something`` or ``/dir.html``, for example.
+
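+The prefix comparison itself boils down to simple string matching on the URL.
+A sketch (not crocoite’s actual code):
+
+.. code:: python
+
+   prefix = 'https://example.com/dir/'
+   for url in ('https://example.com/dir/a.html', 'https://example.com/dir-something'):
+       print (url, url.startswith (prefix))  # True, then False
+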
+If an output file template is used, each page is written to an individual
+file. For example
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/ '{host}-{date}-{seqnum}.warc.gz'
+
+will write one file per page, to names like
+:file:`example.com-2019-09-09T15:15:15+02:00-1.warc.gz`. ``seqnum`` is unique to
+each page of a single job and should always be used.
+
+When running a recursive job, increasing the concurrency (i.e. how many pages
+are fetched at the same time) can speed up the process. For example you can
+pass :option:`-j` :samp:`4` to retrieve four pages at the same time. Keep in mind
+that each process starts a full browser that requires a lot of resources (one
+to two GB of RAM and one or two CPU cores).
+
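+For example, the prefix crawl from above with four concurrent fetch jobs:
+
+.. code:: bash
+
+   crocoite -j 4 -r prefix https://example.com/ '{host}-{date}-{seqnum}.warc.gz'
+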
+Customizing
+^^^^^^^^^^^
+
+.. program:: crocoite-single
+
+Under the hood :program:`crocoite` starts one instance of
+:program:`crocoite-single` to fetch each page. You can customize its options by
+appending a command template like this:
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com example.com.warc.gz -- \
+       crocoite-single --timeout 5 -k '{url}' '{dest}'
+
+This reduces the global timeout to 5 seconds and ignores TLS errors. If an
+option is prefixed with an exclamation mark (``!``) it will not be expanded.
+This is useful for passing :option:`--warcinfo`, which expects JSON-encoded data.
+
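+For instance, the following sketch injects extra metadata into the WARC; the
+exact JSON payload is up to you, and the leading ``!`` keeps the braces from
+being interpreted as template fields:
+
+.. code:: bash
+
+   crocoite https://example.com/ example.com.warc.gz -- \
+       crocoite-single '!--warcinfo={"operator": "me"}' '{url}' '{dest}'
+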
+Command line options
+^^^^^^^^^^^^^^^^^^^^
+
+Below is a list of all command line arguments available:
+
+.. program:: crocoite
+
+crocoite
+++++++++
+
+Front-end with recursion support and simple job management.
+
+.. option:: -j N, --concurrency N
+
+   Maximum number of concurrent fetch jobs.
+
+.. option:: -r POLICY, --recursion POLICY
+
+   Enables recursion based on POLICY, which can be a positive integer
+   (recursion depth) or the string :kbd:`prefix`.
+
+.. option:: --tempdir DIR
+
+   Directory for temporary WARC files.
+
+.. program:: crocoite-single
+
+crocoite-single
++++++++++++++++
+
+Back-end to fetch a single page.
+
+.. option:: -b SET-COOKIE, --cookie SET-COOKIE
+
+   Add a cookie to the browser’s cookie jar. This option always *appends*
+   cookies, replacing those provided by :option:`-c`.
+
+   .. versionadded:: 1.1
+
+.. option:: -c FILE, --cookie-jar FILE
+
+   Load cookies from FILE. :program:`crocoite` provides a default cookie file,
+   which contains cookies to, for example, circumvent age restrictions. This
+   option *replaces* that default file.
+
+   .. versionadded:: 1.1
+
+.. option:: --idle-timeout SEC
+
+   Time after which a page is considered “idle”.
+
+.. option:: -k, --insecure
+
+   Allow insecure connections, i.e. self-signed or expired HTTPS certificates.
+
+.. option:: --timeout SEC
+
+   Global archiving timeout.
+
+.. option:: --warcinfo JSON
+
+   Inject additional JSON-encoded information into the resulting WARC.
+
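+Several of these options combined into a single invocation (a sketch; the
+cookie string is illustrative):
+
+.. code:: bash
+
+   crocoite-single --timeout 60 --idle-timeout 2 -k \
+       -b 'consent=true; Domain=.example.com' \
+       https://example.com/ example.com.warc.gz
+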
+IRC bot
+^^^^^^^
+
+A simple IRC bot (“chromebot”) is provided with the command :program:`crocoite-irc`.
+It reads its configuration from a config file like the example provided in
+:file:`contrib/chromebot.json` and supports the following commands:
+
+a <url> -j <concurrency> -r <policy> -k -b <set-cookie>
+   Archive <url> with <concurrency> processes according to recursion <policy>
+s <uuid>
+   Get job status for <uuid>
+r <uuid>
+   Revoke or abort running job with <uuid>