diff options
Diffstat (limited to 'doc')
-rw-r--r-- | doc/conf.py | 178 | ||||
-rw-r--r-- | doc/develop.rst | 17 | ||||
-rw-r--r-- | doc/index.rst | 41 | ||||
-rw-r--r-- | doc/install.rst | 47 | ||||
-rw-r--r-- | doc/rationale.rst | 76 | ||||
-rw-r--r-- | doc/related.rst | 14 | ||||
-rw-r--r-- | doc/usage.rst | 15 |
7 files changed, 388 insertions, 0 deletions
diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..746dbc6 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'crocoite' +copyright = '' +author = '' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '0.1' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.viewcode', + 'sphinx.ext.autodoc', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. 
+# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'crocoitedoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. 
+ # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'crocoite.tex', 'crocoite Documentation', + 'crocoite contributors', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'crocoite', 'crocoite Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'crocoite', 'crocoite Documentation', + author, 'crocoite', 'One line description of project.', + 'Miscellaneous'), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- diff --git a/doc/develop.rst b/doc/develop.rst new file mode 100644 index 0000000..0113c92 --- /dev/null +++ b/doc/develop.rst @@ -0,0 +1,17 @@ +Development +----------- + +Generally crocoite provides reasonable defaults for Google Chrome via its +`devtools module`_. When debugging this software it might be necessary to open +a non-headless instance of the browser by running + +.. 
code:: bash + + google-chrome-stable --remote-debugging-port=9222 --auto-open-devtools-for-tabs + +and then passing the option ``--browser=http://localhost:9222`` to +``crocoite-grab``. This allows human intervention through the browser’s builtin +console. + +.. _devtools module: crocoite/devtools.py + diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..d62c7e1 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,41 @@ +crocoite +======== + +Preservation for the modern web, powered by `headless Google +Chrome`_. + +.. image:: https://travis-ci.org/PromyLOPh/crocoite.svg?branch=master + :target: https://travis-ci.org/PromyLOPh/crocoite + +.. image:: https://codecov.io/gh/PromyLOPh/crocoite/branch/master/graph/badge.svg + :target: https://codecov.io/gh/PromyLOPh/crocoite + +.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome + +.. toctree:: + :maxdepth: 1 + :hidden: + + install.rst + usage.rst + rationale.rst + develop.rst + related.rst + +Features +-------- + +Google Chrome-powered + HTML renderer, JavaScript engine and network stack, supporting modern web + technologies and protocols +WARC output + Includes all network requests made by the browser +Site interaction + Auto-expand on-click content, infinite-scrolling +DOM snapshot + Contains the page’s state, renderable without JavaScript +Image screenshot + Entire page +Machine-readable interface + Easy integration into custom tools/scripts + diff --git a/doc/install.rst b/doc/install.rst new file mode 100644 index 0000000..5e76956 --- /dev/null +++ b/doc/install.rst @@ -0,0 +1,47 @@ +Installation +------------ + +These dependencies must be present to run crocoite: + +- Python ≥3.6 +- PyYAML_ +- aiohttp_ +- websockets_ +- warcio_ +- html5lib_ +- yarl_ +- multidict_ +- bottom_ (IRC client) +- `Google Chrome`_ + +.. _PyYAML: https://pyyaml.org/wiki/PyYAML +.. _aiohttp: https://aiohttp.readthedocs.io/ +.. _websockets: https://websockets.readthedocs.io/ +.. 
_warcio: https://github.com/webrecorder/warcio +.. _html5lib: https://github.com/html5lib/html5lib-python +.. _bottom: https://github.com/numberoverzero/bottom +.. _Google Chrome: https://www.google.com/chrome/ +.. _yarl: https://yarl.readthedocs.io/ +.. _multidict: https://multidict.readthedocs.io/ + +The following commands clone the repository from GitHub_, set up a virtual +environment and install crocoite: + +.. _GitHub: https://github.com/PromyLOPh/crocoite + +.. code:: bash + + git clone https://github.com/PromyLOPh/crocoite.git + cd crocoite + virtualenv -p python3 sandbox + source sandbox/bin/activate + pip install . + +It is recommended to install at least Microsoft’s Corefonts_ as well as DejaVu_, +Liberation_ or a similar font family covering a wide range of character sets. +Otherwise page screenshots may be unusable due to missing glyphs. + +.. _Corefonts: http://corefonts.sourceforge.net/ +.. _DejaVu: https://dejavu-fonts.github.io/ +.. _Liberation: https://pagure.io/liberation-fonts + diff --git a/doc/rationale.rst b/doc/rationale.rst new file mode 100644 index 0000000..f37db7c --- /dev/null +++ b/doc/rationale.rst @@ -0,0 +1,76 @@ +Rationale +--------- + +Most modern websites depend heavily on executing code, usually JavaScript, on +the user’s machine. They also make use of new and emerging Web technologies +like HTML5, WebSockets, service workers and more. Even worse from the +preservation point of view, they also require some form of user interaction to +dynamically load more content (infinite scrolling, dynamic comment loading, +etc). + +The naive approach of fetching an HTML page, parsing it and extracting +links to referenced resources therefore is not sufficient to create a faithful +snapshot of these web applications. A full browser, capable of running scripts and +providing modern Web APIs, is absolutely required for this task. 
Thankfully +Google Chrome runs without a display (headless mode) and can be controlled by +external programs, allowing them to navigate and extract or inject data. +This section describes the solutions crocoite offers and explains design +decisions taken. + +crocoite captures resources by listening to Chrome’s `network events`_ and +requesting the response body using `Network.getResponseBody`_. This approach +has caveats: The original HTTP requests and responses, as sent over the wire, +are not available. They are reconstructed from parsed data. The character +encoding for text documents is changed to UTF-8. And the content body of HTTP +redirects cannot be retrieved due to a race condition. + +.. _network events: https://chromedevtools.github.io/devtools-protocol/1-3/Network +.. _Network.getResponseBody: https://chromedevtools.github.io/devtools-protocol/1-3/Network#method-getResponseBody + +But at the same time it allows crocoite to rely on Chrome’s well-tested network +stack and HTTP parser. Thus it supports HTTP versions 1 and 2 as well as +transport protocols like SSL and QUIC. Depending on Chrome also eliminates the +need for a man-in-the-middle proxy, like warcprox_, which has to decrypt SSL +traffic and present a fake certificate to the browser in order to store the +transmitted content. + +.. _warcprox: https://github.com/internetarchive/warcprox + +WARC records generated by crocoite therefore are an abstract view on the +resource they represent and not necessarily the data sent over the wire. A URL +fetched with HTTP/2 for example will still result in an HTTP/1.1 +request/response pair in the WARC file. This may be undesirable from +an archivist’s point of view (“save the data exactly like we received it”). But +this level of abstraction is inevitable when dealing with more than one +protocol. + +crocoite also interacts with and therefore alters the grabbed websites. It does +so by injecting `behavior scripts`_ into the site. 
Typically these are written +in JavaScript, because interacting with a page is easier this way. These +scripts then perform different tasks: Extracting targets from visible +hyperlinks, clicking buttons or scrolling the website to load more content, +as well as taking a static screenshot of ``<canvas>`` elements for the DOM +snapshot (see below). + +.. _behavior scripts: https://github.com/PromyLOPh/crocoite/tree/master/crocoite/data + +Replaying archived WARCs can be quite challenging and might not be possible +with current technology (or even at all): + +- Some sites request assets based on screen resolution, pixel ratio and + supported image formats (webp). Replaying those with different parameters + won’t work, since assets for those are missing. Example: missguided.com. +- Some fetch different scripts based on user agent. Example: youtube.com. +- Requests containing randomly generated JavaScript callback function names + won’t work. Example: weather.com. +- Range requests (Range: bytes=1-100) are captured as-is, making playback + difficult. + +crocoite offers two methods to work around these issues. Firstly it can save a +DOM snapshot to the WARC file. It contains the entire DOM in HTML format minus +``<script>`` tags after the site has been fully loaded and thus can be +displayed without executing scripts. Obviously JavaScript-based navigation +does not work any more. Secondly it also saves a screenshot of the full page, +so even if future browsers cannot render and display the stored HTML a fully +rendered version of the website can be replayed instead. + diff --git a/doc/related.rst b/doc/related.rst new file mode 100644 index 0000000..62e2569 --- /dev/null +++ b/doc/related.rst @@ -0,0 +1,14 @@ +Related projects +---------------- + +brozzler_ + Uses Google Chrome as well, but intercepts traffic using a proxy. Supports + distributed crawling and immediate playback. 
+Squidwarc_ + Communicates with headless Google Chrome and uses the Network API to + retrieve requests like crocoite. Supports recursive crawls and page + scrolling, but neither custom JavaScript nor distributed crawling. + +.. _brozzler: https://github.com/internetarchive/brozzler +.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc + diff --git a/doc/usage.rst b/doc/usage.rst new file mode 100644 index 0000000..9049356 --- /dev/null +++ b/doc/usage.rst @@ -0,0 +1,15 @@ +Usage +----- + +.. autofunction:: crocoite.cli.single + +Recursion +^^^^^^^^^ + +.. autofunction:: crocoite.cli.recursive + +IRC bot +^^^^^^^ + +.. autofunction:: crocoite.cli.irc + |