45 files changed, 5816 insertions, 1686 deletions
@@ -1,3 +1,7 @@ __pycache__ *.sw? *.egg-info/ +.pytest_cache/ +coverage.xml +htmlcov +.coverage diff --git a/.travis.yml b/.travis.yml index e09e518..b1d417c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,23 @@ -dist: trusty +dist: xenial language: python -python: - - "3.4" - - "3.5" - - "3.6" - - "pypy3.5" +matrix: + include: + - python: "3.6" + - python: "3.7" + - python: "3.8" + - python: "3.6-dev" + - python: "3.7-dev" + - python: "3.8-dev" + allow_failures: + - python: "3.6-dev" + - python: "3.7-dev" + - python: "3.8-dev" install: - pip install . script: - - python -m unittest crocoite.browser + - python setup.py test addons: chrome: stable sudo: required +after_success: + - bash <(curl -s https://codecov.io/bash) @@ -1,134 +1,15 @@ crocoite ======== -Archive websites using `headless Google Chrome`_ and its DevTools protocol. +.. code:: bash -.. image:: https://travis-ci.org/PromyLOPh/crocoite.svg?branch=master - :target: https://travis-ci.org/PromyLOPh/crocoite - -.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome - -Dependencies ------------- - -- Python 3 -- pychrome_ -- warcio_ -- html5lib_ -- Celery_ - -.. _pychrome: https://github.com/fate0/pychrome -.. _warcio: https://github.com/webrecorder/warcio -.. _html5lib: https://github.com/html5lib/html5lib-python -.. _Celery: http://www.celeryproject.org/ - -Usage ------ - -One-shot commandline interface and pywb_ playback:: - - crocoite-grab --output example.com.warc.gz http://example.com/ - rm -rf collections && wb-manager init test && wb-manager add test example.com.warc.gz + pip install crocoite pywb + crocoite http://example.com/ example.com.warc.gz + wb-manager init test && wb-manager add test example.com.warc.gz wayback & $BROWSER http://localhost:8080 -.. _pywb: https://github.com/ikreymer/pywb - -Behavior scripts -^^^^^^^^^^^^^^^^ - -A lot of sites need some form of interaction to dynamically load more content. Twitter for -instance continously loads new posts when scrolling to the bottom of the page. -crocoite can emulate these user interactions (and more) by combining control -code written in Python and injecting JavaScript into the page. The code can be -limited to certain URLs or apply to every page loaded. By default all scripts -available are enabled, see command line flag ``--behavior``. - -Caveats -------- - -- Original HTTP requests/responses are not available. They are rebuilt from - parsed data. Character encoding for text documents is changed to UTF-8. -- Some sites request assets based on screen resolution, pixel ratio and - supported image formats (webp). Replaying those with different parameters - won’t work, since assets for those are missing. Example: missguided.com. -- Some fetch different scripts based on user agent. Example: youtube.com. -- Requests containing randomly generated JavaScript callback function names - won’t work. Example: weather.com. -- Range requests (Range: bytes=1-100) are captured as-is, making playback - difficult -- Content body of HTTP redirects cannot be retrived due to race condition - -Most of these issues can be worked around by using the DOM snapshot, which is -also saved. This causes its own set of issues though: - -- JavaScript-based navigation does not work. - -Distributed crawling --------------------- - -Configure using celeryconfig.py - -.. 
code:: python - - broker_url = 'pyamqp://' - result_backend = 'rpc://' - warc_filename = '{domain}-{date}-{id}.warc.gz' - temp_dir = '/tmp/' - finished_dir = '/tmp/finished' - -Start a Celery worker:: - - celery -A crocoite.task worker -Q crocoite.archive,crocoite.controller --loglevel=info - -Then queue archive job:: - - crocoite-grab --distributed http://example.com - -The worker will create a temporary file named according to ``warc_filename`` in -``/tmp`` while archiving and move it to ``/tmp/finished`` when done. - -IRC bot -^^^^^^^ - -Configure sopel_ (``~/.sopel/default.cfg``) to use the plugin located in -``contrib/celerycrocoite.py`` - -.. code:: ini - - [core] - nick = chromebot - host = irc.efnet.fr - port = 6667 - owner = someone - extra = /path/to/crocoite/contrib - enable = celerycrocoite - channels = #somechannel - -Then start it by running ``sopel``. The bot must be addressed directly (i.e. -``chromebot: <command>``). The following commands are currently supported: - -a <url> - Archives <url> and all of its resources (images, css, …). A unique UID - (UUID) is assigned to each job. -s <uuid> - Get status of job with <uuid> -r <uuid> - Revoke job with <uuid>. If it started already the job will be killed. - -.. _sopel: https://sopel.chat/ - -Related projects ----------------- - -brozzler_ - Uses Google Chrome as well, but intercepts traffic using a proxy. Supports - distributed crawling and immediate playback. -Squidwarc_ - Communicates with headless Google Chrome and uses the Network API to - retrieve requests like crocoite. Supports recursive crawls and page - scrolling, but neither custom JavaScript nor distributed crawling. +See documentation_ for more information. -.. _brozzler: https://github.com/internetarchive/brozzler -.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc +.. _documentation: https://6xq.net/crocoite/ diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py deleted file mode 100644 index 26c35ce..0000000 --- a/contrib/celerycrocoite.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright (c) 2017 crocoite contributors -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
- -""" -Module for Sopel IRC bot -""" - -import os, logging, argparse -from sopel.module import nickname_commands, require_chanmsg, thread, example, require_privilege, VOICE -from sopel.tools import Identifier, SopelMemory -import celery, celery.exceptions -from celery.result import AsyncResult -from urllib.parse import urlsplit -from threading import Thread -from queue import Queue -import queue - -from crocoite import behavior, task -from crocoite.controller import defaultSettings - -def prettyTimeDelta (seconds): - """ - Pretty-print seconds to human readable string 1d 1h 1m 1s - """ - seconds = int(seconds) - days, seconds = divmod(seconds, 86400) - hours, seconds = divmod(seconds, 3600) - minutes, seconds = divmod(seconds, 60) - s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')] - s = filter (lambda x: x[0] != 0, s) - return ' '.join (map (lambda x: '{}{}'.format (*x), s)) - -def prettyBytes (b): - """ - Pretty-print bytes - """ - prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB'] - while b >= 1024 and len (prefixes) > 1: - b /= 1024 - prefixes.pop (0) - return '{:.1f} {}'.format (b, prefixes[0]) - -def setup (bot): - m = bot.memory['crocoite'] = {} - q = m['q'] = Queue () - t = m['t'] = Thread (target=celeryWorker, args=(bot, q)) - t.start () - -def shutdown (bot): - m = bot.memory['crocoite'] - q = m['q'] - t = m['t'] - q.put_nowait (None) - t.join () - -def isValidUrl (s): - url = urlsplit (s) - if url.scheme and url.netloc and url.scheme in {'http', 'https'}: - return s - raise TypeError () - -def checkCompletedJobs (bot, jobs): - delete = set () - for i, data in jobs.items (): - handle = data['handle'] - trigger = data['trigger'] - args = data['args'] - url = args['url'] - channel = trigger.sender - user = trigger.nick - if Identifier (channel) not in bot.channels: - continue - try: - result = handle.get (timeout=0.1) - stats = result['stats'] - bot.msg (channel, '{}: {} ({}) finished. {} requests, {} failed, {} received.'.format (user, url, - handle.id, stats['requests'], stats['failed'], - prettyBytes (stats['bytesRcv']))) - delete.add (handle.id) - except celery.exceptions.TimeoutError: - pass - except Exception as e: - # json serialization does not work well with exceptions. If their class - # names are unique we can still distinguish them. - ename = type (e).__name__ - if ename == 'TaskRevokedError': - bot.msg (channel, '{}: {} ({}) was revoked'.format (user, url, handle.id)) - else: - bot.msg (channel, '{} ({}) failed'.format (user, url, handle.id)) - logging.exception ('{} ({}) failed'.format (url, handle.id)) - delete.add (handle.id) - for d in delete: - del jobs[d] - -def celeryWorker (bot, q): - """ - Serialize celery operations in a single thread. 
This is a workaround for - https://github.com/celery/celery/issues/4480 - """ - - jobs = {} - - while True: - try: - item = q.get (timeout=1) - except queue.Empty: - checkCompletedJobs (bot, jobs) - continue - - if item is None: - break - action, trigger, args = item - if action == 'a': - handle = task.controller.delay (**args) - j = jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'args': args} - - # pretty-print a few selected args - showargs = { - 'idleTimeout': prettyTimeDelta (args['settings']['idleTimeout']), - 'timeout': prettyTimeDelta (args['settings']['timeout']), - 'maxBodySize': prettyBytes (args['settings']['maxBodySize']), - 'recursive': args['recursive'], - 'concurrency': args['concurrency'], - } - strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) - bot.msg (trigger.sender, '{}: {} has been queued as {} with {}'.format (trigger.nick, args['url'], handle.id, strargs)) - elif action == 'status': - if args and args in jobs: - j = jobs[args] - jtrigger = j['trigger'] - handle = j['handle'] - bot.msg (trigger.sender, '{}: {}, queued {}, by {}'.format (handle.id, - handle.status, jtrigger.time, jtrigger.nick)) - else: - bot.msg (trigger.sender, "Job not found.") - elif action == 'revoke': - if args and args in jobs: - j = jobs[args] - handle = j['handle'] - handle.revoke (terminate=True) - # response is handled above - else: - bot.msg (trigger.sender, "Job not found.") - q.task_done () - -class NonExitingArgumentParser (argparse.ArgumentParser): - def exit (self, status=0, message=None): - # should never be called - pass - - def error (self, message): - raise Exception (message) - -archiveparser = NonExitingArgumentParser (prog='a', add_help=False) -archiveparser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC', choices=[60, 1*60*60, 2*60*60]) -archiveparser.add_argument('--idle-timeout', default=10, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC', choices=[1, 10, 20, 30, 60])
-archiveparser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, defaultSettings.maxBodySize, 100*1024*1024])
-archiveparser.add_argument('--concurrency', default=1, type=int, help='Parallel workers for this job', choices=range (9))
-archiveparser.add_argument('--recursive', help='Enable recursion', choices=['0', '1', '2', '3', 'prefix'])
-archiveparser.add_argument('url', help='Website URL', type=isValidUrl)
-
-@nickname_commands ('a', 'archive')
-@require_chanmsg ()
-@require_privilege (VOICE)
-@example ('a http://example.com')
-def archive (bot, trigger):
-    """
-    Archive a URL to WARC
-    """
-
-    try:
-        args = archiveparser.parse_args (trigger.group (2).split ())
-    except Exception as e:
-        bot.reply ('{} -- {}'.format (e.args[0], archiveparser.format_usage ()))
-        return
-    if not args:
-        bot.reply ('Sorry, I don’t understand {}'.format (trigger.group (2)))
-        return
-    blacklistedBehavior = {'domSnapshot', 'screenshot'}
-    settings = dict (maxBodySize=args.maxBodySize,
-            logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout,
-            timeout=args.timeout)
-    args = dict (url=args.url,
-            enabledBehaviorNames=list (behavior.availableNames-blacklistedBehavior),
-            settings=settings, recursive=args.recursive,
-            concurrency=args.concurrency)
-    q = bot.memory['crocoite']['q']
-    q.put_nowait (('a', trigger, args))
-
-@nickname_commands ('s', 'status')
-@example ('s c251f09e-3c26-481f-96e0-4b5f58bd1170')
-@require_chanmsg ()
-def status (bot, trigger):
-    """
-    Retrieve status for a job
-    """
-
-    i = trigger.group(2)
-    q = bot.memory['crocoite']['q']
-    q.put_nowait (('status', trigger, i))
-
-@nickname_commands ('r', 'revoke')
-@example ('r c251f09e-3c26-481f-96e0-4b5f58bd1170')
-@require_privilege (VOICE)
-@require_chanmsg ()
-def revoke (bot, trigger):
-    """
-    Cancel (revoke) a job
-    """
-
-    i = trigger.group(2)
-    q = bot.memory['crocoite']['q']
-    q.put_nowait (('revoke', trigger, i))
-
diff --git a/contrib/chromebot.json b/contrib/chromebot.json
new file mode 100644
index 0000000..214b770
--- /dev/null
+++ b/contrib/chromebot.json
@@ -0,0 +1,16 @@
+{
+    "irc": {
+        "host": "irc.example.com",
+        "port": 6667,
+        "ssl": false,
+        "nick": "chromebot",
+        "channels": ["#testchannel"]
+    },
+    "tempdir": "/path/to/tmp",
+    "destdir": "/path/to/warc",
+    "process_limit": 1,
+    "blacklist": {
+        "^https?://(.+\\.)?local(host)?/": "Not acceptable"
+    },
+    "need_voice": false
+}
diff --git a/contrib/dashboard.css b/contrib/dashboard.css
new file mode 100644
index 0000000..469db78
--- /dev/null
+++ b/contrib/dashboard.css
@@ -0,0 +1,7 @@
+.jid {
+    font-family: Consolas, monospace;
+}
+.job .urls {
+    max-height: 10em;
+    overflow-y: scroll;
+}
diff --git a/contrib/dashboard.html b/contrib/dashboard.html
new file mode 100644
index 0000000..49a15bc
--- /dev/null
+++ b/contrib/dashboard.html
@@ -0,0 +1,23 @@
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <meta charset="utf-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+        <title>chromebot dashboard</title>
+        <!--<script src="https://cdn.jsdelivr.net/npm/vue@2/dist/vue.js"></script>-->
+        <script src="https://cdn.jsdelivr.net/npm/vue@2/dist/vue.min.js"></script>
+        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.7/css/bulma.min.css">
+        <link rel="stylesheet" href="dashboard.css">
+    </head>
+    <body>
+        <noscript>Please 
enable JavaScript.</noscript> + <section id="app" class="section"> + <h1 class="title">chromebot dashboard</h1> + <bot-status v-bind:jobs="jobs"></bot-status> + <div class="jobs"> + <job-item v-for="j in jobs" v-bind:job="j" v-bind:jobs="jobs" v-bind:key="j.id"></job-item> + </div> + </section> + <script src="dashboard.js"></script> + </body> +</html> diff --git a/contrib/dashboard.js b/contrib/dashboard.js new file mode 100644 index 0000000..b5520dc --- /dev/null +++ b/contrib/dashboard.js @@ -0,0 +1,129 @@ +/* configuration */ +let socket = "wss://localhost:6789/", + urllogMax = 100; + +function formatSize (bytes) { + let prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB']; + while (bytes >= 1024 && prefixes.length > 1) { + bytes /= 1024; + prefixes.shift (); + } + return bytes.toFixed (1) + ' ' + prefixes[0]; +} + +class Job { + constructor (id, url, user, queued) { + this.id = id; + this.url = url; + this.user = user; + this.status = undefined; + this.stats = {'pending': 0, 'have': 0, 'running': 0, + 'requests': 0, 'finished': 0, 'failed': 0, + 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}; + this.urllog = []; + this.queued = queued; + this.started = undefined; + this.finished = undefined; + this.aborted = undefined; + } + + addUrl (url) { + if (this.urllog.push (url) > urllogMax) { + this.urllog.shift (); + } + } +} + +let jobs = {}; +let ws = new WebSocket(socket); +ws.onmessage = function (event) { + var msg = JSON.parse (event.data); + let msgdate = new Date (Date.parse (msg.date)); + var j = undefined; + if (msg.job) { + j = jobs[msg.job]; + if (j === undefined) { + j = new Job (msg.job, 'unknown', '<unknown>', new Date ()); + Vue.set (jobs, msg.job, j); + } + } + if (msg.uuid == '36cc34a6-061b-4cc5-84a9-4ab6552c8d75') { + j = new Job (msg.job, msg.url, msg.user, msgdate); + /* jobs[msg.job] = j does not work with vue, see + https://vuejs.org/v2/guide/list.html#Object-Change-Detection-Caveats + */ + Vue.set (jobs, msg.job, j); + j.status = 'pending'; + } else if (msg.uuid == '46e62d60-f498-4ab0-90e1-d08a073b10fb') { + j.status = 'running'; + j.started = msgdate; + } else if (msg.uuid == '7b40ffbb-faab-4224-90ed-cd4febd8f7ec') { + j.status = 'finished'; + j.finished = msgdate; + } else if (msg.uuid == '865b3b3e-a54a-4a56-a545-f38a37bac295') { + j.status = 'aborted'; + j.aborted = msgdate; + } else if (msg.uuid == '5c0f9a11-dcd8-4182-a60f-54f4d3ab3687') { + /* forwarded job message */ + let rmsg = msg.data; + if (rmsg.uuid == '24d92d16-770e-4088-b769-4020e127a7ff') { + /* job status */ + Object.assign (j.stats, rmsg); + } else if (rmsg.uuid == '5b8498e4-868d-413c-a67e-004516b8452c') { + /* recursion status */ + Object.assign (j.stats, rmsg); + } else if (rmsg.uuid == 'd1288fbe-8bae-42c8-af8c-f2fa8b41794f') { + /* fetch */ + j.addUrl (rmsg.url); + } + } +}; +ws.onopen = function (event) { +}; +ws.onerror = function (event) { +}; + +Vue.component('job-item', { + props: ['job', 'jobs'], + template: '<div class="job box" :id="job.id"><ul class="columns"><li class="jid column is-narrow"><a :href="\'#\' + job.id">{{ job.id }}</a></li><li class="url column"><a :href="job.url">{{ job.url }}</a></li><li class="status column is-narrow"><job-status v-bind:job="job"></job-status></li></ul><job-stats v-bind:job="job"></job-stats></div>', +}); +Vue.component('job-status', { + props: ['job'], + template: '<span v-if="job.status == \'pending\'">queued on {{ job.queued.toLocaleString() }}</span><span v-else-if="job.status == \'aborted\'">aborted on {{ job.aborted.toLocaleString() }}</span><span 
v-else-if="job.status == \'running\'">running since {{ job.started.toLocaleString() }}</span><span v-else-if="job.status == \'finished\'">finished on {{ job.finished.toLocaleString() }}</span>'
+});
+Vue.component('job-stats', {
+    props: ['job'],
+    template: '<div><progress class="progress is-info" :value="job.stats.have" :max="job.stats.have+job.stats.pending+job.stats.running"></progress><ul class="stats columns"><li class="column">{{ job.stats.have }} <small>have</small></li><li class="column">{{ job.stats.running }} <small>running</small></li><li class="column">{{ job.stats.pending }} <small>pending</small></li><li class="column">{{ job.stats.requests }} <small>requests</small><li class="column"><filesize v-bind:value="job.stats.bytesRcv"></filesize></li></ul><job-urls v-bind:job="job"></job-urls></div>'
+});
+Vue.component('job-urls', {
+    props: ['job'],
+    template: '<ul class="urls"><li v-for="u in job.urllog">{{ u }}</li></ul>'
+});
+Vue.component('filesize', {
+    props: ['value'],
+    template: '<span class="filesize">{{ fvalue }}</span>',
+    computed: { fvalue: function () { return formatSize (this.value); } }
+});
+Vue.component('bot-status', {
+    props: ['jobs'],
+    template: '<nav class="level"><div class="level-item has-text-centered"><div><p class="heading">Pending</p><p class="title">{{ stats.pending }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Running</p><p class="title">{{ stats.running }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Finished</p><p class="title">{{ stats.finished+stats.aborted }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Transferred</p><p class="title"><filesize v-bind:value="stats.totalBytes"></filesize></p></div></div></nav>',
+    computed: {
+        stats: function () {
+            let s = {pending: 0, running: 0, finished: 0, aborted: 0, totalBytes: 0};
+            for (let k in this.jobs) {
+                let j = this.jobs[k];
+                s[j.status]++;
+                s.totalBytes += j.stats.bytesRcv;
+            }
+            return s;
+        }
+    }
+});
+
+let app = new Vue({
+    el: '#app',
+    data: {
+        jobs: jobs,
+    }
+});
+
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 95e8160..1610751 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 crocoite contributors
+# Copyright (c) 2017–2018 crocoite contributors
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -19,31 +19,74 @@
 # THE SOFTWARE.
 
 """
-Generic and per-site behavior scripts
+Behavior scripts (i.e. subclasses of Behavior) are a powerful method to
+manipulate websites loaded into Chrome. They are executed by the controller
+after the page started loading (onload), after it has been idle for a while
+(onstop) and after loading was stopped (onfinish).
+
+The scripts exercise their power either through DevTools API calls or by
+injecting JavaScript into the page context. Thus they can manipulate both the
+browser itself (DevTools; modify resolution, get DOM snapshot) as well as the
+page (JavaScript; trigger JavaScript events, call web APIs).
+
+They also emit (yield) data processable by any consumer registered to the
+controller. This allows storing captured screenshots inside WARC files, for
+instance.
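
A minimal behavior sketch (an editor's illustration, not a script shipped
with crocoite; it assumes the Behavior base class defined below and yields a
plain string, whereas the bundled behaviors yield dedicated event objects):

.. code:: python

    class LogTitle (Behavior):
        # hypothetical example behavior
        name = 'logTitle'

        async def onfinish (self):
            # run JavaScript inside the page and yield the result as an
            # event for whatever consumer the controller has registered
            result = await self.loader.tab.Runtime.evaluate (
                    expression='document.title', returnByValue=True)
            yield result['result']['value']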
""" -import logging -from io import BytesIO -from urllib.parse import urlsplit -import os.path -import pkg_resources +import asyncio, json, os.path from base64 import b64decode +from collections import OrderedDict +import pkg_resources + +from html5lib.serializer import HTMLSerializer +from yarl import URL +import yaml -from .util import randomString, packageUrl, getFormattedViewportMetrics +from .util import getFormattedViewportMetrics from . import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -from html5lib.serializer import HTMLSerializer -from warcio.statusandheaders import StatusAndHeaders +from .devtools import Crashed, TabException + +class Script: + """ A JavaScript resource """ -logger = logging.getLogger(__name__) + __slots__ = ('path', 'data') + datadir = 'data' + + def __init__ (self, path=None, encoding='utf-8'): + self.path = path + if path: + self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding) + + def __repr__ (self): + return f'<Script {self.path}>' + + def __str__ (self): + return self.data + + @property + def abspath (self): + return pkg_resources.resource_filename (__name__, + os.path.join (self.datadir, self.path)) + + @classmethod + def fromStr (cls, data, path=None): + s = Script () + s.data = data + s.path = path + return s class Behavior: + __slots__ = ('loader', 'logger') + # unique behavior name name = None - def __init__ (self, loader): + def __init__ (self, loader, logger): assert self.name is not None self.loader = loader + self.logger = logger.bind (context=type (self).__name__) def __contains__ (self, url): """ @@ -51,54 +94,97 @@ class Behavior: """ return True - def loadScript (self, path, encoding='utf-8'): - return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) + def __repr__ (self): + return f'<Behavior {self.name}>' - def useScript (self, script, encoding='utf-8'): - writer = self.loader.writer - record = writer.create_warc_record (packageUrl ('script'), 'metadata', - payload=BytesIO (script.encode (encoding)), - warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)}) - writer.write_record (record) + async def onload (self): + """ After loading the page started """ + # this is a dirty hack to make this function an async generator + return + yield # pragma: no cover - def onload (self): - """ Before loading the page """ - pass - - def onstop (self): + async def onstop (self): """ Before page loading is stopped """ - pass + return + yield # pragma: no cover - def onfinish (self): + async def onfinish (self): """ After the site has stopped loading """ - pass - -class HostnameFilter: - """ Limit behavior script to hostname """ - - hostname = None - - def __contains__ (self, url): - url = urlsplit (url) - hostname = url.hostname.split ('.')[::-1] - return hostname[:2] == self.hostname + return + yield # pragma: no cover class JsOnload (Behavior): """ Execute JavaScript on page load """ - scriptPath = None + __slots__ = ('script', 'context', 'options') - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript (self.scriptPath) - self.scriptHandle = None + scriptPath = None - def onload (self): - self.useScript (self.script) - self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier'] + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script (self.scriptPath) + self.context = 
None + # options passed to constructor + self.options = {} - def onstop (self): - self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle) + async def onload (self): + tab = self.loader.tab + yield self.script + + # This is slightly awkward, since we cannot compile the class into an + # objectId and then reference it. Therefore the script must return a + # class constructor, which is then called with a generic options + # parameter. + # XXX: is there a better way to do this? + result = await tab.Runtime.evaluate (expression=str (self.script)) + self.logger.debug ('behavior onload inject', + uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result) + exception = result.get ('exceptionDetails', None) + result = result['result'] + assert result['type'] == 'function', result + assert result.get ('subtype') != 'error', exception + constructor = result['objectId'] + + if self.options: + yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options') + + try: + result = await tab.Runtime.callFunctionOn ( + functionDeclaration='function(options){return new this(options);}', + objectId=constructor, + arguments=[{'value': self.options}]) + self.logger.debug ('behavior onload start', + uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result) + result = result['result'] + assert result['type'] == 'object', result + assert result.get ('subtype') != 'error', result + self.context = result['objectId'] + except TabException as e: + if e.args[0] == -32000: + # the site probably reloaded. ignore this, since we’ll be + # re-injected into the new site by the controller. + self.logger.error ('jsonload onload failed', + uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252', + exception=e.args) + else: + raise + + async def onstop (self): + tab = self.loader.tab + try: + assert self.context is not None + await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}', + objectId=self.context) + await tab.Runtime.releaseObject (objectId=self.context) + except TabException as e: + # cannot do anything about that. Ignoring should be fine. + self.logger.error ('jsonload onstop failed', + uuid='1786726f-c8ec-4f79-8769-30954d4e32f5', + exception=e.args, + objectId=self.context) + + return + yield # pragma: no cover ### Generic scripts ### @@ -106,24 +192,10 @@ class Scroll (JsOnload): name = 'scroll' scriptPath = 'scroll.js' - def __init__ (self, loader): - super ().__init__ (loader) - stopVarname = '__' + __package__ + '_stop__' - newStopVarname = randomString () - self.script = self.script.replace (stopVarname, newStopVarname) - self.stopVarname = newStopVarname - - def onstop (self): - super ().onstop () - # removing the script does not stop it if running - script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname) - self.useScript (script) - self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True) - class EmulateScreenMetrics (Behavior): name = 'emulateScreenMetrics' - def onstop (self): + async def onstop (self): """ Emulate different screen sizes, causing the site to fetch assets (img srcset and css, for example) for different screen resolutions. 
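
To make the JsOnload contract above concrete: the injected script's last
expression must evaluate to a class, which JsOnload instantiates via
Runtime.callFunctionOn and later asks to stop(). A sketch of such a script
(hypothetical, not one of the bundled data/*.js files; Script.fromStr is the
constructor defined above):

.. code:: python

    # Script.fromStr wraps a literal string instead of a packaged file
    exampleJs = Script.fromStr ('''
    (class {
        constructor (options) {
            /* install event listeners, timers, … */
        }
        stop () {
            /* undo everything the constructor did */
        }
    })
    ''', path='example.js')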
@@ -139,17 +211,32 @@ class EmulateScreenMetrics (Behavior): {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, # 6th gen iPhone (portrait mode) {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, - # and reset - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, ] l = self.loader tab = l.tab for s in sizes: - tab.Emulation.setDeviceMetricsOverride (**s) + self.logger.debug ('device override', + uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s) + await tab.Emulation.setDeviceMetricsOverride (**s) # give the browser time to re-eval page and start requests - l.wait (1) - # XXX: this seems to be broken, it does not clear the override - #tab.Emulation.clearDeviceMetricsOverride () + # XXX: should wait until loader is not busy any more + await asyncio.sleep (1) + self.logger.debug ('clear override', + uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d') + await tab.Emulation.clearDeviceMetricsOverride () + return + yield # pragma: no cover + +class DomSnapshotEvent: + __slots__ = ('url', 'document', 'viewport') + + def __init__ (self, url, document, viewport): + # XXX: document encoding? + assert isinstance (document, bytes) + + self.url = url + self.document = document + self.viewport = viewport class DomSnapshot (Behavior): """ @@ -157,38 +244,39 @@ class DomSnapshot (Behavior): We could use DOMSnapshot.getSnapshot here, but the API is not stable yet. Also computed styles are not really necessary here. - - XXX: Currently writes a response, when it should use “resource”. pywb - can’t handle that though. """ + __slots__ = ('script', ) + name = 'domSnapshot' - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript ('canvas-snapshot.js') + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('canvas-snapshot.js') - def onfinish (self): + async def onfinish (self): tab = self.loader.tab - writer = self.loader.writer - self.useScript (self.script) - tab.Runtime.evaluate (expression=self.script, returnByValue=True) + yield self.script + await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) - viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1, pierce=True) + viewport = await getFormattedViewportMetrics (tab) + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + self.logger.debug ('dom snapshot document', + uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom) haveUrls = set () for doc in ChromeTreeWalker (dom['root']).split (): - rawUrl = doc['documentURL'] - if rawUrl in haveUrls: + url = URL (doc['documentURL']) + if url in haveUrls: # ignore duplicate URLs. they are usually caused by # javascript-injected iframes (advertising) with no(?) 
src
-            logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
-            continue
-        url = urlsplit (rawUrl)
-        if url.scheme in ('http', 'https'):
-            logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
-            haveUrls.add (rawUrl)
+                self.logger.warning ('dom snapshot duplicate',
+                        uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
+            elif url.scheme in ('http', 'https'):
+                self.logger.debug ('dom snapshot',
+                        uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4',
+                        base=doc["baseURL"])
+                haveUrls.add (url)
             walker = ChromeTreeWalker (doc)
             # remove script, to make the page static and noscript, because at the
             # time we took the snapshot scripts were enabled
@@ -196,41 +284,89 @@ class DomSnapshot (Behavior):
             disallowedAttributes = html.eventAttributes
             stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
             serializer = HTMLSerializer ()
-            httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
-            record = writer.create_warc_record (doc['documentURL'], 'response',
-                    payload=BytesIO (serializer.render (stream, 'utf-8')),
-                    http_headers=httpHeaders,
-                    warc_headers_dict={'X-DOM-Snapshot': str (True),
-                            'X-Chrome-Viewport': viewport})
-            writer.write_record (record)
+            yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
+
+class ScreenshotEvent:
+    __slots__ = ('yoff', 'data', 'url')
+
+    def __init__ (self, url, yoff, data):
+        self.url = url
+        self.yoff = yoff
+        self.data = data
 
 class Screenshot (Behavior):
     """
     Create screenshot from tab and write it to WARC
+
+    Chrome will allocate an additional 512MB of RAM when using this plugin.
     """
 
+    __slots__ = ('script', )
+
     name = 'screenshot'
 
-    def onfinish (self):
+    # Hardcoded max texture size of 16,384 (crbug.com/770769)
+    maxDim = 16*1024
+
+    def __init__ (self, loader, logger):
+        super ().__init__ (loader, logger)
+        self.script = Script ('screenshot.js')
+
+    async def onfinish (self):
         tab = self.loader.tab
-        writer = self.loader.writer
-        # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
-        # Hardcoded max texture size of 16,384 (crbug.com/770769)
-        maxDim = 16*1024
-        metrics = tab.Page.getLayoutMetrics ()
+        # for top-level/full-screen elements with position: fixed we need to
+        # figure out their actual size (i.e. scrollHeight) and use that when
+        # overriding the viewport size.
+        # we could do this without javascript, but that would require several
+        # round-trips to Chrome or pulling down the entire DOM+computed styles
+        tab = self.loader.tab
+        yield self.script
+        result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+        assert result['result']['type'] == 'object', result
+        result = result['result']['value']
+
+        # this is required to make the browser render more than just the small
+        # actual viewport (i.e. entire page). 
see + # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805 + metrics = await tab.Page.getLayoutMetrics () contentSize = metrics['contentSize'] - width = min (contentSize['width'], maxDim) + contentHeight = max (result + [contentSize['height']]) + + override = { + 'width': 0, + 'height': 0, + 'deviceScaleFactor': 0, + 'mobile': False, + 'viewport': {'x': 0, + 'y': 0, + 'width': contentSize['width'], + 'height': contentHeight, + 'scale': 1} + } + self.logger.debug ('screenshot override', + uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override) + await tab.Emulation.setDeviceMetricsOverride (**override) + + tree = await tab.Page.getFrameTree () + try: + url = URL (tree['frameTree']['frame']['url']).with_fragment (None) + except KeyError: + self.logger.error ('frame without url', + uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree) + url = None + + width = min (contentSize['width'], self.maxDim) # we’re ignoring horizontal scroll intentionally. Most horizontal # layouts use JavaScript scrolling and don’t extend the viewport. - for yoff in range (0, contentSize['height'], maxDim): - height = min (contentSize['height'] - yoff, maxDim) + for yoff in range (0, contentHeight, self.maxDim): + height = min (contentHeight - yoff, self.maxDim) clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1} - data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data']) - url = packageUrl ('screenshot-{}-{}.png'.format (0, yoff)) - record = writer.create_warc_record (url, 'resource', - payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png'}) - writer.write_record (record) + ret = await tab.Page.captureScreenshot (format='png', clip=clip) + data = b64decode (ret['data']) + yield ScreenshotEvent (url, yoff, data) + + await tab.Emulation.clearDeviceMetricsOverride () class Click (JsOnload): """ Generic link clicking """ @@ -238,6 +374,27 @@ class Click (JsOnload): name = 'click' scriptPath = 'click.js' + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: + self.options['sites'] = list (yaml.safe_load_all (fd)) + +class ExtractLinksEvent: + __slots__ = ('links', ) + + def __init__ (self, links): + self.links = links + + def __repr__ (self): + return f'<ExtractLinksEvent {self.links!r}>' + +def mapOrIgnore (f, l): + for e in l: + try: + yield f (e) + except: + pass + class ExtractLinks (Behavior): """ Extract links from a page using JavaScript @@ -246,23 +403,37 @@ class ExtractLinks (Behavior): manually resolve relative links. """ + __slots__ = ('script', ) + name = 'extractLinks' - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript ('extract-links.js') - self.links = None + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('extract-links.js') - def onfinish (self): + async def onfinish (self): tab = self.loader.tab - self.useScript (self.script) - result = tab.Runtime.evaluate (expression=self.script, returnByValue=True) - self.links = list (set (result['result']['value'])) + yield self.script + result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) + yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value'])))) + +class Crash (Behavior): + """ Crash the browser. For testing only. Obviously. 
""" + + name = 'crash' + + async def onstop (self): + try: + await self.loader.tab.Page.crash () + except Crashed: + pass + return + yield # pragma: no cover # available behavior scripts. Order matters, move those modifying the page # towards the end of available -generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks] -perSite = [] -available = generic + perSite + [Screenshot, DomSnapshot] -availableNames = set (map (lambda x: x.name, available)) +available = [Scroll, Click, ExtractLinks, Screenshot, EmulateScreenMetrics, DomSnapshot] +#available.append (Crash) +# order matters, since behavior can modify the page (dom snapshots, for instance) +availableMap = OrderedDict (map (lambda x: (x.name, x), available)) diff --git a/crocoite/browser.py b/crocoite/browser.py index e58ebcf..3518789 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -22,91 +22,198 @@ Chrome browser interactions. """ -import logging -from urllib.parse import urlsplit -from base64 import b64decode -import pychrome +import asyncio +from base64 import b64decode, b64encode +from datetime import datetime, timedelta +from http.server import BaseHTTPRequestHandler -class Item: - """ - Simple wrapper containing Chrome request and response - """ +from yarl import URL +from multidict import CIMultiDict - def __init__ (self, tab): - self.tab = tab - self.chromeRequest = None - self.chromeResponse = None - self.chromeFinished = None +from .logger import Level +from .devtools import Browser, TabException - def __repr__ (self): - return '<Item {}>'.format (self.request['url']) +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): + def __new__ (cls, value): + return bytes.__new__ (cls, b64decode (value)) - @property - def request (self): - return self.chromeRequest['request'] + @classmethod + def fromBytes (cls, b): + """ For testing """ + return cls (b64encode (b)) - @property - def response (self): - return self.chromeResponse['response'] +class UnicodeBody (bytes): + def __new__ (cls, value): + if type (value) is not str: + raise TypeError ('expecting unicode string') - @property - def initiator (self): - return self.chromeRequest['initiator'] + return bytes.__new__ (cls, value.encode ('utf-8')) - @property - def id (self): - return self.chromeRequest['requestId'] +class Request: + __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') - @property - def encodedDataLength (self): - return self.chromeFinished['encodedDataLength'] + def __init__ (self, method=None, headers=None, body=None): + self.headers = headers + self.body = body + self.hasPostData = False + self.initiator = None + # HTTP method + self.method = method + self.timestamp = None - @property - def url (self): - return self.response['url'] + def __repr__ (self): + return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Request): + raise TypeError ('Can only compare equality with Request.') + + # do not compare hasPostData (only required to fetch body) and + # timestamp (depends on time) + return self.headers == b.headers and \ + self.body == b.body and \ + self.initiator == b.initiator and \ + self.method == b.method + +class Response: + __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived', + 'timestamp', 'mimeType') + + def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): + self.status 
= status
+        self.statusText = statusText
+        self.headers = headers
+        self.body = body
+        # bytes received over the network (not body size!)
+        self.bytesReceived = 0
+        self.timestamp = None
+        self.mimeType = mimeType
 
-    @property
-    def parsedUrl (self):
-        return urlsplit (self.url)
+    def __repr__ (self):
+        return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+    def __eq__ (self, b):
+        if b is None:
+            return False
+
+        if not isinstance (b, Response):
+            raise TypeError ('Can only compare equality with Response.')
+
+        # do not compare bytesReceived (depends on network) and timestamp
+        # (depends on time)
+        return self.status == b.status and \
+                self.statusText == b.statusText and \
+                self.headers == b.headers and \
+                self.body == b.body and \
+                self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+    """ Map relative timestamp to absolute timestamp """
+
+    def __init__ (self, relative, absolute):
+        self.relative = timedelta (seconds=relative)
+        self.absolute = datetime.utcfromtimestamp (absolute)
+
+    def __call__ (self, relative):
+        if not isinstance (relative, timedelta):
+            relative = timedelta (seconds=relative)
+        return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+    __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+            'protocol', 'resourceType', '_time')
+
+    def __init__ (self, id=None, url=None, request=None, response=None):
+        self.request = request
+        self.response = response
+        self.id = id
+        self.url = url
+        self.remoteIpAddress = None
+        self.protocol = None
+        self.resourceType = None
+        self._time = None
 
-    @property
-    def body (self):
-        """ Return response body or None """
-        try:
-            body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=60)
-            rawBody = body['body']
-            base64Encoded = body['base64Encoded']
-            if base64Encoded:
-                rawBody = b64decode (rawBody)
-            else:
-                rawBody = rawBody.encode ('utf8')
-            return rawBody, base64Encoded
-        except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException):
-            raise ValueError ('Cannot fetch response body')
-
-    @property
-    def requestBody (self):
-        """ Get request/POST body """
-        req = self.request
-        postData = req.get ('postData')
-        if postData:
-            return postData.encode ('utf8'), False
-        elif req.get ('hasPostData', False):
-            try:
-                return b64decode (self.tab.Network.getRequestPostData (requestId=self.id, _timeout=60)['postData']), True
-            except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException):
-                raise ValueError ('Cannot fetch request body')
-        return None, False
+    def __repr__ (self):
+        return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+    def __eq__ (self, b):
+        if not isinstance (b, RequestResponsePair):
+            raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+        # do not compare id and _time. 
These depend on external factors and do + # not influence the request/response *content* + return self.request == b.request and \ + self.response == b.response and \ + self.url == b.url and \ + self.remoteIpAddress == b.remoteIpAddress and \ + self.protocol == b.protocol and \ + self.resourceType == b.resourceType + + def fromRequestWillBeSent (self, req): + """ Set request data from Chrome Network.requestWillBeSent event """ + r = req['request'] + + self.id = req['requestId'] + self.url = URL (r['url']) + self.resourceType = req.get ('type') + self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + + assert self.request is None, req + self.request = Request () + self.request.initiator = req['initiator'] + self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.request.hasPostData = r.get ('hasPostData', False) + self.request.method = r['method'] + self.request.timestamp = self._time (req['timestamp']) + if self.request.hasPostData: + postData = r.get ('postData') + if postData is not None: + self.request.body = UnicodeBody (postData) + + def fromResponse (self, r, timestamp=None, resourceType=None): + """ + Set response data from Chrome’s Response object. + + Request must exist. Updates if response was set before. Sometimes + fromResponseReceived is triggered twice by Chrome. No idea why. + """ + assert self.request is not None, (self.request, r) + + if not timestamp: + timestamp = self.request.timestamp + + self.remoteIpAddress = r.get ('remoteIPAddress') + self.protocol = r.get ('protocol') + if resourceType: + self.resourceType = resourceType + + # a response may contain updated request headers (i.e. those actually + # sent over the wire) + if 'requestHeaders' in r: + self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) + + self.response = Response () + self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.response.status = r['status'] + self.response.statusText = r['statusText'] + self.response.timestamp = timestamp + self.response.mimeType = r['mimeType'] - @property - def requestHeaders (self): - # the response object may contain refined headers, which were - # *actually* sent over the wire - return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) + def fromResponseReceived (self, resp): + """ Set response data from Chrome Network.responseReceived """ + return self.fromResponse (resp['response'], + self._time (resp['timestamp']), resp['type']) - @property - def responseHeaders (self): - return self._unfoldHeaders (self.response['headers']) + def fromLoadingFinished (self, data): + self.response.bytesReceived = data['encodedDataLength'] + + def fromLoadingFailed (self, data): + self.response = None @staticmethod def _unfoldHeaders (headers): @@ -120,14 +227,46 @@ class Item: items.append ((k, v)) return items - def setRequest (self, req): - self.chromeRequest = req + async def prefetchRequestBody (self, tab): + if self.request.hasPostData and self.request.body is None: + try: + postData = await tab.Network.getRequestPostData (requestId=self.id) + self.request.body = UnicodeBody (postData['postData']) + except TabException: + self.request.body = None + + async def prefetchResponseBody (self, tab): + """ Fetch response body """ + try: + body = await tab.Network.getResponseBody (requestId=self.id) + if body['base64Encoded']: + self.response.body = Base64Body (body['body']) + else: + self.response.body = UnicodeBody (body['body']) + except TabException: + 
self.response.body = None - def setResponse (self, resp): - self.chromeResponse = resp +class NavigateError (IOError): + pass - def setFinished (self, finished): - self.chromeFinished = finished +class PageIdle: + """ Page idle event """ + + __slots__ = ('idle', ) + + def __init__ (self, idle): + self.idle = idle + + def __bool__ (self): + return self.idle + +class FrameNavigated: + __slots__ = ('id', 'url', 'mimeType') + + def __init__ (self, id, url, mimeType): + self.id = id + self.url = URL (url) + self.mimeType = mimeType class SiteLoader: """ @@ -136,505 +275,245 @@ class SiteLoader: XXX: track popup windows/new tabs and close them """ + __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', + '_framesLoading', '_rootFrame') allowedSchemes = {'http', 'https'} - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): + def __init__ (self, browser, logger): self.requests = {} - self.browser = browser - self.url = url - self.logger = logger + self.browser = Browser (url=browser) + self.logger = logger.bind (context=type (self).__name__) + self._iterRunning = [] - self.tab = browser.new_tab() + self._framesLoading = set () + self._rootFrame = None - def __enter__ (self): - tab = self.tab - # setup callbacks - tab.Network.requestWillBeSent = self._requestWillBeSent - tab.Network.responseReceived = self._responseReceived - tab.Network.loadingFinished = self._loadingFinished - tab.Network.loadingFailed = self._loadingFailed - tab.Log.entryAdded = self._entryAdded - #tab.Page.loadEventFired = loadEventFired - tab.Page.javascriptDialogOpening = self._javascriptDialogOpening - - # start the tab - tab.start() + async def __aenter__ (self): + tab = self.tab = await self.browser.__aenter__ () # enable events - tab.Log.enable () - tab.Network.enable() - tab.Page.enable () - tab.Network.clearBrowserCache () - if tab.Network.canClearBrowserCookies ()['result']: - tab.Network.clearBrowserCookies () - + await asyncio.gather (*[ + tab.Log.enable (), + tab.Network.enable(), + tab.Page.enable (), + tab.Inspector.enable (), + tab.Network.clearBrowserCache (), + tab.Network.clearBrowserCookies (), + ]) return self + async def __aexit__ (self, exc_type, exc_value, traceback): + for task in self._iterRunning: + # ignore any results from stuff we did not end up using anyway + if not task.done (): + task.cancel () + self._iterRunning = [] + await self.browser.__aexit__ (exc_type, exc_value, traceback) + self.tab = None + return False + def __len__ (self): return len (self.requests) - def start (self): - self.tab.Page.navigate(url=self.url) - - def wait (self, timeout=1): - self.tab.wait (timeout) - - def waitIdle (self, idleTimeout=1, maxTimeout=60): - step = 0 - for i in range (0, maxTimeout): - self.wait (1) - if len (self) == 0: - step += 1 - if step > idleTimeout: - break - else: - step = 0 - - def stop (self): - """ - Stop loading site - - XXX: stop executing scripts - """ - + async def __aiter__ (self): + """ Retrieve network items """ tab = self.tab + assert tab is not None + handler = { + tab.Network.requestWillBeSent: self._requestWillBeSent, + tab.Network.responseReceived: self._responseReceived, + tab.Network.loadingFinished: self._loadingFinished, + tab.Network.loadingFailed: self._loadingFailed, + tab.Log.entryAdded: self._entryAdded, + tab.Page.javascriptDialogOpening: self._javascriptDialogOpening, + tab.Page.frameStartedLoading: self._frameStartedLoading, + tab.Page.frameStoppedLoading: self._frameStoppedLoading, + tab.Page.frameNavigated: self._frameNavigated, + } 
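
Before the event-dispatch loop below, a consumer-side sketch of how
SiteLoader is driven (illustrative; browserUrl stands for the DevTools
endpoint URL and is not defined here):

.. code:: python

    async with SiteLoader (browserUrl, logger) as l:
        await l.navigate ('http://example.com/')
        async for item in l:
            # items are RequestResponsePair instances and similar events
            ...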
+ + # The implementation is a little advanced. Why? The goal here is to + # process events from the tab as quickly as possible (i.e. + # asynchronously). We need to make sure that JavaScript dialogs are + # handled immediately for instance. Otherwise they stall every + # other request. Also, we don’t want to use an unbounded queue, + # since the items yielded can get quite big (response body). Thus + # we need to block (yield) for every item completed, but not + # handled by the consumer (caller). + running = self._iterRunning + tabGetTask = asyncio.ensure_future (self.tab.get ()) + running.append (tabGetTask) + while True: + done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED) + for t in done: + result = t.result () + if result is None: + pass + elif t == tabGetTask: + method, data = result + f = handler.get (method, None) + if f is not None: + task = asyncio.ensure_future (f (**data)) + pending.add (task) + tabGetTask = asyncio.ensure_future (self.tab.get ()) + pending.add (tabGetTask) + else: + yield result - tab.Page.stopLoading () - tab.Network.disable () - tab.Page.disable () - tab.Log.disable () - # XXX: we can’t drain the event queue directly, so insert (yet another) wait - tab.wait (1) - tab.Network.requestWillBeSent = None - tab.Network.responseReceived = None - tab.Network.loadingFinished = None - tab.Network.loadingFailed = None - tab.Page.loadEventFired = None - tab.Page.javascriptDialogOpening = None - tab.Log.entryAdded = None - - def __exit__ (self, exc_type, exc_value, traceback): - self.tab.stop () - self.browser.close_tab(self.tab) - return False - - # overrideable callbacks - def loadingFinished (self, item, redirect=False): - pass + running = pending + self._iterRunning = running - def loadingFailed (self, item): - pass + async def navigate (self, url): + ret = await self.tab.Page.navigate(url=url) + self.logger.debug ('navigate', + uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) + if 'errorText' in ret: + raise NavigateError (ret['errorText']) + self._rootFrame = ret['frameId'] # internal chrome callbacks - def _requestWillBeSent (self, **kwargs): + async def _requestWillBeSent (self, **kwargs): + self.logger.debug ('requestWillBeSent', + uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs) + reqId = kwargs['requestId'] req = kwargs['request'] + url = URL (req['url']) + logger = self.logger.bind (reqId=reqId, reqUrl=url) - url = urlsplit (req['url']) if url.scheme not in self.allowedSchemes: return + ret = None item = self.requests.get (reqId) if item: # redirects never “finish” loading, but yield another requestWillBeSent with this key set redirectResp = kwargs.get ('redirectResponse') if redirectResp: - # create fake responses - resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} - item.setResponse (resp) - resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} - item.setFinished (resp) - self.loadingFinished (item, redirect=True) - self.logger.info ('redirected request {} has url {}'.format (reqId, req['url'])) + if item.url != url: + # this happens for unknown reasons. the docs simply state + # it can differ in case of a redirect. Fix it and move on. + logger.warning ('redirect url differs', + uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', + expected=item.url) + redirectResp['url'] = str (item.url) + item.fromResponse (redirectResp) + logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) + # XXX: queue this? 
no need to wait for it + await item.prefetchRequestBody (self.tab) + # cannot fetch response body due to race condition (item id reused) + ret = item else: - self.logger.warning ('request {} already exists, overwriting.'.format (reqId)) + logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') - item = Item (self.tab) - item.setRequest (kwargs) + item = RequestResponsePair () + item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item - def _responseReceived (self, **kwargs): + return ret + + async def _responseReceived (self, **kwargs): + self.logger.debug ('responseReceived', + uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.get (reqId) if item is None: return resp = kwargs['response'] - url = urlsplit (resp['url']) + url = URL (resp['url']) + logger = self.logger.bind (reqId=reqId, respUrl=url) + if item.url != url: + logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: - self.logger.info ('response {} {}'.format (reqId, resp['url'])) - item.setResponse (kwargs) + item.fromResponseReceived (kwargs) else: - self.logger.warning ('response: ignoring scheme {}'.format (url.scheme)) + logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') - def _loadingFinished (self, **kwargs): + async def _loadingFinished (self, **kwargs): """ Item was fully loaded. For some items the request body is not available when responseReceived is fired, thus move everything here. """ + self.logger.debug ('loadingFinished', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.pop (reqId, None) if item is None: # we never recorded this request (blacklisted scheme, for example) return + if not item.response: + # chrome failed to send us a responseReceived event for this item, + # so we can’t record it (missing request/response headers) + self.logger.error ('response missing', + uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id) + return + req = item.request - resp = item.response - assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url']) - url = urlsplit (resp['url']) - if url.scheme in self.allowedSchemes: - self.logger.info ('finished {} {}'.format (reqId, req['url'])) - item.setFinished (kwargs) - self.loadingFinished (item) + if item.url.scheme in self.allowedSchemes: + item.fromLoadingFinished (kwargs) + # XXX queue both + await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) + return item + + async def _loadingFailed (self, **kwargs): + self.logger.info ('loadingFailed', + uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs) - def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] - self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) + logger = self.logger.bind (reqId=reqId) item = self.requests.pop (reqId, None) - self.loadingFailed (item) + if item is not None: + item.fromLoadingFailed (kwargs) + return item - def _entryAdded (self, **kwargs): + async def _entryAdded (self, **kwargs): """ Log entry added """ entry = kwargs['entry'] - level = {'verbose': logging.DEBUG, 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR}[entry['level']] - self.logger.log (level, 'console: {}: {}'.format (entry['source'], entry['text']), extra={'raw': entry}) + level = {'verbose': 
Level.DEBUG, 'info': Level.INFO, + 'warning': Level.WARNING, + 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO) + entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2' + self.logger (level, 'console', **entry) - def _javascriptDialogOpening (self, **kwargs): + async def _javascriptDialogOpening (self, **kwargs): t = kwargs.get ('type') if t in {'alert', 'confirm', 'prompt'}: - self.logger.info ('javascript opened a dialog: {}, {}, canceling'.format (t, kwargs.get ('message'))) - self.tab.Page.handleJavaScriptDialog (accept=False) + self.logger.info ('js dialog', + uuid='d6f07ce2-648e-493b-a1df-f353bed27c84', + action='cancel', type=t, message=kwargs.get ('message')) + await self.tab.Page.handleJavaScriptDialog (accept=False) elif t == 'beforeunload': # we must accept this one, otherwise the page will not unload/close - self.logger.info ('javascript opened a dialog: {}, {}, procceeding'.format (t, kwargs.get ('message'))) - self.tab.Page.handleJavaScriptDialog (accept=True) - else: - self.logger.warning ('unknown javascript dialog type {}'.format (t)) - -class AccountingSiteLoader (SiteLoader): - """ - SiteLoader that keeps basic statistics about retrieved pages. - """ - - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): - super ().__init__ (browser, url, logger) - - self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} - - def loadingFinished (self, item, redirect=False): - super ().loadingFinished (item, redirect) - - self.stats['finished'] += 1 - self.stats['bytesRcv'] += item.encodedDataLength - - def loadingFailed (self, item): - super ().loadingFailed (item) - - self.stats['failed'] += 1 - - def _requestWillBeSent (self, **kwargs): - super ()._requestWillBeSent (**kwargs) - - self.stats['requests'] += 1 - -import subprocess -from tempfile import mkdtemp -import socket, shutil - -class ChromeService: - """ - Start Chrome with socket activation (i.e. pass listening socket). Polling - is not required with this method, since reads will block until Chrome is - ready. 
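For reference, the socket activation used by the removed ``ChromeService`` boils down to binding the DevTools port ourselves and handing the already-listening socket to Chrome; a client connecting to it blocks until Chrome accepts, so no readiness polling is needed. A condensed sketch using the same flags as the code below:

.. code:: python

    import socket, subprocess

    s = socket.socket ()
    s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind (('localhost', 9222))
    s.listen (10)
    # Chrome inherits the listening socket; connects block until it is ready
    p = subprocess.Popen (['google-chrome-stable', '--headless',
            '--remote-debugging-socket-fd={}'.format (s.fileno ()),
            'about:blank'],
            pass_fds=[s.fileno ()], start_new_session=True)
    s.close ()  # the child owns the socket now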
- """ - - def __init__ (self, binary='google-chrome-stable', host='localhost', port=9222, windowSize=(1920, 1080)): - self.binary = binary - self.host = host - self.port = port - self.windowSize = windowSize - self.p = None - - def __enter__ (self): - assert self.p is None - - port = self.port - while True: - s = socket.socket () - s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - try: - s.bind ((self.host, port)) - break - except OSError: - # try different port - if port < 65000: - port += 1 - else: - raise - s.listen (10) - self.userDataDir = mkdtemp () - args = [self.binary, - '--window-size={},{}'.format (*self.windowSize), - '--user-data-dir={}'.format (self.userDataDir), # use temporory user dir - '--no-default-browser-check', - '--no-first-run', # don’t show first run screen - '--disable-breakpad', # no error reports - '--disable-extensions', - '--disable-infobars', - '--disable-notifications', # no libnotify - '--headless', - '--disable-gpu', - '--hide-scrollbars', # hide scrollbars on screenshots - '--mute-audio', # don’t play any audio - '--remote-debugging-socket-fd={}'.format (s.fileno ()), - '--homepage=about:blank', - 'about:blank'] - # start new session, so ^C does not affect subprocess - self.p = subprocess.Popen (args, pass_fds=[s.fileno()], start_new_session=True, - stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - s.close () - - return 'http://{}:{}'.format (self.host, port) - - def __exit__ (self, *exc): - self.p.terminate () - self.p.wait () - shutil.rmtree (self.userDataDir) - self.p = None - -class NullService: - def __init__ (self, url): - self.url = url - - def __enter__ (self): - return self.url - - def __exit__ (self, *exc): - pass - -### tests ### - -import unittest, time -from http.server import BaseHTTPRequestHandler - -class TestHTTPRequestHandler (BaseHTTPRequestHandler): - encodingTestString = { - 'latin1': 'äöü', - 'utf-8': 'äöü', - 'ISO-8859-1': 'äöü', - } - binaryTestData = b'\x00\x01\x02' - # 1×1 pixel PNG - imageTestData = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82' - htmlTestData = '<html><body><img src="/image"><img src="/nonexistent"></body></html>' - alertData = '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><img src="/image"></body></html>' - - def do_GET(self): - path = self.path - if path.startswith ('/redirect/301'): - self.send_response(301) - self.send_header ('Location', path[13:]) - self.end_headers() - elif path == '/empty': - self.send_response (200) - self.end_headers () - elif path.startswith ('/encoding'): - # send text data with different encodings - _, _, encoding = path.split ('/', 3) - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset={}'.format (encoding)) - self.end_headers () - self.wfile.write (self.encodingTestString[encoding].encode (encoding)) - elif path == '/binary': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 'application/octet-stream') - self.send_header ('Content-Length', len (self.binaryTestData)) - self.end_headers () - self.wfile.write (self.binaryTestData) - elif path == '/image': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 
'image/png') - self.end_headers () - self.wfile.write (self.imageTestData) - elif path == '/attachment': - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset=utf-8') - self.send_header ('Content-Disposition', 'attachment; filename="attachment.txt"') - self.end_headers () - self.wfile.write (self.encodingTestString['utf-8'].encode ('utf-8')) - elif path == '/html': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.htmlTestData.encode ('utf-8')) - elif path == '/alert': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.alertData.encode ('utf-8')) - else: - self.send_response (404) - self.end_headers () - - def log_message (self, format, *args): - pass - -def startServer (): - import http.server - PORT = 8000 - httpd = http.server.HTTPServer (("localhost", PORT), TestHTTPRequestHandler) - httpd.serve_forever() - -class TestSiteLoaderAdapter (SiteLoader): - def __init__ (self, browser, url): - SiteLoader.__init__ (self, browser, url) - self.finished = [] - - def loadingFinished (self, item, redirect=False): - self.finished.append (item) - -class TestSiteLoader (unittest.TestCase): - def setUp (self): - from multiprocessing import Process - self.server = Process (target=startServer) - self.server.start () - self.baseurl = 'http://localhost:8000/' - self.service = ChromeService () - browserUrl = self.service.__enter__ () - self.browser = pychrome.Browser(url=browserUrl) - - def buildAdapter (self, path): - return TestSiteLoaderAdapter (self.browser, '{}{}'.format (self.baseurl, path)) - - def assertUrls (self, l, expect): - urls = set (map (lambda x: x.parsedUrl.path, l.finished)) - expect = set (expect) - self.assertEqual (urls, expect) - - def test_wait (self): - waittime = 2 - with self.buildAdapter ('empty') as l: - l.start () - before = time.time () - l.wait (waittime) - after = time.time () - self.assertTrue ((after-before) >= waittime) - - def test_empty (self): - with self.buildAdapter ('empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - - def test_redirect301 (self): - with self.buildAdapter ('redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 2) - self.assertUrls (l, ['/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path == '/redirect/301/empty': - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') - - def test_redirect301multi (self): - with self.buildAdapter ('redirect/301/redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/redirect/301/redirect/301/empty', '/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path in {'/redirect/301/empty', \ - '/redirect/301/redirect/301/empty'}: - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') - - def test_encoding (self): - """ Text responses are transformed to UTF-8. Make sure this works - correctly. 
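In isolation, the invariant this test checks is: whatever charset the server declares, the recorded body is the decoded text re-encoded as UTF-8. A small illustration:

.. code:: python

    s = 'äöü'
    for encoding in ('latin1', 'utf-8', 'ISO-8859-1'):
        wire = s.encode (encoding)                       # bytes on the wire
        stored = wire.decode (encoding).encode ('utf-8') # bytes in the archive
        assert stored == 'äöü'.encode ('utf-8')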
""" - for encoding, expected in TestHTTPRequestHandler.encodingTestString.items (): - with self.buildAdapter ('encoding/{}'.format (encoding)) as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/encoding/{}'.format (encoding)]) - self.assertEqual (l.finished[0].body[0], expected.encode ('utf8')) - - def test_binary (self): - """ Browser should ignore content it cannot display (i.e. octet-stream) """ - with self.buildAdapter ('binary') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) - - def test_image (self): - """ Images should be displayed inline """ - with self.buildAdapter ('image') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/image']) - self.assertEqual (l.finished[0].body[0], TestHTTPRequestHandler.imageTestData) - - def test_attachment (self): - """ And downloads won’t work in headless mode """ - with self.buildAdapter ('attachment') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) - - def test_html (self): - with self.buildAdapter ('html') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/html', '/image', '/nonexistent']) - for item in l.finished: - if item.parsedUrl.path == '/html': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.htmlTestData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - elif item.parsedUrl.path == '/nonexistent': - self.assertEqual (item.response['status'], 404) - else: - self.fail ('unknown url') - - def test_alert (self): - with self.buildAdapter ('alert') as l: - l.start () - l.waitIdle () - self.assertUrls (l, ['/alert', '/image']) - for item in l.finished: - if item.parsedUrl.path == '/alert': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.alertData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - else: - self.fail ('unknown url') - - def tearDown (self): - self.service.__exit__ (None, None, None) - self.server.terminate () - self.server.join () - -if __name__ == '__main__': - import sys - if sys.argv[1] == 'server': - startServer () + self.logger.info ('js dialog', + uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd', + action='proceed', type=t, message=kwargs.get ('message')) + await self.tab.Page.handleJavaScriptDialog (accept=True) + else: # pragma: no cover + self.logger.warning ('js dialog unknown', + uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) + + async def _frameStartedLoading (self, **kwargs): + self.logger.debug ('frameStartedLoading', + uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs) + + self._framesLoading.add (kwargs['frameId']) + return PageIdle (False) + + async def _frameStoppedLoading (self, **kwargs): + self.logger.debug ('frameStoppedLoading', + uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs) + + self._framesLoading.remove (kwargs['frameId']) + if not self._framesLoading: + return PageIdle (True) + + async def _frameNavigated (self, **kwargs): + self.logger.debug ('frameNavigated', + uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs) + frame = kwargs['frame'] + if self._rootFrame == frame['id']: + assert frame.get 
('parentId', None) is None, "root frame must not have a parent" + return FrameNavigated (frame['id'], frame['url'], frame['mimeType']) diff --git a/crocoite/cli.py b/crocoite/cli.py index f6454da..04bbb19 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,72 +22,235 @@ Command line interface """ -import logging, argparse, json, sys +import argparse, sys, signal, asyncio, os, json +from traceback import TracebackException +from enum import IntEnum +from yarl import URL +from http.cookies import SimpleCookie +import pkg_resources +try: + import manhole + manhole.install (patch_fork=False, oneshot_on='USR1') +except ModuleNotFoundError: + pass -from . import behavior -from .controller import RecursiveController, defaultSettings, \ - ControllerSettings, DepthLimit, PrefixLimit -from .browser import NullService, ChromeService +from . import behavior, browser +from .controller import SinglePageController, \ + ControllerSettings, StatsHandler, LogHandler, \ + RecursiveController, DepthLimit, PrefixLimit +from .devtools import Passthrough, Process +from .warc import WarcHandler +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \ + WarcHandlerConsumer, Level +from .devtools import Crashed -def parseRecursive (recursive, url): +def absurl (s): + """ argparse: Absolute URL """ + u = URL (s) + if u.is_absolute (): + return u + raise argparse.ArgumentTypeError ('Must be absolute') + +def cookie (s): + """ argparse: Cookie """ + c = SimpleCookie (s) + # for some reason the constructor does not raise an exception if the cookie + # supplied is invalid. It’ll simply be empty. + if len (c) != 1: + raise argparse.ArgumentTypeError ('Invalid cookie') + # we want a single Morsel + return next (iter (c.values ())) + +def cookiejar (f): + """ argparse: Cookies from file """ + cookies = [] + try: + with open (f, 'r') as fd: + for l in fd: + l = l.lstrip () + if l and not l.startswith ('#'): + cookies.append (cookie (l)) + except FileNotFoundError: + raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist') + return cookies + +class SingleExitStatus(IntEnum): + """ Exit status for single-shot command line """ + Ok = 0 + Fail = 1 + BrowserCrash = 2 + Navigate = 3 + +def single (): + parser = argparse.ArgumentParser(description='crocoite helper tools to fetch individual pages.') + parser.add_argument('--browser', help='DevTools URL', type=absurl, metavar='URL') + parser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC') + parser.add_argument('--idle-timeout', default=30, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC') + parser.add_argument('--behavior', help='Enable behavior script', + dest='enabledBehaviorNames', + default=list (behavior.availableMap.keys ()), + choices=list (behavior.availableMap.keys ()), + metavar='NAME', nargs='*') + parser.add_argument('--warcinfo', help='Add extra information to warcinfo record', + metavar='JSON', type=json.loads) + # re-using curl’s short/long switch names whenever possible + parser.add_argument('-k', '--insecure', + action='store_true', + help='Disable certificate validation') + parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE', + action='append', default=[], help='Cookies in Set-Cookie format.') + parser.add_argument ('-c', '--cookie-jar', dest='cookieJar', + type=cookiejar, metavar='FILE', + default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'), + help='Cookie jar file, read-only.') + parser.add_argument('url', help='Website URL', type=absurl, metavar='URL') + parser.add_argument('output', help='WARC filename', metavar='FILE') + + args = parser.parse_args () + + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) + + ret = SingleExitStatus.Fail + service = Process () + if args.browser: + service = Passthrough (args.browser) + settings = ControllerSettings ( + idleTimeout=args.idleTimeout, + timeout=args.timeout, + insecure=args.insecure, + cookies=args.cookieJar + args.cookie, + ) + with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: + logger.connect (WarcHandlerConsumer (warcHandler)) + handler = [StatsHandler (), LogHandler (logger), warcHandler] + b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) + controller = SinglePageController (url=args.url, settings=settings, + service=service, handler=handler, behavior=b, logger=logger, + warcinfo=args.warcinfo) + try: + loop = asyncio.get_event_loop() + run = asyncio.ensure_future (controller.run ()) + stop = lambda signum: run.cancel () + loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) + loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) + loop.run_until_complete(run) + loop.close() + ret = SingleExitStatus.Ok + except Crashed: + ret = SingleExitStatus.BrowserCrash + except asyncio.CancelledError: + # don’t log this one + pass + except browser.NavigateError: + ret = SingleExitStatus.Navigate + except Exception as e: + ret = SingleExitStatus.Fail + logger.error ('cli exception', + uuid='7fd69858-ecaa-4225-b213-8ab880aa3cc5', + traceback=list (TracebackException.from_exception (e).format ())) + finally: + r = handler[0].stats + logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) + logger.info ('exit', context='cli', uuid='9b1bd603-f7cd-4745-895a-5b894a5166f2', status=ret) + + return ret + +def parsePolicy (recursive, url): if recursive is None: return DepthLimit (0) elif recursive.isdigit (): return DepthLimit (int (recursive)) elif recursive == 'prefix': return PrefixLimit (url) - else: - raise ValueError ('Unsupported') + raise argparse.ArgumentTypeError ('Unsupported recursion mode') + +def recursive (): + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) -def main (): parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') - parser.add_argument('--browser', help='DevTools URL', metavar='URL') - parser.add_argument('--recursive', help='Follow links recursively') - parser.add_argument('--concurrency', '-j', type=int, default=1) - 
parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') - parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') - parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES') - parser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') - parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts', - dest='enabledBehaviorNames', - default=list (behavior.availableNames), - choices=list (behavior.availableNames)) - group = parser.add_mutually_exclusive_group (required=True) - group.add_argument('--output', help='WARC filename', metavar='FILE') - group.add_argument('--distributed', help='Use celery worker', action='store_true') - parser.add_argument('url', help='Website URL') + parser.add_argument('-j', '--concurrency', + help='Run at most N jobs concurrently', metavar='N', default=1, + type=int) + parser.add_argument('-r', '--recursion', help='Recursion policy', + metavar='POLICY') + parser.add_argument('--tempdir', help='Directory for temporary files', + metavar='DIR') + parser.add_argument('url', help='Seed URL', type=absurl, metavar='URL') + parser.add_argument('output', + help='Output file, supports templates {host}, {date} and {seqnum}', + metavar='FILE') + parser.add_argument('command', + help='Fetch command, supports templates {url} and {dest}', + metavar='CMD', nargs='*', + default=['crocoite-single', '{url}', '{dest}']) args = parser.parse_args () + try: + policy = parsePolicy (args.recursion, args.url) + except argparse.ArgumentTypeError as e: + parser.error (str (e)) - if args.distributed: - if args.browser: - parser.error ('--browser is not supported for distributed jobs') - from . 
import task - settings = dict (maxBodySize=args.maxBodySize, - logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, - timeout=args.timeout) - result = task.controller.delay (url=args.url, settings=settings, - enabledBehaviorNames=args.enabledBehaviorNames, - recursive=args.recursive, concurrency=args.concurrency) - r = result.get () - else: - logging.basicConfig (level=logging.INFO) + try: + controller = RecursiveController (url=args.url, output=args.output, + command=args.command, logger=logger, policy=policy, + tempdir=args.tempdir, concurrency=args.concurrency) + except ValueError as e: + parser.error (str (e)) - try: - recursionPolicy = parseRecursive (args.recursive, args.url) - except ValueError: - parser.error ('Invalid argument for --recursive') - service = ChromeService () - if args.browser: - service = NullService (args.browser) - settings = ControllerSettings (maxBodySize=args.maxBodySize, - logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, - timeout=args.timeout) - with open (args.output, 'wb') as fd: - controller = RecursiveController (args.url, fd, settings=settings, - recursionPolicy=recursionPolicy, service=service) - r = controller.run () - json.dump (r, sys.stdout) - - return True + run = asyncio.ensure_future (controller.run ()) + loop = asyncio.get_event_loop() + stop = lambda signum: run.cancel () + loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) + loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) + try: + loop.run_until_complete(run) + except asyncio.CancelledError: + pass + finally: + loop.close() + + return 0 + +def irc (): + import json, re + from .irc import Chromebot + + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) + + parser = argparse.ArgumentParser(description='IRC bot.') + parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.json') + + args = parser.parse_args () + + with open (args.config) as fd: + config = json.load (fd) + s = config['irc'] + blacklist = dict (map (lambda x: (re.compile (x[0], re.I), x[1]), config['blacklist'].items ())) + + loop = asyncio.get_event_loop() + bot = Chromebot ( + host=s['host'], + port=s['port'], + ssl=s['ssl'], + nick=s['nick'], + channels=s['channels'], + tempdir=config['tempdir'], + destdir=config['destdir'], + processLimit=config['process_limit'], + logger=logger, + blacklist=blacklist, + needVoice=config['need_voice'], + loop=loop) + stop = lambda signum: bot.cancel () + loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) + loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) + loop.run_until_complete(bot.run ()) + +def dashboard (): + from .irc import Dashboard + + loop = asyncio.get_event_loop() + d = Dashboard (sys.stdin, loop) + loop.run_until_complete(d.run ()) + loop.run_forever() diff --git a/crocoite/controller.py b/crocoite/controller.py index bc6f948..8374b4e 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -22,115 +22,297 @@ Controller classes, handling actions required for archival """ +import time, tempfile, asyncio, json, os, shutil, signal +from itertools import islice +from datetime import datetime +from operator import attrgetter +from abc import ABC, abstractmethod +from yarl import URL + +from . 
import behavior as cbehavior +from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated +from .util import getFormattedViewportMetrics, getSoftwareInfo +from .behavior import ExtractLinksEvent +from .devtools import toCookieParam + class ControllerSettings: - def __init__ (self, logBuffer=1000, maxBodySize=50*1024*1024, idleTimeout=2, timeout=10): - self.logBuffer = logBuffer - self.maxBodySize = maxBodySize + __slots__ = ('idleTimeout', 'timeout', 'insecure', 'cookies') + + def __init__ (self, idleTimeout=2, timeout=10, insecure=False, cookies=None): self.idleTimeout = idleTimeout self.timeout = timeout + self.insecure = insecure + self.cookies = cookies or [] - def toDict (self): - return dict (logBuffer=self.logBuffer, maxBodySize=self.maxBodySize, - idleTimeout=self.idleTimeout, timeout=self.timeout) + def __repr__ (self): + return f'<ControllerSetting idleTimeout={self.idleTimeout!r}, timeout={self.timeout!r}, insecure={self.insecure!r}, cookies={self.cookies!r}>' defaultSettings = ControllerSettings () -import logging -from urllib.parse import urlsplit, urlunsplit +class EventHandler (ABC): + """ Abstract base class for event handler """ -import pychrome + __slots__ = () -from . import behavior as cbehavior -from .browser import ChromeService -from .warc import WarcLoader, SerializingWARCWriter -from .util import getFormattedViewportMetrics + @abstractmethod + async def push (self, item): + raise NotImplementedError () + +class StatsHandler (EventHandler): + __slots__ = ('stats', ) + + def __init__ (self): + self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} + + async def push (self, item): + if isinstance (item, RequestResponsePair): + self.stats['requests'] += 1 + if not item.response: + self.stats['failed'] += 1 + else: + self.stats['finished'] += 1 + self.stats['bytesRcv'] += item.response.bytesReceived + +class LogHandler (EventHandler): + """ Handle items by logging information about them """ + + __slots__ = ('logger', ) + + def __init__ (self, logger): + self.logger = logger.bind (context=type (self).__name__) + + async def push (self, item): + if isinstance (item, ExtractLinksEvent): + # limit number of links per message, so json blob won’t get too big + it = iter (item.links) + limit = 100 + while True: + limitlinks = list (islice (it, 0, limit)) + if not limitlinks: + break + self.logger.info ('extracted links', context=type (item).__name__, + uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks) + + +class ControllerStart: + __slots__ = ('payload', ) + + def __init__ (self, payload): + self.payload = payload + +class IdleStateTracker (EventHandler): + """ Track SiteLoader’s idle state by listening to PageIdle events """ + + __slots__ = ('_idle', '_loop', '_idleSince') + + def __init__ (self, loop): + self._idle = True + self._loop = loop + + self._idleSince = self._loop.time () + + async def push (self, item): + if isinstance (item, PageIdle): + self._idle = bool (item) + if self._idle: + self._idleSince = self._loop.time () + + async def wait (self, timeout): + """ Wait until page has been idle for at least timeout seconds. If the + page has been idle before calling this function it may return + immediately. 
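``SinglePageController`` (below) pushes every event through each handler in turn, which keeps handlers small, single-purpose classes like ``StatsHandler`` above. A toy chain, with strings standing in for ``RequestResponsePair`` objects:

.. code:: python

    import asyncio
    from abc import ABC, abstractmethod

    class EventHandler (ABC):
        @abstractmethod
        async def push (self, item):
            raise NotImplementedError ()

    class Counter (EventHandler):
        # hypothetical handler; counts items like StatsHandler counts requests
        def __init__ (self):
            self.count = 0

        async def push (self, item):
            self.count += 1

    async def main ():
        handler = [Counter ()]
        for item in ('a', 'b', 'c'):
            for h in handler:   # same loop as SinglePageController.processItem
                await h.push (item)
        assert handler[0].count == 3

    asyncio.get_event_loop ().run_until_complete (main ())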
""" + + assert timeout > 0 + while True: + if self._idle: + now = self._loop.time () + sleep = timeout-(now-self._idleSince) + if sleep <= 0: + break + else: + # not idle, check again after timeout expires + sleep = timeout + await asyncio.sleep (sleep) + +class InjectBehaviorOnload (EventHandler): + """ Control behavior script injection based on frame navigation messages. + When a page is reloaded (for whatever reason), the scripts need to be + reinjected. """ + + __slots__ = ('controller', '_loaded') + + def __init__ (self, controller): + self.controller = controller + self._loaded = False + + async def push (self, item): + if isinstance (item, FrameNavigated): + await self._runon ('load') + self._loaded = True + + async def stop (self): + if self._loaded: + await self._runon ('stop') -def firstOrNone (it): - """ Return first item of iterator it or None if empty """ - try: - return next (it) - except StopIteration: - return None + async def finish (self): + if self._loaded: + await self._runon ('finish') + + async def _runon (self, method): + controller = self.controller + for b in controller._enabledBehavior: + f = getattr (b, 'on' + method) + async for item in f (): + await controller.processItem (item) class SinglePageController: """ - Archive a single page url to file output. + Archive a single page url. + + Dispatches between producer (site loader and behavior scripts) and consumer + (stats, warc writer). """ - def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ - logger=logging.getLogger(__name__), settings=defaultSettings): + __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler', + 'warcinfo', '_enabledBehavior') + + def __init__ (self, url, logger, \ + service, behavior=cbehavior.available, \ + settings=defaultSettings, handler=None, \ + warcinfo=None): self.url = url - self.output = output self.service = service self.behavior = behavior self.settings = settings - self.logger = logger - - def run (self): - ret = {'stats': None, 'links': []} - - with self.service as browser: - browser = pychrome.Browser (url=browser) - writer = SerializingWARCWriter (self.output, gzip=True) - - with WarcLoader (browser, self.url, writer, - logBuffer=self.settings.logBuffer, - maxBodySize=self.settings.maxBodySize) as l: - version = l.tab.Browser.getVersion () - payload = { - 'software': __package__, - 'browser': version['product'], + self.logger = logger.bind (context=type (self).__name__, url=url) + self.handler = handler or [] + self.warcinfo = warcinfo + + async def processItem (self, item): + for h in self.handler: + await h.push (item) + + async def run (self): + logger = self.logger + async def processQueue (): + async for item in l: + await self.processItem (item) + + idle = IdleStateTracker (asyncio.get_event_loop ()) + self.handler.append (idle) + behavior = InjectBehaviorOnload (self) + self.handler.append (behavior) + + async with self.service as browser, SiteLoader (browser, logger=logger) as l: + handle = asyncio.ensure_future (processQueue ()) + timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout)) + + # configure browser + tab = l.tab + await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure) + await tab.Network.setCookies (cookies=list (map (toCookieParam, self.settings.cookies))) + + # not all behavior scripts are allowed for every URL, filter them + self._enabledBehavior = list (filter (lambda x: self.url in x, + map (lambda x: x (l, logger), self.behavior))) + + version = await 
tab.Browser.getVersion () + payload = { + 'software': getSoftwareInfo (), + 'browser': { + 'product': version['product'], 'useragent': version['userAgent'], - 'viewport': getFormattedViewportMetrics (l.tab), - } - warcinfo = writer.create_warcinfo_record (filename=None, info=payload) - writer.write_record (warcinfo) - - # not all behavior scripts are allowed for every URL, filter them - enabledBehavior = list (filter (lambda x: self.url in x, - map (lambda x: x (l), self.behavior))) - linksBehavior = firstOrNone (filter (lambda x: isinstance (x, cbehavior.ExtractLinks), - enabledBehavior)) - - for b in enabledBehavior: - self.logger.debug ('starting onload behavior {}'.format (b.name)) - b.onload () - l.start () - - l.waitIdle (self.settings.idleTimeout, self.settings.timeout) - - for b in enabledBehavior: - self.logger.debug ('starting onstop behavior {}'.format (b.name)) - b.onstop () - - # if we stopped due to timeout, wait for remaining assets - l.waitIdle (2, 60) - l.stop () - - for b in enabledBehavior: - self.logger.debug ('starting onfinish behavior {}'.format (b.name)) - b.onfinish () - - ret['stats'] = l.stats - ret['links'] = linksBehavior.links if linksBehavior else None - writer.flush () - return ret - -from collections import UserDict - -class IntegerDict (UserDict): - """ Dict with dict/dict per-item arithmetic propagation, i.e. {1: 2}+{1: 1}={1: 3} """ - def __add__ (self, b): - newdict = self.__class__ (self) - for k, v in b.items (): - if k in self: - newdict[k] += v + 'viewport': await getFormattedViewportMetrics (tab), + }, + 'tool': 'crocoite-single', # not the name of the cli utility + 'parameters': { + 'url': self.url, + 'idleTimeout': self.settings.idleTimeout, + 'timeout': self.settings.timeout, + 'behavior': list (map (attrgetter('name'), self._enabledBehavior)), + 'insecure': self.settings.insecure, + 'cookies': list (map (lambda x: x.OutputString(), self.settings.cookies)), + }, + } + if self.warcinfo: + payload['extra'] = self.warcinfo + await self.processItem (ControllerStart (payload)) + + await l.navigate (self.url) + + idleProc = asyncio.ensure_future (idle.wait (self.settings.idleTimeout)) + while True: + try: + finished, pending = await asyncio.wait([idleProc, timeoutProc, handle], + return_when=asyncio.FIRST_COMPLETED) + except asyncio.CancelledError: + idleProc.cancel () + timeoutProc.cancel () + break + + if handle in finished: + # something went wrong while processing the data + logger.error ('fetch failed', + uuid='43a0686a-a3a9-4214-9acd-43f6976f8ff3') + idleProc.cancel () + timeoutProc.cancel () + handle.result () + assert False # previous line should always raise Exception + elif timeoutProc in finished: + # global timeout + logger.debug ('global timeout', + uuid='2f858adc-9448-4ace-94b4-7cd1484c0728') + idleProc.cancel () + timeoutProc.result () + break + elif idleProc in finished: + # idle timeout + logger.debug ('idle timeout', + uuid='90702590-94c4-44ef-9b37-02a16de444c3') + idleProc.result () + timeoutProc.cancel () + break + + await behavior.stop () + await tab.Page.stopLoading () + await asyncio.sleep (1) + await behavior.finish () + + # wait until loads from behavior scripts are done and browser is + # idle for at least 1 second + try: + await asyncio.wait_for (idle.wait (1), timeout=1) + except (asyncio.TimeoutError, asyncio.CancelledError): + pass + + if handle.done (): + handle.result () else: - newdict[k] = v - return newdict + handle.cancel () + +class SetEntry: + """ A object, to be used with sets, that compares equality only on its 
+    primary property. """
+    def __init__ (self, value, **props):
+        self.value = value
+        for k, v in props.items ():
+            setattr (self, k, v)
+
+    def __eq__ (self, b):
+        assert isinstance (b, SetEntry)
+        return self.value == b.value
+
+    def __hash__ (self):
+        return hash (self.value)
+
+    def __repr__ (self):
+        return f'<SetEntry {self.value!r}>'
 
 class RecursionPolicy:
     """ Abstract recursion policy """
+
+    __slots__ = ()
+
     def __call__ (self, urls):
         raise NotImplementedError
@@ -138,20 +320,23 @@ class DepthLimit (RecursionPolicy):
     """
     Limit recursion by depth.
 
-    depth==0 means no recursion, depth==1 is the page and outgoing links, …
+    depth==0 means no recursion, depth==1 is the page and outgoing links
     """
+
+    __slots__ = ('maxdepth', )
+
     def __init__ (self, maxdepth=0):
         self.maxdepth = maxdepth
 
     def __call__ (self, urls):
-        if self.maxdepth <= 0:
-            return {}
-        else:
-            self.maxdepth -= 1
-            return urls
+        newurls = set ()
+        for u in urls:
+            if u.depth <= self.maxdepth:
+                newurls.add (u)
+        return newurls
 
     def __repr__ (self):
-        return '<DepthLimit {}>'.format (self.maxdepth)
+        return f'<DepthLimit {self.maxdepth}>'
 
 class PrefixLimit (RecursionPolicy):
     """
@@ -161,65 +346,196 @@ class PrefixLimit (RecursionPolicy):
     ignored: http://example.com/bar http://offsite.example/foo
     accepted: http://example.com/foobar http://example.com/foo/bar
     """
+
+    __slots__ = ('prefix', )
+
     def __init__ (self, prefix):
         self.prefix = prefix
 
     def __call__ (self, urls):
-        return set (filter (lambda u: u.startswith (self.prefix), urls))
+        return set (filter (lambda u: str(u.value).startswith (str (self.prefix)), urls))
 
-def removeFragment (u):
-    """ Remove fragment from url (i.e. #hashvalue) """
-    s = urlsplit (u)
-    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+def hasTemplate (s):
+    """ Return True if string s has string templates """
+    return '{' in s and '}' in s
 
 class RecursiveController:
     """
     Simple recursive controller
 
-    Visits links acording to recursionPolicy
+    Visits links according to policy
     """
 
-    def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
-            logger=logging.getLogger(__name__), settings=defaultSettings,
-            recursionPolicy=DepthLimit (0)):
+    __slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have',
+            'pending', 'stats', 'tempdir', 'running', 'concurrency',
+            'copyLock')
+
+    SCHEME_WHITELIST = {'http', 'https'}
+
+    def __init__ (self, url, output, command, logger,
+            tempdir=None, policy=DepthLimit (0), concurrency=1):
         self.url = url
         self.output = output
-        self.service = service
-        self.behavior = behavior
-        self.settings = settings
-        self.logger = logger
-        self.recursionPolicy = recursionPolicy
-
-    def fetch (self, urls):
+        self.command = command
+        self.logger = logger.bind (context=type(self).__name__, seedurl=url)
+        self.policy = policy
+        self.tempdir = tempdir
+        # A lock if only a single output file (no template) is requested
+        self.copyLock = None if hasTemplate (output) else asyncio.Lock ()
+        # some sanity checks. XXX move to argparse?
+        if self.copyLock and os.path.exists (self.output):
+            raise ValueError ('Output file exists')
+        # tasks currently running
+        self.running = set ()
+        # max number of tasks running
+        self.concurrency = concurrency
+        # keep in sync with StatsHandler
+        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
+
+    async def fetch (self, entry, seqnum):
         """
-        Overrideable fetch action for URLs. Defaults to sequential
-        SinglePageController.
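A quick illustration (not part of the patch) of how the two policies filter ``SetEntry`` objects, assuming the classes defined above and ``yarl.URL``:

.. code:: python

    from yarl import URL

    links = {SetEntry (URL ('http://example.com/a'), depth=1),
             SetEntry (URL ('http://offsite.example/b'), depth=1)}

    # DepthLimit keeps entries up to maxdepth; unlike the old version it
    # does not mutate its own state between calls
    assert DepthLimit (1) (links) == links
    assert DepthLimit (0) (links) == set ()

    # PrefixLimit drops the off-site link
    kept = PrefixLimit (URL ('http://example.com/')) (links)
    assert {str (e.value) for e in kept} == {'http://example.com/a'}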
+        Fetch a single URL using an external command
+
+        command is usually crocoite-single
         """
-        result = []
-        for u in urls:
-            c = SinglePageController (u, self.output, self.service,
-                    self.behavior, self.logger, self.settings)
-            result.append (c.run ())
-        return result
-
-    def run (self):
-        have = set ()
-        urls = set ([self.url])
-        ret = {'stats': IntegerDict ()}
-
-        while urls:
-            self.logger.info ('retrieving {} urls'.format (len (urls)))
-            result = self.fetch (urls)
-
-            have.update (urls)
-            urls = set ()
-            for r in result:
-                ret['stats'] += r['stats']
-                urls.update (map (removeFragment, r['links']))
-            urls.difference_update (have)
-
-            urls = self.recursionPolicy (urls)
-        # everything in ret must be serializeable
-        ret['stats'] = dict (ret['stats'])
-        return ret
+
+        assert isinstance (entry, SetEntry)
+
+        url = entry.value
+        depth = entry.depth
+        logger = self.logger.bind (url=url)
+
+        def formatCommand (e):
+            # provide means to disable variable expansion
+            if e.startswith ('!'):
+                return e[1:]
+            else:
+                return e.format (url=url, dest=dest.name)
+
+        def formatOutput (p):
+            return p.format (host=url.host,
+                    date=datetime.utcnow ().isoformat (), seqnum=seqnum)
+
+        def logStats ():
+            logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+
+        if url.scheme not in self.SCHEME_WHITELIST:
+            self.stats['ignored'] += 1
+            logStats ()
+            self.logger.warning ('scheme not whitelisted', url=url,
+                    uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
+            return
+
+        dest = tempfile.NamedTemporaryFile (dir=self.tempdir,
+                prefix=os.path.basename (self.output) + '-', suffix='.warc.gz',
+                delete=False)
+        command = list (map (formatCommand, self.command))
+        logger.info ('fetch', uuid='d1288fbe-8bae-42c8-af8c-f2fa8b41794f',
+                command=command)
+        try:
+            process = await asyncio.create_subprocess_exec (*command,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.DEVNULL,
+                    stdin=asyncio.subprocess.DEVNULL,
+                    start_new_session=True, limit=100*1024*1024)
+            while True:
+                data = await process.stdout.readline ()
+                if not data:
+                    break
+                data = json.loads (data)
+                uuid = data.get ('uuid')
+                if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
+                    links = set (self.policy (map (lambda x: SetEntry (URL(x).with_fragment(None), depth=depth+1), data.get ('links', []))))
+                    links.difference_update (self.have)
+                    self.pending.update (links)
+                elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
+                    for k in self.stats.keys ():
+                        self.stats[k] += data.get (k, 0)
+                    logStats ()
        except asyncio.CancelledError:
+            # graceful cancellation
+            process.send_signal (signal.SIGINT)
+        except Exception as e:
+            process.kill ()
+            raise e
+        finally:
+            code = await process.wait()
+            if code == 0:
+                if self.copyLock is None:
+                    # atomically move once finished
+                    lastDestpath = None
+                    while True:
+                        # XXX: must generate a new name every time, otherwise
+                        # this loop never terminates
+                        destpath = formatOutput (self.output)
+                        assert destpath != lastDestpath
+                        lastDestpath = destpath
+
+                        # python does not have rename(…, …, RENAME_NOREPLACE),
+                        # but this is safe nonetheless, since we’re
+                        # single-threaded
+                        if not os.path.exists (destpath):
+                            # create the directory, so templates like
+                            # /{host}/{date}/… are possible
+                            os.makedirs (os.path.dirname (destpath), exist_ok=True)
+                            os.rename (dest.name, destpath)
+                            break
+                else:
+                    # atomically (in the context of this process) append to
+                    # existing file
+                    async with self.copyLock:
+                        with open (dest.name, 'rb') as infd, \
+                                open (self.output, 'ab') as outfd:
+                            shutil.copyfileobj (infd, outfd)
+
os.unlink (dest.name) + else: + self.stats['crashed'] += 1 + logStats () + + async def run (self): + def log (): + # self.have includes running jobs + self.logger.info ('recursing', + uuid='5b8498e4-868d-413c-a67e-004516b8452c', + pending=len (self.pending), + have=len (self.have)-len(self.running), + running=len (self.running)) + + seqnum = 1 + try: + self.have = set () + self.pending = set ([SetEntry (self.url, depth=0)]) + + while self.pending: + # since pending is a set this picks a random item, which is fine + u = self.pending.pop () + self.have.add (u) + t = asyncio.ensure_future (self.fetch (u, seqnum)) + self.running.add (t) + seqnum += 1 + + log () + + if len (self.running) >= self.concurrency or not self.pending: + done, pending = await asyncio.wait (self.running, + return_when=asyncio.FIRST_COMPLETED) + self.running.difference_update (done) + # propagate exceptions + for r in done: + r.result () + except asyncio.CancelledError: + self.logger.info ('cancel', + uuid='d58154c8-ec27-40f2-ab9e-e25c1b21cd88', + pending=len (self.pending), + have=len (self.have)-len (self.running), + running=len (self.running)) + finally: + done = await asyncio.gather (*self.running, + return_exceptions=True) + # propagate exceptions + for r in done: + if isinstance (r, Exception): + raise r + self.running = set () + log () diff --git a/crocoite/data/click.js b/crocoite/data/click.js index c51a690..ae189da 100644 --- a/crocoite/data/click.js +++ b/crocoite/data/click.js @@ -4,109 +4,7 @@ * like navigating to a different location. Thus whitelist known elements. */ -(function(){ -const selectorFlag = Object.freeze ({ - none: 0, - multi: 1, /* click item multiple times */ -}); -const defaultClickThrottle = 50; /* in ms */ -const discoverInterval = 1000; /* 1 second */ -const sites = Object.freeze ([ - { - hostname: /^www\.facebook\.com$/i, - selector: [ - /* show more comments */ - {s: 'a.UFIPagerLink[role=button]', flags: selectorFlag.none}, - /* show nested comments*/ - {s: 'a.UFICommentLink[role=button]', flags: selectorFlag.none}, - ], - }, { - hostname: /^twitter\.com$/i, - selector: [ - /* expand threads */ - {s: 'a.ThreadedConversation-moreRepliesLink', flags: selectorFlag.none}, - /* show hidden profiles */ - {s: 'button.ProfileWarningTimeline-button', flags: selectorFlag.none}, - /* show hidden/sensitive media */ - {s: 'button.Tombstone-action.js-display-this-media', flags: selectorFlag.none}, - ], - }, { - hostname: /^disqus\.com$/i, - selector: [ - /* load more comments */ - {s: 'a.load-more__button', flags: selectorFlag.multi}, - ], - }, { - hostname: /^(www|np)\.reddit\.com$/i, - selector: [ - /* show more comments, reddit’s javascript ignores events if too - * frequent */ - {s: 'span.morecomments a', flags: selectorFlag.none, throttle: 500}, - ], - }, { - hostname: /^www\.instagram\.com$/i, - selector: [ - /* posts may have multiple images that load dynamically, click the arrow */ - {s: 'a[role=button].coreSpriteRightChevron', flags: selectorFlag.multi, throttle: 500}, - /* load more comments */ - {s: 'article div ul li a[role=button]', flags: selectorFlag.multi}, - ], - }, { - hostname: /^www\.youtube\.com$/i, - selector: [ - /* expand comment thread */ - {s: 'ytd-comment-thread-renderer div.more-button', flags: selectorFlag.none}, - ], - }, { - hostname: /^www\.patreon\.com$/i, - selector: [ - /* load more content */ - {s: 'div[display=flex] div[display=block] button[color=gray][type=button]', flags: selectorFlag.multi}, - /* load more comments */ - {s: 'div.stackable[display=block] > div 
> div > a[color=dark][target=_self]', flags: selectorFlag.none}, - /* load more replies */ - {s: 'div > a[scale="0"][color=blue][size="1"]', flags: selectorFlag.none}, - ], - } - ]); - -/* pick selectors matching current location */ -let hostname = document.location.hostname; -let selector = []; -for (let s of sites) { - if (s.hostname.test (hostname)) { - selector = selector.concat (s.selector); - } -} - -function makeClickEvent () { - return new MouseEvent('click', { - view: window, - bubbles: true, - cancelable: true - }); -} - -/* throttle clicking */ -let queue = []; -let clickTimeout = null; -function click () { - if (queue.length > 0) { - const item = queue.shift (); - const o = item.o; - const selector = item.selector; - o.dispatchEvent (makeClickEvent ()); - - if (queue.length > 0) { - const nextTimeout = 'throttle' in selector ? - selector.throttle : defaultClickThrottle; - clickTimeout = window.setTimeout (click, nextTimeout); - } else { - clickTimeout = null; - } - } -} - +(function() { /* Element is visible if itself and all of its parents are */ function isVisible (o) { @@ -128,28 +26,82 @@ function isClickable (o) { return !o.hasAttribute ('disabled') && isVisible (o); } -/* some sites don’t remove/replace the element immediately, so keep track of - * which ones we already clicked */ -let have = new Set (); -function discover () { - for (let s of selector) { - let obj = document.querySelectorAll (s.s); - for (let o of obj) { - if (!have.has (o) && isClickable (o)) { - queue.push ({o: o, selector: s}); - if (!(s.flags & selectorFlag.multi)) { - have.add (o); +const defaultClickThrottle = 50; /* in ms */ +const discoverInterval = 1000; /* 1 second */ + +class Click { + constructor(options) { + /* pick selectors matching current location */ + let hostname = document.location.hostname; + this.selector = []; + for (let s of options['sites']) { + let r = new RegExp (s.match, 'i'); + if (r.test (hostname)) { + this.selector = this.selector.concat (s.selector); + } + } + /* throttle clicking */ + this.queue = []; + this.clickTimeout = null; + + /* some sites don’t remove/replace the element immediately, so keep track of + * which ones we already clicked */ + this.have = new Set (); + + /* XXX: can we use a mutation observer instead? */ + this.interval = window.setInterval (this.discover.bind (this), discoverInterval); + } + + makeClickEvent () { + return new MouseEvent('click', { + view: window, + bubbles: true, + cancelable: true + }); + } + + click () { + if (this.queue.length > 0) { + const item = this.queue.shift (); + const o = item.o; + const selector = item.selector; + o.dispatchEvent (this.makeClickEvent ()); + + if (this.queue.length > 0) { + const nextTimeout = 'throttle' in selector ? 
+ selector.throttle : defaultClickThrottle; + this.clickTimeout = window.setTimeout (this.click.bind (this), nextTimeout); + } else { + this.clickTimeout = null; + } + } + } + + discover () { + for (let s of this.selector) { + let obj = document.querySelectorAll (s.selector); + for (let o of obj) { + if (!this.have.has (o) && isClickable (o)) { + this.queue.push ({o: o, selector: s}); + if (!s.multi) { + this.have.add (o); + } } } } + if (this.queue.length > 0 && this.clickTimeout === null) { + /* start clicking immediately */ + this.clickTimeout = window.setTimeout (this.click.bind (this), 0); + } + return true; } - if (queue.length > 0 && clickTimeout === null) { - /* start clicking immediately */ - clickTimeout = window.setTimeout (click, 0); + + + stop () { + window.clearInterval (this.interval); + window.clearTimeout (this.clickTimeout); } - return true; } -/* XXX: can we use a mutation observer instead? */ -window.setInterval (discover, discoverInterval); -}()); +return Click; +}()) diff --git a/crocoite/data/click.yaml b/crocoite/data/click.yaml new file mode 100644 index 0000000..78278b9 --- /dev/null +++ b/crocoite/data/click.yaml @@ -0,0 +1,117 @@ +# Configuration for behavior.py:Click +# Example URLs are random. Believe me. +match: ^www\.facebook\.com$ +selector: + - description: Show comments and replies/nested comments on user pages. + selector: form[action="/ajax/ufi/modify.php"] a[data-testid^="UFI2CommentsPagerRenderer/pager_depth_"] + urls: ["https://www.facebook.com/tagesschau"] + - description: Initially show comments below a single post/video, i.e. /user/post/123. + selector: form[action="/ajax/ufi/modify.php"] a[data-testid="UFI2CommentsCount/root"] + urls: ["https://www.facebook.com/tagesschau/posts/10157061068659407"] + - description: Close the “register now” nag screen. For screenshots. + selector: a#expanding_cta_close_button[role=button] + urls: ["https://www.facebook.com/tagesschau"] +--- +match: ^twitter\.com$ +selector: + - description: Expand threads. + selector: a.ThreadedConversation-moreRepliesLink + urls: ["https://twitter.com/realDonaldTrump/status/1068826073775964160"] + - description: Show hidden profiles. + selector: button.ProfileWarningTimeline-button + urls: ["https://twitter.com/CookieCyboid"] + - description: Show hidden/sensitive media. For screen-/snapshots. + selector: button.Tombstone-action.js-display-this-media + urls: ["https://twitter.com/CookieCyboid/status/1070807283305713665"] + - description: Show more replies. + selector: button.ThreadedConversation-showMoreThreadsButton + urls: ["https://twitter.com/fuglydug/status/1172160128101076995"] +--- +match: ^disqus\.com$ +selector: + - description: Load more comments. + selector: a.load-more__button + multi: True +--- +# new layout +match: ^www\.reddit\.com$ +selector: + - description: Show more comments. + selector: div[id^=moreComments-] > div > p + # reddit’s javascript ignores events if too frequent + throttle: 500 + urls: ["https://www.reddit.com/r/subredditcancer/comments/b2b80f/we_are_moderators_of_rwatchpeopledie_amaa_just/"] +--- +# old layout +match: ^(old|np)\.reddit\.com$ +selector: + - description: Show more comments. + selector: span.morecomments a + # reddit’s javascript ignores events if too frequent + throttle: 500 + urls: ["https://old.reddit.com/r/subredditcancer/comments/b2b80f/we_are_moderators_of_rwatchpeopledie_amaa_just/"] +--- +match: ^www\.youtube\.com$ +selector: + - description: Expand single comment. 
+ selector: ytd-comment-thread-renderer span[slot=more-button] + urls: ["https://www.youtube.com/watch?v=udtFqQuBFSc"] + - description: Show more comment thread replies. + selector: div.ytd-comment-replies-renderer > yt-next-continuation > paper-button + urls: ["https://www.youtube.com/watch?v=Lov0T3eXI2k"] + multi: True +--- +match: ^www\.patreon\.com$ +selector: + - description: Load more comments. + selector: div[data-tag=post-card] button[data-tag=loadMoreCommentsCta] + urls: ["https://www.patreon.com/posts/what-im-on-22124040"] +--- +match: ^(www\.)?gab\.com$ +selector: + - description: Load more posts. + selector: div.item-list[role=feed] button.load-more + multi: True + urls: ["https://gab.com/gab"] +--- +match: ^(www\.)?github\.com$ +selector: + - description: Show hidden issue items. + urls: ["https://github.com/dominictarr/event-stream/issues/116"] + selector: div#discussion_bucket form.ajax-pagination-form button.ajax-pagination-btn +--- +match: ^www\.gamasutra\.com$ +selector: + - description: Load more comments. + urls: ["http://www.gamasutra.com/blogs/RaminShokrizade/20130626/194933/The_Top_F2P_Monetization_Tricks.php"] + selector: div#dynamiccomments div.viewTopCmts a +--- +match: ^(www\.)?steamcommunity\.com$ +selector: + - description: Load more content. + urls: ["https://steamcommunity.com/app/252950/reviews/?p=1&browsefilter=toprated&filterLanguage=all"] + selector: "#GetMoreContentBtn a" + multi: True +--- +match: ^imgur\.com$ +selector: + - description: Load more images of an album. + urls: ["https://imgur.com/a/JG1yc"] + selector: div.js-post-truncated a.post-loadall + - description: Expand all comments. For snapshots. + urls: ["https://imgur.com/a/JG1yc"] + selector: div.comments-info span.comments-expand + - description: Show bad replies. for snapshots. + urls: ["https://imgur.com/gallery/jRzMfRG"] + selector: div#comments div.bad-captions a.link +--- +match: ^(www\.)?vimeo\.com$ +selector: + - description: Load more videos on profile page. + urls: ["https://vimeo.com/dsam4a"] + selector: div.profile_main div.profile-load-more__button--wrapper button +# XXX: this works when using a non-headless browser, but does not otherwise +# - description: Expand video comments +# urls: ["https://vimeo.com/22439234"] +# selector: section#comments button.iris_comment-more +# multi: True diff --git a/crocoite/data/cookies.txt b/crocoite/data/cookies.txt new file mode 100644 index 0000000..6ac62c3 --- /dev/null +++ b/crocoite/data/cookies.txt @@ -0,0 +1,9 @@ +# Default cookies for crocoite. This file does *not* use Netscape’s cookie +# file format. Lines are expected to be in Set-Cookie format. +# And this line is a comment. + +# Reddit: +# skip over 18 prompt +over18=1; Domain=www.reddit.com +# skip quarantined subreddit prompt +_options={%22pref_quarantine_optin%22:true}; Domain=www.reddit.com diff --git a/crocoite/data/extract-links.js b/crocoite/data/extract-links.js index 4d1a3d0..5a4f9f0 100644 --- a/crocoite/data/extract-links.js +++ b/crocoite/data/extract-links.js @@ -25,11 +25,26 @@ function isClickable (o) { } /* --- end copy&paste */ -let x = document.body.querySelectorAll('a[href]'); let ret = []; +['a[href]', 'area[href]'].forEach (function (s) { + let x = document.querySelectorAll(s); + for (let i=0; i < x.length; i++) { + if (isClickable (x[i])) { + ret.push (x[i].href); + } + } +}); + +/* If Chrome loads plain-text documents it’ll wrap them into <pre>. Check those + * for links as well, assuming the whole line is a link (i.e. list of links). 
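The ``<pre>`` heuristic above, restated in Python for clarity (a hypothetical stand-alone version, not part of the patch): every line that looks like an absolute http(s) URL is treated as a link.

.. code:: python

    import re

    def linksFromPre (text):
        # one candidate per line, keep only absolute http(s) URLs
        return [line for line in text.split ('\n')
                if re.match (r'^https?://', line)]

    assert linksFromPre ('http://example.com/a\nnot a link') == \
            ['http://example.com/a']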
*/
+let x = document.querySelectorAll ('body > pre');
 for (let i=0; i < x.length; i++) {
-	if (isClickable (x[i])) {
-		ret.push (x[i].href);
+	if (isVisible (x[i])) {
+		x[i].innerText.split ('\n').forEach (function (s) {
+			if (s.match ('^https?://')) {
+				ret.push (s);
+			}
+		});
 	}
 }
 return ret; /* immediately return results, for use with Runtime.evaluate() */
diff --git a/crocoite/data/screenshot.js b/crocoite/data/screenshot.js
new file mode 100644
index 0000000..a9a41e1
--- /dev/null
+++ b/crocoite/data/screenshot.js
@@ -0,0 +1,20 @@
+/* Find scrollable full-screen elements and return their actual size
+ */
+(function () {
+/* limit the number of elements queried */
+let elem = document.querySelectorAll ('body > div');
+let ret = [];
+for (let i = 0; i < elem.length; i++) {
+	let e = elem[i];
+	let s = window.getComputedStyle (e);
+	if (s.getPropertyValue ('position') == 'fixed' &&
+			s.getPropertyValue ('overflow') == 'auto' &&
+			s.getPropertyValue ('left') == '0px' &&
+			s.getPropertyValue ('right') == '0px' &&
+			s.getPropertyValue ('top') == '0px' &&
+			s.getPropertyValue ('bottom') == '0px') {
+		ret.push (e.scrollHeight);
+	}
+}
+return ret; /* immediately return results, for use with Runtime.evaluate() */
+})();
diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js
index 13e856d..be88edf 100644
--- a/crocoite/data/scroll.js
+++ b/crocoite/data/scroll.js
@@ -1,23 +1,38 @@
 /* Continuously scrolls the page */
-var __crocoite_stop__ = false;
 (function(){
-function scroll (event) {
-	if (__crocoite_stop__) {
-		return false;
-	} else {
+class Scroll {
+	constructor (options) {
+		this.scrolled = new Map ();
+		this.interval = window.setInterval (this.scroll.bind (this), 200);
+	}
+
+	stop() {
+		window.clearInterval (this.interval);
+		window.scrollTo (0, 0);
+		this.scrolled.forEach (function (value, key, map) {
+			key.scrollTop = value;
+		});
+	}
+	/* save initial scroll state */
+	save(obj) {
+		if (!this.scrolled.has (obj)) {
+			this.scrolled.set (obj, obj.scrollTop);
+		}
+	}
+	/* perform a single scroll step */
+	scroll (event) {
 		window.scrollBy (0, window.innerHeight/2);
-		document.querySelectorAll ('*').forEach (
+		document.querySelectorAll ('html body *').forEach (
 			function (d) {
-				if (d.clientHeight < d.scrollHeight) {
+				if (d.scrollHeight-d.scrollTop > d.clientHeight) {
+					this.save (d);
 					d.scrollBy (0, d.clientHeight/2);
 				}
-			});
+			}.bind (this));
 		return true;
 	}
 }
-function onload (event) {
-	window.setInterval (scroll, 200);
-}
-document.addEventListener("DOMContentLoaded", onload);
-}());
+
+return Scroll;
+}())
diff --git a/crocoite/devtools.py b/crocoite/devtools.py
new file mode 100644
index 0000000..8b5c69d
--- /dev/null
+++ b/crocoite/devtools.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Communication with Google Chrome through its DevTools protocol. +""" + +import json, asyncio, logging, os +from tempfile import mkdtemp +import shutil +from http.cookies import Morsel + +import aiohttp, websockets +from yarl import URL + +from .util import StrJsonEncoder + +logger = logging.getLogger (__name__) + +class Browser: + """ + Communicate with Google Chrome through its DevTools protocol. + + Asynchronous context manager that creates a new Tab when entering. + Destroyed upon exit. + """ + + __slots__ = ('session', 'url', 'tab') + + def __init__ (self, url): + self.url = URL (url) + self.session = None + self.tab = None + + async def __aiter__ (self): + """ List all tabs """ + async with aiohttp.ClientSession () as session: + async with session.get (self.url.with_path ('/json/list')) as r: + resp = await r.json () + for tab in resp: + if tab['type'] == 'page': + yield tab + + async def __aenter__ (self): + """ Create tab """ + assert self.tab is None + assert self.session is None + self.session = aiohttp.ClientSession () + async with self.session.get (self.url.with_path ('/json/new')) as r: + resp = await r.json () + self.tab = await Tab.create (**resp) + return self.tab + + async def __aexit__ (self, excType, excValue, traceback): + assert self.tab is not None + assert self.session is not None + + await self.tab.close () + + try: + async with self.session.get (self.url.with_path (f'/json/close/{self.tab.id}')) as r: + resp = await r.text () + assert resp == 'Target is closing' + except aiohttp.client_exceptions.ClientConnectorError: + # oh boy, the whole browser crashed instead + if excType is Crashed: + # exception is reraised by `return False` + pass + else: + # this one is more important + raise + + self.tab = None + await self.session.close () + self.session = None + + return False + +class TabFunction: + """ + Helper class for infinite-depth tab functions. + + A method usually consists of namespace (Page, Network, …) and function name + (getFoobar) separated by a dot. This class creates these function names + while providing an intuitive Python interface (tab.Network.getFoobar). + + This was inspired by pychrome. + """ + + __slots__ = ('name', 'tab') + + def __init__ (self, name, tab): + self.name = name + self.tab = tab + + def __eq__ (self, b): + assert isinstance (b, TabFunction) + return self.name == b.name + + def __hash__ (self): + return hash (self.name) + + def __getattr__ (self, k): + return TabFunction (f'{self.name}.{k}', self.tab) + + async def __call__ (self, **kwargs): + return await self.tab (self.name, **kwargs) + + def __repr__ (self): + return f'<TabFunction {self.name}>' + +class TabException (Exception): + pass + +class Crashed (TabException): + pass + +class MethodNotFound (TabException): + pass + +class InvalidParameter (TabException): + pass + +# map error codes to native exceptions +errorMap = {-32601: MethodNotFound, -32602: InvalidParameter} + +class Tab: + """ + Communicate with a single Google Chrome browser tab. 
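+
+    A hypothetical usage sketch (Page.enable and Page.navigate are standard
+    DevTools protocol methods; attribute chains resolve through TabFunction
+    above):
+
+        await tab.Page.enable ()
+        await tab.Page.navigate (url='https://example.com/')
+        ev, params = await tab.get () # next queued protocol event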
+ """ + __slots__ = ('id', 'wsUrl', 'ws', 'msgid', 'transactions', 'queue', '_recvHandle', 'crashed') + + def __init__ (self, tabid, ws): + """ Do not use this method, use Browser context manager. """ + self.id = tabid + self.ws = ws + self.msgid = 1 + self.crashed = False + self.transactions = {} + self.queue = asyncio.Queue () + + def __getattr__ (self, k): + return TabFunction (k, self) + + async def __call__ (self, method, **kwargs): + """ + Actually call browser method with kwargs + """ + + if self.crashed or self._recvHandle.done (): + raise Crashed () + + msgid = self.msgid + self.msgid += 1 + message = {'method': method, 'params': kwargs, 'id': msgid} + t = self.transactions[msgid] = {'event': asyncio.Event (), 'result': None} + logger.debug (f'← {message}') + await self.ws.send (json.dumps (message, cls=StrJsonEncoder)) + await t['event'].wait () + ret = t['result'] + del self.transactions[msgid] + if isinstance (ret, Exception): + raise ret + return ret + + async def _recvProcess (self): + """ + Receive process that dispatches received websocket frames + + These are either events which will be put into a queue or request + responses which unblock a __call__. + """ + + async def markCrashed (reason): + # all pending requests can be considered failed since the + # browser state is lost + for v in self.transactions.values (): + v['result'] = Crashed (reason) + v['event'].set () + # and all future requests will fail as well until reloaded + self.crashed = True + await self.queue.put (Crashed (reason)) + + while True: + try: + msg = await self.ws.recv () + msg = json.loads (msg) + except Exception as e: + # right now we cannot recover from this + await markCrashed (e) + break + logger.debug (f'→ {msg}') + if 'id' in msg: + msgid = msg['id'] + t = self.transactions.get (msgid, None) + if t is not None: + if 'error' in msg: + e = msg['error'] + t['result'] = errorMap.get (e['code'], TabException) (e['code'], e['message']) + else: + t['result'] = msg['result'] + t['event'].set () + else: + # ignore stale result + pass # pragma: no cover + elif 'method' in msg: + # special treatment + if msg['method'] == 'Inspector.targetCrashed': + await markCrashed ('target') + else: + await self.queue.put (msg) + else: + assert False # pragma: no cover + + async def run (self): + self._recvHandle = asyncio.ensure_future (self._recvProcess ()) + + async def close (self): + self._recvHandle.cancel () + await self.ws.close () + # no join, throw away the queue. There will be nobody listening on the + # other end. + #await self.queue.join () + + @property + def pending (self): + return self.queue.qsize () + + async def get (self): + def getattrRecursive (obj, name): + if '.' 
in name:
+                n, ext = name.split ('.', 1)
+                return getattrRecursive (getattr (obj, n), ext)
+            return getattr (obj, name)
+
+        if self.crashed:
+            raise Crashed ()
+
+        ret = await self.queue.get ()
+        if isinstance (ret, Exception):
+            raise ret
+        return getattrRecursive (self, ret['method']), ret['params']
+
+    @classmethod
+    async def create (cls, **kwargs):
+        """ Async init """
+        # increase size limit of a single frame to something ridiculously high,
+        # so we can safely grab screenshots
+        maxSize = 100*1024*1024 # 100 MB
+        # chrome does not like pings and kills the connection, disable them
+        ws = await websockets.connect(kwargs['webSocketDebuggerUrl'],
+                max_size=maxSize, ping_interval=None)
+        ret = cls (kwargs['id'], ws)
+        await ret.run ()
+        return ret
+
+class Process:
+    """ Start Google Chrome listening on a random port """
+
+    __slots__ = ('binary', 'windowSize', 'p', 'userDataDir')
+
+    def __init__ (self, binary='google-chrome-stable', windowSize=(1920, 1080)):
+        self.binary = binary
+        self.windowSize = windowSize
+        self.p = None
+
+    async def __aenter__ (self):
+        assert self.p is None
+        self.userDataDir = mkdtemp (prefix=__package__ + '-chrome-userdata-')
+        # see https://github.com/GoogleChrome/chrome-launcher/blob/master/docs/chrome-flags-for-tools.md
+        args = [self.binary,
+                '--window-size={},{}'.format (*self.windowSize),
+                f'--user-data-dir={self.userDataDir}', # use temporary user dir
+                '--no-default-browser-check',
+                '--no-first-run', # don’t show first run screen
+                '--disable-breakpad', # no error reports
+                '--disable-extensions',
+                '--disable-infobars',
+                '--disable-notifications', # no libnotify
+                '--disable-background-networking', # disable background services (updating, safe browsing, …)
+                '--safebrowsing-disable-auto-update',
+                '--disable-sync', # no google account syncing
+                '--metrics-recording-only', # do not submit metrics
+                '--disable-default-apps',
+                '--disable-background-timer-throttling',
+                '--disable-client-side-phishing-detection',
+                '--disable-popup-blocking',
+                '--disable-prompt-on-repost',
+                '--enable-automation', # enable various automation-related things
+                '--password-store=basic',
+                '--headless',
+                '--disable-gpu',
+                '--hide-scrollbars', # hide scrollbars on screenshots
+                '--mute-audio', # don’t play any audio
+                '--remote-debugging-port=0', # pick a port. XXX: we may want to use --remote-debugging-pipe instead
+                '--homepage=about:blank',
+                'about:blank']
+        # start new session, so ^C does not affect subprocess
+        self.p = await asyncio.create_subprocess_exec (*args,
+                stdout=asyncio.subprocess.DEVNULL,
+                stderr=asyncio.subprocess.DEVNULL,
+                stdin=asyncio.subprocess.DEVNULL,
+                start_new_session=True)
+        port = None
+        # chrome writes its current active devtools port to a file. due to the
+        # sleep() this is rather ugly, but should work with all versions of the
+        # browser.
+        for i in range (100):
+            try:
+                with open (os.path.join (self.userDataDir, 'DevToolsActivePort'), 'r') as fd:
+                    port = int (fd.readline ().strip ())
+                break
+            except FileNotFoundError:
+                await asyncio.sleep (0.2)
+        if port is None:
+            raise Exception ('Chrome died on us.')
+
+        return URL.build(scheme='http', host='localhost', port=port)
+
+    async def __aexit__ (self, *exc):
+        try:
+            self.p.terminate ()
+            await self.p.wait ()
+        except ProcessLookupError:
+            # ok, fine, dead already
+            pass
+
+        # Try to delete the temporary directory multiple times. It looks like
+        # Chrome will change files in there even after it exited (i.e. .wait()
+        # returned). Very strange.
+ for i in range (5): + try: + shutil.rmtree (self.userDataDir) + break + except: + await asyncio.sleep (0.2) + + self.p = None + return False + +class Passthrough: + __slots__ = ('url', ) + + def __init__ (self, url): + self.url = URL (url) + + async def __aenter__ (self): + return self.url + + async def __aexit__ (self, *exc): + return False + +def toCookieParam (m): + """ + Convert Python’s http.cookies.Morsel to Chrome’s CookieParam, see + https://chromedevtools.github.io/devtools-protocol/1-3/Network#type-CookieParam + """ + + assert isinstance (m, Morsel) + + out = {'name': m.key, 'value': m.value} + + # unsupported by chrome + for k in ('max-age', 'comment', 'version'): + if m[k]: + raise ValueError (f'Unsupported cookie attribute {k} set, cannot convert') + + for mname, cname in [('expires', None), ('path', None), ('domain', None), ('secure', None), ('httponly', 'httpOnly')]: + value = m[mname] + if value: + cname = cname or mname + out[cname] = value + + return out + diff --git a/crocoite/html.py b/crocoite/html.py index f891101..30f6ca5 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -22,6 +22,10 @@ HTML helper """ +from html5lib.treewalkers.base import TreeWalker +from html5lib.filters.base import Filter +from html5lib import constants + # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements voidTags = {'area', 'base', @@ -103,10 +107,7 @@ eventAttributes = {'onabort', 'onvolumechange', 'onwaiting'} -from html5lib.treewalkers.base import TreeWalker -from html5lib.filters.base import Filter -from html5lib.serializer import HTMLSerializer -from html5lib import constants +default_namespace = constants.namespaces["html"] class ChromeTreeWalker (TreeWalker): """ @@ -123,11 +124,14 @@ class ChromeTreeWalker (TreeWalker): elif name == '#document': for child in node.get ('children', []): yield from self.recurse (child) + elif name == '#cdata-section': + # html5lib cannot generate cdata, so we’re faking it by using + # an empty tag + yield from self.emptyTag (default_namespace, + '![CDATA[' + node['nodeValue'] + ']]', {}) else: - assert False, name + assert False, (name, node) else: - default_namespace = constants.namespaces["html"] - attributes = node.get ('attributes', []) convertedAttr = {} for i in range (0, len (attributes), 2): @@ -195,7 +199,6 @@ class StripAttributeFilter (Filter): self.attributes = set (map (str.lower, attributes)) def __iter__(self): - default_namespace = constants.namespaces["html"] for token in Filter.__iter__(self): data = token.get ('data') if data and token['type'] in {'StartTag', 'EmptyTag'}: diff --git a/crocoite/irc.py b/crocoite/irc.py new file mode 100644 index 0000000..d9c0634 --- /dev/null +++ b/crocoite/irc.py @@ -0,0 +1,671 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+IRC bot “chromebot”
+"""
+
+import asyncio, argparse, json, tempfile, time, random, os, shlex
+from datetime import datetime
+from urllib.parse import urlsplit
+from enum import IntEnum, unique
+from collections import defaultdict
+from abc import abstractmethod
+from functools import wraps
+import bottom
+import websockets
+
+from .util import StrJsonEncoder
+from .cli import cookie
+
+### helper functions ###
+def prettyTimeDelta (seconds):
+    """
+    Pretty-print seconds to human readable string 1d 1h 1m 1s
+    """
+    seconds = int(seconds)
+    days, seconds = divmod(seconds, 86400)
+    hours, seconds = divmod(seconds, 3600)
+    minutes, seconds = divmod(seconds, 60)
+    s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')]
+    s = filter (lambda x: x[0] != 0, s)
+    return ' '.join (map (lambda x: '{}{}'.format (*x), s))
+
+def prettyBytes (b):
+    """
+    Pretty-print bytes
+    """
+    prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB']
+    while b >= 1024 and len (prefixes) > 1:
+        b /= 1024
+        prefixes.pop (0)
+    return f'{b:.1f} {prefixes[0]}'
+
+def isValidUrl (s):
+    url = urlsplit (s)
+    if url.scheme and url.netloc and url.scheme in {'http', 'https'}:
+        return s
+    raise TypeError ()
+
+class NonExitingArgumentParser (argparse.ArgumentParser):
+    """ Argument parser that does not call exit(), suitable for interactive use """
+
+    def exit (self, status=0, message=None):
+        # should never be called
+        pass
+
+    def error (self, message):
+        # if we use subparsers it’s important to pass self, so we can show
+        # the correct help
+        raise Exception (self, message)
+
+    def format_usage (self):
+        return super().format_usage ().replace ('\n', ' ')
+
+class Status(IntEnum):
+    """ Job status """
+    undefined = 0
+    pending = 1
+    running = 2
+    aborted = 3
+    finished = 4
+
+# see https://arxiv.org/html/0901.4016 on how to build proquints (human-
+# pronounceable unique ids)
+toConsonant = 'bdfghjklmnprstvz'
+toVowel = 'aiou'
+
+def u16ToQuint (v):
+    """ Transform a 16 bit unsigned integer into a single quint """
+    assert 0 <= v < 2**16
+    # quints are “big-endian”
+    return ''.join ([
+        toConsonant[(v>>(4+2+4+2))&0xf],
+        toVowel[(v>>(4+2+4))&0x3],
+        toConsonant[(v>>(4+2))&0xf],
+        toVowel[(v>>4)&0x3],
+        toConsonant[(v>>0)&0xf],
+        ])
+
+def uintToQuint (v, length=2):
+    """ Turn any integer into a proquint with fixed length """
+    assert 0 <= v < 2**(length*16)
+
+    return '-'.join (reversed ([u16ToQuint ((v>>(x*16))&0xffff) for x in range (length)]))
+
+def makeJobId ():
+    """ Create job id from time and randomness source """
+    # allocate 48 bits for the time (in milliseconds) and add 16 random bits
+    # at the end (just to be sure) for a total of 64 bits. Should be enough to
+    # avoid collisions.
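+    # For reference: uintToQuint (0x7f000001, 2) yields 'lusab-babad', the
+    # proquint paper’s example encoding of 127.0.0.1; length 4 below covers
+    # the full 64 bit stamp.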
+    randbits = 16
+    stamp = (int (time.time ()*1000) << randbits) | random.randint (0, 2**randbits-1)
+    return uintToQuint (stamp, 4)
+
+class Job:
+    """ Archival job """
+
+    __slots__ = ('id', 'stats', 'rstats', 'started', 'finished', 'nick', 'status', 'process', 'url')
+
+    def __init__ (self, url, nick):
+        self.id = makeJobId ()
+        self.stats = {}
+        self.rstats = {}
+        self.started = datetime.utcnow ()
+        self.finished = None
+        self.url = url
+        # user who scheduled this job
+        self.nick = nick
+        self.status = Status.pending
+        self.process = None
+
+    def formatStatus (self):
+        stats = self.stats
+        rstats = self.rstats
+        return (f"{self.url} ({self.id}) {self.status.name}. "
+                f"{rstats.get ('have', 0)} pages finished, "
+                f"{rstats.get ('pending', 0)} pending; "
+                f"{stats.get ('crashed', 0)} crashed, "
+                f"{stats.get ('requests', 0)} requests, "
+                f"{stats.get ('failed', 0)} failed, "
+                f"{prettyBytes (stats.get ('bytesRcv', 0))} received.")
+
+@unique
+class NickMode(IntEnum):
+    # the actual numbers don’t matter, but their order must be strictly
+    # increasing (with privilege level)
+    operator = 100
+    voice = 10
+
+    @classmethod
+    def fromMode (cls, mode):
+        return {'v': cls.voice, 'o': cls.operator}[mode]
+
+    @classmethod
+    def fromNickPrefix (cls, mode):
+        return {'@': cls.operator, '+': cls.voice}[mode]
+
+    @property
+    def human (self):
+        return {self.operator: 'operator', self.voice: 'voice'}[self]
+
+class User:
+    """ IRC user """
+    __slots__ = ('name', 'modes')
+
+    def __init__ (self, name, modes=None):
+        self.name = name
+        self.modes = modes or set ()
+
+    def __eq__ (self, b):
+        return self.name == b.name
+
+    def __hash__ (self):
+        return hash (self.name)
+
+    def __repr__ (self):
+        return f'<User {self.name} {self.modes}>'
+
+    def hasPriv (self, p):
+        if p is None:
+            return True
+        else:
+            return self.modes and max (self.modes) >= p
+
+    @classmethod
+    def fromName (cls, name):
+        """ Get mode and name from NAMES command """
+        try:
+            modes = {NickMode.fromNickPrefix (name[0])}
+            name = name[1:]
+        except KeyError:
+            modes = set ()
+        return cls (name, modes)
+
+class ReplyContext:
+    __slots__ = ('client', 'target', 'user')
+
+    def __init__ (self, client, target, user):
+        self.client = client
+        self.target = target
+        self.user = user
+
+    def __call__ (self, message):
+        self.client.send ('PRIVMSG', target=self.target,
+                message=f'{self.user.name}: {message}')
+
+class RefCountEvent:
+    """
+    Ref-counted event that triggers if a) armed and b) refcount drops to zero.
+
+    Must be used as a context manager.
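+
+    A minimal usage sketch, mirroring how ArgparseBot uses its _quit event
+    below (names chosen for illustration):
+
+        event = RefCountEvent ()
+        with event: # block the armed trigger while a request runs
+            ... # handle request
+        event.arm () # request shutdown
+        await event.wait () # fires once armed and count is zero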
+ """ + __slots__ = ('count', 'event', 'armed') + + def __init__ (self): + self.armed = False + self.count = 0 + self.event = asyncio.Event () + + def __enter__ (self): + self.count += 1 + self.event.clear () + + def __exit__ (self, exc_type, exc_val, exc_tb): + self.count -= 1 + if self.armed and self.count == 0: + self.event.set () + + async def wait (self): + await self.event.wait () + + def arm (self): + self.armed = True + if self.count == 0: + self.event.set () + +class ArgparseBot (bottom.Client): + """ + Simple IRC bot using argparse + + Tracks user’s modes, reconnects on disconnect + """ + + __slots__ = ('channels', 'nick', 'parser', 'users', '_quit') + + def __init__ (self, host, port, ssl, nick, logger, channels=None, loop=None): + super().__init__ (host=host, port=port, ssl=ssl, loop=loop) + self.channels = channels or [] + self.nick = nick + # map channel -> nick -> user + self.users = defaultdict (dict) + self.logger = logger.bind (context=type (self).__name__) + self.parser = self.getParser () + + # bot does not accept new queries in shutdown mode, unless explicitly + # permitted by the parser + self._quit = RefCountEvent () + + # register bottom event handler + self.on('CLIENT_CONNECT', self.onConnect) + self.on('PING', self.onKeepalive) + self.on('PRIVMSG', self.onMessage) + self.on('CLIENT_DISCONNECT', self.onDisconnect) + self.on('RPL_NAMREPLY', self.onNameReply) + self.on('CHANNELMODE', self.onMode) + self.on('PART', self.onPart) + self.on('JOIN', self.onJoin) + # XXX: we would like to handle KICK, but bottom does not support that at the moment + + @abstractmethod + def getParser (self): + pass + + def cancel (self): + self.logger.info ('cancel', uuid='1eb34aea-a854-4fec-90b2-7f8a3812a9cd') + self._quit.arm () + + async def run (self): + await self.connect () + await self._quit.wait () + self.send ('QUIT', message='Bye.') + await self.disconnect () + + async def onConnect (self, **kwargs): + self.logger.info ('connect', nick=self.nick, uuid='01f7b138-ea53-4609-88e9-61f3eca3e7e7') + + self.send('NICK', nick=self.nick) + self.send('USER', user=self.nick, realname='https://github.com/PromyLOPh/crocoite') + + # Don't try to join channels until the server has + # sent the MOTD, or signaled that there's no MOTD. + done, pending = await asyncio.wait( + [self.wait('RPL_ENDOFMOTD'), self.wait('ERR_NOMOTD')], + loop=self.loop, return_when=asyncio.FIRST_COMPLETED) + + # Cancel whichever waiter's event didn't come in. 
+ for future in pending: + future.cancel() + + for c in self.channels: + self.logger.info ('join', channel=c, uuid='367063a5-9069-4025-907c-65ba88af8593') + self.send ('JOIN', channel=c) + # no need for NAMES here, server sends this automatically + + async def onNameReply (self, channel, users, **kwargs): + # channels may be too big for a single message + addusers = dict (map (lambda x: (x.name, x), map (User.fromName, users))) + if channel not in self.users: + self.users[channel] = addusers + else: + self.users[channel].update (addusers) + + @staticmethod + def parseMode (mode): + """ Parse mode strings like +a, -b, +a-b, -b+a, … """ + action = '+' + ret = [] + for c in mode: + if c in {'+', '-'}: + action = c + else: + ret.append ((action, c)) + return ret + + async def onMode (self, channel, modes, params, **kwargs): + if channel not in self.channels: + return + + for (action, mode), nick in zip (self.parseMode (modes), params): + try: + m = NickMode.fromMode (mode) + u = self.users[channel].get (nick, User (nick)) + if action == '+': + u.modes.add (m) + elif action == '-': + u.modes.remove (m) + except KeyError: + # unknown mode, ignore + pass + + async def onPart (self, nick, channel, **kwargs): + if channel not in self.channels: + return + + try: + self.users[channel].pop (nick) + except KeyError: + # gone already + pass + + async def onJoin (self, nick, channel, **kwargs): + if channel not in self.channels: + return + + self.users[channel][nick] = User (nick) + + async def onKeepalive (self, message, **kwargs): + """ Ping received """ + self.send('PONG', message=message) + + async def onMessage (self, nick, target, message, **kwargs): + """ Message received """ + msgPrefix = self.nick + ':' + if target in self.channels and message.startswith (msgPrefix): + user = self.users[target].get (nick, User (nick)) + reply = ReplyContext (client=self, target=target, user=user) + + # shlex.split supports quoting arguments, which str.split() does not + command = shlex.split (message[len (msgPrefix):]) + try: + args = self.parser.parse_args (command) + except Exception as e: + reply (f'{e.args[1]} -- {e.args[0].format_usage ()}') + return + if not args or not hasattr (args, 'func'): + reply (f'Sorry, I don’t understand {command}') + return + + minPriv = getattr (args, 'minPriv', None) + if self._quit.armed and not getattr (args, 'allowOnShutdown', False): + reply ('Sorry, I’m shutting down and cannot accept your request right now.') + elif not user.hasPriv (minPriv): + reply (f'Sorry, you need the privilege {minPriv.human} to use this command.') + else: + with self._quit: + await args.func (user=user, args=args, reply=reply) + + async def onDisconnect (self, **kwargs): + """ Auto-reconnect """ + self.logger.info ('disconnect', uuid='4c74b2c8-2403-4921-879d-2279ad85db72') + while True: + if not self._quit.armed: + await asyncio.sleep (10, loop=self.loop) + self.logger.info ('reconnect', uuid='c53555cb-e1a4-4b69-b1c9-3320269c19d7') + try: + await self.connect () + finally: + break + +def jobExists (func): + """ Chromebot job exists """ + @wraps (func) + async def inner (self, **kwargs): + # XXX: not sure why it works with **kwargs, but not (user, args, reply) + args = kwargs.get ('args') + reply = kwargs.get ('reply') + j = self.jobs.get (args.id, None) + if not j: + reply (f'Job {args.id} is unknown') + else: + ret = await func (self, job=j, **kwargs) + return ret + return inner + +class Chromebot (ArgparseBot): + __slots__ = ('jobs', 'tempdir', 'destdir', 'processLimit', 'blacklist', 
'needVoice')
+
+    def __init__ (self, host, port, ssl, nick, logger, channels=None,
+            tempdir=None, destdir='.', processLimit=1,
+            blacklist={}, needVoice=False, loop=None):
+        self.needVoice = needVoice
+
+        super().__init__ (host=host, port=port, ssl=ssl, nick=nick,
+                logger=logger, channels=channels, loop=loop)
+
+        self.jobs = {}
+        self.tempdir = tempdir or tempfile.gettempdir()
+        self.destdir = destdir
+        self.processLimit = asyncio.Semaphore (processLimit)
+        self.blacklist = blacklist
+
+    def getParser (self):
+        parser = NonExitingArgumentParser (prog=self.nick + ': ', add_help=False)
+        subparsers = parser.add_subparsers(help='Sub-commands')
+
+        archiveparser = subparsers.add_parser('a', help='Archive a site', add_help=False)
+        archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5))
+        archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0')
+        archiveparser.add_argument('--insecure', '-k',
+                help='Disable certificate checking', action='store_true')
+        # parsing the cookie here, so we can give early feedback without
+        # waiting for the job to crash on invalid arguments.
+        archiveparser.add_argument('--cookie', '-b', type=cookie,
+                help='Add a cookie', action='append', default=[])
+        archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL')
+        archiveparser.set_defaults (func=self.handleArchive,
+                minPriv=NickMode.voice if self.needVoice else None)
+
+        statusparser = subparsers.add_parser ('s', help='Get job status', add_help=False)
+        statusparser.add_argument('id', help='Job id', metavar='UUID')
+        statusparser.set_defaults (func=self.handleStatus, allowOnShutdown=True)
+
+        abortparser = subparsers.add_parser ('r', help='Revoke/abort job', add_help=False)
+        abortparser.add_argument('id', help='Job id', metavar='UUID')
+        abortparser.set_defaults (func=self.handleAbort, allowOnShutdown=True,
+                minPriv=NickMode.voice if self.needVoice else None)
+
+        return parser
+
+    def isBlacklisted (self, url):
+        for k, v in self.blacklist.items():
+            if k.match (url):
+                return v
+        return False
+
+    async def handleArchive (self, user, args, reply):
+        """ Handle the archive command """
+
+        msg = self.isBlacklisted (args.url)
+        if msg:
+            reply (f'{args.url} cannot be queued: {msg}')
+            return
+
+        # make sure the job id is unique. Since ids are time-based we can just
+        # wait. (asyncio.sleep, so the event loop is not blocked meanwhile)
+        while True:
+            j = Job (args.url, user.name)
+            if j.id not in self.jobs:
+                break
+            await asyncio.sleep (0.01)
+        self.jobs[j.id] = j
+
+        logger = self.logger.bind (job=j.id)
+
+        showargs = {
+                'recursive': args.recursive,
+                'concurrency': args.concurrency,
+                }
+        if args.insecure:
+            showargs['insecure'] = args.insecure
+        warcinfo = {'chromebot': {
+                'jobid': j.id,
+                'user': user.name,
+                'queued': j.started,
+                'url': args.url,
+                'recursive': args.recursive,
+                'concurrency': args.concurrency,
+                }}
+        grabCmd = ['crocoite-single']
+        # prefix warcinfo with !, so it won’t get expanded
+        grabCmd.extend (['--warcinfo',
+                '!'
+ json.dumps (warcinfo, cls=StrJsonEncoder)]) + for v in args.cookie: + grabCmd.extend (['--cookie', v.OutputString ()]) + if args.insecure: + grabCmd.append ('--insecure') + grabCmd.extend (['{url}', '{dest}']) + cmdline = ['crocoite', + '--tempdir', self.tempdir, + '--recursion', args.recursive, + '--concurrency', str (args.concurrency), + args.url, + os.path.join (self.destdir, + j.id + '-{host}-{date}-{seqnum}.warc.gz'), + '--'] + grabCmd + + strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) + reply (f'{args.url} has been queued as {j.id} with {strargs}') + logger.info ('queue', user=user.name, url=args.url, cmdline=cmdline, + uuid='36cc34a6-061b-4cc5-84a9-4ab6552c8d75') + + async with self.processLimit: + if j.status == Status.pending: + # job was not aborted + j.process = await asyncio.create_subprocess_exec (*cmdline, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + stdin=asyncio.subprocess.DEVNULL, + start_new_session=True, limit=100*1024*1024) + while True: + data = await j.process.stdout.readline () + if not data: + break + + # job is marked running after the first message is received from it + if j.status == Status.pending: + logger.info ('start', uuid='46e62d60-f498-4ab0-90e1-d08a073b10fb') + j.status = Status.running + + data = json.loads (data) + msgid = data.get ('uuid') + if msgid == '24d92d16-770e-4088-b769-4020e127a7ff': + j.stats = data + elif msgid == '5b8498e4-868d-413c-a67e-004516b8452c': + j.rstats = data + + # forward message, so the dashboard can use it + logger.info ('message', + uuid='5c0f9a11-dcd8-4182-a60f-54f4d3ab3687', + data=data) + code = await j.process.wait () + + if j.status == Status.running: + logger.info ('finish', uuid='7b40ffbb-faab-4224-90ed-cd4febd8f7ec') + j.status = Status.finished + j.finished = datetime.utcnow () + + stats = j.stats + rstats = j.rstats + reply (j.formatStatus ()) + + @jobExists + async def handleStatus (self, user, args, reply, job): + """ Handle status command """ + + rstats = job.rstats + reply (job.formatStatus ()) + + @jobExists + async def handleAbort (self, user, args, reply, job): + """ Handle abort command """ + + if job.status not in {Status.pending, Status.running}: + reply ('This job is not running.') + return + + job.status = Status.aborted + self.logger.info ('abort', job=job.id, user=user.name, + uuid='865b3b3e-a54a-4a56-a545-f38a37bac295') + if job.process and job.process.returncode is None: + job.process.terminate () + +class Dashboard: + __slots__ = ('fd', 'clients', 'loop', 'log', 'maxLog', 'pingInterval', 'pingTimeout') + # these messages will not be forwarded to the browser + ignoreMsgid = { + # connect + '01f7b138-ea53-4609-88e9-61f3eca3e7e7', + # join + '367063a5-9069-4025-907c-65ba88af8593', + # disconnect + '4c74b2c8-2403-4921-879d-2279ad85db72', + # reconnect + 'c53555cb-e1a4-4b69-b1c9-3320269c19d7', + } + + def __init__ (self, fd, loop, maxLog=1000, pingInterval=30, pingTimeout=10): + self.fd = fd + self.clients = set () + self.loop = loop + # log buffer + self.log = [] + self.maxLog = maxLog + self.pingInterval = pingInterval + self.pingTimeout = pingTimeout + + async def client (self, websocket, path): + self.clients.add (websocket) + try: + for l in self.log: + buf = json.dumps (l) + await websocket.send (buf) + while True: + try: + msg = await asyncio.wait_for (websocket.recv(), timeout=self.pingInterval) + except asyncio.TimeoutError: + try: + pong = await websocket.ping() + await asyncio.wait_for (pong, timeout=self.pingTimeout) + except 
asyncio.TimeoutError: + break + except websockets.exceptions.ConnectionClosed: + break + finally: + self.clients.remove (websocket) + + def handleStdin (self): + buf = self.fd.readline () + if not buf: + return + + try: + data = json.loads (buf) + except json.decoder.JSONDecodeError: + # ignore invalid + return + msgid = data['uuid'] + + if msgid in self.ignoreMsgid: + return + + # a few messages may contain sensitive information that we want to hide + if msgid == '36cc34a6-061b-4cc5-84a9-4ab6552c8d75': + # queue + del data['cmdline'] + elif msgid == '5c0f9a11-dcd8-4182-a60f-54f4d3ab3687': + nesteddata = data['data'] + nestedmsgid = nesteddata['uuid'] + if nestedmsgid == 'd1288fbe-8bae-42c8-af8c-f2fa8b41794f': + del nesteddata['command'] + + buf = json.dumps (data) + for c in self.clients: + # XXX can’t await here + asyncio.ensure_future (c.send (buf)) + + self.log.append (data) + while len (self.log) > self.maxLog: + self.log.pop (0) + + def run (self, host='localhost', port=6789): + self.loop.add_reader (self.fd, self.handleStdin) + return websockets.serve(self.client, host, port) + diff --git a/crocoite/logger.py b/crocoite/logger.py new file mode 100644 index 0000000..ac389ca --- /dev/null +++ b/crocoite/logger.py @@ -0,0 +1,132 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Simple logger inspired by structlog. + +It is usually used like this: Classes are passed a logger instance. They bind +context to their name, so identifying the source of messages is easier. Every +log message carries a unique id (uuid) for automated identification as well as +a short human-readable message (msg) and arbitrary payload. 
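+
+A hypothetical sketch of that pattern (all values made up for illustration):
+
+    logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
+    log = logger.bind (context='controller')
+    log.info ('fetch', uuid='66a6b80f-8a34-4d37-bf1e-96bff507e03a', url=url)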
+""" + +import sys, json +from datetime import datetime +from functools import partial +from enum import IntEnum + +from pytz import utc + +from .util import StrJsonEncoder + +class Level(IntEnum): + DEBUG = 0 + INFO = 1 + WARNING = 2 + ERROR = 3 + +class Logger: + def __init__ (self, consumer=None, bindings=None): + self.bindings = bindings or {} + self.consumer = consumer or [] + + def __call__ (self, level, *args, **kwargs): + if not isinstance (level, Level): + level = Level[level.upper ()] + kwargs['level'] = level + if args: + if len (args) == 1: + args, = args + kwargs['msg'] = args + # do not overwrite arguments + for k, v in self.bindings.items (): + if k not in kwargs: + kwargs[k] = v + for c in self.consumer: + kwargs = c (**kwargs) + return kwargs + + def __getattr__ (self, k): + """ Bind all method names to level, so Logger.info, Logger.warning, … work """ + return partial (self.__call__, k) + + def bind (self, **kwargs): + d = self.bindings.copy () + d.update (kwargs) + # consumer is not a copy intentionally, so attaching to the parent + # logger will attach to all children as well + return self.__class__ (consumer=self.consumer, bindings=d) + + def unbind (self, **kwargs): + d = self.bindings.copy () + for k in kwargs.keys (): + del d[k] + return self.__class__ (consumer=self.consumer, bindings=d) + + def connect (self, consumer): + self.consumer.append (consumer) + + def disconnect (self, consumer): + self.consumer.remove (consumer) + +class Consumer: + def __call__ (self, **kwargs): # pragma: no cover + raise NotImplementedError () + +class NullConsumer (Consumer): + def __call__ (self, **kwargs): + return kwargs + +class PrintConsumer (Consumer): + """ + Simple printing consumer + """ + def __call__ (self, **kwargs): + sys.stderr.write (str (kwargs)) + sys.stderr.write ('\n') + sys.stderr.flush () + return kwargs + +class JsonPrintConsumer (Consumer): + def __init__ (self, minLevel=Level.DEBUG): + self.minLevel = minLevel + + def __call__ (self, **kwargs): + if kwargs['level'] >= self.minLevel: + json.dump (kwargs, sys.stdout, cls=StrJsonEncoder) + sys.stdout.write ('\n') + sys.stdout.flush () + return kwargs + +class DatetimeConsumer (Consumer): + def __call__ (self, **kwargs): + kwargs['date'] = datetime.utcnow ().replace (tzinfo=utc) + return kwargs + +class WarcHandlerConsumer (Consumer): + def __init__ (self, warc, minLevel=Level.DEBUG): + self.warc = warc + self.minLevel = minLevel + + def __call__ (self, **kwargs): + if kwargs['level'] >= self.minLevel: + self.warc._writeLog (json.dumps (kwargs, cls=StrJsonEncoder)) + return kwargs + diff --git a/crocoite/task.py b/crocoite/task.py deleted file mode 100644 index e93cfde..0000000 --- a/crocoite/task.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2017–2018 crocoite contributors -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -""" -Celery distributed tasks -""" - -import os, logging - -from urllib.parse import urlsplit -from datetime import datetime -from operator import attrgetter -from itertools import chain - -def _monkeyPatchSyncTasks (): - """ Result sets don’t support the argument disable_sync_subtasks argument """ - import celery.result - celery.result.assert_will_not_block = lambda: None - -_monkeyPatchSyncTasks () -from celery import Celery -from celery.utils.log import get_task_logger - -from .browser import ChromeService -from .controller import SinglePageController, ControllerSettings, RecursiveController, defaultSettings, DepthLimit, PrefixLimit -from . import behavior -from .cli import parseRecursive - -app = Celery ('crocoite.distributed') -app.config_from_object('celeryconfig') -app.conf.task_routes = { - 'crocoite.task.archive': {'queue': 'crocoite.archive'}, - 'crocoite.task.controller': {'queue': 'crocoite.controller'}, - # <method>.chunks is actually a starmap job - 'celery.starmap': {'queue': 'crocoite.archive'}, - } -app.conf.task_default_queue = 'crocoite.default' -# disable prefetching, since our tasks usually run for a _very_ long time -app.conf.worker_prefetch_multiplier = 1 -logger = get_task_logger('crocoite.distributed.archive') - -@app.task(bind=True, track_started=True) -def archive (self, url, settings, enabledBehaviorNames): - """ - Archive a single URL - - Supports these config keys (celeryconfig): - - warc_filename = '{domain}-{date}-{id}.warc.gz' - temp_dir = '/tmp/' - finished_dir = '/tmp/finished' - """ - - parsedUrl = urlsplit (url) - outFile = app.conf.warc_filename.format ( - id=self.request.root_id, - domain=parsedUrl.hostname.replace ('/', '-'), - date=datetime.utcnow ().isoformat (), - ) - outPath = os.path.join (app.conf.temp_dir, outFile) - fd = open (outPath, 'wb') - - enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) - settings = ControllerSettings (**settings) - controller = SinglePageController (url, fd, behavior=enabledBehavior, settings=settings) - ret = controller.run () - - os.makedirs (app.conf.finished_dir, exist_ok=True) - outPath = os.path.join (app.conf.finished_dir, outFile) - os.rename (fd.name, outPath) - - return ret - -class DistributedRecursiveController (RecursiveController): - """ Distributed, recursive controller using celery """ - - def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \ - logger=logging.getLogger(__name__), settings=defaultSettings, - recursionPolicy=DepthLimit (0), concurrency=1): - super ().__init__ (url, None, service, behavior, logger, settings, recursionPolicy) - self.concurrency = concurrency - - def fetch (self, urls): - def chunksIter (urls): - for u in urls: - yield (u, self.settings.toDict (), list (map (attrgetter ('name'), self.behavior))) - itemsPerTask = len (urls)//self.concurrency - if itemsPerTask <= 0: - itemsPerTask = len (urls) - return chain.from_iterable (archive.chunks (chunksIter (urls), itemsPerTask).apply_async ().get ()) - -@app.task(bind=True, 
track_started=True) -def controller (self, url, settings, enabledBehaviorNames, recursive, concurrency): - """ Recursive controller """ - - recursionPolicy = parseRecursive (recursive, url) - enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) - settings = ControllerSettings (**settings) - controller = DistributedRecursiveController (url, None, behavior=enabledBehavior, - settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency) - return controller.run () - diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py new file mode 100644 index 0000000..1efea08 --- /dev/null +++ b/crocoite/test_behavior.py @@ -0,0 +1,266 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import asyncio, os, yaml, re, math, struct +from functools import partial +from operator import attrgetter + +import pytest +from yarl import URL +from aiohttp import web + +import pkg_resources +from .logger import Logger +from .devtools import Process +from .behavior import Scroll, Behavior, ExtractLinks, ExtractLinksEvent, Crash, \ + Screenshot, ScreenshotEvent, DomSnapshot, DomSnapshotEvent, mapOrIgnore +from .controller import SinglePageController, EventHandler, ControllerSettings +from .devtools import Crashed + +with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: + sites = list (yaml.safe_load_all (fd)) +clickParam = [] +for o in sites: + for s in o['selector']: + for u in s.get ('urls', []): + clickParam.append ((u, s['selector'])) + +class ClickTester (Behavior): + """ + Test adapter checking a given selector exists after loading the page + """ + + __slots__ = ('selector', ) + + name = 'testclick' + + def __init__ (self, loader, logger, selector): + super ().__init__ (loader, logger) + self.selector = selector + + async def onfinish (self): + tab = self.loader.tab + results = await tab.DOM.getDocument () + rootNode = results['root']['nodeId'] + results = await tab.DOM.querySelectorAll (nodeId=rootNode, selector=self.selector) + assert results['nodeIds'], self.selector + + # XXX: this is not true for every element we click. 
Github uses <button
+        # type=submit> and <form> without an event listener on the <button>
+#        # verify that an event listener exists
+#        for nid in results['nodeIds']:
+#            obj = (await tab.DOM.resolveNode (nodeId=nid))['object']
+#            assert obj['type'] == 'object'
+#            listeners = (await tab.DOMDebugger.getEventListeners (objectId=obj['objectId']))['listeners']
+#            assert any (map (lambda x: x['type'] == 'click', listeners)), listeners
+
+        return
+        yield # pragma: no cover
+
+@pytest.mark.parametrize("url,selector", clickParam)
+@pytest.mark.asyncio
+@pytest.mark.xfail(reason='depends on network access')
+async def test_click_selectors (url, selector):
+    """
+    Make sure the CSS selector exists on an example url
+    """
+    logger = Logger ()
+    settings = ControllerSettings (idleTimeout=5, timeout=10)
+    # Some selectors are loaded dynamically and require scrolling
+    controller = SinglePageController (url=url, logger=logger,
+            settings=settings,
+            service=Process (),
+            behavior=[Scroll, partial(ClickTester, selector=selector)])
+    await controller.run ()
+
+matchParam = []
+for o in sites:
+    for s in o['selector']:
+        for u in s.get ('urls', []):
+            matchParam.append ((o['match'], URL (u)))
+
+@pytest.mark.parametrize("match,url", matchParam)
+@pytest.mark.asyncio
+async def test_click_match (match, url):
+    """ Test urls must match """
+    # keep this aligned with click.js
+    assert re.match (match, url.host, re.I)
+
+
+class AccumHandler (EventHandler):
+    """ Test adapter that accumulates all incoming items """
+    __slots__ = ('data', )
+
+    def __init__ (self):
+        super().__init__ ()
+        self.data = []
+
+    async def push (self, item):
+        self.data.append (item)
+
+async def simpleServer (url, response):
+    async def f (req):
+        return web.Response (body=response, status=200, content_type='text/html', charset='utf-8')
+
+    app = web.Application ()
+    app.router.add_route ('GET', url.path, f)
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, url.host, url.port)
+    await site.start()
+    return runner
+
+@pytest.mark.asyncio
+async def test_extract_links ():
+    """
+    Make sure ExtractLinks finds the expected links on a sample page
+    """
+
+    url = URL.build (scheme='http', host='localhost', port=8080)
+    runner = await simpleServer (url, """<html><head></head>
+    <body>
+        <div>
+            <a href="/relative">foo</a>
+            <a href="http://example.com/absolute/">foo</a>
+            <a href="https://example.com/absolute/secure">foo</a>
+            <a href="#anchor">foo</a>
+            <a href="http://neue_preise_f%c3%bcr_zahnimplantate_k%c3%b6nnten_sie_%c3%bcberraschen">foo</a>
+
+            <a href="/hidden/visibility" style="visibility: hidden">foo</a>
+            <a href="/hidden/display" style="display: none">foo</a>
+            <div style="display: none">
+            <a href="/hidden/display/insidediv">foo</a>
+            </div>
+            <!--<a href="/hidden/comment">foo</a>-->
+
+            <p><img src="shapes.png" usemap="#shapes">
+            <map name="shapes"><area shape=rect coords="50,50,100,100" href="/map/rect"></map></p>
+        </div>
+    </body></html>""")
+
+    try:
+        handler = AccumHandler ()
+        logger = Logger ()
+        controller = SinglePageController (url=url, logger=logger,
+                service=Process (), behavior=[ExtractLinks], handler=[handler])
+        await controller.run ()
+
+        links = []
+        for d in handler.data:
+            if isinstance (d, ExtractLinksEvent):
+                links.extend (d.links)
+        assert sorted (links) == sorted ([
+                url.with_path ('/relative'),
+                url.with_fragment ('anchor'),
+                URL ('http://neue_preise_f%C3%BCr_zahnimplantate_k%C3%B6nnten_sie_%C3%BCberraschen'),
+                URL ('http://example.com/absolute/'),
+                URL
('https://example.com/absolute/secure'), + url.with_path ('/hidden/visibility'), # XXX: shall we ignore these as well? + url.with_path ('/map/rect'), + ]) + finally: + await runner.cleanup () + +@pytest.mark.asyncio +async def test_crash (): + """ + Crashing through Behavior works? + """ + + url = URL.build (scheme='http', host='localhost', port=8080) + runner = await simpleServer (url, '<html></html>') + + try: + logger = Logger () + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[Crash]) + with pytest.raises (Crashed): + await controller.run () + finally: + await runner.cleanup () + +@pytest.mark.asyncio +async def test_screenshot (): + """ + Make sure screenshots are taken and have the correct dimensions. We can’t + and don’t want to check their content. + """ + # ceil(0) == 0, so starting with 1 + for expectHeight in (1, Screenshot.maxDim, Screenshot.maxDim+1, Screenshot.maxDim*2+Screenshot.maxDim//2): + url = URL.build (scheme='http', host='localhost', port=8080) + runner = await simpleServer (url, f'<html><body style="margin: 0; padding: 0;"><div style="height: {expectHeight}"></div></body></html>') + + try: + handler = AccumHandler () + logger = Logger () + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[Screenshot], handler=[handler]) + await controller.run () + + screenshots = list (filter (lambda x: isinstance (x, ScreenshotEvent), handler.data)) + assert len (screenshots) == math.ceil (expectHeight/Screenshot.maxDim) + totalHeight = 0 + for s in screenshots: + assert s.url == url + # PNG ident is fixed, IHDR is always the first chunk + assert s.data.startswith (b'\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR') + width, height = struct.unpack ('>II', s.data[16:24]) + assert height <= Screenshot.maxDim + totalHeight += height + # screenshot height is at least canvas height (XXX: get hardcoded + # value from devtools.Process) + assert totalHeight == max (expectHeight, 1080) + finally: + await runner.cleanup () + +@pytest.mark.asyncio +async def test_dom_snapshot (): + """ + Behavior plug-in works, <canvas> is replaced by static image, <script> is + stripped. 
Actual conversion from Chrome DOM to HTML is validated by module + .test_html + """ + + url = URL.build (scheme='http', host='localhost', port=8080) + runner = await simpleServer (url, f'<html><body><p>ÄÖÜäöü</p><script>alert("yes");</script><canvas id="canvas" width="1" height="1">Alternate text.</canvas></body></html>') + + try: + handler = AccumHandler () + logger = Logger () + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[DomSnapshot], handler=[handler]) + await controller.run () + + snapshots = list (filter (lambda x: isinstance (x, DomSnapshotEvent), handler.data)) + assert len (snapshots) == 1 + doc = snapshots[0].document + assert doc.startswith ('<HTML><HEAD><meta charset=utf-8></HEAD><BODY><P>ÄÖÜäöü</P><IMG id=canvas width=1 height=1 src="data:image/png;base64,'.encode ('utf-8')) + assert doc.endswith ('></BODY></HTML>'.encode ('utf-8')) + finally: + await runner.cleanup () + +def test_mapOrIgnore (): + def fail (x): + if x < 50: + raise Exception () + return x+1 + + assert list (mapOrIgnore (fail, range (100))) == list (range (51, 101)) + diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py new file mode 100644 index 0000000..7084214 --- /dev/null +++ b/crocoite/test_browser.py @@ -0,0 +1,387 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+
+import asyncio, socket
+from operator import itemgetter
+from http.server import BaseHTTPRequestHandler
+from datetime import datetime
+
+from yarl import URL
+from aiohttp import web
+from multidict import CIMultiDict
+
+from hypothesis import given
+import hypothesis.strategies as st
+from hypothesis.provisional import domains
+import pytest
+
+from .browser import RequestResponsePair, SiteLoader, Request, \
+        UnicodeBody, ReferenceTimestamp, Base64Body, \
+        Response, NavigateError, PageIdle, FrameNavigated
+from .logger import Logger, Consumer
+from .devtools import Crashed, Process
+
+# if you want to know what’s going on:
+#import logging
+#logging.basicConfig(level=logging.DEBUG)
+
+class AssertConsumer (Consumer):
+    def __call__ (self, **kwargs):
+        assert 'uuid' in kwargs
+        assert 'msg' in kwargs
+        assert 'context' in kwargs
+        return kwargs
+
+@pytest.fixture
+def logger ():
+    return Logger (consumer=[AssertConsumer ()])
+
+@pytest.fixture
+async def loader (logger):
+    async with Process () as browser, SiteLoader (browser, logger) as l:
+        yield l
+
+@pytest.mark.asyncio
+async def test_crash (loader):
+    with pytest.raises (Crashed):
+        await loader.tab.Page.crash ()
+
+@pytest.mark.asyncio
+async def test_invalidurl (loader):
+    host = 'nonexistent.example'
+
+    # make sure the url does *not* resolve (some DNS-intercepting ISPs mess
+    # with this)
+    loop = asyncio.get_event_loop ()
+    try:
+        resolved = await loop.getaddrinfo (host, None)
+    except socket.gaierror:
+        url = URL.build (scheme='http', host=host)
+        with pytest.raises (NavigateError):
+            await loader.navigate (url)
+    else:
+        pytest.skip (f'host {host} resolved to {resolved}')
+
+timestamp = st.one_of (
+        st.integers(min_value=0, max_value=2**32-1),
+        st.floats (min_value=0, max_value=2**32-1),
+        )
+
+@given(timestamp, timestamp, timestamp)
+def test_referencetimestamp (relativeA, absoluteA, relativeB):
+    ts = ReferenceTimestamp (relativeA, absoluteA)
+    absoluteA = datetime.utcfromtimestamp (absoluteA)
+    absoluteB = ts (relativeB)
+    assert (absoluteA < absoluteB and relativeA < relativeB) or \
+            (absoluteA >= absoluteB and relativeA >= relativeB)
+    assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6
+
+def urls ():
+    """ Build http/https URL """
+    scheme = st.sampled_from (['http', 'https'])
+    # Path must start with a slash
+    pathSt = st.builds (lambda x: '/' + x, st.text ())
+    args = st.fixed_dictionaries ({
+        'scheme': scheme,
+        'host': domains (),
+        'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)),
+        'path': pathSt,
+        'query_string': st.text (),
+        'fragment': st.text (),
+        })
+    return st.builds (lambda x: URL.build (**x), args)
+
+def urlsStr ():
+    return st.builds (lambda x: str (x), urls ())
+
+asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126))
+
+def chromeHeaders ():
+    # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6
+    token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~')
+    # XXX: the value should be asciiText without leading/trailing spaces
+    return st.dictionaries (token, token)
+
+def fixedDicts (fixed, dynamic):
+    # dict.update () returns None, so merge into a fresh dict instead
+    return st.builds (lambda x, y: {**x, **dict (y)}, st.fixed_dictionaries (fixed), st.lists (dynamic))
+
+def chromeRequestWillBeSent (reqid, url):
+    methodSt = st.sampled_from (['GET', 'POST', 'PUT', 'DELETE'])
+    return st.fixed_dictionaries ({
+        'requestId': reqid,
+        'initiator': st.just ('Test'),
+        'wallTime': timestamp,
+        'timestamp': timestamp,
+        'request': st.fixed_dictionaries ({
+            'url': url,
+            'method': methodSt,
+            'headers': chromeHeaders (),
+            # XXX: postData, hasPostData
+            })
+        })
+
+def chromeResponseReceived (reqid, url):
+    mimeTypeSt = st.one_of (st.none (), st.just ('text/html'))
+    remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1'))
+    protocolSt = st.one_of (st.none (), st.just ('h2'))
+    statusCodeSt = st.integers (min_value=100, max_value=999)
+    typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media',
+            'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource',
+            'WebSocket', 'Manifest', 'SignedExchange', 'Ping',
+            'CSPViolationReport', 'Other'])
+    return st.fixed_dictionaries ({
+        'requestId': reqid,
+        'timestamp': timestamp,
+        'type': typeSt,
+        'response': st.fixed_dictionaries ({
+            'url': url,
+            'requestHeaders': chromeHeaders (), # XXX: make this optional
+            'headers': chromeHeaders (),
+            'status': statusCodeSt,
+            'statusText': asciiText,
+            'mimeType': mimeTypeSt,
+            'remoteIPAddress': remoteIpAddressSt,
+            'protocol': protocolSt,
+            })
+        })
+
+def chromeReqResp ():
+    # XXX: will this generate the same url for all test cases?
+    reqid = st.shared (st.text (), 'reqresp')
+    url = st.shared (urlsStr (), 'reqresp')
+    return st.tuples (chromeRequestWillBeSent (reqid, url),
+            chromeResponseReceived (reqid, url))
+
+def requestResponsePair ():
+    def f (creq, cresp, hasPostData, reqBody, respBody):
+        i = RequestResponsePair ()
+        i.fromRequestWillBeSent (creq)
+        i.request.hasPostData = hasPostData
+        if hasPostData:
+            i.request.body = reqBody
+
+        if cresp is not None:
+            i.fromResponseReceived (cresp)
+            if respBody is not None:
+                i.response.body = respBody
+        return i
+
+    bodySt = st.one_of (
+            st.none (),
+            st.builds (UnicodeBody, st.text ()),
+            st.builds (Base64Body.fromBytes, st.binary ())
+            )
+    return st.builds (lambda reqresp, hasPostData, reqBody, respBody:
+            f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody),
+            chromeReqResp (), st.booleans (), bodySt, bodySt)
+
+@given(chromeReqResp ())
+def test_requestResponsePair (creqresp):
+    creq, cresp = creqresp
+
+    item = RequestResponsePair ()
+
+    assert item.id is None
+    assert item.url is None
+    assert item.request is None
+    assert item.response is None
+
+    item.fromRequestWillBeSent (creq)
+
+    assert item.id == creq['requestId']
+    url = URL (creq['request']['url'])
+    assert item.url == url
+    assert item.request is not None
+    assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime'])
+    assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ())
+    assert item.response is None
+
+    item.fromResponseReceived (cresp)
+
+    # url will not be overwritten
+    assert item.id == creq['requestId'] == cresp['requestId']
+    assert item.url == url
+    assert item.request is not None
+    assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ())
+    assert item.response is not None
+    assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ())
+    assert (item.response.timestamp - item.request.timestamp).total_seconds () - \
+            (cresp['timestamp'] - creq['timestamp']) < 10e-6
+
+@given(chromeReqResp ())
+def test_requestResponsePair_eq (creqresp):
+    creq, cresp = creqresp
+
+    item = RequestResponsePair ()
+    item2 = RequestResponsePair ()
+    assert item == item
+    assert item == item2
+
+    item.fromRequestWillBeSent (creq)
+    assert item != item2
+    item2.fromRequestWillBeSent (creq)
+    assert item == item
+    assert item == item2
+
+    item.fromResponseReceived (cresp)
assert item != item2 + item2.fromResponseReceived (cresp) + assert item == item + assert item == item2 + + # XXX: test for inequality with different parameters + +### Google Chrome integration tests ### + +serverUrl = URL.build (scheme='http', host='localhost', port=8080) +items = [ + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-8'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/latin1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-16'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/ISO-8859-1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/status/200'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]), + body=b'', + mimeType='text/plain'), + ), + # redirects never have a response body + RequestResponsePair ( + url=serverUrl.with_path ('/status/301'), + request=Request (method='GET'), + response=Response (status=301, + headers=CIMultiDict ([('Content-Type', 'text/plain'), + ('Location', str (serverUrl.with_path ('/status/301/redirected')))]), + body=None, + mimeType='text/plain'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/image/png'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]), + body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), + mimeType='image/png'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/script/alert'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('''<html><body><script> +window.addEventListener("beforeunload", function (e) { + e.returnValue = "bye?"; + return e.returnValue; +}); +alert("stopping here"); +if (confirm("are you sure?") || prompt ("42?")) { + window.location = "/nonexistent"; +} +</script></body></html>'''), mimeType='text/html') + ), + ] + +@pytest.mark.asyncio +# would be nice if we could use hypothesis here somehow +@pytest.mark.parametrize("golden", items) +async def test_integration_item (loader, golden): + async def f (req): + body = golden.response.body + contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else '' + charsetOff = contentType.find ('charset=') + if isinstance (body, UnicodeBody) and charsetOff != -1: + encoding = contentType[charsetOff+len ('charset='):] + body = golden.response.body.decode ('utf-8').encode (encoding) + return web.Response (body=body, status=golden.response.status, + 
headers=golden.response.headers) + + app = web.Application () + app.router.add_route (golden.request.method, golden.url.path, f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, serverUrl.host, serverUrl.port) + try: + await site.start() + except Exception as e: + pytest.skip (e) + + haveReqResp = False + haveNavigated = False + try: + await loader.navigate (golden.url) + + it = loader.__aiter__ () + while True: + try: + item = await asyncio.wait_for (it.__anext__ (), timeout=1) + except asyncio.TimeoutError: + break + # XXX: can only check the first req/resp right now (due to redirect) + if isinstance (item, RequestResponsePair) and not haveReqResp: + # we do not know this in advance + item.request.initiator = None + item.request.headers = None + item.remoteIpAddress = None + item.protocol = None + item.resourceType = None + + if item.response: + assert item.response.statusText is not None + item.response.statusText = None + + del item.response.headers['server'] + del item.response.headers['content-length'] + del item.response.headers['date'] + assert item == golden + haveReqResp = True + elif isinstance (item, FrameNavigated): + # XXX: can’t check this, because of the redirect + #assert item.url == golden.url + haveNavigated = True + finally: + assert haveReqResp + assert haveNavigated + await runner.cleanup () + +def test_page_idle (): + for v in (True, False): + idle = PageIdle (v) + assert bool (idle) == v + + diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py new file mode 100644 index 0000000..7216a42 --- /dev/null +++ b/crocoite/test_controller.py @@ -0,0 +1,203 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +import asyncio + +from yarl import URL +from aiohttp import web + +import pytest + +from .logger import Logger +from .controller import ControllerSettings, SinglePageController, SetEntry, \ + IdleStateTracker +from .browser import PageIdle +from .devtools import Process +from .test_browser import loader + +@pytest.mark.asyncio +async def test_controller_timeout (): + """ Make sure the controller terminates, even if the site keeps reloading/fetching stuff """ + + async def f (req): + return web.Response (body="""<html> +<body> +<p>hello</p> +<script> +window.setTimeout (function () { window.location = '/' }, 250); +window.setInterval (function () { fetch('/').then (function (e) { console.log (e) }) }, 150); +</script> +</body> +</html>""", status=200, content_type='text/html', charset='utf-8') + + url = URL.build (scheme='http', host='localhost', port=8080) + app = web.Application () + app.router.add_route ('GET', '/', f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, url.host, url.port) + await site.start() + + loop = asyncio.get_event_loop () + try: + logger = Logger () + settings = ControllerSettings (idleTimeout=1, timeout=5) + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[], settings=settings) + # give the controller a little more time to finish, since there are + # hard-coded asyncio.sleep calls in there right now. + # XXX fix this + before = loop.time () + await asyncio.wait_for (controller.run (), timeout=settings.timeout*2) + after = loop.time () + assert after-before >= settings.timeout, (settings.timeout*2, after-before) + finally: + # give the browser some time to close before interrupting the + # connection by destroying the HTTP server + await asyncio.sleep (1) + await runner.cleanup () + +@pytest.mark.asyncio +async def test_controller_idle_timeout (): + """ Make sure the controller terminates, even if the site keeps reloading/fetching stuff """ + + async def f (req): + return web.Response (body="""<html> +<body> +<p>hello</p> +<script> +window.setInterval (function () { fetch('/').then (function (e) { console.log (e) }) }, 2000); +</script> +</body> +</html>""", status=200, content_type='text/html', charset='utf-8') + + url = URL.build (scheme='http', host='localhost', port=8080) + app = web.Application () + app.router.add_route ('GET', '/', f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, url.host, url.port) + await site.start() + + loop = asyncio.get_event_loop () + try: + logger = Logger () + settings = ControllerSettings (idleTimeout=1, timeout=60) + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[], settings=settings) + before = loop.time () + await asyncio.wait_for (controller.run (), settings.timeout*2) + after = loop.time () + assert settings.idleTimeout <= after-before <= settings.idleTimeout*2+3 + finally: + await runner.cleanup () + +def test_set_entry (): + a = SetEntry (1, a=2, b=3) + assert a == a + assert hash (a) == hash (a) + + b = SetEntry (1, a=2, b=4) + assert a == b + assert hash (a) == hash (b) + + c = SetEntry (2, a=2, b=3) + assert a != c + assert hash (a) != hash (c) + +@pytest.mark.asyncio +async def test_idle_state_tracker (): + # default is idle + loop = asyncio.get_event_loop () + idle = IdleStateTracker (loop) + assert idle._idle + + # idle change + await idle.push (PageIdle (False)) + assert not idle._idle + + # nothing happens for other objects + await idle.push ({}) + assert not 
idle._idle + + # no state change -> wait does not return + with pytest.raises (asyncio.TimeoutError): + await asyncio.wait_for (idle.wait (0.1), timeout=1) + + # wait at least timeout + delta = 0.2 + timeout = 1 + await idle.push (PageIdle (True)) + assert idle._idle + start = loop.time () + await idle.wait (timeout) + end = loop.time () + assert (timeout-delta) < (end-start) < (timeout+delta) + +@pytest.fixture +async def recordingServer (): + """ Simple HTTP server that records raw requests """ + url = URL ('http://localhost:8080') + reqs = [] + async def record (request): + reqs.append (request) + return web.Response(text='ok', content_type='text/plain') + app = web.Application() + app.add_routes([web.get(url.path, record)]) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite (runner, url.host, url.port) + await site.start() + yield url, reqs + await runner.cleanup () + +from .test_devtools import tab, browser +from http.cookies import Morsel, SimpleCookie + +@pytest.mark.asyncio +async def test_set_cookies (tab, recordingServer): + """ Make sure cookies are set properly and only affect the domain they were + set for """ + + logger = Logger () + + url, reqs = recordingServer + + cookies = [] + c = Morsel () + c.set ('foo', 'bar', '') + c['domain'] = 'localhost' + cookies.append (c) + c = Morsel () + c.set ('buz', 'beef', '') + c['domain'] = 'nonexistent.example' + + settings = ControllerSettings (idleTimeout=1, timeout=60, cookies=cookies) + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[], settings=settings) + await asyncio.wait_for (controller.run (), settings.timeout*2) + + assert len (reqs) == 1 + req = reqs[0] + reqCookies = SimpleCookie (req.headers['cookie']) + assert len (reqCookies) == 1 + c = next (iter (reqCookies.values ())) + assert c.key == cookies[0].key + assert c.value == cookies[0].value diff --git a/crocoite/test_devtools.py b/crocoite/test_devtools.py new file mode 100644 index 0000000..bd1a828 --- /dev/null +++ b/crocoite/test_devtools.py @@ -0,0 +1,208 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+
+import asyncio
+import pytest
+
+from aiohttp import web
+import websockets
+
+from .devtools import Browser, Tab, MethodNotFound, Crashed, \
+        InvalidParameter, Process, Passthrough
+
+@pytest.fixture
+async def browser ():
+    async with Process () as url:
+        yield Browser (url)
+
+@pytest.fixture
+async def tab (browser):
+    async with browser as tab:
+        yield tab
+        # make sure there are no transactions left over (i.e. no unawaited requests)
+        assert not tab.transactions
+
+docBody = "<html><body><p>Hello, world</p></body></html>"
+async def hello(request):
+    return web.Response(text=docBody, content_type='text/html')
+
+@pytest.fixture
+async def server ():
+    """ Simple HTTP server for testing notifications """
+    app = web.Application()
+    app.add_routes([web.get('/', hello)])
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, 'localhost', 8080)
+    await site.start()
+    yield app
+    await runner.cleanup ()
+
+@pytest.mark.asyncio
+async def test_tab_create (tab):
+    """ Creating tabs works """
+    assert isinstance (tab, Tab)
+    version = await tab.Browser.getVersion ()
+    assert 'protocolVersion' in version
+    assert tab.pending == 0
+
+@pytest.mark.asyncio
+async def test_tab_close (browser):
+    """ Tabs are closed after using them """
+    async with browser as tab:
+        tid = tab.id
+    # give the browser some time to close the tab
+    await asyncio.sleep (0.5)
+    tabs = [t['id'] async for t in browser]
+    assert tid not in tabs
+
+@pytest.mark.asyncio
+async def test_tab_notify_enable_disable (tab):
+    """ Make sure enabling/disabling notifications works for all known
+    namespaces """
+    for name in ('Debugger', 'DOM', 'Log', 'Network', 'Page', 'Performance',
+            'Profiler', 'Runtime', 'Security'):
+        f = getattr (tab, name)
+        await f.enable ()
+        await f.disable ()
+
+@pytest.mark.asyncio
+async def test_tab_unknown_method (tab):
+    with pytest.raises (MethodNotFound):
+        await tab.Nonexistent.foobar ()
+
+@pytest.mark.asyncio
+async def test_tab_invalid_argument (tab):
+    # format should be a string
+    with pytest.raises (InvalidParameter):
+        await tab.Page.captureScreenshot (format=123)
+
+    with pytest.raises (InvalidParameter):
+        await tab.Page.captureScreenshot (format=[123])
+
+    with pytest.raises (InvalidParameter):
+        await tab.Page.captureScreenshot (format={123: '456'})
+
+@pytest.mark.asyncio
+async def test_tab_crash (tab):
+    with pytest.raises (Crashed):
+        await tab.Page.crash ()
+
+    # calling anything else now should fail as well
+    with pytest.raises (Crashed):
+        await tab.Browser.getVersion ()
+
+@pytest.mark.asyncio
+async def test_load (tab, server):
+    await tab.Network.enable ()
+    await tab.Page.navigate (url='http://localhost:8080')
+
+    haveRequest = False
+    haveResponse = False
+    haveData = False
+    haveFinished = False
+    haveBody = False
+    req = None
+    resp = None
+    while not haveBody:
+        method, data = await tab.get ()
+
+        # these two can arrive in either order
+        if method in (tab.Network.requestWillBeSent, tab.Network.requestWillBeSentExtraInfo) and not haveResponse:
+            if req is None:
+                req = data
+            assert data['requestId'] == req['requestId']
+            haveRequest = True
+        elif method in (tab.Network.responseReceived, tab.Network.responseReceivedExtraInfo) and haveRequest:
+            if resp is None:
+                resp = data
+            assert data['requestId'] == resp['requestId']
+            haveResponse = True
+        elif haveRequest and haveResponse and method == tab.Network.dataReceived:
+            assert data['dataLength'] == len (docBody)
+            assert data['requestId'] == req['requestId']
+            haveData = True
+        elif
haveData: + assert method == tab.Network.loadingFinished + assert data['requestId'] == req['requestId'] + haveBody = True + elif haveFinished: + body = await tab.Network.getResponseBody (requestId=req['requestId']) + assert body['body'] == docBody + haveBody = True + else: + assert False, (method, req) + + await tab.Network.disable () + assert tab.pending == 0 + +@pytest.mark.asyncio +async def test_recv_failure(browser): + """ Inject failure into receiver process and crash it """ + async with browser as tab: + await tab.ws.close () + with pytest.raises (Crashed): + await tab.Browser.getVersion () + + async with browser as tab: + await tab.ws.close () + with pytest.raises (Crashed): + await tab.get () + + async with browser as tab: + handle = asyncio.ensure_future (tab.get ()) + await tab.ws.close () + with pytest.raises (Crashed): + await handle + +@pytest.mark.asyncio +async def test_tab_function (tab): + assert tab.Network.enable.name == 'Network.enable' + assert tab.Network.disable == tab.Network.disable + assert tab.Network.enable != tab.Network.disable + assert tab.Network != tab.Network.enable + assert callable (tab.Network.enable) + assert not callable (tab.Network.enable.name) + assert 'Network.enable' in repr (tab.Network.enable) + +@pytest.mark.asyncio +async def test_tab_function_hash (tab): + d = {tab.Network.enable: 1, tab.Network.disable: 2, tab.Page: 3, + tab.Page.enable: 4} + assert len (d) == 4 + +@pytest.mark.asyncio +async def test_ws_ping(tab): + """ + Chrome does not like websocket pings and closes the connection if it + receives one. Not sure why. + """ + with pytest.raises (Crashed): + await tab.ws.ping () + await tab.Browser.getVersion () + +@pytest.mark.asyncio +async def test_passthrough (): + """ Null service returns the url as is """ + + url = 'http://localhost:12345' + async with Passthrough (url) as u: + assert str (u) == url + diff --git a/crocoite/test_html.py b/crocoite/test_html.py new file mode 100644 index 0000000..c17903b --- /dev/null +++ b/crocoite/test_html.py @@ -0,0 +1,96 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +import asyncio +import pytest, html5lib +from html5lib.serializer import HTMLSerializer +from html5lib.treewalkers import getTreeWalker +from aiohttp import web + +from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker +from .test_devtools import tab, browser + +def test_strip_tag (): + d = html5lib.parse ('<a>barbaz<b>foobar</b>.</a><b>foobar</b>.<b attr=1><c></c>') + stream = StripTagFilter (getTreeWalker ('etree')(d), ['b', 'c']) + serializer = HTMLSerializer () + assert serializer.render (stream) == '<a>barbaz.</a>.' + +def test_strip_attribute (): + d = html5lib.parse ('<a b=1 c="yes" d></a><br b=2 c="no" d keep=1>') + stream = StripAttributeFilter (getTreeWalker ('etree')(d), ['b', 'c', 'd']) + serializer = HTMLSerializer () + assert serializer.render (stream) == '<a></a><br keep=1>' + +@pytest.mark.asyncio +async def test_treewalker (tab): + frames = await tab.Page.getFrameTree () + + framehtml = '<HTML><HEAD></HEAD><BODY></BODY></HTML>' + html = '<HTML><HEAD><META charset=utf-8></HEAD><BODY><H1>Hello</H1><!-- comment --><IFRAME></IFRAME></BODY></HTML>' + rootframe = frames['frameTree']['frame']['id'] + await tab.Page.setDocumentContent (frameId=rootframe, html=html) + + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + docs = list (ChromeTreeWalker (dom['root']).split ()) + assert len(docs) == 2 + for i, doc in enumerate (docs): + walker = ChromeTreeWalker (doc) + serializer = HTMLSerializer () + result = serializer.render (iter(walker)) + if i == 0: + assert result == html + elif i == 1: + assert result == framehtml + +cdataDoc = '<test><![CDATA[Hello world]]></test>' +xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>' +async def hello(request): + return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml') + +@pytest.fixture +async def server (): + """ Simple HTTP server for testing notifications """ + app = web.Application() + app.add_routes([web.get('/test.xml', hello)]) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8080) + await site.start() + yield app + await runner.cleanup () + +@pytest.mark.asyncio +async def test_treewalker_cdata (tab, server): + ret = await tab.Page.navigate (url='http://localhost:8080/test.xml') + # wait until loaded XXX: replace with idle check + await asyncio.sleep (0.5) + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + docs = list (ChromeTreeWalker (dom['root']).split ()) + assert len(docs) == 1 + for i, doc in enumerate (docs): + walker = ChromeTreeWalker (doc) + serializer = HTMLSerializer () + result = serializer.render (iter(walker)) + # chrome will display a pretty-printed viewer *plus* the original + # source (stripped of its xml header) + assert cdataDoc in result + + diff --git a/crocoite/test_irc.py b/crocoite/test_irc.py new file mode 100644 index 0000000..9344de4 --- /dev/null +++ b/crocoite/test_irc.py @@ -0,0 +1,70 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the 
Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import pytest +from .irc import ArgparseBot, RefCountEvent, User, NickMode + +def test_mode_parse (): + assert ArgparseBot.parseMode ('+a') == [('+', 'a')] + assert ArgparseBot.parseMode ('+ab') == [('+', 'a'), ('+', 'b')] + assert ArgparseBot.parseMode ('+a+b') == [('+', 'a'), ('+', 'b')] + assert ArgparseBot.parseMode ('-a') == [('-', 'a')] + assert ArgparseBot.parseMode ('-ab') == [('-', 'a'), ('-', 'b')] + assert ArgparseBot.parseMode ('-a-b') == [('-', 'a'), ('-', 'b')] + assert ArgparseBot.parseMode ('+a-b') == [('+', 'a'), ('-', 'b')] + assert ArgparseBot.parseMode ('-a+b') == [('-', 'a'), ('+', 'b')] + assert ArgparseBot.parseMode ('-ab+cd') == [('-', 'a'), ('-', 'b'), ('+', 'c'), ('+', 'd')] + +@pytest.fixture +def event (): + return RefCountEvent () + +def test_refcountevent_arm (event): + event.arm () + assert event.event.is_set () + +def test_refcountevent_ctxmgr (event): + with event: + assert event.count == 1 + with event: + assert event.count == 2 + +def test_refcountevent_arm_with (event): + with event: + event.arm () + assert not event.event.is_set () + assert event.event.is_set () + +def test_nick_mode (): + a = User.fromName ('a') + a2 = User.fromName ('a') + a3 = User.fromName ('+a') + b = User.fromName ('+b') + c = User.fromName ('@c') + + # equality is based on name only, not mode + assert a == a2 + assert a == a3 + assert a != b + + assert a.hasPriv (None) and not a.hasPriv (NickMode.voice) and not a.hasPriv (NickMode.operator) + assert b.hasPriv (None) and b.hasPriv (NickMode.voice) and not b.hasPriv (NickMode.operator) + assert c.hasPriv (None) and c.hasPriv (NickMode.voice) and c.hasPriv (NickMode.operator) + diff --git a/crocoite/test_logger.py b/crocoite/test_logger.py new file mode 100644 index 0000000..26e420a --- /dev/null +++ b/crocoite/test_logger.py @@ -0,0 +1,91 @@ +import pytest +from .logger import Logger, Consumer, NullConsumer, Level, DatetimeConsumer + +@pytest.fixture +def logger (): + return Logger (consumer=[NullConsumer (), DatetimeConsumer ()]) + +class QueueConsumer (Consumer): + def __init__ (self): + self.data = [] + + def __call__ (self, **kwargs): + self.data.append (kwargs) + return kwargs + +def test_bind (logger): + # simple bind + logger = logger.bind (foo='bar') + ret = logger.debug () + assert ret['foo'] == 'bar' + + # additional + ret = logger.debug (bar='baz') + assert ret['foo'] == 'bar' + assert ret['bar'] == 'baz' + + # override + ret = logger.debug (foo='baz') + assert ret['foo'] == 'baz' + + # unbind + logger = logger.unbind (foo=None) + ret = logger.debug () + assert 'foo' not in ret + +def test_consumer (logger): + c = QueueConsumer () + logger.connect (c) + ret = logger.debug (foo='bar') + assert len (c.data) == 1 + assert c.data[0] == ret + assert ret['foo'] == 'bar' + c.data = [] + + # inheritance + logger = logger.bind (inherit=1) + ret = logger.debug (foo='bar') + assert len (c.data) == 1 + assert c.data[0] == ret + assert ret['foo'] == 'bar' + assert ret['inherit'] == 1 + c.data = [] + + # removal + logger.disconnect (c) + ret = 
logger.debug (foo='bar')
+    assert len (c.data) == 0
+    assert ret['foo'] == 'bar'
+    assert ret['inherit'] == 1
+
+def test_multiarg (logger):
+    # single argument
+    ret = logger.debug('maybe', foo='bar')
+    assert ret['msg'] == 'maybe'
+    assert ret['foo'] == 'bar'
+
+    # multi arguments
+    ret = logger.debug('may', 'be', foo='bar')
+    assert ret['msg'] == ('may', 'be')
+    assert ret['foo'] == 'bar'
+
+def test_call (logger):
+    for level in ('debug', Level.DEBUG):
+        ret = logger(level, 'arg1', 'arg2', foo='bar')
+        assert ret['level'] == Level.DEBUG
+        assert ret['msg'] == ('arg1', 'arg2')
+        assert ret['foo'] == 'bar'
+
+def test_datetime (logger):
+    ret = logger.debug()
+    assert 'date' in ret
+
+def test_independence ():
+    """ Make sure two instances are completely independent """
+    l1 = Logger ()
+    c = QueueConsumer ()
+    l1.connect (c)
+    l2 = Logger ()
+    l2.info (nothing='nothing')
+    assert not c.data
+
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py
new file mode 100644
index 0000000..416b954
--- /dev/null
+++ b/crocoite/test_tools.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2018 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+from tempfile import NamedTemporaryFile
+from operator import itemgetter
+from io import BytesIO
+import pytest
+from warcio.archiveiterator import ArchiveIterator
+from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
+from pkg_resources import parse_version
+
+from .tools import mergeWarc, Errata, FixableErrata
+
+@pytest.fixture
+def writer():
+    return WARCWriter (NamedTemporaryFile(), gzip=True)
+
+def recordsEqual(golden, underTest):
+    for a, b in zip (golden, underTest):
+        # record ids are not predictable, so we cannot compare them. Ditto for
+        # dates. Content-* seems to be added when writing to file.
+        for x in {'WARC-Record-Id', 'WARC-Block-Digest', 'WARC-Date',
+                'Content-Length', 'Content-Type'}:
+            a.rec_headers.remove_header(x)
+            b.rec_headers.remove_header(x)
+        aheader = sorted(a.rec_headers.headers, key=itemgetter(0))
+        bheader = sorted(b.rec_headers.headers, key=itemgetter(0))
+        assert aheader == bheader
+        assert a.http_headers == b.http_headers
+
+def makeGolden(writer, records):
+    # additional warcinfo is written. Content does not matter.
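+    # mergeWarc prepends its own warcinfo record describing the merge;
+    # recordsEqual only compares (sorted) record headers and HTTP headers,
+    # never payloads, so an empty payload is good enough here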
+ record = writer.create_warc_record ( + '', + 'warcinfo', + payload=b'', + warc_headers_dict={'Content-Type': 'application/json; charset=utf-8'}) + records.insert (0, record) + return records + +def test_unmodified(writer): + """ + Single request/response pair, no revisits + """ + + records = [] + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def test_different_payload(writer): + """ + Duplicate URL, but different payload + """ + + records = [] + for i in range (2): + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', + payload=BytesIO(f'data{i}'.encode ('utf8')), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def makeRevisit(writer, ref, dup): + """ Make revisit record for reference """ + dupHeaders = dup.rec_headers + refHeaders = ref.rec_headers + record = writer.create_revisit_record (dupHeaders.get_header('WARC-Target-URI'), + digest=refHeaders.get_header('WARC-Payload-Digest'), + refers_to_uri=refHeaders.get_header('WARC-Target-URI'), + refers_to_date=refHeaders.get_header('WARC-Date'), + http_headers=dup.http_headers) + record.rec_headers.add_header ('WARC-Refers-To', refHeaders.get_header('WARC-Record-ID')) + record.rec_headers.add_header ('WARC-Truncated', 'length') + return record + +def test_resp_revisit_same_url(writer): + """ + Duplicate record for the same URL, creates a revisit + """ + + records = [] + for i in range (2): + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + dup = records.pop () + ref = records[1] + records.append (makeRevisit (writer, ref, dup)) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def 
test_resp_revisit_other_url(writer): + """ + Duplicate record for different URL, creates a revisit + """ + + records = [] + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/one', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/one', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + dup = records.pop () + ref = records[1] + records.append (makeRevisit (writer, ref, dup)) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def test_errata_contains(): + """ Test version matching """ + e = Errata('some-uuid', 'description', ['a<1.0']) + assert {'a': parse_version('0.1')} in e + assert {'a': parse_version('1.0')} not in e + assert {'b': parse_version('1.0')} not in e + + e = Errata('some-uuid', 'description', ['a<1.0,>0.1']) + assert {'a': parse_version('0.1')} not in e + assert {'a': parse_version('0.2')} in e + assert {'a': parse_version('1.0')} not in e + + # a AND b + e = Errata('some-uuid', 'description', ['a<1.0', 'b>1.0']) + assert {'a': parse_version('0.1')} not in e + assert {'b': parse_version('1.1')} not in e + assert {'a': parse_version('0.1'), 'b': parse_version('1.1')} in e + +def test_errata_fixable (): + e = Errata('some-uuid', 'description', ['a<1.0', 'b>1.0']) + assert not e.fixable + + e = FixableErrata('some-uuid', 'description', ['a<1.0', 'b>1.0']) + assert e.fixable + diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py new file mode 100644 index 0000000..3ec310c --- /dev/null +++ b/crocoite/test_warc.py @@ -0,0 +1,225 @@ +# Copyright (c) 2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +from tempfile import NamedTemporaryFile +import json, urllib +from operator import itemgetter + +from warcio.archiveiterator import ArchiveIterator +from yarl import URL +from multidict import CIMultiDict +from hypothesis import given, reproduce_failure +import hypothesis.strategies as st +import pytest + +from .warc import WarcHandler +from .logger import Logger, WarcHandlerConsumer +from .controller import ControllerStart +from .behavior import Script, ScreenshotEvent, DomSnapshotEvent +from .browser import RequestResponsePair, Base64Body, UnicodeBody +from .test_browser import requestResponsePair, urls + +def test_log (): + logger = Logger () + + with NamedTemporaryFile() as fd: + with WarcHandler (fd, logger) as handler: + warclogger = WarcHandlerConsumer (handler) + logger.connect (warclogger) + golden = [] + + assert handler.log.tell () == 0 + golden.append (logger.info (foo=1, bar='baz', encoding='äöü⇔ΓΨ')) + assert handler.log.tell () != 0 + + handler.maxLogSize = 0 + golden.append (logger.info (bar=1, baz='baz')) + # should flush the log + assert handler.log.tell () == 0 + + fd.seek (0) + for it in ArchiveIterator (fd): + headers = it.rec_headers + assert headers['warc-type'] == 'metadata' + assert 'warc-target-uri' not in headers + assert headers['x-crocoite-type'] == 'log' + assert headers['content-type'] == f'application/json; charset={handler.logEncoding}' + + while True: + l = it.raw_stream.readline () + if not l: + break + data = json.loads (l.strip ()) + assert data == golden.pop (0) + +def jsonObject (): + """ JSON-encodable objects """ + return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ())) + +def viewport (): + return st.builds (lambda x, y: f'{x}x{y}', st.integers (), st.integers ()) + +def event (): + return st.one_of ( + st.builds (ControllerStart, jsonObject ()), + st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())), + st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()), + st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()), + requestResponsePair (), + ) + +@pytest.mark.asyncio +@given (st.lists (event ())) +async def test_push (golden): + def checkWarcinfoId (headers): + if lastWarcinfoRecordid is not None: + assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid + + lastWarcinfoRecordid = None + + # null logger + logger = Logger () + with open('/tmp/test.warc.gz', 'w+b') as fd: + with WarcHandler (fd, logger) as handler: + for g in golden: + await handler.push (g) + + fd.seek (0) + it = iter (ArchiveIterator (fd)) + for g in golden: + if isinstance (g, ControllerStart): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'warcinfo' + assert 'warc-target-uri' not in headers + assert 'x-crocoite-type' not in headers + + data = json.load (rec.raw_stream) + assert data == g.payload + + lastWarcinfoRecordid = headers['warc-record-id'] + assert lastWarcinfoRecordid + elif isinstance (g, Script): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'resource' + assert headers['content-type'] == 'application/javascript; charset=utf-8' + assert headers['x-crocoite-type'] == 'script' + checkWarcinfoId (headers) + if g.path: + assert URL 
(headers['warc-target-uri']) == URL ('file://' + g.abspath) + else: + assert 'warc-target-uri' not in headers + + data = rec.raw_stream.read ().decode ('utf-8') + assert data == g.data + elif isinstance (g, ScreenshotEvent): + # XXX: check refers-to header + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'conversion' + assert headers['x-crocoite-type'] == 'screenshot' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url) + assert headers['warc-refers-to'] is None + assert int (headers['X-Crocoite-Screenshot-Y-Offset']) == g.yoff + + assert rec.raw_stream.read () == g.data + elif isinstance (g, DomSnapshotEvent): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'conversion' + assert headers['x-crocoite-type'] == 'dom-snapshot' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['warc-refers-to'] is None + + assert rec.raw_stream.read () == g.document + elif isinstance (g, RequestResponsePair): + rec = next (it) + + # request + headers = rec.rec_headers + assert headers['warc-type'] == 'request' + assert 'x-crocoite-type' not in headers + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + + assert CIMultiDict (rec.http_headers.headers) == g.request.headers + if g.request.hasPostData: + if g.request.body is not None: + assert rec.raw_stream.read () == g.request.body + else: + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () + else: + assert not rec.raw_stream.read () + + # response + if g.response: + rec = next (it) + headers = rec.rec_headers + httpheaders = rec.http_headers + assert headers['warc-type'] == 'response' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + assert 'x-crocoite-type' not in headers + + # these are checked separately + filteredHeaders = CIMultiDict (httpheaders.headers) + for b in {'content-type', 'content-length'}: + if b in g.response.headers: + g.response.headers.popall (b) + if b in filteredHeaders: + filteredHeaders.popall (b) + assert filteredHeaders == g.response.headers + + expectedContentType = g.response.mimeType + if expectedContentType is not None: + assert httpheaders['content-type'].startswith (expectedContentType) + + if g.response.body is not None: + assert rec.raw_stream.read () == g.response.body + assert httpheaders['content-length'] == str (len (g.response.body)) + # body is never truncated if it exists + assert headers['warc-truncated'] is None + + # unencoded strings are converted to utf8 + if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None: + assert httpheaders['content-type'].endswith ('; charset=utf-8') + else: + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () + # content-length header should be kept intact + else: + assert False, f"invalid golden type {type(g)}" # pragma: no cover + + # no further records + with pytest.raises (StopIteration): + next (it) + diff --git a/crocoite/tools.py b/crocoite/tools.py index bc92f8f..a2ddaa3 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,76 +22,296 @@ Misc tools """ -import shutil, sys, re, os, logging, argparse +import shutil, sys, os, logging, argparse, json +from io import BytesIO + from warcio.archiveiterator 
import ArchiveIterator
 from warcio.warcwriter import WARCWriter
+from yarl import URL
 
-def mergeWarc ():
-    """
-    Merge multiple WARC files into a single file, writing revisit records for
-    items which occur multiple times
-    """
-
-    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
-    parser.add_argument('--verbose', '-v', action='store_true')
-    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
+from pkg_resources import parse_version, parse_requirements
 
-    args = parser.parse_args()
-    loglevel = logging.DEBUG if args.verbose else logging.INFO
-    logging.basicConfig (level=loglevel)
+from .util import getSoftwareInfo, StrJsonEncoder
+from .warc import jsonMime, makeContentType
 
+def mergeWarc (files, output):
+    # stats
     unique = 0
     revisit = 0
+    uniqueLength = 0
+    revisitLength = 0
 
     payloadMap = {}
-    writer = WARCWriter (args.output, gzip=True)
-    for l in sys.stdin:
-        l = l.strip ()
+    writer = WARCWriter (output, gzip=True)
+
+    # Add an additional warcinfo record, describing the transformations. This
+    # is not ideal, since
+    #     “A ‘warcinfo’ record describes the records that
+    #     follow it […] until next ‘warcinfo’”
+    # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+    # A warcinfo record is expected at the beginning of every file. But it
+    # might have been written by different software, so we don’t want to
+    # strip/replace that information, but supplement it.
+    warcinfo = {
+        'software': getSoftwareInfo (),
+        'tool': 'crocoite-merge', # not the name of the cli tool
+        'parameters': {'inputs': files},
+        }
+    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+    record = writer.create_warc_record ('', 'warcinfo',
+            payload=payload,
+            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+    writer.write_record (record)
+
+    for l in files:
         with open (l, 'rb') as fd:
             for record in ArchiveIterator (fd):
                 if record.rec_type in {'resource', 'response'}:
                     headers = record.rec_headers
                     rid = headers.get_header('WARC-Record-ID')
                     csum = headers.get_header('WARC-Payload-Digest')
+                    length = int (headers.get_header ('Content-Length'))
                     dup = payloadMap.get (csum, None)
                     if dup is None:
                         payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
                                 'id': rid, 'date': headers.get_header('WARC-Date')}
                         unique += 1
+                        uniqueLength += length
                     else:
-                        logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
-                        record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date'])
+                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
+                        # Payload may be identical, but HTTP headers are
+                        # (probably) not. Include them.
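+                        # A WARC ‘revisit’ record keeps the HTTP headers, but
+                        # replaces the payload with a reference to the first
+                        # occurrence (WARC-Refers-To plus the payload digest),
+                        # which is what makes the merged file smaller.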
+ record = writer.create_revisit_record ( + headers.get_header('WARC-Target-URI'), digest=csum, + refers_to_uri=dup['uri'], refers_to_date=dup['date'], + http_headers=record.http_headers) record.rec_headers.add_header ('WARC-Truncated', 'length') record.rec_headers.add_header ('WARC-Refers-To', dup['id']) revisit += 1 + revisitLength += length else: unique += 1 writer.write_record (record) - logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit)) + json.dump (dict ( + unique=dict (records=unique, bytes=uniqueLength), + revisit=dict (records=revisit, bytes=revisitLength), + ratio=dict ( + records=unique/(unique+revisit), + bytes=uniqueLength/(uniqueLength+revisitLength) + ), + ), + sys.stdout, + cls=StrJsonEncoder) + sys.stdout.write ('\n') + +def mergeWarcCli(): + parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') + parser.add_argument('--verbose', '-v', action='store_true') + parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') + + args = parser.parse_args() + loglevel = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig (level=loglevel) + + mergeWarc([l.strip() for l in sys.stdin], args.output) def extractScreenshot (): """ Extract page screenshots from a WARC generated by crocoite into files """ - parser = argparse.ArgumentParser(description='Extract screenshots.') - parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files') - parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') + parser = argparse.ArgumentParser(description='Extract screenshots from ' + 'WARC, write JSON info to stdout.') + parser.add_argument('-f', '--force', action='store_true', + help='Overwrite existing files') + parser.add_argument('-1', '--one', action='store_true', + help='Only extract the first screenshot into a file named prefix') + parser.add_argument('input', type=argparse.FileType ('rb'), + help='Input WARC') parser.add_argument('prefix', help='Output file prefix') args = parser.parse_args() - screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I) + i = 0 with args.input: - for record in ArchiveIterator(args.input): - uri = record.rec_headers.get_header('WARC-Target-URI') - if record.rec_type == 'resource': - m = screenshotRe.match (uri) - xoff, yoff = m.groups () - if m: - outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff) - if args.force or not os.path.exists (outpath): - with open (outpath, 'wb') as out: - shutil.copyfileobj (record.raw_stream, out) - else: - print ('not overwriting {}'.format (outpath)) + for record in ArchiveIterator (args.input): + headers = record.rec_headers + if record.rec_type != 'conversion' or \ + headers['Content-Type'] != 'image/png' or \ + 'X-Crocoite-Screenshot-Y-Offset' not in headers: + continue + + url = URL (headers.get_header ('WARC-Target-URI')) + yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) + outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix + if args.force or not os.path.exists (outpath): + json.dump ({'file': outpath, 'url': url, 'yoff': yoff}, + sys.stdout, cls=StrJsonEncoder) + sys.stdout.write ('\n') + with open (outpath, 'wb') as out: + shutil.copyfileobj (record.raw_stream, out) + i += 1 + else: + print (f'not overwriting {outpath}', file=sys.stderr) + + if args.one: + break + +class Errata: + __slots__ = ('uuid', 'description', 'url', 'affects') + + def __init__ (self, uuid, description, affects, url=None): + self.uuid = 
uuid + self.description = description + self.url = url + # slightly abusing setuptool’s version parsing/matching here + self.affects = list (parse_requirements(affects)) + + def __contains__ (self, pkg): + """ + Return True if the versions in pkg are affected by this errata + + pkg must be a mapping from project_name to version + """ + matchedAll = [] + for a in self.affects: + haveVersion = pkg.get (a.project_name, None) + matchedAll.append (haveVersion is not None and haveVersion in a) + return all (matchedAll) + + def __repr__ (self): + return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})' + + @property + def fixable (self): + return getattr (self, 'applyFix', None) is not None + + def toDict (self): + return {'uuid': self.uuid, + 'description': self.description, + 'url': self.url, + 'affects': list (map (str, self.affects)), + 'fixable': self.fixable} + +class FixableErrata(Errata): + __slots__ = ('stats') + + def __init__ (self, uuid, description, affects, url=None): + super().__init__ (uuid, description, affects, url) + # statistics for fixable erratas + self.stats = dict (records=dict (fixed=0, processed=0)) + + def applyFix (self, record): + raise NotImplementedError () # pragma: no cover + +class ContentTypeErrata (FixableErrata): + def __init__ (self): + super().__init__ ( + uuid='552c13dc-56e5-4539-9ad8-184ccae60930', + description='Content-Type header uses wrong argument name encoding instead of charset.', + url='https://github.com/PromyLOPh/crocoite/issues/19', + affects=['crocoite==1.0.0']) + + def applyFix (self, record): + # XXX: this is ugly. warcio’s write_record replaces any Content-Type + # header we’re setting with this one. But printing rec_headers shows + # the header, not .content_type. + contentType = record.content_type + if '; encoding=' in contentType: + contentType = contentType.replace ('; encoding=', '; charset=') + record.content_type = contentType + self.stats['records']['fixed'] += 1 + + self.stats['records']['processed'] += 1 + return record + +bugs = [ + Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572', + description='Generated by version < 1.0. 
No erratas are supported for this version.',
+            affects=['crocoite<1.0'],
+            ),
+    ContentTypeErrata (),
+    ]
+
+def makeReport (fd):
+    alreadyFixed = set ()
+
+    for record in ArchiveIterator (fd):
+        if record.rec_type == 'warcinfo':
+            try:
+                data = json.load (record.raw_stream)
+                # errata records precede everything else and indicate which
+                # ones were fixed already
+                if data['tool'] == 'crocoite-errata':
+                    alreadyFixed.update (data['parameters']['errata'])
+                else:
+                    haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+                    yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
+            except json.decoder.JSONDecodeError:
+                pass
+
+def errataCheck (args):
+    hasErrata = False
+    for item in makeReport (args.input):
+        json.dump (item.toDict (), sys.stdout)
+        sys.stdout.write ('\n')
+        sys.stdout.flush ()
+        hasErrata = True
+    return int (hasErrata)
+
+def errataFix (args):
+    errata = args.errata
+
+    with args.input as infd, args.output as outfd:
+        writer = WARCWriter (outfd, gzip=True)
+
+        warcinfo = {
+            'software': getSoftwareInfo (),
+            'tool': 'crocoite-errata', # not the name of the cli tool
+            'parameters': {'errata': [errata.uuid]},
+            }
+        payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+        record = writer.create_warc_record ('', 'warcinfo',
+                payload=payload,
+                warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+        writer.write_record (record)
+
+        for record in ArchiveIterator (infd):
+            fixedRecord = errata.applyFix (record)
+            writer.write_record (fixedRecord)
+        json.dump (errata.stats, sys.stdout)
+        sys.stdout.write ('\n')
+        sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+    try:
+        e = next (filter (lambda x: x.uuid == uuid, bugs))
+    except StopIteration:
+        raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+    if not isinstance (e, FixableErrata):
+        raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+    return e
+
+def errata ():
+    parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+    parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+    # XXX: required argument does not work here?!
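+    # Note: argparse subparsers are optional by default. A common workaround
+    # is marking them required after creation, e.g.
+    #     subparsers = parser.add_subparsers (dest='command')
+    #     subparsers.required = True
+    # (Python 3.7 also accepts add_subparsers (required=True).) The hasattr
+    # check below achieves the same effect by hand.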
+    subparsers = parser.add_subparsers()
+
+    checkparser = subparsers.add_parser('check', help='Show erratas')
+    checkparser.set_defaults (func=errataCheck)
+
+    fixparser = subparsers.add_parser('fix', help='Fix erratas')
+    fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
+    fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
+    fixparser.set_defaults (func=errataFix)
+
+    args = parser.parse_args()
+
+    if not hasattr (args, 'func'):
+        parser.print_usage ()
+        parser.exit ()
+
+    return args.func (args)
diff --git a/crocoite/util.py b/crocoite/util.py
index ec257f1..da377a3 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,22 +22,82 @@ Random utility functions
 """
 
-import random
+import random, sys, platform, os, json, urllib
+from datetime import datetime
+import hashlib, pkg_resources
 
-def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
-    if length is None:
-        length = random.randint (16, 32)
-    return ''.join (map (lambda x: random.choice (chars), range (length)))
+from yarl import URL
 
-def packageUrl (path):
-    """
-    Create URL for package data stored into WARC
-    """
-    return 'urn:' + __package__ + ':' + path
+class StrJsonEncoder (json.JSONEncoder):
+    """ JSON encoder that turns unknown classes into a string and thus never
+    fails """
+    def default (self, obj):
+        if isinstance (obj, datetime):
+            return obj.isoformat ()
+
+        # make sure serialization always succeeds
+        try:
+            return json.JSONEncoder.default(self, obj)
+        except TypeError:
+            return str (obj)
 
-def getFormattedViewportMetrics (tab):
-    layoutMetrics = tab.Page.getLayoutMetrics ()
+async def getFormattedViewportMetrics (tab):
+    layoutMetrics = await tab.Page.getLayoutMetrics ()
     # XXX: I’m not entirely sure which one we should use here
-    return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
-            layoutMetrics['layoutViewport']['clientHeight'])
+    viewport = layoutMetrics['layoutViewport']
+    return f"{viewport['clientWidth']}x{viewport['clientHeight']}"
+
+def getSoftwareInfo ():
+    """ Get software info for inclusion into warcinfo """
+    return {
+        'platform': platform.platform (),
+        'python': {
+            'implementation': platform.python_implementation(),
+            'version': platform.python_version (),
+            'build': platform.python_build ()
+            },
+        'self': getRequirements (__package__)
+        }
+
+def getRequirements (dist):
+    """ Get dependencies of a package.
+
+    Figure out packages’ dependencies based on setup/distutils, then look at
+    modules loaded and compute hashes of each loaded dependency.
+
+    This does not and cannot protect against malicious people. Its only
+    purpose is to recreate this exact environment.
+    """
+
+    pending = {dist}
+    have = set ()
+    packages = []
+    while pending:
+        d = pkg_resources.get_distribution (pending.pop ())
+
+        modules = list (filter (lambda x: x, d.get_metadata ('top_level.txt').split ('\n')))
+        modhashes = {}
+        # hash loaded modules
+        for m in sys.modules.values ():
+            f = getattr (m, '__file__', None)
+            pkg = getattr (m, '__package__', None)
+            # is loaded?
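+            # (a module counts as loaded when its __package__ appears in the
+            # distribution’s top-level names from top_level.txt)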
+            if pkg in modules:
+                if f and os.path.isfile (f):
+                    with open (f, 'rb') as fd:
+                        contents = fd.read ()
+                    h = hashlib.new ('sha512')
+                    h.update (contents)
+                    modhashes[m.__name__] = {'sha512': h.hexdigest (), 'len': len (contents)}
+                else:
+                    modhashes[m.__name__] = {}
+
+        # only if one of the packages’ modules is actually loaded
+        if modhashes:
+            packages.append ({'projectName': d.project_name, 'modules': modhashes, 'version': d.version})
+
+        have.add (dist)
+        pending.update (d.requires ())
+        pending.difference_update (have)
+    return packages
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 3fd65e4..415b487 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -22,178 +22,131 @@
 Classes writing data to WARC files
 """
 
-import logging
-import json
-from http.server import BaseHTTPRequestHandler
+import json, threading
 from io import BytesIO
-from warcio.statusandheaders import StatusAndHeaders
-from urllib.parse import urlsplit
-from logging.handlers import BufferingHandler
 from datetime import datetime
-from threading import Thread
-from queue import Queue
+from http.server import BaseHTTPRequestHandler
 
 from warcio.timeutils import datetime_to_iso_date
 from warcio.warcwriter import WARCWriter
-
-from .browser import AccountingSiteLoader
-from .util import packageUrl
-from .controller import defaultSettings
-
-class SerializingWARCWriter (WARCWriter):
-    """
-    Serializing WARC writer using separate writer thread and queue for
-    non-blocking operation
-
-    Needs an explicit .flush() before deletion.
-    """
-
-    def __init__ (self, filebuf, *args, **kwargs):
-        WARCWriter.__init__ (self, filebuf, *args, **kwargs)
-        self.queue = Queue ()
-        self.thread = Thread (target=self._run_writer)
-        self.thread.start ()
-
-    def flush (self):
-        self.queue.put (None)
-        self.thread.join ()
-        self.queue = None
-        self.thread = None
-
-    def _run_writer (self):
-        while True:
-            item = self.queue.get ()
-            if not item:
-                break
-            out, record = item
-            WARCWriter._write_warc_record (self, out, record)
-
-    def _write_warc_record (self, out, record):
-        self.queue.put ((out, record))
-
-class WARCLogHandler (BufferingHandler):
-    """
-    Buffered log handler, flushing to warcio
-    """
-
-    contentType = 'text/plain; charset=utf-8'
-
-    def __init__ (self, capacity, warcfile):
-        BufferingHandler.__init__ (self, capacity)
-        self.warcfile = warcfile
-
-    def flush (self):
-        self.acquire ()
-        try:
-            if self.buffer:
-                buf = ''
-                for record in self.buffer:
-                    buf += self.format (record)
-                    buf += '\n'
-                # XXX: record type?
-                record = self.warcfile.create_warc_record (
-                        packageUrl ('log'), 'metadata',
-                        payload=BytesIO (buf.encode ('utf8')),
-                        warc_headers_dict={'Content-Type': self.contentType})
-                self.warcfile.write_record(record)
-                self.buffer = []
-        finally:
-            self.release ()
-
-class WarcLoader (AccountingSiteLoader):
-    def __init__ (self, browser, url, writer,
-            logger=logging.getLogger(__name__),
-            logBuffer=defaultSettings.logBuffer,
-            maxBodySize=defaultSettings.maxBodySize):
-        super ().__init__ (browser, url, logger)
-        self.writer = writer
-        self.maxBodySize = maxBodySize
-        self.warcLogger = WARCLogHandler (logBuffer, writer)
-        self.logger.addHandler (self.warcLogger)
-
-    def __exit__ (self, exc_type, exc_value, traceback):
-        self.logger.removeHandler (self.warcLogger)
-        self.warcLogger.flush ()
-        return super ().__exit__ (exc_type, exc_value, traceback)
-
-    @staticmethod
-    def getStatusText (response):
-        text = response.get ('statusText')
-        if text:
-            return text
-        text = BaseHTTPRequestHandler.responses.get (response['status'])
-        if text:
-            return text[0]
-        return 'No status text available'
+from warcio.statusandheaders import StatusAndHeaders
+from yarl import URL
+
+from .util import StrJsonEncoder
+from .controller import EventHandler, ControllerStart
+from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
+from .browser import RequestResponsePair, UnicodeBody
+
+# the official mimetype for json, according to https://tools.ietf.org/html/rfc8259
+jsonMime = 'application/json'
+# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2
+jsMime = 'application/javascript'
+
+def makeContentType (mime, charset=None):
+    """ Create value of Content-Type WARC header with optional charset """
+    s = [mime]
+    if charset:
+        s.extend (['; charset=', charset])
+    return ''.join (s)
+
+class WarcHandler (EventHandler):
+    __slots__ = ('logger', 'writer', 'documentRecords', 'log',
+            'maxLogSize', 'logEncoding', 'warcinfoRecordId')
+
+    def __init__ (self, fd, logger):
+        self.logger = logger
+        self.writer = WARCWriter (fd, gzip=True)
+
+        self.logEncoding = 'utf-8'
+        self.log = BytesIO ()
+        # max log buffer size (bytes)
+        self.maxLogSize = 500*1024
+
+        # maps document urls to WARC record ids, required for DomSnapshotEvent
+        # and ScreenshotEvent
+        self.documentRecords = {}
+        # record id of warcinfo record
+        self.warcinfoRecordId = None
+
+    def __enter__ (self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._flushLogEntries ()
+
+    def writeRecord (self, url, kind, payload, warc_headers_dict=None, http_headers=None):
+        """
+        Thin wrapper around writer.create_warc_record and writer.write_record.
+
+        Adds default WARC headers.
+        """
+        assert url is None or isinstance (url, URL)
+
+        d = {}
+        if self.warcinfoRecordId:
+            d['WARC-Warcinfo-ID'] = self.warcinfoRecordId
+        d.update (warc_headers_dict or {})
+        warc_headers_dict = d
+
+        record = self.writer.create_warc_record (str (url) if url else '',
+                kind,
+                payload=payload,
+                warc_headers_dict=warc_headers_dict,
+                http_headers=http_headers)
+        self.writer.write_record (record)
+
+        return record
 
     def _writeRequest (self, item):
-        writer = self.writer
+        logger = self.logger.bind (reqId=item.id)
         req = item.request
-        resp = item.response
-        url = urlsplit (resp['url'])
-
-        path = url.path
-        if url.query:
-            path += '?' + url.query
-        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
-                item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
-        initiator = item.initiator
+        url = item.url
+
+        path = url.relative().with_fragment(None)
+        httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
+                req.headers, protocol='HTTP/1.1', is_http_request=True)
         warcHeaders = {
-            'X-Chrome-Initiator': json.dumps (initiator),
-            'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
+            # required to correlate request with log entries
+            'X-Chrome-Request-ID': item.id,
+            'WARC-Date': datetime_to_iso_date (req.timestamp),
         }
-        payload, payloadBase64Encoded = item.requestBody
-        if payload:
-            payload = BytesIO (payload)
-            warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
-        record = writer.create_warc_record(req['url'], 'request',
-                payload=payload, http_headers=httpHeaders,
-                warc_headers_dict=warcHeaders)
-        writer.write_record(record)
-        return record.rec_headers['WARC-Record-ID']
-
-    def _getBody (self, item, redirect):
-        reqId = item.id
-        resp = item.response
-
-        rawBody = b''
-        base64Encoded = False
-        if redirect:
-            # redirects reuse the same request, thus we cannot safely retrieve
-            # the body (i.e getResponseBody may return the new location’s
-            # body). This is fine.
-            pass
-        elif item.encodedDataLength > self.maxBodySize:
-            # check body size first, since we’re loading everything into memory
-            raise ValueError ('body for {} too large {} vs {}'.format (reqId,
-                    item.encodedDataLength, self.maxBodySize))
+        body = item.request.body
+        if item.request.hasPostData and body is None:
+            # oops, don’t know what went wrong here
+            logger.error ('requestBody missing',
+                    uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
+            warcHeaders['WARC-Truncated'] = 'unspecified'
         else:
-            rawBody, base64Encoded = item.body
-        return rawBody, base64Encoded
+            body = BytesIO (body)
+        record = self.writeRecord (url, 'request',
+                payload=body, http_headers=httpHeaders,
+                warc_headers_dict=warcHeaders)
+        return record.rec_headers['WARC-Record-ID']
 
-    def _writeResponse (self, item, redirect, concurrentTo, rawBody, base64Encoded):
-        writer = self.writer
+    def _writeResponse (self, item, concurrentTo):
+        # fetch the body
         reqId = item.id
-        resp = item.response # now the response
+        resp = item.response
 
         warcHeaders = {
             'WARC-Concurrent-To': concurrentTo,
-            'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
-            'X-Chrome-Protocol': resp.get ('protocol', ''),
-            'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
-            'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
-            'X-Chrome-Base64Body': str (base64Encoded),
-            'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
-                    item.chromeRequest['wallTime']+
-                    (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
+            # required to correlate request with log entries
+            'X-Chrome-Request-ID': item.id,
+            'WARC-Date': datetime_to_iso_date (resp.timestamp),
         }
+        # conditional WARC headers
+        if item.remoteIpAddress:
+            warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
 
-        httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
-                self.getStatusText (resp)), item.responseHeaders,
-                protocol='HTTP/1.1')
+        # HTTP headers
+        statusText = resp.statusText or \
+                BaseHTTPRequestHandler.responses.get (
+                resp.status, ('No status text available', ))[0]
+        httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
+                resp.headers, protocol='HTTP/1.1')
 
         # Content is saved decompressed and decoded, remove these headers
         blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
@@ -203,34 +156,118 @@ class WarcLoader (AccountingSiteLoader):
         # chrome sends nothing but utf8 encoded text. Fortunately HTTP
         # headers take precedence over the document’s <meta>, thus we can
         # easily override those.
-        contentType = resp.get ('mimeType')
-        if contentType:
-            if not base64Encoded:
-                contentType += '; charset=utf-8'
-            httpHeaders.replace_header ('content-type', contentType)
-
-        httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+        if resp.mimeType:
+            charset = 'utf-8' if isinstance (resp.body, UnicodeBody) else None
+            contentType = makeContentType (resp.mimeType, charset=charset)
+            httpHeaders.replace_header ('Content-Type', contentType)
+
+        # response body
+        body = resp.body
+        if body is None:
+            warcHeaders['WARC-Truncated'] = 'unspecified'
+        else:
+            httpHeaders.replace_header ('Content-Length', str (len (body)))
+            body = BytesIO (body)
 
-        record = writer.create_warc_record(resp['url'], 'response',
-                warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
+        record = self.writeRecord (item.url, 'response',
+                warc_headers_dict=warcHeaders, payload=body,
                 http_headers=httpHeaders)
-        writer.write_record(record)
 
-    def loadingFinished (self, item, redirect=False):
-        super ().loadingFinished (item, redirect)
+        if item.resourceType == 'Document':
+            self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID')
 
+    def _writeScript (self, item):
         writer = self.writer
+        encoding = 'utf-8'
+        # XXX: yes, we’re leaking information about the user here, but this is
+        # the one and only source URL of the scripts.
+        uri = URL(f'file://{item.abspath}') if item.path else None
+        self.writeRecord (uri, 'resource',
+                payload=BytesIO (str (item).encode (encoding)),
+                warc_headers_dict={
+                    'Content-Type': makeContentType (jsMime, encoding),
+                    'X-Crocoite-Type': 'script',
+                })
+
+    def _writeItem (self, item):
+        assert item.request
+        concurrentTo = self._writeRequest (item)
+        # items that failed loading don’t have a response
+        if item.response:
+            self._writeResponse (item, concurrentTo)
+
+    def _addRefersTo (self, headers, url):
+        refersTo = self.documentRecords.get (url)
+        if refersTo:
+            headers['WARC-Refers-To'] = refersTo
+        else:
+            self.logger.error (f'No document record found for {url}')
+        return headers
-        req = item.request
-        reqId = item.id
-        resp = item.response
-        url = urlsplit (resp['url'])
-
-        try:
-            # write neither request nor response if we cannot retrieve the body
-            rawBody, base64Encoded = self._getBody (item, redirect)
-            concurrentTo = self._writeRequest (item)
-            self._writeResponse (item, redirect, concurrentTo, rawBody, base64Encoded)
-        except ValueError as e:
-            self.logger.error (e.args[0])
+    def _writeDomSnapshot (self, item):
         writer = self.writer
+
+        warcHeaders = {
+            'X-Crocoite-Type': 'dom-snapshot',
+            'X-Chrome-Viewport': item.viewport,
+            'Content-Type': makeContentType ('text/html', 'utf-8')
+        }
+
+        self._addRefersTo (warcHeaders, item.url)
+
+        self.writeRecord (item.url, 'conversion',
+                payload=BytesIO (item.document),
+                warc_headers_dict=warcHeaders)
+
+    def _writeScreenshot (self, item):
+        writer = self.writer
+        warcHeaders = {
+            'Content-Type': makeContentType ('image/png'),
+            'X-Crocoite-Screenshot-Y-Offset': str (item.yoff),
+            'X-Crocoite-Type': 'screenshot',
+        }
+        self._addRefersTo (warcHeaders, item.url)
+        self.writeRecord (item.url, 'conversion',
+                payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
+
+    def _writeControllerStart (self, item, encoding='utf-8'):
+        payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode (encoding))
+
+        writer = self.writer
+        warcinfo = self.writeRecord (None, 'warcinfo',
+                warc_headers_dict={'Content-Type': makeContentType (jsonMime, encoding)},
+                payload=payload)
+        self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']
+
+    def _flushLogEntries (self):
+        if self.log.tell () > 0:
+            writer = self.writer
+            self.log.seek (0)
+            warcHeaders = {
+                'Content-Type': makeContentType (jsonMime, self.logEncoding),
+                'X-Crocoite-Type': 'log',
+            }
+            self.writeRecord (None, 'metadata', payload=self.log,
+                    warc_headers_dict=warcHeaders)
+            self.log = BytesIO ()
+
+    def _writeLog (self, item):
+        """ Handle log entries, called by .logger.WarcHandlerConsumer only """
+        self.log.write (item.encode (self.logEncoding))
+        self.log.write (b'\n')
+        if self.log.tell () > self.maxLogSize:
+            self._flushLogEntries ()
+
+    route = {Script: _writeScript,
+            RequestResponsePair: _writeItem,
+            DomSnapshotEvent: _writeDomSnapshot,
+            ScreenshotEvent: _writeScreenshot,
+            ControllerStart: _writeControllerStart,
+            }
+
+    async def push (self, item):
+        for k, v in self.route.items ():
+            if isinstance (item, k):
+                v (self, item)
+                break
diff --git a/doc/_ext/clicklist.py b/doc/_ext/clicklist.py
new file mode 100644
index 0000000..a69452c
--- /dev/null
+++ b/doc/_ext/clicklist.py
@@ -0,0 +1,45 @@
+"""
+Render click.yaml config file into human-readable list of supported sites
+"""
+
+import pkg_resources, yaml
+from docutils import nodes
+from docutils.parsers.rst import Directive
+from yarl import URL
+
+class ClickList (Directive):
+    def run(self):
+        # XXX: do this once only
+        fd = pkg_resources.resource_stream ('crocoite', 'data/click.yaml')
+        config = list (yaml.safe_load_all (fd))
+
+        l = nodes.definition_list ()
+        for site in config:
+            urls = set ()
+            v = nodes.definition ()
+            vl = nodes.bullet_list ()
+            v += vl
+            for s in site['selector']:
+                i = nodes.list_item ()
+                i += nodes.paragraph (text=s['description'])
+                vl += i
+                urls.update (map (lambda x: URL(x).with_path ('/'), s.get ('urls', [])))
+
+            item = nodes.definition_list_item ()
+            term = ', '.join (map (lambda x: x.host, urls)) if urls else site['match']
+            k = nodes.term (text=term)
+            item += k
+
+            item += v
+            l += item
+        return [l]
+
+def setup(app):
+    app.add_directive ("clicklist", ClickList)
+
+    return {
+        'version': '0.1',
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+    }
+
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..8336c27
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+import os, sys
+
+# -- Project information -----------------------------------------------------
+
+project = 'crocoite'
+copyright = '2019 crocoite contributors'
+author = 'crocoite contributors'
+
+# -- General configuration ---------------------------------------------------
+
+sys.path.append(os.path.abspath("./_ext"))
+extensions = [
+    'sphinx.ext.viewcode',
+    'sphinx.ext.autodoc',
+    'clicklist',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+source_suffix = '.rst'
+master_doc = 'index'
+language = 'en'
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+pygments_style = 'tango'
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = 'alabaster'
+html_theme_options = {
+    "description": "Preservation for the modern web",
+    "github_user": "PromyLOPh",
+    "github_repo": "crocoite",
+    "travis_button": True,
+    "github_button": True,
+    "codecov_button": True,
+    "fixed_sidebar": True,
+}
+#html_static_path = ['_static']
+html_sidebars = {
+    '**': ['about.html', 'navigation.html', 'searchbox.html'],
+}
+
diff --git a/doc/develop.rst b/doc/develop.rst
new file mode 100644
index 0000000..801ab21
--- /dev/null
+++ b/doc/develop.rst
@@ -0,0 +1,39 @@
+Development
+-----------
+
+Generally crocoite provides reasonable defaults for Google Chrome via
+:py:mod:`crocoite.devtools`. When debugging this software it might be necessary
+to open a non-headless instance of the browser by running
+
+.. code:: bash
+
+   google-chrome-stable --remote-debugging-port=9222 --auto-open-devtools-for-tabs
+
+and then passing the option :option:`--browser=http://localhost:9222` to
+:program:`crocoite-single`. This allows human intervention through the
+browser’s built-in console.
+
+Release guide
+^^^^^^^^^^^^^
+
+crocoite uses `semantic versioning`_. To create a new release, bump the version
+number in ``setup.py`` according to the linked guide, create distribution
+packages::
+
+   python setup.py sdist bdist_wheel
+
+Verify them::
+
+   twine check dist/*
+
+Try to install and use them in a separate sandbox. Finally, sign and upload a
+new version to pypi_::
+
+   gpg --detach-sign --armor dist/*.tar.gz
+   twine upload dist/*
+
+Then update the documentation using :program:`sphinx-build` and upload it as
+well.
+
+.. _semantic versioning: https://semver.org/spec/v2.0.0.html
+.. _pypi: https://pypi.org
+
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..53f5f77
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,36 @@
+crocoite
+========
+
+Preservation for the modern web, powered by `headless Google
+Chrome`_.
+
+.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   usage.rst
+   plugins.rst
+   rationale.rst
+   develop.rst
+   related.rst
+
+Features
+--------
+
+Google Chrome-powered
+    HTML renderer, JavaScript engine and network stack, supporting modern web
+    technologies and protocols
+WARC output
+    Includes all network requests made by the browser
+Site interaction
+    :ref:`Auto-expand on-click content <click>`, infinite-scrolling
+DOM snapshot
+    Contains the page’s state, renderable without JavaScript
+Image screenshot
+    Entire page
+Machine-readable interface
+    Easy integration into custom tools/scripts
+
+
diff --git a/doc/plugins.rst b/doc/plugins.rst
new file mode 100644
index 0000000..062e1bf
--- /dev/null
+++ b/doc/plugins.rst
@@ -0,0 +1,16 @@
+Plugins
+=======
+
+crocoite comes with plug-ins that modify loaded sites or interact with them.
+
+.. _click:
+
+click
+-----
+
+The following sites are currently supported. Note that this is an ongoing
+battle against layout changes, and thus older software versions will stop
+working very soon.
+
+.. clicklist::
+
diff --git a/doc/rationale.rst b/doc/rationale.rst
new file mode 100644
index 0000000..f37db7c
--- /dev/null
+++ b/doc/rationale.rst
@@ -0,0 +1,76 @@
+Rationale
+---------
+
+Most modern websites depend heavily on executing code, usually JavaScript, on
+the user’s machine. They also make use of new and emerging Web technologies
+like HTML5, WebSockets, service workers and more. Even worse from the
+preservation point of view, they also require some form of user interaction to
+dynamically load more content (infinite scrolling, dynamic comment loading,
+etc.).
+
+The naive approach of fetching an HTML page, parsing it and extracting
+links to referenced resources therefore is not sufficient to create a faithful
+snapshot of these web applications. A full browser, capable of running scripts
+and providing modern Web APIs, is absolutely required for this task. Thankfully
+Google Chrome runs without a display (headless mode) and can be controlled by
+external programs, allowing them to navigate and extract or inject data.
+This section describes the solutions crocoite offers and explains design
+decisions taken.
+
+crocoite captures resources by listening to Chrome’s `network events`_ and
+requesting the response body using `Network.getResponseBody`_. This approach
+has caveats: The original HTTP requests and responses, as sent over the wire,
+are not available. They are reconstructed from parsed data. The character
+encoding for text documents is changed to UTF-8. And the content body of HTTP
+redirects cannot be retrieved due to a race condition.
+
+.. _network events: https://chromedevtools.github.io/devtools-protocol/1-3/Network
+.. _Network.getResponseBody: https://chromedevtools.github.io/devtools-protocol/1-3/Network#method-getResponseBody
+
+But at the same time it allows crocoite to rely on Chrome’s well-tested network
+stack and HTTP parser. Thus it supports HTTP versions 1 and 2 as well as
+transport protocols like SSL and QUIC. Depending on Chrome also eliminates the
+need for a man-in-the-middle proxy, like warcprox_, which has to decrypt SSL
+traffic and present a fake certificate to the browser in order to store the
+transmitted content.
+
+.. _warcprox: https://github.com/internetarchive/warcprox
+
+WARC records generated by crocoite therefore are an abstract view on the
+resource they represent and not necessarily the data sent over the wire. A URL
+fetched with HTTP/2 for example will still result in an HTTP/1.1
+request/response pair in the WARC file. This may be undesirable from
+an archivist’s point of view (“save the data exactly like we received it”). But
+this level of abstraction is inevitable when dealing with more than one
+protocol.
+
+crocoite also interacts with and therefore alters the grabbed websites. It does
+so by injecting `behavior scripts`_ into the site. Typically these are written
+in JavaScript, because interacting with a page is easier this way. These
+scripts then perform different tasks: Extracting targets from visible
+hyperlinks, clicking buttons or scrolling the website to load more content,
+as well as taking a static screenshot of ``<canvas>`` elements for the DOM
+snapshot (see below).
+
+.. _behavior scripts: https://github.com/PromyLOPh/crocoite/tree/master/crocoite/data
+
+Replaying archived WARCs can be quite challenging and might not be possible
+with current technology (or even at all):
+
+- Some sites request assets based on screen resolution, pixel ratio and
+  supported image formats (webp). Replaying those with different parameters
+  won’t work, since assets for those are missing. Example: missguided.com.
+- Some fetch different scripts based on user agent. Example: youtube.com.
+- Requests containing randomly generated JavaScript callback function names
+  won’t work. Example: weather.com.
+- Range requests (Range: bytes=1-100) are captured as-is, making playback
+  difficult.
+
+crocoite offers two methods to work around these issues. Firstly it can save a
+DOM snapshot to the WARC file. It contains the entire DOM in HTML format minus
+``<script>`` tags after the site has been fully loaded and thus can be
+displayed without executing scripts. Obviously JavaScript-based navigation
+does not work any more. Secondly it also saves a screenshot of the full page,
+so even if future browsers cannot render and display the stored HTML a fully
+rendered version of the website can be replayed instead.
+
diff --git a/doc/related.rst b/doc/related.rst
new file mode 100644
index 0000000..62e2569
--- /dev/null
+++ b/doc/related.rst
@@ -0,0 +1,14 @@
+Related projects
+----------------
+
+brozzler_
+    Uses Google Chrome as well, but intercepts traffic using a proxy. Supports
+    distributed crawling and immediate playback.
+Squidwarc_
+    Communicates with headless Google Chrome and uses the Network API to
+    retrieve requests like crocoite. Supports recursive crawls and page
+    scrolling, but neither custom JavaScript nor distributed crawling.
+
+.. _brozzler: https://github.com/internetarchive/brozzler
+.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc
+
diff --git a/doc/usage.rst b/doc/usage.rst
new file mode 100644
index 0000000..34a3e7b
--- /dev/null
+++ b/doc/usage.rst
@@ -0,0 +1,162 @@
+Usage
+-----
+
+Quick start using pywb_, expects Google Chrome to be installed already:
+
+.. code:: bash
+
+   pip install crocoite pywb
+   crocoite http://example.com/ example.com.warc.gz
+   wb-manager init test && wb-manager add test example.com.warc.gz
+   wayback &
+   $BROWSER http://localhost:8080
+
+.. _pywb: https://github.com/ikreymer/pywb
+
+It is recommended to install at least Microsoft’s Corefonts_ as well as DejaVu_,
+Liberation_ or a similar font family covering a wide range of character sets.
+Otherwise page screenshots may be unusable due to missing glyphs.
+
+.. _Corefonts: http://corefonts.sourceforge.net/
+.. _DejaVu: https://dejavu-fonts.github.io/
+.. _Liberation: https://pagure.io/liberation-fonts
+
+Recursion
+^^^^^^^^^
+
+.. program:: crocoite
+
+By default crocoite will only retrieve the URL specified on the command line.
+However it can follow links as well. There are currently two recursion
+strategies available, depth- and prefix-based.
+
+.. code:: bash
+
+   crocoite -r 1 https://example.com/ example.com.warc.gz
+
+will retrieve ``example.com`` and all pages directly referred to by it.
+Increasing the number increases the depth, so a value of :samp:`2` would first grab
+``example.com``, queue all pages linked there as well as every reference on
+each of those pages.
+
+On the other hand
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/dir/ example.com.warc.gz
+
+will retrieve the URL specified and all pages referenced which have the same
+URL prefix. The trailing slash is significant. Without it crocoite would also
+grab ``/dir-something`` or ``/dir.html``, for example.
+
+If an output file template is used, each page is written to an individual
+file. For example
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/ '{host}-{date}-{seqnum}.warc.gz'
+
+will write one file per page to files like
+:file:`example.com-2019-09-09T15:15:15+02:00-1.warc.gz`. ``seqnum`` is unique to
+each page of a single job and should always be used.
+
+When running a recursive job, increasing the concurrency (i.e. how many pages
+are fetched at the same time) can speed up the process. For example you can
+pass :option:`-j` :samp:`4` to retrieve four pages at the same time. Keep in mind
+that each process starts a full browser that requires a lot of resources (one
+to two GB of RAM and one or two CPU cores).
+
+Customizing
+^^^^^^^^^^^
+
+.. program:: crocoite-single
+
+Under the hood :program:`crocoite` starts one instance of
+:program:`crocoite-single` to fetch each page. You can customize its options by
+appending a command template like this:
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com example.com.warc.gz -- \
+       crocoite-single --timeout 5 -k '{url}' '{dest}'
+
+This reduces the global timeout to 5 seconds and ignores TLS errors. If an
+option is prefixed with an exclamation mark (``!``) it will not be expanded.
+This is useful for passing :option:`--warcinfo`, which expects JSON-encoded data.
+
+Command line options
+^^^^^^^^^^^^^^^^^^^^
+
+Below is a list of all command line arguments available:
+
+.. program:: crocoite
+
+crocoite
+++++++++
+
+Front-end with recursion support and simple job management.
+
+.. option:: -j N, --concurrency N
+
+   Maximum number of concurrent fetch jobs.
+
+.. option:: -r POLICY, --recursion POLICY
+
+   Enables recursion based on POLICY, which can be a positive integer
+   (recursion depth) or the string :kbd:`prefix`.
+
+.. option:: --tempdir DIR
+
+   Directory for temporary WARC files.
+
+.. program:: crocoite-single
+
+crocoite-single
++++++++++++++++
+
+Back-end to fetch a single page.
+
+.. option:: -b SET-COOKIE, --cookie SET-COOKIE
+
+   Add cookie to browser’s cookie jar. This option always *appends* cookies,
+   replacing those provided by :option:`-c`.
+
+   .. versionadded:: 1.1
+
+.. option:: -c FILE, --cookie-jar FILE
+
+   Load cookies from FILE. :program:`crocoite` provides a default cookie file,
+   which contains cookies to, for example, circumvent age restrictions. This
+   option *replaces* that default file.
+
+   .. versionadded:: 1.1
+
+.. option:: --idle-timeout SEC
+
+   Time after which a page is considered “idle”.
+
+.. option:: -k, --insecure
+
+   Allow insecure connections, i.e. self-signed or expired HTTPS certificates.
+
+.. option:: --timeout SEC
+
+   Global archiving timeout.
+
+
+.. option:: --warcinfo JSON
+
+   Inject additional JSON-encoded information into the resulting WARC.
+
+IRC bot
+^^^^^^^
+
+A simple IRC bot (“chromebot”) is provided with the command :program:`crocoite-irc`.
+It reads its configuration from a config file like the example provided in
+:file:`contrib/chromebot.json` and supports the following commands:
+
+a <url> -j <concurrency> -r <policy> -k -b <set-cookie>
+    Archive <url> with <concurrency> processes according to recursion <policy>
+s <uuid>
+    Get job status for <uuid>
+r <uuid>
+    Revoke or abort running job with <uuid>
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..ec7d730
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[aliases]
+test=pytest
+[tool:pytest]
+addopts=--cov-report=html --cov-report=xml --cov=crocoite --cov-config=setup.cfg
+[coverage:run]
+branch=True
+[build_sphinx]
+builder=dirhtml
@@ -2,27 +2,56 @@ from setuptools import setup
 
 setup(
     name='crocoite',
-    version='0.1.0',
+    version='1.1.1',
     author='Lars-Dominik Braun',
     author_email='lars+crocoite@6xq.net',
+    url='https://6xq.net/crocoite/',
     packages=['crocoite'],
     license='LICENSE.txt',
     description='Save website to WARC using Google Chrome.',
     long_description=open('README.rst').read(),
+    long_description_content_type='text/x-rst',
     install_requires=[
-        'pychrome',
         'warcio',
         'html5lib>=0.999999999',
-        'Celery',
+        'bottom',
+        'pytz',
+        'websockets',
+        'aiohttp',
+        'PyYAML',
+        'yarl>=1.4,<1.5',
+        'multidict',
     ],
+    extras_require={
+        'manhole': ['manhole>=1.6'],
+    },
    entry_points={
         'console_scripts': [
-            'crocoite-grab = crocoite.cli:main',
-            'crocoite-merge-warc = crocoite.tools:mergeWarc',
+            # the main executable
+            'crocoite = crocoite.cli:recursive',
+            # backend helper
+            'crocoite-single = crocoite.cli:single',
+            # irc bot and dashboard
+            'crocoite-irc = crocoite.cli:irc',
+            'crocoite-irc-dashboard = crocoite.cli:dashboard',
+            # misc tools
+            'crocoite-merge-warc = crocoite.tools:mergeWarcCli',
             'crocoite-extract-screenshot = crocoite.tools:extractScreenshot',
+            'crocoite-errata = crocoite.tools:errata',
         ],
     },
     package_data={
         'crocoite': ['data/*'],
     },
+    setup_requires=['pytest-runner'],
+    tests_require=["pytest", 'pytest-asyncio', 'pytest-cov', 'hypothesis'],
+    python_requires='>=3.6',
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: POSIX',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Topic :: Internet :: WWW/HTTP',
+    ],
 )