summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--README.rst88
-rw-r--r--swayback/__init__.py68
-rw-r--r--swayback/static/sw.js39
-rw-r--r--swayback/templates/index.html19
-rw-r--r--swayback/templates/sw.html41
6 files changed, 257 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3975598
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+*.sw?
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..59629fe
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,88 @@
+swayback
+========
+
+This is a proof of concept for Service Worker-based web app replay, similar to
+archive.org’s Wayback Machine.
+
+Rationale
+---------
+
+Traditionally replaying websites relied heavily on rewriting URL’s in static
+HTML pages to adapt them to a new origin and path hierarchy (i.e.
+``https://web.archive.org/web/<date>/<url>``). With the rise of web apps, which
+load their content dynamically, this is no longer sufficient.
+
+Let’s look at Instagram as an example for this: Users’ profiles dynamically
+load content to implement “infinite scrolling”. The corresponding request is a
+GraphQL query, which returns JSON-encoded data with an application-defined
+structure. This response includes URL’s to images, which must be rewritten as
+well, in order for replay to work correctly. So the replay software needs to
+parse and rewrite JSON as well as HTML.
+
+However, this response could have used an arbitrary serialization format and
+may contain relative URL’s or just values used in a URL template, which are
+more difficult to spot than absolute URL’s. This makes server-side rewriting
+difficult and cumbersome, perhaps even impossible.
+
+Implementation
+--------------
+
+Instead swayback relies on a new web technology called *Service Workers*. These
+can be installed for a given domain and path prefix. They basically act as a
+proxy between the browser and server, allowing them to intercept and rewrite
+any request a web app makes. Which is exactly what we need to properly replay
+archived web apps.
+
+So swayback provides an HTTP server, responding to queries for the wildcard
+domain ``*.swayback.localhost``. The page served first installs a service
+worker and then reloads the page. Now the service worker is in control of
+network requests and rewrites a request like (for instance)
+``www.instagram.com.swayback.localhost:5000/bluebellwooi/`` to
+``swayback.localhost:5000/raw`` with the real URL in the POST request body.
+swayback’s server looks up that URL in the WARC files provided and replies
+with the original server’s response, which is then returned by the service
+worker to the browser without modification.
+
+Usage
+-----
+
+Since this is a proof of concept, functionality is quite limited. You’ll need
+the following python packages:
+
+- flask
+- warcio
+
+swayback uses the hardcoded domain ``swayback.localhost``, which means you need
+to set up your DNS resolver accordingly. An example for unbound looks like
+this:
+
+.. code:: unbound
+
+ local-zone: "swayback.localhost." redirect
+ local-data: "swayback.localhost. 30 IN A 127.0.0.1"
+
+After you recorded some WARCs move them into swayback’s base directory and run:
+
+.. code:: bash
+
+ export FLASK_APP=swayback/__init__.py
+ export FLASK_DEBUG=1
+ flask run --with-threads
+
+Then navigate to http://swayback.localhost:5000, which (hopefully) lists all
+HTML pages found in those WARC files.
+
+Caveats
+-------
+
+- Hardcoded replay domain
+- URL lookup is broken, only HTTPS sites work correctly
+
+Related projects
+----------------
+
+This approach complements efforts such as crocoite_, a web crawler based on
+Google Chrome.
+
+.. _crocoite: https://github.com/PromyLOPh/crocoite
+
diff --git a/swayback/__init__.py b/swayback/__init__.py
new file mode 100644
index 0000000..80600f3
--- /dev/null
+++ b/swayback/__init__.py
@@ -0,0 +1,68 @@
+import os
+
+from flask import Flask, render_template, send_file, request, make_response
+from warcio.archiveiterator import ArchiveIterator
+from warcio.recordloader import ArcWarcRecordLoader
+from warcio.bufferedreaders import DecompressingBufferedReader
+from io import BytesIO
+from urllib.parse import urlparse, urlunparse
+
app = Flask(__name__)
# match routes on the full host name so every archived site gets its own
# *.swayback.localhost origin
app.url_map.host_matching = True

# (original url, rewritten replay url, warc date) for every archived HTML page
htmlindex = []
# parsed original URL -> (warc filename, record offset, record length)
urlmap = {}

def _scanwarc (filename):
    """ Index all response records of one WARC file into urlmap/htmlindex. """
    with open(filename, 'rb') as stream:
        ai = ArchiveIterator(stream)
        for record in ai:
            if record.rec_type != 'response':
                continue
            u = urlparse (record.rec_headers.get_header('WARC-Target-URI'))
            # keep the first capture of a URL only
            if u not in urlmap:
                urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ())
            httpHeaders = record.http_headers
            # only HTML pages go into the index; a record without a hostname
            # cannot be mapped to a replay subdomain (would raise TypeError
            # on the concatenation below), so skip it
            if u.hostname is not None and httpHeaders.get_header ('content-type', '').startswith ('text/html'):
                rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5]))
                htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date')))

for filename in os.listdir ('.'):
    if not filename.endswith ('.warc.gz'):
        continue
    print ('using', filename)
    _scanwarc (filename)
+
@app.route('/', host='swayback.localhost:5000')
def index ():
    """ Render a plain list of every HTML page found in the WARCs """
    pages = htmlindex
    return render_template ('index.html', index=pages)
+
@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
def raw ():
    """ Retrieve the original response for a given request

    Expects the real URL in the POST field ``url`` and replies with the
    archived server response, or an empty 404 if the URL is not archived. """
    url = urlparse (request.form['url'])
    try:
        # EAFP: keep the try minimal, only this lookup may raise KeyError;
        # the original wrapped the whole body, masking unrelated KeyErrors
        filename, offset, length = urlmap[url]
    except KeyError:
        resp = make_response ('', 404)
        # the service worker fetches cross-origin, so CORS must allow it
        resp.headers.add ('Access-Control-Allow-Origin', '*')
        return resp
    with open(filename, 'rb') as stream:
        stream.seek (offset, 0)
        buf = BytesIO (stream.read (length))
    loader = ArcWarcRecordLoader ()
    record = loader.parse_record_stream (DecompressingBufferedReader (buf))
    statuscode = record.http_headers.get_statuscode ()
    # the original site's CSP would block the rewritten resources
    record.http_headers.remove_header ('Content-Security-Policy')
    record.http_headers.replace_header ('Access-Control-Allow-Origin', '*')
    headers = record.http_headers.headers
    return record.content_stream().read(), statuscode, headers
+
@app.route('/static/sw.js', host='<host>')
def sw (host=None):
    """ Deliver the service worker script together with the extra header
    that widens its allowed scope to the whole origin """
    headers = {'Service-Worker-Allowed': '/'}
    return send_file ('static/sw.js'), headers
+
# each subdomain will need its own service worker registration
@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
def register (path=None, domain=None):
    """ Serve the bootstrap page that installs a service worker for this origin """
    page = render_template ('sw.html')
    return page
+
diff --git a/swayback/static/sw.js b/swayback/static/sw.js
new file mode 100644
index 0000000..f3a63db
--- /dev/null
+++ b/swayback/static/sw.js
@@ -0,0 +1,39 @@
/* activate the freshly installed worker right away instead of waiting for
 * older workers to retire */
self.addEventListener('install', event => {
    console.log ('installed service worker', event);
    self.skipWaiting();
});
/* claim all open pages immediately so their requests are intercepted without
 * another navigation; XXX: possibly only useful for debugging */
self.addEventListener('activate', function (event) {
    event.waitUntil(clients.claim());
});
+
self.addEventListener('fetch', function(event) {
    console.log ('fetch event', event.request.url, event);
    /* reconstruct the original URL from the replay hostname; the backend
     * currently looks up HTTPS captures only (see README caveats) */
    let url = new URL (event.request.url);
    url.protocol = 'https:';
    url.port = '443';
    url.hash = '';
    const suffix = '.swayback.localhost';
    if (url.hostname.endsWith (suffix)) {
        url.hostname = url.hostname.slice (0, url.hostname.length - suffix.length);
    }
    console.log ('orig url', url);
    /* ship the real URL in the POST body so the backend need not parse it
     * out of the rewritten request URL */
    let body = new FormData ();
    body.append ('url', url);
    body.append ('method', event.request.method);
    let req = new Request ('http://swayback.localhost:5000/raw', {method: 'POST', body: body});

    event.respondWith (
        fetch(req)
        .catch (function (error) {
            /* respondWith() must resolve to a Response; the previous code
             * resolved with undefined here, which surfaces as an opaque
             * TypeError instead of a diagnosable gateway error */
            console.log ('fetching archived response failed', error);
            return new Response ('', {status: 502, statusText: 'swayback backend unreachable'});
        })
    );
});
diff --git a/swayback/templates/index.html b/swayback/templates/index.html
new file mode 100644
index 0000000..028a5a9
--- /dev/null
+++ b/swayback/templates/index.html
@@ -0,0 +1,19 @@
<!doctype html>
<html>
  <head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <title>Index</title>
    <meta name="description" content="">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <style>
    </style>
  </head>
  <body>
    <!-- one entry per archived HTML page: u is the original URL, ur the
         rewritten *.swayback.localhost replay URL, d the WARC capture date
         (tuples built in swayback/__init__.py, htmlindex) -->
    <ul>
    {% for u, ur, d in index %}
      <li><a href="{{ ur }}">{{ u }}</a> {{ d }}</li>
    {% endfor %}
    </ul>
  </body>
</html>
diff --git a/swayback/templates/sw.html b/swayback/templates/sw.html
new file mode 100644
index 0000000..2abfac2
--- /dev/null
+++ b/swayback/templates/sw.html
@@ -0,0 +1,41 @@
<!doctype html>
<html>
  <head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <title>Just a second…</title>
    <meta name="description" content="">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <style>
      .hidden {
        display: none;
      }
    </style>
  </head>
  <body>
    <!-- revealed only when service worker registration is unavailable or fails -->
    <div class="hidden" id="unsupported">
      <p>Service workers are not supported by your browser.</p>
    </div>
    <!-- progress message shown while the worker registers -->
    <div id="waiting">
      <p>Just a second…</p>
    </div>
    <script>
      if ('serviceWorker' in navigator) {
        /* service workers must be hosted in the same origin (i.e. subdomain) */
        navigator.serviceWorker.register('/static/sw.js', {scope: '/'})
        .then(function(reg) {
          /* load actual content using the service worker */
          window.location.reload ();
        }).catch(function(error) {
          /* registration failed: swap the progress message for the error note */
          console.log ('sw error', error);
          document.getElementById ('unsupported').classList.remove ('hidden');
          document.getElementById ('waiting').classList.add ('hidden');
        });
      } else {
        console.log ('not supported');
        document.getElementById ('unsupported').classList.remove ('hidden');
        document.getElementById ('waiting').classList.add ('hidden');
      }
    </script>
  </body>
</html>