author	Lars-Dominik Braun <lars@6xq.net>	2018-02-25 15:46:33 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2018-02-25 15:46:33 +0100
commit	f6d405ced3e9330195109f7c0cef1d40863b1dd0 (patch)
tree	473204f94a5305530412bc2da9fb7c12f67f6394
Initial import
-rw-r--r--	.gitignore	2
-rw-r--r--	README.rst	88
-rw-r--r--	swayback/__init__.py	68
-rw-r--r--	swayback/static/sw.js	39
-rw-r--r--	swayback/templates/index.html	19
-rw-r--r--	swayback/templates/sw.html	41
6 files changed, 257 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3975598
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+*.sw?
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..59629fe
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,88 @@
+swayback
+========
+
+This is a proof of concept for Service Worker-based web app replay, similar to
+archive.org’s Wayback Machine.
+
+Rationale
+---------
+
+Traditionally, replaying websites relied heavily on rewriting URLs in static
+HTML pages to adapt them to a new origin and path hierarchy (e.g.
+``https://web.archive.org/web/<date>/<url>``). With the rise of web apps, which
+load their content dynamically, this is no longer sufficient.
+
+Let’s look at Instagram as an example: user profiles dynamically load content
+to implement “infinite scrolling”. The corresponding request is a GraphQL
+query, which returns JSON-encoded data with an application-defined structure.
+This response includes URLs to images, which must be rewritten as well for
+replay to work correctly. So the replay software needs to parse and rewrite
+JSON as well as HTML.
+
+However, this response could have used an arbitrary serialization format and
+may contain relative URLs or just values substituted into a URL template, which
+are more difficult to spot than absolute URLs. This makes server-side rewriting
+difficult and cumbersome, perhaps even impossible.
+
+Implementation
+--------------
+
+Instead, swayback relies on a new web technology called *Service Workers*.
+These can be installed for a given origin and path prefix (their *scope*). They
+essentially act as a proxy between the browser and the server, allowing them to
+intercept and rewrite any request a web app makes, which is exactly what we
+need to properly replay archived web apps.
+
+So swayback provides an HTTP server, responding to queries for the wildcard
+domain ``*.swayback.localhost``. The page served first installs a service
+worker and then reloads the page. Now the service worker is in control of
+network requests and rewrites a request like (for instance)
+``www.instagram.com.swayback.localhost:5000/bluebellwooi/`` to
+``swayback.localhost:5000/raw`` with the real URL in the POST request body.
+swayback’s server looks up that URL in the WARC files provided and replies
+with the original server’s response, which is then returned by the service
+worker to the browser without modification.
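+
+The following is a rough sketch (not part of swayback, and assuming the
+original site was served via HTTPS on the default port) of the hostname
+mapping the service worker applies before asking the server for the archived
+response:
+
+.. code:: python
+
+   from urllib.parse import urlparse, urlunparse
+
+   def original_url (replay_url):
+       """ Map e.g. http://www.instagram.com.swayback.localhost:5000/bluebellwooi/
+       back to https://www.instagram.com/bluebellwooi/ """
+       u = urlparse (replay_url)
+       host = u.hostname
+       suffix = '.swayback.localhost'
+       if host.endswith (suffix):
+           host = host[:-len (suffix)]
+       # assumption: the original site was served via HTTPS on the default port
+       return urlunparse (('https', host, u.path, u.params, u.query, ''))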
+
+Usage
+-----
+
+Since this is a proof of concept, functionality is quite limited. You’ll need
+the following Python packages:
+
+- flask
+- warcio
+
+swayback uses the hardcoded domain ``swayback.localhost``, which means you need
+to set up your DNS resolver accordingly. An example configuration for unbound
+looks like this:
+
+.. code:: unbound
+
+ local-zone: "swayback.localhost." redirect
+ local-data: "swayback.localhost. 30 IN A 127.0.0.1"
+
+After you have recorded some WARCs (for instance with ``wget --warc-file``),
+move them into swayback’s base directory and run:
+
+.. code:: bash
+
+ export FLASK_APP=swayback/__init__.py
+ export FLASK_DEBUG=1
+ flask run --with-threads
+
+Then navigate to http://swayback.localhost:5000, which (hopefully) lists all
+HTML pages found in those WARC files.
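+
+For debugging it can be handy to query the ``/raw`` endpoint directly,
+bypassing the service worker. A minimal example, assuming the ``requests``
+package is installed and the URL is one listed on the index page:
+
+.. code:: python
+
+   import requests
+
+   # ask swayback for the archived response of this URL
+   resp = requests.post ('http://swayback.localhost:5000/raw',
+           data={'url': 'https://www.instagram.com/bluebellwooi/'})
+   print (resp.status_code, resp.headers.get ('content-type'))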
+
+Caveats
+-------
+
+- Hardcoded replay domain
+- URL lookup is broken; only HTTPS sites work correctly
+
+Related projects
+----------------
+
+This approach complements efforts such as crocoite_, a web crawler based on
+Google Chrome.
+
+.. _crocoite: https://github.com/PromyLOPh/crocoite
+
diff --git a/swayback/__init__.py b/swayback/__init__.py
new file mode 100644
index 0000000..80600f3
--- /dev/null
+++ b/swayback/__init__.py
@@ -0,0 +1,68 @@
+import os
+
+from flask import Flask, render_template, send_file, request, make_response
+from warcio.archiveiterator import ArchiveIterator
+from warcio.recordloader import ArcWarcRecordLoader
+from warcio.bufferedreaders import DecompressingBufferedReader
+from io import BytesIO
+from urllib.parse import urlparse, urlunparse
+
+app = Flask(__name__)
+app.url_map.host_matching = True
+
+htmlindex = []
+urlmap = {}
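+# Scan all WARC files in the current directory and build an in-memory index
+# mapping each archived URL to its record's location (file, offset, length).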
+for filename in os.listdir ('.'):
+ if not filename.endswith ('.warc.gz'):
+ continue
+ print ('using', filename)
+ with open(filename, 'rb') as stream:
+ ai = ArchiveIterator(stream)
+ for record in ai:
+ if record.rec_type == 'response':
+ u = urlparse (record.rec_headers.get_header('WARC-Target-URI'))
+ if u not in urlmap:
+ urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ())
+ httpHeaders = record.http_headers
+ if httpHeaders.get_header ('content-type', '').startswith ('text/html'):
+ rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5]))
+ htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date')))
+
+@app.route('/', host='swayback.localhost:5000')
+def index ():
+ """ A simple index of all HTML pages inside the WARCs """
+ return render_template ('index.html', index=htmlindex)
+
+@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
+def raw ():
+ """ Retrieve the original response for a given request """
+ print (request.form)
+ url = urlparse (request.form['url'])
+ try:
+ filename, offset, length = urlmap[url]
+ with open(filename, 'rb') as stream:
+ stream.seek (offset, 0)
+ buf = BytesIO (stream.read (length))
+ loader = ArcWarcRecordLoader ()
+ record = loader.parse_record_stream (DecompressingBufferedReader (buf))
+ statuscode = record.http_headers.get_statuscode ()
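+        # Drop CSP and allow any origin, since the service worker fetches this
+        # endpoint from a different origin (*.swayback.localhost).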
+ record.http_headers.remove_header ('Content-Security-Policy')
+ record.http_headers.replace_header ('Access-Control-Allow-Origin', '*')
+ headers = record.http_headers.headers
+ return record.content_stream().read(), statuscode, headers
+ except KeyError:
+ resp = make_response ('', 404)
+ resp.headers.add ('Access-Control-Allow-Origin', '*')
+ return resp
+
+@app.route('/static/sw.js', host='<host>')
+def sw (host=None):
+ """ Service worker script needs additional headers """
+ return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'}
+
+# each subdomain will need its own service worker registration
+@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
+def register (path=None, domain=None):
+ """ Register a service worker for this origin """
+ return render_template ('sw.html')
+
diff --git a/swayback/static/sw.js b/swayback/static/sw.js
new file mode 100644
index 0000000..f3a63db
--- /dev/null
+++ b/swayback/static/sw.js
@@ -0,0 +1,39 @@
+self.addEventListener('install', function(event) {
+ console.log ('installed service worker', event);
+ self.skipWaiting();
+});
+/* load stuff through service worker immediately? XXX: only debugging? */
+self.addEventListener('activate', event => {
+ event.waitUntil(clients.claim());
+});
+
+self.addEventListener('fetch', function(event) {
+ console.log ('fetch event', event.request.url, event);
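+	/* Reconstruct the original URL: assume the page was archived over HTTPS on
+	 * the default port and strip the replay suffix from the hostname. */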
+ let url = new URL (event.request.url);
+ url.protocol = 'https:';
+ url.port = 443;
+ url.hash = '';
+ if (url.hostname.endsWith ('.swayback.localhost')) {
+ url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length);
+ }
+ console.log ('orig url', url);
+ let body = new FormData ();
+ body.append ('url', url);
+ body.append ('method', event.request.method);
+ let req = new Request ('http://swayback.localhost:5000/raw', {method: 'POST', body: body});
+
+ event.respondWith (
+ fetch(req)
+ .then(function (response) {
+		// a response body can only be consumed once, so log a clone and
+		// pass the original through unmodified
+ let responseClone = response.clone();
+ console.log ('got resp', responseClone);
+ return response;
+ })
+	.catch (function () {
+		console.log ('nope');
+		// return an explicit error response instead of undefined, which
+		// would make respondWith() fail with a network error
+		return new Response ('', {status: 502, statusText: 'swayback fetch failed'});
+	})
+ );
+});
diff --git a/swayback/templates/index.html b/swayback/templates/index.html
new file mode 100644
index 0000000..028a5a9
--- /dev/null
+++ b/swayback/templates/index.html
@@ -0,0 +1,19 @@
+<!doctype html>
+<html>
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
+ <title>Index</title>
+ <meta name="description" content="">
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+ <style>
+ </style>
+ </head>
+ <body>
+ <ul>
+ {% for u, ur, d in index %}
+ <li><a href="{{ ur }}">{{ u }}</a> {{ d }}</li>
+ {% endfor %}
+ </ul>
+ </body>
+</html>
diff --git a/swayback/templates/sw.html b/swayback/templates/sw.html
new file mode 100644
index 0000000..2abfac2
--- /dev/null
+++ b/swayback/templates/sw.html
@@ -0,0 +1,41 @@
+<!doctype html>
+<html>
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
+ <title>Just a second…</title>
+ <meta name="description" content="">
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+ <style>
+ .hidden {
+ display: none;
+ }
+ </style>
+ </head>
+ <body>
+ <div class="hidden" id="unsupported">
+ <p>Service workers are not supported by your browser.</p>
+ </div>
+ <div id="waiting">
+ <p>Just a second…</p>
+ </div>
+ <script>
+ if ('serviceWorker' in navigator) {
+ /* service workers must be hosted in the same origin (i.e. subdomain) */
+ navigator.serviceWorker.register('/static/sw.js', {scope: '/'})
+ .then(function(reg) {
+ /* load actual content using the service worker */
+ window.location.reload ();
+ }).catch(function(error) {
+ console.log ('sw error', error);
+ document.getElementById ('unsupported').classList.remove ('hidden');
+ document.getElementById ('waiting').classList.add ('hidden');
+ });
+ } else {
+ console.log ('not supported');
+ document.getElementById ('unsupported').classList.remove ('hidden');
+ document.getElementById ('waiting').classList.add ('hidden');
+ }
+ </script>
+ </body>
+</html>