From f6d405ced3e9330195109f7c0cef1d40863b1dd0 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 25 Feb 2018 15:46:33 +0100 Subject: Initial import --- .gitignore | 2 + README.rst | 88 +++++++++++++++++++++++++++++++++++++++++++ swayback/__init__.py | 68 +++++++++++++++++++++++++++++++++ swayback/static/sw.js | 39 +++++++++++++++++++ swayback/templates/index.html | 19 ++++++++++ swayback/templates/sw.html | 41 ++++++++++++++++++++ 6 files changed, 257 insertions(+) create mode 100644 .gitignore create mode 100644 README.rst create mode 100644 swayback/__init__.py create mode 100644 swayback/static/sw.js create mode 100644 swayback/templates/index.html create mode 100644 swayback/templates/sw.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3975598 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +*.sw? diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..59629fe --- /dev/null +++ b/README.rst @@ -0,0 +1,88 @@ +swayback +======== + +This is a proof of concept for Service Worker-based web app replay, similar to +archive.org’s Wayback Machine. + +Rationale +--------- + +Traditionally replaying websites relied heavily on rewriting URL’s in static +HTML pages to adapt them to a new origin and path hierarchy (i.e. +``https://web.archive.org/web//``). With the rise of web apps, which +load their content dynamically, this is no longer sufficient. + +Let’s look at Instagram as an example for this: User’s profiles dynamically +load content to implement “infinite scrolling”. The corresponding request is a +GraphQL query, which returns JSON-encoded data with an application-defined +structure. This response includes URL’s to images, which must be rewritten as +well, in order for replay to work correctly. So the replay software needs to +parse and rewrite JSON as well as HTML. 
+ +However, this response could have used an arbitrary serialization format and +may contain relative URLs or just values used in a URL template, which are +more difficult to spot than absolute URLs. This makes server-side rewriting +difficult and cumbersome, perhaps even impossible. + +Implementation +-------------- + +Instead swayback relies on a new web technology called *Service Workers*. These +can be installed for a given domain and path prefix. They basically act as a +proxy between the browser and server, allowing them to intercept and rewrite +any request a web app makes. Which is exactly what we need to properly replay +archived web apps. + +So swayback provides an HTTP server, responding to queries for the wildcard +domain ``*.swayback.localhost``. The page served first installs a service +worker and then reloads the page. Now the service worker is in control of +network requests and rewrites a request like (for instance) +``www.instagram.com.swayback.localhost:5000/bluebellwooi/`` to +``swayback.localhost:5000/raw`` with the real URL in the POST request body. +swayback’s server looks up that URL in the WARC files provided and replies +with the original server’s response, which is then returned by the service +worker to the browser without modification. + +Usage +----- + +Since this is a proof of concept, functionality is quite limited. You’ll need +the following Python packages: + +- flask +- warcio + +swayback uses the hardcoded domain ``swayback.localhost``, which means you need +to set up your DNS resolver accordingly. An example for unbound looks like +this: + +.. code:: unbound + + local-zone: "swayback.localhost." redirect + local-data: "swayback.localhost. 30 IN A 127.0.0.1" + +After you recorded some WARCs, move them into swayback’s base directory and run: + +.. 
code:: bash + + export FLASK_APP=swayback/__init__.py + export FLASK_DEBUG=1 + flask run --with-threads + +Then navigate to http://swayback.localhost:5000, which (hopefully) lists all +HTML pages found in those WARC files. + +Caveats +------- + +- Hardcoded replay domain +- URL lookup is broken, only HTTPS sites work correctly + +Related projects +---------------- + +This approach complements efforts such as crocoite_, a web crawler based on +Google Chrome. + +.. _crocoite: https://github.com/PromyLOPh/crocoite + diff --git a/swayback/__init__.py b/swayback/__init__.py new file mode 100644 index 0000000..80600f3 --- /dev/null +++ b/swayback/__init__.py @@ -0,0 +1,68 @@ +import os + +from flask import Flask, render_template, send_file, request, make_response +from warcio.archiveiterator import ArchiveIterator +from warcio.recordloader import ArcWarcRecordLoader +from warcio.bufferedreaders import DecompressingBufferedReader +from io import BytesIO +from urllib.parse import urlparse, urlunparse + +app = Flask(__name__) +app.url_map.host_matching = True + +htmlindex = [] +urlmap = {} +for filename in os.listdir ('.'): + if not filename.endswith ('.warc.gz'): + continue + print ('using', filename) + with open(filename, 'rb') as stream: + ai = ArchiveIterator(stream) + for record in ai: + if record.rec_type == 'response': + u = urlparse (record.rec_headers.get_header('WARC-Target-URI')) + if u not in urlmap: + urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ()) + httpHeaders = record.http_headers + if httpHeaders.get_header ('content-type', '').startswith ('text/html'): + rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5])) + htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date'))) + +@app.route('/', host='swayback.localhost:5000') +def index (): + """ A simple index of all HTML pages inside the WARCs """ + return render_template ('index.html', index=htmlindex) + 
+@app.route('/raw', host='swayback.localhost:5000', methods=['POST']) +def raw (): + """ Retrieve the original response for a given request """ + print (request.form) + url = urlparse (request.form['url']) + try: + filename, offset, length = urlmap[url] + with open(filename, 'rb') as stream: + stream.seek (offset, 0) + buf = BytesIO (stream.read (length)) + loader = ArcWarcRecordLoader () + record = loader.parse_record_stream (DecompressingBufferedReader (buf)) + statuscode = record.http_headers.get_statuscode () + record.http_headers.remove_header ('Content-Security-Policy') + record.http_headers.replace_header ('Access-Control-Allow-Origin', '*') + headers = record.http_headers.headers + return record.content_stream().read(), statuscode, headers + except KeyError: + resp = make_response ('', 404) + resp.headers.add ('Access-Control-Allow-Origin', '*') + return resp + +@app.route('/static/sw.js', host='') +def sw (host=None): + """ Service worker script needs additional headers """ + return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'} + +# each subdomain will need its own service worker registration +@app.route('/', host='.swayback.localhost:5000', methods=['GET', 'POST']) +def register (path=None, domain=None): + """ Register a service worker for this origin """ + return render_template ('sw.html') + diff --git a/swayback/static/sw.js b/swayback/static/sw.js new file mode 100644 index 0000000..f3a63db --- /dev/null +++ b/swayback/static/sw.js @@ -0,0 +1,39 @@ +self.addEventListener('install', function(event) { + console.log ('installed service worker', event); + self.skipWaiting(); +}); +/* load stuff through service worker immediately? XXX: only debugging? 
*/ +self.addEventListener('activate', event => { + event.waitUntil(clients.claim()); +}); + +self.addEventListener('fetch', function(event) { + console.log ('fetch event', event.request.url, event); + let url = new URL (event.request.url); + url.protocol = 'https:'; + url.port = 443; + url.hash = ''; + if (url.hostname.endsWith ('.swayback.localhost')) { + url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length); + } + console.log ('orig url', url); + let body = new FormData (); + body.append ('url', url); + body.append ('method', event.request.method); + let req = new Request ('http://swayback.localhost:5000/raw', {method: 'POST', body: body}); + + event.respondWith ( + fetch(req) + .then(function (response) { + // response may be used only once + // we need to save clone to put one copy in cache + // and serve second one + let responseClone = response.clone(); + console.log ('got resp', responseClone); + return response; + }) + .catch (function () { + console.log ('nope'); + }) + ); +}); diff --git a/swayback/templates/index.html b/swayback/templates/index.html new file mode 100644 index 0000000..028a5a9 --- /dev/null +++ b/swayback/templates/index.html @@ -0,0 +1,19 @@ + + + + + + Index + + + + + +
    + {% for u, ur, d in index %} +
  • {{ u }} {{ d }}
  • + {% endfor %} +
+ + diff --git a/swayback/templates/sw.html b/swayback/templates/sw.html new file mode 100644 index 0000000..2abfac2 --- /dev/null +++ b/swayback/templates/sw.html @@ -0,0 +1,41 @@ + + + + + + Just a second… + + + + + + +
+

Just a second…

+
+ + + -- cgit v1.2.3