From f6d405ced3e9330195109f7c0cef1d40863b1dd0 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Sun, 25 Feb 2018 15:46:33 +0100
Subject: Initial import

---
 swayback/__init__.py          | 89 +++++++++++++++++++++++++++++++++++++++++++
 swayback/static/sw.js         | 40 +++++++++++++++++++
 swayback/templates/index.html | 19 +++++++++
 swayback/templates/sw.html    | 41 ++++++++++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 swayback/__init__.py
 create mode 100644 swayback/static/sw.js
 create mode 100644 swayback/templates/index.html
 create mode 100644 swayback/templates/sw.html

(limited to 'swayback')

diff --git a/swayback/__init__.py b/swayback/__init__.py
new file mode 100644
index 0000000..80600f3
--- /dev/null
+++ b/swayback/__init__.py
@@ -0,0 +1,89 @@
+import os
+
+from flask import Flask, render_template, send_file, request, make_response
+from warcio.archiveiterator import ArchiveIterator
+from warcio.recordloader import ArcWarcRecordLoader
+from warcio.bufferedreaders import DecompressingBufferedReader
+from io import BytesIO
+from urllib.parse import urlparse, urlunparse
+
+app = Flask(__name__)
+app.url_map.host_matching = True
+
+# (original URL, rewritten URL, WARC-Date) of every archived HTML response
+htmlindex = []
+# parsed target URL -> (WARC file name, record offset, record length)
+urlmap = {}
+# sorted, so which duplicate record wins does not depend on the arbitrary
+# order os.listdir returns
+for filename in sorted (os.listdir ('.')):
+    if not filename.endswith ('.warc.gz'):
+        continue
+    print ('using', filename)
+    with open (filename, 'rb') as stream:
+        ai = ArchiveIterator (stream)
+        for record in ai:
+            if record.rec_type != 'response':
+                continue
+            targetUri = record.rec_headers.get_header ('WARC-Target-URI')
+            httpHeaders = record.http_headers
+            if not targetUri or httpHeaders is None:
+                # cannot index or replay a record lacking URL or HTTP headers
+                continue
+            u = urlparse (targetUri)
+            if u not in urlmap:
+                urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ())
+            isHtml = httpHeaders.get_header ('content-type', '').startswith ('text/html')
+            # the rewritten URL is built from the hostname, skip URLs without
+            if isHtml and u.hostname:
+                rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5]))
+                htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date')))
+
+@app.route('/', host='swayback.localhost:5000')
+def index ():
+    """ A simple index of all HTML pages inside the WARCs """
+    return render_template ('index.html', index=htmlindex)
+
+@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
+def raw ():
+    """
+    Retrieve the original response for a given request
+
+    Expects the form field `url`. Answers with the archived body, status
+    and headers, or with an empty 404 if the URL is not in any WARC.
+    """
+    url = urlparse (request.form['url'])
+    try:
+        filename, offset, length = urlmap[url]
+    except KeyError:
+        resp = make_response ('', 404)
+        resp.headers.add ('Access-Control-Allow-Origin', '*')
+        return resp
+
+    with open (filename, 'rb') as stream:
+        stream.seek (offset, 0)
+        buf = BytesIO (stream.read (length))
+    loader = ArcWarcRecordLoader ()
+    record = loader.parse_record_stream (DecompressingBufferedReader (buf))
+    statuscode = record.http_headers.get_statuscode ()
+    record.http_headers.remove_header ('Content-Security-Policy')
+    # content_stream() below undoes content and transfer encoding, so the
+    # archived framing headers no longer describe the body that is sent
+    for name in ('Content-Encoding', 'Transfer-Encoding', 'Content-Length'):
+        record.http_headers.remove_header (name)
+    record.http_headers.replace_header ('Access-Control-Allow-Origin', '*')
+    headers = record.http_headers.headers
+    return record.content_stream ().read (), statuscode, headers
+
+@app.route('/static/sw.js', host='<host>')
+def sw (host=None):
+    """ Service worker script needs additional headers """
+    return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'}
+
+# each subdomain will need its own service worker registration
+@app.route('/', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
+@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
+def register (path=None, domain=None):
+    """ Register a service worker for this origin """
+    return render_template ('sw.html')
+
diff --git a/swayback/static/sw.js b/swayback/static/sw.js
new file mode 100644
index 0000000..f3a63db
--- /dev/null
+++ b/swayback/static/sw.js
@@ -0,0 +1,40 @@
+self.addEventListener('install', function(event) {
+    console.log ('installed service worker', event);
+    self.skipWaiting();
+});
+/* load stuff through service worker immediately? XXX: only debugging? */
+self.addEventListener('activate', event => {
+    event.waitUntil(clients.claim());
+});
+
+/* answer every request of this origin with the archived response */
+self.addEventListener('fetch', function(event) {
+    console.log ('fetch event', event.request.url, event);
+    /* map the rewritten URL back to the one sent to /raw */
+    let url = new URL (event.request.url);
+    url.protocol = 'https:';
+    url.port = 443;
+    url.hash = '';
+    if (url.hostname.endsWith ('.swayback.localhost')) {
+        url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length);
+    }
+    console.log ('orig url', url);
+    let body = new FormData ();
+    body.append ('url', url);
+    body.append ('method', event.request.method);
+    let req = new Request ('http://swayback.localhost:5000/raw', {method: 'POST', body: body});
+
+    event.respondWith (
+        fetch(req)
+        .then(function (response) {
+            console.log ('got resp', response);
+            return response;
+        })
+        .catch (function (err) {
+            console.log ('nope', err);
+            /* respondWith must be settled with a Response, returning
+             * nothing here would hand it undefined instead */
+            return Response.error ();
+        })
+    );
+});
diff --git a/swayback/templates/index.html b/swayback/templates/index.html
new file mode 100644
index 0000000..028a5a9
--- /dev/null
+++ b/swayback/templates/index.html
@@ -0,0 +1,19 @@
+
+
+
+
+
+ Index
+
+
+
+
+
+
    + {% for u, ur, d in index %} +
  • {{ u }} {{ d }}
  • + {% endfor %} +
+ + diff --git a/swayback/templates/sw.html b/swayback/templates/sw.html new file mode 100644 index 0000000..2abfac2 --- /dev/null +++ b/swayback/templates/sw.html @@ -0,0 +1,41 @@ + + + + + + Just a second… + + + + + + +
+

Just a second…

+
+ + + -- cgit v1.2.3