summaryrefslogtreecommitdiff
path: root/swayback
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-02-25 15:46:33 +0100
committerLars-Dominik Braun <lars@6xq.net>2018-02-25 15:46:33 +0100
commitf6d405ced3e9330195109f7c0cef1d40863b1dd0 (patch)
tree473204f94a5305530412bc2da9fb7c12f67f6394 /swayback
downloadswayback-f6d405ced3e9330195109f7c0cef1d40863b1dd0.tar.gz
swayback-f6d405ced3e9330195109f7c0cef1d40863b1dd0.tar.bz2
swayback-f6d405ced3e9330195109f7c0cef1d40863b1dd0.zip
Initial import
Diffstat (limited to 'swayback')
-rw-r--r--swayback/__init__.py68
-rw-r--r--swayback/static/sw.js39
-rw-r--r--swayback/templates/index.html19
-rw-r--r--swayback/templates/sw.html41
4 files changed, 167 insertions, 0 deletions
diff --git a/swayback/__init__.py b/swayback/__init__.py
new file mode 100644
index 0000000..80600f3
--- /dev/null
+++ b/swayback/__init__.py
@@ -0,0 +1,68 @@
+import os
+
+from flask import Flask, render_template, send_file, request, make_response
+from warcio.archiveiterator import ArchiveIterator
+from warcio.recordloader import ArcWarcRecordLoader
+from warcio.bufferedreaders import DecompressingBufferedReader
+from io import BytesIO
+from urllib.parse import urlparse, urlunparse
+
+app = Flask(__name__)
+app.url_map.host_matching = True
+
+# Build the lookup tables once, at import time, from every *.warc.gz file in
+# the current working directory.
+#
+# htmlindex: list of (original URL, rewritten URL, value of the warc-date
+#   header) tuples for HTML responses; passed to the index.html template.
+# urlmap: maps the urlparse() result of a response record's WARC-Target-URI to
+#   (WARC filename, record offset, record length), so the record can be read
+#   again later without rescanning the file.
+htmlindex = []
+urlmap = {}
+for filename in os.listdir ('.'):
+ if not filename.endswith ('.warc.gz'):
+ continue
+ print ('using', filename)
+ with open(filename, 'rb') as stream:
+ ai = ArchiveIterator(stream)
+ for record in ai:
+ # only response records are indexed
+ if record.rec_type == 'response':
+ u = urlparse (record.rec_headers.get_header('WARC-Target-URI'))
+ # the first record seen for a URL wins, later ones are not stored
+ if u not in urlmap:
+ urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ())
+ # NOTE(review): http_headers and u.hostname are used without a None check;
+ # confirm every response record has HTTP headers and a host in its target URI
+ httpHeaders = record.http_headers
+ if httpHeaders.get_header ('content-type', '').startswith ('text/html'):
+ # same path, params, query and fragment, but plain http on the
+ # <original hostname>.swayback.localhost:5000 subdomain
+ rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5]))
+ htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date')))
+
+@app.route('/', host='swayback.localhost:5000')
+def index ():
+ """ A simple index of all HTML pages inside the WARCs """
+ return render_template ('index.html', index=htmlindex)
+
+@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
+def raw ():
+ """ Retrieve the original response for a given request """
+ print (request.form)
+ url = urlparse (request.form['url'])
+ try:
+ filename, offset, length = urlmap[url]
+ with open(filename, 'rb') as stream:
+ stream.seek (offset, 0)
+ buf = BytesIO (stream.read (length))
+ loader = ArcWarcRecordLoader ()
+ record = loader.parse_record_stream (DecompressingBufferedReader (buf))
+ statuscode = record.http_headers.get_statuscode ()
+ record.http_headers.remove_header ('Content-Security-Policy')
+ record.http_headers.replace_header ('Access-Control-Allow-Origin', '*')
+ headers = record.http_headers.headers
+ return record.content_stream().read(), statuscode, headers
+ except KeyError:
+ resp = make_response ('', 404)
+ resp.headers.add ('Access-Control-Allow-Origin', '*')
+ return resp
+
+@app.route('/static/sw.js', host='<host>')
+def sw (host=None):
+ """ Service worker script needs additional headers """
+ return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'}
+
+# each subdomain will need its own service worker registration
+@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
+def register (path=None, domain=None):
+ """ Register a service worker for this origin """
+ return render_template ('sw.html')
+
diff --git a/swayback/static/sw.js b/swayback/static/sw.js
new file mode 100644
index 0000000..f3a63db
--- /dev/null
+++ b/swayback/static/sw.js
@@ -0,0 +1,39 @@
+self.addEventListener('install', function(event) {
+ console.log ('installed service worker', event);
+ self.skipWaiting();
+});
+/* load stuff through service worker immediately? XXX: only debugging? */
+self.addEventListener('activate', event => {
+ event.waitUntil(clients.claim());
+});
+
+self.addEventListener('fetch', function(event) {
+ console.log ('fetch event', event.request.url, event);
+ let url = new URL (event.request.url);
+ url.protocol = 'https:';
+ url.port = 443;
+ url.hash = '';
+ if (url.hostname.endsWith ('.swayback.localhost')) {
+ url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length);
+ }
+ console.log ('orig url', url);
+ let body = new FormData ();
+ body.append ('url', url);
+ body.append ('method', event.request.method);
+ let req = new Request ('http://swayback.localhost:5000/raw', {method: 'POST', body: body});
+
+ event.respondWith (
+ fetch(req)
+ .then(function (response) {
+ // response may be used only once
+ // we need to save clone to put one copy in cache
+ // and serve second one
+ let responseClone = response.clone();
+ console.log ('got resp', responseClone);
+ return response;
+ })
+ .catch (function () {
+ console.log ('nope');
+ })
+ );
+});
diff --git a/swayback/templates/index.html b/swayback/templates/index.html
new file mode 100644
index 0000000..028a5a9
--- /dev/null
+++ b/swayback/templates/index.html
@@ -0,0 +1,19 @@
+<!doctype html>
+<html>
+ <head>
+ <meta charset="utf-8">
+ <meta http-equiv="x-ua-compatible" content="ie=edge">
+ <title>Index</title>
+ <meta name="description" content="">
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+ <style>
+ </style>
+ </head>
+ <body>
+ {# index is a list of 3-tuples: u is shown as the link text, ur is the
+    link target and d is printed after the link. The index view passes
+    (original URL, rewritten URL, WARC date) for each archived HTML page. #}
+ <ul>
+ {% for u, ur, d in index %}
+ <li><a href="{{ ur }}">{{ u }}</a> {{ d }}</li>
+ {% endfor %}
+ </ul>
+ </body>
+</html>
diff --git a/swayback/templates/sw.html b/swayback/templates/sw.html
new file mode 100644
index 0000000..2abfac2
--- /dev/null
+++ b/swayback/templates/sw.html
@@ -0,0 +1,41 @@
+<!doctype html>
+<html>
+    <head>
+        <meta charset="utf-8">
+        <meta http-equiv="x-ua-compatible" content="ie=edge">
+        <title>Just a second…</title>
+        <meta name="description" content="">
+        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+        <style>
+        .hidden {
+            display: none;
+        }
+        </style>
+    </head>
+    <body>
+        <div class="hidden" id="unsupported">
+            <p>Service workers are not supported by your browser.</p>
+        </div>
+        <div id="waiting">
+            <p>Just a second…</p>
+        </div>
+        <script>
+        /* Bootstrap page: registers the service worker for this origin and
+         * reloads once it is active, so that the reload is answered by the
+         * worker instead of by this page. */
+
+        /* swap the waiting message for the "not supported" notice */
+        function showUnsupported () {
+            document.getElementById ('unsupported').classList.remove ('hidden');
+            document.getElementById ('waiting').classList.add ('hidden');
+        }
+
+        if ('serviceWorker' in navigator) {
+            /* service workers must be hosted in the same origin (i.e. subdomain) */
+            navigator.serviceWorker.register('/static/sw.js', {scope: '/'})
+                .then(function(reg) {
+                    /* register() may resolve before the worker is active;
+                     * reloading that early only fetches this page again */
+                    return navigator.serviceWorker.ready;
+                }).then(function() {
+                    /* load actual content using the service worker */
+                    window.location.reload ();
+                }).catch(function(error) {
+                    console.log ('sw error', error);
+                    showUnsupported ();
+                });
+        } else {
+            console.log ('not supported');
+            showUnsupported ();
+        }
+        </script>
+    </body>
+</html>