diff options
-rw-r--r-- | README.rst | 19 | ||||
-rw-r--r-- | swayback/__init__.py | 18 | ||||
-rw-r--r-- | swayback/templates/sw.js (renamed from swayback/static/sw.js) | 10 |
3 files changed, 29 insertions, 18 deletions
@@ -34,14 +34,14 @@ any request a web app makes. This is exactly what is needed to properly replay archived web apps. swayback provides an HTTP server, responding to queries for the wildcard -domain ``*.swayback.localhost``. The page served first installs a service -worker and then reloads the page. Now the service worker is in control of -network requests and rewrites a request like (for instance) +domain, which is ``*.swayback.localhost`` by default. The page served first +installs a service worker and then reloads the page. Now the service worker is +in control of network requests and rewrites a request like (for instance) ``www.instagram.com.swayback.localhost:5000/bluebellwooi/`` to ``swayback.localhost:5000/raw`` with the real URL in the POST request body. -swayback’s server looks up that URL in the WARC files provided and replies -with the original server’s response, which is then returned by the service -worker to the browser without modification. +swayback’s server looks up that URL in the WARC files provided and replies with +the original server’s response, which is then returned by the service worker to +the browser without modification. Usage ----- @@ -52,8 +52,8 @@ the following python packages: - flask - warcio -swayback uses the hardcoded domain ``swayback.localhost``, which means you need -to set up your DNS resolver accordingly. An example for unbound looks like +swayback uses the domain ``swayback.localhost`` by default, which means you +need to set up your DNS resolver accordingly. An example for unbound looks like this: .. code:: unbound @@ -75,8 +75,9 @@ HTML pages found in those WARC files. 
Caveats ------- -- Hardcoded replay domain - URL lookup is broken, only HTTPS sites work correctly +- Absolute hyperlink targets to different domains are not intercepted (service + worker limitation) Related projects ---------------- diff --git a/swayback/__init__.py b/swayback/__init__.py index f609736..d327589 100644 --- a/swayback/__init__.py +++ b/swayback/__init__.py @@ -7,8 +7,13 @@ from warcio.archiveiterator import ArchiveIterator from warcio.recordloader import ArcWarcRecordLoader from warcio.bufferedreaders import DecompressingBufferedReader +class DefaultSettings: + BASE_HOST = 'swayback.localhost:5000' + app = Flask(__name__) app.url_map.host_matching = True +app.config.from_object('swayback.DefaultSettings') +app.config.from_envvar('SWAYBACK_SETTINGS') htmlindex = [] urlmap = {} @@ -25,15 +30,15 @@ for filename in os.listdir ('.'): urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ()) httpHeaders = record.http_headers if httpHeaders.get_header ('content-type', '').startswith ('text/html'): - rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5])) + rewrittenUrl = urlunparse (('http', u.hostname + '.' 
+ app.config['BASE_HOST'], u[2], u[3], u[4], u[5])) htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date'))) -@app.route('/', host='swayback.localhost:5000') +@app.route('/', host=app.config['BASE_HOST']) def index (): """ A simple index of all HTML pages inside the WARCs """ return render_template ('index.html', index=htmlindex) -@app.route('/raw', host='swayback.localhost:5000', methods=['OPTIONS']) +@app.route('/raw', host=app.config['BASE_HOST'], methods=['OPTIONS']) def rawPreflight (): """ CORS preflight request, allow user-defined fetch() headers """ resp = make_response ('', 200) @@ -54,7 +59,7 @@ def lookupRecord (url): except KeyError: return None -@app.route('/raw', host='swayback.localhost:5000', methods=['POST']) +@app.route('/raw', host=app.config['BASE_HOST'], methods=['POST']) def raw (): """ Retrieve the original response for a given request """ data = request.get_json () @@ -74,10 +79,11 @@ def raw (): @app.route('/static/sw.js', host='<host>') def sw (host=None): """ Service worker script needs additional headers """ - return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'} + headers = {'Service-Worker-Allowed': '/', 'Content-Type': 'application/javascript'} + return render_template ('sw.js', baseHost=app.config['BASE_HOST']), headers # each subdomain will need its own service worker registration -@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST']) +@app.route('/<path:path>', host='<domain>.' 
+ app.config['BASE_HOST'], methods=['GET', 'POST']) def register (path=None, domain=None): """ Register a service worker for this origin """ return render_template ('sw.html') diff --git a/swayback/static/sw.js b/swayback/templates/sw.js index 56889c0..ea653a9 100644 --- a/swayback/static/sw.js +++ b/swayback/templates/sw.js @@ -1,3 +1,7 @@ +/* script config, dynamically generated */ +const baseHost = "{{ baseHost }}"; +const baseDomain = baseHost.split (':', 1)[0]; + self.addEventListener('install', function(event) { console.log ('installed service worker', event); self.skipWaiting(); @@ -18,8 +22,8 @@ self.addEventListener('fetch', function(event) { url.protocol = 'https:'; url.port = 443; url.hash = ''; - if (url.hostname.endsWith ('.swayback.localhost')) { - url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length); + if (url.hostname.endsWith ('.' + baseDomain)) { + url.hostname = url.hostname.slice (0, url.hostname.length-('.'+baseDomain).length); } console.log ('orig url', url); /* should contain everything we cannot use in the actual request (i.e. url and method) */ @@ -36,7 +40,7 @@ self.addEventListener('fetch', function(event) { headers['Accept'] = origreq.headers.get ('accept'); } console.log ('sending', body, headers); - let req = new Request ('http://swayback.localhost:5000/raw', + let req = new Request ('http://' + baseHost + '/raw', {method: 'POST', body: JSON.stringify (body), headers: headers, mode: 'cors'}); |