From 5d1bd2891183e93f43f5fdb67e0ffcefabcc8eed Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Fri, 16 Mar 2018 15:06:38 +0100 Subject: Configureable domain --- README.rst | 19 ++++++++------- swayback/__init__.py | 18 +++++++++----- swayback/static/sw.js | 57 -------------------------------------------- swayback/templates/sw.js | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 72 deletions(-) delete mode 100644 swayback/static/sw.js create mode 100644 swayback/templates/sw.js diff --git a/README.rst b/README.rst index fd4ee93..4c948ac 100644 --- a/README.rst +++ b/README.rst @@ -34,14 +34,14 @@ any request a web app makes. This is exactly what is needed to properly replay archived web apps. swayback provides an HTTP server, responing to queries for the wildcard -domain ``*.swayback.localhost``. The page served first installs a service -worker and then reloads the page. Now the service worker is in control of -network requests and rewrites a request like (for instance) +domain, which is ``*.swayback.localhost`` by default. The page served first +installs a service worker and then reloads the page. Now the service worker is +in control of network requests and rewrites a request like (for instance) ``www.instagram.com.swayback.localhost:5000/bluebellwooi/`` to ``swayback.localhost:5000/raw`` with the real URL in the POST request body. -swayback’s server looks up that URL in the WARC files provided and replies -with the original server’s response, which is then returned by the service -worker to the browser without modification. +swayback’s server looks up that URL in the WARC files provided and replies with +the original server’s response, which is then returned by the service worker to +the browser without modification. Usage ----- @@ -52,8 +52,8 @@ the following python packages: - flask - warcio -swayback uses the hardcoded domain ``swayback.localhost``, which means you need -to set up your DNS resolver accordingly. An example for unbound looks like +swayback uses the domain ``swayback.localhost`` by default, which means you +need to set up your DNS resolver accordingly. An example for unbound looks like this: .. code:: unbound @@ -75,8 +75,9 @@ HTML pages found in those WARC files. Caveats ------- -- Hardcoded replay domain - URL lookup is broken, only HTTPS sites work correctly +- Absolute hyperlink targets to different domains are not intercepted (service + worker limitation) Related projects ---------------- diff --git a/swayback/__init__.py b/swayback/__init__.py index f609736..d327589 100644 --- a/swayback/__init__.py +++ b/swayback/__init__.py @@ -7,8 +7,13 @@ from warcio.archiveiterator import ArchiveIterator from warcio.recordloader import ArcWarcRecordLoader from warcio.bufferedreaders import DecompressingBufferedReader +class DefaultSettings: + BASE_HOST = 'swayback.localhost:5000' + app = Flask(__name__) app.url_map.host_matching = True +app.config.from_object('swayback.DefaultSettings') +app.config.from_envvar('SWAYBACK_SETTINGS') htmlindex = [] urlmap = {} @@ -25,15 +30,15 @@ for filename in os.listdir ('.'): urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ()) httpHeaders = record.http_headers if httpHeaders.get_header ('content-type', '').startswith ('text/html'): - rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5])) + rewrittenUrl = urlunparse (('http', u.hostname + '.' + app.config['BASE_HOST'], u[2], u[3], u[4], u[5])) htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date'))) -@app.route('/', host='swayback.localhost:5000') +@app.route('/', host=app.config['BASE_HOST']) def index (): """ A simple index of all HTML pages inside the WARCs """ return render_template ('index.html', index=htmlindex) -@app.route('/raw', host='swayback.localhost:5000', methods=['OPTIONS']) +@app.route('/raw', host=app.config['BASE_HOST'], methods=['OPTIONS']) def rawPreflight (): """ CORS preflight request, allow user-defined fetch() headers """ resp = make_response ('', 200) @@ -54,7 +59,7 @@ def lookupRecord (url): except KeyError: return None -@app.route('/raw', host='swayback.localhost:5000', methods=['POST']) +@app.route('/raw', host=app.config['BASE_HOST'], methods=['POST']) def raw (): """ Retrieve the original response for a given request """ data = request.get_json () @@ -74,10 +79,11 @@ def raw (): @app.route('/static/sw.js', host='') def sw (host=None): """ Service worker script needs additional headers """ - return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'} + headers = {'Service-Worker-Allowed': '/', 'Content-Type': 'application/javascript'} + return render_template ('sw.js', baseHost=app.config['BASE_HOST']), headers # each subdomain will need its own service worker registration -@app.route('/', host='.swayback.localhost:5000', methods=['GET', 'POST']) +@app.route('/', host='.' + app.config['BASE_HOST'], methods=['GET', 'POST']) def register (path=None, domain=None): """ Register a service worker for this origin """ return render_template ('sw.html') diff --git a/swayback/static/sw.js b/swayback/static/sw.js deleted file mode 100644 index 56889c0..0000000 --- a/swayback/static/sw.js +++ /dev/null @@ -1,57 +0,0 @@ -self.addEventListener('install', function(event) { - console.log ('installed service worker', event); - self.skipWaiting(); -}); -/* load stuff through service worker immediately? XXX: only debugging? */ -self.addEventListener('activate', async function() { - if (self.registration.navigationPreload) { - // Enable navigation preloads! - await self.registration.navigationPreload.enable(); - } /*event => { - event.waitUntil(clients.claim());*/ -}); - -self.addEventListener('fetch', function(event) { - let origreq = event.request; - console.log ('fetch event', origreq.url, event); - let url = new URL (origreq.url); - url.protocol = 'https:'; - url.port = 443; - url.hash = ''; - if (url.hostname.endsWith ('.swayback.localhost')) { - url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length); - } - console.log ('orig url', url); - /* should contain everything we cannot use in the actual request (i.e. url and method) */ - let body = { - 'url': url.href, - 'method': origreq.method, - }; - let headers = { - 'Content-Type': 'application/json', - }; - /* add a few well-known request headers */ - let origheaders = origreq.headers; - if (origheaders.has ('accept')) { - headers['Accept'] = origreq.headers.get ('accept'); - } - console.log ('sending', body, headers); - let req = new Request ('http://swayback.localhost:5000/raw', - {method: 'POST', body: JSON.stringify (body), headers: headers, - mode: 'cors'}); - - event.respondWith ( - fetch(req) - .then(function (response) { - // response may be used only once - // we need to save clone to put one copy in cache - // and serve second one - let responseClone = response.clone(); - console.log ('got resp', responseClone); - return response; - }) - .catch (function () { - console.log ('nope'); - }) - ); -}); diff --git a/swayback/templates/sw.js b/swayback/templates/sw.js new file mode 100644 index 0000000..ea653a9 --- /dev/null +++ b/swayback/templates/sw.js @@ -0,0 +1,61 @@ +/* script config, dynamically generated */ +const baseHost = "{{ baseHost }}"; +const baseDomain = baseHost.split (':', 1)[0]; + +self.addEventListener('install', function(event) { + console.log ('installed service worker', event); + self.skipWaiting(); +}); +/* load stuff through service worker immediately? XXX: only debugging? */ +self.addEventListener('activate', async function() { + if (self.registration.navigationPreload) { + // Enable navigation preloads! + await self.registration.navigationPreload.enable(); + } /*event => { + event.waitUntil(clients.claim());*/ +}); + +self.addEventListener('fetch', function(event) { + let origreq = event.request; + console.log ('fetch event', origreq.url, event); + let url = new URL (origreq.url); + url.protocol = 'https:'; + url.port = 443; + url.hash = ''; + if (url.hostname.endsWith ('.' + baseDomain)) { + url.hostname = url.hostname.slice (0, url.hostname.length-('.'+baseDomain).length); + } + console.log ('orig url', url); + /* should contain everything we cannot use in the actual request (i.e. url and method) */ + let body = { + 'url': url.href, + 'method': origreq.method, + }; + let headers = { + 'Content-Type': 'application/json', + }; + /* add a few well-known request headers */ + let origheaders = origreq.headers; + if (origheaders.has ('accept')) { + headers['Accept'] = origreq.headers.get ('accept'); + } + console.log ('sending', body, headers); + let req = new Request ('http://' + baseHost + '/raw', + {method: 'POST', body: JSON.stringify (body), headers: headers, + mode: 'cors'}); + + event.respondWith ( + fetch(req) + .then(function (response) { + // response may be used only once + // we need to save clone to put one copy in cache + // and serve second one + let responseClone = response.clone(); + console.log ('got resp', responseClone); + return response; + }) + .catch (function () { + console.log ('nope'); + }) + ); +}); -- cgit v1.2.3