From 5d1bd2891183e93f43f5fdb67e0ffcefabcc8eed Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Fri, 16 Mar 2018 15:06:38 +0100
Subject: Configurable domain

---
 README.rst               | 19 ++++++++-------
 swayback/__init__.py     | 18 +++++++++-----
 swayback/static/sw.js    | 57 --------------------------------------------
 swayback/templates/sw.js | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 72 deletions(-)
 delete mode 100644 swayback/static/sw.js
 create mode 100644 swayback/templates/sw.js

diff --git a/README.rst b/README.rst
index fd4ee93..4c948ac 100644
--- a/README.rst
+++ b/README.rst
@@ -34,14 +34,14 @@ any request a web app makes. This is exactly what is needed to properly replay
 archived web apps.
 
 swayback provides an HTTP server, responing to queries for the wildcard
-domain ``*.swayback.localhost``. The page served first installs a service
-worker and then reloads the page. Now the service worker is in control of
-network requests and rewrites a request like (for instance)
+domain, which is ``*.swayback.localhost`` by default. The page served first
+installs a service worker and then reloads the page. Now the service worker is
+in control of network requests and rewrites a request like (for instance)
 ``www.instagram.com.swayback.localhost:5000/bluebellwooi/`` to
 ``swayback.localhost:5000/raw`` with the real URL in the POST request body.
-swayback’s server looks up that URL in the WARC files provided and replies
-with the original server’s response, which is then returned by the service
-worker to the browser without modification.
+swayback’s server looks up that URL in the WARC files provided and replies with
+the original server’s response, which is then returned by the service worker to
+the browser without modification.
 
 Usage
 -----
@@ -52,8 +52,8 @@ the following python packages:
 - flask
 - warcio
 
-swayback uses the hardcoded domain ``swayback.localhost``, which means you need
-to set up your DNS resolver accordingly. An example for unbound looks like
+swayback uses the domain ``swayback.localhost`` by default, which means you
+need to set up your DNS resolver accordingly. An example for unbound looks like
 this:
 
 .. code:: unbound
@@ -75,8 +75,9 @@ HTML pages found in those WARC files.
 Caveats
 -------
 
-- Hardcoded replay domain
 - URL lookup is broken, only HTTPS sites work correctly
+- Absolute hyperlink targets to different domains are not intercepted (service
+  worker limitation)
 
 Related projects
 ----------------
diff --git a/swayback/__init__.py b/swayback/__init__.py
index f609736..d327589 100644
--- a/swayback/__init__.py
+++ b/swayback/__init__.py
@@ -7,8 +7,13 @@ from warcio.archiveiterator import ArchiveIterator
 from warcio.recordloader import ArcWarcRecordLoader
 from warcio.bufferedreaders import DecompressingBufferedReader
 
+class DefaultSettings:
+    BASE_HOST = 'swayback.localhost:5000'  # host:port of the index and /raw; archived hosts become subdomains of it
+
 app = Flask(__name__)
 app.url_map.host_matching = True
+app.config.from_object('swayback.DefaultSettings')
+app.config.from_envvar('SWAYBACK_SETTINGS', silent=True)  # optional override; defaults apply when the variable is unset
 
 htmlindex = []
 urlmap = {}
@@ -25,15 +30,15 @@ for filename in os.listdir ('.'):
                     urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ())
                 httpHeaders = record.http_headers
                 if httpHeaders.get_header ('content-type', '').startswith ('text/html'):
-                    rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5]))
+                    rewrittenUrl = urlunparse (('http', u.hostname + '.' + app.config['BASE_HOST'], u[2], u[3], u[4], u[5]))
                     htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date')))
 
-@app.route('/', host='swayback.localhost:5000')
+@app.route('/', host=app.config['BASE_HOST'])
 def index ():
     """ A simple index of all HTML pages inside the WARCs """
     return render_template ('index.html', index=htmlindex)
 
-@app.route('/raw', host='swayback.localhost:5000', methods=['OPTIONS'])
+@app.route('/raw', host=app.config['BASE_HOST'], methods=['OPTIONS'])
 def rawPreflight ():
     """ CORS preflight request, allow user-defined fetch() headers """
     resp = make_response ('', 200)
@@ -54,7 +59,7 @@ def lookupRecord (url):
     except KeyError:
         return None
 
-@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
+@app.route('/raw', host=app.config['BASE_HOST'], methods=['POST'])
 def raw ():
     """ Retrieve the original response for a given request """
     data = request.get_json ()
@@ -74,10 +79,11 @@ def raw ():
 @app.route('/static/sw.js', host='<host>')
 def sw (host=None):
     """ Service worker script needs additional headers """
-    return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'}
+    headers = {'Service-Worker-Allowed': '/', 'Content-Type': 'application/javascript'}
+    return render_template ('sw.js', baseHost=app.config['BASE_HOST']), headers
 
 # each subdomain will need its own service worker registration
-@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
+@app.route('/<path:path>', host='<domain>.' + app.config['BASE_HOST'], methods=['GET', 'POST'])
 def register (path=None, domain=None):
     """ Register a service worker for this origin """
     return render_template ('sw.html')
diff --git a/swayback/static/sw.js b/swayback/static/sw.js
deleted file mode 100644
index 56889c0..0000000
--- a/swayback/static/sw.js
+++ /dev/null
@@ -1,57 +0,0 @@
-self.addEventListener('install', function(event) {
-	console.log ('installed service worker', event);
-	self.skipWaiting();
-});
-/* load stuff through service worker immediately? XXX: only debugging? */
-self.addEventListener('activate', async function() {
-    if (self.registration.navigationPreload) {
-      // Enable navigation preloads!
-      await self.registration.navigationPreload.enable();
-    } /*event => {
-  event.waitUntil(clients.claim());*/
-});
-
-self.addEventListener('fetch', function(event) {
-	let origreq = event.request;
-	console.log ('fetch event', origreq.url, event);
-	let url = new URL (origreq.url);
-	url.protocol = 'https:';
-	url.port = 443;
-	url.hash = '';
-	if (url.hostname.endsWith ('.swayback.localhost')) {
-		url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length);
-	}
-	console.log ('orig url', url);
-	/* should contain everything we cannot use in the actual request (i.e. url and method) */
-	let body = {
-		'url': url.href,
-		'method': origreq.method,
-		};
-	let headers = {
-		'Content-Type': 'application/json',
-		};
-	/* add a few well-known request headers */
-	let origheaders = origreq.headers;
-	if (origheaders.has ('accept')) {
-		headers['Accept'] = origreq.headers.get ('accept');
-	}
-	console.log ('sending', body, headers);
-	let req = new Request ('http://swayback.localhost:5000/raw',
-			{method: 'POST', body: JSON.stringify (body), headers: headers,
-			mode: 'cors'});
-
-	event.respondWith (
-		fetch(req)
-		.then(function (response) {
-			// response may be used only once
-			// we need to save clone to put one copy in cache
-			// and serve second one
-			let responseClone = response.clone();
-			console.log ('got resp', responseClone);
-			return response;
-		})
-		.catch (function () {
-			console.log ('nope');
-		})
-	);
-});
diff --git a/swayback/templates/sw.js b/swayback/templates/sw.js
new file mode 100644
index 0000000..ea653a9
--- /dev/null
+++ b/swayback/templates/sw.js
@@ -0,0 +1,61 @@
+/* script config, rendered by the server; tojson emits a correctly quoted and escaped JS string literal */
+const baseHost = {{ baseHost|tojson }};
+const baseDomain = baseHost.split (':', 1)[0]; /* host name without the port */
+
+self.addEventListener('install', function(event) {
+	console.log ('installed service worker', event);
+	self.skipWaiting();
+});
+/* load stuff through service worker immediately? XXX: only debugging? */
+self.addEventListener('activate', async function() {
+    if (self.registration.navigationPreload) {
+      // Enable navigation preloads!
+      await self.registration.navigationPreload.enable();
+    } /*event => {
+  event.waitUntil(clients.claim());*/
+});
+
+self.addEventListener('fetch', function(event) {
+	let origreq = event.request;
+	console.log ('fetch event', origreq.url, event);
+	let url = new URL (origreq.url);
+	url.protocol = 'https:';
+	url.port = 443;
+	url.hash = '';
+	if (url.hostname.endsWith ('.' + baseDomain)) {
+		url.hostname = url.hostname.slice (0, url.hostname.length-('.'+baseDomain).length);
+	}
+	console.log ('orig url', url);
+	/* should contain everything we cannot use in the actual request (i.e. url and method) */
+	let body = {
+		'url': url.href,
+		'method': origreq.method,
+		};
+	let headers = {
+		'Content-Type': 'application/json',
+		};
+	/* add a few well-known request headers */
+	let origheaders = origreq.headers;
+	if (origheaders.has ('accept')) {
+		headers['Accept'] = origreq.headers.get ('accept');
+	}
+	console.log ('sending', body, headers);
+	let req = new Request ('http://' + baseHost + '/raw',
+			{method: 'POST', body: JSON.stringify (body), headers: headers,
+			mode: 'cors'});
+
+	event.respondWith (
+		fetch(req)
+		.then(function (response) {
+			// response may be used only once
+			// we need to save clone to put one copy in cache
+			// and serve second one
+			let responseClone = response.clone();
+			console.log ('got resp', responseClone);
+			return response;
+		})
+		.catch (function () {
+			console.log ('nope');
+		})
+	);
+});
-- 
cgit v1.2.3