summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2018-03-16 15:06:38 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2018-03-16 15:06:38 +0100
commit 5d1bd2891183e93f43f5fdb67e0ffcefabcc8eed (patch)
tree 2b77d24e40bf5d6cdbead12e6dfa9f6d062150fa
parent d187373fd3b8f3aa16af8998f4e6481bb4505e60 (diff)
download swayback-master.tar.gz
swayback-master.tar.bz2
swayback-master.zip
Configureable domain (HEAD, master)
-rw-r--r-- README.rst 19
-rw-r--r-- swayback/__init__.py 18
-rw-r--r-- swayback/templates/sw.js (renamed from swayback/static/sw.js) 10
3 files changed, 29 insertions, 18 deletions
diff --git a/README.rst b/README.rst
index fd4ee93..4c948ac 100644
--- a/README.rst
+++ b/README.rst
@@ -34,14 +34,14 @@ any request a web app makes. This is exactly what is needed to properly replay
archived web apps.
swayback provides an HTTP server, responding to queries for the wildcard
-domain ``*.swayback.localhost``. The page served first installs a service
-worker and then reloads the page. Now the service worker is in control of
-network requests and rewrites a request like (for instance)
+domain, which is ``*.swayback.localhost`` by default. The page served first
+installs a service worker and then reloads the page. Now the service worker is
+in control of network requests and rewrites a request like (for instance)
``www.instagram.com.swayback.localhost:5000/bluebellwooi/`` to
``swayback.localhost:5000/raw`` with the real URL in the POST request body.
-swayback’s server looks up that URL in the WARC files provided and replies
-with the original server’s response, which is then returned by the service
-worker to the browser without modification.
+swayback’s server looks up that URL in the WARC files provided and replies with
+the original server’s response, which is then returned by the service worker to
+the browser without modification.
Usage
-----
@@ -52,8 +52,8 @@ the following python packages:
- flask
- warcio
-swayback uses the hardcoded domain ``swayback.localhost``, which means you need
-to set up your DNS resolver accordingly. An example for unbound looks like
+swayback uses the domain ``swayback.localhost`` by default, which means you
+need to set up your DNS resolver accordingly. An example for unbound looks like
this:
.. code:: unbound
@@ -75,8 +75,9 @@ HTML pages found in those WARC files.
Caveats
-------
-- Hardcoded replay domain
- URL lookup is broken, only HTTPS sites work correctly
+- Absolute hyperlink targets to different domains are not intercepted (service
+ worker limitation)
Related projects
----------------
diff --git a/swayback/__init__.py b/swayback/__init__.py
index f609736..d327589 100644
--- a/swayback/__init__.py
+++ b/swayback/__init__.py
@@ -7,8 +7,13 @@ from warcio.archiveiterator import ArchiveIterator
from warcio.recordloader import ArcWarcRecordLoader
from warcio.bufferedreaders import DecompressingBufferedReader
+class DefaultSettings:
+ BASE_HOST = 'swayback.localhost:5000'
+
app = Flask(__name__)
app.url_map.host_matching = True
+app.config.from_object('swayback.DefaultSettings')
+app.config.from_envvar('SWAYBACK_SETTINGS')
htmlindex = []
urlmap = {}
@@ -25,15 +30,15 @@ for filename in os.listdir ('.'):
urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ())
httpHeaders = record.http_headers
if httpHeaders.get_header ('content-type', '').startswith ('text/html'):
- rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5]))
+ rewrittenUrl = urlunparse (('http', u.hostname + '.' + app.config['BASE_HOST'], u[2], u[3], u[4], u[5]))
htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date')))
-@app.route('/', host='swayback.localhost:5000')
+@app.route('/', host=app.config['BASE_HOST'])
def index ():
""" A simple index of all HTML pages inside the WARCs """
return render_template ('index.html', index=htmlindex)
-@app.route('/raw', host='swayback.localhost:5000', methods=['OPTIONS'])
+@app.route('/raw', host=app.config['BASE_HOST'], methods=['OPTIONS'])
def rawPreflight ():
""" CORS preflight request, allow user-defined fetch() headers """
resp = make_response ('', 200)
@@ -54,7 +59,7 @@ def lookupRecord (url):
except KeyError:
return None
-@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
+@app.route('/raw', host=app.config['BASE_HOST'], methods=['POST'])
def raw ():
""" Retrieve the original response for a given request """
data = request.get_json ()
@@ -74,10 +79,11 @@ def raw ():
@app.route('/static/sw.js', host='<host>')
def sw (host=None):
""" Service worker script needs additional headers """
- return send_file ('static/sw.js'), {'Service-Worker-Allowed': '/'}
+ headers = {'Service-Worker-Allowed': '/', 'Content-Type': 'application/javascript'}
+ return render_template ('sw.js', baseHost=app.config['BASE_HOST']), headers
# each subdomain will need its own service worker registration
-@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
+@app.route('/<path:path>', host='<domain>.' + app.config['BASE_HOST'], methods=['GET', 'POST'])
def register (path=None, domain=None):
""" Register a service worker for this origin """
return render_template ('sw.html')
diff --git a/swayback/static/sw.js b/swayback/templates/sw.js
index 56889c0..ea653a9 100644
--- a/swayback/static/sw.js
+++ b/swayback/templates/sw.js
@@ -1,3 +1,7 @@
+/* script config, dynamically generated */
+const baseHost = "{{ baseHost }}";
+const baseDomain = baseHost.split (':', 1)[0];
+
self.addEventListener('install', function(event) {
console.log ('installed service worker', event);
self.skipWaiting();
@@ -18,8 +22,8 @@ self.addEventListener('fetch', function(event) {
url.protocol = 'https:';
url.port = 443;
url.hash = '';
- if (url.hostname.endsWith ('.swayback.localhost')) {
- url.hostname = url.hostname.slice (0, url.hostname.length-'.swayback.localhost'.length);
+ if (url.hostname.endsWith ('.' + baseDomain)) {
+ url.hostname = url.hostname.slice (0, url.hostname.length-('.'+baseDomain).length);
}
console.log ('orig url', url);
/* should contain everything we cannot use in the actual request (i.e. url and method) */
@@ -36,7 +40,7 @@ self.addEventListener('fetch', function(event) {
headers['Accept'] = origreq.headers.get ('accept');
}
console.log ('sending', body, headers);
- let req = new Request ('http://swayback.localhost:5000/raw',
+ let req = new Request ('http://' + baseHost + '/raw',
{method: 'POST', body: JSON.stringify (body), headers: headers,
mode: 'cors'});