summaryrefslogtreecommitdiff
path: root/swayback/__init__.py
blob: 80600f32d5c9f3f56fd5bdcc34b467611bd93884 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os

from flask import Flask, render_template, send_file, request, make_response
from warcio.archiveiterator import ArchiveIterator
from warcio.recordloader import ArcWarcRecordLoader
from warcio.bufferedreaders import DecompressingBufferedReader
from io import BytesIO
from urllib.parse import urlparse, urlunparse

app = Flask(__name__)
# Routes below are selected by Host header as well as by path.
app.url_map.host_matching = True

# One (original url, rewritten url, WARC-Date) tuple per archived HTML response.
htmlindex = []
# Parsed WARC-Target-URI -> (warc filename, record offset, record length).
# The first record seen for a URL wins; later duplicates are ignored.
urlmap = {}
for filename in os.listdir ('.'):
    if not filename.endswith ('.warc.gz'):
        continue
    print ('using', filename)
    with open(filename, 'rb') as stream:
        ai = ArchiveIterator(stream)
        for record in ai:
            if record.rec_type != 'response':
                continue
            u = urlparse (record.rec_headers.get_header('WARC-Target-URI'))
            if u not in urlmap:
                urlmap[u] = (filename, ai.get_record_offset (), ai.get_record_length ())
            httpHeaders = record.http_headers
            # Response records without parsed HTTP headers or without a
            # hostname cannot be listed on a per-host subdomain; previously
            # they raised AttributeError/TypeError and aborted startup.
            if httpHeaders is None or not u.hostname:
                continue
            if httpHeaders.get_header ('content-type', '').startswith ('text/html'):
                # Same path/params/query/fragment, but served from the
                # per-host subdomain of this application.
                rewrittenUrl = urlunparse (('http', u.hostname + '.swayback.localhost:5000', u[2], u[3], u[4], u[5]))
                htmlindex.append ((urlunparse (u), rewrittenUrl, record.rec_headers.get_header ('warc-date')))

@app.route('/', host='swayback.localhost:5000')
def index ():
    """ Render the listing of every HTML page found in the loaded WARCs """
    page = render_template ('index.html', index=htmlindex)
    return page

@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
def raw ():
    """
    Retrieve the original response for a given request

    The archived URL is read from the ``url`` form field. If it is missing or
    not present in ``urlmap`` an empty 404 response is returned. Otherwise the
    stored record is replayed with its original status code and headers,
    except that Content-Security-Policy is dropped, CORS is opened up and
    headers describing an encoding the body no longer has are removed.
    """
    # Only the lookup itself may miss. Keeping it separate from the replay
    # code avoids turning unrelated KeyErrors into silent 404s.
    location = urlmap.get (urlparse (request.form.get ('url', '')))
    if location is None:
        resp = make_response ('', 404)
        resp.headers.add ('Access-Control-Allow-Origin', '*')
        return resp

    filename, offset, length = location
    with open(filename, 'rb') as stream:
        stream.seek (offset, 0)
        buf = BytesIO (stream.read (length))
    loader = ArcWarcRecordLoader ()
    record = loader.parse_record_stream (DecompressingBufferedReader (buf))

    # Must be read before any header is removed: content_stream() consults
    # the record's HTTP headers to decide how to decode the payload.
    body = record.content_stream().read()

    httpHeaders = record.http_headers
    if httpHeaders is None:
        # No HTTP headers were parsed for this record, so there is no status
        # line to replay; serve the payload as-is.
        return body, 200, {'Access-Control-Allow-Origin': '*'}

    statuscode = httpHeaders.get_statuscode ()
    # content_stream() yields a de-chunked body and decompresses the content
    # encodings warcio supports, so the matching headers would now be wrong.
    encoding = httpHeaders.get_header ('Content-Encoding')
    if encoding and encoding.lower () in DecompressingBufferedReader.get_supported_decompressors ():
        httpHeaders.remove_header ('Content-Encoding')
    httpHeaders.remove_header ('Transfer-Encoding')
    httpHeaders.remove_header ('Content-Length')
    httpHeaders.remove_header ('Content-Security-Policy')
    httpHeaders.replace_header ('Access-Control-Allow-Origin', '*')
    return body, statuscode, httpHeaders.headers

@app.route('/static/sw.js', host='<host>')
def sw (host=None):
    """ Serve the service worker script together with its extra header """
    extraHeaders = {'Service-Worker-Allowed': '/'}
    script = send_file ('static/sw.js')
    return script, extraHeaders

# service workers are registered per origin, hence once for every subdomain
@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
def register (path=None, domain=None):
    """ Deliver the page that registers a service worker for this origin """
    page = render_template ('sw.html')
    return page