1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
import os
from flask import Flask, render_template, send_file, request, make_response
from warcio.archiveiterator import ArchiveIterator
from warcio.recordloader import ArcWarcRecordLoader
from warcio.bufferedreaders import DecompressingBufferedReader
from io import BytesIO
from urllib.parse import urlparse, urlunparse
# Flask application. Host matching is enabled so that routes can be bound
# either to the main host or to the per-site "<domain>.swayback.localhost:5000"
# hosts used by the views in this module.
app = Flask(__name__)
app.url_map.host_matching = True

# htmlindex: list of (original URL, rewritten URL, WARC-Date) tuples, one per
#            indexed text/html response.
# urlmap:    parsed WARC-Target-URI -> (WARC filename, record offset,
#            record length), used to seek straight to a record later on.
htmlindex = []
urlmap = {}

# Index every *.warc.gz file in the current working directory at import time.
# The listing is sorted so that "first capture wins" (see below) does not
# depend on the arbitrary order returned by os.listdir().
for filename in sorted(os.listdir('.')):
    if not filename.endswith('.warc.gz'):
        continue
    print('using', filename)
    with open(filename, 'rb') as stream:
        ai = ArchiveIterator(stream)
        for record in ai:
            if record.rec_type != 'response':
                continue

            u = urlparse(record.rec_headers.get_header('WARC-Target-URI'))
            httpHeaders = record.http_headers
            # Response records without parsed HTTP headers or without a
            # hostname (e.g. non-HTTP captures) cannot be replayed through
            # the rewritten host scheme; skipping them avoids crashing the
            # whole import on a single such record.
            if httpHeaders is None or u.hostname is None:
                continue

            # Only the first capture of a URL is kept: urlmap can point at a
            # single record per URL, so later captures could not be served.
            if u in urlmap:
                continue
            urlmap[u] = (filename, ai.get_record_offset(), ai.get_record_length())

            if httpHeaders.get_header('content-type', '').startswith('text/html'):
                # Move the original hostname into a subdomain of the replay
                # host; path, params, query and fragment are kept as-is.
                rewrittenUrl = urlunparse((
                    'http',
                    u.hostname + '.swayback.localhost:5000',
                    u[2], u[3], u[4], u[5],
                ))
                htmlindex.append((
                    urlunparse(u),
                    rewrittenUrl,
                    record.rec_headers.get_header('warc-date'),
                ))
@app.route('/', host='swayback.localhost:5000')
def index():
    """Render the landing page listing every HTML page found in the WARCs.

    The template receives the module-level ``htmlindex`` list as ``index``.
    """
    pages = htmlindex
    return render_template('index.html', index=pages)
@app.route('/raw', host='swayback.localhost:5000', methods=['POST'])
def raw():
    """Return the archived HTTP response for the URL given in the POST form.

    Expects a form field ``url``. The parsed URL is looked up in ``urlmap``;
    on a hit the matching record is read from its WARC file by offset and
    length, and its body, status code and headers are returned.

    Returns:
        A ``(body, status, headers)`` tuple for an archived URL, or an empty
        404 response when the URL is not in ``urlmap``. Both carry
        ``Access-Control-Allow-Origin: *``.
    """
    url = urlparse(request.form['url'])

    # Keep the try body minimal: only a missing urlmap entry means "not
    # archived". A KeyError raised while replaying must not become a 404.
    try:
        filename, offset, length = urlmap[url]
    except KeyError:
        resp = make_response('', 404)
        resp.headers.add('Access-Control-Allow-Origin', '*')
        return resp

    # Read exactly one record's bytes and parse it in isolation.
    with open(filename, 'rb') as stream:
        stream.seek(offset, 0)
        buf = BytesIO(stream.read(length))
    loader = ArcWarcRecordLoader()
    record = loader.parse_record_stream(DecompressingBufferedReader(buf))

    http_headers = record.http_headers
    statuscode = http_headers.get_statuscode()
    http_headers.remove_header('Content-Security-Policy')
    # The body below comes from content_stream(), i.e. already de-chunked and
    # decompressed, so the archived framing/encoding headers no longer
    # describe it and would make the client misinterpret the payload.
    for stale in ('Content-Encoding', 'Transfer-Encoding', 'Content-Length'):
        http_headers.remove_header(stale)
    http_headers.replace_header('Access-Control-Allow-Origin', '*')

    return record.content_stream().read(), statuscode, http_headers.headers
@app.route('/static/sw.js', host='<host>')
def sw(host=None):
    """Serve the service worker script together with its extra header.

    The route is bound to ``<host>``, so it answers on any host; the ``host``
    argument itself is not used. ``Service-Worker-Allowed: /`` is sent along
    with the file.
    """
    script = send_file('static/sw.js')
    extra_headers = {'Service-Worker-Allowed': '/'}
    return script, extra_headers
# Each archived site lives on its own subdomain (its own origin), and every
# origin needs its own service worker registration.
# The bare '/' route is required in addition to '/<path:path>': the path
# converter does not match an empty path, so without it the root URL of an
# archived site would return 404.
@app.route('/', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
@app.route('/<path:path>', host='<domain>.swayback.localhost:5000', methods=['GET', 'POST'])
def register(path=None, domain=None):
    """Serve the page that registers a service worker for this origin.

    Args:
        path: Requested path below the origin root; ``None`` for the root
            itself. Not used by the view.
        domain: Subdomain part of the host. Not used by the view.

    Returns:
        The rendered ``sw.html`` template, regardless of path or domain.
    """
    return render_template('sw.html')
|