summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2019-07-02 09:14:55 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2019-07-02 09:16:55 +0200
commit: 9ff793e96139ed40090ab9d8c3cae99b284858e5 (patch)
tree: e1b568fc77c0600a767fea1f541de1d5e85d87a5
parent: 9d8d48358bf44d7a3e4918bcdac3f4ef1348541b (diff)
download: crocoite-9ff793e96139ed40090ab9d8c3cae99b284858e5.tar.gz
crocoite-9ff793e96139ed40090ab9d8c3cae99b284858e5.tar.bz2
crocoite-9ff793e96139ed40090ab9d8c3cae99b284858e5.zip
Stabilize WARC headers
In preparation for 1.0 release:
- Correct mime types
- Add X-Crocoite-Type, so logs, scripts, dom-snapshots and screenshots can be identified easily
- Remove random WARC headers like X-Chrome-Initiator. We don’t want to maintain those.
- Remove non-standard urn-based package URLs. Can’t use them without a urn-registration
-rw-r--r-- crocoite/behavior.py 10
-rw-r--r-- crocoite/test_tools.py 5
-rw-r--r-- crocoite/test_warc.py 28
-rw-r--r-- crocoite/tools.py 4
-rw-r--r-- crocoite/util.py 8
-rw-r--r-- crocoite/warc.py 64
6 files changed, 73 insertions, 46 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index fd4d066..efb2ced 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -52,11 +52,12 @@ class Script:
""" A JavaScript resource """
__slots__ = ('path', 'data')
+ datadir = 'data'
def __init__ (self, path=None, encoding='utf-8'):
self.path = path
if path:
- self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
+ self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding)
def __repr__ (self):
return f'<Script {self.path}>'
@@ -64,6 +65,11 @@ class Script:
def __str__ (self):
return self.data
+ @property
+ def abspath (self):
+ return pkg_resources.resource_filename (__name__,
+ os.path.join (self.datadir, self.path))
+
@classmethod
def fromStr (cls, data, path=None):
s = Script ()
@@ -140,7 +146,7 @@ class JsOnload (Behavior):
constructor = result['objectId']
if self.options:
- yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}/options')
+ yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options')
result = await tab.Runtime.callFunctionOn (
functionDeclaration='function(options){return new this(options);}',
objectId=constructor,
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py
index 106aa59..e8edb98 100644
--- a/crocoite/test_tools.py
+++ b/crocoite/test_tools.py
@@ -28,7 +28,6 @@ from warcio.statusandheaders import StatusAndHeaders
from pkg_resources import parse_version
from .tools import mergeWarc, Errata, FixableErrata
-from .util import packageUrl
@pytest.fixture
def writer():
@@ -49,7 +48,9 @@ def recordsEqual(golden, underTest):
def makeGolden(writer, records):
# additional warcinfo is written. Content does not matter.
- record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ record = writer.create_warc_record (
+ '',
+ 'warcinfo',
payload=b'',
warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
records.insert (0, record)
diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py
index 478892a..17f3840 100644
--- a/crocoite/test_warc.py
+++ b/crocoite/test_warc.py
@@ -57,9 +57,10 @@ def test_log ():
fd.seek (0)
for it in ArchiveIterator (fd):
headers = it.rec_headers
- assert headers['warc-type'] == 'resource'
- assert headers['warc-target-uri'].endswith (':log')
- assert headers['content-type'] == f'text/plain; encoding={handler.logEncoding}'
+ assert headers['warc-type'] == 'metadata'
+ assert 'warc-target-uri' not in headers
+ assert headers['x-crocoite-type'] == 'log'
+ assert headers['content-type'] == f'application/json; encoding={handler.logEncoding}'
while True:
l = it.raw_stream.readline ()
@@ -108,7 +109,8 @@ async def test_push (golden):
headers = rec.rec_headers
assert headers['warc-type'] == 'warcinfo'
- assert headers['warc-target-uri'].endswith (':warcinfo')
+ assert 'warc-target-uri' not in headers
+ assert 'x-crocoite-type' not in headers
data = json.load (rec.raw_stream)
assert data == g.payload
@@ -119,11 +121,14 @@ async def test_push (golden):
rec = next (it)
headers = rec.rec_headers
- assert headers['warc-type'] == 'metadata'
+ assert headers['warc-type'] == 'resource'
+ assert headers['content-type'] == 'application/javascript; charset=utf-8'
+ assert headers['x-crocoite-type'] == 'script'
checkWarcinfoId (headers)
- path = g.path or '-'
- goldenpath = f':script/{urllib.parse.quote (path)}'
- assert headers['warc-target-uri'].endswith (goldenpath), (g.path, path, goldenpath)
+ if g.path:
+ assert URL (headers['warc-target-uri']) == URL ('file://' + g.abspath)
+ else:
+ assert 'warc-target-uri' not in headers
data = rec.raw_stream.read ().decode ('utf-8')
assert data == g.data
@@ -133,6 +138,7 @@ async def test_push (golden):
headers = rec.rec_headers
assert headers['warc-type'] == 'conversion'
+ assert headers['x-crocoite-type'] == 'screenshot'
checkWarcinfoId (headers)
assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url)
assert headers['warc-refers-to'] is None
@@ -144,10 +150,10 @@ async def test_push (golden):
headers = rec.rec_headers
assert headers['warc-type'] == 'conversion'
+ assert headers['x-crocoite-type'] == 'dom-snapshot'
checkWarcinfoId (headers)
assert URL (headers['warc-target-uri']) == g.url
assert headers['warc-refers-to'] is None
- assert headers['X-DOM-Snapshot'] == 'True'
assert rec.raw_stream.read () == g.document
elif isinstance (g, RequestResponsePair):
@@ -156,6 +162,7 @@ async def test_push (golden):
# request
headers = rec.rec_headers
assert headers['warc-type'] == 'request'
+ assert 'x-crocoite-type' not in headers
checkWarcinfoId (headers)
assert URL (headers['warc-target-uri']) == g.url
assert headers['x-chrome-request-id'] == g.id
@@ -164,7 +171,6 @@ async def test_push (golden):
if g.request.hasPostData:
if g.request.body is not None:
assert rec.raw_stream.read () == g.request.body
- assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body)
else:
# body fetch failed
assert headers['warc-truncated'] == 'unspecified'
@@ -181,6 +187,7 @@ async def test_push (golden):
checkWarcinfoId (headers)
assert URL (headers['warc-target-uri']) == g.url
assert headers['x-chrome-request-id'] == g.id
+ assert 'x-crocoite-type' not in headers
# these are checked separately
filteredHeaders = CIMultiDict (httpheaders.headers)
@@ -197,7 +204,6 @@ async def test_push (golden):
if g.response.body is not None:
assert rec.raw_stream.read () == g.response.body
- assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body))
assert httpheaders['content-length'] == str (len (g.response.body))
# body is never truncated if it exists
assert headers['warc-truncated'] is None
diff --git a/crocoite/tools.py b/crocoite/tools.py
index bc14f84..42ced35 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -31,7 +31,7 @@ from yarl import URL
from pkg_resources import parse_version, parse_requirements
-from .util import packageUrl, getSoftwareInfo, StrJsonEncoder
+from .util import getSoftwareInfo, StrJsonEncoder
def mergeWarc (files, output):
# stats
@@ -57,7 +57,7 @@ def mergeWarc (files, output):
'parameters': {'inputs': files},
}
payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
- record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ record = writer.create_warc_record ('', 'warcinfo',
payload=payload,
warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
writer.write_record (record)
diff --git a/crocoite/util.py b/crocoite/util.py
index ded5e99..da377a3 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -26,6 +26,8 @@ import random, sys, platform, os, json, urllib
from datetime import datetime
import hashlib, pkg_resources
+from yarl import URL
+
class StrJsonEncoder (json.JSONEncoder):
""" JSON encoder that turns unknown classes into a string and thus never
fails """
@@ -39,12 +41,6 @@ class StrJsonEncoder (json.JSONEncoder):
except TypeError:
return str (obj)
-def packageUrl (path):
- """
- Create URL for package data stored into WARC
- """
- return 'urn:' + __package__ + ':' + urllib.parse.quote (path)
-
async def getFormattedViewportMetrics (tab):
layoutMetrics = await tab.Page.getLayoutMetrics ()
# XXX: I’m not entirely sure which one we should use here
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 3a084a1..e1cdf35 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -30,11 +30,17 @@ from http.server import BaseHTTPRequestHandler
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
+from yarl import URL
-from .util import packageUrl, StrJsonEncoder
+from .util import StrJsonEncoder
from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
-from .browser import RequestResponsePair, UnicodeBody, Base64Body
+from .browser import RequestResponsePair, UnicodeBody
+
+# the official mimetype for json, according to https://tools.ietf.org/html/rfc8259
+jsonMime = 'application/json'
+# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2
+jsMime = 'application/javascript'
class WarcHandler (EventHandler):
__slots__ = ('logger', 'writer', 'documentRecords', 'log',
@@ -67,6 +73,7 @@ class WarcHandler (EventHandler):
Adds default WARC headers.
"""
+ assert url is None or isinstance (url, URL)
d = {}
if self.warcinfoRecordId:
@@ -74,8 +81,11 @@ class WarcHandler (EventHandler):
d.update (warc_headers_dict)
warc_headers_dict = d
- record = self.writer.create_warc_record (str (url), kind, payload=payload,
- warc_headers_dict=warc_headers_dict, http_headers=http_headers)
+ record = self.writer.create_warc_record (str (url) if url else '',
+ kind,
+ payload=payload,
+ warc_headers_dict=warc_headers_dict,
+ http_headers=http_headers)
self.writer.write_record (record)
return record
@@ -90,7 +100,7 @@ class WarcHandler (EventHandler):
httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
req.headers, protocol='HTTP/1.1', is_http_request=True)
warcHeaders = {
- 'X-Chrome-Initiator': json.dumps (req.initiator),
+ # required to correlate request with log entries
'X-Chrome-Request-ID': item.id,
'WARC-Date': datetime_to_iso_date (req.timestamp),
}
@@ -102,7 +112,6 @@ class WarcHandler (EventHandler):
uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
warcHeaders['WARC-Truncated'] = 'unspecified'
else:
- warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
body = BytesIO (body)
record = self.writeRecord (url, 'request',
payload=body, http_headers=httpHeaders,
@@ -117,14 +126,13 @@ class WarcHandler (EventHandler):
resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
+ # required to correlate request with log entries
'X-Chrome-Request-ID': item.id,
'WARC-Date': datetime_to_iso_date (resp.timestamp),
}
# conditional WARC headers
if item.remoteIpAddress:
warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
- if item.protocol:
- warcHeaders['X-Chrome-Protocol'] = item.protocol
# HTTP headers
statusText = resp.statusText or \
@@ -153,7 +161,6 @@ class WarcHandler (EventHandler):
warcHeaders['WARC-Truncated'] = 'unspecified'
else:
httpHeaders.replace_header ('Content-Length', str (len (body)))
- warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
body = BytesIO (body)
record = self.writeRecord (item.url, 'response',
@@ -166,11 +173,15 @@ class WarcHandler (EventHandler):
def _writeScript (self, item):
writer = self.writer
encoding = 'utf-8'
- path = item.path or '-'
- self.writeRecord (packageUrl (f'script/{path}'), 'metadata',
+ # XXX: yes, we’re leaking information about the user here, but this is
+ # the one and only source URL of the scripts.
+ uri = URL(f'file://{item.abspath}') if item.path else None
+ self.writeRecord (uri, 'resource',
payload=BytesIO (str (item).encode (encoding)),
- warc_headers_dict={'Content-Type':
- f'application/javascript; charset={encoding}'})
+ warc_headers_dict={
+ 'Content-Type': f'{jsMime}; charset={encoding}',
+ 'X-Crocoite-Type': 'script',
+ })
def _writeItem (self, item):
assert item.request
@@ -190,7 +201,8 @@ class WarcHandler (EventHandler):
def _writeDomSnapshot (self, item):
writer = self.writer
- warcHeaders = {'X-DOM-Snapshot': str (True),
+ warcHeaders = {
+ 'X-Crocoite-Type': 'dom-snapshot',
'X-Chrome-Viewport': item.viewport,
'Content-Type': 'text/html; charset=utf-8',
}
@@ -203,18 +215,21 @@ class WarcHandler (EventHandler):
def _writeScreenshot (self, item):
writer = self.writer
- warcHeaders = {'Content-Type': 'image/png',
- 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)}
+ warcHeaders = {
+ 'Content-Type': 'image/png',
+ 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff),
+ 'X-Crocoite-Type': 'screenshot',
+ }
self._addRefersTo (warcHeaders, item.url)
self.writeRecord (item.url, 'conversion',
payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
- def _writeControllerStart (self, item):
- payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode ('utf-8'))
+ def _writeControllerStart (self, item, encoding='utf-8'):
+ payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode (encoding))
writer = self.writer
- warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo',
- warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
+ warcinfo = self.writeRecord (None, 'warcinfo',
+ warc_headers_dict={'Content-Type': f'{jsonMime}; encoding={encoding}'},
payload=payload)
self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']
@@ -222,9 +237,12 @@ class WarcHandler (EventHandler):
if self.log.tell () > 0:
writer = self.writer
self.log.seek (0)
- # XXX: we should use the type continuation here
- self.writeRecord (packageUrl ('log'), 'resource', payload=self.log,
- warc_headers_dict={'Content-Type': f'text/plain; encoding={self.logEncoding}'})
+ warcHeaders = {
+ 'Content-Type': f'application/json; encoding={self.logEncoding}',
+ 'X-Crocoite-Type': 'log',
+ }
+ self.writeRecord (None, 'metadata', payload=self.log,
+ warc_headers_dict=warcHeaders)
self.log = BytesIO ()
def _writeLog (self, item):