diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-07-02 09:14:55 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-07-02 09:16:55 +0200 |
commit | 9ff793e96139ed40090ab9d8c3cae99b284858e5 (patch) | |
tree | e1b568fc77c0600a767fea1f541de1d5e85d87a5 | |
parent | 9d8d48358bf44d7a3e4918bcdac3f4ef1348541b (diff) | |
download | crocoite-9ff793e96139ed40090ab9d8c3cae99b284858e5.tar.gz crocoite-9ff793e96139ed40090ab9d8c3cae99b284858e5.tar.bz2 crocoite-9ff793e96139ed40090ab9d8c3cae99b284858e5.zip |
Stabilize WARC headers
In preparation for the 1.0 release:
- Correct MIME types
- Add X-Crocoite-Type, so logs, scripts, dom-snapshots and screenshots
can be identified easily
- Remove random WARC headers like X-Chrome-Initiator. We don’t want to
maintain those.
- Remove non-standard URN-based package URLs. We can’t use them without a
URN namespace registration
-rw-r--r-- | crocoite/behavior.py | 10 | ||||
-rw-r--r-- | crocoite/test_tools.py | 5 | ||||
-rw-r--r-- | crocoite/test_warc.py | 28 | ||||
-rw-r--r-- | crocoite/tools.py | 4 | ||||
-rw-r--r-- | crocoite/util.py | 8 | ||||
-rw-r--r-- | crocoite/warc.py | 64 |
6 files changed, 73 insertions, 46 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py index fd4d066..efb2ced 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -52,11 +52,12 @@ class Script: """ A JavaScript resource """ __slots__ = ('path', 'data') + datadir = 'data' def __init__ (self, path=None, encoding='utf-8'): self.path = path if path: - self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) + self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding) def __repr__ (self): return f'<Script {self.path}>' @@ -64,6 +65,11 @@ class Script: def __str__ (self): return self.data + @property + def abspath (self): + return pkg_resources.resource_filename (__name__, + os.path.join (self.datadir, self.path)) + @classmethod def fromStr (cls, data, path=None): s = Script () @@ -140,7 +146,7 @@ class JsOnload (Behavior): constructor = result['objectId'] if self.options: - yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}/options') + yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options') result = await tab.Runtime.callFunctionOn ( functionDeclaration='function(options){return new this(options);}', objectId=constructor, diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py index 106aa59..e8edb98 100644 --- a/crocoite/test_tools.py +++ b/crocoite/test_tools.py @@ -28,7 +28,6 @@ from warcio.statusandheaders import StatusAndHeaders from pkg_resources import parse_version from .tools import mergeWarc, Errata, FixableErrata -from .util import packageUrl @pytest.fixture def writer(): @@ -49,7 +48,9 @@ def recordsEqual(golden, underTest): def makeGolden(writer, records): # additional warcinfo is written. Content does not matter. 
- record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + record = writer.create_warc_record ( + '', + 'warcinfo', payload=b'', warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) records.insert (0, record) diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py index 478892a..17f3840 100644 --- a/crocoite/test_warc.py +++ b/crocoite/test_warc.py @@ -57,9 +57,10 @@ def test_log (): fd.seek (0) for it in ArchiveIterator (fd): headers = it.rec_headers - assert headers['warc-type'] == 'resource' - assert headers['warc-target-uri'].endswith (':log') - assert headers['content-type'] == f'text/plain; encoding={handler.logEncoding}' + assert headers['warc-type'] == 'metadata' + assert 'warc-target-uri' not in headers + assert headers['x-crocoite-type'] == 'log' + assert headers['content-type'] == f'application/json; encoding={handler.logEncoding}' while True: l = it.raw_stream.readline () @@ -108,7 +109,8 @@ async def test_push (golden): headers = rec.rec_headers assert headers['warc-type'] == 'warcinfo' - assert headers['warc-target-uri'].endswith (':warcinfo') + assert 'warc-target-uri' not in headers + assert 'x-crocoite-type' not in headers data = json.load (rec.raw_stream) assert data == g.payload @@ -119,11 +121,14 @@ async def test_push (golden): rec = next (it) headers = rec.rec_headers - assert headers['warc-type'] == 'metadata' + assert headers['warc-type'] == 'resource' + assert headers['content-type'] == 'application/javascript; charset=utf-8' + assert headers['x-crocoite-type'] == 'script' checkWarcinfoId (headers) - path = g.path or '-' - goldenpath = f':script/{urllib.parse.quote (path)}' - assert headers['warc-target-uri'].endswith (goldenpath), (g.path, path, goldenpath) + if g.path: + assert URL (headers['warc-target-uri']) == URL ('file://' + g.abspath) + else: + assert 'warc-target-uri' not in headers data = rec.raw_stream.read ().decode ('utf-8') assert data == g.data @@ -133,6 +138,7 @@ async def test_push 
(golden): headers = rec.rec_headers assert headers['warc-type'] == 'conversion' + assert headers['x-crocoite-type'] == 'screenshot' checkWarcinfoId (headers) assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url) assert headers['warc-refers-to'] is None @@ -144,10 +150,10 @@ async def test_push (golden): headers = rec.rec_headers assert headers['warc-type'] == 'conversion' + assert headers['x-crocoite-type'] == 'dom-snapshot' checkWarcinfoId (headers) assert URL (headers['warc-target-uri']) == g.url assert headers['warc-refers-to'] is None - assert headers['X-DOM-Snapshot'] == 'True' assert rec.raw_stream.read () == g.document elif isinstance (g, RequestResponsePair): @@ -156,6 +162,7 @@ async def test_push (golden): # request headers = rec.rec_headers assert headers['warc-type'] == 'request' + assert 'x-crocoite-type' not in headers checkWarcinfoId (headers) assert URL (headers['warc-target-uri']) == g.url assert headers['x-chrome-request-id'] == g.id @@ -164,7 +171,6 @@ async def test_push (golden): if g.request.hasPostData: if g.request.body is not None: assert rec.raw_stream.read () == g.request.body - assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body) else: # body fetch failed assert headers['warc-truncated'] == 'unspecified' @@ -181,6 +187,7 @@ async def test_push (golden): checkWarcinfoId (headers) assert URL (headers['warc-target-uri']) == g.url assert headers['x-chrome-request-id'] == g.id + assert 'x-crocoite-type' not in headers # these are checked separately filteredHeaders = CIMultiDict (httpheaders.headers) @@ -197,7 +204,6 @@ async def test_push (golden): if g.response.body is not None: assert rec.raw_stream.read () == g.response.body - assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body)) assert httpheaders['content-length'] == str (len (g.response.body)) # body is never 
truncated if it exists assert headers['warc-truncated'] is None diff --git a/crocoite/tools.py b/crocoite/tools.py index bc14f84..42ced35 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -31,7 +31,7 @@ from yarl import URL from pkg_resources import parse_version, parse_requirements -from .util import packageUrl, getSoftwareInfo, StrJsonEncoder +from .util import getSoftwareInfo, StrJsonEncoder def mergeWarc (files, output): # stats @@ -57,7 +57,7 @@ def mergeWarc (files, output): 'parameters': {'inputs': files}, } payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) - record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + record = writer.create_warc_record ('', 'warcinfo', payload=payload, warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) writer.write_record (record) diff --git a/crocoite/util.py b/crocoite/util.py index ded5e99..da377a3 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -26,6 +26,8 @@ import random, sys, platform, os, json, urllib from datetime import datetime import hashlib, pkg_resources +from yarl import URL + class StrJsonEncoder (json.JSONEncoder): """ JSON encoder that turns unknown classes into a string and thus never fails """ @@ -39,12 +41,6 @@ class StrJsonEncoder (json.JSONEncoder): except TypeError: return str (obj) -def packageUrl (path): - """ - Create URL for package data stored into WARC - """ - return 'urn:' + __package__ + ':' + urllib.parse.quote (path) - async def getFormattedViewportMetrics (tab): layoutMetrics = await tab.Page.getLayoutMetrics () # XXX: I’m not entirely sure which one we should use here diff --git a/crocoite/warc.py b/crocoite/warc.py index 3a084a1..e1cdf35 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -30,11 +30,17 @@ from http.server import BaseHTTPRequestHandler from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter from warcio.statusandheaders import StatusAndHeaders +from yarl import URL -from 
.util import packageUrl, StrJsonEncoder +from .util import StrJsonEncoder from .controller import EventHandler, ControllerStart from .behavior import Script, DomSnapshotEvent, ScreenshotEvent -from .browser import RequestResponsePair, UnicodeBody, Base64Body +from .browser import RequestResponsePair, UnicodeBody + +# the official mimetype for json, according to https://tools.ietf.org/html/rfc8259 +jsonMime = 'application/json' +# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2 +jsMime = 'application/javascript' class WarcHandler (EventHandler): __slots__ = ('logger', 'writer', 'documentRecords', 'log', @@ -67,6 +73,7 @@ class WarcHandler (EventHandler): Adds default WARC headers. """ + assert url is None or isinstance (url, URL) d = {} if self.warcinfoRecordId: @@ -74,8 +81,11 @@ class WarcHandler (EventHandler): d.update (warc_headers_dict) warc_headers_dict = d - record = self.writer.create_warc_record (str (url), kind, payload=payload, - warc_headers_dict=warc_headers_dict, http_headers=http_headers) + record = self.writer.create_warc_record (str (url) if url else '', + kind, + payload=payload, + warc_headers_dict=warc_headers_dict, + http_headers=http_headers) self.writer.write_record (record) return record @@ -90,7 +100,7 @@ class WarcHandler (EventHandler): httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1', req.headers, protocol='HTTP/1.1', is_http_request=True) warcHeaders = { - 'X-Chrome-Initiator': json.dumps (req.initiator), + # required to correlate request with log entries 'X-Chrome-Request-ID': item.id, 'WARC-Date': datetime_to_iso_date (req.timestamp), } @@ -102,7 +112,6 @@ class WarcHandler (EventHandler): uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') warcHeaders['WARC-Truncated'] = 'unspecified' else: - warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) body = BytesIO (body) record = self.writeRecord (url, 'request', payload=body, http_headers=httpHeaders, @@ -117,14 +126,13 @@ class 
WarcHandler (EventHandler): resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, + # required to correlate request with log entries 'X-Chrome-Request-ID': item.id, 'WARC-Date': datetime_to_iso_date (resp.timestamp), } # conditional WARC headers if item.remoteIpAddress: warcHeaders['WARC-IP-Address'] = item.remoteIpAddress - if item.protocol: - warcHeaders['X-Chrome-Protocol'] = item.protocol # HTTP headers statusText = resp.statusText or \ @@ -153,7 +161,6 @@ class WarcHandler (EventHandler): warcHeaders['WARC-Truncated'] = 'unspecified' else: httpHeaders.replace_header ('Content-Length', str (len (body))) - warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) body = BytesIO (body) record = self.writeRecord (item.url, 'response', @@ -166,11 +173,15 @@ class WarcHandler (EventHandler): def _writeScript (self, item): writer = self.writer encoding = 'utf-8' - path = item.path or '-' - self.writeRecord (packageUrl (f'script/{path}'), 'metadata', + # XXX: yes, we’re leaking information about the user here, but this is + # the one and only source URL of the scripts. 
+ uri = URL(f'file://{item.abspath}') if item.path else None + self.writeRecord (uri, 'resource', payload=BytesIO (str (item).encode (encoding)), - warc_headers_dict={'Content-Type': - f'application/javascript; charset={encoding}'}) + warc_headers_dict={ + 'Content-Type': f'{jsMime}; charset={encoding}', + 'X-Crocoite-Type': 'script', + }) def _writeItem (self, item): assert item.request @@ -190,7 +201,8 @@ class WarcHandler (EventHandler): def _writeDomSnapshot (self, item): writer = self.writer - warcHeaders = {'X-DOM-Snapshot': str (True), + warcHeaders = { + 'X-Crocoite-Type': 'dom-snapshot', 'X-Chrome-Viewport': item.viewport, 'Content-Type': 'text/html; charset=utf-8', } @@ -203,18 +215,21 @@ class WarcHandler (EventHandler): def _writeScreenshot (self, item): writer = self.writer - warcHeaders = {'Content-Type': 'image/png', - 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} + warcHeaders = { + 'Content-Type': 'image/png', + 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff), + 'X-Crocoite-Type': 'screenshot', + } self._addRefersTo (warcHeaders, item.url) self.writeRecord (item.url, 'conversion', payload=BytesIO (item.data), warc_headers_dict=warcHeaders) - def _writeControllerStart (self, item): - payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode ('utf-8')) + def _writeControllerStart (self, item, encoding='utf-8'): + payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode (encoding)) writer = self.writer - warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo', - warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}, + warcinfo = self.writeRecord (None, 'warcinfo', + warc_headers_dict={'Content-Type': f'{jsonMime}; encoding={encoding}'}, payload=payload) self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID'] @@ -222,9 +237,12 @@ class WarcHandler (EventHandler): if self.log.tell () > 0: writer = self.writer self.log.seek (0) - # XXX: we should use the type continuation 
here - self.writeRecord (packageUrl ('log'), 'resource', payload=self.log, - warc_headers_dict={'Content-Type': f'text/plain; encoding={self.logEncoding}'}) + warcHeaders = { + 'Content-Type': f'application/json; encoding={self.logEncoding}', + 'X-Crocoite-Type': 'log', + } + self.writeRecord (None, 'metadata', payload=self.log, + warc_headers_dict=warcHeaders) self.log = BytesIO () def _writeLog (self, item): |