diff options
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- | crocoite/warc.py | 197 |
1 files changed, 100 insertions, 97 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py index ebc460d..415b487 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -24,24 +24,36 @@ Classes writing data to WARC files import json, threading from io import BytesIO -from urllib.parse import urlsplit from datetime import datetime +from http.server import BaseHTTPRequestHandler from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter from warcio.statusandheaders import StatusAndHeaders +from yarl import URL -from .util import packageUrl +from .util import StrJsonEncoder from .controller import EventHandler, ControllerStart from .behavior import Script, DomSnapshotEvent, ScreenshotEvent -from .browser import Item +from .browser import RequestResponsePair, UnicodeBody + +# the official mimetype for json, according to https://tools.ietf.org/html/rfc8259 +jsonMime = 'application/json' +# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2 +jsMime = 'application/javascript' + +def makeContentType (mime, charset=None): + """ Create value of Content-Type WARC header with optional charset """ + s = [mime] + if charset: + s.extend (['; charset=', charset]) + return ''.join (s) class WarcHandler (EventHandler): __slots__ = ('logger', 'writer', 'documentRecords', 'log', 'maxLogSize', 'logEncoding', 'warcinfoRecordId') - def __init__ (self, fd, - logger): + def __init__ (self, fd, logger): self.logger = logger self.writer = WARCWriter (fd, gzip=True) @@ -68,6 +80,7 @@ class WarcHandler (EventHandler): Adds default WARC headers. """ + assert url is None or isinstance (url, URL) d = {} if self.warcinfoRecordId: @@ -75,8 +88,11 @@ class WarcHandler (EventHandler): d.update (warc_headers_dict) warc_headers_dict = d - record = self.writer.create_warc_record (url, kind, payload=payload, - warc_headers_dict=warc_headers_dict, http_headers=http_headers) + record = self.writer.create_warc_record (str (url) if url else '', + kind, + payload=payload, + warc_headers_dict=warc_headers_dict, + http_headers=http_headers) self.writer.write_record (record) return record @@ -85,72 +101,52 @@ class WarcHandler (EventHandler): logger = self.logger.bind (reqId=item.id) req = item.request - resp = item.response - url = urlsplit (resp['url']) - - path = url.path - if url.query: - path += '?' + url.query - httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path), - item.requestHeaders, protocol='HTTP/1.1', is_http_request=True) - initiator = item.initiator + url = item.url + + path = url.relative().with_fragment(None) + httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1', + req.headers, protocol='HTTP/1.1', is_http_request=True) warcHeaders = { - 'X-Chrome-Initiator': json.dumps (initiator), + # required to correlate request with log entries 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])), + 'WARC-Date': datetime_to_iso_date (req.timestamp), } - if item.requestBody is not None: - payload, payloadBase64Encoded = item.requestBody - else: + body = item.request.body + if item.request.hasPostData and body is None: # oops, don’t know what went wrong here - logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') + logger.error ('requestBody missing', + uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') warcHeaders['WARC-Truncated'] = 'unspecified' - payload = None - - if payload: - payload = BytesIO (payload) - warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) - record = self.writeRecord (req['url'], 'request', - payload=payload, http_headers=httpHeaders, + else: + body = BytesIO (body) + record = self.writeRecord (url, 'request', + payload=body, http_headers=httpHeaders, warc_headers_dict=warcHeaders) return record.rec_headers['WARC-Record-ID'] def _writeResponse (self, item, concurrentTo): # fetch the body reqId = item.id - rawBody = None - base64Encoded = False - bodyTruncated = None - if item.isRedirect or item.body is None: - # redirects reuse the same request, thus we cannot safely retrieve - # the body (i.e getResponseBody may return the new location’s - # body). No body available means we failed to retrieve it. - bodyTruncated = 'unspecified' - else: - rawBody, base64Encoded = item.body # now the response resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, - 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), - 'X-Chrome-Protocol': resp.get ('protocol', ''), - 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), - 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), + # required to correlate request with log entries 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( - item.chromeRequest['wallTime']+ - (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), + 'WARC-Date': datetime_to_iso_date (resp.timestamp), } - if bodyTruncated: - warcHeaders['WARC-Truncated'] = bodyTruncated - else: - warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) + # conditional WARC headers + if item.remoteIpAddress: + warcHeaders['WARC-IP-Address'] = item.remoteIpAddress - httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], - item.statusText), item.responseHeaders, - protocol='HTTP/1.1') + # HTTP headers + statusText = resp.statusText or \ + BaseHTTPRequestHandler.responses.get ( + resp.status, ('No status text available', ))[0] + httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}', + resp.headers, protocol='HTTP/1.1') # Content is saved decompressed and decoded, remove these headers blacklistedHeaders = {'transfer-encoding', 'content-encoding'} @@ -160,20 +156,21 @@ class WarcHandler (EventHandler): # chrome sends nothing but utf8 encoded text. Fortunately HTTP # headers take precedence over the document’s <meta>, thus we can # easily override those. - contentType = resp.get ('mimeType') - if contentType: - if not base64Encoded: - contentType += '; charset=utf-8' - httpHeaders.replace_header ('content-type', contentType) - - if rawBody is not None: - httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) - bodyIo = BytesIO (rawBody) + if resp.mimeType: + charset = 'utf-8' if isinstance (resp.body, UnicodeBody) else None + contentType = makeContentType (resp.mimeType, charset=charset) + httpHeaders.replace_header ('Content-Type', contentType) + + # response body + body = resp.body + if body is None: + warcHeaders['WARC-Truncated'] = 'unspecified' else: - bodyIo = BytesIO () + httpHeaders.replace_header ('Content-Length', str (len (body))) + body = BytesIO (body) - record = self.writeRecord (resp['url'], 'response', - warc_headers_dict=warcHeaders, payload=bodyIo, + record = self.writeRecord (item.url, 'response', + warc_headers_dict=warcHeaders, payload=body, http_headers=httpHeaders) if item.resourceType == 'Document': @@ -182,32 +179,38 @@ class WarcHandler (EventHandler): def _writeScript (self, item): writer = self.writer encoding = 'utf-8' - self.writeRecord (packageUrl ('script/{}'.format (item.path)), 'metadata', + # XXX: yes, we’re leaking information about the user here, but this is + # the one and only source URL of the scripts. + uri = URL(f'file://{item.abspath}') if item.path else None + self.writeRecord (uri, 'resource', payload=BytesIO (str (item).encode (encoding)), - warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)}) + warc_headers_dict={ + 'Content-Type': makeContentType (jsMime, encoding), + 'X-Crocoite-Type': 'script', + }) def _writeItem (self, item): - if item.failed: - # should have been handled by the logger already - return - + assert item.request concurrentTo = self._writeRequest (item) - self._writeResponse (item, concurrentTo) + # items that failed loading don’t have a response + if item.response: + self._writeResponse (item, concurrentTo) def _addRefersTo (self, headers, url): refersTo = self.documentRecords.get (url) if refersTo: headers['WARC-Refers-To'] = refersTo else: - self.logger.error ('No document record found for {}'.format (url)) + self.logger.error (f'No document record found for {url}') return headers def _writeDomSnapshot (self, item): writer = self.writer - warcHeaders = {'X-DOM-Snapshot': str (True), + warcHeaders = { + 'X-Crocoite-Type': 'dom-snapshot', 'X-Chrome-Viewport': item.viewport, - 'Content-Type': 'text/html; charset=utf-8', + 'Content-Type': makeContentType ('text/html', 'utf-8') } self._addRefersTo (warcHeaders, item.url) @@ -218,53 +221,53 @@ class WarcHandler (EventHandler): def _writeScreenshot (self, item): writer = self.writer - warcHeaders = {'Content-Type': 'image/png', - 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} + warcHeaders = { + 'Content-Type': makeContentType ('image/png'), + 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff), + 'X-Crocoite-Type': 'screenshot', + } self._addRefersTo (warcHeaders, item.url) self.writeRecord (item.url, 'conversion', payload=BytesIO (item.data), warc_headers_dict=warcHeaders) - def _writeControllerStart (self, item): - payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8')) + def _writeControllerStart (self, item, encoding='utf-8'): + payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode (encoding)) writer = self.writer - warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo', - warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}, + warcinfo = self.writeRecord (None, 'warcinfo', + warc_headers_dict={'Content-Type': makeContentType (jsonMime, encoding)}, payload=payload) self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID'] def _flushLogEntries (self): - writer = self.writer - self.log.seek (0) - # XXX: we should use the type continuation here - self.writeRecord (packageUrl ('log'), 'resource', payload=self.log, - warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)}) - self.log = BytesIO () + if self.log.tell () > 0: + writer = self.writer + self.log.seek (0) + warcHeaders = { + 'Content-Type': makeContentType (jsonMime, self.logEncoding), + 'X-Crocoite-Type': 'log', + } + self.writeRecord (None, 'metadata', payload=self.log, + warc_headers_dict=warcHeaders) + self.log = BytesIO () def _writeLog (self, item): """ Handle log entries, called by .logger.WarcHandlerConsumer only """ self.log.write (item.encode (self.logEncoding)) self.log.write (b'\n') - # instead of locking, check we’re running in the main thread - if self.log.tell () > self.maxLogSize and \ - threading.current_thread () is threading.main_thread (): + if self.log.tell () > self.maxLogSize: self._flushLogEntries () route = {Script: _writeScript, - Item: _writeItem, + RequestResponsePair: _writeItem, DomSnapshotEvent: _writeDomSnapshot, ScreenshotEvent: _writeScreenshot, ControllerStart: _writeControllerStart, } - def push (self, item): - processed = False + async def push (self, item): for k, v in self.route.items (): if isinstance (item, k): v (self, item) - processed = True break - if not processed: - self.logger.debug ('unknown event {}'.format (repr (item))) - |