summaryrefslogtreecommitdiff
path: root/crocoite/warc.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r--crocoite/warc.py197
1 files changed, 100 insertions, 97 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index ebc460d..415b487 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,24 +24,36 @@ Classes writing data to WARC files
import json, threading
from io import BytesIO
-from urllib.parse import urlsplit
from datetime import datetime
+from http.server import BaseHTTPRequestHandler
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
+from yarl import URL
-from .util import packageUrl
+from .util import StrJsonEncoder
from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
-from .browser import Item
+from .browser import RequestResponsePair, UnicodeBody
+
+# the official mimetype for json, according to https://tools.ietf.org/html/rfc8259
+jsonMime = 'application/json'
+# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2
+jsMime = 'application/javascript'
+
+def makeContentType (mime, charset=None):
+ """ Create value of Content-Type WARC header with optional charset """
+ s = [mime]
+ if charset:
+ s.extend (['; charset=', charset])
+ return ''.join (s)
class WarcHandler (EventHandler):
__slots__ = ('logger', 'writer', 'documentRecords', 'log',
'maxLogSize', 'logEncoding', 'warcinfoRecordId')
- def __init__ (self, fd,
- logger):
+ def __init__ (self, fd, logger):
self.logger = logger
self.writer = WARCWriter (fd, gzip=True)
@@ -68,6 +80,7 @@ class WarcHandler (EventHandler):
Adds default WARC headers.
"""
+ assert url is None or isinstance (url, URL)
d = {}
if self.warcinfoRecordId:
@@ -75,8 +88,11 @@ class WarcHandler (EventHandler):
d.update (warc_headers_dict)
warc_headers_dict = d
- record = self.writer.create_warc_record (url, kind, payload=payload,
- warc_headers_dict=warc_headers_dict, http_headers=http_headers)
+ record = self.writer.create_warc_record (str (url) if url else '',
+ kind,
+ payload=payload,
+ warc_headers_dict=warc_headers_dict,
+ http_headers=http_headers)
self.writer.write_record (record)
return record
@@ -85,72 +101,52 @@ class WarcHandler (EventHandler):
logger = self.logger.bind (reqId=item.id)
req = item.request
- resp = item.response
- url = urlsplit (resp['url'])
-
- path = url.path
- if url.query:
- path += '?' + url.query
- httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
- item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
- initiator = item.initiator
+ url = item.url
+
+ path = url.relative().with_fragment(None)
+ httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
+ req.headers, protocol='HTTP/1.1', is_http_request=True)
warcHeaders = {
- 'X-Chrome-Initiator': json.dumps (initiator),
+ # required to correlate request with log entries
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
+ 'WARC-Date': datetime_to_iso_date (req.timestamp),
}
- if item.requestBody is not None:
- payload, payloadBase64Encoded = item.requestBody
- else:
+ body = item.request.body
+ if item.request.hasPostData and body is None:
# oops, don’t know what went wrong here
- logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
+ logger.error ('requestBody missing',
+ uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
warcHeaders['WARC-Truncated'] = 'unspecified'
- payload = None
-
- if payload:
- payload = BytesIO (payload)
- warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
- record = self.writeRecord (req['url'], 'request',
- payload=payload, http_headers=httpHeaders,
+ else:
+ body = BytesIO (body)
+ record = self.writeRecord (url, 'request',
+ payload=body, http_headers=httpHeaders,
warc_headers_dict=warcHeaders)
return record.rec_headers['WARC-Record-ID']
def _writeResponse (self, item, concurrentTo):
# fetch the body
reqId = item.id
- rawBody = None
- base64Encoded = False
- bodyTruncated = None
- if item.isRedirect or item.body is None:
- # redirects reuse the same request, thus we cannot safely retrieve
- # the body (i.e getResponseBody may return the new location’s
- # body). No body available means we failed to retrieve it.
- bodyTruncated = 'unspecified'
- else:
- rawBody, base64Encoded = item.body
# now the response
resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
- 'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
- 'X-Chrome-Protocol': resp.get ('protocol', ''),
- 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
- 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
+ # required to correlate request with log entries
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
- item.chromeRequest['wallTime']+
- (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
+ 'WARC-Date': datetime_to_iso_date (resp.timestamp),
}
- if bodyTruncated:
- warcHeaders['WARC-Truncated'] = bodyTruncated
- else:
- warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded)
+ # conditional WARC headers
+ if item.remoteIpAddress:
+ warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
- httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
- item.statusText), item.responseHeaders,
- protocol='HTTP/1.1')
+ # HTTP headers
+ statusText = resp.statusText or \
+ BaseHTTPRequestHandler.responses.get (
+ resp.status, ('No status text available', ))[0]
+ httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
+ resp.headers, protocol='HTTP/1.1')
# Content is saved decompressed and decoded, remove these headers
blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
@@ -160,20 +156,21 @@ class WarcHandler (EventHandler):
# chrome sends nothing but utf8 encoded text. Fortunately HTTP
# headers take precedence over the document’s <meta>, thus we can
# easily override those.
- contentType = resp.get ('mimeType')
- if contentType:
- if not base64Encoded:
- contentType += '; charset=utf-8'
- httpHeaders.replace_header ('content-type', contentType)
-
- if rawBody is not None:
- httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
- bodyIo = BytesIO (rawBody)
+ if resp.mimeType:
+ charset = 'utf-8' if isinstance (resp.body, UnicodeBody) else None
+ contentType = makeContentType (resp.mimeType, charset=charset)
+ httpHeaders.replace_header ('Content-Type', contentType)
+
+ # response body
+ body = resp.body
+ if body is None:
+ warcHeaders['WARC-Truncated'] = 'unspecified'
else:
- bodyIo = BytesIO ()
+ httpHeaders.replace_header ('Content-Length', str (len (body)))
+ body = BytesIO (body)
- record = self.writeRecord (resp['url'], 'response',
- warc_headers_dict=warcHeaders, payload=bodyIo,
+ record = self.writeRecord (item.url, 'response',
+ warc_headers_dict=warcHeaders, payload=body,
http_headers=httpHeaders)
if item.resourceType == 'Document':
@@ -182,32 +179,38 @@ class WarcHandler (EventHandler):
def _writeScript (self, item):
writer = self.writer
encoding = 'utf-8'
- self.writeRecord (packageUrl ('script/{}'.format (item.path)), 'metadata',
+ # XXX: yes, we’re leaking information about the user here, but this is
+ # the one and only source URL of the scripts.
+ uri = URL(f'file://{item.abspath}') if item.path else None
+ self.writeRecord (uri, 'resource',
payload=BytesIO (str (item).encode (encoding)),
- warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)})
+ warc_headers_dict={
+ 'Content-Type': makeContentType (jsMime, encoding),
+ 'X-Crocoite-Type': 'script',
+ })
def _writeItem (self, item):
- if item.failed:
- # should have been handled by the logger already
- return
-
+ assert item.request
concurrentTo = self._writeRequest (item)
- self._writeResponse (item, concurrentTo)
+ # items that failed loading don’t have a response
+ if item.response:
+ self._writeResponse (item, concurrentTo)
def _addRefersTo (self, headers, url):
refersTo = self.documentRecords.get (url)
if refersTo:
headers['WARC-Refers-To'] = refersTo
else:
- self.logger.error ('No document record found for {}'.format (url))
+ self.logger.error (f'No document record found for {url}')
return headers
def _writeDomSnapshot (self, item):
writer = self.writer
- warcHeaders = {'X-DOM-Snapshot': str (True),
+ warcHeaders = {
+ 'X-Crocoite-Type': 'dom-snapshot',
'X-Chrome-Viewport': item.viewport,
- 'Content-Type': 'text/html; charset=utf-8',
+ 'Content-Type': makeContentType ('text/html', 'utf-8')
}
self._addRefersTo (warcHeaders, item.url)
@@ -218,53 +221,53 @@ class WarcHandler (EventHandler):
def _writeScreenshot (self, item):
writer = self.writer
- warcHeaders = {'Content-Type': 'image/png',
- 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)}
+ warcHeaders = {
+ 'Content-Type': makeContentType ('image/png'),
+ 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff),
+ 'X-Crocoite-Type': 'screenshot',
+ }
self._addRefersTo (warcHeaders, item.url)
self.writeRecord (item.url, 'conversion',
payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
- def _writeControllerStart (self, item):
- payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+ def _writeControllerStart (self, item, encoding='utf-8'):
+ payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode (encoding))
writer = self.writer
- warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo',
- warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
+ warcinfo = self.writeRecord (None, 'warcinfo',
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, encoding)},
payload=payload)
self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']
def _flushLogEntries (self):
- writer = self.writer
- self.log.seek (0)
- # XXX: we should use the type continuation here
- self.writeRecord (packageUrl ('log'), 'resource', payload=self.log,
- warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)})
- self.log = BytesIO ()
+ if self.log.tell () > 0:
+ writer = self.writer
+ self.log.seek (0)
+ warcHeaders = {
+ 'Content-Type': makeContentType (jsonMime, self.logEncoding),
+ 'X-Crocoite-Type': 'log',
+ }
+ self.writeRecord (None, 'metadata', payload=self.log,
+ warc_headers_dict=warcHeaders)
+ self.log = BytesIO ()
def _writeLog (self, item):
""" Handle log entries, called by .logger.WarcHandlerConsumer only """
self.log.write (item.encode (self.logEncoding))
self.log.write (b'\n')
- # instead of locking, check we’re running in the main thread
- if self.log.tell () > self.maxLogSize and \
- threading.current_thread () is threading.main_thread ():
+ if self.log.tell () > self.maxLogSize:
self._flushLogEntries ()
route = {Script: _writeScript,
- Item: _writeItem,
+ RequestResponsePair: _writeItem,
DomSnapshotEvent: _writeDomSnapshot,
ScreenshotEvent: _writeScreenshot,
ControllerStart: _writeControllerStart,
}
- def push (self, item):
- processed = False
+ async def push (self, item):
for k, v in self.route.items ():
if isinstance (item, k):
v (self, item)
- processed = True
break
- if not processed:
- self.logger.debug ('unknown event {}'.format (repr (item)))
-