summaryrefslogtreecommitdiff
path: root/crocoite/warc.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r--crocoite/warc.py96
1 files changed, 42 insertions, 54 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index dbd9ebc..cb1f2f7 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -25,6 +25,7 @@ Classes writing data to WARC files
import json, threading
from io import BytesIO
from datetime import datetime
+from http.server import BaseHTTPRequestHandler
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
@@ -33,7 +34,7 @@ from warcio.statusandheaders import StatusAndHeaders
from .util import packageUrl, StrJsonEncoder
from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
-from .browser import Item
+from .browser import RequestResponsePair, UnicodeBody, Base64Body
class WarcHandler (EventHandler):
__slots__ = ('logger', 'writer', 'documentRecords', 'log',
@@ -86,66 +87,51 @@ class WarcHandler (EventHandler):
url = item.url
path = url.relative().with_fragment(None)
- httpHeaders = StatusAndHeaders(f'{req["method"]} {path} HTTP/1.1',
- item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
- initiator = item.initiator
+ httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
+ req.headers, protocol='HTTP/1.1', is_http_request=True)
warcHeaders = {
- 'X-Chrome-Initiator': json.dumps (initiator),
+ 'X-Chrome-Initiator': json.dumps (req.initiator),
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
+ 'WARC-Date': datetime_to_iso_date (req.timestamp),
}
- if item.requestBody is not None:
- payload, payloadBase64Encoded = item.requestBody
- else:
+ body = item.request.body
+ if item.request.hasPostData and body is None:
# oops, don’t know what went wrong here
- logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
+ logger.error ('requestBody missing',
+ uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
warcHeaders['WARC-Truncated'] = 'unspecified'
- payload = None
-
- if payload is not None:
- payload = BytesIO (payload)
- warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
+ else:
+ warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
+ body = BytesIO (body)
record = self.writeRecord (url, 'request',
- payload=payload, http_headers=httpHeaders,
+ payload=body, http_headers=httpHeaders,
warc_headers_dict=warcHeaders)
return record.rec_headers['WARC-Record-ID']
def _writeResponse (self, item, concurrentTo):
# fetch the body
reqId = item.id
- rawBody = None
- base64Encoded = False
- bodyTruncated = None
- if item.isRedirect or item.body is None:
- # redirects reuse the same request, thus we cannot safely retrieve
- # the body (i.e getResponseBody may return the new location’s
- # body). No body available means we failed to retrieve it.
- bodyTruncated = 'unspecified'
- else:
- rawBody, base64Encoded = item.body
# now the response
resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
- 'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
- 'X-Chrome-Protocol': resp.get ('protocol', ''),
- 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
- 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
- item.chromeRequest['wallTime']+
- (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
+ 'WARC-Date': datetime_to_iso_date (resp.timestamp),
}
- if bodyTruncated:
- warcHeaders['WARC-Truncated'] = bodyTruncated
- else:
- warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded)
-
- httpHeaders = StatusAndHeaders(f'{resp["status"]} {item.statusText}',
- item.responseHeaders,
- protocol='HTTP/1.1')
+ # conditional WARC headers
+ if item.remoteIpAddress:
+ warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
+ if item.protocol:
+ warcHeaders['X-Chrome-Protocol'] = item.protocol
+
+ # HTTP headers
+ statusText = resp.statusText or \
+ BaseHTTPRequestHandler.responses.get (
+ resp.status, ('No status text available', ))[0]
+ httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
+ resp.headers, protocol='HTTP/1.1')
# Content is saved decompressed and decoded, remove these headers
blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
@@ -155,20 +141,23 @@ class WarcHandler (EventHandler):
# chrome sends nothing but utf8 encoded text. Fortunately HTTP
# headers take precedence over the document’s <meta>, thus we can
# easily override those.
- contentType = resp.get ('mimeType')
+ contentType = resp.mimeType
if contentType:
- if not base64Encoded:
+ if isinstance (resp.body, UnicodeBody):
contentType += '; charset=utf-8'
httpHeaders.replace_header ('Content-Type', contentType)
- if rawBody is not None:
- httpHeaders.replace_header ('Content-Length', str (len (rawBody)))
- bodyIo = BytesIO (rawBody)
+ # response body
+ body = resp.body
+ if body is None:
+ warcHeaders['WARC-Truncated'] = 'unspecified'
else:
- bodyIo = BytesIO ()
+ httpHeaders.replace_header ('Content-Length', str (len (body)))
+ warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
+ body = BytesIO (body)
record = self.writeRecord (item.url, 'response',
- warc_headers_dict=warcHeaders, payload=bodyIo,
+ warc_headers_dict=warcHeaders, payload=body,
http_headers=httpHeaders)
if item.resourceType == 'Document':
@@ -184,12 +173,11 @@ class WarcHandler (EventHandler):
f'application/javascript; charset={encoding}'})
def _writeItem (self, item):
- if item.failed:
- # should have been handled by the logger already
- return
-
+ assert item.request
concurrentTo = self._writeRequest (item)
- self._writeResponse (item, concurrentTo)
+ # items that failed loading don’t have a response
+ if item.response:
+ self._writeResponse (item, concurrentTo)
def _addRefersTo (self, headers, url):
refersTo = self.documentRecords.get (url)
@@ -247,7 +235,7 @@ class WarcHandler (EventHandler):
self._flushLogEntries ()
route = {Script: _writeScript,
- Item: _writeItem,
+ RequestResponsePair: _writeItem,
DomSnapshotEvent: _writeDomSnapshot,
ScreenshotEvent: _writeScreenshot,
ControllerStart: _writeControllerStart,