diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-11-06 16:53:16 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-11-06 16:54:58 +0100 |
commit | aeab124ac9f1e3e88f6a8ae246c90b8094e94223 (patch) | |
tree | 4b6394fcaca6b045a98f43736a6dac66a3144a7c /crocoite/warc.py | |
parent | 60fe79f2d898757f4f20aa89015e86cd63ef7871 (diff) | |
download | crocoite-aeab124ac9f1e3e88f6a8ae246c90b8094e94223.tar.gz crocoite-aeab124ac9f1e3e88f6a8ae246c90b8094e94223.tar.bz2 crocoite-aeab124ac9f1e3e88f6a8ae246c90b8094e94223.zip |
Switch single mode to asyncio
This is a direct port to asyncio without any design changes; those still
need to happen in further refinements.
Fixes issue #1.
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- | crocoite/warc.py | 32 |
1 files changed, 9 insertions, 23 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py index 9b97e75..c1cbff2 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -37,15 +37,13 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): - __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log', + __slots__ = ('logger', 'writer', 'documentRecords', 'log', 'maxLogSize', 'logEncoding', 'warcinfoRecordId') def __init__ (self, fd, - logger, - maxBodySize=defaultSettings.maxBodySize): + logger): self.logger = logger self.writer = WARCWriter (fd, gzip=True) - self.maxBodySize = maxBodySize self.logEncoding = 'utf-8' self.log = BytesIO () @@ -101,16 +99,13 @@ class WarcHandler (EventHandler): 'X-Chrome-Request-ID': item.id, 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])), } - try: - bodyTruncated = None + + if item.requestBody is not None: payload, payloadBase64Encoded = item.requestBody - except ValueError: + else: # oops, don’t know what went wrong here - bodyTruncated = 'unspecified' logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') - - if bodyTruncated: - warcHeaders['WARC-Truncated'] = bodyTruncated + warcHeaders['WARC-Truncated'] = 'unspecified' payload = None if payload: @@ -127,22 +122,13 @@ class WarcHandler (EventHandler): rawBody = None base64Encoded = False bodyTruncated = None - if item.isRedirect: + if item.isRedirect or item.body is None: # redirects reuse the same request, thus we cannot safely retrieve # the body (i.e getResponseBody may return the new location’s - # body). + # body). No body available means we failed to retrieve it. 
bodyTruncated = 'unspecified' - elif item.encodedDataLength > self.maxBodySize: - bodyTruncated = 'length' - # check body size first, since we’re loading everything into memory - self.logger.error ('body for {} too large {} vs {}'.format (reqId, - item.encodedDataLength, self.maxBodySize)) else: - try: - rawBody, base64Encoded = item.body - except ValueError: - # oops, don’t know what went wrong here - bodyTruncated = 'unspecified' + rawBody, base64Encoded = item.body # now the response resp = item.response |