summaryrefslogtreecommitdiff
path: root/crocoite/warc.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-12-25 18:56:55 +0100
committerLars-Dominik Braun <lars@6xq.net>2018-12-25 18:59:55 +0100
commitcc1132a5b4677d089e024bcd0e16e1e817a3581c (patch)
tree8509c3cca27c8c7a2620093f03a3fb4e3bb7e45b /crocoite/warc.py
parentf8217408eeea5531b5f4f353b238dc94de705132 (diff)
downloadcrocoite-cc1132a5b4677d089e024bcd0e16e1e817a3581c.tar.gz
crocoite-cc1132a5b4677d089e024bcd0e16e1e817a3581c.tar.bz2
crocoite-cc1132a5b4677d089e024bcd0e16e1e817a3581c.zip
warc: Add tests
Using hypothesis-based testcase generation. This is quite nice compared to manual test data generation, since it catches a lot more corner cases (if done right). This commit also fixes a few issues, including: - log records will only be written if the log is nonempty - properly quote packageUrl paths - drop old thread checking code - use placeholder url for scripts without name
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r--crocoite/warc.py29
1 files changed, 14 insertions, 15 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 04dd871..dbd9ebc 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -39,8 +39,7 @@ class WarcHandler (EventHandler):
__slots__ = ('logger', 'writer', 'documentRecords', 'log',
'maxLogSize', 'logEncoding', 'warcinfoRecordId')
- def __init__ (self, fd,
- logger):
+ def __init__ (self, fd, logger):
self.logger = logger
self.writer = WARCWriter (fd, gzip=True)
@@ -104,7 +103,7 @@ class WarcHandler (EventHandler):
warcHeaders['WARC-Truncated'] = 'unspecified'
payload = None
- if payload:
+ if payload is not None:
payload = BytesIO (payload)
warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
record = self.writeRecord (url, 'request',
@@ -160,10 +159,10 @@ class WarcHandler (EventHandler):
if contentType:
if not base64Encoded:
contentType += '; charset=utf-8'
- httpHeaders.replace_header ('content-type', contentType)
+ httpHeaders.replace_header ('Content-Type', contentType)
if rawBody is not None:
- httpHeaders.replace_header ('content-length', str (len (rawBody)))
+ httpHeaders.replace_header ('Content-Length', str (len (rawBody)))
bodyIo = BytesIO (rawBody)
else:
bodyIo = BytesIO ()
@@ -178,7 +177,8 @@ class WarcHandler (EventHandler):
def _writeScript (self, item):
writer = self.writer
encoding = 'utf-8'
- self.writeRecord (packageUrl (f'script/{item.path}'), 'metadata',
+ path = item.path or '-'
+ self.writeRecord (packageUrl (f'script/{path}'), 'metadata',
payload=BytesIO (str (item).encode (encoding)),
warc_headers_dict={'Content-Type':
f'application/javascript; charset={encoding}'})
@@ -231,20 +231,19 @@ class WarcHandler (EventHandler):
self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']
def _flushLogEntries (self):
- writer = self.writer
- self.log.seek (0)
- # XXX: we should use the type continuation here
- self.writeRecord (packageUrl ('log'), 'resource', payload=self.log,
- warc_headers_dict={'Content-Type': f'text/plain; encoding={self.logEncoding}'})
- self.log = BytesIO ()
+ if self.log.tell () > 0:
+ writer = self.writer
+ self.log.seek (0)
+ # XXX: we should use the type continuation here
+ self.writeRecord (packageUrl ('log'), 'resource', payload=self.log,
+ warc_headers_dict={'Content-Type': f'text/plain; encoding={self.logEncoding}'})
+ self.log = BytesIO ()
def _writeLog (self, item):
""" Handle log entries, called by .logger.WarcHandlerConsumer only """
self.log.write (item.encode (self.logEncoding))
self.log.write (b'\n')
- # instead of locking, check we’re running in the main thread
- if self.log.tell () > self.maxLogSize and \
- threading.current_thread () is threading.main_thread ():
+ if self.log.tell () > self.maxLogSize:
self._flushLogEntries ()
route = {Script: _writeScript,