summaryrefslogtreecommitdiff
path: root/crocoite/warc.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-07-28 20:25:49 +0200
committerLars-Dominik Braun <lars@6xq.net>2018-08-04 14:11:31 +0200
commit3deded13df1339ef59a760c188804adffd9ed902 (patch)
tree5eaf69ee38389073e7323585c6afdbbf5eeab487 /crocoite/warc.py
parent33a137f2d7c04468038d689b53a70fb534297f55 (diff)
downloadcrocoite-3deded13df1339ef59a760c188804adffd9ed902.tar.gz
crocoite-3deded13df1339ef59a760c188804adffd9ed902.tar.bz2
crocoite-3deded13df1339ef59a760c188804adffd9ed902.zip
Reintroduce WARC logging
Commit 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 removed logging to WARC files. Add it again, but with a different implementation. Credit to structlog for inspiration.
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r--crocoite/warc.py38
1 files changed, 34 insertions, 4 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 47dd9dc..a4a70ac 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -22,8 +22,7 @@
Classes writing data to WARC files
"""
-import logging
-import json
+import json, threading
from io import BytesIO
from warcio.statusandheaders import StatusAndHeaders
from urllib.parse import urlsplit
@@ -38,18 +37,31 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
class WarcHandler (EventHandler):
- __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords')
+ __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log',
+ 'maxLogSize', 'logEncoding')
def __init__ (self, fd,
- logger=logging.getLogger(__name__),
+ logger,
maxBodySize=defaultSettings.maxBodySize):
self.logger = logger
self.writer = WARCWriter (fd, gzip=True)
self.maxBodySize = maxBodySize
+
+ self.logEncoding = 'utf-8'
+ self.log = BytesIO ()
+ # max log buffer size (bytes)
+ self.maxLogSize = 500*1024
+
# maps document urls to WARC record ids, required for DomSnapshotEvent
# and ScreenshotEvent
self.documentRecords = {}
+ def __enter__ (self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self._flushLogEntries ()
+
def _writeRequest (self, item):
writer = self.writer
@@ -204,6 +216,24 @@ class WarcHandler (EventHandler):
warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload)
writer.write_record (warcinfo)
+ def _flushLogEntries (self):
+ writer = self.writer
+ self.log.seek (0)
+ # XXX: we should use the type continuation here
+ record = writer.create_warc_record (packageUrl ('log'), 'resource', payload=self.log,
+ warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)})
+ writer.write_record (record)
+ self.log = BytesIO ()
+
+ def _writeLog (self, item):
+ """ Handle log entries, called by .logger.WarcHandlerConsumer only """
+ self.log.write (item.encode (self.logEncoding))
+ self.log.write (b'\n')
+ # instead of locking, check we’re running in the main thread
+ if self.log.tell () > self.maxLogSize and \
+ threading.current_thread () is threading.main_thread ():
+ self._flushLogEntries ()
+
route = {Script: _writeScript,
Item: _writeItem,
DomSnapshotEvent: _writeDomSnapshot,