From 3deded13df1339ef59a760c188804adffd9ed902 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 28 Jul 2018 20:25:49 +0200 Subject: Reintroduce WARC logging Commit 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 removed logging to WARC files. Add it again, but with a different implementation. Credits to structlog for inspiration. --- crocoite/warc.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'crocoite/warc.py') diff --git a/crocoite/warc.py b/crocoite/warc.py index 47dd9dc..a4a70ac 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -22,8 +22,7 @@ Classes writing data to WARC files """ -import logging -import json +import json, threading from io import BytesIO from warcio.statusandheaders import StatusAndHeaders from urllib.parse import urlsplit @@ -38,18 +37,31 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): - __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords') + __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log', + 'maxLogSize', 'logEncoding') def __init__ (self, fd, - logger=logging.getLogger(__name__), + logger, maxBodySize=defaultSettings.maxBodySize): self.logger = logger self.writer = WARCWriter (fd, gzip=True) self.maxBodySize = maxBodySize + + self.logEncoding = 'utf-8' + self.log = BytesIO () + # max log buffer size (bytes) + self.maxLogSize = 500*1024 + # maps document urls to WARC record ids, required for DomSnapshotEvent # and ScreenshotEvent self.documentRecords = {} + def __enter__ (self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._flushLogEntries () + def _writeRequest (self, item): writer = self.writer @@ -204,6 +216,24 @@ class WarcHandler (EventHandler): warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload) writer.write_record (warcinfo) + def _flushLogEntries (self): + writer = self.writer + self.log.seek 
(0) + # XXX: we should use the type continuation here + record = writer.create_warc_record (packageUrl ('log'), 'resource', payload=self.log, + warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)}) + writer.write_record (record) + self.log = BytesIO () + + def _writeLog (self, item): + """ Handle log entries, called by .logger.WarcHandlerConsumer only """ + self.log.write (item.encode (self.logEncoding)) + self.log.write (b'\n') + # instead of locking, check we’re running in the main thread + if self.log.tell () > self.maxLogSize and \ + threading.current_thread () is threading.main_thread (): + self._flushLogEntries () + route = {Script: _writeScript, Item: _writeItem, DomSnapshotEvent: _writeDomSnapshot, -- cgit v1.2.3