diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2017-11-29 13:25:30 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2017-11-29 13:25:30 +0100 |
commit | 6f628ca24ac2b243dd4a611ff1ecff2d35aaa019 (patch) | |
tree | 62d2efbfdb996512755e9c9e8e2368ac691ced66 | |
parent | 6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b (diff) | |
download | crocoite-6f628ca24ac2b243dd4a611ff1ecff2d35aaa019.tar.gz crocoite-6f628ca24ac2b243dd4a611ff1ecff2d35aaa019.tar.bz2 crocoite-6f628ca24ac2b243dd4a611ff1ecff2d35aaa019.zip |
Use Chrome’s timestamps as WARC-Date
-rw-r--r-- | crocoite/browser.py | 16 | ||||
-rw-r--r-- | crocoite/warc.py | 6 |
2 files changed, 14 insertions, 8 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 756fd64..67c75c3 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -31,8 +31,8 @@ class Item:
     """
 
     def __init__ (self):
-        self._chromeRequest = None
-        self._chromeResponse = None
+        self.chromeRequest = None
+        self.chromeResponse = None
         self.encodedDataLength = 0
 
     def __repr__ (self):
@@ -40,25 +40,25 @@ class Item:
 
     @property
     def request (self):
-        return self._chromeRequest['request']
+        return self.chromeRequest['request']
 
     @property
     def response (self):
-        return self._chromeResponse['response']
+        return self.chromeResponse['response']
 
     @property
     def initiator (self):
-        return self._chromeRequest['initiator']
+        return self.chromeRequest['initiator']
 
     @property
     def id (self):
-        return self._chromeRequest['requestId']
+        return self.chromeRequest['requestId']
 
     def setRequest (self, req):
-        self._chromeRequest = req
+        self.chromeRequest = req
 
     def setResponse (self, resp):
-        self._chromeResponse = resp
+        self.chromeResponse = resp
 
 class SiteLoader:
     """
diff --git a/crocoite/warc.py b/crocoite/warc.py
index e06b1c7..252e8cb 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -33,6 +33,8 @@ from warcio.statusandheaders import StatusAndHeaders
 from urllib.parse import urlsplit
 from logging.handlers import BufferingHandler
 import pychrome
+from datetime import datetime
+from warcio.timeutils import datetime_to_iso_date
 
 class WARCLogHandler (BufferingHandler):
     """
@@ -112,6 +114,7 @@ class WarcLoader (SiteLoader):
         initiator = item.initiator
         warcHeaders = {
                 'X-Chrome-Initiator': json.dumps (initiator),
+                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
                 }
         record = writer.create_warc_record(req['url'], 'request',
                 payload=postData, http_headers=httpHeaders,
@@ -126,6 +129,9 @@ class WarcLoader (SiteLoader):
                 'X-Chrome-Protocol': resp.get ('protocol', ''),
                 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
                 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
+                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
+                        item.chromeRequest['wallTime']+
+                        (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
                 }
 
         rawBody = b''