From 6f628ca24ac2b243dd4a611ff1ecff2d35aaa019 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Wed, 29 Nov 2017 13:25:30 +0100
Subject: Use Chrome’s timestamps as WARC-Date
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crocoite/warc.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'crocoite/warc.py')

diff --git a/crocoite/warc.py b/crocoite/warc.py
index e06b1c7..252e8cb 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -33,6 +33,8 @@ from warcio.statusandheaders import StatusAndHeaders
 from urllib.parse import urlsplit
 from logging.handlers import BufferingHandler
 import pychrome
+from datetime import datetime
+from warcio.timeutils import datetime_to_iso_date
 
 class WARCLogHandler (BufferingHandler):
     """
@@ -112,6 +114,7 @@ class WarcLoader (SiteLoader):
         initiator = item.initiator
         warcHeaders = {
                 'X-Chrome-Initiator': json.dumps (initiator),
+                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
                 }
         record = writer.create_warc_record(req['url'], 'request',
                 payload=postData, http_headers=httpHeaders,
@@ -126,6 +129,9 @@ class WarcLoader (SiteLoader):
                 'X-Chrome-Protocol': resp.get ('protocol', ''),
                 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
                 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
+                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
+                        item.chromeRequest['wallTime']+
+                        (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
                 }
 
         rawBody = b''
-- 
cgit v1.2.3