summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2017-11-29 13:25:30 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2017-11-29 13:25:30 +0100
commit: 6f628ca24ac2b243dd4a611ff1ecff2d35aaa019 (patch)
tree: 62d2efbfdb996512755e9c9e8e2368ac691ced66
parent: 6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b (diff)
download: crocoite-6f628ca24ac2b243dd4a611ff1ecff2d35aaa019.zip
crocoite-6f628ca24ac2b243dd4a611ff1ecff2d35aaa019.tar.gz
crocoite-6f628ca24ac2b243dd4a611ff1ecff2d35aaa019.tar.bz2
Use Chrome’s timestamps as WARC-Date
-rw-r--r-- crocoite/browser.py 16
-rw-r--r-- crocoite/warc.py 6
2 files changed, 14 insertions, 8 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 756fd64..67c75c3 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -31,8 +31,8 @@ class Item:
"""
def __init__ (self):
- self._chromeRequest = None
- self._chromeResponse = None
+ self.chromeRequest = None
+ self.chromeResponse = None
self.encodedDataLength = 0
def __repr__ (self):
@@ -40,25 +40,25 @@ class Item:
@property
def request (self):
- return self._chromeRequest['request']
+ return self.chromeRequest['request']
@property
def response (self):
- return self._chromeResponse['response']
+ return self.chromeResponse['response']
@property
def initiator (self):
- return self._chromeRequest['initiator']
+ return self.chromeRequest['initiator']
@property
def id (self):
- return self._chromeRequest['requestId']
+ return self.chromeRequest['requestId']
def setRequest (self, req):
- self._chromeRequest = req
+ self.chromeRequest = req
def setResponse (self, resp):
- self._chromeResponse = resp
+ self.chromeResponse = resp
class SiteLoader:
"""
diff --git a/crocoite/warc.py b/crocoite/warc.py
index e06b1c7..252e8cb 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -33,6 +33,8 @@ from warcio.statusandheaders import StatusAndHeaders
from urllib.parse import urlsplit
from logging.handlers import BufferingHandler
import pychrome
+from datetime import datetime
+from warcio.timeutils import datetime_to_iso_date
class WARCLogHandler (BufferingHandler):
"""
@@ -112,6 +114,7 @@ class WarcLoader (SiteLoader):
initiator = item.initiator
warcHeaders = {
'X-Chrome-Initiator': json.dumps (initiator),
+ 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
}
record = writer.create_warc_record(req['url'], 'request',
payload=postData, http_headers=httpHeaders,
@@ -126,6 +129,9 @@ class WarcLoader (SiteLoader):
'X-Chrome-Protocol': resp.get ('protocol', ''),
'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
+ 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
+ item.chromeRequest['wallTime']+
+ (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
}
rawBody = b''