summary | refs | log | tree | commit | diff
path: root/crocoite/warc.py
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2018-12-18 12:34:25 +0100
committer: Lars-Dominik Braun <lars@6xq.net>, 2018-12-21 20:28:51 +0100
commit: 5e444dd6511d97308a84ae9c86ebf14547d01f01 (patch)
tree: 0852c081163ff3456038fb08ad4e47d0d47a6167 /crocoite/warc.py
parent: e19635a75cc1ab206be12ecf2b1c9a909baa9c21 (diff)
download: crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.gz
crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.bz2
crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.zip
Parse URLs by default
Use the yarl library (already pulled in by aiohttp). No URL being processed should be a plain string.
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- crocoite/warc.py 18
1 files changed, 7 insertions, 11 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index ebc460d..21a99aa 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,14 +24,13 @@ Classes writing data to WARC files
import json, threading
from io import BytesIO
-from urllib.parse import urlsplit
from datetime import datetime
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
-from .util import packageUrl
+from .util import packageUrl, StrJsonEncoder
from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
@@ -75,7 +74,7 @@ class WarcHandler (EventHandler):
d.update (warc_headers_dict)
warc_headers_dict = d
- record = self.writer.create_warc_record (url, kind, payload=payload,
+ record = self.writer.create_warc_record (str (url), kind, payload=payload,
warc_headers_dict=warc_headers_dict, http_headers=http_headers)
self.writer.write_record (record)
@@ -85,12 +84,9 @@ class WarcHandler (EventHandler):
logger = self.logger.bind (reqId=item.id)
req = item.request
- resp = item.response
- url = urlsplit (resp['url'])
+ url = item.url
- path = url.path
- if url.query:
- path += '?' + url.query
+ path = url.relative().with_fragment(None)
httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
initiator = item.initiator
@@ -111,7 +107,7 @@ class WarcHandler (EventHandler):
if payload:
payload = BytesIO (payload)
warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
- record = self.writeRecord (req['url'], 'request',
+ record = self.writeRecord (url, 'request',
payload=payload, http_headers=httpHeaders,
warc_headers_dict=warcHeaders)
return record.rec_headers['WARC-Record-ID']
@@ -172,7 +168,7 @@ class WarcHandler (EventHandler):
else:
bodyIo = BytesIO ()
- record = self.writeRecord (resp['url'], 'response',
+ record = self.writeRecord (item.url, 'response',
warc_headers_dict=warcHeaders, payload=bodyIo,
http_headers=httpHeaders)
@@ -225,7 +221,7 @@ class WarcHandler (EventHandler):
payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
def _writeControllerStart (self, item):
- payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+ payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode ('utf-8'))
writer = self.writer
warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo',