diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-12-18 12:34:25 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-12-21 20:28:51 +0100 |
commit | 5e444dd6511d97308a84ae9c86ebf14547d01f01 (patch) | |
tree | 0852c081163ff3456038fb08ad4e47d0d47a6167 /crocoite/warc.py | |
parent | e19635a75cc1ab206be12ecf2b1c9a909baa9c21 (diff) | |
download | crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.gz crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.bz2 crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.zip |
Parse URLs by default
Use library yarl (already pulled in by aiohttp). No URL processed should
be a string.
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- | crocoite/warc.py | 18 |
1 files changed, 7 insertions, 11 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py index ebc460d..21a99aa 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -24,14 +24,13 @@ Classes writing data to WARC files import json, threading from io import BytesIO -from urllib.parse import urlsplit from datetime import datetime from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter from warcio.statusandheaders import StatusAndHeaders -from .util import packageUrl +from .util import packageUrl, StrJsonEncoder from .controller import EventHandler, ControllerStart from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item @@ -75,7 +74,7 @@ class WarcHandler (EventHandler): d.update (warc_headers_dict) warc_headers_dict = d - record = self.writer.create_warc_record (url, kind, payload=payload, + record = self.writer.create_warc_record (str (url), kind, payload=payload, warc_headers_dict=warc_headers_dict, http_headers=http_headers) self.writer.write_record (record) @@ -85,12 +84,9 @@ class WarcHandler (EventHandler): logger = self.logger.bind (reqId=item.id) req = item.request - resp = item.response - url = urlsplit (resp['url']) + url = item.url - path = url.path - if url.query: - path += '?' + url.query + path = url.relative().with_fragment(None) httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path), item.requestHeaders, protocol='HTTP/1.1', is_http_request=True) initiator = item.initiator @@ -111,7 +107,7 @@ class WarcHandler (EventHandler): if payload: payload = BytesIO (payload) warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) - record = self.writeRecord (req['url'], 'request', + record = self.writeRecord (url, 'request', payload=payload, http_headers=httpHeaders, warc_headers_dict=warcHeaders) return record.rec_headers['WARC-Record-ID'] @@ -172,7 +168,7 @@ class WarcHandler (EventHandler): else: bodyIo = BytesIO () - record = self.writeRecord (resp['url'], 'response', + record = self.writeRecord (item.url, 'response', warc_headers_dict=warcHeaders, payload=bodyIo, http_headers=httpHeaders) @@ -225,7 +221,7 @@ class WarcHandler (EventHandler): payload=BytesIO (item.data), warc_headers_dict=warcHeaders) def _writeControllerStart (self, item): - payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8')) + payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode ('utf-8')) writer = self.writer warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo', |