diff options
-rw-r--r-- | crocoite/behavior.py | 3 | ||||
-rw-r--r-- | crocoite/test_warc.py | 261 | ||||
-rw-r--r-- | crocoite/util.py | 4 | ||||
-rw-r--r-- | crocoite/warc.py | 29 | ||||
-rw-r--r-- | setup.py | 2 |
5 files changed, 281 insertions, 18 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 7f3a3a0..dcab33a 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -197,6 +197,9 @@ class DomSnapshotEvent: __slots__ = ('url', 'document', 'viewport') def __init__ (self, url, document, viewport): + # XXX: document encoding? + assert isinstance (document, bytes) + self.url = url self.document = document self.viewport = viewport diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py new file mode 100644 index 0000000..7f2635b --- /dev/null +++ b/crocoite/test_warc.py @@ -0,0 +1,261 @@ +# Copyright (c) 2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +from tempfile import NamedTemporaryFile +import json, urllib +from operator import itemgetter + +from warcio.archiveiterator import ArchiveIterator +from yarl import URL +from hypothesis import given, reproduce_failure +import hypothesis.strategies as st +import pytest + +from .warc import WarcHandler +from .logger import Logger, WarcHandlerConsumer +from .controller import ControllerStart +from .behavior import Script, ScreenshotEvent, DomSnapshotEvent +from .browser import Item + +def test_log (): + logger = Logger () + + with NamedTemporaryFile() as fd: + with WarcHandler (fd, logger) as handler: + warclogger = WarcHandlerConsumer (handler) + logger.connect (warclogger) + golden = [] + + assert handler.log.tell () == 0 + golden.append (logger.info (foo=1, bar='baz', encoding='äöü⇔ΓΨ')) + assert handler.log.tell () != 0 + + handler.maxLogSize = 0 + golden.append (logger.info (bar=1, baz='baz')) + # should flush the log + assert handler.log.tell () == 0 + + fd.seek (0) + for it in ArchiveIterator (fd): + headers = it.rec_headers + assert headers['warc-type'] == 'resource' + assert headers['warc-target-uri'].endswith (':log') + assert headers['content-type'] == f'text/plain; encoding={handler.logEncoding}' + + while True: + l = it.raw_stream.readline () + if not l: + break + data = json.loads (l.strip ()) + assert data == golden.pop (0) + +def hostname (): + # XXX: find a better way to generate hostnames + return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253) + +def urls (): + """ Build http/https URL """ + scheme = st.one_of (st.just ('http'), st.just ('https')) + # Path must start with a slash + pathSt = st.builds (lambda x: '/' + x, st.text ()) + args = st.fixed_dictionaries ({ + 'scheme': scheme, + 'host': hostname (), + 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), + 'path': pathSt, + 'query_string': st.text (), + 'fragment': st.text (), + }) + return st.builds (lambda x: URL.build (**x), args) + +def item (): + def f (url, requestBody, body, mimeType): + i = Item () + # XXX: we really need some level of abstraction. Testing is a nightmare. + i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) + i.setResponse ({'requestId': 'myid', 'timestamp': 2, 'type': 'Document', 'response': {'url': str (url), 'requestHeaders': {'foo': 'bar', 'Set-Cookie': 'line1\nline2'}, 'headers': {'Response': 'Headers', 'Content-Length': '12345'}, 'status': 200}}) + if mimeType is not None: + i.chromeResponse['response']['mimeType'] = 'text/html' + i.requestBody = requestBody + i.body = body + return i + + def failedItem (url): + i = Item () + i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) + i.failed = True + return i + + bodySt = st.one_of (st.none (), st.tuples (st.one_of (st.none (), st.binary ()), st.booleans ())) + mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) + return st.one_of ( + st.builds (failedItem, urls ()), + st.builds (f, urls (), bodySt, bodySt, mimeTypeSt), + ) + +def jsonObject (): + """ JSON-encodable objects """ + return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ())) + +def viewport (): + return st.builds (lambda x, y: f'{x}x{y}', st.integers (), st.integers ()) + +def event (): + return st.one_of ( + st.builds (ControllerStart, jsonObject ()), + st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())), + st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()), + st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()), + item (), + ) + +@given (st.lists (event ())) +def test_push (golden): + def checkWarcinfoId (headers): + if lastWarcinfoRecordid is not None: + assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid + + lastWarcinfoRecordid = None + + # null logger + logger = Logger () + with NamedTemporaryFile() as fd: + with WarcHandler (fd, logger) as handler: + for g in golden: + handler.push (g) + + fd.seek (0) + it = iter (ArchiveIterator (fd)) + for g in golden: + if isinstance (g, ControllerStart): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'warcinfo' + assert headers['warc-target-uri'].endswith (':warcinfo') + + data = json.load (rec.raw_stream) + assert data == g.payload + + lastWarcinfoRecordid = headers['warc-record-id'] + assert lastWarcinfoRecordid + elif isinstance (g, Script): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'metadata' + checkWarcinfoId (headers) + path = g.path or '-' + goldenpath = f':script/{urllib.parse.quote (path)}' + assert headers['warc-target-uri'].endswith (goldenpath), (g.path, path, goldenpath) + + data = rec.raw_stream.read ().decode ('utf-8') + assert data == g.data + elif isinstance (g, ScreenshotEvent): + # XXX: check refers-to header + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'conversion' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url) + assert headers['warc-refers-to'] is None + assert int (headers['X-Crocoite-Screenshot-Y-Offset']) == g.yoff + + assert rec.raw_stream.read () == g.data + elif isinstance (g, DomSnapshotEvent): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'conversion' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['warc-refers-to'] is None + assert headers['X-DOM-Snapshot'] == 'True' + + assert rec.raw_stream.read () == g.document + elif isinstance (g, Item): + if g.failed: + continue + + rec = next (it) + + # request + headers = rec.rec_headers + assert headers['warc-type'] == 'request' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + + assert sorted (rec.http_headers.headers, key=itemgetter (0)) == sorted (g.requestHeaders, key=itemgetter (0)) + if g.requestBody: + if g.requestBody[0] is None: + assert not rec.raw_stream.read () + else: + assert rec.raw_stream.read () == g.requestBody[0], g.requestBody + assert str (headers['x-chrome-base64body'] or False) == str (g.requestBody[1]), (headers['x-chrome-base64body'], g.requestBody) + else: + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + + # response + rec = next (it) + headers = rec.rec_headers + httpheaders = rec.http_headers + assert headers['warc-type'] == 'response' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + + # these are checked separately + blacklistedHeaders = {'content-type', 'content-length'} + sortedHeaders = lambda l: sorted (filter (lambda x: x[0].lower() not in blacklistedHeaders, l), key=itemgetter (0)) + assert sortedHeaders (httpheaders.headers) == sortedHeaders (g.responseHeaders) + + expectedContentType = g.response.get ('mimeType') + if expectedContentType is not None: + assert httpheaders['content-type'].startswith (expectedContentType) + + if g.body: + if g.body[0] is None: + assert not rec.raw_stream.read () + #assert httpheaders['content-length'] == '0' + else: + assert rec.raw_stream.read () == g.body[0] + assert str (headers['x-chrome-base64body'] or False) == str (g.body[1]) + assert httpheaders['content-length'] == str (len (g.body[0])) + + # body is never truncated if it exists + assert headers['warc-truncated'] is None + + # unencoded strings are converted to utf8 + if not g.body[1] and httpheaders['content-type'] is not None: + assert httpheaders['content-type'].endswith ('; charset=utf-8') + else: + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + # content-length header should be kept intact + else: + assert False, f"invalid golden type {type(g)}" # pragma: no cover + + # no further records + with pytest.raises (StopIteration): + next (it) + diff --git a/crocoite/util.py b/crocoite/util.py index 5bced53..ded5e99 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -22,7 +22,7 @@ Random utility functions """ -import random, sys, platform, os, json +import random, sys, platform, os, json, urllib from datetime import datetime import hashlib, pkg_resources @@ -43,7 +43,7 @@ def packageUrl (path): """ Create URL for package data stored into WARC """ - return 'urn:' + __package__ + ':' + path + return 'urn:' + __package__ + ':' + urllib.parse.quote (path) async def getFormattedViewportMetrics (tab): layoutMetrics = await tab.Page.getLayoutMetrics () diff --git a/crocoite/warc.py b/crocoite/warc.py index 04dd871..dbd9ebc 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -39,8 +39,7 @@ class WarcHandler (EventHandler): __slots__ = ('logger', 'writer', 'documentRecords', 'log', 'maxLogSize', 'logEncoding', 'warcinfoRecordId') - def __init__ (self, fd, - logger): + def __init__ (self, fd, logger): self.logger = logger self.writer = WARCWriter (fd, gzip=True) @@ -104,7 +103,7 @@ class WarcHandler (EventHandler): warcHeaders['WARC-Truncated'] = 'unspecified' payload = None - if payload: + if payload is not None: payload = BytesIO (payload) warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) record = self.writeRecord (url, 'request', @@ -160,10 +159,10 @@ class WarcHandler (EventHandler): if contentType: if not base64Encoded: contentType += '; charset=utf-8' - httpHeaders.replace_header ('content-type', contentType) + httpHeaders.replace_header ('Content-Type', contentType) if rawBody is not None: - httpHeaders.replace_header ('content-length', str (len (rawBody))) + httpHeaders.replace_header ('Content-Length', str (len (rawBody))) bodyIo = BytesIO (rawBody) else: bodyIo = BytesIO () @@ -178,7 +177,8 @@ class WarcHandler (EventHandler): def _writeScript (self, item): writer = self.writer encoding = 'utf-8' - self.writeRecord (packageUrl (f'script/{item.path}'), 'metadata', + path = item.path or '-' + self.writeRecord (packageUrl (f'script/{path}'), 'metadata', payload=BytesIO (str (item).encode (encoding)), warc_headers_dict={'Content-Type': f'application/javascript; charset={encoding}'}) @@ -231,20 +231,19 @@ class WarcHandler (EventHandler): self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID'] def _flushLogEntries (self): - writer = self.writer - self.log.seek (0) - # XXX: we should use the type continuation here - self.writeRecord (packageUrl ('log'), 'resource', payload=self.log, - warc_headers_dict={'Content-Type': f'text/plain; encoding={self.logEncoding}'}) - self.log = BytesIO () + if self.log.tell () > 0: + writer = self.writer + self.log.seek (0) + # XXX: we should use the type continuation here + self.writeRecord (packageUrl ('log'), 'resource', payload=self.log, + warc_headers_dict={'Content-Type': f'text/plain; encoding={self.logEncoding}'}) + self.log = BytesIO () def _writeLog (self, item): """ Handle log entries, called by .logger.WarcHandlerConsumer only """ self.log.write (item.encode (self.logEncoding)) self.log.write (b'\n') - # instead of locking, check we’re running in the main thread - if self.log.tell () > self.maxLogSize and \ - threading.current_thread () is threading.main_thread (): + if self.log.tell () > self.maxLogSize: self._flushLogEntries () route = {Script: _writeScript, @@ -34,5 +34,5 @@ setup( 'crocoite': ['data/*'], }, setup_requires=["pytest-runner"], - tests_require=["pytest", 'pytest-asyncio', 'pytest-cov'], + tests_require=["pytest", 'pytest-asyncio', 'pytest-cov', 'hypothesis'], ) |