From 5e444dd6511d97308a84ae9c86ebf14547d01f01 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Tue, 18 Dec 2018 12:34:25 +0100
Subject: Parse URLs by default

Use library yarl (already pulled in by aiohttp). No URL processed should
be a string.
---
 crocoite/behavior.py      | 32 ++++++++++----------------------
 crocoite/browser.py       | 28 +++++++++++-----------------
 crocoite/cli.py           |  5 +++--
 crocoite/controller.py    |  9 ++++-----
 crocoite/devtools.py      |  4 +++-
 crocoite/logger.py        | 17 ++++-------------
 crocoite/test_behavior.py |  8 ++++----
 crocoite/test_browser.py  | 14 +++++++-------
 crocoite/util.py          | 22 +++++++++++++++-------
 crocoite/warc.py          | 18 +++++++-----------
 10 files changed, 68 insertions(+), 89 deletions(-)

diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index eb5478b..321b65c 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -35,15 +35,15 @@ instance.
 """
 
 import asyncio, json, os.path
-from urllib.parse import urlsplit
 from base64 import b64decode
 from collections import OrderedDict
 
 import pkg_resources
 from html5lib.serializer import HTMLSerializer
+from yarl import URL
 import yaml
 
-from .util import getFormattedViewportMetrics, removeFragment
+from .util import getFormattedViewportMetrics
 from . import html
 from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
 from .devtools import Crashed
@@ -107,16 +107,6 @@ class Behavior:
         return
         yield
 
-class HostnameFilter:
-    """ Limit behavior script to hostname """
-
-    hostname = None
-
-    def __contains__ (self, url):
-        url = urlsplit (url)
-        hostname = url.hostname.split ('.')[::-1]
-        return hostname[:2] == self.hostname
-
 class JsOnload (Behavior):
     """ Execute JavaScript on page load """
 
@@ -237,16 +227,14 @@ class DomSnapshot (Behavior):
         dom = await tab.DOM.getDocument (depth=-1, pierce=True)
         haveUrls = set ()
         for doc in ChromeTreeWalker (dom['root']).split ():
-            rawUrl = doc['documentURL']
-            if rawUrl in haveUrls:
+            url = URL (doc['documentURL'])
+            if url in haveUrls:
                 # ignore duplicate URLs. they are usually caused by
                 # javascript-injected iframes (advertising) with no(?) src
-                self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
-                continue
-            url = urlsplit (rawUrl)
-            if url.scheme in ('http', 'https'):
+                self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (url))
+            elif url.scheme in ('http', 'https'):
                 self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
-                haveUrls.add (rawUrl)
+                haveUrls.add (url)
                 walker = ChromeTreeWalker (doc)
                 # remove script, to make the page static and noscript, because at the
                 # time we took the snapshot scripts were enabled
@@ -254,7 +242,7 @@ class DomSnapshot (Behavior):
                 disallowedAttributes = html.eventAttributes
                 stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
                 serializer = HTMLSerializer ()
-                yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
+                yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
 
 class ScreenshotEvent:
     __slots__ = ('yoff', 'data', 'url')
@@ -276,7 +264,7 @@ class Screenshot (Behavior):
         tree = await tab.Page.getFrameTree ()
 
         try:
-            url = removeFragment (tree['frameTree']['frame']['url'])
+            url = URL (tree['frameTree']['frame']['url']).with_fragment (None)
         except KeyError:
             self.logger.error ('frame without url', tree=tree)
             url = None
@@ -333,7 +321,7 @@ class ExtractLinks (Behavior):
         tab = self.loader.tab
         yield self.script
         result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
-        yield ExtractLinksEvent (list (set (result['result']['value'])))
+        yield ExtractLinksEvent (list (set (map (URL, result['result']['value']))))
 
 class Crash (Behavior):
     """ Crash the browser. For testing only. Obviously. """

diff --git a/crocoite/browser.py b/crocoite/browser.py
index c472746..1c7ac3b 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,9 +23,9 @@ Chrome browser interactions.
 """
 
 import asyncio
-from urllib.parse import urlsplit
 from base64 import b64decode
 from http.server import BaseHTTPRequestHandler
+from yarl import URL
 
 from .logger import Level
 from .devtools import Browser, TabException
@@ -72,11 +72,7 @@ class Item:
 
     @property
     def url (self):
-        return self.response.get ('url', self.request.get ('url'))
-
-    @property
-    def parsedUrl (self):
-        return urlsplit (self.url)
+        return URL (self.response.get ('url', self.request.get ('url')))
 
     @property
     def requestHeaders (self):
@@ -274,9 +270,9 @@ class SiteLoader:
     async def _requestWillBeSent (self, **kwargs):
         reqId = kwargs['requestId']
         req = kwargs['request']
-        logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
+        url = URL (req['url'])
+        logger = self.logger.bind (reqId=reqId, reqUrl=url)
 
-        url = urlsplit (req['url'])
         if url.scheme not in self.allowedSchemes:
             return
@@ -292,7 +288,7 @@ class SiteLoader:
                 resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
                 item.setFinished (resp)
                 item.isRedirect = True
-                logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url'])
+                logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
                 await item.prefetchRequestBody (self.tab)
                 # cannot fetch request body due to race condition (item id reused)
                 ret = item
@@ -313,8 +309,10 @@ class SiteLoader:
             return
 
         resp = kwargs['response']
-        logger = self.logger.bind (reqId=reqId, respUrl=resp['url'])
-        url = urlsplit (resp['url'])
+        url = URL (resp['url'])
+        logger = self.logger.bind (reqId=reqId, respUrl=url)
+        if item.url != url:
+            logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
         if url.scheme in self.allowedSchemes:
             logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
             item.setResponse (kwargs)
@@ -332,12 +330,8 @@ class SiteLoader:
             # we never recorded this request (blacklisted scheme, for example)
             return
         req = item.request
-        logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
-        resp = item.response
-        if req['url'] != resp['url']:
-            logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url'])
-        url = urlsplit (resp['url'])
-        if url.scheme in self.allowedSchemes:
+        logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
+        if item.url.scheme in self.allowedSchemes:
             logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
             item.setFinished (kwargs)
             await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))

diff --git a/crocoite/cli.py b/crocoite/cli.py
index c3c41a4..b0ad53a 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -24,6 +24,7 @@ Command line interface
 
 import argparse, sys, signal, asyncio, os
 from enum import IntEnum
+from yarl import URL
 
 from . import behavior
 from .controller import SinglePageController, \
@@ -50,7 +51,7 @@ def single ():
             default=list (behavior.availableMap.keys ()),
             choices=list (behavior.availableMap.keys ()),
             metavar='NAME', nargs='*')
-    parser.add_argument('url', help='Website URL', metavar='URL')
+    parser.add_argument('url', help='Website URL', type=URL, metavar='URL')
     parser.add_argument('output', help='WARC filename', metavar='FILE')
 
     args = parser.parse_args ()
@@ -102,7 +103,7 @@ def recursive ():
     parser.add_argument('--tempdir', help='Directory for temporary files', metavar='DIR')
     parser.add_argument('--prefix', help='Output filename prefix, supports templates {host} and {date}', metavar='FILENAME', default='{host}-{date}-')
     parser.add_argument('--concurrency', '-j', help='Run at most N jobs', metavar='N', default=1, type=int)
-    parser.add_argument('url', help='Seed URL', metavar='URL')
+    parser.add_argument('url', help='Seed URL', type=URL, metavar='URL')
     parser.add_argument('output', help='Output directory', metavar='DIR')
     parser.add_argument('command', help='Fetch command, supports templates {url} and {dest}', metavar='CMD', nargs='*', default=['crocoite-grab', '{url}', '{dest}'])

diff --git a/crocoite/controller.py b/crocoite/controller.py
index f8b1420..c646a61 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -26,12 +26,11 @@ import time
 import tempfile, asyncio, json, os
 from itertools import islice
 from datetime import datetime
-from urllib.parse import urlparse
 from operator import attrgetter
 
 from . import behavior as cbehavior
 from .browser import SiteLoader, Item
-from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment
+from .util import getFormattedViewportMetrics, getSoftwareInfo
 from .behavior import ExtractLinksEvent
 
 class ControllerSettings:
@@ -316,12 +315,12 @@ class RecursiveController:
             return e.format (url=url, dest=dest.name)
 
         def formatPrefix (p):
-            return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+            return p.format (host=url.host, date=datetime.utcnow ().isoformat ())
 
         def logStats ():
             logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
 
-        if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+        if url.scheme not in self.SCHEME_WHITELIST:
             self.stats['ignored'] += 1
             logStats ()
             self.logger.warning ('scheme not whitelisted', url=url,
@@ -344,7 +343,7 @@ class RecursiveController:
                     data = json.loads (data)
                     uuid = data.get ('uuid')
                     if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
-                        links = set (self.policy (map (removeFragment, data.get ('links', []))))
+                        links = set (self.policy (map (lambda x: x.with_fragment(None), data.get ('links', []))))
                         links.difference_update (self.have)
                         self.pending.update (links)
                     elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':

diff --git a/crocoite/devtools.py b/crocoite/devtools.py
index b071d2e..e62d4e0 100644
--- a/crocoite/devtools.py
+++ b/crocoite/devtools.py
@@ -27,6 +27,8 @@ from tempfile import mkdtemp
 import shutil
 import aiohttp, websockets
 
+from .util import StrJsonEncoder
+
 logger = logging.getLogger (__name__)
 
 class Browser:
@@ -155,7 +157,7 @@ class Tab:
         message = {'method': method, 'params': kwargs, 'id': msgid}
         t = self.transactions[msgid] = {'event': asyncio.Event (), 'result': None}
         logger.debug ('← {}'.format (message))
-        await self.ws.send (json.dumps (message))
+        await self.ws.send (json.dumps (message, cls=StrJsonEncoder))
         await t['event'].wait ()
         ret = t['result']
         del self.transactions[msgid]

diff --git a/crocoite/logger.py b/crocoite/logger.py
index cddc42d..7f4a4de 100644
--- a/crocoite/logger.py
+++ b/crocoite/logger.py
@@ -34,6 +34,8 @@ from enum import IntEnum
 
 from pytz import utc
 
+from .util import StrJsonEncoder
+
 class Level(IntEnum):
     DEBUG = 0
     INFO = 1
@@ -102,24 +104,13 @@ class PrintConsumer (Consumer):
             sys.stderr.flush ()
         return kwargs
 
-class JsonEncoder (json.JSONEncoder):
-    def default (self, obj):
-        if isinstance (obj, datetime):
-            return obj.isoformat ()
-
-        # make sure serialization always succeeds
-        try:
-            return json.JSONEncoder.default(self, obj)
-        except TypeError:
-            return str (obj)
-
 class JsonPrintConsumer (Consumer):
     def __init__ (self, minLevel=Level.INFO):
         self.minLevel = minLevel
 
     def __call__ (self, **kwargs):
         if kwargs['level'] >= self.minLevel:
-            json.dump (kwargs, sys.stdout, cls=JsonEncoder)
+            json.dump (kwargs, sys.stdout, cls=StrJsonEncoder)
             sys.stdout.write ('\n')
             sys.stdout.flush ()
         return kwargs
@@ -136,6 +127,6 @@ class WarcHandlerConsumer (Consumer):
 
     def __call__ (self, **kwargs):
         if kwargs['level'] >= self.minLevel:
-            self.warc._writeLog (json.dumps (kwargs, cls=JsonEncoder))
+            self.warc._writeLog (json.dumps (kwargs, cls=StrJsonEncoder))
         return kwargs

diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py
index 280b35b..0433918 100644
--- a/crocoite/test_behavior.py
+++ b/crocoite/test_behavior.py
@@ -19,9 +19,9 @@
 # THE SOFTWARE.
 
 import asyncio, os, yaml, re
-from urllib.parse import urlparse
 from functools import partial
 
 import pytest
+from yarl import URL
 import pkg_resources
 
 from .logger import Logger
@@ -87,12 +87,12 @@ matchParam = []
 for o in sites:
     for s in o['selector']:
         for u in s.get ('urls', []):
-            matchParam.append ((o['match'], u))
+            matchParam.append ((o['match'], URL (u)))
 
 @pytest.mark.parametrize("match,url", matchParam)
 @pytest.mark.asyncio
 async def test_click_match (match, url):
     """ Test urls must match """
-    host = urlparse (url).netloc
-    assert re.match (match, host, re.I)
+    # keep this aligned with click.js
+    assert re.match (match, url.host, re.I)

diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 06492b1..5616fcf 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -92,7 +92,7 @@ testItems = [
     TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'),
     TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'),
     ]
-testItemMap = dict ([(item.parsedUrl.path, item) for item in testItems])
+testItemMap = dict ([(item.url.path, item) for item in testItems])
 
 def itemToResponse (item):
     async def f (req):
@@ -108,7 +108,7 @@ async def server ():
     logging.basicConfig(level=logging.DEBUG)
     app = web.Application(debug=True)
     for item in testItems:
-        app.router.add_route ('*', item.parsedUrl.path, itemToResponse (item))
+        app.router.add_route ('*', item.url.path, itemToResponse (item))
     runner = web.AppRunner(app)
     await runner.setup()
     site = web.TCPSite(runner, 'localhost', 8080)
@@ -137,10 +137,10 @@ async def loader (server, logger):
     yield f
 
 async def itemsLoaded (l, items):
-    items = dict ([(i.parsedUrl.path, i) for i in items])
+    items = dict ([(i.url.path, i) for i in items])
     async for item in l:
         assert item.chromeResponse is not None
-        golden = items.pop (item.parsedUrl.path)
+        golden = items.pop (item.url.path)
         if not golden:
             assert False, 'url {} not supposed to be fetched'.format (item.url)
         assert item.failed == golden.failed
@@ -167,7 +167,7 @@ async def itemsLoaded (l, items):
             break
 
 async def literalItem (lf, item, deps=[]):
-    async with lf (item.parsedUrl.path) as l:
+    async with lf (item.url.path) as l:
         await l.start ()
         await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30)
@@ -184,7 +184,7 @@ async def test_headers_duplicate (loader):
     async with loader ('/headers/duplicate') as l:
         await l.start ()
         async for it in l:
-            if it.parsedUrl.path == '/headers/duplicate':
+            if it.url.path == '/headers/duplicate':
                 assert not it.failed
                 dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders))
                 assert len(dup) == 2
@@ -200,7 +200,7 @@ async def test_headers_req (loader):
     async with loader ('/headers/fetch/html') as l:
         await l.start ()
         async for it in l:
-            if it.parsedUrl.path == '/headers/fetch/req':
+            if it.url.path == '/headers/fetch/req':
                 assert not it.failed
                 dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders))
                 assert len(dup) == 1

diff --git a/crocoite/util.py b/crocoite/util.py
index bd26909..73a1d65 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,9 +22,22 @@ Random utility functions
 """
 
-import random, sys, platform
+import random, sys, platform, os, json
+from datetime import datetime
 import hashlib, pkg_resources
-from urllib.parse import urlsplit, urlunsplit
+
+class StrJsonEncoder (json.JSONEncoder):
+    """ JSON encoder that turns unknown classes into a string and thus never
+    fails """
+    def default (self, obj):
+        if isinstance (obj, datetime):
+            return obj.isoformat ()
+
+        # make sure serialization always succeeds
+        try:
+            return json.JSONEncoder.default(self, obj)
+        except TypeError:
+            return str (obj)
 
 def packageUrl (path):
     """
@@ -38,11 +51,6 @@ async def getFormattedViewportMetrics (tab):
     return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
             layoutMetrics['layoutViewport']['clientHeight'])
 
-def removeFragment (u):
-    """ Remove fragment from url (i.e. #hashvalue) """
-    s = urlsplit (u)
-    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
-
 def getSoftwareInfo ():
     """ Get software info for inclusion into warcinfo """
     return {

diff --git a/crocoite/warc.py b/crocoite/warc.py
index ebc460d..21a99aa 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,14 +24,13 @@ Classes writing data to WARC files
 
 import json, threading
 from io import BytesIO
-from urllib.parse import urlsplit
 from datetime import datetime
 
 from warcio.timeutils import datetime_to_iso_date
 from warcio.warcwriter import WARCWriter
 from warcio.statusandheaders import StatusAndHeaders
 
-from .util import packageUrl
+from .util import packageUrl, StrJsonEncoder
 from .controller import EventHandler, ControllerStart
 from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
 from .browser import Item
@@ -75,7 +74,7 @@ class WarcHandler (EventHandler):
             d.update (warc_headers_dict)
             warc_headers_dict = d
 
-        record = self.writer.create_warc_record (url, kind, payload=payload,
+        record = self.writer.create_warc_record (str (url), kind, payload=payload,
                 warc_headers_dict=warc_headers_dict, http_headers=http_headers)
         self.writer.write_record (record)
@@ -85,12 +84,9 @@ class WarcHandler (EventHandler):
         logger = self.logger.bind (reqId=item.id)
 
         req = item.request
-        resp = item.response
-        url = urlsplit (resp['url'])
+        url = item.url
 
-        path = url.path
-        if url.query:
-            path += '?' + url.query
+        path = url.relative().with_fragment(None)
         httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
                 item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
         initiator = item.initiator
@@ -111,7 +107,7 @@ class WarcHandler (EventHandler):
         if payload:
             payload = BytesIO (payload)
             warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
-        record = self.writeRecord (req['url'], 'request',
+        record = self.writeRecord (url, 'request',
                 payload=payload, http_headers=httpHeaders,
                 warc_headers_dict=warcHeaders)
         return record.rec_headers['WARC-Record-ID']
@@ -172,7 +168,7 @@ class WarcHandler (EventHandler):
         else:
             bodyIo = BytesIO ()
 
-        record = self.writeRecord (resp['url'], 'response',
+        record = self.writeRecord (item.url, 'response',
                 warc_headers_dict=warcHeaders, payload=bodyIo,
                 http_headers=httpHeaders)
@@ -225,7 +221,7 @@ class WarcHandler (EventHandler):
                 payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
 
     def _writeControllerStart (self, item):
-        payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+        payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode ('utf-8'))
 
         writer = self.writer
         warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo',
--
cgit v1.2.3
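
Note for reviewers: the conversion leans on a handful of yarl.URL semantics — value-based equality and hashing (which makes URLs usable as members of haveUrls and the recursive controller's have/pending sets), with_fragment(None) to strip the #fragment before deduplication, .host/.scheme in place of the old urlsplit accessors, and .relative() to build the request-line path in warc.py. A minimal sketch of those behaviours follows; it is illustrative only, not part of the patch, the example URL is made up, and it assumes the yarl version pulled in by aiohttp at the time:

    from yarl import URL

    u = URL ('https://example.com/page?q=1#frag')
    assert u.scheme == 'https' and u.host == 'example.com'

    # URLs with different fragments compare unequal, hence the explicit
    # with_fragment(None) before links are deduplicated
    assert u != URL ('https://example.com/page?q=1')
    assert u.with_fragment (None) == URL ('https://example.com/page?q=1')

    # URL hashes by value, so it works as a set member (haveUrls, self.have)
    seen = {u.with_fragment (None)}
    assert URL ('https://example.com/page?q=1') in seen

    # relative() drops scheme and authority; combined with with_fragment(None)
    # it yields the path-and-query form used for the HTTP request line
    assert str (u.relative ().with_fragment (None)) == '/page?q=1'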
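Similarly, StrJsonEncoder (moved into util.py above) is what keeps logging, devtools messages, and the warcinfo record serializable now that URL objects flow through json.dumps. A quick illustration, again outside the patch and assuming the patched crocoite.util is importable:

    import json
    from datetime import datetime
    from yarl import URL
    from crocoite.util import StrJsonEncoder

    # datetimes serialize as ISO strings; unknown types such as URL fall
    # back to str(), so serialization never raises TypeError
    doc = {'url': URL ('https://example.com/#frag'), 'date': datetime (2018, 12, 18)}
    assert json.dumps (doc, cls=StrJsonEncoder) == \
            '{"url": "https://example.com/#frag", "date": "2018-12-18T00:00:00"}'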