author    Lars-Dominik Braun <lars@6xq.net>  2018-12-18 12:34:25 +0100
committer Lars-Dominik Braun <lars@6xq.net>  2018-12-21 20:28:51 +0100
commit    5e444dd6511d97308a84ae9c86ebf14547d01f01 (patch)
tree      0852c081163ff3456038fb08ad4e47d0d47a6167
parent    e19635a75cc1ab206be12ecf2b1c9a909baa9c21 (diff)
Parse URLs by default

Use the yarl library (already pulled in by aiohttp). No URL we process
should remain a plain string.
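For readers new to yarl, here is a minimal sketch of the parts of its API this
commit leans on; the URL below is invented for illustration:

    from yarl import URL

    # Parse once at the boundary; downstream code receives a structured
    # object instead of a raw string.
    url = URL('https://example.com:8080/path/page?q=1#section')

    assert url.scheme == 'https'
    assert url.host == 'example.com'    # hostname only, without the port
    assert url.port == 8080
    assert url.path == '/path/page'
    assert url.query_string == 'q=1'
    assert url.fragment == 'section'

    # URLs hash and compare by value, so they work in sets and dicts.
    assert URL('https://example.com/x') == URL('https://example.com/x')

    # with_fragment(None) replaces the removeFragment() helper deleted below.
    assert url.with_fragment(None) == URL('https://example.com:8080/path/page?q=1')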
-rw-r--r--  README.rst                 |  2
-rw-r--r--  crocoite/behavior.py       | 32
-rw-r--r--  crocoite/browser.py        | 28
-rw-r--r--  crocoite/cli.py            |  5
-rw-r--r--  crocoite/controller.py     |  9
-rw-r--r--  crocoite/devtools.py       |  4
-rw-r--r--  crocoite/logger.py         | 17
-rw-r--r--  crocoite/test_behavior.py  |  8
-rw-r--r--  crocoite/test_browser.py   | 14
-rw-r--r--  crocoite/util.py           | 22
-rw-r--r--  crocoite/warc.py           | 18
-rw-r--r--  setup.py                   |  1
12 files changed, 71 insertions, 89 deletions
diff --git a/README.rst b/README.rst
index c604d81..71d9947 100644
--- a/README.rst
+++ b/README.rst
@@ -23,6 +23,7 @@ These dependencies must be present to run crocoite:
- websockets_
- warcio_
- html5lib_
+- yarl_
- bottom_ (IRC client)
- `Google Chrome`_
@@ -33,6 +34,7 @@ These dependencies must be present to run crocoite:
.. _html5lib: https://github.com/html5lib/html5lib-python
.. _bottom: https://github.com/numberoverzero/bottom
.. _Google Chrome: https://www.google.com/chrome/
+.. _yarl: https://yarl.readthedocs.io/
The following commands clone the repository from GitHub_, set up a virtual
environment and install crocoite:
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index eb5478b..321b65c 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -35,15 +35,15 @@ instance.
"""
import asyncio, json, os.path
-from urllib.parse import urlsplit
from base64 import b64decode
from collections import OrderedDict
import pkg_resources
from html5lib.serializer import HTMLSerializer
+from yarl import URL
import yaml
-from .util import getFormattedViewportMetrics, removeFragment
+from .util import getFormattedViewportMetrics
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
from .devtools import Crashed
@@ -107,16 +107,6 @@ class Behavior:
return
yield
-class HostnameFilter:
- """ Limit behavior script to hostname """
-
- hostname = None
-
- def __contains__ (self, url):
- url = urlsplit (url)
- hostname = url.hostname.split ('.')[::-1]
- return hostname[:2] == self.hostname
-
class JsOnload (Behavior):
""" Execute JavaScript on page load """
@@ -237,16 +227,14 @@ class DomSnapshot (Behavior):
dom = await tab.DOM.getDocument (depth=-1, pierce=True)
haveUrls = set ()
for doc in ChromeTreeWalker (dom['root']).split ():
- rawUrl = doc['documentURL']
- if rawUrl in haveUrls:
+ url = URL (doc['documentURL'])
+ if url in haveUrls:
# ignore duplicate URLs. they are usually caused by
# javascript-injected iframes (advertising) with no(?) src
- self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
- continue
- url = urlsplit (rawUrl)
- if url.scheme in ('http', 'https'):
+ self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (url))
+ elif url.scheme in ('http', 'https'):
self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
- haveUrls.add (rawUrl)
+ haveUrls.add (url)
walker = ChromeTreeWalker (doc)
# remove script, to make the page static and noscript, because at the
# time we took the snapshot scripts were enabled
@@ -254,7 +242,7 @@ class DomSnapshot (Behavior):
disallowedAttributes = html.eventAttributes
stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
- yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
+ yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
class ScreenshotEvent:
__slots__ = ('yoff', 'data', 'url')
@@ -276,7 +264,7 @@ class Screenshot (Behavior):
tree = await tab.Page.getFrameTree ()
try:
- url = removeFragment (tree['frameTree']['frame']['url'])
+ url = URL (tree['frameTree']['frame']['url']).with_fragment (None)
except KeyError:
self.logger.error ('frame without url', tree=tree)
url = None
@@ -333,7 +321,7 @@ class ExtractLinks (Behavior):
tab = self.loader.tab
yield self.script
result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
- yield ExtractLinksEvent (list (set (result['result']['value'])))
+ yield ExtractLinksEvent (list (set (map (URL, result['result']['value']))))
class Crash (Behavior):
""" Crash the browser. For testing only. Obviously. """
diff --git a/crocoite/browser.py b/crocoite/browser.py
index c472746..1c7ac3b 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,9 +23,9 @@ Chrome browser interactions.
"""
import asyncio
-from urllib.parse import urlsplit
from base64 import b64decode
from http.server import BaseHTTPRequestHandler
+from yarl import URL
from .logger import Level
from .devtools import Browser, TabException
@@ -72,11 +72,7 @@ class Item:
@property
def url (self):
- return self.response.get ('url', self.request.get ('url'))
-
- @property
- def parsedUrl (self):
- return urlsplit (self.url)
+ return URL (self.response.get ('url', self.request.get ('url')))
@property
def requestHeaders (self):
@@ -274,9 +270,9 @@ class SiteLoader:
async def _requestWillBeSent (self, **kwargs):
reqId = kwargs['requestId']
req = kwargs['request']
- logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
+ url = URL (req['url'])
+ logger = self.logger.bind (reqId=reqId, reqUrl=url)
- url = urlsplit (req['url'])
if url.scheme not in self.allowedSchemes:
return
@@ -292,7 +288,7 @@ class SiteLoader:
resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
item.setFinished (resp)
item.isRedirect = True
- logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url'])
+ logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
await item.prefetchRequestBody (self.tab)
# cannot fetch request body due to race condition (item id reused)
ret = item
@@ -313,8 +309,10 @@ class SiteLoader:
return
resp = kwargs['response']
- logger = self.logger.bind (reqId=reqId, respUrl=resp['url'])
- url = urlsplit (resp['url'])
+ url = URL (resp['url'])
+ logger = self.logger.bind (reqId=reqId, respUrl=url)
+ if item.url != url:
+ logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
item.setResponse (kwargs)
@@ -332,12 +330,8 @@ class SiteLoader:
# we never recorded this request (blacklisted scheme, for example)
return
req = item.request
- logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
- resp = item.response
- if req['url'] != resp['url']:
- logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url'])
- url = urlsplit (resp['url'])
- if url.scheme in self.allowedSchemes:
+ logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
+ if item.url.scheme in self.allowedSchemes:
logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
item.setFinished (kwargs)
await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
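With Item.url returning a yarl.URL, the separate parsedUrl property becomes
redundant: scheme checks and URL comparisons read directly off the object.
Roughly, with invented values:

    from yarl import URL

    allowedSchemes = {'http', 'https'}

    url = URL('data:text/html,hello')
    if url.scheme not in allowedSchemes:
        pass    # request ignored, as in _requestWillBeSent

    # the mismatch check moved into _responseReceived compares parsed URLs:
    if URL('https://example.com/a') != URL('https://example.com/b'):
        pass    # logged as 'url mismatch'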
diff --git a/crocoite/cli.py b/crocoite/cli.py
index c3c41a4..b0ad53a 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -24,6 +24,7 @@ Command line interface
import argparse, sys, signal, asyncio, os
from enum import IntEnum
+from yarl import URL
from . import behavior
from .controller import SinglePageController, \
@@ -50,7 +51,7 @@ def single ():
default=list (behavior.availableMap.keys ()),
choices=list (behavior.availableMap.keys ()),
metavar='NAME', nargs='*')
- parser.add_argument('url', help='Website URL', metavar='URL')
+ parser.add_argument('url', help='Website URL', type=URL, metavar='URL')
parser.add_argument('output', help='WARC filename', metavar='FILE')
args = parser.parse_args ()
@@ -102,7 +103,7 @@ def recursive ():
parser.add_argument('--tempdir', help='Directory for temporary files', metavar='DIR')
parser.add_argument('--prefix', help='Output filename prefix, supports templates {host} and {date}', metavar='FILENAME', default='{host}-{date}-')
parser.add_argument('--concurrency', '-j', help='Run at most N jobs', metavar='N', default=1, type=int)
- parser.add_argument('url', help='Seed URL', metavar='URL')
+ parser.add_argument('url', help='Seed URL', type=URL, metavar='URL')
parser.add_argument('output', help='Output directory', metavar='DIR')
parser.add_argument('command', help='Fetch command, supports templates {url} and {dest}', metavar='CMD', nargs='*', default=['crocoite-grab', '{url}', '{dest}'])
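With type=URL, argparse itself converts the positional argument, so the rest
of the program never sees a string URL. yarl is lenient about what it accepts,
so the point is the conversion, not validation. A standalone sketch (not
crocoite's actual parser):

    import argparse
    from yarl import URL

    parser = argparse.ArgumentParser()
    parser.add_argument('url', type=URL, metavar='URL')

    args = parser.parse_args(['https://example.com/'])
    assert isinstance(args.url, URL)
    assert args.url.host == 'example.com'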
diff --git a/crocoite/controller.py b/crocoite/controller.py
index f8b1420..c646a61 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -26,12 +26,11 @@ import time
import tempfile, asyncio, json, os
from itertools import islice
from datetime import datetime
-from urllib.parse import urlparse
from operator import attrgetter
from . import behavior as cbehavior
from .browser import SiteLoader, Item
-from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment
+from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
class ControllerSettings:
@@ -316,12 +315,12 @@ class RecursiveController:
return e.format (url=url, dest=dest.name)
def formatPrefix (p):
- return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+ return p.format (host=url.host, date=datetime.utcnow ().isoformat ())
def logStats ():
logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
- if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+ if url.scheme not in self.SCHEME_WHITELIST:
self.stats['ignored'] += 1
logStats ()
self.logger.warning ('scheme not whitelisted', url=url,
@@ -344,7 +343,7 @@ class RecursiveController:
data = json.loads (data)
uuid = data.get ('uuid')
if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
- links = set (self.policy (map (removeFragment, data.get ('links', []))))
+ links = set (self.policy (map (lambda x: x.with_fragment(None), data.get ('links', []))))
links.difference_update (self.have)
self.pending.update (links)
elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
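Extracted links are now stripped of their fragment before deduplication, so
anchors pointing into the same page collapse into one crawl target. A sketch
of the effect; the URL() wrapping is an assumption added here for the case
where the links arrive as plain JSON strings rather than URL objects:

    from yarl import URL

    links = ['https://example.com/a#one', 'https://example.com/a#two']
    pending = {URL(x).with_fragment(None) for x in links}
    assert pending == {URL('https://example.com/a')}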
diff --git a/crocoite/devtools.py b/crocoite/devtools.py
index b071d2e..e62d4e0 100644
--- a/crocoite/devtools.py
+++ b/crocoite/devtools.py
@@ -27,6 +27,8 @@ from tempfile import mkdtemp
import shutil
import aiohttp, websockets
+from .util import StrJsonEncoder
+
logger = logging.getLogger (__name__)
class Browser:
@@ -155,7 +157,7 @@ class Tab:
message = {'method': method, 'params': kwargs, 'id': msgid}
t = self.transactions[msgid] = {'event': asyncio.Event (), 'result': None}
logger.debug ('← {}'.format (message))
- await self.ws.send (json.dumps (message))
+ await self.ws.send (json.dumps (message, cls=StrJsonEncoder))
await t['event'].wait ()
ret = t['result']
del self.transactions[msgid]
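json.dumps raises TypeError on a yarl URL, so any DevTools command whose
params contain one would crash the send with the stock encoder; routing
through StrJsonEncoder (defined in util.py below) downgrades unknown objects
to strings instead. For example:

    import json
    from yarl import URL
    from crocoite.util import StrJsonEncoder

    params = {'url': URL('https://example.com/')}
    try:
        json.dumps(params)                     # stock encoder
    except TypeError:
        pass                                   # URL is not JSON serializable
    json.dumps(params, cls=StrJsonEncoder)     # works: URL falls back to str()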
diff --git a/crocoite/logger.py b/crocoite/logger.py
index cddc42d..7f4a4de 100644
--- a/crocoite/logger.py
+++ b/crocoite/logger.py
@@ -34,6 +34,8 @@ from enum import IntEnum
from pytz import utc
+from .util import StrJsonEncoder
+
class Level(IntEnum):
DEBUG = 0
INFO = 1
@@ -102,24 +104,13 @@ class PrintConsumer (Consumer):
sys.stderr.flush ()
return kwargs
-class JsonEncoder (json.JSONEncoder):
- def default (self, obj):
- if isinstance (obj, datetime):
- return obj.isoformat ()
-
- # make sure serialization always succeeds
- try:
- return json.JSONEncoder.default(self, obj)
- except TypeError:
- return str (obj)
-
class JsonPrintConsumer (Consumer):
def __init__ (self, minLevel=Level.INFO):
self.minLevel = minLevel
def __call__ (self, **kwargs):
if kwargs['level'] >= self.minLevel:
- json.dump (kwargs, sys.stdout, cls=JsonEncoder)
+ json.dump (kwargs, sys.stdout, cls=StrJsonEncoder)
sys.stdout.write ('\n')
sys.stdout.flush ()
return kwargs
@@ -136,6 +127,6 @@ class WarcHandlerConsumer (Consumer):
def __call__ (self, **kwargs):
if kwargs['level'] >= self.minLevel:
- self.warc._writeLog (json.dumps (kwargs, cls=JsonEncoder))
+ self.warc._writeLog (json.dumps (kwargs, cls=StrJsonEncoder))
return kwargs
diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py
index 280b35b..0433918 100644
--- a/crocoite/test_behavior.py
+++ b/crocoite/test_behavior.py
@@ -19,9 +19,9 @@
# THE SOFTWARE.
import asyncio, os, yaml, re
-from urllib.parse import urlparse
from functools import partial
import pytest
+from yarl import URL
import pkg_resources
from .logger import Logger
@@ -87,12 +87,12 @@ matchParam = []
for o in sites:
for s in o['selector']:
for u in s.get ('urls', []):
- matchParam.append ((o['match'], u))
+ matchParam.append ((o['match'], URL (u)))
@pytest.mark.parametrize("match,url", matchParam)
@pytest.mark.asyncio
async def test_click_match (match, url):
""" Test urls must match """
- host = urlparse (url).netloc
- assert re.match (match, host, re.I)
+ # keep this aligned with click.js
+ assert re.match (match, url.host, re.I)
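One subtle difference in this test change: urlparse(...).netloc includes the
port (and any userinfo), whereas yarl's URL.host is the bare hostname, so
hostname regexes keep matching when a URL carries an explicit port:

    from urllib.parse import urlparse
    from yarl import URL

    u = 'https://www.example.com:8443/page'
    assert urlparse(u).netloc == 'www.example.com:8443'   # what the old test matched
    assert URL(u).host == 'www.example.com'               # what the new test matches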
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 06492b1..5616fcf 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -92,7 +92,7 @@ testItems = [
TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'),
TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'),
]
-testItemMap = dict ([(item.parsedUrl.path, item) for item in testItems])
+testItemMap = dict ([(item.url.path, item) for item in testItems])
def itemToResponse (item):
async def f (req):
@@ -108,7 +108,7 @@ async def server ():
logging.basicConfig(level=logging.DEBUG)
app = web.Application(debug=True)
for item in testItems:
- app.router.add_route ('*', item.parsedUrl.path, itemToResponse (item))
+ app.router.add_route ('*', item.url.path, itemToResponse (item))
runner = web.AppRunner(app)
await runner.setup()
site = web.TCPSite(runner, 'localhost', 8080)
@@ -137,10 +137,10 @@ async def loader (server, logger):
yield f
async def itemsLoaded (l, items):
- items = dict ([(i.parsedUrl.path, i) for i in items])
+ items = dict ([(i.url.path, i) for i in items])
async for item in l:
assert item.chromeResponse is not None
- golden = items.pop (item.parsedUrl.path)
+ golden = items.pop (item.url.path)
if not golden:
assert False, 'url {} not supposed to be fetched'.format (item.url)
assert item.failed == golden.failed
@@ -167,7 +167,7 @@ async def itemsLoaded (l, items):
break
async def literalItem (lf, item, deps=[]):
- async with lf (item.parsedUrl.path) as l:
+ async with lf (item.url.path) as l:
await l.start ()
await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30)
@@ -184,7 +184,7 @@ async def test_headers_duplicate (loader):
async with loader ('/headers/duplicate') as l:
await l.start ()
async for it in l:
- if it.parsedUrl.path == '/headers/duplicate':
+ if it.url.path == '/headers/duplicate':
assert not it.failed
dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders))
assert len(dup) == 2
@@ -200,7 +200,7 @@ async def test_headers_req (loader):
async with loader ('/headers/fetch/html') as l:
await l.start ()
async for it in l:
- if it.parsedUrl.path == '/headers/fetch/req':
+ if it.url.path == '/headers/fetch/req':
assert not it.failed
dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders))
assert len(dup) == 1
diff --git a/crocoite/util.py b/crocoite/util.py
index bd26909..73a1d65 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,9 +22,22 @@
Random utility functions
"""
-import random, sys, platform
+import random, sys, platform, os, json
+from datetime import datetime
import hashlib, pkg_resources
-from urllib.parse import urlsplit, urlunsplit
+
+class StrJsonEncoder (json.JSONEncoder):
+ """ JSON encoder that turns unknown classes into a string and thus never
+ fails """
+ def default (self, obj):
+ if isinstance (obj, datetime):
+ return obj.isoformat ()
+
+ # make sure serialization always succeeds
+ try:
+ return json.JSONEncoder.default(self, obj)
+ except TypeError:
+ return str (obj)
def packageUrl (path):
"""
@@ -38,11 +51,6 @@ async def getFormattedViewportMetrics (tab):
return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
layoutMetrics['layoutViewport']['clientHeight'])
-def removeFragment (u):
- """ Remove fragment from url (i.e. #hashvalue) """
- s = urlsplit (u)
- return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
-
def getSoftwareInfo ():
""" Get software info for inclusion into warcinfo """
return {
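StrJsonEncoder generalises the encoder that previously lived in logger.py:
datetimes become ISO strings, and anything else the stock encoder rejects
falls back to str(), which is exactly what yarl URLs need. Usage sketch:

    import json
    from datetime import datetime
    from yarl import URL
    from crocoite.util import StrJsonEncoder

    doc = {'url': URL('https://example.com/?q=1'),
           'date': datetime(2018, 12, 18, 12, 34, 25)}
    print(json.dumps(doc, cls=StrJsonEncoder))
    # {"url": "https://example.com/?q=1", "date": "2018-12-18T12:34:25"}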
diff --git a/crocoite/warc.py b/crocoite/warc.py
index ebc460d..21a99aa 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,14 +24,13 @@ Classes writing data to WARC files
import json, threading
from io import BytesIO
-from urllib.parse import urlsplit
from datetime import datetime
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
-from .util import packageUrl
+from .util import packageUrl, StrJsonEncoder
from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
@@ -75,7 +74,7 @@ class WarcHandler (EventHandler):
d.update (warc_headers_dict)
warc_headers_dict = d
- record = self.writer.create_warc_record (url, kind, payload=payload,
+ record = self.writer.create_warc_record (str (url), kind, payload=payload,
warc_headers_dict=warc_headers_dict, http_headers=http_headers)
self.writer.write_record (record)
@@ -85,12 +84,9 @@ class WarcHandler (EventHandler):
logger = self.logger.bind (reqId=item.id)
req = item.request
- resp = item.response
- url = urlsplit (resp['url'])
+ url = item.url
- path = url.path
- if url.query:
- path += '?' + url.query
+ path = url.relative().with_fragment(None)
httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
initiator = item.initiator
@@ -111,7 +107,7 @@ class WarcHandler (EventHandler):
if payload:
payload = BytesIO (payload)
warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
- record = self.writeRecord (req['url'], 'request',
+ record = self.writeRecord (url, 'request',
payload=payload, http_headers=httpHeaders,
warc_headers_dict=warcHeaders)
return record.rec_headers['WARC-Record-ID']
@@ -172,7 +168,7 @@ class WarcHandler (EventHandler):
else:
bodyIo = BytesIO ()
- record = self.writeRecord (resp['url'], 'response',
+ record = self.writeRecord (item.url, 'response',
warc_headers_dict=warcHeaders, payload=bodyIo,
http_headers=httpHeaders)
@@ -225,7 +221,7 @@ class WarcHandler (EventHandler):
payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
def _writeControllerStart (self, item):
- payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+ payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode ('utf-8'))
writer = self.writer
warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo',
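The HTTP request line was previously rebuilt by hand from path and query;
url.relative() keeps path, query and fragment while dropping scheme and host,
so chaining with_fragment(None) yields the request target in one step:

    from yarl import URL

    url = URL('https://example.com/search?q=crocoite#top')
    path = url.relative().with_fragment(None)
    assert str(path) == '/search?q=crocoite'
    # equivalent to the removed path + '?' + query assembly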
diff --git a/setup.py b/setup.py
index 7223406..0a21c02 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,7 @@ setup(
'websockets',
'aiohttp',
'PyYAML',
+ 'yarl',
],
entry_points={
'console_scripts': [