| author | Lars-Dominik Braun <lars@6xq.net> | 2018-12-18 12:34:25 +0100 |
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2018-12-21 20:28:51 +0100 |
| commit | 5e444dd6511d97308a84ae9c86ebf14547d01f01 (patch) | |
| tree | 0852c081163ff3456038fb08ad4e47d0d47a6167 /crocoite | |
| parent | e19635a75cc1ab206be12ecf2b1c9a909baa9c21 (diff) | |
| download | crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.gz, crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.bz2, crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.zip | |
Parse URLs by default
Use the yarl library (already pulled in by aiohttp). No URL should ever
be processed as a plain string.
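
For context, a minimal sketch of the yarl operations this commit leans on; the example.com URL and the assertions are illustrative only, not taken from the codebase:

```python
# Sketch of the yarl.URL operations used throughout this commit.
from yarl import URL

url = URL('https://example.com/page?q=1#section')  # hypothetical URL

# Structured accessors replace urllib.parse.urlsplit/urlparse:
assert url.scheme == 'https'
assert url.host == 'example.com'
assert url.path == '/page'

# with_fragment(None) replaces the removed util.removeFragment helper:
assert str(url.with_fragment(None)) == 'https://example.com/page?q=1'

# relative() keeps only the path/query/fragment part; combined with
# with_fragment(None) it yields the HTTP request-line path used in warc.py:
assert str(url.relative().with_fragment(None)) == '/page?q=1'

# URLs compare and hash structurally, so they work as set members and
# dict keys (haveUrls in behavior.py, testItemMap in test_browser.py):
assert URL('https://example.com/page?q=1#section') == url
```

Because URL objects behave this way in sets and comparisons, the deduplication and filtering code below needs no string normalization of its own.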
Diffstat (limited to 'crocoite')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | crocoite/behavior.py | 32 |
| -rw-r--r-- | crocoite/browser.py | 28 |
| -rw-r--r-- | crocoite/cli.py | 5 |
| -rw-r--r-- | crocoite/controller.py | 9 |
| -rw-r--r-- | crocoite/devtools.py | 4 |
| -rw-r--r-- | crocoite/logger.py | 17 |
| -rw-r--r-- | crocoite/test_behavior.py | 8 |
| -rw-r--r-- | crocoite/test_browser.py | 14 |
| -rw-r--r-- | crocoite/util.py | 22 |
| -rw-r--r-- | crocoite/warc.py | 18 |
10 files changed, 68 insertions, 89 deletions
```diff
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index eb5478b..321b65c 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -35,15 +35,15 @@ instance.
 """
 
 import asyncio, json, os.path
-from urllib.parse import urlsplit
 from base64 import b64decode
 from collections import OrderedDict
 
 import pkg_resources
 from html5lib.serializer import HTMLSerializer
+from yarl import URL
 import yaml
 
-from .util import getFormattedViewportMetrics, removeFragment
+from .util import getFormattedViewportMetrics
 from . import html
 from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
 from .devtools import Crashed
@@ -107,16 +107,6 @@ class Behavior:
         return
         yield
 
-class HostnameFilter:
-    """ Limit behavior script to hostname """
-
-    hostname = None
-
-    def __contains__ (self, url):
-        url = urlsplit (url)
-        hostname = url.hostname.split ('.')[::-1]
-        return hostname[:2] == self.hostname
-
 class JsOnload (Behavior):
     """ Execute JavaScript on page load """
@@ -237,16 +227,14 @@ class DomSnapshot (Behavior):
         dom = await tab.DOM.getDocument (depth=-1, pierce=True)
         haveUrls = set ()
         for doc in ChromeTreeWalker (dom['root']).split ():
-            rawUrl = doc['documentURL']
-            if rawUrl in haveUrls:
+            url = URL (doc['documentURL'])
+            if url in haveUrls:
                 # ignore duplicate URLs. they are usually caused by
                 # javascript-injected iframes (advertising) with no(?) src
-                self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
-                continue
-            url = urlsplit (rawUrl)
-            if url.scheme in ('http', 'https'):
+                self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (url))
+            elif url.scheme in ('http', 'https'):
                 self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
-                haveUrls.add (rawUrl)
+                haveUrls.add (url)
                 walker = ChromeTreeWalker (doc)
                 # remove script, to make the page static and noscript, because at the
                 # time we took the snapshot scripts were enabled
@@ -254,7 +242,7 @@ class DomSnapshot (Behavior):
                 disallowedAttributes = html.eventAttributes
                 stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
                 serializer = HTMLSerializer ()
-                yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
+                yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
 
 class ScreenshotEvent:
     __slots__ = ('yoff', 'data', 'url')
@@ -276,7 +264,7 @@ class Screenshot (Behavior):
 
         tree = await tab.Page.getFrameTree ()
         try:
-            url = removeFragment (tree['frameTree']['frame']['url'])
+            url = URL (tree['frameTree']['frame']['url']).with_fragment (None)
         except KeyError:
             self.logger.error ('frame without url', tree=tree)
             url = None
@@ -333,7 +321,7 @@ class ExtractLinks (Behavior):
         tab = self.loader.tab
         yield self.script
         result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
-        yield ExtractLinksEvent (list (set (result['result']['value'])))
+        yield ExtractLinksEvent (list (set (map (URL, result['result']['value']))))
 
 class Crash (Behavior):
     """ Crash the browser. For testing only. Obviously. """
diff --git a/crocoite/browser.py b/crocoite/browser.py
index c472746..1c7ac3b 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,9 +23,9 @@ Chrome browser interactions.
 """
 
 import asyncio
-from urllib.parse import urlsplit
 from base64 import b64decode
 from http.server import BaseHTTPRequestHandler
+from yarl import URL
 
 from .logger import Level
 from .devtools import Browser, TabException
@@ -72,11 +72,7 @@ class Item:
 
     @property
     def url (self):
-        return self.response.get ('url', self.request.get ('url'))
-
-    @property
-    def parsedUrl (self):
-        return urlsplit (self.url)
+        return URL (self.response.get ('url', self.request.get ('url')))
 
     @property
     def requestHeaders (self):
@@ -274,9 +270,9 @@ class SiteLoader:
     async def _requestWillBeSent (self, **kwargs):
         reqId = kwargs['requestId']
         req = kwargs['request']
-        logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
+        url = URL (req['url'])
+        logger = self.logger.bind (reqId=reqId, reqUrl=url)
 
-        url = urlsplit (req['url'])
         if url.scheme not in self.allowedSchemes:
             return
@@ -292,7 +288,7 @@ class SiteLoader:
                 resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
                 item.setFinished (resp)
                 item.isRedirect = True
-                logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url'])
+                logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
                 await item.prefetchRequestBody (self.tab)
                 # cannot fetch request body due to race condition (item id reused)
                 ret = item
@@ -313,8 +309,10 @@ class SiteLoader:
             return
 
         resp = kwargs['response']
-        logger = self.logger.bind (reqId=reqId, respUrl=resp['url'])
-        url = urlsplit (resp['url'])
+        url = URL (resp['url'])
+        logger = self.logger.bind (reqId=reqId, respUrl=url)
+        if item.url != url:
+            logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
         if url.scheme in self.allowedSchemes:
             logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
             item.setResponse (kwargs)
@@ -332,12 +330,8 @@ class SiteLoader:
             # we never recorded this request (blacklisted scheme, for example)
             return
         req = item.request
-        logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
-        resp = item.response
-        if req['url'] != resp['url']:
-            logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url'])
-        url = urlsplit (resp['url'])
-        if url.scheme in self.allowedSchemes:
+        logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
+        if item.url.scheme in self.allowedSchemes:
             logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
             item.setFinished (kwargs)
             await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
diff --git a/crocoite/cli.py b/crocoite/cli.py
index c3c41a4..b0ad53a 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -24,6 +24,7 @@ Command line interface
 
 import argparse, sys, signal, asyncio, os
 from enum import IntEnum
+from yarl import URL
 
 from . import behavior
 from .controller import SinglePageController, \
@@ -50,7 +51,7 @@ def single ():
             default=list (behavior.availableMap.keys ()),
             choices=list (behavior.availableMap.keys ()),
             metavar='NAME', nargs='*')
-    parser.add_argument('url', help='Website URL', metavar='URL')
+    parser.add_argument('url', help='Website URL', type=URL, metavar='URL')
     parser.add_argument('output', help='WARC filename', metavar='FILE')
 
     args = parser.parse_args ()
@@ -102,7 +103,7 @@ def recursive ():
     parser.add_argument('--tempdir', help='Directory for temporary files', metavar='DIR')
     parser.add_argument('--prefix', help='Output filename prefix, supports templates {host} and {date}', metavar='FILENAME', default='{host}-{date}-')
     parser.add_argument('--concurrency', '-j', help='Run at most N jobs', metavar='N', default=1, type=int)
-    parser.add_argument('url', help='Seed URL', metavar='URL')
+    parser.add_argument('url', help='Seed URL', type=URL, metavar='URL')
     parser.add_argument('output', help='Output directory', metavar='DIR')
     parser.add_argument('command', help='Fetch command, supports templates {url} and {dest}', metavar='CMD', nargs='*', default=['crocoite-grab', '{url}', '{dest}'])
diff --git a/crocoite/controller.py b/crocoite/controller.py
index f8b1420..c646a61 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -26,12 +26,11 @@ import time
 import tempfile, asyncio, json, os
 from itertools import islice
 from datetime import datetime
-from urllib.parse import urlparse
 from operator import attrgetter
 
 from . import behavior as cbehavior
 from .browser import SiteLoader, Item
-from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment
+from .util import getFormattedViewportMetrics, getSoftwareInfo
 from .behavior import ExtractLinksEvent
 
 class ControllerSettings:
@@ -316,12 +315,12 @@ class RecursiveController:
             return e.format (url=url, dest=dest.name)
 
         def formatPrefix (p):
-            return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+            return p.format (host=url.host, date=datetime.utcnow ().isoformat ())
 
         def logStats ():
             logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
 
-        if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+        if url.scheme not in self.SCHEME_WHITELIST:
             self.stats['ignored'] += 1
             logStats ()
             self.logger.warning ('scheme not whitelisted', url=url,
@@ -344,7 +343,7 @@ class RecursiveController:
             data = json.loads (data)
             uuid = data.get ('uuid')
             if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
-                links = set (self.policy (map (removeFragment, data.get ('links', []))))
+                links = set (self.policy (map (lambda x: x.with_fragment(None), data.get ('links', []))))
                 links.difference_update (self.have)
                 self.pending.update (links)
             elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
diff --git a/crocoite/devtools.py b/crocoite/devtools.py
index b071d2e..e62d4e0 100644
--- a/crocoite/devtools.py
+++ b/crocoite/devtools.py
@@ -27,6 +27,8 @@ from tempfile import mkdtemp
 import shutil
 import aiohttp, websockets
 
+from .util import StrJsonEncoder
+
 logger = logging.getLogger (__name__)
 
 class Browser:
@@ -155,7 +157,7 @@ class Tab:
         message = {'method': method, 'params': kwargs, 'id': msgid}
         t = self.transactions[msgid] = {'event': asyncio.Event (), 'result': None}
         logger.debug ('← {}'.format (message))
-        await self.ws.send (json.dumps (message))
+        await self.ws.send (json.dumps (message, cls=StrJsonEncoder))
         await t['event'].wait ()
         ret = t['result']
         del self.transactions[msgid]
diff --git a/crocoite/logger.py b/crocoite/logger.py
index cddc42d..7f4a4de 100644
--- a/crocoite/logger.py
+++ b/crocoite/logger.py
@@ -34,6 +34,8 @@ from enum import IntEnum
 
 from pytz import utc
 
+from .util import StrJsonEncoder
+
 class Level(IntEnum):
     DEBUG = 0
     INFO = 1
@@ -102,24 +104,13 @@ class PrintConsumer (Consumer):
         sys.stderr.flush ()
         return kwargs
 
-class JsonEncoder (json.JSONEncoder):
-    def default (self, obj):
-        if isinstance (obj, datetime):
-            return obj.isoformat ()
-
-        # make sure serialization always succeeds
-        try:
-            return json.JSONEncoder.default(self, obj)
-        except TypeError:
-            return str (obj)
-
 class JsonPrintConsumer (Consumer):
     def __init__ (self, minLevel=Level.INFO):
         self.minLevel = minLevel
 
     def __call__ (self, **kwargs):
         if kwargs['level'] >= self.minLevel:
-            json.dump (kwargs, sys.stdout, cls=JsonEncoder)
+            json.dump (kwargs, sys.stdout, cls=StrJsonEncoder)
             sys.stdout.write ('\n')
             sys.stdout.flush ()
         return kwargs
@@ -136,6 +127,6 @@ class WarcHandlerConsumer (Consumer):
 
     def __call__ (self, **kwargs):
         if kwargs['level'] >= self.minLevel:
-            self.warc._writeLog (json.dumps (kwargs, cls=JsonEncoder))
+            self.warc._writeLog (json.dumps (kwargs, cls=StrJsonEncoder))
         return kwargs
diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py
index 280b35b..0433918 100644
--- a/crocoite/test_behavior.py
+++ b/crocoite/test_behavior.py
@@ -19,9 +19,9 @@
 # THE SOFTWARE.
 
 import asyncio, os, yaml, re
-from urllib.parse import urlparse
 from functools import partial
 
 import pytest
+from yarl import URL
 import pkg_resources
 from .logger import Logger
@@ -87,12 +87,12 @@ matchParam = []
 for o in sites:
     for s in o['selector']:
         for u in s.get ('urls', []):
-            matchParam.append ((o['match'], u))
+            matchParam.append ((o['match'], URL (u)))
 
 @pytest.mark.parametrize("match,url", matchParam)
 @pytest.mark.asyncio
 async def test_click_match (match, url):
     """ Test urls must match """
-    host = urlparse (url).netloc
-    assert re.match (match, host, re.I)
+    # keep this aligned with click.js
+    assert re.match (match, url.host, re.I)
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 06492b1..5616fcf 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -92,7 +92,7 @@ testItems = [
     TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'),
     TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'),
     ]
-testItemMap = dict ([(item.parsedUrl.path, item) for item in testItems])
+testItemMap = dict ([(item.url.path, item) for item in testItems])
 
 def itemToResponse (item):
     async def f (req):
@@ -108,7 +108,7 @@ async def server ():
     logging.basicConfig(level=logging.DEBUG)
     app = web.Application(debug=True)
     for item in testItems:
-        app.router.add_route ('*', item.parsedUrl.path, itemToResponse (item))
+        app.router.add_route ('*', item.url.path, itemToResponse (item))
     runner = web.AppRunner(app)
     await runner.setup()
     site = web.TCPSite(runner, 'localhost', 8080)
@@ -137,10 +137,10 @@ async def loader (server, logger):
         yield f
 
 async def itemsLoaded (l, items):
-    items = dict ([(i.parsedUrl.path, i) for i in items])
+    items = dict ([(i.url.path, i) for i in items])
     async for item in l:
         assert item.chromeResponse is not None
-        golden = items.pop (item.parsedUrl.path)
+        golden = items.pop (item.url.path)
         if not golden:
             assert False, 'url {} not supposed to be fetched'.format (item.url)
         assert item.failed == golden.failed
@@ -167,7 +167,7 @@ async def itemsLoaded (l, items):
             break
 
 async def literalItem (lf, item, deps=[]):
-    async with lf (item.parsedUrl.path) as l:
+    async with lf (item.url.path) as l:
         await l.start ()
         await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30)
@@ -184,7 +184,7 @@ async def test_headers_duplicate (loader):
     async with loader ('/headers/duplicate') as l:
         await l.start ()
         async for it in l:
-            if it.parsedUrl.path == '/headers/duplicate':
+            if it.url.path == '/headers/duplicate':
                 assert not it.failed
                 dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders))
                 assert len(dup) == 2
@@ -200,7 +200,7 @@ async def test_headers_req (loader):
     async with loader ('/headers/fetch/html') as l:
         await l.start ()
         async for it in l:
-            if it.parsedUrl.path == '/headers/fetch/req':
+            if it.url.path == '/headers/fetch/req':
                 assert not it.failed
                 dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders))
                 assert len(dup) == 1
diff --git a/crocoite/util.py b/crocoite/util.py
index bd26909..73a1d65 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,9 +22,22 @@
 Random utility functions
 """
 
-import random, sys, platform
+import random, sys, platform, os, json
+from datetime import datetime
 import hashlib, pkg_resources
-from urllib.parse import urlsplit, urlunsplit
+
+class StrJsonEncoder (json.JSONEncoder):
+    """ JSON encoder that turns unknown classes into a string and thus never
+    fails """
+    def default (self, obj):
+        if isinstance (obj, datetime):
+            return obj.isoformat ()
+
+        # make sure serialization always succeeds
+        try:
+            return json.JSONEncoder.default(self, obj)
+        except TypeError:
+            return str (obj)
 
 def packageUrl (path):
     """
@@ -38,11 +51,6 @@ async def getFormattedViewportMetrics (tab):
     return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
                 layoutMetrics['layoutViewport']['clientHeight'])
 
-def removeFragment (u):
-    """ Remove fragment from url (i.e. #hashvalue) """
-    s = urlsplit (u)
-    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
-
 def getSoftwareInfo ():
     """ Get software info for inclusion into warcinfo """
     return {
diff --git a/crocoite/warc.py b/crocoite/warc.py
index ebc460d..21a99aa 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,14 +24,13 @@ Classes writing data to WARC files
 """
 
 import json, threading
 from io import BytesIO
-from urllib.parse import urlsplit
 from datetime import datetime
 
 from warcio.timeutils import datetime_to_iso_date
 from warcio.warcwriter import WARCWriter
 from warcio.statusandheaders import StatusAndHeaders
 
-from .util import packageUrl
+from .util import packageUrl, StrJsonEncoder
 from .controller import EventHandler, ControllerStart
 from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
 from .browser import Item
@@ -75,7 +74,7 @@ class WarcHandler (EventHandler):
         d.update (warc_headers_dict)
         warc_headers_dict = d
 
-        record = self.writer.create_warc_record (url, kind, payload=payload,
+        record = self.writer.create_warc_record (str (url), kind, payload=payload,
                 warc_headers_dict=warc_headers_dict, http_headers=http_headers)
         self.writer.write_record (record)
 
@@ -85,12 +84,9 @@ class WarcHandler (EventHandler):
         logger = self.logger.bind (reqId=item.id)
 
         req = item.request
-        resp = item.response
-        url = urlsplit (resp['url'])
+        url = item.url
 
-        path = url.path
-        if url.query:
-            path += '?' + url.query
+        path = url.relative().with_fragment(None)
         httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
                 item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
         initiator = item.initiator
@@ -111,7 +107,7 @@ class WarcHandler (EventHandler):
         if payload:
             payload = BytesIO (payload)
             warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
-        record = self.writeRecord (req['url'], 'request',
+        record = self.writeRecord (url, 'request',
                 payload=payload, http_headers=httpHeaders,
                 warc_headers_dict=warcHeaders)
         return record.rec_headers['WARC-Record-ID']
@@ -172,7 +168,7 @@ class WarcHandler (EventHandler):
         else:
             bodyIo = BytesIO ()
 
-        record = self.writeRecord (resp['url'], 'response',
+        record = self.writeRecord (item.url, 'response',
                 warc_headers_dict=warcHeaders, payload=bodyIo,
                 http_headers=httpHeaders)
 
@@ -225,7 +221,7 @@ class WarcHandler (EventHandler):
                 payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
 
     def _writeControllerStart (self, item):
-        payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+        payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode ('utf-8'))
         writer = self.writer
         warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo',
```
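
With URL objects (and datetimes) now flowing through log records, devtools messages, and the warcinfo payload, JSON serialization goes through the StrJsonEncoder moved into crocoite/util.py above. A small usage sketch; the record dict is made up for illustration:

```python
import json
from datetime import datetime

from yarl import URL
from crocoite.util import StrJsonEncoder  # added by this commit

# Hypothetical log record mixing JSON-native values with richer objects:
record = {
    'level': 'info',
    'time': datetime(2018, 12, 18, 12, 34, 25),
    'url': URL('https://example.com/#top'),
}

# Plain json.dumps(record) would raise TypeError, since neither datetime
# nor yarl.URL is JSON-serializable. StrJsonEncoder renders datetimes as
# ISO 8601 strings and falls back to str() for everything else:
print(json.dumps(record, cls=StrJsonEncoder))
# {"level": "info", "time": "2018-12-18T12:34:25", "url": "https://example.com/#top"}
```

This is why the commit can pass URL objects everywhere without sprinkling str() conversions at each serialization site; only the WARC writer, whose API expects a string target URI, converts explicitly with str (url).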
