45 files changed, 5816 insertions, 1686 deletions
@@ -1,3 +1,7 @@ __pycache__ *.sw? *.egg-info/ +.pytest_cache/ +coverage.xml +htmlcov +.coverage diff --git a/.travis.yml b/.travis.yml index e09e518..b1d417c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,14 +1,23 @@ -dist: trusty +dist: xenial language: python -python: - - "3.4" - - "3.5" - - "3.6" - - "pypy3.5" +matrix: + include: + - python: "3.6" + - python: "3.7" + - python: "3.8" + - python: "3.6-dev" + - python: "3.7-dev" + - python: "3.8-dev" + allow_failures: + - python: "3.6-dev" + - python: "3.7-dev" + - python: "3.8-dev" install: - pip install . script: - - python -m unittest crocoite.browser + - python setup.py test addons: chrome: stable sudo: required +after_success: + - bash <(curl -s https://codecov.io/bash) @@ -1,134 +1,15 @@ crocoite ======== -Archive websites using `headless Google Chrome`_ and its DevTools protocol. +.. code:: bash -.. image:: https://travis-ci.org/PromyLOPh/crocoite.svg?branch=master - :target: https://travis-ci.org/PromyLOPh/crocoite - -.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome - -Dependencies ------------- - -- Python 3 -- pychrome_ -- warcio_ -- html5lib_ -- Celery_ - -.. _pychrome: https://github.com/fate0/pychrome -.. _warcio: https://github.com/webrecorder/warcio -.. _html5lib: https://github.com/html5lib/html5lib-python -.. _Celery: http://www.celeryproject.org/ - -Usage ------ - -One-shot commandline interface and pywb_ playback:: - - crocoite-grab --output example.com.warc.gz http://example.com/ - rm -rf collections && wb-manager init test && wb-manager add test example.com.warc.gz + pip install crocoite pywb + crocoite http://example.com/ example.com.warc.gz + wb-manager init test && wb-manager add test example.com.warc.gz wayback & $BROWSER http://localhost:8080 -.. _pywb: https://github.com/ikreymer/pywb - -Behavior scripts -^^^^^^^^^^^^^^^^ - -A lot of sites need some form of interaction to dynamically load more content. Twitter for -instance continously loads new posts when scrolling to the bottom of the page. -crocoite can emulate these user interactions (and more) by combining control -code written in Python and injecting JavaScript into the page. The code can be -limited to certain URLs or apply to every page loaded. By default all scripts -available are enabled, see command line flag ``--behavior``. - -Caveats -------- - -- Original HTTP requests/responses are not available. They are rebuilt from - parsed data. Character encoding for text documents is changed to UTF-8. -- Some sites request assets based on screen resolution, pixel ratio and - supported image formats (webp). Replaying those with different parameters - won’t work, since assets for those are missing. Example: missguided.com. -- Some fetch different scripts based on user agent. Example: youtube.com. -- Requests containing randomly generated JavaScript callback function names - won’t work. Example: weather.com. -- Range requests (Range: bytes=1-100) are captured as-is, making playback - difficult -- Content body of HTTP redirects cannot be retrived due to race condition - -Most of these issues can be worked around by using the DOM snapshot, which is -also saved. This causes its own set of issues though: - -- JavaScript-based navigation does not work. - -Distributed crawling --------------------- - -Configure using celeryconfig.py - -.. 
code:: python - - broker_url = 'pyamqp://' - result_backend = 'rpc://' - warc_filename = '{domain}-{date}-{id}.warc.gz' - temp_dir = '/tmp/' - finished_dir = '/tmp/finished' - -Start a Celery worker:: - - celery -A crocoite.task worker -Q crocoite.archive,crocoite.controller --loglevel=info - -Then queue archive job:: - - crocoite-grab --distributed http://example.com - -The worker will create a temporary file named according to ``warc_filename`` in -``/tmp`` while archiving and move it to ``/tmp/finished`` when done. - -IRC bot -^^^^^^^ - -Configure sopel_ (``~/.sopel/default.cfg``) to use the plugin located in -``contrib/celerycrocoite.py`` - -.. code:: ini - - [core] - nick = chromebot - host = irc.efnet.fr - port = 6667 - owner = someone - extra = /path/to/crocoite/contrib - enable = celerycrocoite - channels = #somechannel - -Then start it by running ``sopel``. The bot must be addressed directly (i.e. -``chromebot: <command>``). The following commands are currently supported: - -a <url> - Archives <url> and all of its resources (images, css, …). A unique UID - (UUID) is assigned to each job. -s <uuid> - Get status of job with <uuid> -r <uuid> - Revoke job with <uuid>. If it started already the job will be killed. - -.. _sopel: https://sopel.chat/ - -Related projects ----------------- - -brozzler_ - Uses Google Chrome as well, but intercepts traffic using a proxy. Supports - distributed crawling and immediate playback. -Squidwarc_ - Communicates with headless Google Chrome and uses the Network API to - retrieve requests like crocoite. Supports recursive crawls and page - scrolling, but neither custom JavaScript nor distributed crawling. +See documentation_ for more information. -.. _brozzler: https://github.com/internetarchive/brozzler -.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc +.. _documentation: https://6xq.net/crocoite/ diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py deleted file mode 100644 index 26c35ce..0000000 --- a/contrib/celerycrocoite.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright (c) 2017 crocoite contributors -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
- -""" -Module for Sopel IRC bot -""" - -import os, logging, argparse -from sopel.module import nickname_commands, require_chanmsg, thread, example, require_privilege, VOICE -from sopel.tools import Identifier, SopelMemory -import celery, celery.exceptions -from celery.result import AsyncResult -from urllib.parse import urlsplit -from threading import Thread -from queue import Queue -import queue - -from crocoite import behavior, task -from crocoite.controller import defaultSettings - -def prettyTimeDelta (seconds): - """ - Pretty-print seconds to human readable string 1d 1h 1m 1s - """ - seconds = int(seconds) - days, seconds = divmod(seconds, 86400) - hours, seconds = divmod(seconds, 3600) - minutes, seconds = divmod(seconds, 60) - s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')] - s = filter (lambda x: x[0] != 0, s) - return ' '.join (map (lambda x: '{}{}'.format (*x), s)) - -def prettyBytes (b): - """ - Pretty-print bytes - """ - prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB'] - while b >= 1024 and len (prefixes) > 1: - b /= 1024 - prefixes.pop (0) - return '{:.1f} {}'.format (b, prefixes[0]) - -def setup (bot): - m = bot.memory['crocoite'] = {} - q = m['q'] = Queue () - t = m['t'] = Thread (target=celeryWorker, args=(bot, q)) - t.start () - -def shutdown (bot): - m = bot.memory['crocoite'] - q = m['q'] - t = m['t'] - q.put_nowait (None) - t.join () - -def isValidUrl (s): - url = urlsplit (s) - if url.scheme and url.netloc and url.scheme in {'http', 'https'}: - return s - raise TypeError () - -def checkCompletedJobs (bot, jobs): - delete = set () - for i, data in jobs.items (): - handle = data['handle'] - trigger = data['trigger'] - args = data['args'] - url = args['url'] - channel = trigger.sender - user = trigger.nick - if Identifier (channel) not in bot.channels: - continue - try: - result = handle.get (timeout=0.1) - stats = result['stats'] - bot.msg (channel, '{}: {} ({}) finished. {} requests, {} failed, {} received.'.format (user, url, - handle.id, stats['requests'], stats['failed'], - prettyBytes (stats['bytesRcv']))) - delete.add (handle.id) - except celery.exceptions.TimeoutError: - pass - except Exception as e: - # json serialization does not work well with exceptions. If their class - # names are unique we can still distinguish them. - ename = type (e).__name__ - if ename == 'TaskRevokedError': - bot.msg (channel, '{}: {} ({}) was revoked'.format (user, url, handle.id)) - else: - bot.msg (channel, '{} ({}) failed'.format (user, url, handle.id)) - logging.exception ('{} ({}) failed'.format (url, handle.id)) - delete.add (handle.id) - for d in delete: - del jobs[d] - -def celeryWorker (bot, q): - """ - Serialize celery operations in a single thread. 
This is a workaround for - https://github.com/celery/celery/issues/4480 - """ - - jobs = {} - - while True: - try: - item = q.get (timeout=1) - except queue.Empty: - checkCompletedJobs (bot, jobs) - continue - - if item is None: - break - action, trigger, args = item - if action == 'a': - handle = task.controller.delay (**args) - j = jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'args': args} - - # pretty-print a few selected args - showargs = { - 'idleTimeout': prettyTimeDelta (args['settings']['idleTimeout']), - 'timeout': prettyTimeDelta (args['settings']['timeout']), - 'maxBodySize': prettyBytes (args['settings']['maxBodySize']), - 'recursive': args['recursive'], - 'concurrency': args['concurrency'], - } - strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) - bot.msg (trigger.sender, '{}: {} has been queued as {} with {}'.format (trigger.nick, args['url'], handle.id, strargs)) - elif action == 'status': - if args and args in jobs: - j = jobs[args] - jtrigger = j['trigger'] - handle = j['handle'] - bot.msg (trigger.sender, '{}: {}, queued {}, by {}'.format (handle.id, - handle.status, jtrigger.time, jtrigger.nick)) - else: - bot.msg (trigger.sender, "Job not found.") - elif action == 'revoke': - if args and args in jobs: - j = jobs[args] - handle = j['handle'] - handle.revoke (terminate=True) - # response is handled above - else: - bot.msg (trigger.sender, "Job not found.") - q.task_done () - -class NonExitingArgumentParser (argparse.ArgumentParser): - def exit (self, status=0, message=None): - # should never be called - pass - - def error (self, message): - raise Exception (message) - -archiveparser = NonExitingArgumentParser (prog='a', add_help=False) -archiveparser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC', choices=[60, 1*60*60, 2*60*60]) -archiveparser.add_argument('--idle-timeout', default=10, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC', choices=[1, 10, 20, 30, 60])
-archiveparser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, defaultSettings.maxBodySize, 100*1024*1024])
-archiveparser.add_argument('--concurrency', default=1, type=int, help='Parallel workers for this job', choices=range (9))
-archiveparser.add_argument('--recursive', help='Enable recursion', choices=['0', '1', '2', '3', 'prefix'])
-archiveparser.add_argument('url', help='Website URL', type=isValidUrl)
-
-@nickname_commands ('a', 'archive')
-@require_chanmsg ()
-@require_privilege (VOICE)
-@example ('a http://example.com')
-def archive (bot, trigger):
-    """
-    Archive a URL to WARC
-    """
-
-    try:
-        args = archiveparser.parse_args (trigger.group (2).split ())
-    except Exception as e:
-        bot.reply ('{} -- {}'.format (e.args[0], archiveparser.format_usage ()))
-        return
-    if not args:
-        bot.reply ('Sorry, I don’t understand {}'.format (trigger.group (2)))
-        return
-    blacklistedBehavior = {'domSnapshot', 'screenshot'}
-    settings = dict (maxBodySize=args.maxBodySize,
-            logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout,
-            timeout=args.timeout)
-    args = dict (url=args.url,
-            enabledBehaviorNames=list (behavior.availableNames-blacklistedBehavior),
-            settings=settings, recursive=args.recursive,
-            concurrency=args.concurrency)
-    q = bot.memory['crocoite']['q']
-    q.put_nowait (('a', trigger, args))
-
-@nickname_commands ('s', 'status')
-@example ('s c251f09e-3c26-481f-96e0-4b5f58bd1170')
-@require_chanmsg ()
-def status (bot, trigger):
-    """
-    Retrieve status for a job
-    """
-
-    i = trigger.group(2)
-    q = bot.memory['crocoite']['q']
-    q.put_nowait (('status', trigger, i))
-
-@nickname_commands ('r', 'revoke')
-@example ('r c251f09e-3c26-481f-96e0-4b5f58bd1170')
-@require_privilege (VOICE)
-@require_chanmsg ()
-def revoke (bot, trigger):
-    """
-    Cancel (revoke) a job
-    """
-
-    i = trigger.group(2)
-    q = bot.memory['crocoite']['q']
-    q.put_nowait (('revoke', trigger, i))
-
diff --git a/contrib/chromebot.json b/contrib/chromebot.json
new file mode 100644
index 0000000..214b770
--- /dev/null
+++ b/contrib/chromebot.json
@@ -0,0 +1,16 @@
+{
+    "irc": {
+        "host": "irc.example.com",
+        "port": 6667,
+        "ssl": false,
+        "nick": "chromebot",
+        "channels": ["#testchannel"]
+    },
+    "tempdir": "/path/to/tmp",
+    "destdir": "/path/to/warc",
+    "process_limit": 1,
+    "blacklist": {
+        "^https?://(.+\\.)?local(host)?/": "Not acceptable"
+    },
+    "need_voice": false
+}
diff --git a/contrib/dashboard.css b/contrib/dashboard.css
new file mode 100644
index 0000000..469db78
--- /dev/null
+++ b/contrib/dashboard.css
@@ -0,0 +1,7 @@
+.jid {
+    font-family: Consolas, monospace;
+}
+.job .urls {
+    max-height: 10em;
+    overflow-y: scroll;
+}
diff --git a/contrib/dashboard.html b/contrib/dashboard.html
new file mode 100644
index 0000000..49a15bc
--- /dev/null
+++ b/contrib/dashboard.html
@@ -0,0 +1,23 @@
+<!DOCTYPE html>
+<html lang="en">
+    <head>
+        <meta charset="utf-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
+        <title>chromebot dashboard</title>
+        <!--<script src="https://cdn.jsdelivr.net/npm/vue@2/dist/vue.js"></script>-->
+        <script src="https://cdn.jsdelivr.net/npm/vue@2/dist/vue.min.js"></script>
+        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.7/css/bulma.min.css">
+        <link rel="stylesheet" href="dashboard.css">
+    </head>
+    <body>
+        <noscript>Please 
enable JavaScript.</noscript> + <section id="app" class="section"> + <h1 class="title">chromebot dashboard</h1> + <bot-status v-bind:jobs="jobs"></bot-status> + <div class="jobs"> + <job-item v-for="j in jobs" v-bind:job="j" v-bind:jobs="jobs" v-bind:key="j.id"></job-item> + </div> + </section> + <script src="dashboard.js"></script> + </body> +</html> diff --git a/contrib/dashboard.js b/contrib/dashboard.js new file mode 100644 index 0000000..b5520dc --- /dev/null +++ b/contrib/dashboard.js @@ -0,0 +1,129 @@ +/* configuration */ +let socket = "wss://localhost:6789/", + urllogMax = 100; + +function formatSize (bytes) { + let prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB']; + while (bytes >= 1024 && prefixes.length > 1) { + bytes /= 1024; + prefixes.shift (); + } + return bytes.toFixed (1) + ' ' + prefixes[0]; +} + +class Job { + constructor (id, url, user, queued) { + this.id = id; + this.url = url; + this.user = user; + this.status = undefined; + this.stats = {'pending': 0, 'have': 0, 'running': 0, + 'requests': 0, 'finished': 0, 'failed': 0, + 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}; + this.urllog = []; + this.queued = queued; + this.started = undefined; + this.finished = undefined; + this.aborted = undefined; + } + + addUrl (url) { + if (this.urllog.push (url) > urllogMax) { + this.urllog.shift (); + } + } +} + +let jobs = {}; +let ws = new WebSocket(socket); +ws.onmessage = function (event) { + var msg = JSON.parse (event.data); + let msgdate = new Date (Date.parse (msg.date)); + var j = undefined; + if (msg.job) { + j = jobs[msg.job]; + if (j === undefined) { + j = new Job (msg.job, 'unknown', '<unknown>', new Date ()); + Vue.set (jobs, msg.job, j); + } + } + if (msg.uuid == '36cc34a6-061b-4cc5-84a9-4ab6552c8d75') { + j = new Job (msg.job, msg.url, msg.user, msgdate); + /* jobs[msg.job] = j does not work with vue, see + https://vuejs.org/v2/guide/list.html#Object-Change-Detection-Caveats + */ + Vue.set (jobs, msg.job, j); + j.status = 'pending'; + } else if (msg.uuid == '46e62d60-f498-4ab0-90e1-d08a073b10fb') { + j.status = 'running'; + j.started = msgdate; + } else if (msg.uuid == '7b40ffbb-faab-4224-90ed-cd4febd8f7ec') { + j.status = 'finished'; + j.finished = msgdate; + } else if (msg.uuid == '865b3b3e-a54a-4a56-a545-f38a37bac295') { + j.status = 'aborted'; + j.aborted = msgdate; + } else if (msg.uuid == '5c0f9a11-dcd8-4182-a60f-54f4d3ab3687') { + /* forwarded job message */ + let rmsg = msg.data; + if (rmsg.uuid == '24d92d16-770e-4088-b769-4020e127a7ff') { + /* job status */ + Object.assign (j.stats, rmsg); + } else if (rmsg.uuid == '5b8498e4-868d-413c-a67e-004516b8452c') { + /* recursion status */ + Object.assign (j.stats, rmsg); + } else if (rmsg.uuid == 'd1288fbe-8bae-42c8-af8c-f2fa8b41794f') { + /* fetch */ + j.addUrl (rmsg.url); + } + } +}; +ws.onopen = function (event) { +}; +ws.onerror = function (event) { +}; + +Vue.component('job-item', { + props: ['job', 'jobs'], + template: '<div class="job box" :id="job.id"><ul class="columns"><li class="jid column is-narrow"><a :href="\'#\' + job.id">{{ job.id }}</a></li><li class="url column"><a :href="job.url">{{ job.url }}</a></li><li class="status column is-narrow"><job-status v-bind:job="job"></job-status></li></ul><job-stats v-bind:job="job"></job-stats></div>', +}); +Vue.component('job-status', { + props: ['job'], + template: '<span v-if="job.status == \'pending\'">queued on {{ job.queued.toLocaleString() }}</span><span v-else-if="job.status == \'aborted\'">aborted on {{ job.aborted.toLocaleString() }}</span><span 
v-else-if="job.status == \'running\'">running since {{ job.started.toLocaleString() }}</span><span v-else-if="job.status == \'finished\'">finished on {{ job.finished.toLocaleString() }}</span>'
+});
+Vue.component('job-stats', {
+    props: ['job'],
+    template: '<div><progress class="progress is-info" :value="job.stats.have" :max="job.stats.have+job.stats.pending+job.stats.running"></progress><ul class="stats columns"><li class="column">{{ job.stats.have }} <small>have</small></li><li class="column">{{ job.stats.running }} <small>running</small></li><li class="column">{{ job.stats.pending }} <small>pending</small></li><li class="column">{{ job.stats.requests }} <small>requests</small><li class="column"><filesize v-bind:value="job.stats.bytesRcv"></filesize></li></ul><job-urls v-bind:job="job"></job-urls></div>'
+});
+Vue.component('job-urls', {
+    props: ['job'],
+    template: '<ul class="urls"><li v-for="u in job.urllog">{{ u }}</li></ul>'
+});
+Vue.component('filesize', {
+    props: ['value'],
+    template: '<span class="filesize">{{ fvalue }}</span>',
+    computed: { fvalue: function () { return formatSize (this.value); } }
+});
+Vue.component('bot-status', {
+    props: ['jobs'],
+    template: '<nav class="level"><div class="level-item has-text-centered"><div><p class="heading">Pending</p><p class="title">{{ stats.pending }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Running</p><p class="title">{{ stats.running }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Finished</p><p class="title">{{ stats.finished+stats.aborted }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Transferred</p><p class="title"><filesize v-bind:value="stats.totalBytes"></filesize></p></div></div></nav>',
+    computed: {
+        stats: function () {
+            let s = {pending: 0, running: 0, finished: 0, aborted: 0, totalBytes: 0};
+            for (let k in this.jobs) {
+                let j = this.jobs[k];
+                s[j.status]++;
+                s.totalBytes += j.stats.bytesRcv;
+            }
+            return s;
+        }
+    }
+});
+
+let app = new Vue({
+    el: '#app',
+    data: {
+        jobs: jobs,
+    }
+});
+
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 95e8160..1610751 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 crocoite contributors
+# Copyright (c) 2017–2018 crocoite contributors
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -19,31 +19,74 @@
 # THE SOFTWARE.
 
 """
-Generic and per-site behavior scripts
+Behavior scripts (i.e. subclasses of Behavior) are a powerful method to
+manipulate websites loaded into Chrome. They are executed by the controller
+after the page started loading (onload), after it has been idle for a while
+(onstop) and after loading was stopped (onfinish).
+
+The scripts exercise their power either through DevTools API calls or by
+injecting JavaScript into the page context. Thus they can manipulate both the
+browser itself (DevTools; modify resolution, get DOM snapshot) as well as the
+page (JavaScript; trigger JavaScript events, call web APIs).
+
+They also emit (yield) data processable by any consumer registered to the
+controller. This allows storing captured screenshots inside WARC files, for
+instance.
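
A minimal behavior sketch (an editor's illustration, not a script shipped
with crocoite; it assumes the Behavior base class defined below and yields a
plain string, whereas the bundled behaviors yield dedicated event objects):

.. code:: python

    class LogTitle (Behavior):
        # hypothetical example behavior
        name = 'logTitle'

        async def onfinish (self):
            # run JavaScript inside the page and yield the result as an
            # event for whatever consumer the controller has registered
            result = await self.loader.tab.Runtime.evaluate (
                    expression='document.title', returnByValue=True)
            yield result['result']['value']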
""" -import logging -from io import BytesIO -from urllib.parse import urlsplit -import os.path -import pkg_resources +import asyncio, json, os.path from base64 import b64decode +from collections import OrderedDict +import pkg_resources + +from html5lib.serializer import HTMLSerializer +from yarl import URL +import yaml -from .util import randomString, packageUrl, getFormattedViewportMetrics +from .util import getFormattedViewportMetrics from . import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -from html5lib.serializer import HTMLSerializer -from warcio.statusandheaders import StatusAndHeaders +from .devtools import Crashed, TabException + +class Script: + """ A JavaScript resource """ -logger = logging.getLogger(__name__) + __slots__ = ('path', 'data') + datadir = 'data' + + def __init__ (self, path=None, encoding='utf-8'): + self.path = path + if path: + self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding) + + def __repr__ (self): + return f'<Script {self.path}>' + + def __str__ (self): + return self.data + + @property + def abspath (self): + return pkg_resources.resource_filename (__name__, + os.path.join (self.datadir, self.path)) + + @classmethod + def fromStr (cls, data, path=None): + s = Script () + s.data = data + s.path = path + return s class Behavior: + __slots__ = ('loader', 'logger') + # unique behavior name name = None - def __init__ (self, loader): + def __init__ (self, loader, logger): assert self.name is not None self.loader = loader + self.logger = logger.bind (context=type (self).__name__) def __contains__ (self, url): """ @@ -51,54 +94,97 @@ class Behavior: """ return True - def loadScript (self, path, encoding='utf-8'): - return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) + def __repr__ (self): + return f'<Behavior {self.name}>' - def useScript (self, script, encoding='utf-8'): - writer = self.loader.writer - record = writer.create_warc_record (packageUrl ('script'), 'metadata', - payload=BytesIO (script.encode (encoding)), - warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)}) - writer.write_record (record) + async def onload (self): + """ After loading the page started """ + # this is a dirty hack to make this function an async generator + return + yield # pragma: no cover - def onload (self): - """ Before loading the page """ - pass - - def onstop (self): + async def onstop (self): """ Before page loading is stopped """ - pass + return + yield # pragma: no cover - def onfinish (self): + async def onfinish (self): """ After the site has stopped loading """ - pass - -class HostnameFilter: - """ Limit behavior script to hostname """ - - hostname = None - - def __contains__ (self, url): - url = urlsplit (url) - hostname = url.hostname.split ('.')[::-1] - return hostname[:2] == self.hostname + return + yield # pragma: no cover class JsOnload (Behavior): """ Execute JavaScript on page load """ - scriptPath = None + __slots__ = ('script', 'context', 'options') - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript (self.scriptPath) - self.scriptHandle = None + scriptPath = None - def onload (self): - self.useScript (self.script) - self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier'] + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script (self.scriptPath) + self.context = 
None + # options passed to constructor + self.options = {} - def onstop (self): - self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle) + async def onload (self): + tab = self.loader.tab + yield self.script + + # This is slightly awkward, since we cannot compile the class into an + # objectId and then reference it. Therefore the script must return a + # class constructor, which is then called with a generic options + # parameter. + # XXX: is there a better way to do this? + result = await tab.Runtime.evaluate (expression=str (self.script)) + self.logger.debug ('behavior onload inject', + uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result) + exception = result.get ('exceptionDetails', None) + result = result['result'] + assert result['type'] == 'function', result + assert result.get ('subtype') != 'error', exception + constructor = result['objectId'] + + if self.options: + yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options') + + try: + result = await tab.Runtime.callFunctionOn ( + functionDeclaration='function(options){return new this(options);}', + objectId=constructor, + arguments=[{'value': self.options}]) + self.logger.debug ('behavior onload start', + uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result) + result = result['result'] + assert result['type'] == 'object', result + assert result.get ('subtype') != 'error', result + self.context = result['objectId'] + except TabException as e: + if e.args[0] == -32000: + # the site probably reloaded. ignore this, since we’ll be + # re-injected into the new site by the controller. + self.logger.error ('jsonload onload failed', + uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252', + exception=e.args) + else: + raise + + async def onstop (self): + tab = self.loader.tab + try: + assert self.context is not None + await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}', + objectId=self.context) + await tab.Runtime.releaseObject (objectId=self.context) + except TabException as e: + # cannot do anything about that. Ignoring should be fine. + self.logger.error ('jsonload onstop failed', + uuid='1786726f-c8ec-4f79-8769-30954d4e32f5', + exception=e.args, + objectId=self.context) + + return + yield # pragma: no cover ### Generic scripts ### @@ -106,24 +192,10 @@ class Scroll (JsOnload): name = 'scroll' scriptPath = 'scroll.js' - def __init__ (self, loader): - super ().__init__ (loader) - stopVarname = '__' + __package__ + '_stop__' - newStopVarname = randomString () - self.script = self.script.replace (stopVarname, newStopVarname) - self.stopVarname = newStopVarname - - def onstop (self): - super ().onstop () - # removing the script does not stop it if running - script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname) - self.useScript (script) - self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True) - class EmulateScreenMetrics (Behavior): name = 'emulateScreenMetrics' - def onstop (self): + async def onstop (self): """ Emulate different screen sizes, causing the site to fetch assets (img srcset and css, for example) for different screen resolutions. 
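
To make the JsOnload contract above concrete: the injected script's last
expression must evaluate to a class, which JsOnload instantiates via
Runtime.callFunctionOn and later asks to stop(). A sketch of such a script
(hypothetical, not one of the bundled data/*.js files; Script.fromStr is the
constructor defined above):

.. code:: python

    # Script.fromStr wraps a literal string instead of a packaged file
    exampleJs = Script.fromStr ('''
    (class {
        constructor (options) {
            /* install event listeners, timers, … */
        }
        stop () {
            /* undo everything the constructor did */
        }
    })
    ''', path='example.js')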
@@ -139,17 +211,32 @@ class EmulateScreenMetrics (Behavior): {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, # 6th gen iPhone (portrait mode) {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, - # and reset - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, ] l = self.loader tab = l.tab for s in sizes: - tab.Emulation.setDeviceMetricsOverride (**s) + self.logger.debug ('device override', + uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s) + await tab.Emulation.setDeviceMetricsOverride (**s) # give the browser time to re-eval page and start requests - l.wait (1) - # XXX: this seems to be broken, it does not clear the override - #tab.Emulation.clearDeviceMetricsOverride () + # XXX: should wait until loader is not busy any more + await asyncio.sleep (1) + self.logger.debug ('clear override', + uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d') + await tab.Emulation.clearDeviceMetricsOverride () + return + yield # pragma: no cover + +class DomSnapshotEvent: + __slots__ = ('url', 'document', 'viewport') + + def __init__ (self, url, document, viewport): + # XXX: document encoding? + assert isinstance (document, bytes) + + self.url = url + self.document = document + self.viewport = viewport class DomSnapshot (Behavior): """ @@ -157,38 +244,39 @@ class DomSnapshot (Behavior): We could use DOMSnapshot.getSnapshot here, but the API is not stable yet. Also computed styles are not really necessary here. - - XXX: Currently writes a response, when it should use “resource”. pywb - can’t handle that though. """ + __slots__ = ('script', ) + name = 'domSnapshot' - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript ('canvas-snapshot.js') + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('canvas-snapshot.js') - def onfinish (self): + async def onfinish (self): tab = self.loader.tab - writer = self.loader.writer - self.useScript (self.script) - tab.Runtime.evaluate (expression=self.script, returnByValue=True) + yield self.script + await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) - viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1, pierce=True) + viewport = await getFormattedViewportMetrics (tab) + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + self.logger.debug ('dom snapshot document', + uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom) haveUrls = set () for doc in ChromeTreeWalker (dom['root']).split (): - rawUrl = doc['documentURL'] - if rawUrl in haveUrls: + url = URL (doc['documentURL']) + if url in haveUrls: # ignore duplicate URLs. they are usually caused by # javascript-injected iframes (advertising) with no(?) 
src
-            logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
-            continue
-        url = urlsplit (rawUrl)
-        if url.scheme in ('http', 'https'):
-            logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
-            haveUrls.add (rawUrl)
+                self.logger.warning ('dom snapshot duplicate',
+                        uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
+            elif url.scheme in ('http', 'https'):
+                self.logger.debug ('dom snapshot',
+                        uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4',
+                        base=doc["baseURL"])
+                haveUrls.add (url)
             walker = ChromeTreeWalker (doc)
             # remove script, to make the page static and noscript, because at the
             # time we took the snapshot scripts were enabled
@@ -196,41 +284,89 @@ class DomSnapshot (Behavior):
             disallowedAttributes = html.eventAttributes
             stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
             serializer = HTMLSerializer ()
-            httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
-            record = writer.create_warc_record (doc['documentURL'], 'response',
-                    payload=BytesIO (serializer.render (stream, 'utf-8')),
-                    http_headers=httpHeaders,
-                    warc_headers_dict={'X-DOM-Snapshot': str (True),
-                            'X-Chrome-Viewport': viewport})
-            writer.write_record (record)
+            yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
+
+class ScreenshotEvent:
+    __slots__ = ('yoff', 'data', 'url')
+
+    def __init__ (self, url, yoff, data):
+        self.url = url
+        self.yoff = yoff
+        self.data = data
 
 class Screenshot (Behavior):
     """
     Create screenshot from tab and write it to WARC
+
+    Chrome will allocate an additional 512MB of RAM when using this plugin.
     """
 
+    __slots__ = ('script', )
+
     name = 'screenshot'
 
-    def onfinish (self):
+    # Hardcoded max texture size of 16,384 (crbug.com/770769)
+    maxDim = 16*1024
+
+    def __init__ (self, loader, logger):
+        super ().__init__ (loader, logger)
+        self.script = Script ('screenshot.js')
+
+    async def onfinish (self):
         tab = self.loader.tab
-        writer = self.loader.writer
-        # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
-        # Hardcoded max texture size of 16,384 (crbug.com/770769)
-        maxDim = 16*1024
-        metrics = tab.Page.getLayoutMetrics ()
+        # for top-level/full-screen elements with position: fixed we need to
+        # figure out their actual size (i.e. scrollHeight) and use that when
+        # overriding the viewport size.
+        # we could do this without javascript, but that would require several
+        # round-trips to Chrome or pulling down the entire DOM+computed styles
+        tab = self.loader.tab
+        yield self.script
+        result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+        assert result['result']['type'] == 'object', result
+        result = result['result']['value']
+
+        # this is required to make the browser render more than just the small
+        # actual viewport (i.e. entire page). 
see + # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805 + metrics = await tab.Page.getLayoutMetrics () contentSize = metrics['contentSize'] - width = min (contentSize['width'], maxDim) + contentHeight = max (result + [contentSize['height']]) + + override = { + 'width': 0, + 'height': 0, + 'deviceScaleFactor': 0, + 'mobile': False, + 'viewport': {'x': 0, + 'y': 0, + 'width': contentSize['width'], + 'height': contentHeight, + 'scale': 1} + } + self.logger.debug ('screenshot override', + uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override) + await tab.Emulation.setDeviceMetricsOverride (**override) + + tree = await tab.Page.getFrameTree () + try: + url = URL (tree['frameTree']['frame']['url']).with_fragment (None) + except KeyError: + self.logger.error ('frame without url', + uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree) + url = None + + width = min (contentSize['width'], self.maxDim) # we’re ignoring horizontal scroll intentionally. Most horizontal # layouts use JavaScript scrolling and don’t extend the viewport. - for yoff in range (0, contentSize['height'], maxDim): - height = min (contentSize['height'] - yoff, maxDim) + for yoff in range (0, contentHeight, self.maxDim): + height = min (contentHeight - yoff, self.maxDim) clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1} - data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data']) - url = packageUrl ('screenshot-{}-{}.png'.format (0, yoff)) - record = writer.create_warc_record (url, 'resource', - payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png'}) - writer.write_record (record) + ret = await tab.Page.captureScreenshot (format='png', clip=clip) + data = b64decode (ret['data']) + yield ScreenshotEvent (url, yoff, data) + + await tab.Emulation.clearDeviceMetricsOverride () class Click (JsOnload): """ Generic link clicking """ @@ -238,6 +374,27 @@ class Click (JsOnload): name = 'click' scriptPath = 'click.js' + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: + self.options['sites'] = list (yaml.safe_load_all (fd)) + +class ExtractLinksEvent: + __slots__ = ('links', ) + + def __init__ (self, links): + self.links = links + + def __repr__ (self): + return f'<ExtractLinksEvent {self.links!r}>' + +def mapOrIgnore (f, l): + for e in l: + try: + yield f (e) + except: + pass + class ExtractLinks (Behavior): """ Extract links from a page using JavaScript @@ -246,23 +403,37 @@ class ExtractLinks (Behavior): manually resolve relative links. """ + __slots__ = ('script', ) + name = 'extractLinks' - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript ('extract-links.js') - self.links = None + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('extract-links.js') - def onfinish (self): + async def onfinish (self): tab = self.loader.tab - self.useScript (self.script) - result = tab.Runtime.evaluate (expression=self.script, returnByValue=True) - self.links = list (set (result['result']['value'])) + yield self.script + result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) + yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value'])))) + +class Crash (Behavior): + """ Crash the browser. For testing only. Obviously. 
""" + + name = 'crash' + + async def onstop (self): + try: + await self.loader.tab.Page.crash () + except Crashed: + pass + return + yield # pragma: no cover # available behavior scripts. Order matters, move those modifying the page # towards the end of available -generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks] -perSite = [] -available = generic + perSite + [Screenshot, DomSnapshot] -availableNames = set (map (lambda x: x.name, available)) +available = [Scroll, Click, ExtractLinks, Screenshot, EmulateScreenMetrics, DomSnapshot] +#available.append (Crash) +# order matters, since behavior can modify the page (dom snapshots, for instance) +availableMap = OrderedDict (map (lambda x: (x.name, x), available)) diff --git a/crocoite/browser.py b/crocoite/browser.py index e58ebcf..3518789 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -22,91 +22,198 @@ Chrome browser interactions. """ -import logging -from urllib.parse import urlsplit -from base64 import b64decode -import pychrome +import asyncio +from base64 import b64decode, b64encode +from datetime import datetime, timedelta +from http.server import BaseHTTPRequestHandler -class Item: - """ - Simple wrapper containing Chrome request and response - """ +from yarl import URL +from multidict import CIMultiDict - def __init__ (self, tab): - self.tab = tab - self.chromeRequest = None - self.chromeResponse = None - self.chromeFinished = None +from .logger import Level +from .devtools import Browser, TabException - def __repr__ (self): - return '<Item {}>'.format (self.request['url']) +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): + def __new__ (cls, value): + return bytes.__new__ (cls, b64decode (value)) - @property - def request (self): - return self.chromeRequest['request'] + @classmethod + def fromBytes (cls, b): + """ For testing """ + return cls (b64encode (b)) - @property - def response (self): - return self.chromeResponse['response'] +class UnicodeBody (bytes): + def __new__ (cls, value): + if type (value) is not str: + raise TypeError ('expecting unicode string') - @property - def initiator (self): - return self.chromeRequest['initiator'] + return bytes.__new__ (cls, value.encode ('utf-8')) - @property - def id (self): - return self.chromeRequest['requestId'] +class Request: + __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') - @property - def encodedDataLength (self): - return self.chromeFinished['encodedDataLength'] + def __init__ (self, method=None, headers=None, body=None): + self.headers = headers + self.body = body + self.hasPostData = False + self.initiator = None + # HTTP method + self.method = method + self.timestamp = None - @property - def url (self): - return self.response['url'] + def __repr__ (self): + return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Request): + raise TypeError ('Can only compare equality with Request.') + + # do not compare hasPostData (only required to fetch body) and + # timestamp (depends on time) + return self.headers == b.headers and \ + self.body == b.body and \ + self.initiator == b.initiator and \ + self.method == b.method + +class Response: + __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived', + 'timestamp', 'mimeType') + + def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): + self.status 
= status
+        self.statusText = statusText
+        self.headers = headers
+        self.body = body
+        # bytes received over the network (not body size!)
+        self.bytesReceived = 0
+        self.timestamp = None
+        self.mimeType = mimeType
 
-    @property
-    def parsedUrl (self):
-        return urlsplit (self.url)
+    def __repr__ (self):
+        return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+    def __eq__ (self, b):
+        if b is None:
+            return False
+
+        if not isinstance (b, Response):
+            raise TypeError ('Can only compare equality with Response.')
+
+        # do not compare bytesReceived (depends on network) and timestamp
+        # (depends on time)
+        return self.status == b.status and \
+                self.statusText == b.statusText and \
+                self.headers == b.headers and \
+                self.body == b.body and \
+                self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+    """ Map relative timestamp to absolute timestamp """
+
+    def __init__ (self, relative, absolute):
+        self.relative = timedelta (seconds=relative)
+        self.absolute = datetime.utcfromtimestamp (absolute)
+
+    def __call__ (self, relative):
+        if not isinstance (relative, timedelta):
+            relative = timedelta (seconds=relative)
+        return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+    __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+            'protocol', 'resourceType', '_time')
+
+    def __init__ (self, id=None, url=None, request=None, response=None):
+        self.request = request
+        self.response = response
+        self.id = id
+        self.url = url
+        self.remoteIpAddress = None
+        self.protocol = None
+        self.resourceType = None
+        self._time = None
 
-    @property
-    def body (self):
-        """ Return response body or None """
-        try:
-            body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=60)
-            rawBody = body['body']
-            base64Encoded = body['base64Encoded']
-            if base64Encoded:
-                rawBody = b64decode (rawBody)
-            else:
-                rawBody = rawBody.encode ('utf8')
-            return rawBody, base64Encoded
-        except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException):
-            raise ValueError ('Cannot fetch response body')
-
-    @property
-    def requestBody (self):
-        """ Get request/POST body """
-        req = self.request
-        postData = req.get ('postData')
-        if postData:
-            return postData.encode ('utf8'), False
-        elif req.get ('hasPostData', False):
-            try:
-                return b64decode (self.tab.Network.getRequestPostData (requestId=self.id, _timeout=60)['postData']), True
-            except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException):
-                raise ValueError ('Cannot fetch request body')
-        return None, False
+    def __repr__ (self):
+        return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+    def __eq__ (self, b):
+        if not isinstance (b, RequestResponsePair):
+            raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+        # do not compare id and _time. 
These depend on external factors and do + # not influence the request/response *content* + return self.request == b.request and \ + self.response == b.response and \ + self.url == b.url and \ + self.remoteIpAddress == b.remoteIpAddress and \ + self.protocol == b.protocol and \ + self.resourceType == b.resourceType + + def fromRequestWillBeSent (self, req): + """ Set request data from Chrome Network.requestWillBeSent event """ + r = req['request'] + + self.id = req['requestId'] + self.url = URL (r['url']) + self.resourceType = req.get ('type') + self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + + assert self.request is None, req + self.request = Request () + self.request.initiator = req['initiator'] + self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.request.hasPostData = r.get ('hasPostData', False) + self.request.method = r['method'] + self.request.timestamp = self._time (req['timestamp']) + if self.request.hasPostData: + postData = r.get ('postData') + if postData is not None: + self.request.body = UnicodeBody (postData) + + def fromResponse (self, r, timestamp=None, resourceType=None): + """ + Set response data from Chrome’s Response object. + + Request must exist. Updates if response was set before. Sometimes + fromResponseReceived is triggered twice by Chrome. No idea why. + """ + assert self.request is not None, (self.request, r) + + if not timestamp: + timestamp = self.request.timestamp + + self.remoteIpAddress = r.get ('remoteIPAddress') + self.protocol = r.get ('protocol') + if resourceType: + self.resourceType = resourceType + + # a response may contain updated request headers (i.e. those actually + # sent over the wire) + if 'requestHeaders' in r: + self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) + + self.response = Response () + self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.response.status = r['status'] + self.response.statusText = r['statusText'] + self.response.timestamp = timestamp + self.response.mimeType = r['mimeType'] - @property - def requestHeaders (self): - # the response object may contain refined headers, which were - # *actually* sent over the wire - return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) + def fromResponseReceived (self, resp): + """ Set response data from Chrome Network.responseReceived """ + return self.fromResponse (resp['response'], + self._time (resp['timestamp']), resp['type']) - @property - def responseHeaders (self): - return self._unfoldHeaders (self.response['headers']) + def fromLoadingFinished (self, data): + self.response.bytesReceived = data['encodedDataLength'] + + def fromLoadingFailed (self, data): + self.response = None @staticmethod def _unfoldHeaders (headers): @@ -120,14 +227,46 @@ class Item: items.append ((k, v)) return items - def setRequest (self, req): - self.chromeRequest = req + async def prefetchRequestBody (self, tab): + if self.request.hasPostData and self.request.body is None: + try: + postData = await tab.Network.getRequestPostData (requestId=self.id) + self.request.body = UnicodeBody (postData['postData']) + except TabException: + self.request.body = None + + async def prefetchResponseBody (self, tab): + """ Fetch response body """ + try: + body = await tab.Network.getResponseBody (requestId=self.id) + if body['base64Encoded']: + self.response.body = Base64Body (body['body']) + else: + self.response.body = UnicodeBody (body['body']) + except TabException: + 
self.response.body = None - def setResponse (self, resp): - self.chromeResponse = resp +class NavigateError (IOError): + pass - def setFinished (self, finished): - self.chromeFinished = finished +class PageIdle: + """ Page idle event """ + + __slots__ = ('idle', ) + + def __init__ (self, idle): + self.idle = idle + + def __bool__ (self): + return self.idle + +class FrameNavigated: + __slots__ = ('id', 'url', 'mimeType') + + def __init__ (self, id, url, mimeType): + self.id = id + self.url = URL (url) + self.mimeType = mimeType class SiteLoader: """ @@ -136,505 +275,245 @@ class SiteLoader: XXX: track popup windows/new tabs and close them """ + __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', + '_framesLoading', '_rootFrame') allowedSchemes = {'http', 'https'} - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): + def __init__ (self, browser, logger): self.requests = {} - self.browser = browser - self.url = url - self.logger = logger + self.browser = Browser (url=browser) + self.logger = logger.bind (context=type (self).__name__) + self._iterRunning = [] - self.tab = browser.new_tab() + self._framesLoading = set () + self._rootFrame = None - def __enter__ (self): - tab = self.tab - # setup callbacks - tab.Network.requestWillBeSent = self._requestWillBeSent - tab.Network.responseReceived = self._responseReceived - tab.Network.loadingFinished = self._loadingFinished - tab.Network.loadingFailed = self._loadingFailed - tab.Log.entryAdded = self._entryAdded - #tab.Page.loadEventFired = loadEventFired - tab.Page.javascriptDialogOpening = self._javascriptDialogOpening - - # start the tab - tab.start() + async def __aenter__ (self): + tab = self.tab = await self.browser.__aenter__ () # enable events - tab.Log.enable () - tab.Network.enable() - tab.Page.enable () - tab.Network.clearBrowserCache () - if tab.Network.canClearBrowserCookies ()['result']: - tab.Network.clearBrowserCookies () - + await asyncio.gather (*[ + tab.Log.enable (), + tab.Network.enable(), + tab.Page.enable (), + tab.Inspector.enable (), + tab.Network.clearBrowserCache (), + tab.Network.clearBrowserCookies (), + ]) return self + async def __aexit__ (self, exc_type, exc_value, traceback): + for task in self._iterRunning: + # ignore any results from stuff we did not end up using anyway + if not task.done (): + task.cancel () + self._iterRunning = [] + await self.browser.__aexit__ (exc_type, exc_value, traceback) + self.tab = None + return False + def __len__ (self): return len (self.requests) - def start (self): - self.tab.Page.navigate(url=self.url) - - def wait (self, timeout=1): - self.tab.wait (timeout) - - def waitIdle (self, idleTimeout=1, maxTimeout=60): - step = 0 - for i in range (0, maxTimeout): - self.wait (1) - if len (self) == 0: - step += 1 - if step > idleTimeout: - break - else: - step = 0 - - def stop (self): - """ - Stop loading site - - XXX: stop executing scripts - """ - + async def __aiter__ (self): + """ Retrieve network items """ tab = self.tab + assert tab is not None + handler = { + tab.Network.requestWillBeSent: self._requestWillBeSent, + tab.Network.responseReceived: self._responseReceived, + tab.Network.loadingFinished: self._loadingFinished, + tab.Network.loadingFailed: self._loadingFailed, + tab.Log.entryAdded: self._entryAdded, + tab.Page.javascriptDialogOpening: self._javascriptDialogOpening, + tab.Page.frameStartedLoading: self._frameStartedLoading, + tab.Page.frameStoppedLoading: self._frameStoppedLoading, + tab.Page.frameNavigated: self._frameNavigated, + } 
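
Before the event-dispatch loop below, a consumer-side sketch of how
SiteLoader is driven (illustrative; browserUrl stands for the DevTools
endpoint URL and is not defined here):

.. code:: python

    async with SiteLoader (browserUrl, logger) as l:
        await l.navigate ('http://example.com/')
        async for item in l:
            # items are RequestResponsePair instances and similar events
            ...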
+ + # The implementation is a little advanced. Why? The goal here is to + # process events from the tab as quickly as possible (i.e. + # asynchronously). We need to make sure that JavaScript dialogs are + # handled immediately for instance. Otherwise they stall every + # other request. Also, we don’t want to use an unbounded queue, + # since the items yielded can get quite big (response body). Thus + # we need to block (yield) for every item completed, but not + # handled by the consumer (caller). + running = self._iterRunning + tabGetTask = asyncio.ensure_future (self.tab.get ()) + running.append (tabGetTask) + while True: + done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED) + for t in done: + result = t.result () + if result is None: + pass + elif t == tabGetTask: + method, data = result + f = handler.get (method, None) + if f is not None: + task = asyncio.ensure_future (f (**data)) + pending.add (task) + tabGetTask = asyncio.ensure_future (self.tab.get ()) + pending.add (tabGetTask) + else: + yield result - tab.Page.stopLoading () - tab.Network.disable () - tab.Page.disable () - tab.Log.disable () - # XXX: we can’t drain the event queue directly, so insert (yet another) wait - tab.wait (1) - tab.Network.requestWillBeSent = None - tab.Network.responseReceived = None - tab.Network.loadingFinished = None - tab.Network.loadingFailed = None - tab.Page.loadEventFired = None - tab.Page.javascriptDialogOpening = None - tab.Log.entryAdded = None - - def __exit__ (self, exc_type, exc_value, traceback): - self.tab.stop () - self.browser.close_tab(self.tab) - return False - - # overrideable callbacks - def loadingFinished (self, item, redirect=False): - pass + running = pending + self._iterRunning = running - def loadingFailed (self, item): - pass + async def navigate (self, url): + ret = await self.tab.Page.navigate(url=url) + self.logger.debug ('navigate', + uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) + if 'errorText' in ret: + raise NavigateError (ret['errorText']) + self._rootFrame = ret['frameId'] # internal chrome callbacks - def _requestWillBeSent (self, **kwargs): + async def _requestWillBeSent (self, **kwargs): + self.logger.debug ('requestWillBeSent', + uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs) + reqId = kwargs['requestId'] req = kwargs['request'] + url = URL (req['url']) + logger = self.logger.bind (reqId=reqId, reqUrl=url) - url = urlsplit (req['url']) if url.scheme not in self.allowedSchemes: return + ret = None item = self.requests.get (reqId) if item: # redirects never “finish” loading, but yield another requestWillBeSent with this key set redirectResp = kwargs.get ('redirectResponse') if redirectResp: - # create fake responses - resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} - item.setResponse (resp) - resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} - item.setFinished (resp) - self.loadingFinished (item, redirect=True) - self.logger.info ('redirected request {} has url {}'.format (reqId, req['url'])) + if item.url != url: + # this happens for unknown reasons. the docs simply state + # it can differ in case of a redirect. Fix it and move on. + logger.warning ('redirect url differs', + uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', + expected=item.url) + redirectResp['url'] = str (item.url) + item.fromResponse (redirectResp) + logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) + # XXX: queue this? 
no need to wait for it + await item.prefetchRequestBody (self.tab) + # cannot fetch response body due to race condition (item id reused) + ret = item else: - self.logger.warning ('request {} already exists, overwriting.'.format (reqId)) + logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') - item = Item (self.tab) - item.setRequest (kwargs) + item = RequestResponsePair () + item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item - def _responseReceived (self, **kwargs): + return ret + + async def _responseReceived (self, **kwargs): + self.logger.debug ('responseReceived', + uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.get (reqId) if item is None: return resp = kwargs['response'] - url = urlsplit (resp['url']) + url = URL (resp['url']) + logger = self.logger.bind (reqId=reqId, respUrl=url) + if item.url != url: + logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: - self.logger.info ('response {} {}'.format (reqId, resp['url'])) - item.setResponse (kwargs) + item.fromResponseReceived (kwargs) else: - self.logger.warning ('response: ignoring scheme {}'.format (url.scheme)) + logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') - def _loadingFinished (self, **kwargs): + async def _loadingFinished (self, **kwargs): """ Item was fully loaded. For some items the request body is not available when responseReceived is fired, thus move everything here. """ + self.logger.debug ('loadingFinished', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.pop (reqId, None) if item is None: # we never recorded this request (blacklisted scheme, for example) return + if not item.response: + # chrome failed to send us a responseReceived event for this item, + # so we can’t record it (missing request/response headers) + self.logger.error ('response missing', + uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id) + return + req = item.request - resp = item.response - assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url']) - url = urlsplit (resp['url']) - if url.scheme in self.allowedSchemes: - self.logger.info ('finished {} {}'.format (reqId, req['url'])) - item.setFinished (kwargs) - self.loadingFinished (item) + if item.url.scheme in self.allowedSchemes: + item.fromLoadingFinished (kwargs) + # XXX queue both + await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) + return item + + async def _loadingFailed (self, **kwargs): + self.logger.info ('loadingFailed', + uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs) - def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] - self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) + logger = self.logger.bind (reqId=reqId) item = self.requests.pop (reqId, None) - self.loadingFailed (item) + if item is not None: + item.fromLoadingFailed (kwargs) + return item - def _entryAdded (self, **kwargs): + async def _entryAdded (self, **kwargs): """ Log entry added """ entry = kwargs['entry'] - level = {'verbose': logging.DEBUG, 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR}[entry['level']] - self.logger.log (level, 'console: {}: {}'.format (entry['source'], entry['text']), extra={'raw': entry}) + level = {'verbose': 
Level.DEBUG, 'info': Level.INFO, + 'warning': Level.WARNING, + 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO) + entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2' + self.logger (level, 'console', **entry) - def _javascriptDialogOpening (self, **kwargs): + async def _javascriptDialogOpening (self, **kwargs): t = kwargs.get ('type') if t in {'alert', 'confirm', 'prompt'}: - self.logger.info ('javascript opened a dialog: {}, {}, canceling'.format (t, kwargs.get ('message'))) - self.tab.Page.handleJavaScriptDialog (accept=False) + self.logger.info ('js dialog', + uuid='d6f07ce2-648e-493b-a1df-f353bed27c84', + action='cancel', type=t, message=kwargs.get ('message')) + await self.tab.Page.handleJavaScriptDialog (accept=False) elif t == 'beforeunload': # we must accept this one, otherwise the page will not unload/close - self.logger.info ('javascript opened a dialog: {}, {}, procceeding'.format (t, kwargs.get ('message'))) - self.tab.Page.handleJavaScriptDialog (accept=True) - else: - self.logger.warning ('unknown javascript dialog type {}'.format (t)) - -class AccountingSiteLoader (SiteLoader): - """ - SiteLoader that keeps basic statistics about retrieved pages. - """ - - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): - super ().__init__ (browser, url, logger) - - self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} - - def loadingFinished (self, item, redirect=False): - super ().loadingFinished (item, redirect) - - self.stats['finished'] += 1 - self.stats['bytesRcv'] += item.encodedDataLength - - def loadingFailed (self, item): - super ().loadingFailed (item) - - self.stats['failed'] += 1 - - def _requestWillBeSent (self, **kwargs): - super ()._requestWillBeSent (**kwargs) - - self.stats['requests'] += 1 - -import subprocess -from tempfile import mkdtemp -import socket, shutil - -class ChromeService: - """ - Start Chrome with socket activation (i.e. pass listening socket). Polling - is not required with this method, since reads will block until Chrome is - ready. 
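For reference, the socket activation used by the removed ``ChromeService`` boils down to binding the DevTools port ourselves and handing the already-listening socket to Chrome; a client connecting to it blocks until Chrome accepts, so no readiness polling is needed. A condensed sketch using the same flags as the code below:

.. code:: python

    import socket, subprocess

    s = socket.socket ()
    s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind (('localhost', 9222))
    s.listen (10)
    # Chrome inherits the listening socket; connects block until it is ready
    p = subprocess.Popen (['google-chrome-stable', '--headless',
            '--remote-debugging-socket-fd={}'.format (s.fileno ()),
            'about:blank'],
            pass_fds=[s.fileno ()], start_new_session=True)
    s.close ()  # the child owns the socket now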
- """ - - def __init__ (self, binary='google-chrome-stable', host='localhost', port=9222, windowSize=(1920, 1080)): - self.binary = binary - self.host = host - self.port = port - self.windowSize = windowSize - self.p = None - - def __enter__ (self): - assert self.p is None - - port = self.port - while True: - s = socket.socket () - s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - try: - s.bind ((self.host, port)) - break - except OSError: - # try different port - if port < 65000: - port += 1 - else: - raise - s.listen (10) - self.userDataDir = mkdtemp () - args = [self.binary, - '--window-size={},{}'.format (*self.windowSize), - '--user-data-dir={}'.format (self.userDataDir), # use temporory user dir - '--no-default-browser-check', - '--no-first-run', # don’t show first run screen - '--disable-breakpad', # no error reports - '--disable-extensions', - '--disable-infobars', - '--disable-notifications', # no libnotify - '--headless', - '--disable-gpu', - '--hide-scrollbars', # hide scrollbars on screenshots - '--mute-audio', # don’t play any audio - '--remote-debugging-socket-fd={}'.format (s.fileno ()), - '--homepage=about:blank', - 'about:blank'] - # start new session, so ^C does not affect subprocess - self.p = subprocess.Popen (args, pass_fds=[s.fileno()], start_new_session=True, - stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - s.close () - - return 'http://{}:{}'.format (self.host, port) - - def __exit__ (self, *exc): - self.p.terminate () - self.p.wait () - shutil.rmtree (self.userDataDir) - self.p = None - -class NullService: - def __init__ (self, url): - self.url = url - - def __enter__ (self): - return self.url - - def __exit__ (self, *exc): - pass - -### tests ### - -import unittest, time -from http.server import BaseHTTPRequestHandler - -class TestHTTPRequestHandler (BaseHTTPRequestHandler): - encodingTestString = { - 'latin1': 'äöü', - 'utf-8': 'äöü', - 'ISO-8859-1': 'äöü', - } - binaryTestData = b'\x00\x01\x02' - # 1×1 pixel PNG - imageTestData = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82' - htmlTestData = '<html><body><img src="/image"><img src="/nonexistent"></body></html>' - alertData = '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><img src="/image"></body></html>' - - def do_GET(self): - path = self.path - if path.startswith ('/redirect/301'): - self.send_response(301) - self.send_header ('Location', path[13:]) - self.end_headers() - elif path == '/empty': - self.send_response (200) - self.end_headers () - elif path.startswith ('/encoding'): - # send text data with different encodings - _, _, encoding = path.split ('/', 3) - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset={}'.format (encoding)) - self.end_headers () - self.wfile.write (self.encodingTestString[encoding].encode (encoding)) - elif path == '/binary': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 'application/octet-stream') - self.send_header ('Content-Length', len (self.binaryTestData)) - self.end_headers () - self.wfile.write (self.binaryTestData) - elif path == '/image': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 
'image/png') - self.end_headers () - self.wfile.write (self.imageTestData) - elif path == '/attachment': - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset=utf-8') - self.send_header ('Content-Disposition', 'attachment; filename="attachment.txt"') - self.end_headers () - self.wfile.write (self.encodingTestString['utf-8'].encode ('utf-8')) - elif path == '/html': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.htmlTestData.encode ('utf-8')) - elif path == '/alert': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.alertData.encode ('utf-8')) - else: - self.send_response (404) - self.end_headers () - - def log_message (self, format, *args): - pass - -def startServer (): - import http.server - PORT = 8000 - httpd = http.server.HTTPServer (("localhost", PORT), TestHTTPRequestHandler) - httpd.serve_forever() - -class TestSiteLoaderAdapter (SiteLoader): - def __init__ (self, browser, url): - SiteLoader.__init__ (self, browser, url) - self.finished = [] - - def loadingFinished (self, item, redirect=False): - self.finished.append (item) - -class TestSiteLoader (unittest.TestCase): - def setUp (self): - from multiprocessing import Process - self.server = Process (target=startServer) - self.server.start () - self.baseurl = 'http://localhost:8000/' - self.service = ChromeService () - browserUrl = self.service.__enter__ () - self.browser = pychrome.Browser(url=browserUrl) - - def buildAdapter (self, path): - return TestSiteLoaderAdapter (self.browser, '{}{}'.format (self.baseurl, path)) - - def assertUrls (self, l, expect): - urls = set (map (lambda x: x.parsedUrl.path, l.finished)) - expect = set (expect) - self.assertEqual (urls, expect) - - def test_wait (self): - waittime = 2 - with self.buildAdapter ('empty') as l: - l.start () - before = time.time () - l.wait (waittime) - after = time.time () - self.assertTrue ((after-before) >= waittime) - - def test_empty (self): - with self.buildAdapter ('empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - - def test_redirect301 (self): - with self.buildAdapter ('redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 2) - self.assertUrls (l, ['/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path == '/redirect/301/empty': - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') - - def test_redirect301multi (self): - with self.buildAdapter ('redirect/301/redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/redirect/301/redirect/301/empty', '/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path in {'/redirect/301/empty', \ - '/redirect/301/redirect/301/empty'}: - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') - - def test_encoding (self): - """ Text responses are transformed to UTF-8. Make sure this works - correctly. 
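In isolation, the invariant this test checks is: whatever charset the server declares, the recorded body is the decoded text re-encoded as UTF-8. A small illustration:

.. code:: python

    s = 'äöü'
    for encoding in ('latin1', 'utf-8', 'ISO-8859-1'):
        wire = s.encode (encoding)                       # bytes on the wire
        stored = wire.decode (encoding).encode ('utf-8') # bytes in the archive
        assert stored == 'äöü'.encode ('utf-8')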
""" - for encoding, expected in TestHTTPRequestHandler.encodingTestString.items (): - with self.buildAdapter ('encoding/{}'.format (encoding)) as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/encoding/{}'.format (encoding)]) - self.assertEqual (l.finished[0].body[0], expected.encode ('utf8')) - - def test_binary (self): - """ Browser should ignore content it cannot display (i.e. octet-stream) """ - with self.buildAdapter ('binary') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) - - def test_image (self): - """ Images should be displayed inline """ - with self.buildAdapter ('image') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/image']) - self.assertEqual (l.finished[0].body[0], TestHTTPRequestHandler.imageTestData) - - def test_attachment (self): - """ And downloads won’t work in headless mode """ - with self.buildAdapter ('attachment') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) - - def test_html (self): - with self.buildAdapter ('html') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/html', '/image', '/nonexistent']) - for item in l.finished: - if item.parsedUrl.path == '/html': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.htmlTestData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - elif item.parsedUrl.path == '/nonexistent': - self.assertEqual (item.response['status'], 404) - else: - self.fail ('unknown url') - - def test_alert (self): - with self.buildAdapter ('alert') as l: - l.start () - l.waitIdle () - self.assertUrls (l, ['/alert', '/image']) - for item in l.finished: - if item.parsedUrl.path == '/alert': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.alertData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - else: - self.fail ('unknown url') - - def tearDown (self): - self.service.__exit__ (None, None, None) - self.server.terminate () - self.server.join () - -if __name__ == '__main__': - import sys - if sys.argv[1] == 'server': - startServer () + self.logger.info ('js dialog', + uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd', + action='proceed', type=t, message=kwargs.get ('message')) + await self.tab.Page.handleJavaScriptDialog (accept=True) + else: # pragma: no cover + self.logger.warning ('js dialog unknown', + uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) + + async def _frameStartedLoading (self, **kwargs): + self.logger.debug ('frameStartedLoading', + uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs) + + self._framesLoading.add (kwargs['frameId']) + return PageIdle (False) + + async def _frameStoppedLoading (self, **kwargs): + self.logger.debug ('frameStoppedLoading', + uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs) + + self._framesLoading.remove (kwargs['frameId']) + if not self._framesLoading: + return PageIdle (True) + + async def _frameNavigated (self, **kwargs): + self.logger.debug ('frameNavigated', + uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs) + frame = kwargs['frame'] + if self._rootFrame == frame['id']: + assert frame.get 
('parentId', None) is None, "root frame must not have a parent" + return FrameNavigated (frame['id'], frame['url'], frame['mimeType']) diff --git a/crocoite/cli.py b/crocoite/cli.py index f6454da..04bbb19 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,72 +22,235 @@ Command line interface """ -import logging, argparse, json, sys +import argparse, sys, signal, asyncio, os, json +from traceback import TracebackException +from enum import IntEnum +from yarl import URL +from http.cookies import SimpleCookie +import pkg_resources +try: + import manhole + manhole.install (patch_fork=False, oneshot_on='USR1') +except ModuleNotFoundError: + pass -from . import behavior -from .controller import RecursiveController, defaultSettings, \ - ControllerSettings, DepthLimit, PrefixLimit -from .browser import NullService, ChromeService +from . import behavior, browser +from .controller import SinglePageController, \ + ControllerSettings, StatsHandler, LogHandler, \ + RecursiveController, DepthLimit, PrefixLimit +from .devtools import Passthrough, Process +from .warc import WarcHandler +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \ + WarcHandlerConsumer, Level +from .devtools import Crashed -def parseRecursive (recursive, url): +def absurl (s): + """ argparse: Absolute URL """ + u = URL (s) + if u.is_absolute (): + return u + raise argparse.ArgumentTypeError ('Must be absolute') + +def cookie (s): + """ argparse: Cookie """ + c = SimpleCookie (s) + # for some reason the constructor does not raise an exception if the cookie + # supplied is invalid. It’ll simply be empty. + if len (c) != 1: + raise argparse.ArgumentTypeError ('Invalid cookie') + # we want a single Morsel + return next (iter (c.values ())) + +def cookiejar (f): + """ argparse: Cookies from file """ + cookies = [] + try: + with open (f, 'r') as fd: + for l in fd: + l = l.lstrip () + if l and not l.startswith ('#'): + cookies.append (cookie (l)) + except FileNotFoundError: + raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist') + return cookies + +class SingleExitStatus(IntEnum): + """ Exit status for single-shot command line """ + Ok = 0 + Fail = 1 + BrowserCrash = 2 + Navigate = 3 + +def single (): + parser = argparse.ArgumentParser(description='crocoite helper tools to fetch individual pages.') + parser.add_argument('--browser', help='DevTools URL', type=absurl, metavar='URL') + parser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC') + parser.add_argument('--idle-timeout', default=30, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC') + parser.add_argument('--behavior', help='Enable behavior script', + dest='enabledBehaviorNames', + default=list (behavior.availableMap.keys ()), + choices=list (behavior.availableMap.keys ()), + metavar='NAME', nargs='*') + parser.add_argument('--warcinfo', help='Add extra information to warcinfo record', + metavar='JSON', type=json.loads) + # re-using curl’s short/long switch names whenever possible + parser.add_argument('-k', '--insecure', + action='store_true', + help='Disable certificate validation') + parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE', + action='append', default=[], help='Cookies in Set-Cookie format.') + parser.add_argument ('-c', '--cookie-jar', dest='cookieJar', + type=cookiejar, metavar='FILE', + default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'), + help='Cookie jar file, read-only.') + parser.add_argument('url', help='Website URL', type=absurl, metavar='URL') + parser.add_argument('output', help='WARC filename', metavar='FILE') + + args = parser.parse_args () + + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) + + ret = SingleExitStatus.Fail + service = Process () + if args.browser: + service = Passthrough (args.browser) + settings = ControllerSettings ( + idleTimeout=args.idleTimeout, + timeout=args.timeout, + insecure=args.insecure, + cookies=args.cookieJar + args.cookie, + ) + with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: + logger.connect (WarcHandlerConsumer (warcHandler)) + handler = [StatsHandler (), LogHandler (logger), warcHandler] + b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) + controller = SinglePageController (url=args.url, settings=settings, + service=service, handler=handler, behavior=b, logger=logger, + warcinfo=args.warcinfo) + try: + loop = asyncio.get_event_loop() + run = asyncio.ensure_future (controller.run ()) + stop = lambda signum: run.cancel () + loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) + loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) + loop.run_until_complete(run) + loop.close() + ret = SingleExitStatus.Ok + except Crashed: + ret = SingleExitStatus.BrowserCrash + except asyncio.CancelledError: + # don’t log this one + pass + except browser.NavigateError: + ret = SingleExitStatus.Navigate + except Exception as e: + ret = SingleExitStatus.Fail + logger.error ('cli exception', + uuid='7fd69858-ecaa-4225-b213-8ab880aa3cc5', + traceback=list (TracebackException.from_exception (e).format ())) + finally: + r = handler[0].stats + logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) + logger.info ('exit', context='cli', uuid='9b1bd603-f7cd-4745-895a-5b894a5166f2', status=ret) + + return ret + +def parsePolicy (recursive, url): if recursive is None: return DepthLimit (0) elif recursive.isdigit (): return DepthLimit (int (recursive)) elif recursive == 'prefix': return PrefixLimit (url) - else: - raise ValueError ('Unsupported') + raise argparse.ArgumentTypeError ('Unsupported recursion mode') + +def recursive (): + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) -def main (): parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') - parser.add_argument('--browser', help='DevTools URL', metavar='URL') - parser.add_argument('--recursive', help='Follow links recursively') - parser.add_argument('--concurrency', '-j', type=int, default=1) - 
parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') - parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') - parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES') - parser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') - parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts', - dest='enabledBehaviorNames', - default=list (behavior.availableNames), - choices=list (behavior.availableNames)) - group = parser.add_mutually_exclusive_group (required=True) - group.add_argument('--output', help='WARC filename', metavar='FILE') - group.add_argument('--distributed', help='Use celery worker', action='store_true') - parser.add_argument('url', help='Website URL') + parser.add_argument('-j', '--concurrency', + help='Run at most N jobs concurrently', metavar='N', default=1, + type=int) + parser.add_argument('-r', '--recursion', help='Recursion policy', + metavar='POLICY') + parser.add_argument('--tempdir', help='Directory for temporary files', + metavar='DIR') + parser.add_argument('url', help='Seed URL', type=absurl, metavar='URL') + parser.add_argument('output', + help='Output file, supports templates {host}, {date} and {seqnum}', + metavar='FILE') + parser.add_argument('command', + help='Fetch command, supports templates {url} and {dest}', + metavar='CMD', nargs='*', + default=['crocoite-single', '{url}', '{dest}']) args = parser.parse_args () + try: + policy = parsePolicy (args.recursion, args.url) + except argparse.ArgumentTypeError as e: + parser.error (str (e)) - if args.distributed: - if args.browser: - parser.error ('--browser is not supported for distributed jobs') - from . 
import task - settings = dict (maxBodySize=args.maxBodySize, - logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, - timeout=args.timeout) - result = task.controller.delay (url=args.url, settings=settings, - enabledBehaviorNames=args.enabledBehaviorNames, - recursive=args.recursive, concurrency=args.concurrency) - r = result.get () - else: - logging.basicConfig (level=logging.INFO) + try: + controller = RecursiveController (url=args.url, output=args.output, + command=args.command, logger=logger, policy=policy, + tempdir=args.tempdir, concurrency=args.concurrency) + except ValueError as e: + parser.error (str (e)) - try: - recursionPolicy = parseRecursive (args.recursive, args.url) - except ValueError: - parser.error ('Invalid argument for --recursive') - service = ChromeService () - if args.browser: - service = NullService (args.browser) - settings = ControllerSettings (maxBodySize=args.maxBodySize, - logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, - timeout=args.timeout) - with open (args.output, 'wb') as fd: - controller = RecursiveController (args.url, fd, settings=settings, - recursionPolicy=recursionPolicy, service=service) - r = controller.run () - json.dump (r, sys.stdout) - - return True + run = asyncio.ensure_future (controller.run ()) + loop = asyncio.get_event_loop() + stop = lambda signum: run.cancel () + loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) + loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) + try: + loop.run_until_complete(run) + except asyncio.CancelledError: + pass + finally: + loop.close() + + return 0 + +def irc (): + import json, re + from .irc import Chromebot + + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) + + parser = argparse.ArgumentParser(description='IRC bot.') + parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.json') + + args = parser.parse_args () + + with open (args.config) as fd: + config = json.load (fd) + s = config['irc'] + blacklist = dict (map (lambda x: (re.compile (x[0], re.I), x[1]), config['blacklist'].items ())) + + loop = asyncio.get_event_loop() + bot = Chromebot ( + host=s['host'], + port=s['port'], + ssl=s['ssl'], + nick=s['nick'], + channels=s['channels'], + tempdir=config['tempdir'], + destdir=config['destdir'], + processLimit=config['process_limit'], + logger=logger, + blacklist=blacklist, + needVoice=config['need_voice'], + loop=loop) + stop = lambda signum: bot.cancel () + loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) + loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) + loop.run_until_complete(bot.run ()) + +def dashboard (): + from .irc import Dashboard + + loop = asyncio.get_event_loop() + d = Dashboard (sys.stdin, loop) + loop.run_until_complete(d.run ()) + loop.run_forever() diff --git a/crocoite/controller.py b/crocoite/controller.py index bc6f948..8374b4e 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -22,115 +22,297 @@ Controller classes, handling actions required for archival """ +import time, tempfile, asyncio, json, os, shutil, signal +from itertools import islice +from datetime import datetime +from operator import attrgetter +from abc import ABC, abstractmethod +from yarl import URL + +from . 
import behavior as cbehavior +from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated +from .util import getFormattedViewportMetrics, getSoftwareInfo +from .behavior import ExtractLinksEvent +from .devtools import toCookieParam + class ControllerSettings: - def __init__ (self, logBuffer=1000, maxBodySize=50*1024*1024, idleTimeout=2, timeout=10): - self.logBuffer = logBuffer - self.maxBodySize = maxBodySize + __slots__ = ('idleTimeout', 'timeout', 'insecure', 'cookies') + + def __init__ (self, idleTimeout=2, timeout=10, insecure=False, cookies=None): self.idleTimeout = idleTimeout self.timeout = timeout + self.insecure = insecure + self.cookies = cookies or [] - def toDict (self): - return dict (logBuffer=self.logBuffer, maxBodySize=self.maxBodySize, - idleTimeout=self.idleTimeout, timeout=self.timeout) + def __repr__ (self): + return f'<ControllerSetting idleTimeout={self.idleTimeout!r}, timeout={self.timeout!r}, insecure={self.insecure!r}, cookies={self.cookies!r}>' defaultSettings = ControllerSettings () -import logging -from urllib.parse import urlsplit, urlunsplit +class EventHandler (ABC): + """ Abstract base class for event handler """ -import pychrome + __slots__ = () -from . import behavior as cbehavior -from .browser import ChromeService -from .warc import WarcLoader, SerializingWARCWriter -from .util import getFormattedViewportMetrics + @abstractmethod + async def push (self, item): + raise NotImplementedError () + +class StatsHandler (EventHandler): + __slots__ = ('stats', ) + + def __init__ (self): + self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} + + async def push (self, item): + if isinstance (item, RequestResponsePair): + self.stats['requests'] += 1 + if not item.response: + self.stats['failed'] += 1 + else: + self.stats['finished'] += 1 + self.stats['bytesRcv'] += item.response.bytesReceived + +class LogHandler (EventHandler): + """ Handle items by logging information about them """ + + __slots__ = ('logger', ) + + def __init__ (self, logger): + self.logger = logger.bind (context=type (self).__name__) + + async def push (self, item): + if isinstance (item, ExtractLinksEvent): + # limit number of links per message, so json blob won’t get too big + it = iter (item.links) + limit = 100 + while True: + limitlinks = list (islice (it, 0, limit)) + if not limitlinks: + break + self.logger.info ('extracted links', context=type (item).__name__, + uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks) + + +class ControllerStart: + __slots__ = ('payload', ) + + def __init__ (self, payload): + self.payload = payload + +class IdleStateTracker (EventHandler): + """ Track SiteLoader’s idle state by listening to PageIdle events """ + + __slots__ = ('_idle', '_loop', '_idleSince') + + def __init__ (self, loop): + self._idle = True + self._loop = loop + + self._idleSince = self._loop.time () + + async def push (self, item): + if isinstance (item, PageIdle): + self._idle = bool (item) + if self._idle: + self._idleSince = self._loop.time () + + async def wait (self, timeout): + """ Wait until page has been idle for at least timeout seconds. If the + page has been idle before calling this function it may return + immediately. 
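``SinglePageController`` (below) pushes every event through each handler in turn, which keeps handlers small, single-purpose classes like ``StatsHandler`` above. A toy chain, with strings standing in for ``RequestResponsePair`` objects:

.. code:: python

    import asyncio
    from abc import ABC, abstractmethod

    class EventHandler (ABC):
        @abstractmethod
        async def push (self, item):
            raise NotImplementedError ()

    class Counter (EventHandler):
        # hypothetical handler; counts items like StatsHandler counts requests
        def __init__ (self):
            self.count = 0

        async def push (self, item):
            self.count += 1

    async def main ():
        handler = [Counter ()]
        for item in ('a', 'b', 'c'):
            for h in handler:   # same loop as SinglePageController.processItem
                await h.push (item)
        assert handler[0].count == 3

    asyncio.get_event_loop ().run_until_complete (main ())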
""" + + assert timeout > 0 + while True: + if self._idle: + now = self._loop.time () + sleep = timeout-(now-self._idleSince) + if sleep <= 0: + break + else: + # not idle, check again after timeout expires + sleep = timeout + await asyncio.sleep (sleep) + +class InjectBehaviorOnload (EventHandler): + """ Control behavior script injection based on frame navigation messages. + When a page is reloaded (for whatever reason), the scripts need to be + reinjected. """ + + __slots__ = ('controller', '_loaded') + + def __init__ (self, controller): + self.controller = controller + self._loaded = False + + async def push (self, item): + if isinstance (item, FrameNavigated): + await self._runon ('load') + self._loaded = True + + async def stop (self): + if self._loaded: + await self._runon ('stop') -def firstOrNone (it): - """ Return first item of iterator it or None if empty """ - try: - return next (it) - except StopIteration: - return None + async def finish (self): + if self._loaded: + await self._runon ('finish') + + async def _runon (self, method): + controller = self.controller + for b in controller._enabledBehavior: + f = getattr (b, 'on' + method) + async for item in f (): + await controller.processItem (item) class SinglePageController: """ - Archive a single page url to file output. + Archive a single page url. + + Dispatches between producer (site loader and behavior scripts) and consumer + (stats, warc writer). """ - def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ - logger=logging.getLogger(__name__), settings=defaultSettings): + __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler', + 'warcinfo', '_enabledBehavior') + + def __init__ (self, url, logger, \ + service, behavior=cbehavior.available, \ + settings=defaultSettings, handler=None, \ + warcinfo=None): self.url = url - self.output = output self.service = service self.behavior = behavior self.settings = settings - self.logger = logger - - def run (self): - ret = {'stats': None, 'links': []} - - with self.service as browser: - browser = pychrome.Browser (url=browser) - writer = SerializingWARCWriter (self.output, gzip=True) - - with WarcLoader (browser, self.url, writer, - logBuffer=self.settings.logBuffer, - maxBodySize=self.settings.maxBodySize) as l: - version = l.tab.Browser.getVersion () - payload = { - 'software': __package__, - 'browser': version['product'], + self.logger = logger.bind (context=type (self).__name__, url=url) + self.handler = handler or [] + self.warcinfo = warcinfo + + async def processItem (self, item): + for h in self.handler: + await h.push (item) + + async def run (self): + logger = self.logger + async def processQueue (): + async for item in l: + await self.processItem (item) + + idle = IdleStateTracker (asyncio.get_event_loop ()) + self.handler.append (idle) + behavior = InjectBehaviorOnload (self) + self.handler.append (behavior) + + async with self.service as browser, SiteLoader (browser, logger=logger) as l: + handle = asyncio.ensure_future (processQueue ()) + timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout)) + + # configure browser + tab = l.tab + await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure) + await tab.Network.setCookies (cookies=list (map (toCookieParam, self.settings.cookies))) + + # not all behavior scripts are allowed for every URL, filter them + self._enabledBehavior = list (filter (lambda x: self.url in x, + map (lambda x: x (l, logger), self.behavior))) + + version = await 
tab.Browser.getVersion () + payload = { + 'software': getSoftwareInfo (), + 'browser': { + 'product': version['product'], 'useragent': version['userAgent'], - 'viewport': getFormattedViewportMetrics (l.tab), - } - warcinfo = writer.create_warcinfo_record (filename=None, info=payload) - writer.write_record (warcinfo) - - # not all behavior scripts are allowed for every URL, filter them - enabledBehavior = list (filter (lambda x: self.url in x, - map (lambda x: x (l), self.behavior))) - linksBehavior = firstOrNone (filter (lambda x: isinstance (x, cbehavior.ExtractLinks), - enabledBehavior)) - - for b in enabledBehavior: - self.logger.debug ('starting onload behavior {}'.format (b.name)) - b.onload () - l.start () - - l.waitIdle (self.settings.idleTimeout, self.settings.timeout) - - for b in enabledBehavior: - self.logger.debug ('starting onstop behavior {}'.format (b.name)) - b.onstop () - - # if we stopped due to timeout, wait for remaining assets - l.waitIdle (2, 60) - l.stop () - - for b in enabledBehavior: - self.logger.debug ('starting onfinish behavior {}'.format (b.name)) - b.onfinish () - - ret['stats'] = l.stats - ret['links'] = linksBehavior.links if linksBehavior else None - writer.flush () - return ret - -from collections import UserDict - -class IntegerDict (UserDict): - """ Dict with dict/dict per-item arithmetic propagation, i.e. {1: 2}+{1: 1}={1: 3} """ - def __add__ (self, b): - newdict = self.__class__ (self) - for k, v in b.items (): - if k in self: - newdict[k] += v + 'viewport': await getFormattedViewportMetrics (tab), + }, + 'tool': 'crocoite-single', # not the name of the cli utility + 'parameters': { + 'url': self.url, + 'idleTimeout': self.settings.idleTimeout, + 'timeout': self.settings.timeout, + 'behavior': list (map (attrgetter('name'), self._enabledBehavior)), + 'insecure': self.settings.insecure, + 'cookies': list (map (lambda x: x.OutputString(), self.settings.cookies)), + }, + } + if self.warcinfo: + payload['extra'] = self.warcinfo + await self.processItem (ControllerStart (payload)) + + await l.navigate (self.url) + + idleProc = asyncio.ensure_future (idle.wait (self.settings.idleTimeout)) + while True: + try: + finished, pending = await asyncio.wait([idleProc, timeoutProc, handle], + return_when=asyncio.FIRST_COMPLETED) + except asyncio.CancelledError: + idleProc.cancel () + timeoutProc.cancel () + break + + if handle in finished: + # something went wrong while processing the data + logger.error ('fetch failed', + uuid='43a0686a-a3a9-4214-9acd-43f6976f8ff3') + idleProc.cancel () + timeoutProc.cancel () + handle.result () + assert False # previous line should always raise Exception + elif timeoutProc in finished: + # global timeout + logger.debug ('global timeout', + uuid='2f858adc-9448-4ace-94b4-7cd1484c0728') + idleProc.cancel () + timeoutProc.result () + break + elif idleProc in finished: + # idle timeout + logger.debug ('idle timeout', + uuid='90702590-94c4-44ef-9b37-02a16de444c3') + idleProc.result () + timeoutProc.cancel () + break + + await behavior.stop () + await tab.Page.stopLoading () + await asyncio.sleep (1) + await behavior.finish () + + # wait until loads from behavior scripts are done and browser is + # idle for at least 1 second + try: + await asyncio.wait_for (idle.wait (1), timeout=1) + except (asyncio.TimeoutError, asyncio.CancelledError): + pass + + if handle.done (): + handle.result () else: - newdict[k] = v - return newdict + handle.cancel () + +class SetEntry: + """ A object, to be used with sets, that compares equality only on its 
+    primary property. """
+    def __init__ (self, value, **props):
+        self.value = value
+        for k, v in props.items ():
+            setattr (self, k, v)
+
+    def __eq__ (self, b):
+        assert isinstance (b, SetEntry)
+        return self.value == b.value
+
+    def __hash__ (self):
+        return hash (self.value)
+
+    def __repr__ (self):
+        return f'<SetEntry {self.value!r}>'
 
 class RecursionPolicy:
     """ Abstract recursion policy """
+
+    __slots__ = ()
+
     def __call__ (self, urls):
         raise NotImplementedError
@@ -138,20 +320,23 @@ class DepthLimit (RecursionPolicy):
     """
     Limit recursion by depth.
 
-    depth==0 means no recursion, depth==1 is the page and outgoing links, …
+    depth==0 means no recursion, depth==1 is the page and outgoing links
     """
+
+    __slots__ = ('maxdepth', )
+
     def __init__ (self, maxdepth=0):
         self.maxdepth = maxdepth
 
     def __call__ (self, urls):
-        if self.maxdepth <= 0:
-            return {}
-        else:
-            self.maxdepth -= 1
-            return urls
+        newurls = set ()
+        for u in urls:
+            if u.depth <= self.maxdepth:
+                newurls.add (u)
+        return newurls
 
     def __repr__ (self):
-        return '<DepthLimit {}>'.format (self.maxdepth)
+        return f'<DepthLimit {self.maxdepth}>'
 
 class PrefixLimit (RecursionPolicy):
     """
@@ -161,65 +346,196 @@ class PrefixLimit (RecursionPolicy):
     ignored: http://example.com/bar http://offsite.example/foo
     accepted: http://example.com/foobar http://example.com/foo/bar
     """
+
+    __slots__ = ('prefix', )
+
     def __init__ (self, prefix):
         self.prefix = prefix
 
     def __call__ (self, urls):
-        return set (filter (lambda u: u.startswith (self.prefix), urls))
+        return set (filter (lambda u: str(u.value).startswith (str (self.prefix)), urls))
 
-def removeFragment (u):
-    """ Remove fragment from url (i.e. #hashvalue) """
-    s = urlsplit (u)
-    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+def hasTemplate (s):
+    """ Return True if string s has string templates """
+    return '{' in s and '}' in s
 
 class RecursiveController:
     """
     Simple recursive controller
 
-    Visits links acording to recursionPolicy
+    Visits links according to policy
     """
 
-    def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
-            logger=logging.getLogger(__name__), settings=defaultSettings,
-            recursionPolicy=DepthLimit (0)):
+    __slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have',
+            'pending', 'stats', 'tempdir', 'running', 'concurrency',
+            'copyLock')
+
+    SCHEME_WHITELIST = {'http', 'https'}
+
+    def __init__ (self, url, output, command, logger,
+            tempdir=None, policy=DepthLimit (0), concurrency=1):
         self.url = url
         self.output = output
-        self.service = service
-        self.behavior = behavior
-        self.settings = settings
-        self.logger = logger
-        self.recursionPolicy = recursionPolicy
-
-    def fetch (self, urls):
+        self.command = command
+        self.logger = logger.bind (context=type(self).__name__, seedurl=url)
+        self.policy = policy
+        self.tempdir = tempdir
+        # A lock if only a single output file (no template) is requested
+        self.copyLock = None if hasTemplate (output) else asyncio.Lock ()
+        # some sanity checks. XXX move to argparse?
+        if self.copyLock and os.path.exists (self.output):
+            raise ValueError ('Output file exists')
+        # tasks currently running
+        self.running = set ()
+        # max number of tasks running
+        self.concurrency = concurrency
+        # keep in sync with StatsHandler
+        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
+
+    async def fetch (self, entry, seqnum):
         """
-        Overrideable fetch action for URLs. Defaults to sequential
-        SinglePageController.
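A quick illustration (not part of the patch) of how the two policies filter ``SetEntry`` objects, assuming the classes defined above and ``yarl.URL``:

.. code:: python

    from yarl import URL

    links = {SetEntry (URL ('http://example.com/a'), depth=1),
             SetEntry (URL ('http://offsite.example/b'), depth=1)}

    # DepthLimit keeps entries up to maxdepth; unlike the old version it
    # does not mutate its own state between calls
    assert DepthLimit (1) (links) == links
    assert DepthLimit (0) (links) == set ()

    # PrefixLimit drops the off-site link
    kept = PrefixLimit (URL ('http://example.com/')) (links)
    assert {str (e.value) for e in kept} == {'http://example.com/a'}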
+        Fetch a single URL using an external command
+
+        command is usually crocoite-single
         """
-        result = []
-        for u in urls:
-            c = SinglePageController (u, self.output, self.service,
-                    self.behavior, self.logger, self.settings)
-            result.append (c.run ())
-        return result
-
-    def run (self):
-        have = set ()
-        urls = set ([self.url])
-        ret = {'stats': IntegerDict ()}
-
-        while urls:
-            self.logger.info ('retrieving {} urls'.format (len (urls)))
-            result = self.fetch (urls)
-
-            have.update (urls)
-            urls = set ()
-            for r in result:
-                ret['stats'] += r['stats']
-                urls.update (map (removeFragment, r['links']))
-            urls.difference_update (have)
-
-            urls = self.recursionPolicy (urls)
-        # everything in ret must be serializeable
-        ret['stats'] = dict (ret['stats'])
-        return ret
+
+        assert isinstance (entry, SetEntry)
+
+        url = entry.value
+        depth = entry.depth
+        logger = self.logger.bind (url=url)
+
+        def formatCommand (e):
+            # provide means to disable variable expansion
+            if e.startswith ('!'):
+                return e[1:]
+            else:
+                return e.format (url=url, dest=dest.name)
+
+        def formatOutput (p):
+            return p.format (host=url.host,
+                    date=datetime.utcnow ().isoformat (), seqnum=seqnum)
+
+        def logStats ():
+            logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+
+        if url.scheme not in self.SCHEME_WHITELIST:
+            self.stats['ignored'] += 1
+            logStats ()
+            self.logger.warning ('scheme not whitelisted', url=url,
+                    uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
+            return
+
+        dest = tempfile.NamedTemporaryFile (dir=self.tempdir,
+                prefix=os.path.basename (self.output) + '-', suffix='.warc.gz',
+                delete=False)
+        command = list (map (formatCommand, self.command))
+        logger.info ('fetch', uuid='d1288fbe-8bae-42c8-af8c-f2fa8b41794f',
+                command=command)
+        try:
+            process = await asyncio.create_subprocess_exec (*command,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.DEVNULL,
+                    stdin=asyncio.subprocess.DEVNULL,
+                    start_new_session=True, limit=100*1024*1024)
+            while True:
+                data = await process.stdout.readline ()
+                if not data:
+                    break
+                data = json.loads (data)
+                uuid = data.get ('uuid')
+                if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
+                    links = set (self.policy (map (lambda x: SetEntry (URL(x).with_fragment(None), depth=depth+1), data.get ('links', []))))
+                    links.difference_update (self.have)
+                    self.pending.update (links)
+                elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
+                    for k in self.stats.keys ():
+                        self.stats[k] += data.get (k, 0)
+                    logStats ()
        except asyncio.CancelledError:
+            # graceful cancellation
+            process.send_signal (signal.SIGINT)
+        except Exception as e:
+            process.kill ()
+            raise e
+        finally:
+            code = await process.wait()
+            if code == 0:
+                if self.copyLock is None:
+                    # atomically move once finished
+                    lastDestpath = None
+                    while True:
+                        # XXX: must generate a new name every time, otherwise
+                        # this loop never terminates
+                        destpath = formatOutput (self.output)
+                        assert destpath != lastDestpath
+                        lastDestpath = destpath
+
+                        # python does not have rename(…, …, RENAME_NOREPLACE),
+                        # but this is safe nonetheless, since we’re
+                        # single-threaded
+                        if not os.path.exists (destpath):
+                            # create the directory, so templates like
+                            # /{host}/{date}/… are possible
+                            os.makedirs (os.path.dirname (destpath), exist_ok=True)
+                            os.rename (dest.name, destpath)
+                            break
+                else:
+                    # atomically (in the context of this process) append to
+                    # existing file
+                    async with self.copyLock:
+                        with open (dest.name, 'rb') as infd, \
+                                open (self.output, 'ab') as outfd:
+                            shutil.copyfileobj (infd, outfd)
+
os.unlink (dest.name) + else: + self.stats['crashed'] += 1 + logStats () + + async def run (self): + def log (): + # self.have includes running jobs + self.logger.info ('recursing', + uuid='5b8498e4-868d-413c-a67e-004516b8452c', + pending=len (self.pending), + have=len (self.have)-len(self.running), + running=len (self.running)) + + seqnum = 1 + try: + self.have = set () + self.pending = set ([SetEntry (self.url, depth=0)]) + + while self.pending: + # since pending is a set this picks a random item, which is fine + u = self.pending.pop () + self.have.add (u) + t = asyncio.ensure_future (self.fetch (u, seqnum)) + self.running.add (t) + seqnum += 1 + + log () + + if len (self.running) >= self.concurrency or not self.pending: + done, pending = await asyncio.wait (self.running, + return_when=asyncio.FIRST_COMPLETED) + self.running.difference_update (done) + # propagate exceptions + for r in done: + r.result () + except asyncio.CancelledError: + self.logger.info ('cancel', + uuid='d58154c8-ec27-40f2-ab9e-e25c1b21cd88', + pending=len (self.pending), + have=len (self.have)-len (self.running), + running=len (self.running)) + finally: + done = await asyncio.gather (*self.running, + return_exceptions=True) + # propagate exceptions + for r in done: + if isinstance (r, Exception): + raise r + self.running = set () + log () diff --git a/crocoite/data/click.js b/crocoite/data/click.js index c51a690..ae189da 100644 --- a/crocoite/data/click.js +++ b/crocoite/data/click.js @@ -4,109 +4,7 @@ * like navigating to a different location. Thus whitelist known elements. */ -(function(){ -const selectorFlag = Object.freeze ({ - none: 0, - multi: 1, /* click item multiple times */ -}); -const defaultClickThrottle = 50; /* in ms */ -const discoverInterval = 1000; /* 1 second */ -const sites = Object.freeze ([ - { - hostname: /^www\.facebook\.com$/i, - selector: [ - /* show more comments */ - {s: 'a.UFIPagerLink[role=button]', flags: selectorFlag.none}, - /* show nested comments*/ - {s: 'a.UFICommentLink[role=button]', flags: selectorFlag.none}, - ], - }, { - hostname: /^twitter\.com$/i, - selector: [ - /* expand threads */ - {s: 'a.ThreadedConversation-moreRepliesLink', flags: selectorFlag.none}, - /* show hidden profiles */ - {s: 'button.ProfileWarningTimeline-button', flags: selectorFlag.none}, - /* show hidden/sensitive media */ - {s: 'button.Tombstone-action.js-display-this-media', flags: selectorFlag.none}, - ], - }, { - hostname: /^disqus\.com$/i, - selector: [ - /* load more comments */ - {s: 'a.load-more__button', flags: selectorFlag.multi}, - ], - }, { - hostname: /^(www|np)\.reddit\.com$/i, - selector: [ - /* show more comments, reddit’s javascript ignores events if too - * frequent */ - {s: 'span.morecomments a', flags: selectorFlag.none, throttle: 500}, - ], - }, { - hostname: /^www\.instagram\.com$/i, - selector: [ - /* posts may have multiple images that load dynamically, click the arrow */ - {s: 'a[role=button].coreSpriteRightChevron', flags: selectorFlag.multi, throttle: 500}, - /* load more comments */ - {s: 'article div ul li a[role=button]', flags: selectorFlag.multi}, - ], - }, { - hostname: /^www\.youtube\.com$/i, - selector: [ - /* expand comment thread */ - {s: 'ytd-comment-thread-renderer div.more-button', flags: selectorFlag.none}, - ], - }, { - hostname: /^www\.patreon\.com$/i, - selector: [ - /* load more content */ - {s: 'div[display=flex] div[display=block] button[color=gray][type=button]', flags: selectorFlag.multi}, - /* load more comments */ - {s: 'div.stackable[display=block] > div 
> div > a[color=dark][target=_self]', flags: selectorFlag.none}, - /* load more replies */ - {s: 'div > a[scale="0"][color=blue][size="1"]', flags: selectorFlag.none}, - ], - } - ]); - -/* pick selectors matching current location */ -let hostname = document.location.hostname; -let selector = []; -for (let s of sites) { - if (s.hostname.test (hostname)) { - selector = selector.concat (s.selector); - } -} - -function makeClickEvent () { - return new MouseEvent('click', { - view: window, - bubbles: true, - cancelable: true - }); -} - -/* throttle clicking */ -let queue = []; -let clickTimeout = null; -function click () { - if (queue.length > 0) { - const item = queue.shift (); - const o = item.o; - const selector = item.selector; - o.dispatchEvent (makeClickEvent ()); - - if (queue.length > 0) { - const nextTimeout = 'throttle' in selector ? - selector.throttle : defaultClickThrottle; - clickTimeout = window.setTimeout (click, nextTimeout); - } else { - clickTimeout = null; - } - } -} - +(function() { /* Element is visible if itself and all of its parents are */ function isVisible (o) { @@ -128,28 +26,82 @@ function isClickable (o) { return !o.hasAttribute ('disabled') && isVisible (o); } -/* some sites don’t remove/replace the element immediately, so keep track of - * which ones we already clicked */ -let have = new Set (); -function discover () { - for (let s of selector) { - let obj = document.querySelectorAll (s.s); - for (let o of obj) { - if (!have.has (o) && isClickable (o)) { - queue.push ({o: o, selector: s}); - if (!(s.flags & selectorFlag.multi)) { - have.add (o); +const defaultClickThrottle = 50; /* in ms */ +const discoverInterval = 1000; /* 1 second */ + +class Click { + constructor(options) { + /* pick selectors matching current location */ + let hostname = document.location.hostname; + this.selector = []; + for (let s of options['sites']) { + let r = new RegExp (s.match, 'i'); + if (r.test (hostname)) { + this.selector = this.selector.concat (s.selector); + } + } + /* throttle clicking */ + this.queue = []; + this.clickTimeout = null; + + /* some sites don’t remove/replace the element immediately, so keep track of + * which ones we already clicked */ + this.have = new Set (); + + /* XXX: can we use a mutation observer instead? */ + this.interval = window.setInterval (this.discover.bind (this), discoverInterval); + } + + makeClickEvent () { + return new MouseEvent('click', { + view: window, + bubbles: true, + cancelable: true + }); + } + + click () { + if (this.queue.length > 0) { + const item = this.queue.shift (); + const o = item.o; + const selector = item.selector; + o.dispatchEvent (this.makeClickEvent ()); + + if (this.queue.length > 0) { + const nextTimeout = 'throttle' in selector ? 
+ selector.throttle : defaultClickThrottle; + this.clickTimeout = window.setTimeout (this.click.bind (this), nextTimeout); + } else { + this.clickTimeout = null; + } + } + } + + discover () { + for (let s of this.selector) { + let obj = document.querySelectorAll (s.selector); + for (let o of obj) { + if (!this.have.has (o) && isClickable (o)) { + this.queue.push ({o: o, selector: s}); + if (!s.multi) { + this.have.add (o); + } } } } + if (this.queue.length > 0 && this.clickTimeout === null) { + /* start clicking immediately */ + this.clickTimeout = window.setTimeout (this.click.bind (this), 0); + } + return true; } - if (queue.length > 0 && clickTimeout === null) { - /* start clicking immediately */ - clickTimeout = window.setTimeout (click, 0); + + + stop () { + window.clearInterval (this.interval); + window.clearTimeout (this.clickTimeout); } - return true; } -/* XXX: can we use a mutation observer instead? */ -window.setInterval (discover, discoverInterval); -}()); +return Click; +}()) diff --git a/crocoite/data/click.yaml b/crocoite/data/click.yaml new file mode 100644 index 0000000..78278b9 --- /dev/null +++ b/crocoite/data/click.yaml @@ -0,0 +1,117 @@ +# Configuration for behavior.py:Click +# Example URLs are random. Believe me. +match: ^www\.facebook\.com$ +selector: + - description: Show comments and replies/nested comments on user pages. + selector: form[action="/ajax/ufi/modify.php"] a[data-testid^="UFI2CommentsPagerRenderer/pager_depth_"] + urls: ["https://www.facebook.com/tagesschau"] + - description: Initially show comments below a single post/video, i.e. /user/post/123. + selector: form[action="/ajax/ufi/modify.php"] a[data-testid="UFI2CommentsCount/root"] + urls: ["https://www.facebook.com/tagesschau/posts/10157061068659407"] + - description: Close the “register now” nag screen. For screenshots. + selector: a#expanding_cta_close_button[role=button] + urls: ["https://www.facebook.com/tagesschau"] +--- +match: ^twitter\.com$ +selector: + - description: Expand threads. + selector: a.ThreadedConversation-moreRepliesLink + urls: ["https://twitter.com/realDonaldTrump/status/1068826073775964160"] + - description: Show hidden profiles. + selector: button.ProfileWarningTimeline-button + urls: ["https://twitter.com/CookieCyboid"] + - description: Show hidden/sensitive media. For screen-/snapshots. + selector: button.Tombstone-action.js-display-this-media + urls: ["https://twitter.com/CookieCyboid/status/1070807283305713665"] + - description: Show more replies. + selector: button.ThreadedConversation-showMoreThreadsButton + urls: ["https://twitter.com/fuglydug/status/1172160128101076995"] +--- +match: ^disqus\.com$ +selector: + - description: Load more comments. + selector: a.load-more__button + multi: True +--- +# new layout +match: ^www\.reddit\.com$ +selector: + - description: Show more comments. + selector: div[id^=moreComments-] > div > p + # reddit’s javascript ignores events if too frequent + throttle: 500 + urls: ["https://www.reddit.com/r/subredditcancer/comments/b2b80f/we_are_moderators_of_rwatchpeopledie_amaa_just/"] +--- +# old layout +match: ^(old|np)\.reddit\.com$ +selector: + - description: Show more comments. + selector: span.morecomments a + # reddit’s javascript ignores events if too frequent + throttle: 500 + urls: ["https://old.reddit.com/r/subredditcancer/comments/b2b80f/we_are_moderators_of_rwatchpeopledie_amaa_just/"] +--- +match: ^www\.youtube\.com$ +selector: + - description: Expand single comment. 
+ selector: ytd-comment-thread-renderer span[slot=more-button] + urls: ["https://www.youtube.com/watch?v=udtFqQuBFSc"] + - description: Show more comment thread replies. + selector: div.ytd-comment-replies-renderer > yt-next-continuation > paper-button + urls: ["https://www.youtube.com/watch?v=Lov0T3eXI2k"] + multi: True +--- +match: ^www\.patreon\.com$ +selector: + - description: Load more comments. + selector: div[data-tag=post-card] button[data-tag=loadMoreCommentsCta] + urls: ["https://www.patreon.com/posts/what-im-on-22124040"] +--- +match: ^(www\.)?gab\.com$ +selector: + - description: Load more posts. + selector: div.item-list[role=feed] button.load-more + multi: True + urls: ["https://gab.com/gab"] +--- +match: ^(www\.)?github\.com$ +selector: + - description: Show hidden issue items. + urls: ["https://github.com/dominictarr/event-stream/issues/116"] + selector: div#discussion_bucket form.ajax-pagination-form button.ajax-pagination-btn +--- +match: ^www\.gamasutra\.com$ +selector: + - description: Load more comments. + urls: ["http://www.gamasutra.com/blogs/RaminShokrizade/20130626/194933/The_Top_F2P_Monetization_Tricks.php"] + selector: div#dynamiccomments div.viewTopCmts a +--- +match: ^(www\.)?steamcommunity\.com$ +selector: + - description: Load more content. + urls: ["https://steamcommunity.com/app/252950/reviews/?p=1&browsefilter=toprated&filterLanguage=all"] + selector: "#GetMoreContentBtn a" + multi: True +--- +match: ^imgur\.com$ +selector: + - description: Load more images of an album. + urls: ["https://imgur.com/a/JG1yc"] + selector: div.js-post-truncated a.post-loadall + - description: Expand all comments. For snapshots. + urls: ["https://imgur.com/a/JG1yc"] + selector: div.comments-info span.comments-expand + - description: Show bad replies. for snapshots. + urls: ["https://imgur.com/gallery/jRzMfRG"] + selector: div#comments div.bad-captions a.link +--- +match: ^(www\.)?vimeo\.com$ +selector: + - description: Load more videos on profile page. + urls: ["https://vimeo.com/dsam4a"] + selector: div.profile_main div.profile-load-more__button--wrapper button +# XXX: this works when using a non-headless browser, but does not otherwise +# - description: Expand video comments +# urls: ["https://vimeo.com/22439234"] +# selector: section#comments button.iris_comment-more +# multi: True diff --git a/crocoite/data/cookies.txt b/crocoite/data/cookies.txt new file mode 100644 index 0000000..6ac62c3 --- /dev/null +++ b/crocoite/data/cookies.txt @@ -0,0 +1,9 @@ +# Default cookies for crocoite. This file does *not* use Netscape’s cookie +# file format. Lines are expected to be in Set-Cookie format. +# And this line is a comment. + +# Reddit: +# skip over 18 prompt +over18=1; Domain=www.reddit.com +# skip quarantined subreddit prompt +_options={%22pref_quarantine_optin%22:true}; Domain=www.reddit.com diff --git a/crocoite/data/extract-links.js b/crocoite/data/extract-links.js index 4d1a3d0..5a4f9f0 100644 --- a/crocoite/data/extract-links.js +++ b/crocoite/data/extract-links.js @@ -25,11 +25,26 @@ function isClickable (o) { } /* --- end copy&paste */ -let x = document.body.querySelectorAll('a[href]'); let ret = []; +['a[href]', 'area[href]'].forEach (function (s) { + let x = document.querySelectorAll(s); + for (let i=0; i < x.length; i++) { + if (isClickable (x[i])) { + ret.push (x[i].href); + } + } +}); + +/* If Chrome loads plain-text documents it’ll wrap them into <pre>. Check those + * for links as well, assuming the whole line is a link (i.e. list of links). 
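The ``<pre>`` heuristic above, restated in Python for clarity (a hypothetical stand-alone version, not part of the patch): every line that looks like an absolute http(s) URL is treated as a link.

.. code:: python

    import re

    def linksFromPre (text):
        # one candidate per line, keep only absolute http(s) URLs
        return [line for line in text.split ('\n')
                if re.match (r'^https?://', line)]

    assert linksFromPre ('http://example.com/a\nnot a link') == \
            ['http://example.com/a']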
*/
+let x = document.querySelectorAll ('body > pre');
 for (let i=0; i < x.length; i++) {
-	if (isClickable (x[i])) {
-		ret.push (x[i].href);
+	if (isVisible (x[i])) {
+		x[i].innerText.split ('\n').forEach (function (s) {
+			if (s.match ('^https?://')) {
+				ret.push (s);
+			}
+		});
 	}
 }
 return ret; /* immediately return results, for use with Runtime.evaluate() */
diff --git a/crocoite/data/screenshot.js b/crocoite/data/screenshot.js
new file mode 100644
index 0000000..a9a41e1
--- /dev/null
+++ b/crocoite/data/screenshot.js
@@ -0,0 +1,20 @@
+/* Find scrollable full-screen elements and return their actual size
+ */
+(function () {
+/* limit the number of elements queried */
+let elem = document.querySelectorAll ('body > div');
+let ret = [];
+for (let i = 0; i < elem.length; i++) {
+	let e = elem[i];
+	let s = window.getComputedStyle (e);
+	if (s.getPropertyValue ('position') == 'fixed' &&
+			s.getPropertyValue ('overflow') == 'auto' &&
+			s.getPropertyValue ('left') == '0px' &&
+			s.getPropertyValue ('right') == '0px' &&
+			s.getPropertyValue ('top') == '0px' &&
+			s.getPropertyValue ('bottom') == '0px') {
+		ret.push (e.scrollHeight);
+	}
+}
+return ret; /* immediately return results, for use with Runtime.evaluate() */
+})();
diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js
index 13e856d..be88edf 100644
--- a/crocoite/data/scroll.js
+++ b/crocoite/data/scroll.js
@@ -1,23 +1,38 @@
 /* Continuously scrolls the page */
-var __crocoite_stop__ = false;
 (function(){
-function scroll (event) {
-	if (__crocoite_stop__) {
-		return false;
-	} else {
+class Scroll {
+	constructor (options) {
+		this.scrolled = new Map ();
+		this.interval = window.setInterval (this.scroll.bind (this), 200);
+	}
+
+	stop() {
+		window.clearInterval (this.interval);
+		window.scrollTo (0, 0);
+		this.scrolled.forEach (function (value, key, map) {
+			key.scrollTop = value;
+		});
+	}
+	/* save initial scroll state */
+	save(obj) {
+		if (!this.scrolled.has (obj)) {
+			this.scrolled.set (obj, obj.scrollTop);
+		}
+	}
+	/* perform a single scroll step */
+	scroll (event) {
 		window.scrollBy (0, window.innerHeight/2);
-		document.querySelectorAll ('*').forEach (
+		document.querySelectorAll ('html body *').forEach (
 			function (d) {
-				if (d.clientHeight < d.scrollHeight) {
+				if (d.scrollHeight-d.scrollTop > d.clientHeight) {
+					this.save (d);
 					d.scrollBy (0, d.clientHeight/2);
 				}
-			});
+			}.bind (this));
 		return true;
 	}
 }
-function onload (event) {
-	window.setInterval (scroll, 200);
-}
-document.addEventListener("DOMContentLoaded", onload);
-}());
+
+return Scroll;
+}())
diff --git a/crocoite/devtools.py b/crocoite/devtools.py
new file mode 100644
index 0000000..8b5c69d
--- /dev/null
+++ b/crocoite/devtools.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Communication with Google Chrome through its DevTools protocol. +""" + +import json, asyncio, logging, os +from tempfile import mkdtemp +import shutil +from http.cookies import Morsel + +import aiohttp, websockets +from yarl import URL + +from .util import StrJsonEncoder + +logger = logging.getLogger (__name__) + +class Browser: + """ + Communicate with Google Chrome through its DevTools protocol. + + Asynchronous context manager that creates a new Tab when entering. + Destroyed upon exit. + """ + + __slots__ = ('session', 'url', 'tab') + + def __init__ (self, url): + self.url = URL (url) + self.session = None + self.tab = None + + async def __aiter__ (self): + """ List all tabs """ + async with aiohttp.ClientSession () as session: + async with session.get (self.url.with_path ('/json/list')) as r: + resp = await r.json () + for tab in resp: + if tab['type'] == 'page': + yield tab + + async def __aenter__ (self): + """ Create tab """ + assert self.tab is None + assert self.session is None + self.session = aiohttp.ClientSession () + async with self.session.get (self.url.with_path ('/json/new')) as r: + resp = await r.json () + self.tab = await Tab.create (**resp) + return self.tab + + async def __aexit__ (self, excType, excValue, traceback): + assert self.tab is not None + assert self.session is not None + + await self.tab.close () + + try: + async with self.session.get (self.url.with_path (f'/json/close/{self.tab.id}')) as r: + resp = await r.text () + assert resp == 'Target is closing' + except aiohttp.client_exceptions.ClientConnectorError: + # oh boy, the whole browser crashed instead + if excType is Crashed: + # exception is reraised by `return False` + pass + else: + # this one is more important + raise + + self.tab = None + await self.session.close () + self.session = None + + return False + +class TabFunction: + """ + Helper class for infinite-depth tab functions. + + A method usually consists of namespace (Page, Network, …) and function name + (getFoobar) separated by a dot. This class creates these function names + while providing an intuitive Python interface (tab.Network.getFoobar). + + This was inspired by pychrome. + """ + + __slots__ = ('name', 'tab') + + def __init__ (self, name, tab): + self.name = name + self.tab = tab + + def __eq__ (self, b): + assert isinstance (b, TabFunction) + return self.name == b.name + + def __hash__ (self): + return hash (self.name) + + def __getattr__ (self, k): + return TabFunction (f'{self.name}.{k}', self.tab) + + async def __call__ (self, **kwargs): + return await self.tab (self.name, **kwargs) + + def __repr__ (self): + return f'<TabFunction {self.name}>' + +class TabException (Exception): + pass + +class Crashed (TabException): + pass + +class MethodNotFound (TabException): + pass + +class InvalidParameter (TabException): + pass + +# map error codes to native exceptions +errorMap = {-32601: MethodNotFound, -32602: InvalidParameter} + +class Tab: + """ + Communicate with a single Google Chrome browser tab. 
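+
+    A hypothetical usage sketch (Page.enable and Page.navigate are standard
+    DevTools protocol methods; attribute chains resolve through TabFunction
+    above):
+
+        await tab.Page.enable ()
+        await tab.Page.navigate (url='https://example.com/')
+        ev, params = await tab.get () # next queued protocol event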
+ """ + __slots__ = ('id', 'wsUrl', 'ws', 'msgid', 'transactions', 'queue', '_recvHandle', 'crashed') + + def __init__ (self, tabid, ws): + """ Do not use this method, use Browser context manager. """ + self.id = tabid + self.ws = ws + self.msgid = 1 + self.crashed = False + self.transactions = {} + self.queue = asyncio.Queue () + + def __getattr__ (self, k): + return TabFunction (k, self) + + async def __call__ (self, method, **kwargs): + """ + Actually call browser method with kwargs + """ + + if self.crashed or self._recvHandle.done (): + raise Crashed () + + msgid = self.msgid + self.msgid += 1 + message = {'method': method, 'params': kwargs, 'id': msgid} + t = self.transactions[msgid] = {'event': asyncio.Event (), 'result': None} + logger.debug (f'← {message}') + await self.ws.send (json.dumps (message, cls=StrJsonEncoder)) + await t['event'].wait () + ret = t['result'] + del self.transactions[msgid] + if isinstance (ret, Exception): + raise ret + return ret + + async def _recvProcess (self): + """ + Receive process that dispatches received websocket frames + + These are either events which will be put into a queue or request + responses which unblock a __call__. + """ + + async def markCrashed (reason): + # all pending requests can be considered failed since the + # browser state is lost + for v in self.transactions.values (): + v['result'] = Crashed (reason) + v['event'].set () + # and all future requests will fail as well until reloaded + self.crashed = True + await self.queue.put (Crashed (reason)) + + while True: + try: + msg = await self.ws.recv () + msg = json.loads (msg) + except Exception as e: + # right now we cannot recover from this + await markCrashed (e) + break + logger.debug (f'→ {msg}') + if 'id' in msg: + msgid = msg['id'] + t = self.transactions.get (msgid, None) + if t is not None: + if 'error' in msg: + e = msg['error'] + t['result'] = errorMap.get (e['code'], TabException) (e['code'], e['message']) + else: + t['result'] = msg['result'] + t['event'].set () + else: + # ignore stale result + pass # pragma: no cover + elif 'method' in msg: + # special treatment + if msg['method'] == 'Inspector.targetCrashed': + await markCrashed ('target') + else: + await self.queue.put (msg) + else: + assert False # pragma: no cover + + async def run (self): + self._recvHandle = asyncio.ensure_future (self._recvProcess ()) + + async def close (self): + self._recvHandle.cancel () + await self.ws.close () + # no join, throw away the queue. There will be nobody listening on the + # other end. + #await self.queue.join () + + @property + def pending (self): + return self.queue.qsize () + + async def get (self): + def getattrRecursive (obj, name): + if '.' 
in name:
+                n, ext = name.split ('.', 1)
+                return getattrRecursive (getattr (obj, n), ext)
+            return getattr (obj, name)
+
+        if self.crashed:
+            raise Crashed ()
+
+        ret = await self.queue.get ()
+        if isinstance (ret, Exception):
+            raise ret
+        return getattrRecursive (self, ret['method']), ret['params']
+
+    @classmethod
+    async def create (cls, **kwargs):
+        """ Async init """
+        # increase size limit of a single frame to something ridiculously high,
+        # so we can safely grab screenshots
+        maxSize = 100*1024*1024 # 100 MB
+        # chrome does not like pings and kills the connection, disable them
+        ws = await websockets.connect(kwargs['webSocketDebuggerUrl'],
+                max_size=maxSize, ping_interval=None)
+        ret = cls (kwargs['id'], ws)
+        await ret.run ()
+        return ret
+
+class Process:
+    """ Start Google Chrome listening on a random port """
+
+    __slots__ = ('binary', 'windowSize', 'p', 'userDataDir')
+
+    def __init__ (self, binary='google-chrome-stable', windowSize=(1920, 1080)):
+        self.binary = binary
+        self.windowSize = windowSize
+        self.p = None
+
+    async def __aenter__ (self):
+        assert self.p is None
+        self.userDataDir = mkdtemp (prefix=__package__ + '-chrome-userdata-')
+        # see https://github.com/GoogleChrome/chrome-launcher/blob/master/docs/chrome-flags-for-tools.md
+        args = [self.binary,
+                '--window-size={},{}'.format (*self.windowSize),
+                f'--user-data-dir={self.userDataDir}', # use temporary user dir
+                '--no-default-browser-check',
+                '--no-first-run', # don’t show first run screen
+                '--disable-breakpad', # no error reports
+                '--disable-extensions',
+                '--disable-infobars',
+                '--disable-notifications', # no libnotify
+                '--disable-background-networking', # disable background services (updating, safe browsing, …)
+                '--safebrowsing-disable-auto-update',
+                '--disable-sync', # no google account syncing
+                '--metrics-recording-only', # do not submit metrics
+                '--disable-default-apps',
+                '--disable-background-timer-throttling',
+                '--disable-client-side-phishing-detection',
+                '--disable-popup-blocking',
+                '--disable-prompt-on-repost',
+                '--enable-automation', # enable various automation-related things
+                '--password-store=basic',
+                '--headless',
+                '--disable-gpu',
+                '--hide-scrollbars', # hide scrollbars on screenshots
+                '--mute-audio', # don’t play any audio
+                '--remote-debugging-port=0', # pick a port. XXX: we may want to use --remote-debugging-pipe instead
+                '--homepage=about:blank',
+                'about:blank']
+        # start new session, so ^C does not affect subprocess
+        self.p = await asyncio.create_subprocess_exec (*args,
+                stdout=asyncio.subprocess.DEVNULL,
+                stderr=asyncio.subprocess.DEVNULL,
+                stdin=asyncio.subprocess.DEVNULL,
+                start_new_session=True)
+        port = None
+        # chrome writes its current active devtools port to a file. due to the
+        # sleep() this is rather ugly, but should work with all versions of the
+        # browser.
+        for i in range (100):
+            try:
+                with open (os.path.join (self.userDataDir, 'DevToolsActivePort'), 'r') as fd:
+                    port = int (fd.readline ().strip ())
+                break
+            except FileNotFoundError:
+                await asyncio.sleep (0.2)
+        if port is None:
+            raise Exception ('Chrome died on us.')
+
+        return URL.build(scheme='http', host='localhost', port=port)
+
+    async def __aexit__ (self, *exc):
+        try:
+            self.p.terminate ()
+            await self.p.wait ()
+        except ProcessLookupError:
+            # ok, fine, dead already
+            pass
+
+        # Try to delete the temporary directory multiple times. It looks like
+        # Chrome will change files in there even after it exited (i.e. .wait()
+        # returned). Very strange.
+ for i in range (5): + try: + shutil.rmtree (self.userDataDir) + break + except: + await asyncio.sleep (0.2) + + self.p = None + return False + +class Passthrough: + __slots__ = ('url', ) + + def __init__ (self, url): + self.url = URL (url) + + async def __aenter__ (self): + return self.url + + async def __aexit__ (self, *exc): + return False + +def toCookieParam (m): + """ + Convert Python’s http.cookies.Morsel to Chrome’s CookieParam, see + https://chromedevtools.github.io/devtools-protocol/1-3/Network#type-CookieParam + """ + + assert isinstance (m, Morsel) + + out = {'name': m.key, 'value': m.value} + + # unsupported by chrome + for k in ('max-age', 'comment', 'version'): + if m[k]: + raise ValueError (f'Unsupported cookie attribute {k} set, cannot convert') + + for mname, cname in [('expires', None), ('path', None), ('domain', None), ('secure', None), ('httponly', 'httpOnly')]: + value = m[mname] + if value: + cname = cname or mname + out[cname] = value + + return out + diff --git a/crocoite/html.py b/crocoite/html.py index f891101..30f6ca5 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -22,6 +22,10 @@ HTML helper """ +from html5lib.treewalkers.base import TreeWalker +from html5lib.filters.base import Filter +from html5lib import constants + # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements voidTags = {'area', 'base', @@ -103,10 +107,7 @@ eventAttributes = {'onabort', 'onvolumechange', 'onwaiting'} -from html5lib.treewalkers.base import TreeWalker -from html5lib.filters.base import Filter -from html5lib.serializer import HTMLSerializer -from html5lib import constants +default_namespace = constants.namespaces["html"] class ChromeTreeWalker (TreeWalker): """ @@ -123,11 +124,14 @@ class ChromeTreeWalker (TreeWalker): elif name == '#document': for child in node.get ('children', []): yield from self.recurse (child) + elif name == '#cdata-section': + # html5lib cannot generate cdata, so we’re faking it by using + # an empty tag + yield from self.emptyTag (default_namespace, + '![CDATA[' + node['nodeValue'] + ']]', {}) else: - assert False, name + assert False, (name, node) else: - default_namespace = constants.namespaces["html"] - attributes = node.get ('attributes', []) convertedAttr = {} for i in range (0, len (attributes), 2): @@ -195,7 +199,6 @@ class StripAttributeFilter (Filter): self.attributes = set (map (str.lower, attributes)) def __iter__(self): - default_namespace = constants.namespaces["html"] for token in Filter.__iter__(self): data = token.get ('data') if data and token['type'] in {'StartTag', 'EmptyTag'}: diff --git a/crocoite/irc.py b/crocoite/irc.py new file mode 100644 index 0000000..d9c0634 --- /dev/null +++ b/crocoite/irc.py @@ -0,0 +1,671 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+IRC bot “chromebot”
+"""
+
+import asyncio, argparse, json, tempfile, time, random, os, shlex
+from datetime import datetime
+from urllib.parse import urlsplit
+from enum import IntEnum, unique
+from collections import defaultdict
+from abc import abstractmethod
+from functools import wraps
+import bottom
+import websockets
+
+from .util import StrJsonEncoder
+from .cli import cookie
+
+### helper functions ###
+def prettyTimeDelta (seconds):
+    """
+    Pretty-print seconds to human readable string 1d 1h 1m 1s
+    """
+    seconds = int(seconds)
+    days, seconds = divmod(seconds, 86400)
+    hours, seconds = divmod(seconds, 3600)
+    minutes, seconds = divmod(seconds, 60)
+    s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')]
+    s = filter (lambda x: x[0] != 0, s)
+    return ' '.join (map (lambda x: '{}{}'.format (*x), s))
+
+def prettyBytes (b):
+    """
+    Pretty-print bytes
+    """
+    prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB']
+    while b >= 1024 and len (prefixes) > 1:
+        b /= 1024
+        prefixes.pop (0)
+    return f'{b:.1f} {prefixes[0]}'
+
+def isValidUrl (s):
+    url = urlsplit (s)
+    if url.scheme and url.netloc and url.scheme in {'http', 'https'}:
+        return s
+    raise TypeError ()
+
+class NonExitingArgumentParser (argparse.ArgumentParser):
+    """ Argument parser that does not call exit(), suitable for interactive use """
+
+    def exit (self, status=0, message=None):
+        # should never be called
+        pass
+
+    def error (self, message):
+        # if we use subparsers it’s important to pass self, so we can show
+        # the correct help
+        raise Exception (self, message)
+
+    def format_usage (self):
+        return super().format_usage ().replace ('\n', ' ')
+
+class Status(IntEnum):
+    """ Job status """
+    undefined = 0
+    pending = 1
+    running = 2
+    aborted = 3
+    finished = 4
+
+# see https://arxiv.org/html/0901.4016 on how to build proquints (human-
+# pronounceable unique ids)
+toConsonant = 'bdfghjklmnprstvz'
+toVowel = 'aiou'
+
+def u16ToQuint (v):
+    """ Transform a 16 bit unsigned integer into a single quint """
+    assert 0 <= v < 2**16
+    # quints are “big-endian”
+    return ''.join ([
+        toConsonant[(v>>(4+2+4+2))&0xf],
+        toVowel[(v>>(4+2+4))&0x3],
+        toConsonant[(v>>(4+2))&0xf],
+        toVowel[(v>>4)&0x3],
+        toConsonant[(v>>0)&0xf],
+        ])
+
+def uintToQuint (v, length=2):
+    """ Turn any integer into a proquint with fixed length """
+    assert 0 <= v < 2**(length*16)
+
+    return '-'.join (reversed ([u16ToQuint ((v>>(x*16))&0xffff) for x in range (length)]))
+
+def makeJobId ():
+    """ Create job id from time and randomness source """
+    # allocate 48 bits for the time (in milliseconds) and add 16 random bits
+    # at the end (just to be sure) for a total of 64 bits. Should be enough to
+    # avoid collisions.
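+    # For reference: uintToQuint (0x7f000001, 2) yields 'lusab-babad', the
+    # proquint paper’s example encoding of 127.0.0.1; length 4 below covers
+    # the full 64 bit stamp.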
+    randbits = 16
+    stamp = (int (time.time ()*1000) << randbits) | random.randint (0, 2**randbits-1)
+    return uintToQuint (stamp, 4)
+
+class Job:
+    """ Archival job """
+
+    __slots__ = ('id', 'stats', 'rstats', 'started', 'finished', 'nick', 'status', 'process', 'url')
+
+    def __init__ (self, url, nick):
+        self.id = makeJobId ()
+        self.stats = {}
+        self.rstats = {}
+        self.started = datetime.utcnow ()
+        self.finished = None
+        self.url = url
+        # user who scheduled this job
+        self.nick = nick
+        self.status = Status.pending
+        self.process = None
+
+    def formatStatus (self):
+        stats = self.stats
+        rstats = self.rstats
+        return (f"{self.url} ({self.id}) {self.status.name}. "
+                f"{rstats.get ('have', 0)} pages finished, "
+                f"{rstats.get ('pending', 0)} pending; "
+                f"{stats.get ('crashed', 0)} crashed, "
+                f"{stats.get ('requests', 0)} requests, "
+                f"{stats.get ('failed', 0)} failed, "
+                f"{prettyBytes (stats.get ('bytesRcv', 0))} received.")
+
+@unique
+class NickMode(IntEnum):
+    # the actual numbers don’t matter, but their order must be strictly
+    # increasing (with privilege level)
+    operator = 100
+    voice = 10
+
+    @classmethod
+    def fromMode (cls, mode):
+        return {'v': cls.voice, 'o': cls.operator}[mode]
+
+    @classmethod
+    def fromNickPrefix (cls, mode):
+        return {'@': cls.operator, '+': cls.voice}[mode]
+
+    @property
+    def human (self):
+        return {self.operator: 'operator', self.voice: 'voice'}[self]
+
+class User:
+    """ IRC user """
+    __slots__ = ('name', 'modes')
+
+    def __init__ (self, name, modes=None):
+        self.name = name
+        self.modes = modes or set ()
+
+    def __eq__ (self, b):
+        return self.name == b.name
+
+    def __hash__ (self):
+        return hash (self.name)
+
+    def __repr__ (self):
+        return f'<User {self.name} {self.modes}>'
+
+    def hasPriv (self, p):
+        if p is None:
+            return True
+        else:
+            return self.modes and max (self.modes) >= p
+
+    @classmethod
+    def fromName (cls, name):
+        """ Get mode and name from NAMES command """
+        try:
+            modes = {NickMode.fromNickPrefix (name[0])}
+            name = name[1:]
+        except KeyError:
+            modes = set ()
+        return cls (name, modes)
+
+class ReplyContext:
+    __slots__ = ('client', 'target', 'user')
+
+    def __init__ (self, client, target, user):
+        self.client = client
+        self.target = target
+        self.user = user
+
+    def __call__ (self, message):
+        self.client.send ('PRIVMSG', target=self.target,
+                message=f'{self.user.name}: {message}')
+
+class RefCountEvent:
+    """
+    Ref-counted event that triggers if a) armed and b) refcount drops to zero.
+
+    Must be used as a context manager.
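+
+    A minimal usage sketch, mirroring how ArgparseBot uses its _quit event
+    below (names chosen for illustration):
+
+        event = RefCountEvent ()
+        with event: # block the armed trigger while a request runs
+            ... # handle request
+        event.arm () # request shutdown
+        await event.wait () # fires once armed and count is zero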
+ """ + __slots__ = ('count', 'event', 'armed') + + def __init__ (self): + self.armed = False + self.count = 0 + self.event = asyncio.Event () + + def __enter__ (self): + self.count += 1 + self.event.clear () + + def __exit__ (self, exc_type, exc_val, exc_tb): + self.count -= 1 + if self.armed and self.count == 0: + self.event.set () + + async def wait (self): + await self.event.wait () + + def arm (self): + self.armed = True + if self.count == 0: + self.event.set () + +class ArgparseBot (bottom.Client): + """ + Simple IRC bot using argparse + + Tracks user’s modes, reconnects on disconnect + """ + + __slots__ = ('channels', 'nick', 'parser', 'users', '_quit') + + def __init__ (self, host, port, ssl, nick, logger, channels=None, loop=None): + super().__init__ (host=host, port=port, ssl=ssl, loop=loop) + self.channels = channels or [] + self.nick = nick + # map channel -> nick -> user + self.users = defaultdict (dict) + self.logger = logger.bind (context=type (self).__name__) + self.parser = self.getParser () + + # bot does not accept new queries in shutdown mode, unless explicitly + # permitted by the parser + self._quit = RefCountEvent () + + # register bottom event handler + self.on('CLIENT_CONNECT', self.onConnect) + self.on('PING', self.onKeepalive) + self.on('PRIVMSG', self.onMessage) + self.on('CLIENT_DISCONNECT', self.onDisconnect) + self.on('RPL_NAMREPLY', self.onNameReply) + self.on('CHANNELMODE', self.onMode) + self.on('PART', self.onPart) + self.on('JOIN', self.onJoin) + # XXX: we would like to handle KICK, but bottom does not support that at the moment + + @abstractmethod + def getParser (self): + pass + + def cancel (self): + self.logger.info ('cancel', uuid='1eb34aea-a854-4fec-90b2-7f8a3812a9cd') + self._quit.arm () + + async def run (self): + await self.connect () + await self._quit.wait () + self.send ('QUIT', message='Bye.') + await self.disconnect () + + async def onConnect (self, **kwargs): + self.logger.info ('connect', nick=self.nick, uuid='01f7b138-ea53-4609-88e9-61f3eca3e7e7') + + self.send('NICK', nick=self.nick) + self.send('USER', user=self.nick, realname='https://github.com/PromyLOPh/crocoite') + + # Don't try to join channels until the server has + # sent the MOTD, or signaled that there's no MOTD. + done, pending = await asyncio.wait( + [self.wait('RPL_ENDOFMOTD'), self.wait('ERR_NOMOTD')], + loop=self.loop, return_when=asyncio.FIRST_COMPLETED) + + # Cancel whichever waiter's event didn't come in. 
+ for future in pending: + future.cancel() + + for c in self.channels: + self.logger.info ('join', channel=c, uuid='367063a5-9069-4025-907c-65ba88af8593') + self.send ('JOIN', channel=c) + # no need for NAMES here, server sends this automatically + + async def onNameReply (self, channel, users, **kwargs): + # channels may be too big for a single message + addusers = dict (map (lambda x: (x.name, x), map (User.fromName, users))) + if channel not in self.users: + self.users[channel] = addusers + else: + self.users[channel].update (addusers) + + @staticmethod + def parseMode (mode): + """ Parse mode strings like +a, -b, +a-b, -b+a, … """ + action = '+' + ret = [] + for c in mode: + if c in {'+', '-'}: + action = c + else: + ret.append ((action, c)) + return ret + + async def onMode (self, channel, modes, params, **kwargs): + if channel not in self.channels: + return + + for (action, mode), nick in zip (self.parseMode (modes), params): + try: + m = NickMode.fromMode (mode) + u = self.users[channel].get (nick, User (nick)) + if action == '+': + u.modes.add (m) + elif action == '-': + u.modes.remove (m) + except KeyError: + # unknown mode, ignore + pass + + async def onPart (self, nick, channel, **kwargs): + if channel not in self.channels: + return + + try: + self.users[channel].pop (nick) + except KeyError: + # gone already + pass + + async def onJoin (self, nick, channel, **kwargs): + if channel not in self.channels: + return + + self.users[channel][nick] = User (nick) + + async def onKeepalive (self, message, **kwargs): + """ Ping received """ + self.send('PONG', message=message) + + async def onMessage (self, nick, target, message, **kwargs): + """ Message received """ + msgPrefix = self.nick + ':' + if target in self.channels and message.startswith (msgPrefix): + user = self.users[target].get (nick, User (nick)) + reply = ReplyContext (client=self, target=target, user=user) + + # shlex.split supports quoting arguments, which str.split() does not + command = shlex.split (message[len (msgPrefix):]) + try: + args = self.parser.parse_args (command) + except Exception as e: + reply (f'{e.args[1]} -- {e.args[0].format_usage ()}') + return + if not args or not hasattr (args, 'func'): + reply (f'Sorry, I don’t understand {command}') + return + + minPriv = getattr (args, 'minPriv', None) + if self._quit.armed and not getattr (args, 'allowOnShutdown', False): + reply ('Sorry, I’m shutting down and cannot accept your request right now.') + elif not user.hasPriv (minPriv): + reply (f'Sorry, you need the privilege {minPriv.human} to use this command.') + else: + with self._quit: + await args.func (user=user, args=args, reply=reply) + + async def onDisconnect (self, **kwargs): + """ Auto-reconnect """ + self.logger.info ('disconnect', uuid='4c74b2c8-2403-4921-879d-2279ad85db72') + while True: + if not self._quit.armed: + await asyncio.sleep (10, loop=self.loop) + self.logger.info ('reconnect', uuid='c53555cb-e1a4-4b69-b1c9-3320269c19d7') + try: + await self.connect () + finally: + break + +def jobExists (func): + """ Chromebot job exists """ + @wraps (func) + async def inner (self, **kwargs): + # XXX: not sure why it works with **kwargs, but not (user, args, reply) + args = kwargs.get ('args') + reply = kwargs.get ('reply') + j = self.jobs.get (args.id, None) + if not j: + reply (f'Job {args.id} is unknown') + else: + ret = await func (self, job=j, **kwargs) + return ret + return inner + +class Chromebot (ArgparseBot): + __slots__ = ('jobs', 'tempdir', 'destdir', 'processLimit', 'blacklist', 
'needVoice')
+
+    def __init__ (self, host, port, ssl, nick, logger, channels=None,
+            tempdir=None, destdir='.', processLimit=1,
+            blacklist={}, needVoice=False, loop=None):
+        self.needVoice = needVoice
+
+        super().__init__ (host=host, port=port, ssl=ssl, nick=nick,
+                logger=logger, channels=channels, loop=loop)
+
+        self.jobs = {}
+        self.tempdir = tempdir or tempfile.gettempdir()
+        self.destdir = destdir
+        self.processLimit = asyncio.Semaphore (processLimit)
+        self.blacklist = blacklist
+
+    def getParser (self):
+        parser = NonExitingArgumentParser (prog=self.nick + ': ', add_help=False)
+        subparsers = parser.add_subparsers(help='Sub-commands')
+
+        archiveparser = subparsers.add_parser('a', help='Archive a site', add_help=False)
+        archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5))
+        archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0')
+        archiveparser.add_argument('--insecure', '-k',
+                help='Disable certificate checking', action='store_true')
+        # parsing the cookie here, so we can give early feedback without
+        # waiting for the job to crash on invalid arguments.
+        archiveparser.add_argument('--cookie', '-b', type=cookie,
+                help='Add a cookie', action='append', default=[])
+        archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL')
+        archiveparser.set_defaults (func=self.handleArchive,
+                minPriv=NickMode.voice if self.needVoice else None)
+
+        statusparser = subparsers.add_parser ('s', help='Get job status', add_help=False)
+        statusparser.add_argument('id', help='Job id', metavar='UUID')
+        statusparser.set_defaults (func=self.handleStatus, allowOnShutdown=True)
+
+        abortparser = subparsers.add_parser ('r', help='Revoke/abort job', add_help=False)
+        abortparser.add_argument('id', help='Job id', metavar='UUID')
+        abortparser.set_defaults (func=self.handleAbort, allowOnShutdown=True,
+                minPriv=NickMode.voice if self.needVoice else None)
+
+        return parser
+
+    def isBlacklisted (self, url):
+        for k, v in self.blacklist.items():
+            if k.match (url):
+                return v
+        return False
+
+    async def handleArchive (self, user, args, reply):
+        """ Handle the archive command """
+
+        msg = self.isBlacklisted (args.url)
+        if msg:
+            reply (f'{args.url} cannot be queued: {msg}')
+            return
+
+        # make sure the job id is unique. Since ids are time-based we can just
+        # wait. (asyncio.sleep, so the event loop is not blocked meanwhile)
+        while True:
+            j = Job (args.url, user.name)
+            if j.id not in self.jobs:
+                break
+            await asyncio.sleep (0.01)
+        self.jobs[j.id] = j
+
+        logger = self.logger.bind (job=j.id)
+
+        showargs = {
+                'recursive': args.recursive,
+                'concurrency': args.concurrency,
+                }
+        if args.insecure:
+            showargs['insecure'] = args.insecure
+        warcinfo = {'chromebot': {
+                'jobid': j.id,
+                'user': user.name,
+                'queued': j.started,
+                'url': args.url,
+                'recursive': args.recursive,
+                'concurrency': args.concurrency,
+                }}
+        grabCmd = ['crocoite-single']
+        # prefix warcinfo with !, so it won’t get expanded
+        grabCmd.extend (['--warcinfo',
+                '!'
+ json.dumps (warcinfo, cls=StrJsonEncoder)]) + for v in args.cookie: + grabCmd.extend (['--cookie', v.OutputString ()]) + if args.insecure: + grabCmd.append ('--insecure') + grabCmd.extend (['{url}', '{dest}']) + cmdline = ['crocoite', + '--tempdir', self.tempdir, + '--recursion', args.recursive, + '--concurrency', str (args.concurrency), + args.url, + os.path.join (self.destdir, + j.id + '-{host}-{date}-{seqnum}.warc.gz'), + '--'] + grabCmd + + strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) + reply (f'{args.url} has been queued as {j.id} with {strargs}') + logger.info ('queue', user=user.name, url=args.url, cmdline=cmdline, + uuid='36cc34a6-061b-4cc5-84a9-4ab6552c8d75') + + async with self.processLimit: + if j.status == Status.pending: + # job was not aborted + j.process = await asyncio.create_subprocess_exec (*cmdline, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + stdin=asyncio.subprocess.DEVNULL, + start_new_session=True, limit=100*1024*1024) + while True: + data = await j.process.stdout.readline () + if not data: + break + + # job is marked running after the first message is received from it + if j.status == Status.pending: + logger.info ('start', uuid='46e62d60-f498-4ab0-90e1-d08a073b10fb') + j.status = Status.running + + data = json.loads (data) + msgid = data.get ('uuid') + if msgid == '24d92d16-770e-4088-b769-4020e127a7ff': + j.stats = data + elif msgid == '5b8498e4-868d-413c-a67e-004516b8452c': + j.rstats = data + + # forward message, so the dashboard can use it + logger.info ('message', + uuid='5c0f9a11-dcd8-4182-a60f-54f4d3ab3687', + data=data) + code = await j.process.wait () + + if j.status == Status.running: + logger.info ('finish', uuid='7b40ffbb-faab-4224-90ed-cd4febd8f7ec') + j.status = Status.finished + j.finished = datetime.utcnow () + + stats = j.stats + rstats = j.rstats + reply (j.formatStatus ()) + + @jobExists + async def handleStatus (self, user, args, reply, job): + """ Handle status command """ + + rstats = job.rstats + reply (job.formatStatus ()) + + @jobExists + async def handleAbort (self, user, args, reply, job): + """ Handle abort command """ + + if job.status not in {Status.pending, Status.running}: + reply ('This job is not running.') + return + + job.status = Status.aborted + self.logger.info ('abort', job=job.id, user=user.name, + uuid='865b3b3e-a54a-4a56-a545-f38a37bac295') + if job.process and job.process.returncode is None: + job.process.terminate () + +class Dashboard: + __slots__ = ('fd', 'clients', 'loop', 'log', 'maxLog', 'pingInterval', 'pingTimeout') + # these messages will not be forwarded to the browser + ignoreMsgid = { + # connect + '01f7b138-ea53-4609-88e9-61f3eca3e7e7', + # join + '367063a5-9069-4025-907c-65ba88af8593', + # disconnect + '4c74b2c8-2403-4921-879d-2279ad85db72', + # reconnect + 'c53555cb-e1a4-4b69-b1c9-3320269c19d7', + } + + def __init__ (self, fd, loop, maxLog=1000, pingInterval=30, pingTimeout=10): + self.fd = fd + self.clients = set () + self.loop = loop + # log buffer + self.log = [] + self.maxLog = maxLog + self.pingInterval = pingInterval + self.pingTimeout = pingTimeout + + async def client (self, websocket, path): + self.clients.add (websocket) + try: + for l in self.log: + buf = json.dumps (l) + await websocket.send (buf) + while True: + try: + msg = await asyncio.wait_for (websocket.recv(), timeout=self.pingInterval) + except asyncio.TimeoutError: + try: + pong = await websocket.ping() + await asyncio.wait_for (pong, timeout=self.pingTimeout) + except 
asyncio.TimeoutError: + break + except websockets.exceptions.ConnectionClosed: + break + finally: + self.clients.remove (websocket) + + def handleStdin (self): + buf = self.fd.readline () + if not buf: + return + + try: + data = json.loads (buf) + except json.decoder.JSONDecodeError: + # ignore invalid + return + msgid = data['uuid'] + + if msgid in self.ignoreMsgid: + return + + # a few messages may contain sensitive information that we want to hide + if msgid == '36cc34a6-061b-4cc5-84a9-4ab6552c8d75': + # queue + del data['cmdline'] + elif msgid == '5c0f9a11-dcd8-4182-a60f-54f4d3ab3687': + nesteddata = data['data'] + nestedmsgid = nesteddata['uuid'] + if nestedmsgid == 'd1288fbe-8bae-42c8-af8c-f2fa8b41794f': + del nesteddata['command'] + + buf = json.dumps (data) + for c in self.clients: + # XXX can’t await here + asyncio.ensure_future (c.send (buf)) + + self.log.append (data) + while len (self.log) > self.maxLog: + self.log.pop (0) + + def run (self, host='localhost', port=6789): + self.loop.add_reader (self.fd, self.handleStdin) + return websockets.serve(self.client, host, port) + diff --git a/crocoite/logger.py b/crocoite/logger.py new file mode 100644 index 0000000..ac389ca --- /dev/null +++ b/crocoite/logger.py @@ -0,0 +1,132 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Simple logger inspired by structlog. + +It is usually used like this: Classes are passed a logger instance. They bind +context to their name, so identifying the source of messages is easier. Every +log message carries a unique id (uuid) for automated identification as well as +a short human-readable message (msg) and arbitrary payload. 
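+
+A hypothetical sketch of that pattern (all values made up for illustration):
+
+    logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
+    log = logger.bind (context='controller')
+    log.info ('fetch', uuid='66a6b80f-8a34-4d37-bf1e-96bff507e03a', url=url)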
+""" + +import sys, json +from datetime import datetime +from functools import partial +from enum import IntEnum + +from pytz import utc + +from .util import StrJsonEncoder + +class Level(IntEnum): + DEBUG = 0 + INFO = 1 + WARNING = 2 + ERROR = 3 + +class Logger: + def __init__ (self, consumer=None, bindings=None): + self.bindings = bindings or {} + self.consumer = consumer or [] + + def __call__ (self, level, *args, **kwargs): + if not isinstance (level, Level): + level = Level[level.upper ()] + kwargs['level'] = level + if args: + if len (args) == 1: + args, = args + kwargs['msg'] = args + # do not overwrite arguments + for k, v in self.bindings.items (): + if k not in kwargs: + kwargs[k] = v + for c in self.consumer: + kwargs = c (**kwargs) + return kwargs + + def __getattr__ (self, k): + """ Bind all method names to level, so Logger.info, Logger.warning, … work """ + return partial (self.__call__, k) + + def bind (self, **kwargs): + d = self.bindings.copy () + d.update (kwargs) + # consumer is not a copy intentionally, so attaching to the parent + # logger will attach to all children as well + return self.__class__ (consumer=self.consumer, bindings=d) + + def unbind (self, **kwargs): + d = self.bindings.copy () + for k in kwargs.keys (): + del d[k] + return self.__class__ (consumer=self.consumer, bindings=d) + + def connect (self, consumer): + self.consumer.append (consumer) + + def disconnect (self, consumer): + self.consumer.remove (consumer) + +class Consumer: + def __call__ (self, **kwargs): # pragma: no cover + raise NotImplementedError () + +class NullConsumer (Consumer): + def __call__ (self, **kwargs): + return kwargs + +class PrintConsumer (Consumer): + """ + Simple printing consumer + """ + def __call__ (self, **kwargs): + sys.stderr.write (str (kwargs)) + sys.stderr.write ('\n') + sys.stderr.flush () + return kwargs + +class JsonPrintConsumer (Consumer): + def __init__ (self, minLevel=Level.DEBUG): + self.minLevel = minLevel + + def __call__ (self, **kwargs): + if kwargs['level'] >= self.minLevel: + json.dump (kwargs, sys.stdout, cls=StrJsonEncoder) + sys.stdout.write ('\n') + sys.stdout.flush () + return kwargs + +class DatetimeConsumer (Consumer): + def __call__ (self, **kwargs): + kwargs['date'] = datetime.utcnow ().replace (tzinfo=utc) + return kwargs + +class WarcHandlerConsumer (Consumer): + def __init__ (self, warc, minLevel=Level.DEBUG): + self.warc = warc + self.minLevel = minLevel + + def __call__ (self, **kwargs): + if kwargs['level'] >= self.minLevel: + self.warc._writeLog (json.dumps (kwargs, cls=StrJsonEncoder)) + return kwargs + diff --git a/crocoite/task.py b/crocoite/task.py deleted file mode 100644 index e93cfde..0000000 --- a/crocoite/task.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2017–2018 crocoite contributors -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -""" -Celery distributed tasks -""" - -import os, logging - -from urllib.parse import urlsplit -from datetime import datetime -from operator import attrgetter -from itertools import chain - -def _monkeyPatchSyncTasks (): - """ Result sets don’t support the argument disable_sync_subtasks argument """ - import celery.result - celery.result.assert_will_not_block = lambda: None - -_monkeyPatchSyncTasks () -from celery import Celery -from celery.utils.log import get_task_logger - -from .browser import ChromeService -from .controller import SinglePageController, ControllerSettings, RecursiveController, defaultSettings, DepthLimit, PrefixLimit -from . import behavior -from .cli import parseRecursive - -app = Celery ('crocoite.distributed') -app.config_from_object('celeryconfig') -app.conf.task_routes = { - 'crocoite.task.archive': {'queue': 'crocoite.archive'}, - 'crocoite.task.controller': {'queue': 'crocoite.controller'}, - # <method>.chunks is actually a starmap job - 'celery.starmap': {'queue': 'crocoite.archive'}, - } -app.conf.task_default_queue = 'crocoite.default' -# disable prefetching, since our tasks usually run for a _very_ long time -app.conf.worker_prefetch_multiplier = 1 -logger = get_task_logger('crocoite.distributed.archive') - -@app.task(bind=True, track_started=True) -def archive (self, url, settings, enabledBehaviorNames): - """ - Archive a single URL - - Supports these config keys (celeryconfig): - - warc_filename = '{domain}-{date}-{id}.warc.gz' - temp_dir = '/tmp/' - finished_dir = '/tmp/finished' - """ - - parsedUrl = urlsplit (url) - outFile = app.conf.warc_filename.format ( - id=self.request.root_id, - domain=parsedUrl.hostname.replace ('/', '-'), - date=datetime.utcnow ().isoformat (), - ) - outPath = os.path.join (app.conf.temp_dir, outFile) - fd = open (outPath, 'wb') - - enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) - settings = ControllerSettings (**settings) - controller = SinglePageController (url, fd, behavior=enabledBehavior, settings=settings) - ret = controller.run () - - os.makedirs (app.conf.finished_dir, exist_ok=True) - outPath = os.path.join (app.conf.finished_dir, outFile) - os.rename (fd.name, outPath) - - return ret - -class DistributedRecursiveController (RecursiveController): - """ Distributed, recursive controller using celery """ - - def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \ - logger=logging.getLogger(__name__), settings=defaultSettings, - recursionPolicy=DepthLimit (0), concurrency=1): - super ().__init__ (url, None, service, behavior, logger, settings, recursionPolicy) - self.concurrency = concurrency - - def fetch (self, urls): - def chunksIter (urls): - for u in urls: - yield (u, self.settings.toDict (), list (map (attrgetter ('name'), self.behavior))) - itemsPerTask = len (urls)//self.concurrency - if itemsPerTask <= 0: - itemsPerTask = len (urls) - return chain.from_iterable (archive.chunks (chunksIter (urls), itemsPerTask).apply_async ().get ()) - -@app.task(bind=True, 
track_started=True) -def controller (self, url, settings, enabledBehaviorNames, recursive, concurrency): - """ Recursive controller """ - - recursionPolicy = parseRecursive (recursive, url) - enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) - settings = ControllerSettings (**settings) - controller = DistributedRecursiveController (url, None, behavior=enabledBehavior, - settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency) - return controller.run () - diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py new file mode 100644 index 0000000..1efea08 --- /dev/null +++ b/crocoite/test_behavior.py @@ -0,0 +1,266 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import asyncio, os, yaml, re, math, struct +from functools import partial +from operator import attrgetter + +import pytest +from yarl import URL +from aiohttp import web + +import pkg_resources +from .logger import Logger +from .devtools import Process +from .behavior import Scroll, Behavior, ExtractLinks, ExtractLinksEvent, Crash, \ + Screenshot, ScreenshotEvent, DomSnapshot, DomSnapshotEvent, mapOrIgnore +from .controller import SinglePageController, EventHandler, ControllerSettings +from .devtools import Crashed + +with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: + sites = list (yaml.safe_load_all (fd)) +clickParam = [] +for o in sites: + for s in o['selector']: + for u in s.get ('urls', []): + clickParam.append ((u, s['selector'])) + +class ClickTester (Behavior): + """ + Test adapter checking a given selector exists after loading the page + """ + + __slots__ = ('selector', ) + + name = 'testclick' + + def __init__ (self, loader, logger, selector): + super ().__init__ (loader, logger) + self.selector = selector + + async def onfinish (self): + tab = self.loader.tab + results = await tab.DOM.getDocument () + rootNode = results['root']['nodeId'] + results = await tab.DOM.querySelectorAll (nodeId=rootNode, selector=self.selector) + assert results['nodeIds'], self.selector + + # XXX: this is not true for every element we click. 
Github uses <button
+        # type=submit> and <form> without an event listener on the <button>
+#        # verify that an event listener exists
+#        for nid in results['nodeIds']:
+#            obj = (await tab.DOM.resolveNode (nodeId=nid))['object']
+#            assert obj['type'] == 'object'
+#            listeners = (await tab.DOMDebugger.getEventListeners (objectId=obj['objectId']))['listeners']
+#            assert any (map (lambda x: x['type'] == 'click', listeners)), listeners
+
+        return
+        yield # pragma: no cover
+
+@pytest.mark.parametrize("url,selector", clickParam)
+@pytest.mark.asyncio
+@pytest.mark.xfail(reason='depends on network access')
+async def test_click_selectors (url, selector):
+    """
+    Make sure the CSS selector exists on an example url
+    """
+    logger = Logger ()
+    settings = ControllerSettings (idleTimeout=5, timeout=10)
+    # Some selectors are loaded dynamically and require scrolling
+    controller = SinglePageController (url=url, logger=logger,
+            settings=settings,
+            service=Process (),
+            behavior=[Scroll, partial(ClickTester, selector=selector)])
+    await controller.run ()
+
+matchParam = []
+for o in sites:
+    for s in o['selector']:
+        for u in s.get ('urls', []):
+            matchParam.append ((o['match'], URL (u)))
+
+@pytest.mark.parametrize("match,url", matchParam)
+@pytest.mark.asyncio
+async def test_click_match (match, url):
+    """ Test urls must match """
+    # keep this aligned with click.js
+    assert re.match (match, url.host, re.I)
+
+
+class AccumHandler (EventHandler):
+    """ Test adapter that accumulates all incoming items """
+    __slots__ = ('data', )
+
+    def __init__ (self):
+        super().__init__ ()
+        self.data = []
+
+    async def push (self, item):
+        self.data.append (item)
+
+async def simpleServer (url, response):
+    async def f (req):
+        return web.Response (body=response, status=200, content_type='text/html', charset='utf-8')
+
+    app = web.Application ()
+    app.router.add_route ('GET', url.path, f)
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, url.host, url.port)
+    await site.start()
+    return runner
+
+@pytest.mark.asyncio
+async def test_extract_links ():
+    """
+    Make sure ExtractLinks finds the expected links on a sample page
+    """
+
+    url = URL.build (scheme='http', host='localhost', port=8080)
+    runner = await simpleServer (url, """<html><head></head>
+    <body>
+        <div>
+            <a href="/relative">foo</a>
+            <a href="http://example.com/absolute/">foo</a>
+            <a href="https://example.com/absolute/secure">foo</a>
+            <a href="#anchor">foo</a>
+            <a href="http://neue_preise_f%c3%bcr_zahnimplantate_k%c3%b6nnten_sie_%c3%bcberraschen">foo</a>
+
+            <a href="/hidden/visibility" style="visibility: hidden">foo</a>
+            <a href="/hidden/display" style="display: none">foo</a>
+            <div style="display: none">
+            <a href="/hidden/display/insidediv">foo</a>
+            </div>
+            <!--<a href="/hidden/comment">foo</a>-->
+
+            <p><img src="shapes.png" usemap="#shapes">
+            <map name="shapes"><area shape=rect coords="50,50,100,100" href="/map/rect"></map></p>
+        </div>
+    </body></html>""")
+
+    try:
+        handler = AccumHandler ()
+        logger = Logger ()
+        controller = SinglePageController (url=url, logger=logger,
+                service=Process (), behavior=[ExtractLinks], handler=[handler])
+        await controller.run ()
+
+        links = []
+        for d in handler.data:
+            if isinstance (d, ExtractLinksEvent):
+                links.extend (d.links)
+        assert sorted (links) == sorted ([
+                url.with_path ('/relative'),
+                url.with_fragment ('anchor'),
+                URL ('http://neue_preise_f%C3%BCr_zahnimplantate_k%C3%B6nnten_sie_%C3%BCberraschen'),
+                URL ('http://example.com/absolute/'),
+                URL
('https://example.com/absolute/secure'), + url.with_path ('/hidden/visibility'), # XXX: shall we ignore these as well? + url.with_path ('/map/rect'), + ]) + finally: + await runner.cleanup () + +@pytest.mark.asyncio +async def test_crash (): + """ + Crashing through Behavior works? + """ + + url = URL.build (scheme='http', host='localhost', port=8080) + runner = await simpleServer (url, '<html></html>') + + try: + logger = Logger () + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[Crash]) + with pytest.raises (Crashed): + await controller.run () + finally: + await runner.cleanup () + +@pytest.mark.asyncio +async def test_screenshot (): + """ + Make sure screenshots are taken and have the correct dimensions. We can’t + and don’t want to check their content. + """ + # ceil(0) == 0, so starting with 1 + for expectHeight in (1, Screenshot.maxDim, Screenshot.maxDim+1, Screenshot.maxDim*2+Screenshot.maxDim//2): + url = URL.build (scheme='http', host='localhost', port=8080) + runner = await simpleServer (url, f'<html><body style="margin: 0; padding: 0;"><div style="height: {expectHeight}"></div></body></html>') + + try: + handler = AccumHandler () + logger = Logger () + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[Screenshot], handler=[handler]) + await controller.run () + + screenshots = list (filter (lambda x: isinstance (x, ScreenshotEvent), handler.data)) + assert len (screenshots) == math.ceil (expectHeight/Screenshot.maxDim) + totalHeight = 0 + for s in screenshots: + assert s.url == url + # PNG ident is fixed, IHDR is always the first chunk + assert s.data.startswith (b'\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR') + width, height = struct.unpack ('>II', s.data[16:24]) + assert height <= Screenshot.maxDim + totalHeight += height + # screenshot height is at least canvas height (XXX: get hardcoded + # value from devtools.Process) + assert totalHeight == max (expectHeight, 1080) + finally: + await runner.cleanup () + +@pytest.mark.asyncio +async def test_dom_snapshot (): + """ + Behavior plug-in works, <canvas> is replaced by static image, <script> is + stripped. 
Actual conversion from Chrome DOM to HTML is validated by module + .test_html + """ + + url = URL.build (scheme='http', host='localhost', port=8080) + runner = await simpleServer (url, f'<html><body><p>ÄÖÜäöü</p><script>alert("yes");</script><canvas id="canvas" width="1" height="1">Alternate text.</canvas></body></html>') + + try: + handler = AccumHandler () + logger = Logger () + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[DomSnapshot], handler=[handler]) + await controller.run () + + snapshots = list (filter (lambda x: isinstance (x, DomSnapshotEvent), handler.data)) + assert len (snapshots) == 1 + doc = snapshots[0].document + assert doc.startswith ('<HTML><HEAD><meta charset=utf-8></HEAD><BODY><P>ÄÖÜäöü</P><IMG id=canvas width=1 height=1 src="data:image/png;base64,'.encode ('utf-8')) + assert doc.endswith ('></BODY></HTML>'.encode ('utf-8')) + finally: + await runner.cleanup () + +def test_mapOrIgnore (): + def fail (x): + if x < 50: + raise Exception () + return x+1 + + assert list (mapOrIgnore (fail, range (100))) == list (range (51, 101)) + diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py new file mode 100644 index 0000000..7084214 --- /dev/null +++ b/crocoite/test_browser.py @@ -0,0 +1,387 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+
+import asyncio, socket
+from operator import itemgetter
+from http.server import BaseHTTPRequestHandler
+from datetime import datetime
+
+from yarl import URL
+from aiohttp import web
+from multidict import CIMultiDict
+
+from hypothesis import given
+import hypothesis.strategies as st
+from hypothesis.provisional import domains
+import pytest
+
+from .browser import RequestResponsePair, SiteLoader, Request, \
+        UnicodeBody, ReferenceTimestamp, Base64Body, \
+        Response, NavigateError, PageIdle, FrameNavigated
+from .logger import Logger, Consumer
+from .devtools import Crashed, Process
+
+# if you want to know what’s going on:
+#import logging
+#logging.basicConfig(level=logging.DEBUG)
+
+class AssertConsumer (Consumer):
+    def __call__ (self, **kwargs):
+        assert 'uuid' in kwargs
+        assert 'msg' in kwargs
+        assert 'context' in kwargs
+        return kwargs
+
+@pytest.fixture
+def logger ():
+    return Logger (consumer=[AssertConsumer ()])
+
+@pytest.fixture
+async def loader (logger):
+    async with Process () as browser, SiteLoader (browser, logger) as l:
+        yield l
+
+@pytest.mark.asyncio
+async def test_crash (loader):
+    with pytest.raises (Crashed):
+        await loader.tab.Page.crash ()
+
+@pytest.mark.asyncio
+async def test_invalidurl (loader):
+    host = 'nonexistent.example'
+
+    # make sure the url does *not* resolve (some DNS-intercepting ISPs mess
+    # with this)
+    loop = asyncio.get_event_loop ()
+    try:
+        resolved = await loop.getaddrinfo (host, None)
+    except socket.gaierror:
+        url = URL.build (scheme='http', host=host)
+        with pytest.raises (NavigateError):
+            await loader.navigate (url)
+    else:
+        pytest.skip (f'host {host} resolved to {resolved}')
+
+timestamp = st.one_of (
+        st.integers(min_value=0, max_value=2**32-1),
+        st.floats (min_value=0, max_value=2**32-1),
+        )
+
+@given(timestamp, timestamp, timestamp)
+def test_referencetimestamp (relativeA, absoluteA, relativeB):
+    ts = ReferenceTimestamp (relativeA, absoluteA)
+    absoluteA = datetime.utcfromtimestamp (absoluteA)
+    absoluteB = ts (relativeB)
+    assert (absoluteA < absoluteB and relativeA < relativeB) or \
+            (absoluteA >= absoluteB and relativeA >= relativeB)
+    assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6
+
+def urls ():
+    """ Build http/https URL """
+    scheme = st.sampled_from (['http', 'https'])
+    # Path must start with a slash
+    pathSt = st.builds (lambda x: '/' + x, st.text ())
+    args = st.fixed_dictionaries ({
+        'scheme': scheme,
+        'host': domains (),
+        'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)),
+        'path': pathSt,
+        'query_string': st.text (),
+        'fragment': st.text (),
+        })
+    return st.builds (lambda x: URL.build (**x), args)
+
+def urlsStr ():
+    return st.builds (lambda x: str (x), urls ())
+
+asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126))
+
+def chromeHeaders ():
+    # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6
+    token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~')
+    # XXX: the value should be asciiText without leading/trailing spaces
+    return st.dictionaries (token, token)
+
+def fixedDicts (fixed, dynamic):
+    # dict.update () returns None, so merge into a fresh dict instead
+    return st.builds (lambda x, y: {**x, **dict (y)}, st.fixed_dictionaries (fixed), st.lists (dynamic))
+
+def chromeRequestWillBeSent (reqid, url):
+    methodSt = st.sampled_from (['GET', 'POST', 'PUT', 'DELETE'])
+    return st.fixed_dictionaries ({
+        'requestId': reqid,
+        'initiator': st.just ('Test'),
+        'wallTime': timestamp,
+        'timestamp': timestamp,
+        'request': st.fixed_dictionaries ({
+            'url': url,
+            'method': methodSt,
+            'headers': chromeHeaders (),
+            # XXX: postData, hasPostData
+            })
+        })
+
+def chromeResponseReceived (reqid, url):
+    mimeTypeSt = st.one_of (st.none (), st.just ('text/html'))
+    remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1'))
+    protocolSt = st.one_of (st.none (), st.just ('h2'))
+    statusCodeSt = st.integers (min_value=100, max_value=999)
+    typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media',
+            'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource',
+            'WebSocket', 'Manifest', 'SignedExchange', 'Ping',
+            'CSPViolationReport', 'Other'])
+    return st.fixed_dictionaries ({
+        'requestId': reqid,
+        'timestamp': timestamp,
+        'type': typeSt,
+        'response': st.fixed_dictionaries ({
+            'url': url,
+            'requestHeaders': chromeHeaders (), # XXX: make this optional
+            'headers': chromeHeaders (),
+            'status': statusCodeSt,
+            'statusText': asciiText,
+            'mimeType': mimeTypeSt,
+            'remoteIPAddress': remoteIpAddressSt,
+            'protocol': protocolSt,
+            })
+        })
+
+def chromeReqResp ():
+    # XXX: will this generate the same url for all test cases?
+    reqid = st.shared (st.text (), 'reqresp')
+    url = st.shared (urlsStr (), 'reqresp')
+    return st.tuples (chromeRequestWillBeSent (reqid, url),
+            chromeResponseReceived (reqid, url))
+
+def requestResponsePair ():
+    def f (creq, cresp, hasPostData, reqBody, respBody):
+        i = RequestResponsePair ()
+        i.fromRequestWillBeSent (creq)
+        i.request.hasPostData = hasPostData
+        if hasPostData:
+            i.request.body = reqBody
+
+        if cresp is not None:
+            i.fromResponseReceived (cresp)
+            if respBody is not None:
+                i.response.body = respBody
+        return i
+
+    bodySt = st.one_of (
+            st.none (),
+            st.builds (UnicodeBody, st.text ()),
+            st.builds (Base64Body.fromBytes, st.binary ())
+            )
+    return st.builds (lambda reqresp, hasPostData, reqBody, respBody:
+            f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody),
+            chromeReqResp (), st.booleans (), bodySt, bodySt)
+
+@given(chromeReqResp ())
+def test_requestResponsePair (creqresp):
+    creq, cresp = creqresp
+
+    item = RequestResponsePair ()
+
+    assert item.id is None
+    assert item.url is None
+    assert item.request is None
+    assert item.response is None
+
+    item.fromRequestWillBeSent (creq)
+
+    assert item.id == creq['requestId']
+    url = URL (creq['request']['url'])
+    assert item.url == url
+    assert item.request is not None
+    assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime'])
+    assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ())
+    assert item.response is None
+
+    item.fromResponseReceived (cresp)
+
+    # url will not be overwritten
+    assert item.id == creq['requestId'] == cresp['requestId']
+    assert item.url == url
+    assert item.request is not None
+    assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ())
+    assert item.response is not None
+    assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ())
+    assert (item.response.timestamp - item.request.timestamp).total_seconds () - \
+            (cresp['timestamp'] - creq['timestamp']) < 10e-6
+
+@given(chromeReqResp ())
+def test_requestResponsePair_eq (creqresp):
+    creq, cresp = creqresp
+
+    item = RequestResponsePair ()
+    item2 = RequestResponsePair ()
+    assert item == item
+    assert item == item2
+
+    item.fromRequestWillBeSent (creq)
+    assert item != item2
+    item2.fromRequestWillBeSent (creq)
+    assert item == item
+    assert item == item2
+
+    item.fromResponseReceived (cresp)
assert item != item2 + item2.fromResponseReceived (cresp) + assert item == item + assert item == item2 + + # XXX: test for inequality with different parameters + +### Google Chrome integration tests ### + +serverUrl = URL.build (scheme='http', host='localhost', port=8080) +items = [ + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-8'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/latin1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-16'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/ISO-8859-1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/status/200'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]), + body=b'', + mimeType='text/plain'), + ), + # redirects never have a response body + RequestResponsePair ( + url=serverUrl.with_path ('/status/301'), + request=Request (method='GET'), + response=Response (status=301, + headers=CIMultiDict ([('Content-Type', 'text/plain'), + ('Location', str (serverUrl.with_path ('/status/301/redirected')))]), + body=None, + mimeType='text/plain'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/image/png'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]), + body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), + mimeType='image/png'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/script/alert'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('''<html><body><script> +window.addEventListener("beforeunload", function (e) { + e.returnValue = "bye?"; + return e.returnValue; +}); +alert("stopping here"); +if (confirm("are you sure?") || prompt ("42?")) { + window.location = "/nonexistent"; +} +</script></body></html>'''), mimeType='text/html') + ), + ] + +@pytest.mark.asyncio +# would be nice if we could use hypothesis here somehow +@pytest.mark.parametrize("golden", items) +async def test_integration_item (loader, golden): + async def f (req): + body = golden.response.body + contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else '' + charsetOff = contentType.find ('charset=') + if isinstance (body, UnicodeBody) and charsetOff != -1: + encoding = contentType[charsetOff+len ('charset='):] + body = golden.response.body.decode ('utf-8').encode (encoding) + return web.Response (body=body, status=golden.response.status, + 
headers=golden.response.headers) + + app = web.Application () + app.router.add_route (golden.request.method, golden.url.path, f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, serverUrl.host, serverUrl.port) + try: + await site.start() + except Exception as e: + pytest.skip (e) + + haveReqResp = False + haveNavigated = False + try: + await loader.navigate (golden.url) + + it = loader.__aiter__ () + while True: + try: + item = await asyncio.wait_for (it.__anext__ (), timeout=1) + except asyncio.TimeoutError: + break + # XXX: can only check the first req/resp right now (due to redirect) + if isinstance (item, RequestResponsePair) and not haveReqResp: + # we do not know this in advance + item.request.initiator = None + item.request.headers = None + item.remoteIpAddress = None + item.protocol = None + item.resourceType = None + + if item.response: + assert item.response.statusText is not None + item.response.statusText = None + + del item.response.headers['server'] + del item.response.headers['content-length'] + del item.response.headers['date'] + assert item == golden + haveReqResp = True + elif isinstance (item, FrameNavigated): + # XXX: can’t check this, because of the redirect + #assert item.url == golden.url + haveNavigated = True + finally: + assert haveReqResp + assert haveNavigated + await runner.cleanup () + +def test_page_idle (): + for v in (True, False): + idle = PageIdle (v) + assert bool (idle) == v + + diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py new file mode 100644 index 0000000..7216a42 --- /dev/null +++ b/crocoite/test_controller.py @@ -0,0 +1,203 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +import asyncio + +from yarl import URL +from aiohttp import web + +import pytest + +from .logger import Logger +from .controller import ControllerSettings, SinglePageController, SetEntry, \ + IdleStateTracker +from .browser import PageIdle +from .devtools import Process +from .test_browser import loader + +@pytest.mark.asyncio +async def test_controller_timeout (): + """ Make sure the controller terminates, even if the site keeps reloading/fetching stuff """ + + async def f (req): + return web.Response (body="""<html> +<body> +<p>hello</p> +<script> +window.setTimeout (function () { window.location = '/' }, 250); +window.setInterval (function () { fetch('/').then (function (e) { console.log (e) }) }, 150); +</script> +</body> +</html>""", status=200, content_type='text/html', charset='utf-8') + + url = URL.build (scheme='http', host='localhost', port=8080) + app = web.Application () + app.router.add_route ('GET', '/', f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, url.host, url.port) + await site.start() + + loop = asyncio.get_event_loop () + try: + logger = Logger () + settings = ControllerSettings (idleTimeout=1, timeout=5) + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[], settings=settings) + # give the controller a little more time to finish, since there are + # hard-coded asyncio.sleep calls in there right now. + # XXX fix this + before = loop.time () + await asyncio.wait_for (controller.run (), timeout=settings.timeout*2) + after = loop.time () + assert after-before >= settings.timeout, (settings.timeout*2, after-before) + finally: + # give the browser some time to close before interrupting the + # connection by destroying the HTTP server + await asyncio.sleep (1) + await runner.cleanup () + +@pytest.mark.asyncio +async def test_controller_idle_timeout (): + """ Make sure the controller terminates, even if the site keeps reloading/fetching stuff """ + + async def f (req): + return web.Response (body="""<html> +<body> +<p>hello</p> +<script> +window.setInterval (function () { fetch('/').then (function (e) { console.log (e) }) }, 2000); +</script> +</body> +</html>""", status=200, content_type='text/html', charset='utf-8') + + url = URL.build (scheme='http', host='localhost', port=8080) + app = web.Application () + app.router.add_route ('GET', '/', f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, url.host, url.port) + await site.start() + + loop = asyncio.get_event_loop () + try: + logger = Logger () + settings = ControllerSettings (idleTimeout=1, timeout=60) + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[], settings=settings) + before = loop.time () + await asyncio.wait_for (controller.run (), settings.timeout*2) + after = loop.time () + assert settings.idleTimeout <= after-before <= settings.idleTimeout*2+3 + finally: + await runner.cleanup () + +def test_set_entry (): + a = SetEntry (1, a=2, b=3) + assert a == a + assert hash (a) == hash (a) + + b = SetEntry (1, a=2, b=4) + assert a == b + assert hash (a) == hash (b) + + c = SetEntry (2, a=2, b=3) + assert a != c + assert hash (a) != hash (c) + +@pytest.mark.asyncio +async def test_idle_state_tracker (): + # default is idle + loop = asyncio.get_event_loop () + idle = IdleStateTracker (loop) + assert idle._idle + + # idle change + await idle.push (PageIdle (False)) + assert not idle._idle + + # nothing happens for other objects + await idle.push ({}) + assert not 
idle._idle + + # no state change -> wait does not return + with pytest.raises (asyncio.TimeoutError): + await asyncio.wait_for (idle.wait (0.1), timeout=1) + + # wait at least timeout + delta = 0.2 + timeout = 1 + await idle.push (PageIdle (True)) + assert idle._idle + start = loop.time () + await idle.wait (timeout) + end = loop.time () + assert (timeout-delta) < (end-start) < (timeout+delta) + +@pytest.fixture +async def recordingServer (): + """ Simple HTTP server that records raw requests """ + url = URL ('http://localhost:8080') + reqs = [] + async def record (request): + reqs.append (request) + return web.Response(text='ok', content_type='text/plain') + app = web.Application() + app.add_routes([web.get(url.path, record)]) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite (runner, url.host, url.port) + await site.start() + yield url, reqs + await runner.cleanup () + +from .test_devtools import tab, browser +from http.cookies import Morsel, SimpleCookie + +@pytest.mark.asyncio +async def test_set_cookies (tab, recordingServer): + """ Make sure cookies are set properly and only affect the domain they were + set for """ + + logger = Logger () + + url, reqs = recordingServer + + cookies = [] + c = Morsel () + c.set ('foo', 'bar', '') + c['domain'] = 'localhost' + cookies.append (c) + c = Morsel () + c.set ('buz', 'beef', '') + c['domain'] = 'nonexistent.example' + + settings = ControllerSettings (idleTimeout=1, timeout=60, cookies=cookies) + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[], settings=settings) + await asyncio.wait_for (controller.run (), settings.timeout*2) + + assert len (reqs) == 1 + req = reqs[0] + reqCookies = SimpleCookie (req.headers['cookie']) + assert len (reqCookies) == 1 + c = next (iter (reqCookies.values ())) + assert c.key == cookies[0].key + assert c.value == cookies[0].value diff --git a/crocoite/test_devtools.py b/crocoite/test_devtools.py new file mode 100644 index 0000000..bd1a828 --- /dev/null +++ b/crocoite/test_devtools.py @@ -0,0 +1,208 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+
+import asyncio
+import pytest
+
+from aiohttp import web
+import websockets
+
+from .devtools import Browser, Tab, MethodNotFound, Crashed, \
+        InvalidParameter, Process, Passthrough
+
+@pytest.fixture
+async def browser ():
+    async with Process () as url:
+        yield Browser (url)
+
+@pytest.fixture
+async def tab (browser):
+    async with browser as tab:
+        yield tab
+        # make sure there are no transactions left over (i.e. no unawaited requests)
+        assert not tab.transactions
+
+docBody = "<html><body><p>Hello, world</p></body></html>"
+async def hello(request):
+    return web.Response(text=docBody, content_type='text/html')
+
+@pytest.fixture
+async def server ():
+    """ Simple HTTP server for testing notifications """
+    app = web.Application()
+    app.add_routes([web.get('/', hello)])
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, 'localhost', 8080)
+    await site.start()
+    yield app
+    await runner.cleanup ()
+
+@pytest.mark.asyncio
+async def test_tab_create (tab):
+    """ Creating tabs works """
+    assert isinstance (tab, Tab)
+    version = await tab.Browser.getVersion ()
+    assert 'protocolVersion' in version
+    assert tab.pending == 0
+
+@pytest.mark.asyncio
+async def test_tab_close (browser):
+    """ Tabs are closed after using them """
+    async with browser as tab:
+        tid = tab.id
+    # give the browser some time to close the tab
+    await asyncio.sleep (0.5)
+    tabs = [t['id'] async for t in browser]
+    assert tid not in tabs
+
+@pytest.mark.asyncio
+async def test_tab_notify_enable_disable (tab):
+    """ Make sure enabling/disabling notifications works for all known
+    namespaces """
+    for name in ('Debugger', 'DOM', 'Log', 'Network', 'Page', 'Performance',
+            'Profiler', 'Runtime', 'Security'):
+        f = getattr (tab, name)
+        await f.enable ()
+        await f.disable ()
+
+@pytest.mark.asyncio
+async def test_tab_unknown_method (tab):
+    with pytest.raises (MethodNotFound):
+        await tab.Nonexistent.foobar ()
+
+@pytest.mark.asyncio
+async def test_tab_invalid_argument (tab):
+    # format should be a string
+    with pytest.raises (InvalidParameter):
+        await tab.Page.captureScreenshot (format=123)
+
+    with pytest.raises (InvalidParameter):
+        await tab.Page.captureScreenshot (format=[123])
+
+    with pytest.raises (InvalidParameter):
+        await tab.Page.captureScreenshot (format={123: '456'})
+
+@pytest.mark.asyncio
+async def test_tab_crash (tab):
+    with pytest.raises (Crashed):
+        await tab.Page.crash ()
+
+    # calling anything else now should fail as well
+    with pytest.raises (Crashed):
+        await tab.Browser.getVersion ()
+
+@pytest.mark.asyncio
+async def test_load (tab, server):
+    await tab.Network.enable ()
+    await tab.Page.navigate (url='http://localhost:8080')
+
+    haveRequest = False
+    haveResponse = False
+    haveData = False
+    haveFinished = False
+    haveBody = False
+    req = None
+    resp = None
+    while not haveBody:
+        method, data = await tab.get ()
+
+        # these two can arrive in either order
+        if method in (tab.Network.requestWillBeSent, tab.Network.requestWillBeSentExtraInfo) and not haveResponse:
+            if req is None:
+                req = data
+            assert data['requestId'] == req['requestId']
+            haveRequest = True
+        elif method in (tab.Network.responseReceived, tab.Network.responseReceivedExtraInfo) and haveRequest:
+            if resp is None:
+                resp = data
+            assert data['requestId'] == resp['requestId']
+            haveResponse = True
+        elif haveRequest and haveResponse and method == tab.Network.dataReceived:
+            assert data['dataLength'] == len (docBody)
+            assert data['requestId'] == req['requestId']
+            haveData = True
+        elif
haveData: + assert method == tab.Network.loadingFinished + assert data['requestId'] == req['requestId'] + haveBody = True + elif haveFinished: + body = await tab.Network.getResponseBody (requestId=req['requestId']) + assert body['body'] == docBody + haveBody = True + else: + assert False, (method, req) + + await tab.Network.disable () + assert tab.pending == 0 + +@pytest.mark.asyncio +async def test_recv_failure(browser): + """ Inject failure into receiver process and crash it """ + async with browser as tab: + await tab.ws.close () + with pytest.raises (Crashed): + await tab.Browser.getVersion () + + async with browser as tab: + await tab.ws.close () + with pytest.raises (Crashed): + await tab.get () + + async with browser as tab: + handle = asyncio.ensure_future (tab.get ()) + await tab.ws.close () + with pytest.raises (Crashed): + await handle + +@pytest.mark.asyncio +async def test_tab_function (tab): + assert tab.Network.enable.name == 'Network.enable' + assert tab.Network.disable == tab.Network.disable + assert tab.Network.enable != tab.Network.disable + assert tab.Network != tab.Network.enable + assert callable (tab.Network.enable) + assert not callable (tab.Network.enable.name) + assert 'Network.enable' in repr (tab.Network.enable) + +@pytest.mark.asyncio +async def test_tab_function_hash (tab): + d = {tab.Network.enable: 1, tab.Network.disable: 2, tab.Page: 3, + tab.Page.enable: 4} + assert len (d) == 4 + +@pytest.mark.asyncio +async def test_ws_ping(tab): + """ + Chrome does not like websocket pings and closes the connection if it + receives one. Not sure why. + """ + with pytest.raises (Crashed): + await tab.ws.ping () + await tab.Browser.getVersion () + +@pytest.mark.asyncio +async def test_passthrough (): + """ Null service returns the url as is """ + + url = 'http://localhost:12345' + async with Passthrough (url) as u: + assert str (u) == url + diff --git a/crocoite/test_html.py b/crocoite/test_html.py new file mode 100644 index 0000000..c17903b --- /dev/null +++ b/crocoite/test_html.py @@ -0,0 +1,96 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +import asyncio +import pytest, html5lib +from html5lib.serializer import HTMLSerializer +from html5lib.treewalkers import getTreeWalker +from aiohttp import web + +from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker +from .test_devtools import tab, browser + +def test_strip_tag (): + d = html5lib.parse ('<a>barbaz<b>foobar</b>.</a><b>foobar</b>.<b attr=1><c></c>') + stream = StripTagFilter (getTreeWalker ('etree')(d), ['b', 'c']) + serializer = HTMLSerializer () + assert serializer.render (stream) == '<a>barbaz.</a>.' + +def test_strip_attribute (): + d = html5lib.parse ('<a b=1 c="yes" d></a><br b=2 c="no" d keep=1>') + stream = StripAttributeFilter (getTreeWalker ('etree')(d), ['b', 'c', 'd']) + serializer = HTMLSerializer () + assert serializer.render (stream) == '<a></a><br keep=1>' + +@pytest.mark.asyncio +async def test_treewalker (tab): + frames = await tab.Page.getFrameTree () + + framehtml = '<HTML><HEAD></HEAD><BODY></BODY></HTML>' + html = '<HTML><HEAD><META charset=utf-8></HEAD><BODY><H1>Hello</H1><!-- comment --><IFRAME></IFRAME></BODY></HTML>' + rootframe = frames['frameTree']['frame']['id'] + await tab.Page.setDocumentContent (frameId=rootframe, html=html) + + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + docs = list (ChromeTreeWalker (dom['root']).split ()) + assert len(docs) == 2 + for i, doc in enumerate (docs): + walker = ChromeTreeWalker (doc) + serializer = HTMLSerializer () + result = serializer.render (iter(walker)) + if i == 0: + assert result == html + elif i == 1: + assert result == framehtml + +cdataDoc = '<test><![CDATA[Hello world]]></test>' +xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>' +async def hello(request): + return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml') + +@pytest.fixture +async def server (): + """ Simple HTTP server for testing notifications """ + app = web.Application() + app.add_routes([web.get('/test.xml', hello)]) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8080) + await site.start() + yield app + await runner.cleanup () + +@pytest.mark.asyncio +async def test_treewalker_cdata (tab, server): + ret = await tab.Page.navigate (url='http://localhost:8080/test.xml') + # wait until loaded XXX: replace with idle check + await asyncio.sleep (0.5) + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + docs = list (ChromeTreeWalker (dom['root']).split ()) + assert len(docs) == 1 + for i, doc in enumerate (docs): + walker = ChromeTreeWalker (doc) + serializer = HTMLSerializer () + result = serializer.render (iter(walker)) + # chrome will display a pretty-printed viewer *plus* the original + # source (stripped of its xml header) + assert cdataDoc in result + + diff --git a/crocoite/test_irc.py b/crocoite/test_irc.py new file mode 100644 index 0000000..9344de4 --- /dev/null +++ b/crocoite/test_irc.py @@ -0,0 +1,70 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the 
Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import pytest +from .irc import ArgparseBot, RefCountEvent, User, NickMode + +def test_mode_parse (): + assert ArgparseBot.parseMode ('+a') == [('+', 'a')] + assert ArgparseBot.parseMode ('+ab') == [('+', 'a'), ('+', 'b')] + assert ArgparseBot.parseMode ('+a+b') == [('+', 'a'), ('+', 'b')] + assert ArgparseBot.parseMode ('-a') == [('-', 'a')] + assert ArgparseBot.parseMode ('-ab') == [('-', 'a'), ('-', 'b')] + assert ArgparseBot.parseMode ('-a-b') == [('-', 'a'), ('-', 'b')] + assert ArgparseBot.parseMode ('+a-b') == [('+', 'a'), ('-', 'b')] + assert ArgparseBot.parseMode ('-a+b') == [('-', 'a'), ('+', 'b')] + assert ArgparseBot.parseMode ('-ab+cd') == [('-', 'a'), ('-', 'b'), ('+', 'c'), ('+', 'd')] + +@pytest.fixture +def event (): + return RefCountEvent () + +def test_refcountevent_arm (event): + event.arm () + assert event.event.is_set () + +def test_refcountevent_ctxmgr (event): + with event: + assert event.count == 1 + with event: + assert event.count == 2 + +def test_refcountevent_arm_with (event): + with event: + event.arm () + assert not event.event.is_set () + assert event.event.is_set () + +def test_nick_mode (): + a = User.fromName ('a') + a2 = User.fromName ('a') + a3 = User.fromName ('+a') + b = User.fromName ('+b') + c = User.fromName ('@c') + + # equality is based on name only, not mode + assert a == a2 + assert a == a3 + assert a != b + + assert a.hasPriv (None) and not a.hasPriv (NickMode.voice) and not a.hasPriv (NickMode.operator) + assert b.hasPriv (None) and b.hasPriv (NickMode.voice) and not b.hasPriv (NickMode.operator) + assert c.hasPriv (None) and c.hasPriv (NickMode.voice) and c.hasPriv (NickMode.operator) + diff --git a/crocoite/test_logger.py b/crocoite/test_logger.py new file mode 100644 index 0000000..26e420a --- /dev/null +++ b/crocoite/test_logger.py @@ -0,0 +1,91 @@ +import pytest +from .logger import Logger, Consumer, NullConsumer, Level, DatetimeConsumer + +@pytest.fixture +def logger (): + return Logger (consumer=[NullConsumer (), DatetimeConsumer ()]) + +class QueueConsumer (Consumer): + def __init__ (self): + self.data = [] + + def __call__ (self, **kwargs): + self.data.append (kwargs) + return kwargs + +def test_bind (logger): + # simple bind + logger = logger.bind (foo='bar') + ret = logger.debug () + assert ret['foo'] == 'bar' + + # additional + ret = logger.debug (bar='baz') + assert ret['foo'] == 'bar' + assert ret['bar'] == 'baz' + + # override + ret = logger.debug (foo='baz') + assert ret['foo'] == 'baz' + + # unbind + logger = logger.unbind (foo=None) + ret = logger.debug () + assert 'foo' not in ret + +def test_consumer (logger): + c = QueueConsumer () + logger.connect (c) + ret = logger.debug (foo='bar') + assert len (c.data) == 1 + assert c.data[0] == ret + assert ret['foo'] == 'bar' + c.data = [] + + # inheritance + logger = logger.bind (inherit=1) + ret = logger.debug (foo='bar') + assert len (c.data) == 1 + assert c.data[0] == ret + assert ret['foo'] == 'bar' + assert ret['inherit'] == 1 + c.data = [] + + # removal + logger.disconnect (c) + ret = 
logger.debug (foo='bar')
+    assert len (c.data) == 0
+    assert ret['foo'] == 'bar'
+    assert ret['inherit'] == 1
+
+def test_multiarg (logger):
+    # single argument
+    ret = logger.debug('maybe', foo='bar')
+    assert ret['msg'] == 'maybe'
+    assert ret['foo'] == 'bar'
+
+    # multi arguments
+    ret = logger.debug('may', 'be', foo='bar')
+    assert ret['msg'] == ('may', 'be')
+    assert ret['foo'] == 'bar'
+
+def test_call (logger):
+    for level in ('debug', Level.DEBUG):
+        ret = logger(level, 'arg1', 'arg2', foo='bar')
+        assert ret['level'] == Level.DEBUG
+        assert ret['msg'] == ('arg1', 'arg2')
+        assert ret['foo'] == 'bar'
+
+def test_datetime (logger):
+    ret = logger.debug()
+    assert 'date' in ret
+
+def test_independence ():
+    """ Make sure two instances are completely independent """
+    l1 = Logger ()
+    c = QueueConsumer ()
+    l1.connect (c)
+    l2 = Logger ()
+    l2.info (nothing='nothing')
+    assert not c.data
+
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py
new file mode 100644
index 0000000..416b954
--- /dev/null
+++ b/crocoite/test_tools.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2018 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+from tempfile import NamedTemporaryFile
+from operator import itemgetter
+from io import BytesIO
+import pytest
+from warcio.archiveiterator import ArchiveIterator
+from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
+from pkg_resources import parse_version
+
+from .tools import mergeWarc, Errata, FixableErrata
+
+@pytest.fixture
+def writer():
+    return WARCWriter (NamedTemporaryFile(), gzip=True)
+
+def recordsEqual(golden, underTest):
+    for a, b in zip (golden, underTest):
+        # record ids are not predictable, so we cannot compare them. Ditto for
+        # dates. Content-* seems to be added when writing to file.
+        for x in {'WARC-Record-Id', 'WARC-Block-Digest', 'WARC-Date',
+                'Content-Length', 'Content-Type'}:
+            a.rec_headers.remove_header(x)
+            b.rec_headers.remove_header(x)
+        aheader = sorted(a.rec_headers.headers, key=itemgetter(0))
+        bheader = sorted(b.rec_headers.headers, key=itemgetter(0))
+        assert aheader == bheader
+        assert a.http_headers == b.http_headers
+
+def makeGolden(writer, records):
+    # additional warcinfo is written. Content does not matter.
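+    # mergeWarc prepends its own warcinfo record describing the merge;
+    # recordsEqual only compares (sorted) record headers and HTTP headers,
+    # never payloads, so an empty payload is good enough here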
+ record = writer.create_warc_record ( + '', + 'warcinfo', + payload=b'', + warc_headers_dict={'Content-Type': 'application/json; charset=utf-8'}) + records.insert (0, record) + return records + +def test_unmodified(writer): + """ + Single request/response pair, no revisits + """ + + records = [] + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def test_different_payload(writer): + """ + Duplicate URL, but different payload + """ + + records = [] + for i in range (2): + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', + payload=BytesIO(f'data{i}'.encode ('utf8')), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def makeRevisit(writer, ref, dup): + """ Make revisit record for reference """ + dupHeaders = dup.rec_headers + refHeaders = ref.rec_headers + record = writer.create_revisit_record (dupHeaders.get_header('WARC-Target-URI'), + digest=refHeaders.get_header('WARC-Payload-Digest'), + refers_to_uri=refHeaders.get_header('WARC-Target-URI'), + refers_to_date=refHeaders.get_header('WARC-Date'), + http_headers=dup.http_headers) + record.rec_headers.add_header ('WARC-Refers-To', refHeaders.get_header('WARC-Record-ID')) + record.rec_headers.add_header ('WARC-Truncated', 'length') + return record + +def test_resp_revisit_same_url(writer): + """ + Duplicate record for the same URL, creates a revisit + """ + + records = [] + for i in range (2): + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + dup = records.pop () + ref = records[1] + records.append (makeRevisit (writer, ref, dup)) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def 
test_resp_revisit_other_url(writer): + """ + Duplicate record for different URL, creates a revisit + """ + + records = [] + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/one', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/one', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + dup = records.pop () + ref = records[1] + records.append (makeRevisit (writer, ref, dup)) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def test_errata_contains(): + """ Test version matching """ + e = Errata('some-uuid', 'description', ['a<1.0']) + assert {'a': parse_version('0.1')} in e + assert {'a': parse_version('1.0')} not in e + assert {'b': parse_version('1.0')} not in e + + e = Errata('some-uuid', 'description', ['a<1.0,>0.1']) + assert {'a': parse_version('0.1')} not in e + assert {'a': parse_version('0.2')} in e + assert {'a': parse_version('1.0')} not in e + + # a AND b + e = Errata('some-uuid', 'description', ['a<1.0', 'b>1.0']) + assert {'a': parse_version('0.1')} not in e + assert {'b': parse_version('1.1')} not in e + assert {'a': parse_version('0.1'), 'b': parse_version('1.1')} in e + +def test_errata_fixable (): + e = Errata('some-uuid', 'description', ['a<1.0', 'b>1.0']) + assert not e.fixable + + e = FixableErrata('some-uuid', 'description', ['a<1.0', 'b>1.0']) + assert e.fixable + diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py new file mode 100644 index 0000000..3ec310c --- /dev/null +++ b/crocoite/test_warc.py @@ -0,0 +1,225 @@ +# Copyright (c) 2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +from tempfile import NamedTemporaryFile +import json, urllib +from operator import itemgetter + +from warcio.archiveiterator import ArchiveIterator +from yarl import URL +from multidict import CIMultiDict +from hypothesis import given, reproduce_failure +import hypothesis.strategies as st +import pytest + +from .warc import WarcHandler +from .logger import Logger, WarcHandlerConsumer +from .controller import ControllerStart +from .behavior import Script, ScreenshotEvent, DomSnapshotEvent +from .browser import RequestResponsePair, Base64Body, UnicodeBody +from .test_browser import requestResponsePair, urls + +def test_log (): + logger = Logger () + + with NamedTemporaryFile() as fd: + with WarcHandler (fd, logger) as handler: + warclogger = WarcHandlerConsumer (handler) + logger.connect (warclogger) + golden = [] + + assert handler.log.tell () == 0 + golden.append (logger.info (foo=1, bar='baz', encoding='äöü⇔ΓΨ')) + assert handler.log.tell () != 0 + + handler.maxLogSize = 0 + golden.append (logger.info (bar=1, baz='baz')) + # should flush the log + assert handler.log.tell () == 0 + + fd.seek (0) + for it in ArchiveIterator (fd): + headers = it.rec_headers + assert headers['warc-type'] == 'metadata' + assert 'warc-target-uri' not in headers + assert headers['x-crocoite-type'] == 'log' + assert headers['content-type'] == f'application/json; charset={handler.logEncoding}' + + while True: + l = it.raw_stream.readline () + if not l: + break + data = json.loads (l.strip ()) + assert data == golden.pop (0) + +def jsonObject (): + """ JSON-encodable objects """ + return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ())) + +def viewport (): + return st.builds (lambda x, y: f'{x}x{y}', st.integers (), st.integers ()) + +def event (): + return st.one_of ( + st.builds (ControllerStart, jsonObject ()), + st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())), + st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()), + st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()), + requestResponsePair (), + ) + +@pytest.mark.asyncio +@given (st.lists (event ())) +async def test_push (golden): + def checkWarcinfoId (headers): + if lastWarcinfoRecordid is not None: + assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid + + lastWarcinfoRecordid = None + + # null logger + logger = Logger () + with open('/tmp/test.warc.gz', 'w+b') as fd: + with WarcHandler (fd, logger) as handler: + for g in golden: + await handler.push (g) + + fd.seek (0) + it = iter (ArchiveIterator (fd)) + for g in golden: + if isinstance (g, ControllerStart): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'warcinfo' + assert 'warc-target-uri' not in headers + assert 'x-crocoite-type' not in headers + + data = json.load (rec.raw_stream) + assert data == g.payload + + lastWarcinfoRecordid = headers['warc-record-id'] + assert lastWarcinfoRecordid + elif isinstance (g, Script): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'resource' + assert headers['content-type'] == 'application/javascript; charset=utf-8' + assert headers['x-crocoite-type'] == 'script' + checkWarcinfoId (headers) + if g.path: + assert URL 
(headers['warc-target-uri']) == URL ('file://' + g.abspath) + else: + assert 'warc-target-uri' not in headers + + data = rec.raw_stream.read ().decode ('utf-8') + assert data == g.data + elif isinstance (g, ScreenshotEvent): + # XXX: check refers-to header + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'conversion' + assert headers['x-crocoite-type'] == 'screenshot' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url) + assert headers['warc-refers-to'] is None + assert int (headers['X-Crocoite-Screenshot-Y-Offset']) == g.yoff + + assert rec.raw_stream.read () == g.data + elif isinstance (g, DomSnapshotEvent): + rec = next (it) + + headers = rec.rec_headers + assert headers['warc-type'] == 'conversion' + assert headers['x-crocoite-type'] == 'dom-snapshot' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['warc-refers-to'] is None + + assert rec.raw_stream.read () == g.document + elif isinstance (g, RequestResponsePair): + rec = next (it) + + # request + headers = rec.rec_headers + assert headers['warc-type'] == 'request' + assert 'x-crocoite-type' not in headers + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + + assert CIMultiDict (rec.http_headers.headers) == g.request.headers + if g.request.hasPostData: + if g.request.body is not None: + assert rec.raw_stream.read () == g.request.body + else: + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () + else: + assert not rec.raw_stream.read () + + # response + if g.response: + rec = next (it) + headers = rec.rec_headers + httpheaders = rec.http_headers + assert headers['warc-type'] == 'response' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + assert 'x-crocoite-type' not in headers + + # these are checked separately + filteredHeaders = CIMultiDict (httpheaders.headers) + for b in {'content-type', 'content-length'}: + if b in g.response.headers: + g.response.headers.popall (b) + if b in filteredHeaders: + filteredHeaders.popall (b) + assert filteredHeaders == g.response.headers + + expectedContentType = g.response.mimeType + if expectedContentType is not None: + assert httpheaders['content-type'].startswith (expectedContentType) + + if g.response.body is not None: + assert rec.raw_stream.read () == g.response.body + assert httpheaders['content-length'] == str (len (g.response.body)) + # body is never truncated if it exists + assert headers['warc-truncated'] is None + + # unencoded strings are converted to utf8 + if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None: + assert httpheaders['content-type'].endswith ('; charset=utf-8') + else: + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () + # content-length header should be kept intact + else: + assert False, f"invalid golden type {type(g)}" # pragma: no cover + + # no further records + with pytest.raises (StopIteration): + next (it) + diff --git a/crocoite/tools.py b/crocoite/tools.py index bc92f8f..a2ddaa3 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,76 +22,296 @@ Misc tools """ -import shutil, sys, re, os, logging, argparse +import shutil, sys, os, logging, argparse, json +from io import BytesIO + from warcio.archiveiterator 
import ArchiveIterator
 from warcio.warcwriter import WARCWriter
+from yarl import URL
 
-def mergeWarc ():
-    """
-    Merge multiple WARC files into a single file, writing revisit records for
-    items which occur multiple times
-    """
-
-    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
-    parser.add_argument('--verbose', '-v', action='store_true')
-    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
+from pkg_resources import parse_version, parse_requirements
 
-    args = parser.parse_args()
-    loglevel = logging.DEBUG if args.verbose else logging.INFO
-    logging.basicConfig (level=loglevel)
+from .util import getSoftwareInfo, StrJsonEncoder
+from .warc import jsonMime, makeContentType
 
+def mergeWarc (files, output):
+    # stats
     unique = 0
     revisit = 0
+    uniqueLength = 0
+    revisitLength = 0
 
     payloadMap = {}
-    writer = WARCWriter (args.output, gzip=True)
-    for l in sys.stdin:
-        l = l.strip ()
+    writer = WARCWriter (output, gzip=True)
+
+    # Add an additional warcinfo record, describing the transformations. This
+    # is not ideal, since
+    #     “A ‘warcinfo’ record describes the records that
+    #     follow it […] until next ‘warcinfo’”
+    # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+    # A warcinfo record is expected at the beginning of every file. But it
+    # might have been written by different software, so we don’t want to
+    # strip/replace that information, but supplement it.
+    warcinfo = {
+        'software': getSoftwareInfo (),
+        'tool': 'crocoite-merge', # not the name of the cli tool
+        'parameters': {'inputs': files},
+        }
+    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+    record = writer.create_warc_record ('', 'warcinfo',
+            payload=payload,
+            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+    writer.write_record (record)
+
+    for l in files:
         with open (l, 'rb') as fd:
             for record in ArchiveIterator (fd):
                 if record.rec_type in {'resource', 'response'}:
                     headers = record.rec_headers
                     rid = headers.get_header('WARC-Record-ID')
                     csum = headers.get_header('WARC-Payload-Digest')
+                    length = int (headers.get_header ('Content-Length'))
                     dup = payloadMap.get (csum, None)
                     if dup is None:
                         payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
                                 'id': rid, 'date': headers.get_header('WARC-Date')}
                         unique += 1
+                        uniqueLength += length
                     else:
-                        logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
-                        record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date'])
+                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
+                        # Payload may be identical, but HTTP headers are
+                        # (probably) not. Include them.
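+                        # A WARC ‘revisit’ record keeps the HTTP headers, but
+                        # replaces the payload with a reference to the first
+                        # occurrence (WARC-Refers-To plus the payload digest),
+                        # which is what makes the merged file smaller.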
+ record = writer.create_revisit_record ( + headers.get_header('WARC-Target-URI'), digest=csum, + refers_to_uri=dup['uri'], refers_to_date=dup['date'], + http_headers=record.http_headers) record.rec_headers.add_header ('WARC-Truncated', 'length') record.rec_headers.add_header ('WARC-Refers-To', dup['id']) revisit += 1 + revisitLength += length else: unique += 1 writer.write_record (record) - logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit)) + json.dump (dict ( + unique=dict (records=unique, bytes=uniqueLength), + revisit=dict (records=revisit, bytes=revisitLength), + ratio=dict ( + records=unique/(unique+revisit), + bytes=uniqueLength/(uniqueLength+revisitLength) + ), + ), + sys.stdout, + cls=StrJsonEncoder) + sys.stdout.write ('\n') + +def mergeWarcCli(): + parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') + parser.add_argument('--verbose', '-v', action='store_true') + parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') + + args = parser.parse_args() + loglevel = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig (level=loglevel) + + mergeWarc([l.strip() for l in sys.stdin], args.output) def extractScreenshot (): """ Extract page screenshots from a WARC generated by crocoite into files """ - parser = argparse.ArgumentParser(description='Extract screenshots.') - parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files') - parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') + parser = argparse.ArgumentParser(description='Extract screenshots from ' + 'WARC, write JSON info to stdout.') + parser.add_argument('-f', '--force', action='store_true', + help='Overwrite existing files') + parser.add_argument('-1', '--one', action='store_true', + help='Only extract the first screenshot into a file named prefix') + parser.add_argument('input', type=argparse.FileType ('rb'), + help='Input WARC') parser.add_argument('prefix', help='Output file prefix') args = parser.parse_args() - screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I) + i = 0 with args.input: - for record in ArchiveIterator(args.input): - uri = record.rec_headers.get_header('WARC-Target-URI') - if record.rec_type == 'resource': - m = screenshotRe.match (uri) - xoff, yoff = m.groups () - if m: - outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff) - if args.force or not os.path.exists (outpath): - with open (outpath, 'wb') as out: - shutil.copyfileobj (record.raw_stream, out) - else: - print ('not overwriting {}'.format (outpath)) + for record in ArchiveIterator (args.input): + headers = record.rec_headers + if record.rec_type != 'conversion' or \ + headers['Content-Type'] != 'image/png' or \ + 'X-Crocoite-Screenshot-Y-Offset' not in headers: + continue + + url = URL (headers.get_header ('WARC-Target-URI')) + yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) + outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix + if args.force or not os.path.exists (outpath): + json.dump ({'file': outpath, 'url': url, 'yoff': yoff}, + sys.stdout, cls=StrJsonEncoder) + sys.stdout.write ('\n') + with open (outpath, 'wb') as out: + shutil.copyfileobj (record.raw_stream, out) + i += 1 + else: + print (f'not overwriting {outpath}', file=sys.stderr) + + if args.one: + break + +class Errata: + __slots__ = ('uuid', 'description', 'url', 'affects') + + def __init__ (self, uuid, description, affects, url=None): + self.uuid = 
uuid + self.description = description + self.url = url + # slightly abusing setuptool’s version parsing/matching here + self.affects = list (parse_requirements(affects)) + + def __contains__ (self, pkg): + """ + Return True if the versions in pkg are affected by this errata + + pkg must be a mapping from project_name to version + """ + matchedAll = [] + for a in self.affects: + haveVersion = pkg.get (a.project_name, None) + matchedAll.append (haveVersion is not None and haveVersion in a) + return all (matchedAll) + + def __repr__ (self): + return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})' + + @property + def fixable (self): + return getattr (self, 'applyFix', None) is not None + + def toDict (self): + return {'uuid': self.uuid, + 'description': self.description, + 'url': self.url, + 'affects': list (map (str, self.affects)), + 'fixable': self.fixable} + +class FixableErrata(Errata): + __slots__ = ('stats') + + def __init__ (self, uuid, description, affects, url=None): + super().__init__ (uuid, description, affects, url) + # statistics for fixable erratas + self.stats = dict (records=dict (fixed=0, processed=0)) + + def applyFix (self, record): + raise NotImplementedError () # pragma: no cover + +class ContentTypeErrata (FixableErrata): + def __init__ (self): + super().__init__ ( + uuid='552c13dc-56e5-4539-9ad8-184ccae60930', + description='Content-Type header uses wrong argument name encoding instead of charset.', + url='https://github.com/PromyLOPh/crocoite/issues/19', + affects=['crocoite==1.0.0']) + + def applyFix (self, record): + # XXX: this is ugly. warcio’s write_record replaces any Content-Type + # header we’re setting with this one. But printing rec_headers shows + # the header, not .content_type. + contentType = record.content_type + if '; encoding=' in contentType: + contentType = contentType.replace ('; encoding=', '; charset=') + record.content_type = contentType + self.stats['records']['fixed'] += 1 + + self.stats['records']['processed'] += 1 + return record + +bugs = [ + Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572', + description='Generated by version < 1.0. 
No erratas are supported for this version.',
+            affects=['crocoite<1.0'],
+            ),
+    ContentTypeErrata (),
+    ]
+
+def makeReport (fd):
+    alreadyFixed = set ()
+
+    for record in ArchiveIterator (fd):
+        if record.rec_type == 'warcinfo':
+            try:
+                data = json.load (record.raw_stream)
+                # errata records precede everything else and indicate which
+                # ones were fixed already
+                if data['tool'] == 'crocoite-errata':
+                    alreadyFixed.update (data['parameters']['errata'])
+                else:
+                    haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+                    yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
+            except json.decoder.JSONDecodeError:
+                pass
+
+def errataCheck (args):
+    hasErrata = False
+    for item in makeReport (args.input):
+        json.dump (item.toDict (), sys.stdout)
+        sys.stdout.write ('\n')
+        sys.stdout.flush ()
+        hasErrata = True
+    return int (hasErrata)
+
+def errataFix (args):
+    errata = args.errata
+
+    with args.input as infd, args.output as outfd:
+        writer = WARCWriter (outfd, gzip=True)
+
+        warcinfo = {
+            'software': getSoftwareInfo (),
+            'tool': 'crocoite-errata', # not the name of the cli tool
+            'parameters': {'errata': [errata.uuid]},
+            }
+        payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+        record = writer.create_warc_record ('', 'warcinfo',
+                payload=payload,
+                warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+        writer.write_record (record)
+
+        for record in ArchiveIterator (infd):
+            fixedRecord = errata.applyFix (record)
+            writer.write_record (fixedRecord)
+        json.dump (errata.stats, sys.stdout)
+        sys.stdout.write ('\n')
+        sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+    try:
+        e = next (filter (lambda x: x.uuid == uuid, bugs))
+    except StopIteration:
+        raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+    if not isinstance (e, FixableErrata):
+        raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+    return e
+
+def errata ():
+    parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+    parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+    # XXX: required argument does not work here?!
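+    # Note: argparse subparsers are optional by default. A common workaround
+    # is marking them required after creation, e.g.
+    #     subparsers = parser.add_subparsers (dest='command')
+    #     subparsers.required = True
+    # (Python 3.7 also accepts add_subparsers (required=True).) The hasattr
+    # check below achieves the same effect by hand.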
+    subparsers = parser.add_subparsers()
+
+    checkparser = subparsers.add_parser('check', help='Show erratas')
+    checkparser.set_defaults (func=errataCheck)
+
+    fixparser = subparsers.add_parser('fix', help='Fix erratas')
+    fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
+    fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
+    fixparser.set_defaults (func=errataFix)
+
+    args = parser.parse_args()
+
+    if not hasattr (args, 'func'):
+        parser.print_usage ()
+        parser.exit ()
+
+    return args.func (args)
diff --git a/crocoite/util.py b/crocoite/util.py
index ec257f1..da377a3 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,22 +22,82 @@ Random utility functions
 """
 
-import random
+import random, sys, platform, os, json, urllib
+from datetime import datetime
+import hashlib, pkg_resources
 
-def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
-    if length is None:
-        length = random.randint (16, 32)
-    return ''.join (map (lambda x: random.choice (chars), range (length)))
+from yarl import URL
 
-def packageUrl (path):
-    """
-    Create URL for package data stored into WARC
-    """
-    return 'urn:' + __package__ + ':' + path
+class StrJsonEncoder (json.JSONEncoder):
+    """ JSON encoder that turns unknown classes into a string and thus never
+    fails """
+    def default (self, obj):
+        if isinstance (obj, datetime):
+            return obj.isoformat ()
+
+        # make sure serialization always succeeds
+        try:
+            return json.JSONEncoder.default(self, obj)
+        except TypeError:
+            return str (obj)
 
-def getFormattedViewportMetrics (tab):
-    layoutMetrics = tab.Page.getLayoutMetrics ()
+async def getFormattedViewportMetrics (tab):
+    layoutMetrics = await tab.Page.getLayoutMetrics ()
     # XXX: I’m not entirely sure which one we should use here
-    return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
-            layoutMetrics['layoutViewport']['clientHeight'])
+    viewport = layoutMetrics['layoutViewport']
+    return f"{viewport['clientWidth']}x{viewport['clientHeight']}"
+
+def getSoftwareInfo ():
+    """ Get software info for inclusion into warcinfo """
+    return {
+        'platform': platform.platform (),
+        'python': {
+            'implementation': platform.python_implementation(),
+            'version': platform.python_version (),
+            'build': platform.python_build ()
+            },
+        'self': getRequirements (__package__)
+        }
+
+def getRequirements (dist):
+    """ Get dependencies of a package.
+
+    Figure out packages’ dependencies based on setup/distutils, then look at
+    modules loaded and compute hashes of each loaded dependency.
+
+    This does not and cannot protect against malicious people. Its only
+    purpose is to recreate this exact environment.
+    """
+
+    pending = {dist}
+    have = set ()
+    packages = []
+    while pending:
+        d = pkg_resources.get_distribution (pending.pop ())
+
+        modules = list (filter (lambda x: x, d.get_metadata ('top_level.txt').split ('\n')))
+        modhashes = {}
+        # hash loaded modules
+        for m in sys.modules.values ():
+            f = getattr (m, '__file__', None)
+            pkg = getattr (m, '__package__', None)
+            # is loaded?
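+            # (a module counts as loaded when its __package__ appears in the
+            # distribution’s top-level names from top_level.txt)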
+            if pkg in modules:
+                if f and os.path.isfile (f):
+                    with open (f, 'rb') as fd:
+                        contents = fd.read ()
+                    h = hashlib.new ('sha512')
+                    h.update (contents)
+                    modhashes[m.__name__] = {'sha512': h.hexdigest (), 'len': len (contents)}
+                else:
+                    modhashes[m.__name__] = {}
+
+        # only if one of the packages’ modules is actually loaded
+        if modhashes:
+            packages.append ({'projectName': d.project_name, 'modules': modhashes, 'version': d.version})
+
+        have.add (dist)
+        pending.update (d.requires ())
+        pending.difference_update (have)
+    return packages
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 3fd65e4..415b487 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -22,178 +22,131 @@
 Classes writing data to WARC files
 """
 
-import logging
-import json
-from http.server import BaseHTTPRequestHandler
+import json, threading
 from io import BytesIO
-from warcio.statusandheaders import StatusAndHeaders
-from urllib.parse import urlsplit
-from logging.handlers import BufferingHandler
 from datetime import datetime
-from threading import Thread
-from queue import Queue
+from http.server import BaseHTTPRequestHandler
 
 from warcio.timeutils import datetime_to_iso_date
 from warcio.warcwriter import WARCWriter
-
-from .browser import AccountingSiteLoader
-from .util import packageUrl
-from .controller import defaultSettings
-
-class SerializingWARCWriter (WARCWriter):
-    """
-    Serializing WARC writer using separate writer thread and queue for
-    non-blocking operation
-
-    Needs an explicit .flush() before deletion.
-    """
-
-    def __init__ (self, filebuf, *args, **kwargs):
-        WARCWriter.__init__ (self, filebuf, *args, **kwargs)
-        self.queue = Queue ()
-        self.thread = Thread (target=self._run_writer)
-        self.thread.start ()
-
-    def flush (self):
-        self.queue.put (None)
-        self.thread.join ()
-        self.queue = None
-        self.thread = None
-
-    def _run_writer (self):
-        while True:
-            item = self.queue.get ()
-            if not item:
-                break
-            out, record = item
-            WARCWriter._write_warc_record (self, out, record)
-
-    def _write_warc_record (self, out, record):
-        self.queue.put ((out, record))
-
-class WARCLogHandler (BufferingHandler):
-    """
-    Buffered log handler, flushing to warcio
-    """
-
-    contentType = 'text/plain; charset=utf-8'
-
-    def __init__ (self, capacity, warcfile):
-        BufferingHandler.__init__ (self, capacity)
-        self.warcfile = warcfile
-
-    def flush (self):
-        self.acquire ()
-        try:
-            if self.buffer:
-                buf = ''
-                for record in self.buffer:
-                    buf += self.format (record)
-                    buf += '\n'
-                # XXX: record type?
-                record = self.warcfile.create_warc_record (
-                        packageUrl ('log'), 'metadata',
-                        payload=BytesIO (buf.encode ('utf8')),
-                        warc_headers_dict={'Content-Type': self.contentType})
-                self.warcfile.write_record(record)
-                self.buffer = []
-        finally:
-            self.release ()
-
-class WarcLoader (AccountingSiteLoader):
-    def __init__ (self, browser, url, writer,
-            logger=logging.getLogger(__name__),
-            logBuffer=defaultSettings.logBuffer,
-            maxBodySize=defaultSettings.maxBodySize):
-        super ().__init__ (browser, url, logger)
-        self.writer = writer
-        self.maxBodySize = maxBodySize
-        self.warcLogger = WARCLogHandler (logBuffer, writer)
-        self.logger.addHandler (self.warcLogger)
-
-    def __exit__ (self, exc_type, exc_value, traceback):
-        self.logger.removeHandler (self.warcLogger)
-        self.warcLogger.flush ()
-        return super ().__exit__ (exc_type, exc_value, traceback)
-
-    @staticmethod
-    def getStatusText (response):
-        text = response.get ('statusText')
-        if text:
-            return text
-        text = BaseHTTPRequestHandler.responses.get (response['status'])
-        if text:
-            return text[0]
-        return 'No status text available'
+from warcio.statusandheaders import StatusAndHeaders
+from yarl import URL
+
+from .util import StrJsonEncoder
+from .controller import EventHandler, ControllerStart
+from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
+from .browser import RequestResponsePair, UnicodeBody
+
+# the official mimetype for json, according to https://tools.ietf.org/html/rfc8259
+jsonMime = 'application/json'
+# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2
+jsMime = 'application/javascript'
+
+def makeContentType (mime, charset=None):
+    """ Create value of Content-Type WARC header with optional charset """
+    s = [mime]
+    if charset:
+        s.extend (['; charset=', charset])
+    return ''.join (s)
+
+class WarcHandler (EventHandler):
+    __slots__ = ('logger', 'writer', 'documentRecords', 'log',
+            'maxLogSize', 'logEncoding', 'warcinfoRecordId')
+
+    def __init__ (self, fd, logger):
+        self.logger = logger
+        self.writer = WARCWriter (fd, gzip=True)
+
+        self.logEncoding = 'utf-8'
+        self.log = BytesIO ()
+        # max log buffer size (bytes)
+        self.maxLogSize = 500*1024
+
+        # maps document urls to WARC record ids, required for DomSnapshotEvent
+        # and ScreenshotEvent
+        self.documentRecords = {}
+        # record id of warcinfo record
+        self.warcinfoRecordId = None
+
+    def __enter__ (self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._flushLogEntries ()
+
+    def writeRecord (self, url, kind, payload, warc_headers_dict=None, http_headers=None):
+        """
+        Thin wrapper around writer.create_warc_record and writer.write_record.
+
+        Adds default WARC headers.
+        """
+        assert url is None or isinstance (url, URL)
+
+        d = {}
+        if self.warcinfoRecordId:
+            d['WARC-Warcinfo-ID'] = self.warcinfoRecordId
+        d.update (warc_headers_dict or {})
+        warc_headers_dict = d
+
+        record = self.writer.create_warc_record (str (url) if url else '',
+                kind,
+                payload=payload,
+                warc_headers_dict=warc_headers_dict,
+                http_headers=http_headers)
+        self.writer.write_record (record)
+
+        return record
 
     def _writeRequest (self, item):
-        writer = self.writer
+        logger = self.logger.bind (reqId=item.id)
         req = item.request
-        resp = item.response
-        url = urlsplit (resp['url'])
-
-        path = url.path
-        if url.query:
-            path += '?' + url.query
-        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
-                item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
-        initiator = item.initiator
+        url = item.url
+
+        path = url.relative().with_fragment(None)
+        httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
+                req.headers, protocol='HTTP/1.1', is_http_request=True)
         warcHeaders = {
-            'X-Chrome-Initiator': json.dumps (initiator),
-            'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
+            # required to correlate request with log entries
+            'X-Chrome-Request-ID': item.id,
+            'WARC-Date': datetime_to_iso_date (req.timestamp),
         }
-        payload, payloadBase64Encoded = item.requestBody
-        if payload:
-            payload = BytesIO (payload)
-            warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
-        record = writer.create_warc_record(req['url'], 'request',
-                payload=payload, http_headers=httpHeaders,
-                warc_headers_dict=warcHeaders)
-        writer.write_record(record)
-        return record.rec_headers['WARC-Record-ID']
-
-    def _getBody (self, item, redirect):
-        reqId = item.id
-        resp = item.response
-
-        rawBody = b''
-        base64Encoded = False
-        if redirect:
-            # redirects reuse the same request, thus we cannot safely retrieve
-            # the body (i.e getResponseBody may return the new location’s
-            # body). This is fine.
-            pass
-        elif item.encodedDataLength > self.maxBodySize:
-            # check body size first, since we’re loading everything into memory
-            raise ValueError ('body for {} too large {} vs {}'.format (reqId,
-                    item.encodedDataLength, self.maxBodySize))
+        body = item.request.body
+        if item.request.hasPostData and body is None:
+            # oops, don’t know what went wrong here
+            logger.error ('requestBody missing',
+                    uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
+            warcHeaders['WARC-Truncated'] = 'unspecified'
         else:
-            rawBody, base64Encoded = item.body
-        return rawBody, base64Encoded
+            body = BytesIO (body)
+        record = self.writeRecord (url, 'request',
+                payload=body, http_headers=httpHeaders,
+                warc_headers_dict=warcHeaders)
+        return record.rec_headers['WARC-Record-ID']
 
-    def _writeResponse (self, item, redirect, concurrentTo, rawBody, base64Encoded):
-        writer = self.writer
+    def _writeResponse (self, item, concurrentTo):
+        # fetch the body
         reqId = item.id
-        resp = item.response # now the response
+        resp = item.response
 
         warcHeaders = {
             'WARC-Concurrent-To': concurrentTo,
-            'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
-            'X-Chrome-Protocol': resp.get ('protocol', ''),
-            'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
-            'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
-            'X-Chrome-Base64Body': str (base64Encoded),
-            'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
-                    item.chromeRequest['wallTime']+
-                    (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
+            # required to correlate request with log entries
+            'X-Chrome-Request-ID': item.id,
+            'WARC-Date': datetime_to_iso_date (resp.timestamp),
         }
+        # conditional WARC headers
+        if item.remoteIpAddress:
+            warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
 
-        httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
-                self.getStatusText (resp)), item.responseHeaders,
-                protocol='HTTP/1.1')
+        # HTTP headers
+        statusText = resp.statusText or \
+                BaseHTTPRequestHandler.responses.get (
+                resp.status, ('No status text available', ))[0]
+        httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
+                resp.headers, protocol='HTTP/1.1')
 
         # Content is saved decompressed and decoded, remove these headers
         blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
@@ -203,34 +156,118 @@ class WarcLoader (AccountingSiteLoader):
         # chrome sends nothing but utf8 encoded text. Fortunately HTTP
         # headers take precedence over the document’s <meta>, thus we can
         # easily override those.
-        contentType = resp.get ('mimeType')
-        if contentType:
-            if not base64Encoded:
-                contentType += '; charset=utf-8'
-            httpHeaders.replace_header ('content-type', contentType)
-
-        httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+        if resp.mimeType:
+            charset = 'utf-8' if isinstance (resp.body, UnicodeBody) else None
+            contentType = makeContentType (resp.mimeType, charset=charset)
+            httpHeaders.replace_header ('Content-Type', contentType)
+
+        # response body
+        body = resp.body
+        if body is None:
+            warcHeaders['WARC-Truncated'] = 'unspecified'
+        else:
+            httpHeaders.replace_header ('Content-Length', str (len (body)))
+            body = BytesIO (body)
 
-        record = writer.create_warc_record(resp['url'], 'response',
-                warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
+        record = self.writeRecord (item.url, 'response',
+                warc_headers_dict=warcHeaders, payload=body,
                 http_headers=httpHeaders)
-        writer.write_record(record)
 
-    def loadingFinished (self, item, redirect=False):
-        super ().loadingFinished (item, redirect)
+        if item.resourceType == 'Document':
+            self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID')
 
+    def _writeScript (self, item):
         writer = self.writer
+        encoding = 'utf-8'
+        # XXX: yes, we’re leaking information about the user here, but this is
+        # the one and only source URL of the scripts.
+        uri = URL(f'file://{item.abspath}') if item.path else None
+        self.writeRecord (uri, 'resource',
+                payload=BytesIO (str (item).encode (encoding)),
+                warc_headers_dict={
+                    'Content-Type': makeContentType (jsMime, encoding),
+                    'X-Crocoite-Type': 'script',
+                })
+
+    def _writeItem (self, item):
+        assert item.request
+        concurrentTo = self._writeRequest (item)
+        # items that failed loading don’t have a response
+        if item.response:
+            self._writeResponse (item, concurrentTo)
+
+    def _addRefersTo (self, headers, url):
+        refersTo = self.documentRecords.get (url)
+        if refersTo:
+            headers['WARC-Refers-To'] = refersTo
+        else:
+            self.logger.error (f'No document record found for {url}')
+        return headers
-        req = item.request
-        reqId = item.id
-        resp = item.response
-        url = urlsplit (resp['url'])
-
-        try:
-            # write neither request nor response if we cannot retrieve the body
-            rawBody, base64Encoded = self._getBody (item, redirect)
-            concurrentTo = self._writeRequest (item)
-            self._writeResponse (item, redirect, concurrentTo, rawBody, base64Encoded)
-        except ValueError as e:
-            self.logger.error (e.args[0])
+    def _writeDomSnapshot (self, item):
         writer = self.writer
+
+        warcHeaders = {
+            'X-Crocoite-Type': 'dom-snapshot',
+            'X-Chrome-Viewport': item.viewport,
+            'Content-Type': makeContentType ('text/html', 'utf-8')
+        }
+
+        self._addRefersTo (warcHeaders, item.url)
+
+        self.writeRecord (item.url, 'conversion',
+                payload=BytesIO (item.document),
+                warc_headers_dict=warcHeaders)
+
+    def _writeScreenshot (self, item):
+        writer = self.writer
+        warcHeaders = {
+            'Content-Type': makeContentType ('image/png'),
+            'X-Crocoite-Screenshot-Y-Offset': str (item.yoff),
+            'X-Crocoite-Type': 'screenshot',
+        }
+        self._addRefersTo (warcHeaders, item.url)
+        self.writeRecord (item.url, 'conversion',
+                payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
+
+    def _writeControllerStart (self, item, encoding='utf-8'):
+        payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode (encoding))
+
+        writer = self.writer
+        warcinfo = self.writeRecord (None, 'warcinfo',
+                warc_headers_dict={'Content-Type': makeContentType (jsonMime, encoding)},
+                payload=payload)
+        self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']
+
+    def _flushLogEntries (self):
+        if self.log.tell () > 0:
+            writer = self.writer
+            self.log.seek (0)
+            warcHeaders = {
+                'Content-Type': makeContentType (jsonMime, self.logEncoding),
+                'X-Crocoite-Type': 'log',
+            }
+            self.writeRecord (None, 'metadata', payload=self.log,
+                    warc_headers_dict=warcHeaders)
+            self.log = BytesIO ()
+
+    def _writeLog (self, item):
+        """ Handle log entries, called by .logger.WarcHandlerConsumer only """
+        self.log.write (item.encode (self.logEncoding))
+        self.log.write (b'\n')
+        if self.log.tell () > self.maxLogSize:
+            self._flushLogEntries ()
+
+    route = {Script: _writeScript,
+            RequestResponsePair: _writeItem,
+            DomSnapshotEvent: _writeDomSnapshot,
+            ScreenshotEvent: _writeScreenshot,
+            ControllerStart: _writeControllerStart,
+            }
+
+    async def push (self, item):
+        for k, v in self.route.items ():
+            if isinstance (item, k):
+                v (self, item)
+                break
diff --git a/doc/_ext/clicklist.py b/doc/_ext/clicklist.py
new file mode 100644
index 0000000..a69452c
--- /dev/null
+++ b/doc/_ext/clicklist.py
@@ -0,0 +1,45 @@
+"""
+Render click.yaml config file into human-readable list of supported sites
+"""
+
+import pkg_resources, yaml
+from docutils import nodes
+from docutils.parsers.rst import Directive
+from yarl import URL
+
+class ClickList (Directive):
+    def run(self):
+        # XXX: do this once only
+        fd = pkg_resources.resource_stream ('crocoite', 'data/click.yaml')
+        config = list (yaml.safe_load_all (fd))
+
+        l = nodes.definition_list ()
+        for site in config:
+            urls = set ()
+            v = nodes.definition ()
+            vl = nodes.bullet_list ()
+            v += vl
+            for s in site['selector']:
+                i = nodes.list_item ()
+                i += nodes.paragraph (text=s['description'])
+                vl += i
+                urls.update (map (lambda x: URL(x).with_path ('/'), s.get ('urls', [])))
+
+            item = nodes.definition_list_item ()
+            term = ', '.join (map (lambda x: x.host, urls)) if urls else site['match']
+            k = nodes.term (text=term)
+            item += k
+
+            item += v
+            l += item
+        return [l]
+
+def setup(app):
+    app.add_directive ("clicklist", ClickList)
+
+    return {
+        'version': '0.1',
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+    }
+
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..8336c27
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+import os, sys
+
+# -- Project information -----------------------------------------------------
+
+project = 'crocoite'
+copyright = '2019 crocoite contributors'
+author = 'crocoite contributors'
+
+# -- General configuration ---------------------------------------------------
+
+sys.path.append(os.path.abspath("./_ext"))
+extensions = [
+    'sphinx.ext.viewcode',
+    'sphinx.ext.autodoc',
+    'clicklist',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+source_suffix = '.rst'
+master_doc = 'index'
+language = 'en'
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+pygments_style = 'tango'
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = 'alabaster'
+html_theme_options = {
+    "description": "Preservation for the modern web",
+    "github_user": "PromyLOPh",
+    "github_repo": "crocoite",
+    "travis_button": True,
+    "github_button": True,
+    "codecov_button": True,
+    "fixed_sidebar": True,
+}
+#html_static_path = ['_static']
+html_sidebars = {
+    '**': ['about.html', 'navigation.html', 'searchbox.html'],
+}
+
diff --git a/doc/develop.rst b/doc/develop.rst
new file mode 100644
index 0000000..801ab21
--- /dev/null
+++ b/doc/develop.rst
@@ -0,0 +1,39 @@
+Development
+-----------
+
+Generally crocoite provides reasonable defaults for Google Chrome via
+:py:mod:`crocoite.devtools`. When debugging this software it might be necessary
+to open a non-headless instance of the browser by running
+
+.. code:: bash
+
+   google-chrome-stable --remote-debugging-port=9222 --auto-open-devtools-for-tabs
+
+and then passing the option :option:`--browser=http://localhost:9222` to
+:program:`crocoite-single`. This allows human intervention through the
+browser’s built-in console.
+
+Release guide
+^^^^^^^^^^^^^
+
+crocoite uses `semantic versioning`_. To create a new release, bump the version
+number in ``setup.py`` according to the linked guide, create distribution
+packages::
+
+   python setup.py sdist bdist_wheel
+
+Verify them::
+
+   twine check dist/*
+
+Try to install and use them in a separate sandbox. Finally, sign and upload a
+new version to pypi_::
+
+   gpg --detach-sign --armor dist/*.tar.gz
+   twine upload dist/*
+
+Then update the documentation using :program:`sphinx-build` and upload it as
+well.
+
+.. _semantic versioning: https://semver.org/spec/v2.0.0.html
+.. _pypi: https://pypi.org
+
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..53f5f77
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,36 @@
+crocoite
+========
+
+Preservation for the modern web, powered by `headless Google
+Chrome`_.
+
+.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   usage.rst
+   plugins.rst
+   rationale.rst
+   develop.rst
+   related.rst
+
+Features
+--------
+
+Google Chrome-powered
+    HTML renderer, JavaScript engine and network stack, supporting modern web
+    technologies and protocols
+WARC output
+    Includes all network requests made by the browser
+Site interaction
+    :ref:`Auto-expand on-click content <click>`, infinite-scrolling
+DOM snapshot
+    Contains the page’s state, renderable without JavaScript
+Image screenshot
+    Entire page
+Machine-readable interface
+    Easy integration into custom tools/scripts
+
+
diff --git a/doc/plugins.rst b/doc/plugins.rst
new file mode 100644
index 0000000..062e1bf
--- /dev/null
+++ b/doc/plugins.rst
@@ -0,0 +1,16 @@
+Plugins
+=======
+
+crocoite comes with plug-ins that modify loaded sites or interact with them.
+
+.. _click:
+
+click
+-----
+
+The following sites are currently supported. Note that this is an ongoing
+battle against layout changes, and thus older software versions will stop
+working very soon.
+
+.. clicklist::
+
diff --git a/doc/rationale.rst b/doc/rationale.rst
new file mode 100644
index 0000000..f37db7c
--- /dev/null
+++ b/doc/rationale.rst
@@ -0,0 +1,76 @@
+Rationale
+---------
+
+Most modern websites depend heavily on executing code, usually JavaScript, on
+the user’s machine. They also make use of new and emerging Web technologies
+like HTML5, WebSockets, service workers and more. Even worse from the
+preservation point of view, they also require some form of user interaction to
+dynamically load more content (infinite scrolling, dynamic comment loading,
+etc.).
+
+The naive approach of fetching an HTML page, parsing it and extracting
+links to referenced resources therefore is not sufficient to create a faithful
+snapshot of these web applications. A full browser, capable of running scripts
+and providing modern Web APIs, is absolutely required for this task. Thankfully
+Google Chrome runs without a display (headless mode) and can be controlled by
+external programs, allowing them to navigate and extract or inject data.
+This section describes the solutions crocoite offers and explains design
+decisions taken.
+
+crocoite captures resources by listening to Chrome’s `network events`_ and
+requesting the response body using `Network.getResponseBody`_. This approach
+has caveats: The original HTTP requests and responses, as sent over the wire,
+are not available. They are reconstructed from parsed data. The character
+encoding for text documents is changed to UTF-8. And the content body of HTTP
+redirects cannot be retrieved due to a race condition.
+
+.. _network events: https://chromedevtools.github.io/devtools-protocol/1-3/Network
+.. _Network.getResponseBody: https://chromedevtools.github.io/devtools-protocol/1-3/Network#method-getResponseBody
+
+But at the same time it allows crocoite to rely on Chrome’s well-tested network
+stack and HTTP parser. Thus it supports HTTP versions 1 and 2 as well as
+transport protocols like SSL and QUIC. Depending on Chrome also eliminates the
+need for a man-in-the-middle proxy, like warcprox_, which has to decrypt SSL
+traffic and present a fake certificate to the browser in order to store the
+transmitted content.
+
+.. _warcprox: https://github.com/internetarchive/warcprox
+
+WARC records generated by crocoite therefore are an abstract view on the
+resource they represent and not necessarily the data sent over the wire. A URL
+fetched with HTTP/2 for example will still result in an HTTP/1.1
+request/response pair in the WARC file. This may be undesirable from
+an archivist’s point of view (“save the data exactly like we received it”). But
+this level of abstraction is inevitable when dealing with more than one
+protocol.
+
+crocoite also interacts with and therefore alters the grabbed websites. It does
+so by injecting `behavior scripts`_ into the site. Typically these are written
+in JavaScript, because interacting with a page is easier this way. These
+scripts then perform different tasks: Extracting targets from visible
+hyperlinks, clicking buttons or scrolling the website to load more content,
+as well as taking a static screenshot of ``<canvas>`` elements for the DOM
+snapshot (see below).
+
+.. _behavior scripts: https://github.com/PromyLOPh/crocoite/tree/master/crocoite/data
+
+Replaying archived WARCs can be quite challenging and might not be possible
+with current technology (or even at all):
+
+- Some sites request assets based on screen resolution, pixel ratio and
+  supported image formats (webp). Replaying those with different parameters
+  won’t work, since assets for those are missing. Example: missguided.com.
+- Some fetch different scripts based on user agent. Example: youtube.com.
+- Requests containing randomly generated JavaScript callback function names
+  won’t work. Example: weather.com.
+- Range requests (Range: bytes=1-100) are captured as-is, making playback
+  difficult.
+
+crocoite offers two methods to work around these issues. Firstly it can save a
+DOM snapshot to the WARC file. It contains the entire DOM in HTML format minus
+``<script>`` tags after the site has been fully loaded and thus can be
+displayed without executing scripts. Obviously JavaScript-based navigation
+does not work any more. Secondly it also saves a screenshot of the full page,
+so even if future browsers cannot render and display the stored HTML a fully
+rendered version of the website can be replayed instead.
+
diff --git a/doc/related.rst b/doc/related.rst
new file mode 100644
index 0000000..62e2569
--- /dev/null
+++ b/doc/related.rst
@@ -0,0 +1,14 @@
+Related projects
+----------------
+
+brozzler_
+    Uses Google Chrome as well, but intercepts traffic using a proxy. Supports
+    distributed crawling and immediate playback.
+Squidwarc_
+    Communicates with headless Google Chrome and uses the Network API to
+    retrieve requests like crocoite. Supports recursive crawls and page
+    scrolling, but neither custom JavaScript nor distributed crawling.
+
+.. _brozzler: https://github.com/internetarchive/brozzler
+.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc
+
diff --git a/doc/usage.rst b/doc/usage.rst
new file mode 100644
index 0000000..34a3e7b
--- /dev/null
+++ b/doc/usage.rst
@@ -0,0 +1,162 @@
+Usage
+-----
+
+Quick start using pywb_, expects Google Chrome to be installed already:
+
+.. code:: bash
+
+   pip install crocoite pywb
+   crocoite http://example.com/ example.com.warc.gz
+   wb-manager init test && wb-manager add test example.com.warc.gz
+   wayback &
+   $BROWSER http://localhost:8080
+
+.. _pywb: https://github.com/ikreymer/pywb
+
+It is recommended to install at least Microsoft’s Corefonts_ as well as DejaVu_,
+Liberation_ or a similar font family covering a wide range of character sets.
+Otherwise page screenshots may be unusable due to missing glyphs.
+
+.. _Corefonts: http://corefonts.sourceforge.net/
+.. _DejaVu: https://dejavu-fonts.github.io/
+.. _Liberation: https://pagure.io/liberation-fonts
+
+Recursion
+^^^^^^^^^
+
+.. program:: crocoite
+
+By default crocoite will only retrieve the URL specified on the command line.
+However it can follow links as well. There are currently two recursion
+strategies available, depth- and prefix-based.
+
+.. code:: bash
+
+   crocoite -r 1 https://example.com/ example.com.warc.gz
+
+will retrieve ``example.com`` and all pages directly referred to by it.
+Increasing the number increases the depth, so a value of :samp:`2` would first grab
+``example.com``, queue all pages linked there as well as every reference on
+each of those pages.
+
+On the other hand
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/dir/ example.com.warc.gz
+
+will retrieve the URL specified and all pages referenced which have the same
+URL prefix. The trailing slash is significant. Without it crocoite would also
+grab ``/dir-something`` or ``/dir.html``, for example.
+
+If an output file template is used, each page is written to an individual
+file. For example
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/ '{host}-{date}-{seqnum}.warc.gz'
+
+will write one file per page to files like
+:file:`example.com-2019-09-09T15:15:15+02:00-1.warc.gz`. ``seqnum`` is unique to
+each page of a single job and should always be used.
+
+When running a recursive job, increasing the concurrency (i.e. how many pages
+are fetched at the same time) can speed up the process. For example you can
+pass :option:`-j` :samp:`4` to retrieve four pages at the same time. Keep in mind
+that each process starts a full browser that requires a lot of resources (one
+to two GB of RAM and one or two CPU cores).
+
+Customizing
+^^^^^^^^^^^
+
+.. program:: crocoite-single
+
+Under the hood :program:`crocoite` starts one instance of
+:program:`crocoite-single` to fetch each page. You can customize its options by
+appending a command template like this:
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com example.com.warc.gz -- \
+       crocoite-single --timeout 5 -k '{url}' '{dest}'
+
+This reduces the global timeout to 5 seconds and ignores TLS errors. If an
+option is prefixed with an exclamation mark (``!``) it will not be expanded.
+This is useful for passing :option:`--warcinfo`, which expects JSON-encoded data.
+
+Command line options
+^^^^^^^^^^^^^^^^^^^^
+
+Below is a list of all command line arguments available:
+
+.. program:: crocoite
+
+crocoite
+++++++++
+
+Front-end with recursion support and simple job management.
+
+.. option:: -j N, --concurrency N
+
+   Maximum number of concurrent fetch jobs.
+
+.. option:: -r POLICY, --recursion POLICY
+
+   Enables recursion based on POLICY, which can be a positive integer
+   (recursion depth) or the string :kbd:`prefix`.
+
+.. option:: --tempdir DIR
+
+   Directory for temporary WARC files.
+
+.. program:: crocoite-single
+
+crocoite-single
++++++++++++++++
+
+Back-end to fetch a single page.
+
+.. option:: -b SET-COOKIE, --cookie SET-COOKIE
+
+   Add cookie to browser’s cookie jar. This option always *appends* cookies,
+   replacing those provided by :option:`-c`.
+
+   .. versionadded:: 1.1
+
+.. option:: -c FILE, --cookie-jar FILE
+
+   Load cookies from FILE. :program:`crocoite` provides a default cookie file,
+   which contains cookies to, for example, circumvent age restrictions. This
+   option *replaces* that default file.
+
+   .. versionadded:: 1.1
+
+.. option:: --idle-timeout SEC
+
+   Time after which a page is considered “idle”.
+
+.. option:: -k, --insecure
+
+   Allow insecure connections, i.e. self-signed or expired HTTPS certificates.
+
+.. option:: --timeout SEC
+
+   Global archiving timeout.
+
+
+.. option:: --warcinfo JSON
+
+   Inject additional JSON-encoded information into the resulting WARC.
+
+IRC bot
+^^^^^^^
+
+A simple IRC bot (“chromebot”) is provided with the command :program:`crocoite-irc`.
+It reads its configuration from a config file like the example provided in
+:file:`contrib/chromebot.json` and supports the following commands:
+
+a <url> -j <concurrency> -r <policy> -k -b <set-cookie>
+    Archive <url> with <concurrency> processes according to recursion <policy>
+s <uuid>
+    Get job status for <uuid>
+r <uuid>
+    Revoke or abort running job with <uuid>
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..ec7d730
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[aliases]
+test=pytest
+[tool:pytest]
+addopts=--cov-report=html --cov-report=xml --cov=crocoite --cov-config=setup.cfg
+[coverage:run]
+branch=True
+[build_sphinx]
+builder=dirhtml
@@ -2,27 +2,56 @@ from setuptools import setup
 
 setup(
     name='crocoite',
-    version='0.1.0',
+    version='1.1.1',
     author='Lars-Dominik Braun',
     author_email='lars+crocoite@6xq.net',
+    url='https://6xq.net/crocoite/',
     packages=['crocoite'],
     license='LICENSE.txt',
     description='Save website to WARC using Google Chrome.',
     long_description=open('README.rst').read(),
+    long_description_content_type='text/x-rst',
     install_requires=[
-        'pychrome',
         'warcio',
         'html5lib>=0.999999999',
-        'Celery',
+        'bottom',
+        'pytz',
+        'websockets',
+        'aiohttp',
+        'PyYAML',
+        'yarl>=1.4,<1.5',
+        'multidict',
     ],
+    extras_require={
+        'manhole': ['manhole>=1.6'],
+    },
    entry_points={
         'console_scripts': [
-            'crocoite-grab = crocoite.cli:main',
-            'crocoite-merge-warc = crocoite.tools:mergeWarc',
+            # the main executable
+            'crocoite = crocoite.cli:recursive',
+            # backend helper
+            'crocoite-single = crocoite.cli:single',
+            # irc bot and dashboard
+            'crocoite-irc = crocoite.cli:irc',
+            'crocoite-irc-dashboard = crocoite.cli:dashboard',
+            # misc tools
+            'crocoite-merge-warc = crocoite.tools:mergeWarcCli',
             'crocoite-extract-screenshot = crocoite.tools:extractScreenshot',
+            'crocoite-errata = crocoite.tools:errata',
         ],
     },
     package_data={
         'crocoite': ['data/*'],
     },
+    setup_requires=['pytest-runner'],
+    tests_require=["pytest", 'pytest-asyncio', 'pytest-cov', 'hypothesis'],
+    python_requires='>=3.6',
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: POSIX',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Topic :: Internet :: WWW/HTTP',
+    ],
 )