45 files changed, 5294 insertions, 1677 deletions
| @@ -2,3 +2,6 @@ __pycache__  *.sw?  *.egg-info/  .pytest_cache/ +coverage.xml +htmlcov +.coverage diff --git a/.travis.yml b/.travis.yml index ea09d94..b1d417c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,17 @@ -dist: trusty +dist: xenial  language: python -python: -    - "3.4" -    - "3.5" -    - "3.6" -    - "pypy3.5" +matrix: +  include: +    - python: "3.6" +    - python: "3.7" +    - python: "3.8" +    - python: "3.6-dev" +    - python: "3.7-dev" +    - python: "3.8-dev" +  allow_failures: +    - python: "3.6-dev" +    - python: "3.7-dev" +    - python: "3.8-dev"  install:      - pip install .  script: @@ -12,3 +19,5 @@ script:  addons:      chrome: stable  sudo: required +after_success: +  - bash <(curl -s https://codecov.io/bash) @@ -1,134 +1,15 @@  crocoite  ======== -Archive websites using `headless Google Chrome`_ and its DevTools protocol. +.. code:: bash -.. image:: https://travis-ci.org/PromyLOPh/crocoite.svg?branch=master -    :target: https://travis-ci.org/PromyLOPh/crocoite - -.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome - -Dependencies ------------- - -- Python 3 -- pychrome_  -- warcio_ -- html5lib_ -- Celery_ - -.. _pychrome: https://github.com/fate0/pychrome -.. _warcio: https://github.com/webrecorder/warcio -.. _html5lib: https://github.com/html5lib/html5lib-python -.. _Celery: http://www.celeryproject.org/ - -Usage ------ - -One-shot commandline interface and pywb_ playback:: - -    crocoite-grab --output example.com.warc.gz http://example.com/ -    rm -rf collections && wb-manager init test && wb-manager add test example.com.warc.gz +    pip install crocoite pywb +    crocoite http://example.com/ example.com.warc.gz +    wb-manager init test && wb-manager add test example.com.warc.gz      wayback &      $BROWSER http://localhost:8080 -.. _pywb: https://github.com/ikreymer/pywb - -Behavior scripts -^^^^^^^^^^^^^^^^ - -A lot of sites need some form of interaction to dynamically load more content. Twitter for -instance continously loads new posts when scrolling to the bottom of the page. -crocoite can emulate these user interactions (and more) by combining control -code written in Python and injecting JavaScript into the page. The code can be -limited to certain URLs or apply to every page loaded. By default all scripts -available are enabled, see command line flag ``--behavior``. - -Caveats -------- - -- Original HTTP requests/responses are not available. They are rebuilt from -  parsed data. Character encoding for text documents is changed to UTF-8. -- Some sites request assets based on screen resolution, pixel ratio and -  supported image formats (webp). Replaying those with different parameters -  won’t work, since assets for those are missing. Example: missguided.com. -- Some fetch different scripts based on user agent. Example: youtube.com. -- Requests containing randomly generated JavaScript callback function names -  won’t work. Example: weather.com. -- Range requests (Range: bytes=1-100) are captured as-is, making playback -  difficult -- Content body of HTTP redirects cannot be retrived due to race condition - -Most of these issues can be worked around by using the DOM snapshot, which is -also saved. This causes its own set of issues though: - -- JavaScript-based navigation does not work. - -Distributed crawling --------------------- - -Configure using celeryconfig.py - -.. 
code:: python - -    broker_url = 'pyamqp://' -    result_backend = 'rpc://' -    warc_filename = '{domain}-{date}-{id}.warc.gz' -    temp_dir = '/tmp/' -    finished_dir = '/tmp/finished' - -Start a Celery worker:: - -    celery -A crocoite.task worker -Q crocoite.archive,crocoite.controller --loglevel=info - -Then queue archive job:: - -    crocoite-grab --distributed http://example.com - -The worker will create a temporary file named according to ``warc_filename`` in -``/tmp`` while archiving and move it to ``/tmp/finished`` when done. - -IRC bot -^^^^^^^ - -Configure sopel_ (``~/.sopel/default.cfg``) to use the plugin located in -``contrib/celerycrocoite.py`` - -.. code:: ini - -    [core] -    nick = chromebot -    host = irc.efnet.fr -    port = 6667 -    owner = someone -    extra = /path/to/crocoite/contrib -    enable = celerycrocoite -    channels = #somechannel - -Then start it by running ``sopel``. The bot must be addressed directly (i.e. -``chromebot: <command>``). The following commands are currently supported: - -a <url> -    Archives <url> and all of its resources (images, css, …). A unique UID -    (UUID) is assigned to each job. -s <uuid> -    Get status of job with <uuid> -r <uuid> -    Revoke job with <uuid>. If it started already the job will be killed. - -.. _sopel: https://sopel.chat/ - -Related projects ----------------- - -brozzler_ -    Uses Google Chrome as well, but intercepts traffic using a proxy. Supports -    distributed crawling and immediate playback. -Squidwarc_ -    Communicates with headless Google Chrome and uses the Network API to -    retrieve requests like crocoite. Supports recursive crawls and page -    scrolling, but neither custom JavaScript nor distributed crawling. +See documentation_ for more information. -.. _brozzler: https://github.com/internetarchive/brozzler -.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc +.. _documentation: https://6xq.net/crocoite/ diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py deleted file mode 100644 index 3da43d9..0000000 --- a/contrib/celerycrocoite.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2017 crocoite contributors -#  -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -#  -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -#  -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
- -""" -Module for Sopel IRC bot -""" - -import os, logging, argparse -from sopel.module import nickname_commands, require_chanmsg, thread, example, require_privilege, VOICE -from sopel.tools import Identifier, SopelMemory -import celery, celery.exceptions -from celery.result import AsyncResult -from urllib.parse import urlsplit -from threading import Thread -from queue import Queue -import queue - -from crocoite import behavior, task -from crocoite.controller import defaultSettings - -def prettyTimeDelta (seconds): -    """ -    Pretty-print seconds to human readable string 1d 1h 1m 1s -    """ -    seconds = int(seconds) -    days, seconds = divmod(seconds, 86400) -    hours, seconds = divmod(seconds, 3600) -    minutes, seconds = divmod(seconds, 60) -    s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')] -    s = filter (lambda x: x[0] != 0, s) -    return ' '.join (map (lambda x: '{}{}'.format (*x), s)) - -def prettyBytes (b): -    """ -    Pretty-print bytes -    """ -    prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB'] -    while b >= 1024 and len (prefixes) > 1: -        b /= 1024 -        prefixes.pop (0) -    return '{:.1f} {}'.format (b, prefixes[0]) - -def setup (bot): -    m = bot.memory['crocoite'] = {} -    q = m['q'] = Queue () -    t = m['t'] = Thread (target=celeryWorker, args=(bot, q)) -    t.start () - -def shutdown (bot): -    m = bot.memory['crocoite'] -    q = m['q'] -    t = m['t'] -    q.put_nowait (None) -    t.join () - -def isValidUrl (s): -    url = urlsplit (s) -    if url.scheme and url.netloc and url.scheme in {'http', 'https'}: -        return s -    raise TypeError () - -def checkCompletedJobs (bot, jobs): -    delete = set () -    for i, data in jobs.items (): -        handle = data['handle'] -        trigger = data['trigger'] -        args = data['args'] -        url = args['url'] -        channel = trigger.sender -        user = trigger.nick -        if Identifier (channel) not in bot.channels: -            continue -        try: -            stats = handle.get (timeout=0.1) -            bot.msg (channel, '{}: {} ({}) finished. {} crashed, {} requests, {} failed, {} received.'.format (user, url, -                    handle.id, stats['crashed'], stats['requests'], stats['failed'], -                    prettyBytes (stats['bytesRcv']))) -            delete.add (handle.id) -        except celery.exceptions.TimeoutError: -            pass -        except Exception as e: -            # json serialization does not work well with exceptions. If their class -            # names are unique we can still distinguish them. -            ename = type (e).__name__ -            if ename == 'TaskRevokedError': -                bot.msg (channel, '{}: {} ({}) was revoked'.format (user, url, handle.id)) -            else: -                bot.msg (channel, '{} ({}) failed'.format (user, url, handle.id)) -                logging.exception ('{} ({}) failed'.format (url, handle.id)) -            delete.add (handle.id) -    for d in delete: -        del jobs[d] - -def celeryWorker (bot, q): -    """ -    Serialize celery operations in a single thread. 
This is a workaround for -    https://github.com/celery/celery/issues/4480 -    """ - -    jobs = {} - -    while True: -        try: -            item = q.get (timeout=1) -        except queue.Empty: -            checkCompletedJobs (bot, jobs) -            continue - -        if item is None: -            break -        action, trigger, args = item -        if action == 'a': -            handle = task.controller.delay (**args) -            j = jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'args': args} - -            # pretty-print a few selected args -            showargs = { -                    'idleTimeout': prettyTimeDelta (args['settings']['idleTimeout']), -                    'timeout': prettyTimeDelta (args['settings']['timeout']), -                    'maxBodySize': prettyBytes (args['settings']['maxBodySize']), -                    'recursive': args['recursive'], -                    'concurrency': args['concurrency'], -                    } -            strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) -            bot.msg (trigger.sender, '{}: {} has been queued as {} with {}'.format (trigger.nick, args['url'], handle.id, strargs)) -        elif action == 'status': -            if args and args in jobs: -                j = jobs[args] -                jtrigger = j['trigger'] -                handle = j['handle'] -                bot.msg (trigger.sender, '{}: {}, queued {}, by {}'.format (handle.id, -                        handle.status, jtrigger.time, jtrigger.nick)) -            else: -                bot.msg (trigger.sender, "Job not found.") -        elif action == 'revoke': -            if args and args in jobs: -                j = jobs[args] -                handle = j['handle'] -                handle.revoke (terminate=True) -                # response is handled above -            else: -                bot.msg (trigger.sender, "Job not found.") -        q.task_done () - -class NonExitingArgumentParser (argparse.ArgumentParser): -    def exit (self, status=0, message=None): -        # should never be called -        pass - -    def error (self, message): -        raise Exception (message) - -archiveparser = NonExitingArgumentParser (prog='a', add_help=False) -archiveparser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC', choices=[60, 1*60*60, 2*60*60]) -archiveparser.add_argument('--idle-timeout', default=10, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC', choices=[1, 10, 20, 30, 60]) -archiveparser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, defaultSettings.maxBodySize, 100*1024*1024]) -archiveparser.add_argument('--concurrency', default=1, type=int, help='Parallel workers for this job', choices=range (9)) -archiveparser.add_argument('--recursive', help='Enable recursion', choices=['0', '1', '2', '3', 'prefix']) -archiveparser.add_argument('url', help='Website URL', type=isValidUrl) - -@nickname_commands ('a', 'archive') -@require_chanmsg () -@require_privilege (VOICE) -@example ('a http://example.com') -def archive (bot, trigger): -    """ -    Archive a URL to WARC -    """ - -    try: -        args = archiveparser.parse_args (trigger.group (2).split ()) -    except Exception as e: -        bot.reply ('{} -- {}'.format (e.args[0], archiveparser.format_usage ())) -        return -    if not args: -        bot.reply ('Sorry, I don’t understand {}'.format (trigger.group (2))) -        return -    settings = dict (maxBodySize=args.maxBodySize, -            logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout, -            timeout=args.timeout) -    args = dict (url=args.url, -            enabledBehaviorNames=list (behavior.availableMap.keys ()), -            settings=settings, recursive=args.recursive, -            concurrency=args.concurrency) -    q = bot.memory['crocoite']['q'] -    q.put_nowait (('a', trigger, args)) - -@nickname_commands ('s', 'status') -@example ('s c251f09e-3c26-481f-96e0-4b5f58bd1170') -@require_chanmsg () -def status (bot, trigger): -    """ -    Retrieve status for a job -    """ - -    i = trigger.group(2) -    q = bot.memory['crocoite']['q'] -    q.put_nowait (('status', trigger, i)) - -@nickname_commands ('r', 'revoke') -@example ('r c251f09e-3c26-481f-96e0-4b5f58bd1170') -@require_privilege (VOICE) -@require_chanmsg () -def revoke (bot, trigger): -    """ -    Cancel (revoke) a job -    """ - -    i = trigger.group(2) -    q = bot.memory['crocoite']['q'] -    q.put_nowait (('revoke', trigger, i)) - diff --git a/contrib/chromebot.json b/contrib/chromebot.json new file mode 100644 index 0000000..214b770 --- /dev/null +++ b/contrib/chromebot.json @@ -0,0 +1,16 @@ +{ +  "irc": { +    "host": "irc.example.com", +    "port": 6667, +    "ssl": false, +    "nick": "chromebot", +    "channels": ["#testchannel"] +  }, +  "tempdir": "/path/to/tmp", +  "destdir": "/path/to/warc", +  "process_limit": 1 +  "blacklist": { +    "^https?://(.+\\.)?local(host)?/": "Not acceptable" +  }, +  "need_voice": false +} diff --git a/contrib/dashboard.css b/contrib/dashboard.css new file mode 100644 index 0000000..469db78 --- /dev/null +++ b/contrib/dashboard.css @@ -0,0 +1,7 @@ +.jid { +	font-family: Consolas, mono; +} +.job .urls { +	max-height: 10em; +	overflow-y: scroll; +} diff --git a/contrib/dashboard.html b/contrib/dashboard.html new file mode 100644 index 0000000..49a15bc --- /dev/null +++ b/contrib/dashboard.html @@ -0,0 +1,23 @@ +<!DOCTYPE html> +<html lang="en"> +    <head> +		<meta charset="utf-8"> +		 <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> +        <title>chromebot dashboard</title> +		<!--<script src="https://cdn.jsdelivr.net/npm/vue@2/dist/vue.js"></script>--> +		<script src="https://cdn.jsdelivr.net/npm/vue@2/dist/vue.min.js"></script> +		<link rel="stylesheet" 
href="https://cdn.jsdelivr.net/npm/bulma@0.7/css/bulma.min.css"> +		<link rel="stylesheet" href="dashboard.css"> +    </head> +    <body> +		<noscript>Please enable JavaScript.</noscript> +		<section id="app" class="section"> +		<h1 class="title">chromebot dashboard</h1> +		<bot-status v-bind:jobs="jobs"></bot-status> +		<div class="jobs"> +			<job-item v-for="j in jobs" v-bind:job="j" v-bind:jobs="jobs" v-bind:key="j.id"></job-item> +		</div> +		</section> +		<script src="dashboard.js"></script> +    </body> +</html> diff --git a/contrib/dashboard.js b/contrib/dashboard.js new file mode 100644 index 0000000..b5520dc --- /dev/null +++ b/contrib/dashboard.js @@ -0,0 +1,129 @@ +/* configuration */ +let socket = "wss://localhost:6789/", +	urllogMax = 100; + +function formatSize (bytes) { +	let prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB']; +	while (bytes >= 1024 && prefixes.length > 1) { +		bytes /= 1024; +		prefixes.shift (); +	} +	return bytes.toFixed (1) + ' ' + prefixes[0]; +} + +class Job { +	constructor (id, url, user, queued) { +		this.id = id; +		this.url = url; +		this.user = user; +		this.status = undefined; +		this.stats = {'pending': 0, 'have': 0, 'running': 0, +				'requests': 0, 'finished': 0, 'failed': 0, +				'bytesRcv': 0, 'crashed': 0, 'ignored': 0}; +		this.urllog = []; +		this.queued = queued; +		this.started = undefined; +		this.finished = undefined; +		this.aborted = undefined; +	} + +	addUrl (url) { +		if (this.urllog.push (url) > urllogMax) { +			this.urllog.shift (); +		} +	} +} + +let jobs = {}; +let ws = new WebSocket(socket); +ws.onmessage = function (event) { +	var msg = JSON.parse (event.data); +	let msgdate = new Date (Date.parse (msg.date)); +	var j = undefined; +	if (msg.job) { +		j = jobs[msg.job]; +		if (j === undefined) { +			j = new Job (msg.job, 'unknown', '<unknown>', new Date ()); +			Vue.set (jobs, msg.job, j); +		} +	} +	if (msg.uuid == '36cc34a6-061b-4cc5-84a9-4ab6552c8d75') { +		j = new Job (msg.job, msg.url, msg.user, msgdate); +		/* jobs[msg.job] = j does not work with vue, see +		https://vuejs.org/v2/guide/list.html#Object-Change-Detection-Caveats +		*/ +		Vue.set (jobs, msg.job, j); +		j.status = 'pending'; +	} else if (msg.uuid == '46e62d60-f498-4ab0-90e1-d08a073b10fb') { +		j.status = 'running'; +		j.started = msgdate; +	} else if (msg.uuid == '7b40ffbb-faab-4224-90ed-cd4febd8f7ec') { +		j.status = 'finished'; +		j.finished = msgdate; +	} else if (msg.uuid == '865b3b3e-a54a-4a56-a545-f38a37bac295') { +		j.status = 'aborted'; +		j.aborted = msgdate; +	} else if (msg.uuid == '5c0f9a11-dcd8-4182-a60f-54f4d3ab3687') { +		/* forwarded job message */ +		let rmsg = msg.data; +		if (rmsg.uuid == '24d92d16-770e-4088-b769-4020e127a7ff') { +			/* job status */ +			Object.assign (j.stats, rmsg); +		} else if (rmsg.uuid == '5b8498e4-868d-413c-a67e-004516b8452c') { +			/* recursion status */ +			Object.assign (j.stats, rmsg); +		} else if (rmsg.uuid == 'd1288fbe-8bae-42c8-af8c-f2fa8b41794f') { +			/* fetch */ +			j.addUrl (rmsg.url); +		} +	} +}; +ws.onopen = function (event) { +}; +ws.onerror = function (event) { +}; + +Vue.component('job-item', { +  props: ['job', 'jobs'], +  template: '<div class="job box" :id="job.id"><ul class="columns"><li class="jid column is-narrow"><a :href="\'#\' + job.id">{{ job.id }}</a></li><li class="url column"><a :href="job.url">{{ job.url }}</a></li><li class="status column is-narrow"><job-status v-bind:job="job"></job-status></li></ul><job-stats v-bind:job="job"></job-stats></div>', +}); +Vue.component('job-status', { +	
props: ['job'], +	template: '<span v-if="job.status == \'pending\'">queued on {{ job.queued.toLocaleString() }}</span><span v-else-if="job.status == \'aborted\'">aborted on {{ job.aborted.toLocaleString() }}</span><span v-else-if="job.status == \'running\'">running since {{ job.started.toLocaleString() }}</span><span v-else-if="job.status == \'finished\'">finished since {{ job.finished.toLocaleString() }}</span>' +}); +Vue.component('job-stats', { +  props: ['job'], +  template: '<div><progress class="progress is-info" :value="job.stats.have" :max="job.stats.have+job.stats.pending+job.stats.running"></progress><ul class="stats columns"><li class="column">{{ job.stats.have }} <small>have</small></li><li class="column">{{ job.stats.running }} <small>running</small></li><li class="column">{{ job.stats.pending }} <small>pending</small></li><li class="column">{{ job.stats.requests }} <small>requests</small><li class="column"><filesize v-bind:value="job.stats.bytesRcv"></filesize></li></ul><job-urls v-bind:job="job"></job-urls></div>' +}); +Vue.component('job-urls', { +  props: ['job'], +  template: '<ul class="urls"><li v-for="u in job.urllog">{{ u }}</li></ul>' +}); +Vue.component('filesize', { +  props: ['value'], +  template: '<span class="filesize">{{ fvalue }}</span>', +  computed: { fvalue: function () { return formatSize (this.value); } } +}); +Vue.component('bot-status', { +	props: ['jobs'], +	template: '<nav class="level"><div class="level-item has-text-centered"><div><p class="heading">Pending</p><p class="title">{{ stats.pending }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Running</p><p class="title">{{ stats.running }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Finished</p><p class="title">{{ stats.finished+stats.aborted }}</p></div></div><div class="level-item has-text-centered"><div><p class="heading">Transferred</p><p class="title"><filesize v-bind:value="stats.totalBytes"></filesize></p></div></div></nav>', +	computed: { +		stats: function () { +			let s = {pending: 0, running: 0, finished: 0, aborted: 0, totalBytes: 0}; +			for (let k in this.jobs) { +				let j = this.jobs[k]; +				s[j.status]++; +				s.totalBytes += j.stats.bytesRcv; +			} +			return s; +		} +	} +}); + +let app = new Vue({ +  el: '#app', +  data: { +	jobs: jobs, +  } +}); + diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 8c24c59..1610751 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017 crocoite contributors +# Copyright (c) 2017–2018 crocoite contributors  #   # Permission is hereby granted, free of charge, to any person obtaining a copy  # of this software and associated documentation files (the "Software"), to deal @@ -19,43 +19,62 @@  # THE SOFTWARE.  """ -Generic and per-site behavior scripts +Behavior scripts (i.e. subclasses of Behavior) are a powerful method to +manipulate websites loaded into Chrome. They are executed by the controller +after the page started loading (onload), after it has been idle for a while +(onstop) and after loading was stopped (onfinish). + +The script’s excercise their power either through DevTools API calls or by +injecting JavaScript into the page context. Thus they can manipulate both, the +browser itself (DevTools; modify resolution, get DOM snapshot) as well as the +page (JavaScript; trigger JavaScript events, call web API’s). + +They also emit (yield) data processable by any consumer registered to the +controller. 
This allows storing captured screenshots inside WARC files, for +instance.  """ -import time -from urllib.parse import urlsplit -import os.path -import pkg_resources +import asyncio, json, os.path  from base64 import b64decode  from collections import OrderedDict +import pkg_resources  from html5lib.serializer import HTMLSerializer -from pychrome.exceptions import TimeoutException +from yarl import URL +import yaml -from .util import randomString, getFormattedViewportMetrics, removeFragment +from .util import getFormattedViewportMetrics  from . import html  from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker +from .devtools import Crashed, TabException  class Script:      """ A JavaScript resource """      __slots__ = ('path', 'data') +    datadir = 'data'      def __init__ (self, path=None, encoding='utf-8'):          self.path = path          if path: -            self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) +            self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding)      def __repr__ (self): -        return '<Script {}>'.format (self.path) +        return f'<Script {self.path}>'      def __str__ (self):          return self.data +    @property +    def abspath (self): +        return pkg_resources.resource_filename (__name__, +                os.path.join (self.datadir, self.path)) +      @classmethod -    def fromStr (cls, data): +    def fromStr (cls, data, path=None):          s = Script ()          s.data = data +        s.path = path          return s  class Behavior: @@ -76,76 +95,107 @@ class Behavior:          return True      def __repr__ (self): -        return '<Behavior {}>'.format (self.name) +        return f'<Behavior {self.name}>' -    def onload (self): -        """ Before loading the page """ -        yield from () +    async def onload (self): +        """ After loading the page started """ +        # this is a dirty hack to make this function an async generator +        return +        yield # pragma: no cover -    def onstop (self): +    async def onstop (self):          """ Before page loading is stopped """ -        yield from () +        return +        yield # pragma: no cover -    def onfinish (self): +    async def onfinish (self):          """ After the site has stopped loading """ -        yield from () - -class HostnameFilter: -    """ Limit behavior script to hostname """ - -    hostname = None - -    def __contains__ (self, url): -        url = urlsplit (url) -        hostname = url.hostname.split ('.')[::-1] -        return hostname[:2] == self.hostname +        return +        yield # pragma: no cover  class JsOnload (Behavior):      """ Execute JavaScript on page load """ -    __slots__ = ('script', 'scriptHandle') +    __slots__ = ('script', 'context', 'options')      scriptPath = None      def __init__ (self, loader, logger):          super ().__init__ (loader, logger)          self.script = Script (self.scriptPath) -        self.scriptHandle = None +        self.context = None +        # options passed to constructor +        self.options = {} -    def onload (self): +    async def onload (self): +        tab = self.loader.tab          yield self.script -        self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=str (self.script))['identifier'] -    def onstop (self): -        self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle) -        yield from () 
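Aside: the bare ``return`` followed by an unreachable ``yield`` used in these hooks is the usual idiom for an async generator that yields nothing; a minimal self-contained sketch::

    import asyncio

    async def onload ():
        # the dead "yield" turns this coroutine into an async generator;
        # it still yields zero items
        return
        yield

    async def main ():
        async for _ in onload ():
            assert False  # loop body never runs

    asyncio.get_event_loop ().run_until_complete (main ())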
+        # This is slightly awkward, since we cannot compile the class into an +        # objectId and then reference it. Therefore the script must return a +        # class constructor, which is then called with a generic options +        # parameter. +        # XXX: is there a better way to do this? +        result = await tab.Runtime.evaluate (expression=str (self.script)) +        self.logger.debug ('behavior onload inject', +                uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result) +        exception = result.get ('exceptionDetails', None) +        result = result['result'] +        assert result['type'] == 'function', result +        assert result.get ('subtype') != 'error', exception +        constructor = result['objectId'] + +        if self.options: +            yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options') + +        try: +            result = await tab.Runtime.callFunctionOn ( +                    functionDeclaration='function(options){return new this(options);}', +                    objectId=constructor, +                    arguments=[{'value': self.options}]) +            self.logger.debug ('behavior onload start', +                    uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result) +            result = result['result'] +            assert result['type'] == 'object', result +            assert result.get ('subtype') != 'error', result +            self.context = result['objectId'] +        except TabException as e: +            if e.args[0] == -32000: +                # the site probably reloaded. ignore this, since we’ll be +                # re-injected into the new site by the controller. +                self.logger.error ('jsonload onload failed', +                        uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252', +                        exception=e.args) +            else: +                raise + +    async def onstop (self): +        tab = self.loader.tab +        try: +            assert self.context is not None +            await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}', +                    objectId=self.context) +            await tab.Runtime.releaseObject (objectId=self.context) +        except TabException as e: +            # cannot do anything about that. Ignoring should be fine. 
+            self.logger.error ('jsonload onstop failed', +                    uuid='1786726f-c8ec-4f79-8769-30954d4e32f5', +                    exception=e.args, +                    objectId=self.context) + +        return +        yield # pragma: no cover  ### Generic scripts ###  class Scroll (JsOnload): -    __slots__ = ('stopVarname') -      name = 'scroll'      scriptPath = 'scroll.js' -    def __init__ (self, loader, logger): -        super ().__init__ (loader, logger) -        stopVarname = '__' + __package__ + '_stop__' -        newStopVarname = randomString () -        self.script.data = self.script.data.replace (stopVarname, newStopVarname) -        self.stopVarname = newStopVarname - -    def onstop (self): -        super ().onstop () -        # removing the script does not stop it if running -        script = Script.fromStr ('{} = true; window.scrollTo (0, 0);'.format (self.stopVarname)) -        yield script -        self.loader.tab.Runtime.evaluate (expression=str (script), returnByValue=True) -  class EmulateScreenMetrics (Behavior):      name = 'emulateScreenMetrics' -    def onstop (self): +    async def onstop (self):          """          Emulate different screen sizes, causing the site to fetch assets (img          srcset and css, for example) for different screen resolutions. @@ -161,23 +211,29 @@ class EmulateScreenMetrics (Behavior):                  {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},                  # 6th gen iPhone (portrait mode)                  {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, -                # and reset -                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},                  ]          l = self.loader          tab = l.tab          for s in sizes: -            tab.Emulation.setDeviceMetricsOverride (**s) +            self.logger.debug ('device override', +                    uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s) +            await tab.Emulation.setDeviceMetricsOverride (**s)              # give the browser time to re-eval page and start requests -            time.sleep (1) -        # XXX: this seems to be broken, it does not clear the override -        #tab.Emulation.clearDeviceMetricsOverride () -        yield from () +            # XXX: should wait until loader is not busy any more +            await asyncio.sleep (1) +        self.logger.debug ('clear override', +                uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d') +        await tab.Emulation.clearDeviceMetricsOverride () +        return +        yield # pragma: no cover  class DomSnapshotEvent:      __slots__ = ('url', 'document', 'viewport')      def __init__ (self, url, document, viewport): +        # XXX: document encoding? +        assert isinstance (document, bytes) +          self.url = url          self.document = document          self.viewport = viewport @@ -188,12 +244,9 @@ class DomSnapshot (Behavior):      We could use DOMSnapshot.getSnapshot here, but the API is not stable      yet. Also computed styles are not really necessary here. - -    XXX: Currently writes a response, when it should use “resource”. pywb -    can’t handle that though.      
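The tag stripping used below is a token filter wrapped around an html5lib tree walker; a rough sketch of the idea (crocoite’s own StripTagFilter/StripAttributeFilter are more thorough)::

    import html5lib
    from html5lib.serializer import HTMLSerializer

    def stripTags (stream, forbidden):
        """ Drop forbidden elements and anything nested inside them """
        depth = 0
        for token in stream:
            t, name = token['type'], token.get ('name')
            if t == 'StartTag' and name in forbidden:
                depth += 1
            elif t == 'EndTag' and name in forbidden:
                depth = max (depth - 1, 0)
            elif t == 'EmptyTag' and name in forbidden:
                pass  # drop void elements as well
            elif depth == 0:
                yield token

    fragment = html5lib.parseFragment ('<p>hi<script>evil ()</script></p>')
    walker = html5lib.getTreeWalker ('etree')
    print (HTMLSerializer ().render (stripTags (walker (fragment), {'script'})))
    # prints: <p>hi</p>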
""" -    __slots__ = ('script') +    __slots__ = ('script', )      name = 'domSnapshot' @@ -201,26 +254,29 @@ class DomSnapshot (Behavior):          super ().__init__ (loader, logger)          self.script = Script ('canvas-snapshot.js') -    def onfinish (self): +    async def onfinish (self):          tab = self.loader.tab          yield self.script -        tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) +        await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) -        viewport = getFormattedViewportMetrics (tab) -        dom = tab.DOM.getDocument (depth=-1, pierce=True) +        viewport = await getFormattedViewportMetrics (tab) +        dom = await tab.DOM.getDocument (depth=-1, pierce=True) +        self.logger.debug ('dom snapshot document', +                uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom)          haveUrls = set ()          for doc in ChromeTreeWalker (dom['root']).split (): -            rawUrl = doc['documentURL'] -            if rawUrl in haveUrls: +            url = URL (doc['documentURL']) +            if url in haveUrls:                  # ignore duplicate URLs. they are usually caused by                  # javascript-injected iframes (advertising) with no(?) src -                self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) -                continue -            url = urlsplit (rawUrl) -            if url.scheme in ('http', 'https'): -                self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) -                haveUrls.add (rawUrl) +                self.logger.warning ('dom snapshot duplicate', +                        uuid='d44de989-98d4-456e-82e7-9d4c49acab5e') +            elif url.scheme in ('http', 'https'): +                self.logger.debug ('dom snapshot', +                        uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4', +                        base=doc["baseURL"]) +                haveUrls.add (url)                  walker = ChromeTreeWalker (doc)                  # remove script, to make the page static and noscript, because at the                  # time we took the snapshot scripts were enabled @@ -228,7 +284,7 @@ class DomSnapshot (Behavior):                  disallowedAttributes = html.eventAttributes                  stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)                  serializer = HTMLSerializer () -                yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport) +                yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)  class ScreenshotEvent:      __slots__ = ('yoff', 'data', 'url') @@ -241,46 +297,104 @@ class ScreenshotEvent:  class Screenshot (Behavior):      """      Create screenshot from tab and write it to WARC + +    Chrome will allocate an additional 512MB of RAM when using this plugin.      
""" +    __slots__ = ('script') +      name = 'screenshot' -    def onfinish (self): +    # Hardcoded max texture size of 16,384 (crbug.com/770769) +    maxDim = 16*1024 + +    def __init__ (self, loader, logger): +        super ().__init__ (loader, logger) +        self.script = Script ('screenshot.js') + +    async def onfinish (self):          tab = self.loader.tab -        tree = tab.Page.getFrameTree () +        # for top-level/full-screen elements with position: fixed we need to +        # figure out their actual size (i.e. scrollHeight) and use that when +        # overriding the viewport size. +        # we could do this without javascript, but that would require several +        # round-trips to Chrome or pulling down the entire DOM+computed styles +        tab = self.loader.tab +        yield self.script +        result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) +        assert result['result']['type'] == 'object', result +        result = result['result']['value'] + +        # this is required to make the browser render more than just the small +        # actual viewport (i.e. entire page).  see +        # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805 +        metrics = await tab.Page.getLayoutMetrics () +        contentSize = metrics['contentSize'] +        contentHeight = max (result + [contentSize['height']]) + +        override = { +                'width': 0, +                'height': 0, +                'deviceScaleFactor': 0, +                'mobile': False, +                'viewport': {'x': 0, +                    'y': 0, +                    'width': contentSize['width'], +                    'height': contentHeight, +                    'scale': 1} +                } +        self.logger.debug ('screenshot override', +                uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override) +        await tab.Emulation.setDeviceMetricsOverride (**override) + +        tree = await tab.Page.getFrameTree ()          try: -            url = removeFragment (tree['frameTree']['frame']['url']) +            url = URL (tree['frameTree']['frame']['url']).with_fragment (None)          except KeyError: -            self.logger.error ('frame without url', tree=tree) +            self.logger.error ('frame without url', +                    uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree)              url = None -        # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js -        # Hardcoded max texture size of 16,384 (crbug.com/770769) -        maxDim = 16*1024 -        metrics = tab.Page.getLayoutMetrics () -        contentSize = metrics['contentSize'] -        width = min (contentSize['width'], maxDim) +        width = min (contentSize['width'], self.maxDim)          # we’re ignoring horizontal scroll intentionally. Most horizontal          # layouts use JavaScript scrolling and don’t extend the viewport. 
-        for yoff in range (0, contentSize['height'], maxDim): -            height = min (contentSize['height'] - yoff, maxDim) +        for yoff in range (0, contentHeight, self.maxDim): +            height = min (contentHeight - yoff, self.maxDim)              clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1} -            data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data']) +            ret = await tab.Page.captureScreenshot (format='png', clip=clip) +            data = b64decode (ret['data'])              yield ScreenshotEvent (url, yoff, data) +        await tab.Emulation.clearDeviceMetricsOverride () +  class Click (JsOnload):      """ Generic link clicking """      name = 'click'      scriptPath = 'click.js' +    def __init__ (self, loader, logger): +        super ().__init__ (loader, logger) +        with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: +            self.options['sites'] = list (yaml.safe_load_all (fd)) +  class ExtractLinksEvent: -    __slots__ = ('links') +    __slots__ = ('links', )      def __init__ (self, links):          self.links = links +    def __repr__ (self): +        return f'<ExtractLinksEvent {self.links!r}>' + +def mapOrIgnore (f, l): +    for e in l: +        try: +            yield f (e) +        except: +            pass +  class ExtractLinks (Behavior):      """      Extract links from a page using JavaScript @@ -289,7 +403,7 @@ class ExtractLinks (Behavior):      manually resolve relative links.      """ -    __slots__ = ('script') +    __slots__ = ('script', )      name = 'extractLinks' @@ -297,29 +411,28 @@ class ExtractLinks (Behavior):          super ().__init__ (loader, logger)          self.script = Script ('extract-links.js') -    def onfinish (self): +    async def onfinish (self):          tab = self.loader.tab          yield self.script -        result = tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) -        yield ExtractLinksEvent (list (set (result['result']['value']))) +        result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) +        yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value']))))  class Crash (Behavior):      """ Crash the browser. For testing only. Obviously. """      name = 'crash' -    def onstop (self): +    async def onstop (self):          try: -            self.loader.tab.Page.crash (_timeout=1) -        except TimeoutException: +            await self.loader.tab.Page.crash () +        except Crashed:              pass -        yield from () +        return +        yield # pragma: no cover  # available behavior scripts. Order matters, move those modifying the page  # towards the end of available -generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks] -perSite = [] -available = generic + perSite + [Screenshot, DomSnapshot] +available = [Scroll, Click, ExtractLinks, Screenshot, EmulateScreenMetrics, DomSnapshot]  #available.append (Crash)  # order matters, since behavior can modify the page (dom snapshots, for instance)  availableMap = OrderedDict (map (lambda x: (x.name, x), available)) diff --git a/crocoite/browser.py b/crocoite/browser.py index b5ea4e3..3518789 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -22,115 +22,198 @@  Chrome browser interactions.  
""" -from urllib.parse import urlsplit -from base64 import b64decode -from collections import deque -from threading import Event +import asyncio +from base64 import b64decode, b64encode +from datetime import datetime, timedelta  from http.server import BaseHTTPRequestHandler -from .logger import Level - -import pychrome -class Item: -    """ -    Simple wrapper containing Chrome request and response -    """ +from yarl import URL +from multidict import CIMultiDict -    __slots__ = ('tab', 'chromeRequest', 'chromeResponse', 'chromeFinished', -            'isRedirect', 'failed') +from .logger import Level +from .devtools import Browser, TabException + +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): +    def __new__ (cls, value): +        return bytes.__new__ (cls, b64decode (value)) + +    @classmethod +    def fromBytes (cls, b): +        """ For testing """ +        return cls (b64encode (b)) + +class UnicodeBody (bytes): +    def __new__ (cls, value): +        if type (value) is not str: +            raise TypeError ('expecting unicode string') + +        return bytes.__new__ (cls, value.encode ('utf-8')) + +class Request: +    __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') + +    def __init__ (self, method=None, headers=None, body=None): +        self.headers = headers +        self.body = body +        self.hasPostData = False +        self.initiator = None +        # HTTP method +        self.method = method +        self.timestamp = None -    def __init__ (self, tab): -        self.tab = tab -        self.chromeRequest = {} -        self.chromeResponse = {} -        self.chromeFinished = {} -        self.isRedirect = False -        self.failed = False +    def __repr__ (self): +        return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + +    def __eq__ (self, b): +        if b is None: +            return False + +        if not isinstance (b, Request): +            raise TypeError ('Can only compare equality with Request.') + +        # do not compare hasPostData (only required to fetch body) and +        # timestamp (depends on time) +        return self.headers == b.headers and \ +                self.body == b.body and \ +                self.initiator == b.initiator and \ +                self.method == b.method + +class Response: +    __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived', +            'timestamp', 'mimeType') + +    def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): +        self.status = status +        self.statusText = statusText +        self.headers = headers +        self.body = body +        # bytes received over the network (not body size!) 
+        self.bytesReceived = 0 +        self.timestamp = None +        self.mimeType = mimeType      def __repr__ (self): -        return '<Item {}>'.format (self.url) +        return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})' + +    def __eq__ (self, b): +        if b is None: +            return False + +        if not isinstance (b, Response): +            raise TypeError ('Can only compare equality with Response.') + +        # do not compare bytesReceived (depends on network), timestamp +        # (depends on time) and statusText (does not matter) +        return self.status == b.status and \ +                self.statusText == b.statusText and \ +                self.headers == b.headers and \ +                self.body == b.body and \ +                self.mimeType == b.mimeType + +class ReferenceTimestamp: +    """ Map relative timestamp to absolute timestamp """ + +    def __init__ (self, relative, absolute): +        self.relative = timedelta (seconds=relative) +        self.absolute = datetime.utcfromtimestamp (absolute) + +    def __call__ (self, relative): +        if not isinstance (relative, timedelta): +            relative = timedelta (seconds=relative) +        return self.absolute + (relative-self.relative) + +class RequestResponsePair: +    __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress', +            'protocol', 'resourceType', '_time') + +    def __init__ (self, id=None, url=None, request=None, response=None): +        self.request = request +        self.response = response +        self.id = id +        self.url = url +        self.remoteIpAddress = None +        self.protocol = None +        self.resourceType = None +        self._time = None -    @property -    def request (self): -        return self.chromeRequest.get ('request', {}) +    def __repr__ (self): +        return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})' + +    def __eq__ (self, b): +        if not isinstance (b, RequestResponsePair): +            raise TypeError (f'Can only compare with {self.__class__.__name__}') + +        # do not compare id and _time. 
These depend on external factors and do +        # not influence the request/response *content* +        return self.request == b.request and \ +                self.response == b.response and \ +                self.url == b.url and \ +                self.remoteIpAddress == b.remoteIpAddress and \ +                self.protocol == b.protocol and \ +                self.resourceType == b.resourceType + +    def fromRequestWillBeSent (self, req): +        """ Set request data from Chrome Network.requestWillBeSent event """ +        r = req['request'] + +        self.id = req['requestId'] +        self.url = URL (r['url']) +        self.resourceType = req.get ('type') +        self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + +        assert self.request is None, req +        self.request = Request () +        self.request.initiator = req['initiator'] +        self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) +        self.request.hasPostData = r.get ('hasPostData', False) +        self.request.method = r['method'] +        self.request.timestamp = self._time (req['timestamp']) +        if self.request.hasPostData: +            postData = r.get ('postData') +            if postData is not None: +                self.request.body = UnicodeBody (postData) + +    def fromResponse (self, r, timestamp=None, resourceType=None): +        """ +        Set response data from Chrome’s Response object. +         +        Request must exist. Updates if response was set before. Sometimes +        fromResponseReceived is triggered twice by Chrome. No idea why. +        """ +        assert self.request is not None, (self.request, r) -    @property -    def response (self): -        return self.chromeResponse.get ('response', {}) +        if not timestamp: +            timestamp = self.request.timestamp -    @property -    def initiator (self): -        return self.chromeRequest['initiator'] +        self.remoteIpAddress = r.get ('remoteIPAddress') +        self.protocol = r.get ('protocol') +        if resourceType: +            self.resourceType = resourceType -    @property -    def id (self): -        return self.chromeRequest['requestId'] +        # a response may contain updated request headers (i.e. 
those actually +        # sent over the wire) +        if 'requestHeaders' in r: +            self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) -    @property -    def encodedDataLength (self): -        return self.chromeFinished['encodedDataLength'] +        self.response = Response () +        self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) +        self.response.status = r['status'] +        self.response.statusText = r['statusText'] +        self.response.timestamp = timestamp +        self.response.mimeType = r['mimeType'] -    @property -    def url (self): -        return self.response.get ('url', self.request.get ('url')) +    def fromResponseReceived (self, resp): +        """ Set response data from Chrome Network.responseReceived """ +        return self.fromResponse (resp['response'], +                self._time (resp['timestamp']), resp['type']) -    @property -    def parsedUrl (self): -        return urlsplit (self.url) +    def fromLoadingFinished (self, data): +        self.response.bytesReceived = data['encodedDataLength'] -    @property -    def body (self): -        """ Return response body or None """ -        try: -            body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=10) -            rawBody = body['body'] -            base64Encoded = body['base64Encoded'] -            if base64Encoded: -                rawBody = b64decode (rawBody) -            else: -                rawBody = rawBody.encode ('utf8') -            return rawBody, base64Encoded -        except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): -            raise ValueError ('Cannot fetch response body') - -    @property -    def requestBody (self): -        """ Get request/POST body """ -        req = self.request -        postData = req.get ('postData') -        if postData: -            return postData.encode ('utf8'), False -        elif req.get ('hasPostData', False): -            try: -                postData = self.tab.Network.getRequestPostData (requestId=self.id, _timeout=10)['postData'] -                return b64decode (postData), True -            except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): -                raise ValueError ('Cannot fetch request body') -        return None, False - -    @property -    def requestHeaders (self): -        # the response object may contain refined headers, which were -        # *actually* sent over the wire -        return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) - -    @property -    def responseHeaders (self): -        return self._unfoldHeaders (self.response['headers']) - -    @property -    def statusText (self): -        text = self.response.get ('statusText') -        if text: -            return text -        text = BaseHTTPRequestHandler.responses.get (self.response['status']) -        if text: -            return text[0] -        return 'No status text available' - -    @property -    def resourceType (self): -        return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) +    def fromLoadingFailed (self, data): +        self.response = None      @staticmethod      def _unfoldHeaders (headers): @@ -144,172 +227,248 @@ class Item:                  items.append ((k, v))          return items -    def setRequest (self, req): -        self.chromeRequest = req - -    def setResponse (self, resp): -        self.chromeResponse = resp +    
async def prefetchRequestBody (self, tab): +        if self.request.hasPostData and self.request.body is None: +            try: +                postData = await tab.Network.getRequestPostData (requestId=self.id) +                self.request.body = UnicodeBody (postData['postData']) +            except TabException: +                self.request.body = None -    def setFinished (self, finished): -        self.chromeFinished = finished +    async def prefetchResponseBody (self, tab): +        """ Fetch response body """ +        try: +            body = await tab.Network.getResponseBody (requestId=self.id) +            if body['base64Encoded']: +                self.response.body = Base64Body (body['body']) +            else: +                self.response.body = UnicodeBody (body['body']) +        except TabException: +            self.response.body = None -class BrowserCrashed (Exception): +class NavigateError (IOError):      pass +class PageIdle: +    """ Page idle event """ + +    __slots__ = ('idle', ) + +    def __init__ (self, idle): +        self.idle = idle + +    def __bool__ (self): +        return self.idle + +class FrameNavigated: +    __slots__ = ('id', 'url', 'mimeType') + +    def __init__ (self, id, url, mimeType): +        self.id = id +        self.url = URL (url) +        self.mimeType = mimeType +  class SiteLoader:      """      Load site in Chrome and monitor network requests -    Chrome’s raw devtools events are preprocessed here (asynchronously, in a -    different thread, spawned by pychrome) and put into a deque. There -    are two reasons for this: First of all, it makes consumer exception -    handling alot easier (no need to propagate them to the main thread). And -    secondly, browser crashes must be handled before everything else, as they -    result in a loss of communication with the browser itself (i.e. we can’t -    fetch a resource’s body any more). 
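Consumers (a WARC writer, say) can recover the original transfer encoding from the body’s type alone, which is the whole point of the two bytes subclasses above; a small sketch::

    body = Base64Body.fromBytes (b'\x89PNG')  # binary response payload
    text = UnicodeBody ('<!DOCTYPE html>')    # text payload, stored as utf-8
    assert isinstance (body, bytes) and isinstance (text, bytes)
    # later, when writing the record:
    base64Encoded = isinstance (body, Base64Body)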
-      XXX: track popup windows/new tabs and close them      """ -    __slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab') +    __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', +            '_framesLoading', '_rootFrame')      allowedSchemes = {'http', 'https'} -    def __init__ (self, browser, url, logger): +    def __init__ (self, browser, logger):          self.requests = {} -        self.browser = pychrome.Browser (url=browser) -        self.url = url -        self.logger = logger.bind (context=type (self).__name__, url=url) -        self.queue = deque () -        self.notify = Event () - -    def __enter__ (self): -        tab = self.tab = self.browser.new_tab() -        # setup callbacks -        tab.Network.requestWillBeSent = self._requestWillBeSent -        tab.Network.responseReceived = self._responseReceived -        tab.Network.loadingFinished = self._loadingFinished -        tab.Network.loadingFailed = self._loadingFailed -        tab.Log.entryAdded = self._entryAdded -        tab.Page.javascriptDialogOpening = self._javascriptDialogOpening -        tab.Inspector.targetCrashed = self._targetCrashed - -        # start the tab -        tab.start() +        self.browser = Browser (url=browser) +        self.logger = logger.bind (context=type (self).__name__) +        self._iterRunning = [] -        # enable events -        tab.Log.enable () -        tab.Network.enable() -        tab.Page.enable () -        tab.Inspector.enable () -        tab.Network.clearBrowserCache () -        if tab.Network.canClearBrowserCookies ()['result']: -            tab.Network.clearBrowserCookies () +        self._framesLoading = set () +        self._rootFrame = None + +    async def __aenter__ (self): +        tab = self.tab = await self.browser.__aenter__ () +        # enable events +        await asyncio.gather (*[ +                tab.Log.enable (), +                tab.Network.enable(), +                tab.Page.enable (), +                tab.Inspector.enable (), +                tab.Network.clearBrowserCache (), +                tab.Network.clearBrowserCookies (), +                ])          return self -    def __exit__ (self, exc_type, exc_value, traceback): -        self.tab.Page.stopLoading () -        self.tab.stop () -        self.browser.close_tab(self.tab) +    async def __aexit__ (self, exc_type, exc_value, traceback): +        for task in self._iterRunning: +            # ignore any results from stuff we did not end up using anyway +            if not task.done (): +                task.cancel () +        self._iterRunning = [] +        await self.browser.__aexit__ (exc_type, exc_value, traceback) +        self.tab = None          return False      def __len__ (self):          return len (self.requests) -    def __iter__ (self): -        return iter (self.queue) - -    def start (self): -        self.tab.Page.navigate(url=self.url) - -    # use event to signal presence of new items. This way the controller -    # can wait for them without polling. 
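For comparison, driving the new asynchronous interface looks roughly like this; a hypothetical consumer using only names from this diff (the real controller additionally handles timeouts and behavior scripts)::

    async def grab (browserWsUrl, pageUrl, logger):
        async with SiteLoader (browserWsUrl, logger) as l:
            await l.navigate (pageUrl)
            async for item in l:
                if isinstance (item, RequestResponsePair):
                    await item.prefetchResponseBody (l.tab)
                    # pass the completed pair on to a WARC writer here
                elif isinstance (item, PageIdle) and item:
                    break  # page settled; a real controller uses timeouts too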
-    def _append (self, item): -        self.queue.append (item) -        self.notify.set () - -    def _appendleft (self, item): -        self.queue.appendleft (item) -        self.notify.set () +    async def __aiter__ (self): +        """ Retrieve network items """ +        tab = self.tab +        assert tab is not None +        handler = { +                tab.Network.requestWillBeSent: self._requestWillBeSent, +                tab.Network.responseReceived: self._responseReceived, +                tab.Network.loadingFinished: self._loadingFinished, +                tab.Network.loadingFailed: self._loadingFailed, +                tab.Log.entryAdded: self._entryAdded, +                tab.Page.javascriptDialogOpening: self._javascriptDialogOpening, +                tab.Page.frameStartedLoading: self._frameStartedLoading, +                tab.Page.frameStoppedLoading: self._frameStoppedLoading, +                tab.Page.frameNavigated: self._frameNavigated, +                } + +        # The implementation is a little advanced. Why? The goal here is to +        # process events from the tab as quickly as possible (i.e. +        # asynchronously). We need to make sure that JavaScript dialogs are +        # handled immediately for instance. Otherwise they stall every +        # other request. Also, we don’t want to use an unbounded queue, +        # since the items yielded can get quite big (response body). Thus +        # we need to block (yield) for every item completed, but not +        # handled by the consumer (caller). +        running = self._iterRunning +        tabGetTask = asyncio.ensure_future (self.tab.get ()) +        running.append (tabGetTask) +        while True: +            done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED) +            for t in done: +                result = t.result () +                if result is None: +                    pass +                elif t == tabGetTask: +                    method, data = result +                    f = handler.get (method, None) +                    if f is not None: +                        task = asyncio.ensure_future (f (**data)) +                        pending.add (task) +                    tabGetTask = asyncio.ensure_future (self.tab.get ()) +                    pending.add (tabGetTask) +                else: +                    yield result + +            running = pending +            self._iterRunning = running + +    async def navigate (self, url): +        ret = await self.tab.Page.navigate(url=url) +        self.logger.debug ('navigate', +                uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) +        if 'errorText' in ret: +            raise NavigateError (ret['errorText']) +        self._rootFrame = ret['frameId']      # internal chrome callbacks -    def _requestWillBeSent (self, **kwargs): +    async def _requestWillBeSent (self, **kwargs): +        self.logger.debug ('requestWillBeSent', +                uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs) +          reqId = kwargs['requestId']          req = kwargs['request'] -        logger = self.logger.bind (reqId=reqId, reqUrl=req['url']) +        url = URL (req['url']) +        logger = self.logger.bind (reqId=reqId, reqUrl=url) -        url = urlsplit (req['url'])          if url.scheme not in self.allowedSchemes:              return +        ret = None          item = self.requests.get (reqId)          if item:              # redirects never “finish” loading, but yield another 
requestWillBeSent with this key set              redirectResp = kwargs.get ('redirectResponse')              if redirectResp: -                # create fake responses -                resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} -                item.setResponse (resp) -                resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} -                item.setFinished (resp) -                item.isRedirect = True -                logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url']) -                self._append (item) +                if item.url != url: +                    # this happens for unknown reasons. the docs simply state +                    # it can differ in case of a redirect. Fix it and move on. +                    logger.warning ('redirect url differs', +                            uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', +                            expected=item.url) +                    redirectResp['url'] = str (item.url) +                item.fromResponse (redirectResp) +                logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) +                # XXX: queue this? no need to wait for it +                await item.prefetchRequestBody (self.tab) +                # cannot fetch response body due to race condition (item id reused) +                ret = item              else:                  logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') -        item = Item (self.tab) -        item.setRequest (kwargs) +        item = RequestResponsePair () +        item.fromRequestWillBeSent (kwargs)          self.requests[reqId] = item -        logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') -    def _responseReceived (self, **kwargs): +        return ret + +    async def _responseReceived (self, **kwargs): +        self.logger.debug ('responseReceived', +                uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs) +          reqId = kwargs['requestId']          item = self.requests.get (reqId)          if item is None:              return          resp = kwargs['response'] -        logger = self.logger.bind (reqId=reqId, respUrl=resp['url']) -        url = urlsplit (resp['url']) +        url = URL (resp['url']) +        logger = self.logger.bind (reqId=reqId, respUrl=url) +        if item.url != url: +            logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)          if url.scheme in self.allowedSchemes: -            logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') -            item.setResponse (kwargs) +            item.fromResponseReceived (kwargs)          else:              logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') -    def _loadingFinished (self, **kwargs): +    async def _loadingFinished (self, **kwargs):          """          Item was fully loaded. For some items the request body is not available          when responseReceived is fired, thus move everything here.          
""" +        self.logger.debug ('loadingFinished', +                uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) +          reqId = kwargs['requestId']          item = self.requests.pop (reqId, None)          if item is None:              # we never recorded this request (blacklisted scheme, for example)              return +        if not item.response: +            # chrome failed to send us a responseReceived event for this item, +            # so we can’t record it (missing request/response headers) +            self.logger.error ('response missing', +                    uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id) +            return +          req = item.request -        logger = self.logger.bind (reqId=reqId, reqUrl=req['url']) -        resp = item.response -        if req['url'] != resp['url']: -            logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url']) -        url = urlsplit (resp['url']) -        if url.scheme in self.allowedSchemes: -            logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') -            item.setFinished (kwargs) -            self._append (item) +        if item.url.scheme in self.allowedSchemes: +            item.fromLoadingFinished (kwargs) +            # XXX queue both +            await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) +            return item + +    async def _loadingFailed (self, **kwargs): +        self.logger.info ('loadingFailed', +                uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs) -    def _loadingFailed (self, **kwargs):          reqId = kwargs['requestId'] -        self.logger.warning ('loading failed', -                uuid='68410f13-6eea-453e-924e-c1af4601748b', -                errorText=kwargs['errorText'], -                blockedReason=kwargs.get ('blockedReason')) +        logger = self.logger.bind (reqId=reqId)          item = self.requests.pop (reqId, None) -        item.failed = True -        self._append (item) +        if item is not None: +            item.fromLoadingFailed (kwargs) +            return item -    def _entryAdded (self, **kwargs): +    async def _entryAdded (self, **kwargs):          """ Log entry added """          entry = kwargs['entry']          level = {'verbose': Level.DEBUG, 'info': Level.INFO, @@ -318,95 +477,43 @@ class SiteLoader:          entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2'          self.logger (level, 'console', **entry) -    def _javascriptDialogOpening (self, **kwargs): +    async def _javascriptDialogOpening (self, **kwargs):          t = kwargs.get ('type')          if t in {'alert', 'confirm', 'prompt'}:              self.logger.info ('js dialog',                      uuid='d6f07ce2-648e-493b-a1df-f353bed27c84',                      action='cancel', type=t, message=kwargs.get ('message')) -            self.tab.Page.handleJavaScriptDialog (accept=False) +            await self.tab.Page.handleJavaScriptDialog (accept=False)          elif t == 'beforeunload':              # we must accept this one, otherwise the page will not unload/close              self.logger.info ('js dialog',                      uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd',                      action='proceed', type=t, message=kwargs.get ('message')) -            self.tab.Page.handleJavaScriptDialog (accept=True) -        else: -            self.logger.warning ('js dialog unknown', uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) 
- -    def _targetCrashed (self, **kwargs): -        self.logger.error ('browser crashed', uuid='6fe2b3be-ff01-4503-b30c-ad6aeea953ef') -        # priority message -        self._appendleft (BrowserCrashed ()) - -import subprocess, os, time -from tempfile import mkdtemp -import shutil - -class ChromeService: -    """ Start Google Chrome listening on a random port """ - -    __slots__ = ('binary', 'windowSize', 'p', 'userDataDir') - -    def __init__ (self, binary='google-chrome-stable', windowSize=(1920, 1080)): -        self.binary = binary -        self.windowSize = windowSize -        self.p = None - -    def __enter__ (self): -        assert self.p is None -        self.userDataDir = mkdtemp () -        args = [self.binary, -                '--window-size={},{}'.format (*self.windowSize), -                '--user-data-dir={}'.format (self.userDataDir), # use temporory user dir -                '--no-default-browser-check', -                '--no-first-run', # don’t show first run screen -                '--disable-breakpad', # no error reports -                '--disable-extensions', -                '--disable-infobars', -                '--disable-notifications', # no libnotify -                '--headless', -                '--disable-gpu', -                '--hide-scrollbars', # hide scrollbars on screenshots -                '--mute-audio', # don’t play any audio -                '--remote-debugging-port=0', # pick a port. XXX: we may want to use --remote-debugging-pipe instead -                '--homepage=about:blank', -                'about:blank'] -        # start new session, so ^C does not affect subprocess -        self.p = subprocess.Popen (args, start_new_session=True, -                stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, -                stderr=subprocess.DEVNULL) -        port = None -        # chrome writes its current active devtools port to a file. due to the -        # sleep() this is rather ugly, but should work with all versions of the -        # browser. 
-        for i in range (100): -            try: -                with open (os.path.join (self.userDataDir, 'DevToolsActivePort'), 'r') as fd: -                    port = int (fd.readline ().strip ()) -                    break -            except FileNotFoundError: -                time.sleep (0.2) -        if port is None: -            raise Exception ('Chrome died on us.') - -        return 'http://localhost:{}'.format (port) - -    def __exit__ (self, *exc): -        self.p.terminate () -        self.p.wait () -        shutil.rmtree (self.userDataDir) -        self.p = None - -class NullService: -    __slots__ = ('url') - -    def __init__ (self, url): -        self.url = url - -    def __enter__ (self): -        return self.url - -    def __exit__ (self, *exc): -        pass +            await self.tab.Page.handleJavaScriptDialog (accept=True) +        else: # pragma: no cover +            self.logger.warning ('js dialog unknown', +                    uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) + +    async def _frameStartedLoading (self, **kwargs): +        self.logger.debug ('frameStartedLoading', +                uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs) + +        self._framesLoading.add (kwargs['frameId']) +        return PageIdle (False) + +    async def _frameStoppedLoading (self, **kwargs): +        self.logger.debug ('frameStoppedLoading', +                uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs) + +        self._framesLoading.remove (kwargs['frameId']) +        if not self._framesLoading: +            return PageIdle (True) + +    async def _frameNavigated (self, **kwargs): +        self.logger.debug ('frameNavigated', +                uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs) +        frame = kwargs['frame'] +        if self._rootFrame == frame['id']: +            assert frame.get ('parentId', None) is None, "root frame must not have a parent" +            return FrameNavigated (frame['id'], frame['url'], frame['mimeType']) diff --git a/crocoite/cli.py b/crocoite/cli.py index 8e225d9..04bbb19 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,79 +22,235 @@  Command line interface  """ -import argparse, json, sys +import argparse, sys, signal, asyncio, os, json +from traceback import TracebackException +from enum import IntEnum +from yarl import URL +from http.cookies import SimpleCookie +import pkg_resources +try: +    import manhole +    manhole.install (patch_fork=False, oneshot_on='USR1') +except ModuleNotFoundError: +    pass -from . import behavior -from .controller import RecursiveController, defaultSettings, \ -        ControllerSettings, DepthLimit, PrefixLimit, StatsHandler -from .browser import NullService, ChromeService +from . 
import behavior, browser +from .controller import SinglePageController, \ +        ControllerSettings, StatsHandler, LogHandler, \ +        RecursiveController, DepthLimit, PrefixLimit +from .devtools import Passthrough, Process  from .warc import WarcHandler -from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \ +        WarcHandlerConsumer, Level +from .devtools import Crashed -def parseRecursive (recursive, url): +def absurl (s): +    """ argparse: Absolute URL """ +    u = URL (s) +    if u.is_absolute (): +        return u +    raise argparse.ArgumentTypeError ('Must be absolute') + +def cookie (s): +    """ argparse: Cookie """ +    c = SimpleCookie (s) +    # for some reason the constructor does not raise an exception if the cookie +    # supplied is invalid. It’ll simply be empty. +    if len (c) != 1: +        raise argparse.ArgumentTypeError ('Invalid cookie') +    # we want a single Morsel +    return next (iter (c.values ())) + +def cookiejar (f): +    """ argparse: Cookies from file """ +    cookies = [] +    try: +        with open (f, 'r') as fd: +            for l in fd: +                l = l.lstrip () +                if l and not l.startswith ('#'): +                    cookies.append (cookie (l)) +    except FileNotFoundError: +        raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist') +    return cookies + +class SingleExitStatus(IntEnum): +    """ Exit status for single-shot command line """ +    Ok = 0 +    Fail = 1 +    BrowserCrash = 2 +    Navigate = 3 + +def single (): +    parser = argparse.ArgumentParser(description='crocoite helper tools to fetch individual pages.') +    parser.add_argument('--browser', help='DevTools URL', type=absurl, metavar='URL') +    parser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC') +    parser.add_argument('--idle-timeout', default=30, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC') +    parser.add_argument('--behavior', help='Enable behavior script', +            dest='enabledBehaviorNames', +            default=list (behavior.availableMap.keys ()), +            choices=list (behavior.availableMap.keys ()), +            metavar='NAME', nargs='*') +    parser.add_argument('--warcinfo', help='Add extra information to warcinfo record', +            metavar='JSON', type=json.loads) +    # re-using curl’s short/long switch names whenever possible +    parser.add_argument('-k', '--insecure', +            action='store_true', +            help='Disable certificate validation') +    parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE', +            action='append', default=[], help='Cookies in Set-Cookie format.') +    parser.add_argument ('-c', '--cookie-jar', dest='cookieJar', +            type=cookiejar, metavar='FILE', +            default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'), +            help='Cookie jar file, read-only.') +    parser.add_argument('url', help='Website URL', type=absurl, metavar='URL') +    parser.add_argument('output', help='WARC filename', metavar='FILE') + +    args = parser.parse_args () + +    logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) + +    ret = SingleExitStatus.Fail +    service = Process () +    if args.browser: +        service = Passthrough (args.browser) +    settings = ControllerSettings ( +            idleTimeout=args.idleTimeout, +            timeout=args.timeout, +            insecure=args.insecure, +            cookies=args.cookieJar + args.cookie, +            ) +    with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: +        logger.connect (WarcHandlerConsumer (warcHandler)) +        handler = [StatsHandler (), LogHandler (logger), warcHandler] +        b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) +        controller = SinglePageController (url=args.url, settings=settings, +                service=service, handler=handler, behavior=b, logger=logger, +                warcinfo=args.warcinfo) +        try: +            loop = asyncio.get_event_loop() +            run = asyncio.ensure_future (controller.run ()) +            stop = lambda signum: run.cancel () +            loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) +            loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) +            loop.run_until_complete(run) +            loop.close() +            ret = SingleExitStatus.Ok +        except Crashed: +            ret = SingleExitStatus.BrowserCrash +        except asyncio.CancelledError: +            # don’t log this one +            pass +        except browser.NavigateError: +            ret = SingleExitStatus.Navigate +        except Exception as e: +            ret = SingleExitStatus.Fail +            logger.error ('cli exception', +                    uuid='7fd69858-ecaa-4225-b213-8ab880aa3cc5', +                    traceback=list (TracebackException.from_exception (e).format ())) +        finally: +            r = handler[0].stats +            logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) +            logger.info ('exit', context='cli', uuid='9b1bd603-f7cd-4745-895a-5b894a5166f2', status=ret) + +    return ret + +def parsePolicy (recursive, url):      if recursive is None:          return DepthLimit (0)      elif recursive.isdigit ():          return DepthLimit (int 
(recursive))      elif recursive == 'prefix':          return PrefixLimit (url) -    else: -        raise ValueError ('Unsupported') +    raise argparse.ArgumentTypeError ('Unsupported recursion mode') + +def recursive (): +    logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) -def main ():      parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') -    parser.add_argument('--browser', help='DevTools URL', metavar='URL') -    parser.add_argument('--recursive', help='Follow links recursively') -    parser.add_argument('--concurrency', '-j', type=int, default=1) -    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') -    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') -    parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES') -    parser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') -    parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts', -            dest='enabledBehaviorNames', -            default=list (behavior.availableMap.keys ()), -            choices=list (behavior.availableMap.keys ())) -    group = parser.add_mutually_exclusive_group (required=True) -    group.add_argument('--output', help='WARC filename', metavar='FILE') -    group.add_argument('--distributed', help='Use celery worker', action='store_true') -    parser.add_argument('url', help='Website URL') +    parser.add_argument('-j', '--concurrency', +            help='Run at most N jobs concurrently', metavar='N', default=1, +            type=int) +    parser.add_argument('-r', '--recursion', help='Recursion policy', +            metavar='POLICY') +    parser.add_argument('--tempdir', help='Directory for temporary files', +            metavar='DIR') +    parser.add_argument('url', help='Seed URL', type=absurl, metavar='URL') +    parser.add_argument('output', +            help='Output file, supports templates {host}, {date} and {seqnum}', +            metavar='FILE') +    parser.add_argument('command', +            help='Fetch command, supports templates {url} and {dest}', +            metavar='CMD', nargs='*', +            default=['crocoite-single', '{url}', '{dest}'])      args = parser.parse_args () +    try: +        policy = parsePolicy (args.recursion, args.url) +    except argparse.ArgumentTypeError as e: +        parser.error (str (e)) -    if args.distributed: -        if args.browser: -            parser.error ('--browser is not supported for distributed jobs') -        from . 
import task -        settings = dict (maxBodySize=args.maxBodySize, -                logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, -                timeout=args.timeout) -        result = task.controller.delay (url=args.url, settings=settings, -                enabledBehaviorNames=args.enabledBehaviorNames, -                recursive=args.recursive, concurrency=args.concurrency) -        r = result.get () -    else: -        logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) +    try: +        controller = RecursiveController (url=args.url, output=args.output, +                command=args.command, logger=logger, policy=policy, +                tempdir=args.tempdir, concurrency=args.concurrency) +    except ValueError as e: +        parser.error (str (e)) -        try: -            recursionPolicy = parseRecursive (args.recursive, args.url) -        except ValueError: -            parser.error ('Invalid argument for --recursive') -        service = ChromeService () -        if args.browser: -            service = NullService (args.browser) -        settings = ControllerSettings (maxBodySize=args.maxBodySize, -                logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, -                timeout=args.timeout) -        with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: -            logger.connect (WarcHandlerConsumer (warcHandler)) -            handler = [StatsHandler (), warcHandler] -            b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) -            controller = RecursiveController (args.url, fd, settings=settings, -                    recursionPolicy=recursionPolicy, service=service, -                    handler=handler, behavior=b, logger=logger) -            controller.run () -            r = handler[0].stats -            logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) +    run = asyncio.ensure_future (controller.run ()) +    loop = asyncio.get_event_loop() +    stop = lambda signum: run.cancel () +    loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) +    loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) +    try: +        loop.run_until_complete(run) +    except asyncio.CancelledError: +        pass +    finally: +        loop.close() + +    return 0 + +def irc (): +    import json, re +    from .irc import Chromebot + +    logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) + +    parser = argparse.ArgumentParser(description='IRC bot.') +    parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.json') + +    args = parser.parse_args () + +    with open (args.config) as fd: +        config = json.load (fd) +    s = config['irc'] +    blacklist = dict (map (lambda x: (re.compile (x[0], re.I), x[1]), config['blacklist'].items ())) + +    loop = asyncio.get_event_loop() +    bot = Chromebot ( +            host=s['host'], +            port=s['port'], +            ssl=s['ssl'], +            nick=s['nick'], +            channels=s['channels'], +            tempdir=config['tempdir'], +            destdir=config['destdir'], +            processLimit=config['process_limit'], +            logger=logger, +            blacklist=blacklist, +            needVoice=config['need_voice'], +            loop=loop) +    stop = lambda signum: bot.cancel () +    loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) +    loop.add_signal_handler (signal.SIGTERM, stop, 
signal.SIGTERM) +    loop.run_until_complete(bot.run ()) + +def dashboard (): +    from .irc import Dashboard -    return True +    loop = asyncio.get_event_loop() +    d = Dashboard (sys.stdin, loop) +    loop.run_until_complete(d.run ()) +    loop.run_forever() diff --git a/crocoite/controller.py b/crocoite/controller.py index 178d11c..8374b4e 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -22,190 +22,291 @@  Controller classes, handling actions required for archival  """ +import time, tempfile, asyncio, json, os, shutil, signal +from itertools import islice +from datetime import datetime +from operator import attrgetter +from abc import ABC, abstractmethod +from yarl import URL + +from . import behavior as cbehavior +from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated +from .util import getFormattedViewportMetrics, getSoftwareInfo +from .behavior import ExtractLinksEvent +from .devtools import toCookieParam +  class ControllerSettings: -    __slots__ = ('logBuffer', 'maxBodySize', 'idleTimeout', 'timeout') +    __slots__ = ('idleTimeout', 'timeout', 'insecure', 'cookies') -    def __init__ (self, logBuffer=1000, maxBodySize=50*1024*1024, idleTimeout=2, timeout=10): -        self.logBuffer = logBuffer -        self.maxBodySize = maxBodySize +    def __init__ (self, idleTimeout=2, timeout=10, insecure=False, cookies=None):          self.idleTimeout = idleTimeout          self.timeout = timeout +        self.insecure = insecure +        self.cookies = cookies or [] -    def toDict (self): -        return dict (logBuffer=self.logBuffer, maxBodySize=self.maxBodySize, -                idleTimeout=self.idleTimeout, timeout=self.timeout) +    def __repr__ (self): +        return f'<ControllerSetting idleTimeout={self.idleTimeout!r}, timeout={self.timeout!r}, insecure={self.insecure!r}, cookies={self.cookies!r}>'  defaultSettings = ControllerSettings () -class EventHandler: +class EventHandler (ABC):      """ Abstract base class for event handler """      __slots__ = () -    # this handler wants to know about exceptions before they are reraised by -    # the controller -    acceptException = False - -    def push (self, item): +    @abstractmethod +    async def push (self, item):          raise NotImplementedError () -from .browser import BrowserCrashed -  class StatsHandler (EventHandler): -    __slots__ = ('stats') - -    acceptException = True +    __slots__ = ('stats', )      def __init__ (self): -        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0} +        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} -    def push (self, item): -        if isinstance (item, Item): +    async def push (self, item): +        if isinstance (item, RequestResponsePair):              self.stats['requests'] += 1 -            if item.failed: +            if not item.response:                  self.stats['failed'] += 1              else:                  self.stats['finished'] += 1 -                self.stats['bytesRcv'] += item.encodedDataLength -        elif isinstance (item, BrowserCrashed): -            self.stats['crashed'] += 1 +                self.stats['bytesRcv'] += item.response.bytesReceived -import time, platform +class LogHandler (EventHandler): +    """ Handle items by logging information about them """ + +    __slots__ = ('logger', ) + +    def __init__ (self, logger): +        self.logger = logger.bind (context=type (self).__name__) + +    async def push (self, item): +        if 
isinstance (item, ExtractLinksEvent): +            # limit number of links per message, so json blob won’t get too big +            it = iter (item.links) +            limit = 100 +            while True: +                limitlinks = list (islice (it, 0, limit)) +                if not limitlinks: +                    break +                self.logger.info ('extracted links', context=type (item).__name__, +                        uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks) -from . import behavior as cbehavior -from .browser import ChromeService, SiteLoader, Item -from .util import getFormattedViewportMetrics, removeFragment, getRequirements  class ControllerStart: -    __slots__ = ('payload') +    __slots__ = ('payload', )      def __init__ (self, payload):          self.payload = payload +class IdleStateTracker (EventHandler): +    """ Track SiteLoader’s idle state by listening to PageIdle events """ + +    __slots__ = ('_idle', '_loop', '_idleSince') + +    def __init__ (self, loop): +        self._idle = True +        self._loop = loop + +        self._idleSince = self._loop.time () + +    async def push (self, item): +        if isinstance (item, PageIdle): +            self._idle = bool (item) +            if self._idle: +                self._idleSince = self._loop.time () + +    async def wait (self, timeout): +        """ Wait until page has been idle for at least timeout seconds. If the +        page has been idle before calling this function it may return +        immediately. """ + +        assert timeout > 0 +        while True: +            if self._idle: +                now = self._loop.time () +                sleep = timeout-(now-self._idleSince) +                if sleep <= 0: +                    break +            else: +                # not idle, check again after timeout expires +                sleep = timeout +            await asyncio.sleep (sleep) + +class InjectBehaviorOnload (EventHandler): +    """ Control behavior script injection based on frame navigation messages. +    When a page is reloaded (for whatever reason), the scripts need to be +    reinjected. """ + +    __slots__ = ('controller', '_loaded') + +    def __init__ (self, controller): +        self.controller = controller +        self._loaded = False + +    async def push (self, item): +        if isinstance (item, FrameNavigated): +            await self._runon ('load') +            self._loaded = True + +    async def stop (self): +        if self._loaded: +            await self._runon ('stop') + +    async def finish (self): +        if self._loaded: +            await self._runon ('finish') + +    async def _runon (self, method): +        controller = self.controller +        for b in controller._enabledBehavior: +            f = getattr (b, 'on' + method) +            async for item in f (): +                await controller.processItem (item) +  class SinglePageController:      """ -    Archive a single page url to file output. +    Archive a single page url.      Dispatches between producer (site loader and behavior scripts) and consumer      (stats, warc writer).      
""" -    __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'handler') +    __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler', +            'warcinfo', '_enabledBehavior') -    def __init__ (self, url, output, logger, \ -            service=ChromeService (), behavior=cbehavior.available, \ -            settings=defaultSettings, handler=[]): +    def __init__ (self, url, logger, \ +            service, behavior=cbehavior.available, \ +            settings=defaultSettings, handler=None, \ +            warcinfo=None):          self.url = url -        self.output = output          self.service = service          self.behavior = behavior          self.settings = settings          self.logger = logger.bind (context=type (self).__name__, url=url) -        self.handler = handler - -    def processItem (self, item): -        if isinstance (item, Exception): -            for h in self.handler: -                if h.acceptException: -                    h.push (item) -            raise item +        self.handler = handler or [] +        self.warcinfo = warcinfo +    async def processItem (self, item):          for h in self.handler: -            h.push (item) +            await h.push (item) -    def run (self): +    async def run (self):          logger = self.logger -        def processQueue (): -            # XXX: this is very ugly code and does not work well. figure out a -            # better way to impose timeouts and still process all items in the -            # queue -            queue = l.queue -            logger.debug ('process queue', -                    uuid='dafbf76b-a37e-44db-a021-efb5593b81f8', -                    queuelen=len (queue)) -            while True: -                now = time.time () -                elapsed = now-start -                maxTimeout = max (min (self.settings.idleTimeout, self.settings.timeout-elapsed), 0) -                logger.debug ('timeout status', -                        uuid='49550447-37e3-49ff-9a73-34da1c3e5984', -                        maxTimeout=maxTimeout, elapsed=elapsed) -                # skip waiting if there is work to do. processes all items in -                # queue, regardless of timeouts, i.e. you need to make sure the -                # queue will actually be empty at some point. -                if len (queue) == 0: -                    if not l.notify.wait (maxTimeout): -                        assert len (queue) == 0, "event must be sent" -                        # timed out -                        logger.debug ('timeout', -                                uuid='6a7e0083-7c1a-45ba-b1ed-dbc4f26697c6', -                                elapsed=elapsed) -                        break -                    else: -                        l.notify.clear () -                # limit number of items processed here, otherwise timeout won’t -                # be checked frequently. this can happen if the site quickly -                # loads a lot of items. 
-                for i in range (1000): -                    try: -                        item = queue.popleft () -                        logger.debug ('queue pop', -                                uuid='adc96bfa-026d-4092-b732-4a022a1a92ca', -                                item=item, queuelen=len (queue)) -                    except IndexError: -                        break -                    self.processItem (item) -                if maxTimeout == 0: -                    break +        async def processQueue (): +            async for item in l: +                await self.processItem (item) + +        idle = IdleStateTracker (asyncio.get_event_loop ()) +        self.handler.append (idle) +        behavior = InjectBehaviorOnload (self) +        self.handler.append (behavior) + +        async with self.service as browser, SiteLoader (browser, logger=logger) as l: +            handle = asyncio.ensure_future (processQueue ()) +            timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout)) -        with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l: -            start = time.time () +            # configure browser +            tab = l.tab +            await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure) +            await tab.Network.setCookies (cookies=list (map (toCookieParam, self.settings.cookies))) -            version = l.tab.Browser.getVersion () +            # not all behavior scripts are allowed for every URL, filter them +            self._enabledBehavior = list (filter (lambda x: self.url in x, +                    map (lambda x: x (l, logger), self.behavior))) + +            version = await tab.Browser.getVersion ()              payload = { -                    'software': { -                        'platform': platform.platform (), -                        'python': { -                            'implementation': platform.python_implementation(), -                            'version': platform.python_version (), -                            'build': platform.python_build () -                            }, -                        'self': getRequirements (__package__) -                        }, +                    'software': getSoftwareInfo (),                      'browser': {                          'product': version['product'],                          'useragent': version['userAgent'], -                        'viewport': getFormattedViewportMetrics (l.tab), +                        'viewport': await getFormattedViewportMetrics (tab), +                        }, +                    'tool': 'crocoite-single', # not the name of the cli utility +                    'parameters': { +                        'url': self.url, +                        'idleTimeout': self.settings.idleTimeout, +                        'timeout': self.settings.timeout, +                        'behavior': list (map (attrgetter('name'), self._enabledBehavior)), +                        'insecure': self.settings.insecure, +                        'cookies': list (map (lambda x: x.OutputString(), self.settings.cookies)),                          },                      } -            self.processItem (ControllerStart (payload)) +            if self.warcinfo: +                payload['extra'] = self.warcinfo +            await self.processItem (ControllerStart (payload)) -            # not all behavior scripts are allowed for every URL, filter them -            enabledBehavior = list (filter (lambda x: self.url 
in x,
-                    map (lambda x: x (l, logger), self.behavior)))
+            await l.navigate (self.url)
+
+            idleProc = asyncio.ensure_future (idle.wait (self.settings.idleTimeout))
+            while True:
+                try:
+                    finished, pending = await asyncio.wait([idleProc, timeoutProc, handle],
+                            return_when=asyncio.FIRST_COMPLETED)
+                except asyncio.CancelledError:
+                    idleProc.cancel ()
+                    timeoutProc.cancel ()
+                    break

-            for b in enabledBehavior:
-                # I decided against using the queue here to limit memory
-                # usage (screenshot behavior would put all images into
-                # queue before we could process them)
-                for item in b.onload ():
-                    self.processItem (item)
-            l.start ()
+                if handle in finished:
+                    # something went wrong while processing the data
+                    logger.error ('fetch failed',
+                        uuid='43a0686a-a3a9-4214-9acd-43f6976f8ff3')
+                    idleProc.cancel ()
+                    timeoutProc.cancel ()
+                    handle.result ()
+                    assert False # previous line should always raise Exception
+                elif timeoutProc in finished:
+                    # global timeout
+                    logger.debug ('global timeout',
+                            uuid='2f858adc-9448-4ace-94b4-7cd1484c0728')
+                    idleProc.cancel ()
+                    timeoutProc.result ()
+                    break
+                elif idleProc in finished:
+                    # idle timeout
+                    logger.debug ('idle timeout',
+                            uuid='90702590-94c4-44ef-9b37-02a16de444c3')
+                    idleProc.result ()
+                    timeoutProc.cancel ()
+                    break

-            processQueue ()
+            await behavior.stop ()
+            await tab.Page.stopLoading ()
+            await asyncio.sleep (1)
+            await behavior.finish ()

-            for b in enabledBehavior:
-                for item in b.onstop ():
-                    self.processItem (item)
+            # wait until loads from behavior scripts are done and browser is
+            # idle for at least 1 second
+            try:
+                await asyncio.wait_for (idle.wait (1), timeout=1)
+            except (asyncio.TimeoutError, asyncio.CancelledError):
+                pass

-            # if we stopped due to timeout, wait for remaining assets
-            processQueue ()
+            if handle.done ():
+                handle.result ()
+            else:
+                handle.cancel ()

-            for b in enabledBehavior:
-                for item in b.onfinish ():
-                    self.processItem (item)
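The SetEntry class added below gives the controller's pending/have sets URL-only identity: equality and hashing consider just value, so metadata like depth rides along without affecting deduplication. A small illustration, assuming the class as defined here:

.. code:: python

    from yarl import URL
    from crocoite.controller import SetEntry

    have = {SetEntry (URL ('http://example.com/'), depth=0)}
    # the same URL rediscovered at a different depth is still a duplicate
    assert SetEntry (URL ('http://example.com/'), depth=3) in have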
""" +    def __init__ (self, value, **props): +        self.value = value +        for k, v in props.items (): +            setattr (self, k, v) -            processQueue () +    def __eq__ (self, b): +        assert isinstance (b, SetEntry) +        return self.value == b.value + +    def __hash__ (self): +        return hash (self.value) + +    def __repr__ (self): +        return f'<SetEntry {self.value!r}>'  class RecursionPolicy:      """ Abstract recursion policy """ @@ -219,23 +320,23 @@ class DepthLimit (RecursionPolicy):      """      Limit recursion by depth. -    depth==0 means no recursion, depth==1 is the page and outgoing links, … +    depth==0 means no recursion, depth==1 is the page and outgoing links      """ -    __slots__ = ('maxdepth') +    __slots__ = ('maxdepth', )      def __init__ (self, maxdepth=0):          self.maxdepth = maxdepth      def __call__ (self, urls): -        if self.maxdepth <= 0: -            return {} -        else: -            self.maxdepth -= 1 -            return urls +        newurls = set () +        for u in urls: +            if u.depth <= self.maxdepth: +                newurls.add (u) +        return newurls      def __repr__ (self): -        return '<DepthLimit {}>'.format (self.maxdepth) +        return f'<DepthLimit {self.maxdepth}>'  class PrefixLimit (RecursionPolicy):      """ @@ -246,78 +347,195 @@ class PrefixLimit (RecursionPolicy):      accepted: http://example.com/foobar http://example.com/foo/bar      """ -    __slots__ = ('prefix') +    __slots__ = ('prefix', )      def __init__ (self, prefix):          self.prefix = prefix      def __call__ (self, urls): -        return set (filter (lambda u: u.startswith (self.prefix), urls)) +        return set (filter (lambda u: str(u.value).startswith (str (self.prefix)), urls)) -from .behavior import ExtractLinksEvent +def hasTemplate (s): +    """ Return True if string s has string templates """ +    return '{' in s and '}' in s -class RecursiveController (EventHandler): +class RecursiveController:      """      Simple recursive controller -    Visits links acording to recursionPolicy +    Visits links acording to policy      """ -    __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', -            'recursionPolicy', 'handler', 'urls', 'have') +    __slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have', +            'pending', 'stats', 'tempdir', 'running', 'concurrency', +            'copyLock') + +    SCHEME_WHITELIST = {'http', 'https'} -    def __init__ (self, url, output, logger, -            service=ChromeService (), behavior=cbehavior.available, \ -            settings=defaultSettings, \ -            recursionPolicy=DepthLimit (0), handler=[]): +    def __init__ (self, url, output, command, logger, +            tempdir=None, policy=DepthLimit (0), concurrency=1):          self.url = url          self.output = output -        self.service = service -        self.behavior = behavior -        self.settings = settings -        self.logger = logger.bind (context=type(self).__name__, url=url) -        self.recursionPolicy = recursionPolicy -        self.handler = handler -        self.handler.append (self) - -    def fetch (self, urls): +        self.command = command +        self.logger = logger.bind (context=type(self).__name__, seedurl=url) +        self.policy = policy +        self.tempdir = tempdir +        # A lock if only a single output file (no template) is requested +        self.copyLock = None if hasTemplate (output) else asyncio.Lock 
() +        # some sanity checks. XXX move to argparse? +        if self.copyLock and os.path.exists (self.output): +                raise ValueError ('Output file exists') +        # tasks currently running +        self.running = set () +        # max number of tasks running +        self.concurrency = concurrency +        # keep in sync with StatsHandler +        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0} + +    async def fetch (self, entry, seqnum):          """ -        Overrideable fetch action for URLs. Defaults to sequential -        SinglePageController. +        Fetch a single URL using an external command + +        command is usually crocoite-single          """ -        for u in urls: -            try: -                c = SinglePageController (url=u, output=self.output, service=self.service, -                        behavior=self.behavior, logger=self.logger, -                        settings=self.settings, handler=self.handler) -                c.run () -            except BrowserCrashed: -                # this is fine if reported -                self.logger.error ('browser crashed', uuid='42582cbe-fb83-47ce-b330-d022a1c3b331') - -    def run (self): -        self.have = set () -        self.urls = set ([self.url]) - -        while self.urls: -            self.logger.info ('recursing', -                    uuid='5b8498e4-868d-413c-a67e-004516b8452c', -                    numurls=len (self.urls)) -            self.have.update (self.urls) -            fetchurls = self.urls -            self.urls = set () +        assert isinstance (entry, SetEntry) -            # handler appends new urls to self.urls through push() -            self.fetch (fetchurls) +        url = entry.value +        depth = entry.depth +        logger = self.logger.bind (url=url) -            # remove urls we have and apply recursion policy -            self.urls.difference_update (self.have) -            self.urls = self.recursionPolicy (self.urls) +        def formatCommand (e): +            # provide means to disable variable expansion +            if e.startswith ('!'): +                return e[1:] +            else: +                return e.format (url=url, dest=dest.name) + +        def formatOutput (p): +            return p.format (host=url.host, +                    date=datetime.utcnow ().isoformat (), seqnum=seqnum) + +        def logStats (): +            logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats) + +        if url.scheme not in self.SCHEME_WHITELIST: +            self.stats['ignored'] += 1 +            logStats () +            self.logger.warning ('scheme not whitelisted', url=url, +                    uuid='57e838de-4494-4316-ae98-cd3a2ebf541b') +            return + +        dest = tempfile.NamedTemporaryFile (dir=self.tempdir, +                prefix=os.path.basename (self.output) + '-', suffix='.warc.gz', +                delete=False) +        command = list (map (formatCommand, self.command)) +        logger.info ('fetch', uuid='d1288fbe-8bae-42c8-af8c-f2fa8b41794f', +                command=command) +        try: +            process = await asyncio.create_subprocess_exec (*command, +                    stdout=asyncio.subprocess.PIPE, +                    stderr=asyncio.subprocess.DEVNULL, +                    stdin=asyncio.subprocess.DEVNULL, +                    start_new_session=True, limit=100*1024*1024) +            while True: +                data = await process.stdout.readline () + 
               if not data:
+                    break
+                data = json.loads (data)
+                uuid = data.get ('uuid')
+                if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
+                    links = set (self.policy (map (lambda x: SetEntry (URL(x).with_fragment(None), depth=depth+1), data.get ('links', []))))
+                    links.difference_update (self.have)
+                    self.pending.update (links)
+                elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
+                    for k in self.stats.keys ():
+                        self.stats[k] += data.get (k, 0)
+                    logStats ()
+        except asyncio.CancelledError:
+            # graceful cancellation
+            process.send_signal (signal.SIGINT)
+        except Exception as e:
+            process.kill ()
+            raise e
+        finally:
+            code = await process.wait()
+            if code == 0:
+                if self.copyLock is None:
+                    # atomically move once finished
+                    lastDestpath = None
+                    while True:
+                        # XXX: must generate a new name every time, otherwise
+                        # this loop never terminates
+                        destpath = formatOutput (self.output)
+                        assert destpath != lastDestpath
+                        lastDestpath = destpath
+
+                        # python does not have rename(…, …, RENAME_NOREPLACE),
+                        # but this is safe nonetheless, since we’re
+                        # single-threaded
+                        if not os.path.exists (destpath):
+                            # create the directory, so templates like
+                            # /{host}/{date}/… are possible
+                            os.makedirs (os.path.dirname (destpath), exist_ok=True)
+                            os.rename (dest.name, destpath)
+                            break
+                else:
+                    # atomically (in the context of this process) append to
+                    # existing file
+                    async with self.copyLock:
+                        with open (dest.name, 'rb') as infd, \
+                                open (self.output, 'ab') as outfd:
+                            shutil.copyfileobj (infd, outfd)
+                        os.unlink (dest.name)
+            else:
+                self.stats['crashed'] += 1
+                logStats ()

-    def push (self, item):
-        if isinstance (item, ExtractLinksEvent):
-            self.logger.debug ('extracted links',
-                    uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=item.links)
-            self.urls.update (map (removeFragment, item.links))
+    async def run (self):
+        def log ():
+            # self.have includes running jobs
+            self.logger.info ('recursing',
+                    uuid='5b8498e4-868d-413c-a67e-004516b8452c',
+                    pending=len (self.pending),
+                    have=len (self.have)-len(self.running),
+                    running=len (self.running))
+
+        seqnum = 1
+        try:
+            self.have = set ()
+            self.pending = set ([SetEntry (self.url, depth=0)])
+
+            while self.pending:
+                # since pending is a set this picks a random item, which is fine
+                u = self.pending.pop ()
+                self.have.add (u)
+                t = asyncio.ensure_future (self.fetch (u, seqnum))
+    
            self.running.add (t) +                seqnum += 1 + +                log () + +                if len (self.running) >= self.concurrency or not self.pending: +                    done, pending = await asyncio.wait (self.running, +                            return_when=asyncio.FIRST_COMPLETED) +                    self.running.difference_update (done) +                    # propagate exceptions +                    for r in done: +                        r.result () +        except asyncio.CancelledError: +            self.logger.info ('cancel', +                    uuid='d58154c8-ec27-40f2-ab9e-e25c1b21cd88', +                    pending=len (self.pending), +                    have=len (self.have)-len (self.running), +                    running=len (self.running)) +        finally: +            done = await asyncio.gather (*self.running, +                    return_exceptions=True) +            # propagate exceptions +            for r in done: +                if isinstance (r, Exception): +                    raise r +            self.running = set () +            log () diff --git a/crocoite/data/click.js b/crocoite/data/click.js index 90be282..ae189da 100644 --- a/crocoite/data/click.js +++ b/crocoite/data/click.js @@ -4,113 +4,7 @@   *  like navigating to a different location. Thus whitelist known elements.   */ -(function(){ -const selectorFlag = Object.freeze ({ -	none: 0, -	multi: 1, /* click item multiple times */ -}); -const defaultClickThrottle = 50; /* in ms */ -const discoverInterval = 1000; /* 1 second */ -const sites = Object.freeze ([ -	{ -		hostname: /^www\.facebook\.com$/i, -		selector: [ -			/* show more comments */ -			{s: 'a.UFIPagerLink[role=button]', flags: selectorFlag.none}, -			/* show nested comments*/ -			{s: 'a.UFICommentLink[role=button]', flags: selectorFlag.none}, -			/* initially show comments below a single post/video, i.e. /user/post/123 */ -			{s: 'form.commentable_item a[data-comment-prelude-ref=action_link_bling][rel=ignore]', flags: selectorFlag.none}, -			/* close the “register now” nag screen. 
for better screen shots */ -			{s: 'div#headerArea a#expanding_cta_close_button[role=button]', flags: selectorFlag.none}, -			], -	}, { -		hostname: /^twitter\.com$/i, -		selector: [ -			/* expand threads */ -			{s: 'a.ThreadedConversation-moreRepliesLink', flags: selectorFlag.none}, -			/* show hidden profiles */ -			{s: 'button.ProfileWarningTimeline-button', flags: selectorFlag.none}, -			/* show hidden/sensitive media */ -			{s: 'button.Tombstone-action.js-display-this-media', flags: selectorFlag.none}, -			], -	}, { -		hostname: /^disqus\.com$/i, -		selector: [ -			/* load more comments */ -			{s: 'a.load-more__button', flags: selectorFlag.multi}, -			], -	}, { -		hostname: /^(www|np)\.reddit\.com$/i, -		selector: [ -			/* show more comments, reddit’s javascript ignores events if too -			 * frequent */ -			{s: 'span.morecomments a', flags: selectorFlag.none, throttle: 500}, -			], -	}, { -		hostname: /^www\.instagram\.com$/i, -		selector: [ -			/* posts may have multiple images that load dynamically, click the arrow */ -			{s: 'a[role=button].coreSpriteRightChevron', flags: selectorFlag.multi, throttle: 500}, -			/* load more comments */ -			{s: 'article div ul li a[role=button]', flags: selectorFlag.multi}, -			], -	}, { -		hostname: /^www\.youtube\.com$/i, -		selector: [ -			/* expand comment thread */ -			{s: 'ytd-comment-thread-renderer div.more-button', flags: selectorFlag.none}, -			], -	}, { -		hostname: /^www\.patreon\.com$/i, -		selector: [ -			/* load more content */ -			{s: 'div[display=flex] div[display=block] button[color=gray][type=button]', flags: selectorFlag.multi}, -			/* load more comments */ -			{s: 'div.stackable[display=block] > div  > div  > a[color=dark][target=_self]', flags: selectorFlag.none}, -			/* load more replies */ -			{s: 'div > a[scale="0"][color=blue][size="1"]', flags: selectorFlag.none}, -			], -	} -	]); - -/* pick selectors matching current location */ -let hostname = document.location.hostname; -let selector = []; -for (let s of sites) { -	if (s.hostname.test (hostname)) { -		selector = selector.concat (s.selector); -	} -} - -function makeClickEvent () { -	return new MouseEvent('click', { -				view: window, -				bubbles: true, -				cancelable: true -				}); -} - -/* throttle clicking */ -let queue = []; -let clickTimeout = null; -function click () { -	if (queue.length > 0) { -		const item = queue.shift (); -		const o = item.o; -		const selector = item.selector; -		o.dispatchEvent (makeClickEvent ()); - -		if (queue.length > 0) { -			const nextTimeout = 'throttle' in selector ? 
-					selector.throttle : defaultClickThrottle; -			clickTimeout = window.setTimeout (click, nextTimeout); -		} else { -			clickTimeout = null; -		} -	} -} - +(function() {  /*	Element is visible if itself and all of its parents are   */  function isVisible (o) { @@ -132,28 +26,82 @@ function isClickable (o) {  	return !o.hasAttribute ('disabled') && isVisible (o);  } -/* some sites don’t remove/replace the element immediately, so keep track of - * which ones we already clicked */ -let have = new Set (); -function discover () { -	for (let s of selector) { -		let obj = document.querySelectorAll (s.s); -		for (let o of obj) { -			if (!have.has (o) && isClickable (o)) { -				queue.push ({o: o, selector: s}); -				if (!(s.flags & selectorFlag.multi)) { -					have.add (o); +const defaultClickThrottle = 50; /* in ms */ +const discoverInterval = 1000; /* 1 second */ + +class Click { +	constructor(options) { +		/* pick selectors matching current location */ +		let hostname = document.location.hostname; +		this.selector = []; +		for (let s of options['sites']) { +			let r = new RegExp (s.match, 'i'); +			if (r.test (hostname)) { +				this.selector = this.selector.concat (s.selector); +			} +		} +		/* throttle clicking */ +		this.queue = []; +		this.clickTimeout = null; + +		/* some sites don’t remove/replace the element immediately, so keep track of +		 * which ones we already clicked */ +		this.have = new Set (); + +		/* XXX: can we use a mutation observer instead? */ +		this.interval = window.setInterval (this.discover.bind (this), discoverInterval); +	} + +	makeClickEvent () { +		return new MouseEvent('click', { +					view: window, +					bubbles: true, +					cancelable: true +					}); +	} + +	click () { +		if (this.queue.length > 0) { +			const item = this.queue.shift (); +			const o = item.o; +			const selector = item.selector; +			o.dispatchEvent (this.makeClickEvent ()); + +			if (this.queue.length > 0) { +				const nextTimeout = 'throttle' in selector ? +						selector.throttle : defaultClickThrottle; +				this.clickTimeout = window.setTimeout (this.click.bind (this), nextTimeout); +			} else { +				this.clickTimeout = null; +			} +		} +	} + +	discover () { +		for (let s of this.selector) { +			let obj = document.querySelectorAll (s.selector); +			for (let o of obj) { +				if (!this.have.has (o) && isClickable (o)) { +					this.queue.push ({o: o, selector: s}); +					if (!s.multi) { +						this.have.add (o); +					}  				}  			}  		} +		if (this.queue.length > 0 && this.clickTimeout === null) { +			/* start clicking immediately */ +			this.clickTimeout = window.setTimeout (this.click.bind (this), 0); +		} +		return true;  	} -	if (queue.length > 0 && clickTimeout === null) { -		/* start clicking immediately */ -		clickTimeout = window.setTimeout (click, 0); + + +	stop () { +		window.clearInterval (this.interval); +		window.clearTimeout (this.clickTimeout);  	} -	return true;  } -/* XXX: can we use a mutation observer instead? */ -window.setInterval (discover, discoverInterval); -}()); +return Click; +}()) diff --git a/crocoite/data/click.yaml b/crocoite/data/click.yaml new file mode 100644 index 0000000..78278b9 --- /dev/null +++ b/crocoite/data/click.yaml @@ -0,0 +1,117 @@ +# Configuration for behavior.py:Click +# Example URLs are random. Believe me. +match: ^www\.facebook\.com$ +selector: +  - description: Show comments and replies/nested comments on user pages. 
+    selector: form[action="/ajax/ufi/modify.php"] a[data-testid^="UFI2CommentsPagerRenderer/pager_depth_"] +    urls: ["https://www.facebook.com/tagesschau"] +  - description: Initially show comments below a single post/video, i.e. /user/post/123. +    selector: form[action="/ajax/ufi/modify.php"] a[data-testid="UFI2CommentsCount/root"] +    urls: ["https://www.facebook.com/tagesschau/posts/10157061068659407"] +  - description: Close the “register now” nag screen. For screenshots. +    selector: a#expanding_cta_close_button[role=button] +    urls: ["https://www.facebook.com/tagesschau"] +--- +match: ^twitter\.com$ +selector: +  - description: Expand threads. +    selector: a.ThreadedConversation-moreRepliesLink +    urls: ["https://twitter.com/realDonaldTrump/status/1068826073775964160"] +  - description: Show hidden profiles. +    selector: button.ProfileWarningTimeline-button +    urls: ["https://twitter.com/CookieCyboid"] +  - description: Show hidden/sensitive media. For screen-/snapshots. +    selector: button.Tombstone-action.js-display-this-media +    urls: ["https://twitter.com/CookieCyboid/status/1070807283305713665"] +  - description: Show more replies. +    selector: button.ThreadedConversation-showMoreThreadsButton +    urls: ["https://twitter.com/fuglydug/status/1172160128101076995"] +--- +match: ^disqus\.com$ +selector: +  - description: Load more comments. +    selector: a.load-more__button +    multi: True +--- +# new layout +match: ^www\.reddit\.com$ +selector: +  - description: Show more comments. +    selector: div[id^=moreComments-] > div > p +    # reddit’s javascript ignores events if too frequent +    throttle: 500 +    urls: ["https://www.reddit.com/r/subredditcancer/comments/b2b80f/we_are_moderators_of_rwatchpeopledie_amaa_just/"] +--- +# old layout +match: ^(old|np)\.reddit\.com$ +selector: +  - description: Show more comments. +    selector: span.morecomments a +    # reddit’s javascript ignores events if too frequent +    throttle: 500 +    urls: ["https://old.reddit.com/r/subredditcancer/comments/b2b80f/we_are_moderators_of_rwatchpeopledie_amaa_just/"] +--- +match: ^www\.youtube\.com$ +selector: +  - description: Expand single comment. +    selector: ytd-comment-thread-renderer span[slot=more-button] +    urls: ["https://www.youtube.com/watch?v=udtFqQuBFSc"] +  - description: Show more comment thread replies. +    selector: div.ytd-comment-replies-renderer > yt-next-continuation > paper-button +    urls: ["https://www.youtube.com/watch?v=Lov0T3eXI2k"] +    multi: True +--- +match: ^www\.patreon\.com$ +selector: +  - description: Load more comments. +    selector: div[data-tag=post-card] button[data-tag=loadMoreCommentsCta] +    urls: ["https://www.patreon.com/posts/what-im-on-22124040"] +--- +match: ^(www\.)?gab\.com$ +selector: +  - description: Load more posts. +    selector: div.item-list[role=feed] button.load-more +    multi: True +    urls: ["https://gab.com/gab"] +--- +match: ^(www\.)?github\.com$ +selector: +  - description: Show hidden issue items. +    urls: ["https://github.com/dominictarr/event-stream/issues/116"] +    selector: div#discussion_bucket form.ajax-pagination-form button.ajax-pagination-btn +--- +match: ^www\.gamasutra\.com$ +selector: +    - description: Load more comments. +      urls: ["http://www.gamasutra.com/blogs/RaminShokrizade/20130626/194933/The_Top_F2P_Monetization_Tricks.php"] +      selector: div#dynamiccomments div.viewTopCmts a +--- +match: ^(www\.)?steamcommunity\.com$ +selector: +    - description: Load more content. 
+      urls: ["https://steamcommunity.com/app/252950/reviews/?p=1&browsefilter=toprated&filterLanguage=all"] +      selector: "#GetMoreContentBtn a" +      multi: True +--- +match: ^imgur\.com$ +selector: +    - description: Load more images of an album. +      urls: ["https://imgur.com/a/JG1yc"] +      selector: div.js-post-truncated a.post-loadall +    - description: Expand all comments. For snapshots. +      urls: ["https://imgur.com/a/JG1yc"] +      selector: div.comments-info span.comments-expand +    - description: Show bad replies. for snapshots. +      urls: ["https://imgur.com/gallery/jRzMfRG"] +      selector: div#comments div.bad-captions a.link +--- +match: ^(www\.)?vimeo\.com$ +selector: +    - description: Load more videos on profile page. +      urls: ["https://vimeo.com/dsam4a"] +      selector: div.profile_main div.profile-load-more__button--wrapper button +#    XXX: this works when using a non-headless browser, but does not otherwise +#    - description: Expand video comments +#      urls: ["https://vimeo.com/22439234"] +#      selector: section#comments button.iris_comment-more +#      multi: True diff --git a/crocoite/data/cookies.txt b/crocoite/data/cookies.txt new file mode 100644 index 0000000..6ac62c3 --- /dev/null +++ b/crocoite/data/cookies.txt @@ -0,0 +1,9 @@ +# Default cookies for crocoite. This file does *not* use Netscape’s cookie +# file format. Lines are expected to be in Set-Cookie format. +# And this line is a comment. + +# Reddit: +# skip over 18 prompt +over18=1; Domain=www.reddit.com +# skip quarantined subreddit prompt +_options={%22pref_quarantine_optin%22:true}; Domain=www.reddit.com diff --git a/crocoite/data/extract-links.js b/crocoite/data/extract-links.js index 4d1a3d0..5a4f9f0 100644 --- a/crocoite/data/extract-links.js +++ b/crocoite/data/extract-links.js @@ -25,11 +25,26 @@ function isClickable (o) {  }  /* --- end copy&paste */ -let x = document.body.querySelectorAll('a[href]');  let ret = []; +['a[href]', 'area[href]'].forEach (function (s) { +	let x = document.querySelectorAll(s); +	for (let i=0; i < x.length; i++) { +		if (isClickable (x[i])) { +			ret.push (x[i].href); +		} +	} +}); + +/* If Chrome loads plain-text documents it’ll wrap them into <pre>. Check those + * for links as well, assuming the whole line is a link (i.e. list of links). 
+ */
+let x = document.querySelectorAll ('body > pre');
 for (let i=0; i < x.length; i++) {
-	if (isClickable (x[i])) {
-		ret.push (x[i].href);
+	if (isVisible (x[i])) {
+		x[i].innerText.split ('\n').forEach (function (s) {
+			if (s.match ('^https?://')) {
+				ret.push (s);
+			}
+		});
 	}
 }
 return ret; /* immediately return results, for use with Runtime.evaluate() */
diff --git a/crocoite/data/screenshot.js b/crocoite/data/screenshot.js
new file mode 100644
index 0000000..a9a41e1
--- /dev/null
+++ b/crocoite/data/screenshot.js
@@ -0,0 +1,20 @@
+/* Find scrollable full-screen elements and return their actual size
+ */
+(function () {
+/* limit the number of elements queried */
+let elem = document.querySelectorAll ('body > div');
+let ret = [];
+for (let i = 0; i < elem.length; i++) {
+	let e = elem[i];
+	let s = window.getComputedStyle (e);
+	if (s.getPropertyValue ('position') == 'fixed' &&
+			s.getPropertyValue ('overflow') == 'auto' &&
+			s.getPropertyValue ('left') == '0px' &&
+			s.getPropertyValue ('right') == '0px' &&
+			s.getPropertyValue ('top') == '0px' &&
+			s.getPropertyValue ('bottom') == '0px') {
+		ret.push (e.scrollHeight);
+	}
+}
+return ret; /* immediately return results, for use with Runtime.evaluate() */
+})();
diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js
index 13e856d..be88edf 100644
--- a/crocoite/data/scroll.js
+++ b/crocoite/data/scroll.js
@@ -1,23 +1,38 @@
 /*	Continuously scrolls the page
  */
-var __crocoite_stop__ = false;
 (function(){
-function scroll (event) {
-	if (__crocoite_stop__) {
-		return false;
-	} else {
+class Scroll {
+	constructor (options) {
+		this.scrolled = new Map ();
+		this.interval = window.setInterval (this.scroll.bind (this), 200);
+	}
+
+	stop() {
+		window.clearInterval (this.interval);
+		window.scrollTo (0, 0);
+		this.scrolled.forEach (function (value, key, map) {
+			key.scrollTop = value;
+		});
+	}
+	/* save initial scroll state */
+	save(obj) {
+		if (!this.scrolled.has (obj)) {
+			this.scrolled.set (obj, obj.scrollTop);
+		}
+	}
+	/* perform a single scroll step */
+	scroll (event) {
 		window.scrollBy (0, window.innerHeight/2);
-		document.querySelectorAll ('*').forEach (
+		document.querySelectorAll ('html body *').forEach (
 			function (d) {
-				if (d.clientHeight < d.scrollHeight) {
+				if (d.scrollHeight-d.scrollTop > d.clientHeight) {
+					this.save (d);
 					d.scrollBy (0, d.clientHeight/2);
 				}
-			});
+			}.bind (this));
 		return true;
 	}
 }
-function onload (event) {
-    window.setInterval (scroll, 200);
-}
-document.addEventListener("DOMContentLoaded", onload);
-}());
+
+return Scroll;
+}())
diff --git a/crocoite/devtools.py b/crocoite/devtools.py
new file mode 100644
index 0000000..8b5c69d
--- /dev/null
+++ b/crocoite/devtools.py
@@ -0,0 +1,392 @@
+# Copyright (c) 2017 crocoite contributors
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Communication with Google Chrome through its DevTools protocol. +""" + +import json, asyncio, logging, os +from tempfile import mkdtemp +import shutil +from http.cookies import Morsel + +import aiohttp, websockets +from yarl import URL + +from .util import StrJsonEncoder + +logger = logging.getLogger (__name__) + +class Browser: +    """ +    Communicate with Google Chrome through its DevTools protocol. +     +    Asynchronous context manager that creates a new Tab when entering. +    Destroyed upon exit. +    """ + +    __slots__ = ('session', 'url', 'tab') + +    def __init__ (self, url): +        self.url = URL (url) +        self.session = None +        self.tab = None + +    async def __aiter__ (self): +        """ List all tabs """ +        async with aiohttp.ClientSession () as session: +            async with session.get (self.url.with_path ('/json/list')) as r: +                resp = await r.json () +                for tab in resp: +                    if tab['type'] == 'page': +                        yield tab + +    async def __aenter__ (self): +        """ Create tab """ +        assert self.tab is None +        assert self.session is None +        self.session = aiohttp.ClientSession () +        async with self.session.get (self.url.with_path ('/json/new')) as r: +            resp = await r.json () +            self.tab = await Tab.create (**resp) +            return self.tab + +    async def __aexit__ (self, excType, excValue, traceback): +        assert self.tab is not None +        assert self.session is not None + +        await self.tab.close () + +        try: +            async with self.session.get (self.url.with_path (f'/json/close/{self.tab.id}')) as r: +                resp = await r.text () +                assert resp == 'Target is closing' +        except aiohttp.client_exceptions.ClientConnectorError: +            # oh boy, the whole browser crashed instead +            if excType is Crashed: +                # exception is reraised by `return False` +                pass +            else: +                # this one is more important +                raise + +        self.tab = None +        await self.session.close () +        self.session = None + +        return False + +class TabFunction: +    """ +    Helper class for infinite-depth tab functions. + +    A method usually consists of namespace (Page, Network, …) and function name +    (getFoobar) separated by a dot. This class creates these function names +    while providing an intuitive Python interface (tab.Network.getFoobar). + +    This was inspired by pychrome. 
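The ``TabFunction`` proxy assembles the dotted protocol method name one attribute access at a time: every lookup returns a new ``TabFunction`` with the component appended, and only the final call is dispatched through ``Tab.__call__``. A minimal standalone sketch of the idea (illustration only, not the class above):

.. code:: python

    class DottedName:
        """ Assemble 'Namespace.method' names via attribute access. """
        def __init__ (self, name):
            self.name = name

        def __getattr__ (self, k):
            # __getattr__ only fires for missing attributes, so 'name' is safe
            return DottedName (f'{self.name}.{k}')

    root = DottedName ('Page')
    assert root.navigate.name == 'Page.navigate'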
+    """ + +    __slots__ = ('name', 'tab') + +    def __init__ (self, name, tab): +        self.name = name +        self.tab = tab + +    def __eq__ (self, b): +        assert isinstance (b, TabFunction) +        return self.name == b.name + +    def __hash__ (self): +        return hash (self.name) + +    def __getattr__ (self, k): +        return TabFunction (f'{self.name}.{k}', self.tab) + +    async def __call__ (self, **kwargs): +        return await self.tab (self.name, **kwargs) + +    def __repr__ (self): +        return f'<TabFunction {self.name}>' + +class TabException (Exception): +    pass + +class Crashed (TabException): +    pass + +class MethodNotFound (TabException): +    pass + +class InvalidParameter (TabException): +    pass + +# map error codes to native exceptions +errorMap = {-32601: MethodNotFound, -32602: InvalidParameter} + +class Tab: +    """ +    Communicate with a single Google Chrome browser tab. +    """ +    __slots__ = ('id', 'wsUrl', 'ws', 'msgid', 'transactions', 'queue', '_recvHandle', 'crashed') + +    def __init__ (self, tabid, ws): +        """ Do not use this method, use Browser context manager. """ +        self.id = tabid +        self.ws = ws +        self.msgid = 1 +        self.crashed = False +        self.transactions = {} +        self.queue = asyncio.Queue () + +    def __getattr__ (self, k): +        return TabFunction (k, self) + +    async def __call__ (self, method, **kwargs): +        """ +        Actually call browser method with kwargs +        """ + +        if self.crashed or self._recvHandle.done (): +            raise Crashed () + +        msgid = self.msgid +        self.msgid += 1 +        message = {'method': method, 'params': kwargs, 'id': msgid} +        t = self.transactions[msgid] = {'event': asyncio.Event (), 'result': None} +        logger.debug (f'← {message}') +        await self.ws.send (json.dumps (message, cls=StrJsonEncoder)) +        await t['event'].wait () +        ret = t['result'] +        del self.transactions[msgid] +        if isinstance (ret, Exception): +            raise ret +        return ret + +    async def _recvProcess (self): +        """ +        Receive process that dispatches received websocket frames + +        These are either events which will be put into a queue or request +        responses which unblock a __call__. 
+        """ + +        async def markCrashed (reason): +            # all pending requests can be considered failed since the +            # browser state is lost +            for v in self.transactions.values (): +                v['result'] = Crashed (reason) +                v['event'].set () +            # and all future requests will fail as well until reloaded +            self.crashed = True +            await self.queue.put (Crashed (reason)) + +        while True: +            try: +                msg = await self.ws.recv () +                msg = json.loads (msg) +            except Exception as e: +                # right now we cannot recover from this +                await markCrashed (e) +                break +            logger.debug (f'→ {msg}') +            if 'id' in msg: +                msgid = msg['id'] +                t = self.transactions.get (msgid, None) +                if t is not None: +                    if 'error' in msg: +                        e = msg['error'] +                        t['result'] = errorMap.get (e['code'], TabException) (e['code'], e['message']) +                    else: +                        t['result'] = msg['result'] +                    t['event'].set () +                else: +                    # ignore stale result +                    pass # pragma: no cover +            elif 'method' in msg: +                # special treatment +                if msg['method'] == 'Inspector.targetCrashed': +                    await markCrashed ('target') +                else: +                    await self.queue.put (msg) +            else: +                assert False # pragma: no cover + +    async def run (self): +        self._recvHandle = asyncio.ensure_future (self._recvProcess ()) + +    async def close (self): +        self._recvHandle.cancel () +        await self.ws.close () +        # no join, throw away the queue. There will be nobody listening on the +        # other end. +        #await self.queue.join () + +    @property +    def pending (self): +        return self.queue.qsize () + +    async def get (self): +        def getattrRecursive (obj, name): +            if '.' 
in name:
+                n, ext = name.split ('.', 1)
+                return getattrRecursive (getattr (obj, n), ext)
+            return getattr (obj, name)
+
+        if self.crashed:
+            raise Crashed ()
+
+        ret = await self.queue.get ()
+        if isinstance (ret, Exception):
+            raise ret
+        return getattrRecursive (self, ret['method']), ret['params']
+
+    @classmethod
+    async def create (cls, **kwargs):
+        """ Async init """
+        # increase size limit of a single frame to something ridiculously high,
+        # so we can safely grab screenshots
+        maxSize = 100*1024*1024 # 100 MB
+        # chrome does not like pings and kills the connection, disable them
+        ws = await websockets.connect(kwargs['webSocketDebuggerUrl'],
+                max_size=maxSize, ping_interval=None)
+        ret = cls (kwargs['id'], ws)
+        await ret.run ()
+        return ret

+class Process:
+    """ Start Google Chrome listening on a random port """
+
+    __slots__ = ('binary', 'windowSize', 'p', 'userDataDir')
+
+    def __init__ (self, binary='google-chrome-stable', windowSize=(1920, 1080)):
+        self.binary = binary
+        self.windowSize = windowSize
+        self.p = None
+
+    async def __aenter__ (self):
+        assert self.p is None
+        self.userDataDir = mkdtemp (prefix=__package__ + '-chrome-userdata-')
+        # see https://github.com/GoogleChrome/chrome-launcher/blob/master/docs/chrome-flags-for-tools.md
+        args = [self.binary,
+                '--window-size={},{}'.format (*self.windowSize),
+                f'--user-data-dir={self.userDataDir}', # use temporary user dir
+                '--no-default-browser-check',
+                '--no-first-run', # don’t show first run screen
+                '--disable-breakpad', # no error reports
+                '--disable-extensions',
+                '--disable-infobars',
+                '--disable-notifications', # no libnotify
+                '--disable-background-networking', # disable background services (updating, safe browsing, …)
+                '--safebrowsing-disable-auto-update',
+                '--disable-sync', # no google account syncing
+                '--metrics-recording-only', # do not submit metrics
+                '--disable-default-apps',
+                '--disable-background-timer-throttling',
+                '--disable-client-side-phishing-detection',
+                '--disable-popup-blocking',
+                '--disable-prompt-on-repost',
+                '--enable-automation', # enable various automation-related things
+                '--password-store=basic',
+                '--headless',
+                '--disable-gpu',
+                '--hide-scrollbars', # hide scrollbars on screenshots
+                '--mute-audio', # don’t play any audio
+                '--remote-debugging-port=0', # pick a port. XXX: we may want to use --remote-debugging-pipe instead
+                '--homepage=about:blank',
+                'about:blank']
+        # start new session, so ^C does not affect subprocess
+        self.p = await asyncio.create_subprocess_exec (*args,
+                stdout=asyncio.subprocess.DEVNULL,
+                stderr=asyncio.subprocess.DEVNULL,
+                stdin=asyncio.subprocess.DEVNULL,
+                start_new_session=True)
+        port = None
+        # chrome writes its current active devtools port to a file.
due to the +        # sleep() this is rather ugly, but should work with all versions of the +        # browser. +        for i in range (100): +            try: +                with open (os.path.join (self.userDataDir, 'DevToolsActivePort'), 'r') as fd: +                    port = int (fd.readline ().strip ()) +                    break +            except FileNotFoundError: +                await asyncio.sleep (0.2) +        if port is None: +            raise Exception ('Chrome died on us.') + +        return URL.build(scheme='http', host='localhost', port=port) + +    async def __aexit__ (self, *exc): +        try: +            self.p.terminate () +            await self.p.wait () +        except ProcessLookupError: +            # ok, fine, dead already +            pass + +        # Try to delete the temporary directory multiple times. It looks like +        # Chrome will change files in there even after it exited (i.e. .wait() +        # returned). Very strange. +        for i in range (5): +            try: +                shutil.rmtree (self.userDataDir) +                break +            except: +                await asyncio.sleep (0.2) + +        self.p = None +        return False + +class Passthrough: +    __slots__ = ('url', ) + +    def __init__ (self, url): +        self.url = URL (url) + +    async def __aenter__ (self): +        return self.url + +    async def __aexit__ (self, *exc): +        return False + +def toCookieParam (m): +    """ +    Convert Python’s http.cookies.Morsel to Chrome’s CookieParam, see +    https://chromedevtools.github.io/devtools-protocol/1-3/Network#type-CookieParam +    """ + +    assert isinstance (m, Morsel) + +    out = {'name': m.key, 'value': m.value} + +    # unsupported by chrome +    for k in ('max-age', 'comment', 'version'): +        if m[k]: +            raise ValueError (f'Unsupported cookie attribute {k} set, cannot convert') + +    for mname, cname in [('expires', None), ('path', None), ('domain', None), ('secure', None), ('httponly', 'httpOnly')]: +        value = m[mname] +        if value: +            cname = cname or mname +            out[cname] = value + +    return out + diff --git a/crocoite/html.py b/crocoite/html.py index c929a10..30f6ca5 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -22,6 +22,10 @@  HTML helper  """ +from html5lib.treewalkers.base import TreeWalker +from html5lib.filters.base import Filter +from html5lib import constants +  # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements  voidTags = {'area',          'base', @@ -103,9 +107,7 @@ eventAttributes = {'onabort',          'onvolumechange',          'onwaiting'} -from html5lib.treewalkers.base import TreeWalker -from html5lib.filters.base import Filter -from html5lib import constants +default_namespace = constants.namespaces["html"]  class ChromeTreeWalker (TreeWalker):      """ @@ -122,11 +124,14 @@ class ChromeTreeWalker (TreeWalker):              elif name == '#document':                  for child in node.get ('children', []):                      yield from self.recurse (child) +            elif name == '#cdata-section': +                # html5lib cannot generate cdata, so we’re faking it by using +                # an empty tag +                yield from self.emptyTag (default_namespace, +                        '![CDATA[' + node['nodeValue'] + ']]', {})              else: -                assert False, name +                assert False, (name, node)          else: -            default_namespace = 
constants.namespaces["html"]
-
             attributes = node.get ('attributes', [])
             convertedAttr = {}
             for i in range (0, len (attributes), 2):
diff --git a/crocoite/irc.py b/crocoite/irc.py
new file mode 100644
index 0000000..d9c0634
--- /dev/null
+++ b/crocoite/irc.py
@@ -0,0 +1,671 @@
+# Copyright (c) 2017–2018 crocoite contributors
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+IRC bot “chromebot”
+"""
+
+import asyncio, argparse, json, tempfile, time, random, os, shlex
+from datetime import datetime
+from urllib.parse import urlsplit
+from enum import IntEnum, unique
+from collections import defaultdict
+from abc import abstractmethod
+from functools import wraps
+import bottom
+import websockets
+
+from .util import StrJsonEncoder
+from .cli import cookie
+
+### helper functions ###
+def prettyTimeDelta (seconds):
+    """
+    Pretty-print seconds as a human-readable string, e.g. 1d 1h 1m 1s
+    """
+    seconds = int(seconds)
+    days, seconds = divmod(seconds, 86400)
+    hours, seconds = divmod(seconds, 3600)
+    minutes, seconds = divmod(seconds, 60)
+    s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')]
+    s = filter (lambda x: x[0] != 0, s)
+    return ' '.join (map (lambda x: '{}{}'.format (*x), s))
+
+def prettyBytes (b):
+    """
+    Pretty-print bytes
+    """
+    prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB']
+    while b >= 1024 and len (prefixes) > 1:
+        b /= 1024
+        prefixes.pop (0)
+    return f'{b:.1f} {prefixes[0]}'
+
+def isValidUrl (s):
+    url = urlsplit (s)
+    if url.scheme and url.netloc and url.scheme in {'http', 'https'}:
+        return s
+    raise TypeError ()
+
+class NonExitingArgumentParser (argparse.ArgumentParser):
+    """ Argument parser that does not call exit(), suitable for interactive use """
+
+    def exit (self, status=0, message=None):
+        # should never be called
+        pass
+
+    def error (self, message):
+        # if we use subparsers it’s important to pass self along, so we can
+        # show the correct help
+        raise Exception (self, message)
+
+    def format_usage (self):
+        return super().format_usage ().replace ('\n', ' ')
+
+class Status(IntEnum):
+    """ Job status """
+    undefined = 0
+    pending = 1
+    running = 2
+    aborted = 3
+    finished = 4
+
+# see https://arxiv.org/html/0901.4016 on how to build proquints (human
+# pronounceable unique ids)
+toConsonant = 'bdfghjklmnprstvz'
+toVowel = 'aiou'
+
+def u16ToQuint (v):
+    """ Transform a 16 bit unsigned integer into a single quint """
+    assert 0 <= v < 2**16
+    # quints are “big-endian”
+    return ''.join ([
+            toConsonant[(v>>(4+2+4+2))&0xf],
+            toVowel[(v>>(4+2+4))&0x3],
+            toConsonant[(v>>(4+2))&0xf],
+            toVowel[(v>>4)&0x3],
+            toConsonant[(v>>0)&0xf],
+            ])
+
+def uintToQuint (v, length=2):
+    """ Turn any integer into a proquint with fixed length """
+    assert 0 <= v < 2**(length*16)
+
+    return '-'.join (reversed ([u16ToQuint ((v>>(x*16))&0xffff) for x in range (length)]))
+
+def makeJobId ():
+    """ Create job id from time and randomness source """
+    # allocate 48 bits for the time (in milliseconds) and add 16 random bits
+    # at the end (just to be sure) for a total of 64 bits. Should be enough to
+    # avoid collisions.
+    randbits = 16
+    stamp = (int (time.time ()*1000) << randbits) | random.randint (0, 2**randbits-1)
+    return uintToQuint (stamp, 4)
+
+class Job:
+    """ Archival job """
+
+    __slots__ = ('id', 'stats', 'rstats', 'started', 'finished', 'nick', 'status', 'process', 'url')
+
+    def __init__ (self, url, nick):
+        self.id = makeJobId ()
+        self.stats = {}
+        self.rstats = {}
+        self.started = datetime.utcnow ()
+        self.finished = None
+        self.url = url
+        # user who scheduled this job
+        self.nick = nick
+        self.status = Status.pending
+        self.process = None
+
+    def formatStatus (self):
+        stats = self.stats
+        rstats = self.rstats
+        return (f"{self.url} ({self.id}) {self.status.name}. "
+                f"{rstats.get ('have', 0)} pages finished, "
+                f"{rstats.get ('pending', 0)} pending; "
+                f"{stats.get ('crashed', 0)} crashed, "
+                f"{stats.get ('requests', 0)} requests, "
+                f"{stats.get ('failed', 0)} failed, "
+                f"{prettyBytes (stats.get ('bytesRcv', 0))} received.")
+
+@unique
+class NickMode(IntEnum):
+    # the actual numbers don’t matter, but their order must be strictly
+    # increasing (with privilege level)
+    operator = 100
+    voice = 10
+
+    @classmethod
+    def fromMode (cls, mode):
+        return {'v': cls.voice, 'o': cls.operator}[mode]
+
+    @classmethod
+    def fromNickPrefix (cls, mode):
+        return {'@': cls.operator, '+': cls.voice}[mode]
+
+    @property
+    def human (self):
+        return {self.operator: 'operator', self.voice: 'voice'}[self]
+
+class User:
+    """ IRC user """
+    __slots__ = ('name', 'modes')
+
+    def __init__ (self, name, modes=None):
+        self.name = name
+        self.modes = modes or set ()
+
+    def __eq__ (self, b):
+        return self.name == b.name
+
+    def __hash__ (self):
+        return hash (self.name)
+
+    def __repr__ (self):
+        return f'<User {self.name} {self.modes}>'
+
+    def hasPriv (self, p):
+        if p is None:
+            return True
+        else:
+            return self.modes and max (self.modes) >= p
+
+    @classmethod
+    def fromName (cls, name):
+        """ Get mode and name from NAMES command """
+        try:
+            modes = {NickMode.fromNickPrefix (name[0])}
+            name = name[1:]
+        except KeyError:
+            modes = set ()
+        return cls (name, modes)
+
+class ReplyContext:
+    __slots__ = ('client', 'target', 'user')
+
+    def __init__ (self, client, target, user):
+        self.client = client
+        self.target = target
+        self.user =
user + +    def __call__ (self, message): +        self.client.send ('PRIVMSG', target=self.target, +                message=f'{self.user.name}: {message}') + +class RefCountEvent: +    """ +    Ref-counted event that triggers if a) armed and b) refcount drops to zero. +     +    Must be used as a context manager. +    """ +    __slots__ = ('count', 'event', 'armed') + +    def __init__ (self): +        self.armed = False +        self.count = 0 +        self.event = asyncio.Event () + +    def __enter__ (self): +        self.count += 1 +        self.event.clear () + +    def __exit__ (self, exc_type, exc_val, exc_tb): +        self.count -= 1 +        if self.armed and self.count == 0: +            self.event.set () + +    async def wait (self): +        await self.event.wait () + +    def arm (self): +        self.armed = True +        if self.count == 0: +            self.event.set () + +class ArgparseBot (bottom.Client): +    """ +    Simple IRC bot using argparse +     +    Tracks user’s modes, reconnects on disconnect +    """ + +    __slots__ = ('channels', 'nick', 'parser', 'users', '_quit') + +    def __init__ (self, host, port, ssl, nick, logger, channels=None, loop=None): +        super().__init__ (host=host, port=port, ssl=ssl, loop=loop) +        self.channels = channels or [] +        self.nick = nick +        # map channel -> nick -> user +        self.users = defaultdict (dict) +        self.logger = logger.bind (context=type (self).__name__) +        self.parser = self.getParser () + +        # bot does not accept new queries in shutdown mode, unless explicitly +        # permitted by the parser +        self._quit = RefCountEvent () + +        # register bottom event handler +        self.on('CLIENT_CONNECT', self.onConnect) +        self.on('PING', self.onKeepalive) +        self.on('PRIVMSG', self.onMessage) +        self.on('CLIENT_DISCONNECT', self.onDisconnect) +        self.on('RPL_NAMREPLY', self.onNameReply) +        self.on('CHANNELMODE', self.onMode) +        self.on('PART', self.onPart) +        self.on('JOIN', self.onJoin) +        # XXX: we would like to handle KICK, but bottom does not support that at the moment + +    @abstractmethod +    def getParser (self): +        pass + +    def cancel (self): +        self.logger.info ('cancel', uuid='1eb34aea-a854-4fec-90b2-7f8a3812a9cd') +        self._quit.arm () +     +    async def run (self): +        await self.connect () +        await self._quit.wait () +        self.send ('QUIT', message='Bye.') +        await self.disconnect () + +    async def onConnect (self, **kwargs): +        self.logger.info ('connect', nick=self.nick, uuid='01f7b138-ea53-4609-88e9-61f3eca3e7e7') + +        self.send('NICK', nick=self.nick) +        self.send('USER', user=self.nick, realname='https://github.com/PromyLOPh/crocoite') + +        # Don't try to join channels until the server has +        # sent the MOTD, or signaled that there's no MOTD. +        done, pending = await asyncio.wait( +            [self.wait('RPL_ENDOFMOTD'), self.wait('ERR_NOMOTD')], +            loop=self.loop, return_when=asyncio.FIRST_COMPLETED) + +        # Cancel whichever waiter's event didn't come in. 
+        for future in pending: +            future.cancel() + +        for c in self.channels: +            self.logger.info ('join', channel=c, uuid='367063a5-9069-4025-907c-65ba88af8593') +            self.send ('JOIN', channel=c) +            # no need for NAMES here, server sends this automatically + +    async def onNameReply (self, channel, users, **kwargs): +        # channels may be too big for a single message +        addusers = dict (map (lambda x: (x.name, x), map (User.fromName, users))) +        if channel not in self.users: +            self.users[channel] = addusers +        else: +            self.users[channel].update (addusers) + +    @staticmethod +    def parseMode (mode): +        """ Parse mode strings like +a, -b, +a-b, -b+a, … """ +        action = '+' +        ret = [] +        for c in mode: +            if c in {'+', '-'}: +                action = c +            else: +                ret.append ((action, c)) +        return ret + +    async def onMode (self, channel, modes, params, **kwargs): +        if channel not in self.channels: +            return + +        for (action, mode), nick in zip (self.parseMode (modes), params): +            try: +                m = NickMode.fromMode (mode) +                u = self.users[channel].get (nick, User (nick)) +                if action == '+': +                    u.modes.add (m) +                elif action == '-': +                    u.modes.remove (m) +            except KeyError: +                # unknown mode, ignore +                pass + +    async def onPart (self, nick, channel, **kwargs): +        if channel not in self.channels: +            return + +        try: +            self.users[channel].pop (nick) +        except KeyError: +            # gone already +            pass + +    async def onJoin (self, nick, channel, **kwargs): +        if channel not in self.channels: +            return + +        self.users[channel][nick] = User (nick) + +    async def onKeepalive (self, message, **kwargs): +        """ Ping received """ +        self.send('PONG', message=message) + +    async def onMessage (self, nick, target, message, **kwargs): +        """ Message received """ +        msgPrefix = self.nick + ':' +        if target in self.channels and message.startswith (msgPrefix): +            user = self.users[target].get (nick, User (nick)) +            reply = ReplyContext (client=self, target=target, user=user) + +            # shlex.split supports quoting arguments, which str.split() does not +            command = shlex.split (message[len (msgPrefix):]) +            try: +                args = self.parser.parse_args (command) +            except Exception as e: +                reply (f'{e.args[1]} -- {e.args[0].format_usage ()}') +                return +            if not args or not hasattr (args, 'func'): +                reply (f'Sorry, I don’t understand {command}') +                return + +            minPriv = getattr (args, 'minPriv', None) +            if self._quit.armed and not getattr (args, 'allowOnShutdown', False): +                reply ('Sorry, I’m shutting down and cannot accept your request right now.') +            elif not user.hasPriv (minPriv): +                reply (f'Sorry, you need the privilege {minPriv.human} to use this command.') +            else: +                with self._quit: +                    await args.func (user=user, args=args, reply=reply) + +    async def onDisconnect (self, **kwargs): +        """ Auto-reconnect """ +        self.logger.info 
('disconnect', uuid='4c74b2c8-2403-4921-879d-2279ad85db72')
+        while True:
+            if not self._quit.armed:
+                await asyncio.sleep (10, loop=self.loop)
+                self.logger.info ('reconnect', uuid='c53555cb-e1a4-4b69-b1c9-3320269c19d7')
+                try:
+                    await self.connect ()
+                finally:
+                    break
+
+def jobExists (func):
+    """ Chromebot job exists """
+    @wraps (func)
+    async def inner (self, **kwargs):
+        # XXX: not sure why it works with **kwargs, but not (user, args, reply)
+        args = kwargs.get ('args')
+        reply = kwargs.get ('reply')
+        j = self.jobs.get (args.id, None)
+        if not j:
+            reply (f'Job {args.id} is unknown')
+        else:
+            ret = await func (self, job=j, **kwargs)
+            return ret
+    return inner
+
+class Chromebot (ArgparseBot):
+    __slots__ = ('jobs', 'tempdir', 'destdir', 'processLimit', 'blacklist', 'needVoice')
+
+    def __init__ (self, host, port, ssl, nick, logger, channels=None,
+            tempdir=None, destdir='.', processLimit=1,
+            blacklist={}, needVoice=False, loop=None):
+        self.needVoice = needVoice
+
+        super().__init__ (host=host, port=port, ssl=ssl, nick=nick,
+                logger=logger, channels=channels, loop=loop)
+
+        self.jobs = {}
+        self.tempdir = tempdir or tempfile.gettempdir()
+        self.destdir = destdir
+        self.processLimit = asyncio.Semaphore (processLimit)
+        self.blacklist = blacklist
+
+    def getParser (self):
+        parser = NonExitingArgumentParser (prog=self.nick + ': ', add_help=False)
+        subparsers = parser.add_subparsers(help='Sub-commands')
+
+        archiveparser = subparsers.add_parser('a', help='Archive a site', add_help=False)
+        archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5))
+        archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0')
+        archiveparser.add_argument('--insecure', '-k',
                help='Disable certificate checking', action='store_true')
+        # parsing the cookie here, so we can give early feedback without
+        # waiting for the job to crash on invalid arguments.
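The ``cookie`` type converter is imported from ``.cli`` and not part of this diff; judging by the ``OutputString ()`` call further down, it returns ``http.cookies.Morsel`` objects. A plausible sketch, assuming the Set-Cookie syntax used by ``cookies.txt`` above (the real implementation may differ):

.. code:: python

    from http.cookies import BaseCookie

    def cookie (s):
        """ Parse a Set-Cookie style string into a single Morsel (sketch). """
        c = BaseCookie ()
        c.load (s)
        if len (c) != 1:
            raise ValueError ('expected exactly one cookie')
        return next (iter (c.values ()))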
+        archiveparser.add_argument('--cookie', '-b', type=cookie, +                help='Add a cookie', action='append', default=[]) +        archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL') +        archiveparser.set_defaults (func=self.handleArchive, +                minPriv=NickMode.voice if self.needVoice else None) + +        statusparser = subparsers.add_parser ('s', help='Get job status', add_help=False) +        statusparser.add_argument('id', help='Job id', metavar='UUID') +        statusparser.set_defaults (func=self.handleStatus, allowOnShutdown=True) + +        abortparser = subparsers.add_parser ('r', help='Revoke/abort job', add_help=False) +        abortparser.add_argument('id', help='Job id', metavar='UUID') +        abortparser.set_defaults (func=self.handleAbort, allowOnShutdown=True, +                minPriv=NickMode.voice if self.needVoice else None) + +        return parser + +    def isBlacklisted (self, url): +        for k, v in self.blacklist.items(): +            if k.match (url): +                return v +        return False + +    async def handleArchive (self, user, args, reply): +        """ Handle the archive command """ + +        msg = self.isBlacklisted (args.url) +        if msg: +            reply (f'{args.url} cannot be queued: {msg}') +            return + +        # make sure the job id is unique. Since ids are time-based we can just +        # wait. +        while True: +            j = Job (args.url, user.name) +            if j.id not in self.jobs: +                break +            time.sleep (0.01) +        self.jobs[j.id] = j + +        logger = self.logger.bind (job=j.id) + +        showargs = { +                'recursive': args.recursive, +                'concurrency': args.concurrency, +                } +        if args.insecure: +            showargs['insecure'] = args.insecure +        warcinfo = {'chromebot': { +                'jobid': j.id, +                'user': user.name, +                'queued': j.started, +                'url': args.url, +                'recursive': args.recursive, +                'concurrency': args.concurrency, +                }} +        grabCmd = ['crocoite-single'] +        # prefix warcinfo with !, so it won’t get expanded +        grabCmd.extend (['--warcinfo', +                '!' 
+ json.dumps (warcinfo, cls=StrJsonEncoder)]) +        for v in args.cookie: +            grabCmd.extend (['--cookie', v.OutputString ()]) +        if args.insecure: +            grabCmd.append ('--insecure') +        grabCmd.extend (['{url}', '{dest}']) +        cmdline = ['crocoite', +                '--tempdir', self.tempdir, +                '--recursion', args.recursive, +                '--concurrency', str (args.concurrency), +                args.url, +                os.path.join (self.destdir, +                        j.id + '-{host}-{date}-{seqnum}.warc.gz'), +                '--'] + grabCmd + +        strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) +        reply (f'{args.url} has been queued as {j.id} with {strargs}') +        logger.info ('queue', user=user.name, url=args.url, cmdline=cmdline, +                uuid='36cc34a6-061b-4cc5-84a9-4ab6552c8d75') + +        async with self.processLimit: +            if j.status == Status.pending: +                # job was not aborted +                j.process = await asyncio.create_subprocess_exec (*cmdline, +                        stdout=asyncio.subprocess.PIPE, +                        stderr=asyncio.subprocess.DEVNULL, +                        stdin=asyncio.subprocess.DEVNULL, +                        start_new_session=True, limit=100*1024*1024) +                while True: +                    data = await j.process.stdout.readline () +                    if not data: +                        break + +                    # job is marked running after the first message is received from it +                    if j.status == Status.pending: +                        logger.info ('start', uuid='46e62d60-f498-4ab0-90e1-d08a073b10fb') +                        j.status = Status.running + +                    data = json.loads (data) +                    msgid = data.get ('uuid') +                    if msgid == '24d92d16-770e-4088-b769-4020e127a7ff': +                        j.stats = data +                    elif msgid == '5b8498e4-868d-413c-a67e-004516b8452c': +                        j.rstats = data + +                    # forward message, so the dashboard can use it +                    logger.info ('message', +                            uuid='5c0f9a11-dcd8-4182-a60f-54f4d3ab3687', +                            data=data) +                code = await j.process.wait () + +        if j.status == Status.running: +            logger.info ('finish', uuid='7b40ffbb-faab-4224-90ed-cd4febd8f7ec') +            j.status = Status.finished +        j.finished = datetime.utcnow () + +        stats = j.stats +        rstats = j.rstats +        reply (j.formatStatus ()) + +    @jobExists +    async def handleStatus (self, user, args, reply, job): +        """ Handle status command """ + +        rstats = job.rstats +        reply (job.formatStatus ()) + +    @jobExists +    async def handleAbort (self, user, args, reply, job): +        """ Handle abort command """ + +        if job.status not in {Status.pending, Status.running}: +            reply ('This job is not running.') +            return + +        job.status = Status.aborted +        self.logger.info ('abort', job=job.id, user=user.name, +                uuid='865b3b3e-a54a-4a56-a545-f38a37bac295') +        if job.process and job.process.returncode is None: +            job.process.terminate () + +class Dashboard: +    __slots__ = ('fd', 'clients', 'loop', 'log', 'maxLog', 'pingInterval', 'pingTimeout') +    # these messages will not be forwarded to the 
browser +    ignoreMsgid = { +            # connect +            '01f7b138-ea53-4609-88e9-61f3eca3e7e7', +            # join +            '367063a5-9069-4025-907c-65ba88af8593', +            # disconnect +            '4c74b2c8-2403-4921-879d-2279ad85db72', +            # reconnect +            'c53555cb-e1a4-4b69-b1c9-3320269c19d7', +            } + +    def __init__ (self, fd, loop, maxLog=1000, pingInterval=30, pingTimeout=10): +        self.fd = fd +        self.clients = set () +        self.loop = loop +        # log buffer +        self.log = [] +        self.maxLog = maxLog +        self.pingInterval = pingInterval +        self.pingTimeout = pingTimeout + +    async def client (self, websocket, path): +        self.clients.add (websocket) +        try: +            for l in self.log: +                buf = json.dumps (l) +                await websocket.send (buf) +            while True: +                try: +                    msg = await asyncio.wait_for (websocket.recv(), timeout=self.pingInterval) +                except asyncio.TimeoutError: +                    try: +                        pong = await websocket.ping() +                        await asyncio.wait_for (pong, timeout=self.pingTimeout) +                    except asyncio.TimeoutError: +                        break +                except websockets.exceptions.ConnectionClosed: +                    break +        finally: +            self.clients.remove (websocket) + +    def handleStdin (self): +        buf = self.fd.readline () +        if not buf: +            return + +        try: +            data = json.loads (buf) +        except json.decoder.JSONDecodeError: +            # ignore invalid +            return +        msgid = data['uuid'] + +        if msgid in self.ignoreMsgid: +            return + +        # a few messages may contain sensitive information that we want to hide +        if msgid == '36cc34a6-061b-4cc5-84a9-4ab6552c8d75': +            # queue +            del data['cmdline'] +        elif msgid == '5c0f9a11-dcd8-4182-a60f-54f4d3ab3687': +            nesteddata = data['data'] +            nestedmsgid = nesteddata['uuid'] +            if nestedmsgid == 'd1288fbe-8bae-42c8-af8c-f2fa8b41794f': +                del nesteddata['command'] +             +        buf = json.dumps (data) +        for c in self.clients: +            # XXX can’t await here +            asyncio.ensure_future (c.send (buf)) + +        self.log.append (data) +        while len (self.log) > self.maxLog: +            self.log.pop (0) + +    def run (self, host='localhost', port=6789): +        self.loop.add_reader (self.fd, self.handleStdin) +        return websockets.serve(self.client, host, port) + diff --git a/crocoite/logger.py b/crocoite/logger.py index 4b43e02..ac389ca 100644 --- a/crocoite/logger.py +++ b/crocoite/logger.py @@ -32,6 +32,10 @@ from datetime import datetime  from functools import partial  from enum import IntEnum +from pytz import utc + +from .util import StrJsonEncoder +  class Level(IntEnum):      DEBUG = 0      INFO = 1 @@ -39,9 +43,9 @@ class Level(IntEnum):      ERROR = 3  class Logger: -    def __init__ (self, consumer=[], bindings={}): -        self.bindings = bindings -        self.consumer = consumer +    def __init__ (self, consumer=None, bindings=None): +        self.bindings = bindings or {} +        self.consumer = consumer or []      def __call__ (self, level, *args, **kwargs):          if not isinstance (level, Level): @@ -83,7 +87,7 @@ class Logger:          self.consumer.remove 
(consumer)  class Consumer: -    def __call__ (self, level, *args, **kwargs): +    def __call__ (self, **kwargs): # pragma: no cover          raise NotImplementedError ()  class NullConsumer (Consumer): @@ -97,41 +101,32 @@ class PrintConsumer (Consumer):      def __call__ (self, **kwargs):          sys.stderr.write (str (kwargs))          sys.stderr.write ('\n') +        sys.stderr.flush ()          return kwargs -class JsonEncoder (json.JSONEncoder): -    def default (self, obj): -        if isinstance (obj, datetime): -            return obj.isoformat () - -        # make sure serialization always succeeds -        try: -            return json.JSONEncoder.default(self, obj) -        except TypeError: -            return str (obj) -  class JsonPrintConsumer (Consumer): -    def __init__ (self, minLevel=Level.INFO): +    def __init__ (self, minLevel=Level.DEBUG):          self.minLevel = minLevel      def __call__ (self, **kwargs):          if kwargs['level'] >= self.minLevel: -            json.dump (kwargs, sys.stdout, cls=JsonEncoder) +            json.dump (kwargs, sys.stdout, cls=StrJsonEncoder)              sys.stdout.write ('\n') +            sys.stdout.flush ()          return kwargs  class DatetimeConsumer (Consumer):      def __call__ (self, **kwargs): -        kwargs['date'] = datetime.utcnow () +        kwargs['date'] = datetime.utcnow ().replace (tzinfo=utc)          return kwargs  class WarcHandlerConsumer (Consumer): -    def __init__ (self, warc, minLevel=Level.INFO): +    def __init__ (self, warc, minLevel=Level.DEBUG):          self.warc = warc          self.minLevel = minLevel      def __call__ (self, **kwargs):          if kwargs['level'] >= self.minLevel: -            self.warc._writeLog (json.dumps (kwargs, cls=JsonEncoder)) +            self.warc._writeLog (json.dumps (kwargs, cls=StrJsonEncoder))          return kwargs diff --git a/crocoite/task.py b/crocoite/task.py deleted file mode 100644 index 06dd022..0000000 --- a/crocoite/task.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2017–2018 crocoite contributors -#  -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -#  -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -#  -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
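The ``JsonEncoder`` deleted from ``logger.py`` above lives on as ``StrJsonEncoder`` in the new ``crocoite/util.py``, which is imported by ``devtools.py``, ``irc.py`` and ``logger.py`` in this commit but is not included in this excerpt. Presumably it keeps the shape of the removed class:

.. code:: python

    import json
    from datetime import datetime

    class StrJsonEncoder (json.JSONEncoder):
        def default (self, obj):
            if isinstance (obj, datetime):
                return obj.isoformat ()
            # make sure serialization always succeeds, falling back to str()
            try:
                return json.JSONEncoder.default (self, obj)
            except TypeError:
                return str (obj)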
- -""" -Celery distributed tasks -""" - -import os - -from urllib.parse import urlsplit -from datetime import datetime -from operator import attrgetter -from itertools import chain - -def _monkeyPatchSyncTasks (): -    """ Result sets don’t support the argument disable_sync_subtasks argument """ -    import celery.result -    celery.result.assert_will_not_block = lambda: None - -_monkeyPatchSyncTasks () -from celery import Celery -from celery.utils.log import get_task_logger - -from .browser import ChromeService, BrowserCrashed -from .controller import SinglePageController, ControllerSettings, RecursiveController, defaultSettings, DepthLimit, StatsHandler -from . import behavior -from .cli import parseRecursive -from .warc import WarcHandler - -app = Celery ('crocoite.distributed') -app.config_from_object('celeryconfig') -app.conf.task_routes = { -        'crocoite.task.archive': {'queue': 'crocoite.archive'}, -        'crocoite.task.controller': {'queue': 'crocoite.controller'}, -        # <method>.chunks is actually a starmap job -        'celery.starmap': {'queue': 'crocoite.archive'}, -        } -app.conf.task_default_queue = 'crocoite.default' -# disable prefetching, since our tasks usually run for a _very_ long time -app.conf.worker_prefetch_multiplier = 1 -logger = get_task_logger('crocoite.distributed.archive') - -@app.task(bind=True, track_started=True) -def archive (self, url, settings, enabledBehaviorNames): -    """ -    Archive a single URL - -    Supports these config keys (celeryconfig): - -    warc_filename = '{domain}-{date}-{id}.warc.gz' -    temp_dir = '/tmp/' -    finished_dir = '/tmp/finished' -    """ - -    parsedUrl = urlsplit (url) -    outFile = app.conf.warc_filename.format ( -                    id=self.request.root_id, -                    domain=parsedUrl.hostname.replace ('/', '-'), -                    date=datetime.utcnow ().isoformat (), -                    ) -    outPath = os.path.join (app.conf.temp_dir, outFile) -    fd = open (outPath, 'wb') - -    handler = [StatsHandler (), WarcHandler (fd)] -    enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) -    settings = ControllerSettings (**settings) -    try: -        c = SinglePageController (url, fd, behavior=enabledBehavior, -                settings=settings, handler=handler) -        c.run () -    except BrowserCrashed: -        # nothing we can do about that -        logger.error ('browser crashed for {}'.format (url)) - -    os.makedirs (app.conf.finished_dir, exist_ok=True) -    outPath = os.path.join (app.conf.finished_dir, outFile) -    os.rename (fd.name, outPath) - -    return handler[0].stats - -from collections import UserDict - -class IntegerDict (UserDict): -    """ Dict with dict/dict per-item arithmetic propagation, i.e. 
{1: 2}+{1: 1}={1: 3} """ -    def __add__ (self, b): -        newdict = self.__class__ (self) -        for k, v in b.items (): -            if k in self: -                newdict[k] += v -            else: -                newdict[k] = v -        return newdict - -class DistributedRecursiveController (RecursiveController): -    """ Distributed, recursive controller using celery """ - -    __slots__ = ('concurrency', 'stats') - -    def __init__ (self, url, logger, service=ChromeService (), behavior=behavior.available, \ -            settings=defaultSettings, -            recursionPolicy=DepthLimit (0), concurrency=1): -        super ().__init__ (url, None, service, behavior, logger, settings, recursionPolicy) -        self.concurrency = concurrency -        self.stats = IntegerDict () - -    def fetch (self, urls): -        def chunksIter (urls): -            for u in urls: -                yield (u, self.settings.toDict (), list (map (attrgetter ('name'), self.behavior))) -        itemsPerTask = len (urls)//self.concurrency -        if itemsPerTask <= 0: -            itemsPerTask = len (urls) -        result = archive.chunks (chunksIter (urls), itemsPerTask).apply_async ().get () -        self.stats = sum (chain.from_iterable (result), self.stats) - -@app.task(bind=True, track_started=True) -def controller (self, url, settings, enabledBehaviorNames, recursive, concurrency): -    """ Recursive controller """ - -    logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) -    recursionPolicy = parseRecursive (recursive, url) -    enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) -    settings = ControllerSettings (**settings) -    c = DistributedRecursiveController (url, None, logger=logger, behavior=enabledBehavior, -            settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency) -    c.run () -    return dict (c.stats) - - diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py new file mode 100644 index 0000000..1efea08 --- /dev/null +++ b/crocoite/test_behavior.py @@ -0,0 +1,266 @@ +# Copyright (c) 2017 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +import asyncio, os, yaml, re, math, struct +from functools import partial +from operator import attrgetter + +import pytest +from yarl import URL +from aiohttp import web + +import pkg_resources +from .logger import Logger +from .devtools import Process +from .behavior import Scroll, Behavior, ExtractLinks, ExtractLinksEvent, Crash, \ +        Screenshot, ScreenshotEvent, DomSnapshot, DomSnapshotEvent, mapOrIgnore +from .controller import SinglePageController, EventHandler, ControllerSettings +from .devtools import Crashed + +with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: +    sites = list (yaml.safe_load_all (fd)) +clickParam = [] +for o in sites: +    for s in o['selector']: +        for u in s.get ('urls', []): +            clickParam.append ((u, s['selector'])) + +class ClickTester (Behavior): +    """ +    Test adapter checking a given selector exists after loading the page +    """ + +    __slots__ = ('selector', ) + +    name = 'testclick' + +    def __init__ (self, loader, logger, selector): +        super ().__init__ (loader, logger) +        self.selector = selector + +    async def onfinish (self): +        tab = self.loader.tab +        results = await tab.DOM.getDocument () +        rootNode = results['root']['nodeId'] +        results = await tab.DOM.querySelectorAll (nodeId=rootNode, selector=self.selector) +        assert results['nodeIds'], self.selector + +        # XXX: this is not true for every element we click. Github uses <button +        # type=submit> and <form> without an event listener on the <button> +#        # verify that an event listener exists +#        for nid in results['nodeIds']: +#            obj = (await tab.DOM.resolveNode (nodeId=nid))['object'] +#            assert obj['type'] == 'object' +#            listeners = (await tab.DOMDebugger.getEventListeners (objectId=obj['objectId']))['listeners'] +#            assert any (map (lambda x: x['type'] == 'click', listeners)), listeners + +        return +        yield # pragma: no cover + +@pytest.mark.parametrize("url,selector", clickParam) +@pytest.mark.asyncio +@pytest.mark.xfail(reason='depends on network access') +async def test_click_selectors (url, selector): +    """ +    Make sure the CSS selector exists on an example url +    """ +    logger = Logger () +    settings = ControllerSettings (idleTimeout=5, timeout=10) +    # Some selectors are loaded dynamically and require scrolling +    controller = SinglePageController (url=url, logger=logger, +            settings=settings, +            service=Process (), +            behavior=[Scroll, partial(ClickTester, selector=selector)]) +    await controller.run () + +matchParam = [] +for o in sites: +    for s in o['selector']: +        for u in s.get ('urls', []): +            matchParam.append ((o['match'], URL (u))) + +@pytest.mark.parametrize("match,url", matchParam) +@pytest.mark.asyncio +async def test_click_match (match, url): +    """ Test urls must match """ +    # keep this aligned with click.js +    assert re.match (match, url.host, re.I) + + +class AccumHandler (EventHandler): +    """ Test adapter that accumulates all incoming items """ +    __slots__ = ('data') + +    def __init__ (self): +        super().__init__ () +        self.data = [] + +    async def push (self, item): +        self.data.append (item) + +async def simpleServer (url, response): +    async def f (req): +        return web.Response (body=response, status=200, content_type='text/html', charset='utf-8') + +    app = 
web.Application () +    app.router.add_route ('GET', url.path, f) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, url.host, url.port) +    await site.start() +    return runner + +@pytest.mark.asyncio +async def test_extract_links (): +    """ +    Make sure links are extracted correctly from a simple page +    """ + +    url = URL.build (scheme='http', host='localhost', port=8080) +    runner = await simpleServer (url, """<html><head></head> +            <body> +            <div> +                <a href="/relative">foo</a> +                <a href="http://example.com/absolute/">foo</a> +                <a href="https://example.com/absolute/secure">foo</a> +                <a href="#anchor">foo</a> +                <a href="http://neue_preise_f%c3%bcr_zahnimplantate_k%c3%b6nnten_sie_%c3%bcberraschen">foo</a> + +                <a href="/hidden/visibility" style="visibility: hidden">foo</a> +                <a href="/hidden/display" style="display: none">foo</a> +                <div style="display: none"> +                <a href="/hidden/display/insidediv">foo</a> +                </div> +                <!--<a href="/hidden/comment">foo</a>--> + +                <p><img src="shapes.png" usemap="#shapes"> +                 <map name="shapes"><area shape=rect coords="50,50,100,100" href="/map/rect"></map></p> +            </div> +            </body></html>""") + +    try: +        handler = AccumHandler () +        logger = Logger () +        controller = SinglePageController (url=url, logger=logger, +                service=Process (), behavior=[ExtractLinks], handler=[handler]) +        await controller.run () + +        links = [] +        for d in handler.data: +            if isinstance (d, ExtractLinksEvent): +                links.extend (d.links) +        assert sorted (links) == sorted ([ +                url.with_path ('/relative'), +                url.with_fragment ('anchor'), +                URL ('http://neue_preise_f%C3%BCr_zahnimplantate_k%C3%B6nnten_sie_%C3%BCberraschen'), +                URL ('http://example.com/absolute/'), +                URL ('https://example.com/absolute/secure'), +                url.with_path ('/hidden/visibility'), # XXX: shall we ignore these as well? +                url.with_path ('/map/rect'), +                ]) +    finally: +        await runner.cleanup () + +@pytest.mark.asyncio +async def test_crash (): +    """ +    Crashing through Behavior works? +    """ + +    url = URL.build (scheme='http', host='localhost', port=8080) +    runner = await simpleServer (url, '<html></html>') + +    try: +        logger = Logger () +        controller = SinglePageController (url=url, logger=logger, +                service=Process (), behavior=[Crash]) +        with pytest.raises (Crashed): +            await controller.run () +    finally: +        await runner.cleanup () + +@pytest.mark.asyncio +async def test_screenshot (): +    """ +    Make sure screenshots are taken and have the correct dimensions. We can’t +    and don’t want to check their content. 
+    """ +    # ceil(0) == 0, so starting with 1 +    for expectHeight in (1, Screenshot.maxDim, Screenshot.maxDim+1, Screenshot.maxDim*2+Screenshot.maxDim//2): +        url = URL.build (scheme='http', host='localhost', port=8080) +        runner = await simpleServer (url, f'<html><body style="margin: 0; padding: 0;"><div style="height: {expectHeight}"></div></body></html>') + +        try: +            handler = AccumHandler () +            logger = Logger () +            controller = SinglePageController (url=url, logger=logger, +                    service=Process (), behavior=[Screenshot], handler=[handler]) +            await controller.run () + +            screenshots = list (filter (lambda x: isinstance (x, ScreenshotEvent), handler.data)) +            assert len (screenshots) == math.ceil (expectHeight/Screenshot.maxDim) +            totalHeight = 0 +            for s in screenshots: +                assert s.url == url +                # PNG ident is fixed, IHDR is always the first chunk +                assert s.data.startswith (b'\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR') +                width, height = struct.unpack ('>II', s.data[16:24]) +                assert height <= Screenshot.maxDim +                totalHeight += height +            # screenshot height is at least canvas height (XXX: get hardcoded +            # value from devtools.Process) +            assert totalHeight == max (expectHeight, 1080) +        finally: +            await runner.cleanup () + +@pytest.mark.asyncio +async def test_dom_snapshot (): +    """ +    Behavior plug-in works, <canvas> is replaced by static image, <script> is +    stripped. Actual conversion from Chrome DOM to HTML is validated by module +    .test_html +    """ + +    url = URL.build (scheme='http', host='localhost', port=8080) +    runner = await simpleServer (url, f'<html><body><p>ÄÖÜäöü</p><script>alert("yes");</script><canvas id="canvas" width="1" height="1">Alternate text.</canvas></body></html>') + +    try: +        handler = AccumHandler () +        logger = Logger () +        controller = SinglePageController (url=url, logger=logger, +                service=Process (), behavior=[DomSnapshot], handler=[handler]) +        await controller.run () + +        snapshots = list (filter (lambda x: isinstance (x, DomSnapshotEvent), handler.data)) +        assert len (snapshots) == 1 +        doc = snapshots[0].document +        assert doc.startswith ('<HTML><HEAD><meta charset=utf-8></HEAD><BODY><P>ÄÖÜäöü</P><IMG id=canvas width=1 height=1 src="data:image/png;base64,'.encode ('utf-8')) +        assert doc.endswith ('></BODY></HTML>'.encode ('utf-8')) +    finally: +        await runner.cleanup () + +def test_mapOrIgnore (): +    def fail (x): +        if x < 50: +            raise Exception () +        return x+1 + +    assert list (mapOrIgnore (fail, range (100))) == list (range (51, 101)) + diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 5c7fc69..7084214 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -18,232 +18,370 @@  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN  # THE SOFTWARE. 
-import pytest +import asyncio, socket  from operator import itemgetter  from http.server import BaseHTTPRequestHandler -from pychrome.exceptions import TimeoutException +from datetime import datetime -from .browser import Item, SiteLoader, ChromeService, NullService, BrowserCrashed -from .logger import Logger, Consumer +from yarl import URL +from aiohttp import web +from multidict import CIMultiDict -class TItem (Item): -    """ This should be as close to Item as possible """ - -    __slots__ = ('bodySend', '_body', '_requestBody') -    base = 'http://localhost:8000/' - -    def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False): -        super ().__init__ (tab=None) -        self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}} -        self._body = bodyReceive, False -        self.bodySend = bodyReceive if not bodySend else bodySend -        self._requestBody = requestBody, False -        self.failed = failed - -    @property -    def body (self): -        return self._body - -    @property -    def requestBody (self): -        return self._requestBody - -testItems = [ -    TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02', failed=True), -    TItem ('attachment', 200,  -            {'Content-Type': 'text/plain; charset=utf-8', -            'Content-Disposition': 'attachment; filename="attachment.txt"', -            }, -            'This is a simple text file with umlauts. ÄÖU.'.encode ('utf8'), failed=True), -    TItem ('encoding/utf8', 200, {'Content-Type': 'text/plain; charset=utf-8'}, -            'This is a test, äöü μνψκ ¥¥¥¿ýý¡'.encode ('utf8')), -    TItem ('encoding/iso88591', 200, {'Content-Type': 'text/plain; charset=ISO-8859-1'}, -            'This is a test, äöü.'.encode ('utf8'), -            'This is a test, äöü.'.encode ('ISO-8859-1')), -    TItem ('encoding/latin1', 200, {'Content-Type': 'text/plain; charset=latin1'}, -            'This is a test, äöü.'.encode ('utf8'), -            'This is a test, äöü.'.encode ('latin1')), -    TItem ('image', 200, {'Content-Type': 'image/png'}, -            # 1×1 png image -            b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), -    TItem ('empty', 200, {}, b''), -    TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b''), -    TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b''), -    TItem ('nonexistent', 404, {}, b''), -    TItem ('html', 200, {'Content-Type': 'html'}, -            '<html><body><img src="/image"><img src="/nonexistent"></body></html>'.encode ('utf8')), -    TItem ('html/alert', 200, {'Content-Type': 'html'}, -            '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><img src="/image"></body></html>'.encode ('utf8')), -    TItem ('html/fetchPost', 200, {'Content-Type': 'html'}, -            r"""<html><body><script> -            let a = fetch("/html/fetchPost/binary", {"method": "POST", "body": "\x00"}); -            let b = fetch("/html/fetchPost/form", {"method": "POST", "body": new URLSearchParams({"data": "!"})}); -            let c = fetch("/html/fetchPost/binary/large", {"method": "POST", "body": 
"\x00".repeat(100*1024)}); -            let d = fetch("/html/fetchPost/form/large", {"method": "POST", "body": new URLSearchParams({"data": "!".repeat(100*1024)})}); -            </script></body></html>""".encode ('utf8')), -    TItem ('html/fetchPost/binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'\x00'), -    TItem ('html/fetchPost/form', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=%21'), -    # XXX: these should trigger the need for getRequestPostData, but they don’t. oh well. -    TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'), -    TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'), -    ] -testItemMap = dict ([(item.parsedUrl.path, item) for item in testItems]) - -class RequestHandler (BaseHTTPRequestHandler): -    def do_GET(self): -        item = testItemMap.get (self.path) -        if item: -            self.send_response (item.response['status']) -            for k, v in item.response['headers'].items (): -                self.send_header (k, v) -            body = item.bodySend -            self.send_header ('Content-Length', len (body)) -            self.end_headers() -            self.wfile.write (body) -        return - -    do_POST = do_GET - -    def log_message (self, format, *args): -        pass +from hypothesis import given +import hypothesis.strategies as st +from hypothesis.provisional import domains +import pytest -@pytest.fixture -def http (): -    def run (): -        import http.server -        PORT = 8000 -        httpd = http.server.HTTPServer (("localhost", PORT), RequestHandler) -        print ('starting http server') -        httpd.serve_forever() - -    from multiprocessing import Process -    p = Process (target=run) -    p.start () -    yield p -    p.terminate () -    p.join () +from .browser import RequestResponsePair, SiteLoader, Request, \ +        UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \ +        Response, NavigateError, PageIdle, FrameNavigated +from .logger import Logger, Consumer +from .devtools import Crashed, Process + +# if you want to know what’s going on: +#import logging +#logging.basicConfig(level=logging.DEBUG)  class AssertConsumer (Consumer):      def __call__ (self, **kwargs):          assert 'uuid' in kwargs          assert 'msg' in kwargs          assert 'context' in kwargs +        return kwargs  @pytest.fixture  def logger ():      return Logger (consumer=[AssertConsumer ()])  @pytest.fixture -def loader (http, logger): -    def f (path): -        if path.startswith ('/'): -            path = 'http://localhost:8000{}'.format (path) -        return SiteLoader (browser, path, logger) -    print ('loader setup') -    with ChromeService () as browser: -        yield f -    print ('loader teardown') - -def itemsLoaded (l, items): -    items = dict ([(i.parsedUrl.path, i) for i in items]) -    timeout = 5 -    while True: -        if not l.notify.wait (timeout) and len (items) > 0: -            assert False, 'timeout' -        if len (l.queue) > 0: -            item = l.queue.popleft () -            if isinstance (item, Exception): -                raise item -            assert item.chromeResponse is not None -            golden = items.pop (item.parsedUrl.path) -            if not golden: -                assert False, 'url {} not supposed to be fetched'.format (item.url) -  
          assert item.failed == golden.failed -            if item.failed: -                # response will be invalid if request failed -                continue -            assert item.body[0] == golden.body[0] -            assert item.requestBody[0] == golden.requestBody[0] -            assert item.response['status'] == golden.response['status'] -            assert item.statusText == BaseHTTPRequestHandler.responses.get (item.response['status'])[0] -            for k, v in golden.responseHeaders: -                actual = list (map (itemgetter (1), filter (lambda x: x[0] == k, item.responseHeaders))) -                assert v in actual - -        # check queue at least once -        if not items: -            break - -def literalItem (lf, item, deps=[]): -    with lf (item.parsedUrl.path) as l: -        l.start () -        itemsLoaded (l, [item] + deps) - -def test_empty (loader): -    literalItem (loader, testItemMap['/empty']) - -def test_redirect (loader): -    literalItem (loader, testItemMap['/redirect/301/empty'], [testItemMap['/empty']]) -    # chained redirects -    literalItem (loader, testItemMap['/redirect/301/redirect/301/empty'], [testItemMap['/redirect/301/empty'], testItemMap['/empty']]) - -def test_encoding (loader): -    """ Text responses are transformed to UTF-8. Make sure this works -    correctly. """ -    for item in {testItemMap['/encoding/utf8'], testItemMap['/encoding/latin1'], testItemMap['/encoding/iso88591']}: -        literalItem (loader, item) - -def test_binary (loader): -    """ Browser should ignore content it cannot display (i.e. octet-stream) """ -    literalItem (loader, testItemMap['/binary']) - -def test_image (loader): -    """ Images should be displayed inline """ -    literalItem (loader, testItemMap['/image']) - -def test_attachment (loader): -    """ And downloads won’t work in headless mode, even if it’s just a text file """ -    literalItem (loader, testItemMap['/attachment']) - -def test_html (loader): -    literalItem (loader, testItemMap['/html'], [testItemMap['/image'], testItemMap['/nonexistent']]) -    # make sure alerts are dismissed correctly (image won’t load otherwise) -    literalItem (loader, testItemMap['/html/alert'], [testItemMap['/image']]) - -def test_post (loader): -    """ XHR POST request with binary data""" -    literalItem (loader, testItemMap['/html/fetchPost'], -            [testItemMap['/html/fetchPost/binary'], -            testItemMap['/html/fetchPost/binary/large'], -            testItemMap['/html/fetchPost/form'], -            testItemMap['/html/fetchPost/form/large']]) - -def test_crash (loader): -    with loader ('/html') as l: -        l.start () -        try: -            l.tab.Page.crash (_timeout=1) -        except TimeoutException: -            pass -        q = l.queue -        assert isinstance (q.popleft (), BrowserCrashed) - -def test_invalidurl (loader): -    url = 'http://nonexistent.example/' -    with loader (url) as l: -        l.start () - -        q = l.queue -        if not l.notify.wait (10): -            assert False, 'timeout' - -        it = q.popleft () -        assert it.failed - -def test_nullservice (): -    """ Null service returns the url as is """ - -    url = 'http://localhost:12345' -    with NullService (url) as u: -        assert u == url +async def loader (logger): +    async with Process () as browser, SiteLoader (browser, logger) as l: +        yield l + +@pytest.mark.asyncio +async def test_crash (loader): +    with pytest.raises (Crashed): +        await 
loader.tab.Page.crash () + +@pytest.mark.asyncio +async def test_invalidurl (loader): +    host = 'nonexistent.example' + +    # make sure the url does *not* resolve (some DNS intercepting ISP’s mess +    # with this) +    loop = asyncio.get_event_loop () +    try: +        resolved = await loop.getaddrinfo (host, None) +    except socket.gaierror: +        url = URL.build (scheme='http', host=host) +        with pytest.raises (NavigateError): +            await loader.navigate (url) +    else: +        pytest.skip (f'host {host} resolved to {resolved}') + +timestamp = st.one_of ( +                st.integers(min_value=0, max_value=2**32-1), +                st.floats (min_value=0, max_value=2**32-1), +                ) + +@given(timestamp, timestamp, timestamp) +def test_referencetimestamp (relativeA, absoluteA, relativeB): +    ts = ReferenceTimestamp (relativeA, absoluteA) +    absoluteA = datetime.utcfromtimestamp (absoluteA) +    absoluteB = ts (relativeB) +    assert (absoluteA < absoluteB and relativeA < relativeB) or \ +            (absoluteA >= absoluteB and relativeA >= relativeB) +    assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6 + +def urls (): +    """ Build http/https URL """ +    scheme = st.sampled_from (['http', 'https']) +    # Path must start with a slash +    pathSt = st.builds (lambda x: '/' + x, st.text ()) +    args = st.fixed_dictionaries ({ +            'scheme': scheme, +            'host': domains (), +            'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), +            'path': pathSt, +            'query_string': st.text (), +            'fragment': st.text (), +            }) +    return st.builds (lambda x: URL.build (**x), args) + +def urlsStr (): +    return st.builds (lambda x: str (x), urls ()) + +asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126)) + +def chromeHeaders (): +    # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6 +    token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~') +    # XXX: the value should be asciiText without leading/trailing spaces +    return st.dictionaries (token, token) + +def fixedDicts (fixed, dynamic): +    # dict.update returns None, so update and return the dict explicitly +    def merge (x, y): +        x.update (y) +        return x +    return st.builds (merge, st.fixed_dictionaries (fixed), st.lists (dynamic)) + +def chromeRequestWillBeSent (reqid, url): +    methodSt = st.sampled_from (['GET', 'POST', 'PUT', 'DELETE']) +    return st.fixed_dictionaries ({ +            'requestId': reqid, +            'initiator': st.just ('Test'), +            'wallTime': timestamp, +            'timestamp': timestamp, +            'request': st.fixed_dictionaries ({ +                'url': url, +                'method': methodSt, +                'headers': chromeHeaders (), +                # XXX: postData, hasPostData +                }) +            }) + +def chromeResponseReceived (reqid, url): +    mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) +    remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1')) +    protocolSt = st.one_of (st.none (), st.just ('h2')) +    statusCodeSt = st.integers (min_value=100, max_value=999) +    typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media', +            'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource', +            'WebSocket', 'Manifest', 'SignedExchange', 'Ping', +            'CSPViolationReport', 'Other']) +    return st.fixed_dictionaries ({ +            'requestId': reqid, +            'timestamp': 
timestamp, +            'type': typeSt, +            'response': st.fixed_dictionaries ({ +                'url': url, +                'requestHeaders': chromeHeaders (), # XXX: make this optional +                'headers': chromeHeaders (), +                'status': statusCodeSt, +                'statusText': asciiText, +                'mimeType': mimeTypeSt, +                'remoteIPAddress': remoteIpAddressSt, +                'protocol': protocolSt, +                }) +            }) + +def chromeReqResp (): +    # XXX: will this generate the same url for all testcases? +    reqid = st.shared (st.text (), 'reqresp') +    url = st.shared (urlsStr (), 'reqresp') +    return st.tuples (chromeRequestWillBeSent (reqid, url), +            chromeResponseReceived (reqid, url)) + +def requestResponsePair (): +    def f (creq, cresp, hasPostData, reqBody, respBody): +        i = RequestResponsePair () +        i.fromRequestWillBeSent (creq) +        i.request.hasPostData = hasPostData +        if hasPostData: +            i.request.body = reqBody + +        if cresp is not None: +            i.fromResponseReceived (cresp) +            if respBody is not None: +                i.response.body = respBody +        return i + +    bodySt = st.one_of ( +            st.none (), +            st.builds (UnicodeBody, st.text ()), +            st.builds (Base64Body.fromBytes, st.binary ()) +            ) +    return st.builds (lambda reqresp, hasPostData, reqBody, respBody: +            f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody), +            chromeReqResp (), st.booleans (), bodySt, bodySt) + +@given(chromeReqResp ()) +def test_requestResponsePair (creqresp): +    creq, cresp = creqresp + +    item = RequestResponsePair () + +    assert item.id is None +    assert item.url is None +    assert item.request is None +    assert item.response is None + +    item.fromRequestWillBeSent (creq) + +    assert item.id == creq['requestId'] +    url = URL (creq['request']['url']) +    assert item.url == url +    assert item.request is not None +    assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime']) +    assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ()) +    assert item.response is None + +    item.fromResponseReceived (cresp) + +    # url will not be overwritten +    assert item.id == creq['requestId'] == cresp['requestId'] +    assert item.url == url +    assert item.request is not None +    assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ()) +    assert item.response is not None +    assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ()) +    assert (item.response.timestamp - item.request.timestamp).total_seconds () - \ +            (cresp['timestamp'] - creq['timestamp']) < 10e-6 + +@given(chromeReqResp ()) +def test_requestResponsePair_eq (creqresp): +    creq, cresp = creqresp + +    item = RequestResponsePair () +    item2 = RequestResponsePair () +    assert item == item +    assert item == item2 + +    item.fromRequestWillBeSent (creq) +    assert item != item2 +    item2.fromRequestWillBeSent (creq) +    assert item == item +    assert item == item2 + +    item.fromResponseReceived (cresp) +    assert item != item2 +    item2.fromResponseReceived (cresp) +    assert item == item +    assert item == item2 + +    # XXX: test for inequality with different parameters + +### Google Chrome integration tests ### + +serverUrl = URL.build (scheme='http', 
host='localhost', port=8080) +items = [ +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/utf-8'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/latin1'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/utf-16'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/ISO-8859-1'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/status/200'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]), +            body=b'', +            mimeType='text/plain'), +        ), +    # redirects never have a response body +    RequestResponsePair ( +        url=serverUrl.with_path ('/status/301'), +        request=Request (method='GET'), +        response=Response (status=301, +            headers=CIMultiDict ([('Content-Type', 'text/plain'), +                ('Location', str (serverUrl.with_path ('/status/301/redirected')))]), +            body=None, +            mimeType='text/plain'), +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/image/png'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]), +            body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), +            mimeType='image/png'), +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/script/alert'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), +            body=UnicodeBody ('''<html><body><script> +window.addEventListener("beforeunload", function (e) { +    e.returnValue = "bye?"; +    return e.returnValue; +}); +alert("stopping here"); +if (confirm("are you sure?") || prompt ("42?")) { +    window.location = "/nonexistent"; +} +</script></body></html>'''), mimeType='text/html') +        ), +    ] + +@pytest.mark.asyncio +# would be nice if we could use hypothesis here somehow +@pytest.mark.parametrize("golden", items) +async def test_integration_item (loader, golden): +    async def f (req): +        body = golden.response.body +        contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else '' +        charsetOff = contentType.find ('charset=') +        if isinstance (body, UnicodeBody) and charsetOff != -1: +          
  encoding = contentType[charsetOff+len ('charset='):] +            body = golden.response.body.decode ('utf-8').encode (encoding) +        return web.Response (body=body, status=golden.response.status, +                headers=golden.response.headers) + +    app = web.Application () +    app.router.add_route (golden.request.method, golden.url.path, f) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, serverUrl.host, serverUrl.port) +    try: +        await site.start() +    except Exception as e: +        pytest.skip (e) + +    haveReqResp = False +    haveNavigated = False +    try: +        await loader.navigate (golden.url) + +        it = loader.__aiter__ () +        while True: +            try: +                item = await asyncio.wait_for (it.__anext__ (), timeout=1) +            except asyncio.TimeoutError: +                break +            # XXX: can only check the first req/resp right now (due to redirect) +            if isinstance (item, RequestResponsePair) and not haveReqResp: +                # we do not know this in advance +                item.request.initiator = None +                item.request.headers = None +                item.remoteIpAddress = None +                item.protocol = None +                item.resourceType = None + +                if item.response: +                    assert item.response.statusText is not None +                    item.response.statusText = None + +                    del item.response.headers['server'] +                    del item.response.headers['content-length'] +                    del item.response.headers['date'] +                assert item == golden +                haveReqResp = True +            elif isinstance (item, FrameNavigated): +                # XXX: can’t check this, because of the redirect +                #assert item.url == golden.url +                haveNavigated = True +    finally: +        assert haveReqResp +        assert haveNavigated +        await runner.cleanup () + +def test_page_idle (): +    for v in (True, False): +        idle = PageIdle (v) +        assert bool (idle) == v + diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py new file mode 100644 index 0000000..7216a42 --- /dev/null +++ b/crocoite/test_controller.py @@ -0,0 +1,203 @@ +# Copyright (c) 2017–2018 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +import asyncio + +from yarl import URL +from aiohttp import web + +import pytest + +from .logger import Logger +from .controller import ControllerSettings, SinglePageController, SetEntry, \ +        IdleStateTracker +from .browser import PageIdle +from .devtools import Process +from .test_browser import loader + +@pytest.mark.asyncio +async def test_controller_timeout (): +    """ Make sure the controller terminates, even if the site keeps reloading/fetching stuff """ + +    async def f (req): +        return web.Response (body="""<html> +<body> +<p>hello</p> +<script> +window.setTimeout (function () { window.location = '/' }, 250); +window.setInterval (function () { fetch('/').then (function (e) { console.log (e) }) }, 150); +</script> +</body> +</html>""", status=200, content_type='text/html', charset='utf-8') + +    url = URL.build (scheme='http', host='localhost', port=8080) +    app = web.Application () +    app.router.add_route ('GET', '/', f) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, url.host, url.port) +    await site.start() + +    loop = asyncio.get_event_loop () +    try: +        logger = Logger () +        settings = ControllerSettings (idleTimeout=1, timeout=5) +        controller = SinglePageController (url=url, logger=logger, +                service=Process (), behavior=[], settings=settings) +        # give the controller a little more time to finish, since there are +        # hard-coded asyncio.sleep calls in there right now. +        # XXX fix this +        before = loop.time () +        await asyncio.wait_for (controller.run (), timeout=settings.timeout*2) +        after = loop.time () +        assert after-before >= settings.timeout, (settings.timeout*2, after-before) +    finally: +        # give the browser some time to close before interrupting the +        # connection by destroying the HTTP server +        await asyncio.sleep (1) +        await runner.cleanup () + +@pytest.mark.asyncio +async def test_controller_idle_timeout (): +    """ Make sure the controller terminates once the page goes idle, well before the hard timeout """ + +    async def f (req): +        return web.Response (body="""<html> +<body> +<p>hello</p> +<script> +window.setInterval (function () { fetch('/').then (function (e) { console.log (e) }) }, 2000); +</script> +</body> +</html>""", status=200, content_type='text/html', charset='utf-8') + +    url = URL.build (scheme='http', host='localhost', port=8080) +    app = web.Application () +    app.router.add_route ('GET', '/', f) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, url.host, url.port) +    await site.start() + +    loop = asyncio.get_event_loop () +    try: +        logger = Logger () +        settings = ControllerSettings (idleTimeout=1, timeout=60) +        controller = SinglePageController (url=url, logger=logger, +                service=Process (), behavior=[], settings=settings) +        before = loop.time () +        await asyncio.wait_for (controller.run (), settings.timeout*2) +        after = loop.time () +        assert settings.idleTimeout <= after-before <= settings.idleTimeout*2+3 +    finally: +        await runner.cleanup () + +def test_set_entry (): +    a = SetEntry (1, a=2, b=3) +    assert a == a +    assert hash (a) == hash (a) + +    b = SetEntry (1, a=2, b=4) +    assert a == b +    assert hash (a) == hash (b) + +    c = SetEntry (2, a=2, b=3) +    assert a != c +    assert hash (a) != hash (c) + +@pytest.mark.asyncio 
+async def test_idle_state_tracker (): +    # default is idle +    loop = asyncio.get_event_loop () +    idle = IdleStateTracker (loop) +    assert idle._idle + +    # idle change +    await idle.push (PageIdle (False)) +    assert not idle._idle + +    # nothing happens for other objects +    await idle.push ({}) +    assert not idle._idle + +    # no state change -> wait does not return +    with pytest.raises (asyncio.TimeoutError): +        await asyncio.wait_for (idle.wait (0.1), timeout=1) + +    # wait at least timeout +    delta = 0.2 +    timeout = 1 +    await idle.push (PageIdle (True)) +    assert idle._idle +    start = loop.time () +    await idle.wait (timeout) +    end = loop.time () +    assert (timeout-delta) < (end-start) < (timeout+delta) + +@pytest.fixture +async def recordingServer (): +    """ Simple HTTP server that records raw requests """ +    url = URL ('http://localhost:8080') +    reqs = [] +    async def record (request): +        reqs.append (request) +        return web.Response(text='ok', content_type='text/plain') +    app = web.Application() +    app.add_routes([web.get(url.path, record)]) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite (runner, url.host, url.port) +    await site.start() +    yield url, reqs +    await runner.cleanup () + +from .test_devtools import tab, browser +from http.cookies import Morsel, SimpleCookie + +@pytest.mark.asyncio +async def test_set_cookies (tab, recordingServer): +    """ Make sure cookies are set properly and only affect the domain they were +    set for """ + +    logger = Logger () + +    url, reqs = recordingServer + +    cookies = [] +    c = Morsel () +    c.set ('foo', 'bar', '') +    c['domain'] = 'localhost' +    cookies.append (c) +    c = Morsel () +    c.set ('buz', 'beef', '') +    c['domain'] = 'nonexistent.example' +    cookies.append (c) + +    settings = ControllerSettings (idleTimeout=1, timeout=60, cookies=cookies) +    controller = SinglePageController (url=url, logger=logger, +            service=Process (), behavior=[], settings=settings) +    await asyncio.wait_for (controller.run (), settings.timeout*2) + +    assert len (reqs) == 1 +    req = reqs[0] +    reqCookies = SimpleCookie (req.headers['cookie']) +    assert len (reqCookies) == 1 +    c = next (iter (reqCookies.values ())) +    assert c.key == cookies[0].key +    assert c.value == cookies[0].value diff --git a/crocoite/test_devtools.py b/crocoite/test_devtools.py new file mode 100644 index 0000000..bd1a828 --- /dev/null +++ b/crocoite/test_devtools.py @@ -0,0 +1,208 @@ +# Copyright (c) 2017 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import asyncio +import pytest + +from aiohttp import web +import websockets + +from .devtools import Browser, Tab, MethodNotFound, Crashed, \ +        InvalidParameter, Process, Passthrough + +@pytest.fixture +async def browser (): +    async with Process () as url: +        yield Browser (url) + +@pytest.fixture +async def tab (browser): +    async with browser as tab: +        yield tab +        # make sure there are no transactions left over (i.e. no unawaited requests) +        assert not tab.transactions + +docBody = "<html><body><p>Hello, world</p></body></html>" +async def hello(request): +    return web.Response(text=docBody, content_type='text/html') + +@pytest.fixture +async def server (): +    """ Simple HTTP server for testing notifications """ +    app = web.Application() +    app.add_routes([web.get('/', hello)]) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, 'localhost', 8080) +    await site.start() +    yield app +    await runner.cleanup () + +@pytest.mark.asyncio +async def test_tab_create (tab): +    """ Creating tabs works """ +    assert isinstance (tab, Tab) +    version = await tab.Browser.getVersion () +    assert 'protocolVersion' in version +    assert tab.pending == 0 + +@pytest.mark.asyncio +async def test_tab_close (browser): +    """ Tabs are closed after using them """ +    async with browser as tab: +        tid = tab.id +    # give the browser some time to close the tab +    await asyncio.sleep (0.5) +    tabs = [t['id'] async for t in browser] +    assert tid not in tabs + +@pytest.mark.asyncio +async def test_tab_notify_enable_disable (tab): +    """ Make sure enabling/disabling notifications works for all known +    namespaces """ +    for name in ('Debugger', 'DOM', 'Log', 'Network', 'Page', 'Performance', +            'Profiler', 'Runtime', 'Security'): +        f = getattr (tab, name) +        await f.enable () +        await f.disable () + +@pytest.mark.asyncio +async def test_tab_unknown_method (tab): +    with pytest.raises (MethodNotFound): +        await tab.Nonexistent.foobar () + +@pytest.mark.asyncio +async def test_tab_invalid_argument (tab): +    # should be string +    with pytest.raises (InvalidParameter): +        await tab.Page.captureScreenshot (format=123) + +    with pytest.raises (InvalidParameter): +        await tab.Page.captureScreenshot (format=[123]) + +    with pytest.raises (InvalidParameter): +        await tab.Page.captureScreenshot (format={123: '456'}) + +@pytest.mark.asyncio +async def test_tab_crash (tab): +    with pytest.raises (Crashed): +        await tab.Page.crash () + +    # calling anything else now should fail as well +    with pytest.raises (Crashed): +        await tab.Browser.getVersion () + +@pytest.mark.asyncio +async def test_load (tab, server): +    await tab.Network.enable () +    await tab.Page.navigate (url='http://localhost:8080') + +    haveRequest = False +    haveResponse = False +    haveData = False +    haveFinished = False +    haveBody = False +    req = None +    resp = None +    while not haveBody: +        method, data = await tab.get () + +        # it can be either of those two in no specified order +        if method in (tab.Network.requestWillBeSent, 
tab.Network.requestWillBeSentExtraInfo) and not haveResponse: +            if req is None: +                req = data +            assert data['requestId'] == req['requestId'] +            haveRequest = True +        elif method in (tab.Network.responseReceived, tab.Network.responseReceivedExtraInfo) and haveRequest: +            if resp is None: +                resp = data +            assert data['requestId'] == resp['requestId'] +            haveResponse = True +        elif haveRequest and haveResponse and method == tab.Network.dataReceived: +            assert data['dataLength'] == len (docBody) +            assert data['requestId'] == req['requestId'] +            haveData = True +        elif haveData: +            assert method == tab.Network.loadingFinished +            assert data['requestId'] == req['requestId'] +            haveFinished = True +            # the response body only becomes available once loading finished +            body = await tab.Network.getResponseBody (requestId=req['requestId']) +            assert body['body'] == docBody +            haveBody = True +        else: +            assert False, (method, req) + +    await tab.Network.disable () +    assert tab.pending == 0 + +@pytest.mark.asyncio +async def test_recv_failure(browser): +    """ Inject failure into receiver process and crash it """ +    async with browser as tab: +        await tab.ws.close () +        with pytest.raises (Crashed): +            await tab.Browser.getVersion () + +    async with browser as tab: +        await tab.ws.close () +        with pytest.raises (Crashed): +            await tab.get () + +    async with browser as tab: +        handle = asyncio.ensure_future (tab.get ()) +        await tab.ws.close () +        with pytest.raises (Crashed): +            await handle + +@pytest.mark.asyncio +async def test_tab_function (tab): +    assert tab.Network.enable.name == 'Network.enable' +    assert tab.Network.disable == tab.Network.disable +    assert tab.Network.enable != tab.Network.disable +    assert tab.Network != tab.Network.enable +    assert callable (tab.Network.enable) +    assert not callable (tab.Network.enable.name) +    assert 'Network.enable' in repr (tab.Network.enable) + +@pytest.mark.asyncio +async def test_tab_function_hash (tab): +    d = {tab.Network.enable: 1, tab.Network.disable: 2, tab.Page: 3, +            tab.Page.enable: 4} +    assert len (d) == 4 + +@pytest.mark.asyncio +async def test_ws_ping(tab): +    """ +    Chrome does not like websocket pings and closes the connection if it +    receives one. Not sure why. 
+    """ +    with pytest.raises (Crashed): +        await tab.ws.ping () +        await tab.Browser.getVersion () + +@pytest.mark.asyncio +async def test_passthrough (): +    """ Null service returns the url as is """ + +    url = 'http://localhost:12345' +    async with Passthrough (url) as u: +        assert str (u) == url + diff --git a/crocoite/test_html.py b/crocoite/test_html.py new file mode 100644 index 0000000..c17903b --- /dev/null +++ b/crocoite/test_html.py @@ -0,0 +1,96 @@ +# Copyright (c) 2017 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import asyncio +import pytest, html5lib +from html5lib.serializer import HTMLSerializer +from html5lib.treewalkers import getTreeWalker +from aiohttp import web + +from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker +from .test_devtools import tab, browser + +def test_strip_tag (): +    d = html5lib.parse ('<a>barbaz<b>foobar</b>.</a><b>foobar</b>.<b attr=1><c></c>') +    stream = StripTagFilter (getTreeWalker ('etree')(d), ['b', 'c']) +    serializer = HTMLSerializer () +    assert serializer.render (stream) == '<a>barbaz.</a>.' 
+ +def test_strip_attribute (): +    d = html5lib.parse ('<a b=1 c="yes" d></a><br b=2 c="no" d keep=1>') +    stream = StripAttributeFilter (getTreeWalker ('etree')(d), ['b', 'c', 'd']) +    serializer = HTMLSerializer () +    assert serializer.render (stream) == '<a></a><br keep=1>' + +@pytest.mark.asyncio +async def test_treewalker (tab): +    frames = await tab.Page.getFrameTree () + +    framehtml = '<HTML><HEAD></HEAD><BODY></BODY></HTML>' +    html = '<HTML><HEAD><META charset=utf-8></HEAD><BODY><H1>Hello</H1><!-- comment --><IFRAME></IFRAME></BODY></HTML>' +    rootframe = frames['frameTree']['frame']['id'] +    await tab.Page.setDocumentContent (frameId=rootframe, html=html) + +    dom = await tab.DOM.getDocument (depth=-1, pierce=True) +    docs = list (ChromeTreeWalker (dom['root']).split ()) +    assert len(docs) == 2 +    for i, doc in enumerate (docs): +        walker = ChromeTreeWalker (doc) +        serializer = HTMLSerializer () +        result = serializer.render (iter(walker)) +        if i == 0: +            assert result == html +        elif i == 1: +            assert result == framehtml + +cdataDoc = '<test><![CDATA[Hello world]]></test>' +xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>' +async def hello(request): +    return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml') + +@pytest.fixture +async def server (): +    """ Simple HTTP server serving an XML document """ +    app = web.Application() +    app.add_routes([web.get('/test.xml', hello)]) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, 'localhost', 8080) +    await site.start() +    yield app +    await runner.cleanup () + +@pytest.mark.asyncio +async def test_treewalker_cdata (tab, server): +    ret = await tab.Page.navigate (url='http://localhost:8080/test.xml') +    # wait until loaded XXX: replace with idle check +    await asyncio.sleep (0.5) +    dom = await tab.DOM.getDocument (depth=-1, pierce=True) +    docs = list (ChromeTreeWalker (dom['root']).split ()) +    assert len(docs) == 1 +    for i, doc in enumerate (docs): +        walker = ChromeTreeWalker (doc) +        serializer = HTMLSerializer () +        result = serializer.render (iter(walker)) +        # chrome will display a pretty-printed viewer *plus* the original +        # source (stripped of its xml header) +        assert cdataDoc in result + + diff --git a/crocoite/test_irc.py b/crocoite/test_irc.py new file mode 100644 index 0000000..9344de4 --- /dev/null +++ b/crocoite/test_irc.py @@ -0,0 +1,70 @@ +# Copyright (c) 2017–2018 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import pytest +from .irc import ArgparseBot, RefCountEvent, User, NickMode + +def test_mode_parse (): +    assert ArgparseBot.parseMode ('+a') == [('+', 'a')] +    assert ArgparseBot.parseMode ('+ab') == [('+', 'a'), ('+', 'b')] +    assert ArgparseBot.parseMode ('+a+b') == [('+', 'a'), ('+', 'b')] +    assert ArgparseBot.parseMode ('-a') == [('-', 'a')] +    assert ArgparseBot.parseMode ('-ab') == [('-', 'a'), ('-', 'b')] +    assert ArgparseBot.parseMode ('-a-b') == [('-', 'a'), ('-', 'b')] +    assert ArgparseBot.parseMode ('+a-b') == [('+', 'a'), ('-', 'b')] +    assert ArgparseBot.parseMode ('-a+b') == [('-', 'a'), ('+', 'b')] +    assert ArgparseBot.parseMode ('-ab+cd') == [('-', 'a'), ('-', 'b'), ('+', 'c'), ('+', 'd')] + +@pytest.fixture +def event (): +    return RefCountEvent () + +def test_refcountevent_arm (event): +    event.arm () +    assert event.event.is_set () + +def test_refcountevent_ctxmgr (event): +    with event: +        assert event.count == 1 +        with event: +            assert event.count == 2 + +def test_refcountevent_arm_with (event): +    with event: +        event.arm () +        assert not event.event.is_set () +    assert event.event.is_set () + +def test_nick_mode (): +    a = User.fromName ('a') +    a2 = User.fromName ('a') +    a3 = User.fromName ('+a') +    b = User.fromName ('+b') +    c = User.fromName ('@c') + +    # equality is based on name only, not mode +    assert a == a2 +    assert a == a3 +    assert a != b + +    assert a.hasPriv (None) and not a.hasPriv (NickMode.voice) and not a.hasPriv (NickMode.operator) +    assert b.hasPriv (None) and b.hasPriv (NickMode.voice) and not b.hasPriv (NickMode.operator) +    assert c.hasPriv (None) and c.hasPriv (NickMode.voice) and c.hasPriv (NickMode.operator) + diff --git a/crocoite/test_logger.py b/crocoite/test_logger.py index 8a34aab..26e420a 100644 --- a/crocoite/test_logger.py +++ b/crocoite/test_logger.py @@ -1,9 +1,9 @@  import pytest -from .logger import Logger, Consumer, NullConsumer +from .logger import Logger, Consumer, NullConsumer, Level, DatetimeConsumer  @pytest.fixture  def logger (): -    return Logger (consumer=[NullConsumer ()]) +    return Logger (consumer=[NullConsumer (), DatetimeConsumer ()])  class QueueConsumer (Consumer):      def __init__ (self): @@ -58,3 +58,34 @@ def test_consumer (logger):      assert ret['foo'] == 'bar'      assert ret['inherit'] == 1 +def test_multiarg (logger): +    # single argument +    ret = logger.debug('maybe', foo='bar') +    assert ret['msg'] == 'maybe' +    assert ret['foo'] == 'bar' + +    # multi arguments +    ret = logger.debug('may', 'be', foo='bar') +    assert ret['msg'] == ('may', 'be') +    assert ret['foo'] == 'bar' + +def test_call (logger): +    for level in ('debug', Level.DEBUG): +        ret = logger(level, 'arg1', 'arg2', foo='bar') +        assert ret['level'] == Level.DEBUG +        assert ret['msg'] == ('arg1', 'arg2') +        assert ret['foo'] == 'bar' + +def test_datetime (logger): +    ret = logger.debug() +    assert 'date' in ret + +def test_independence (): +    """ Make sure two instances are completely independent """ +    l1 = Logger () +    c = QueueConsumer () +    l1.connect (c) +    l2 = Logger () +    l2.info (nothing='nothing') +    
assert not c.data
+
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py
new file mode 100644
index 0000000..416b954
--- /dev/null
+++ b/crocoite/test_tools.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2018 crocoite contributors
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+from tempfile import NamedTemporaryFile
+from operator import itemgetter
+from io import BytesIO
+import pytest
+from warcio.archiveiterator import ArchiveIterator
+from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
+from pkg_resources import parse_version
+
+from .tools import mergeWarc, Errata, FixableErrata
+
+@pytest.fixture
+def writer():
+    return WARCWriter (NamedTemporaryFile(), gzip=True)
+
+def recordsEqual(golden, underTest):
+    for a, b in zip (golden, underTest):
+        # record ids are not predictable, so we cannot compare them. Ditto for
+        # dates. Content-* seems to be added when writing to file.
+        for x in {'WARC-Record-Id', 'WARC-Block-Digest', 'WARC-Date',
+                'Content-Length', 'Content-Type'}:
+            a.rec_headers.remove_header(x)
+            b.rec_headers.remove_header(x)
+        aheader = sorted(a.rec_headers.headers, key=itemgetter(0))
+        bheader = sorted(b.rec_headers.headers, key=itemgetter(0))
+        assert aheader == bheader
+        assert a.http_headers == b.http_headers
+
+def makeGolden(writer, records):
+    # additional warcinfo is written. Content does not matter.
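+    # (mergeWarc prepends exactly one warcinfo record describing the merge;
+    # mirror that here so recordsEqual’s zip() lines both iterators up.)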
+    record = writer.create_warc_record ( +            '', +            'warcinfo', +            payload=b'', +            warc_headers_dict={'Content-Type': 'application/json; charset=utf-8'}) +    records.insert (0, record) +    return records + +def test_unmodified(writer): +    """ +    Single request/response pair, no revisits +    """ + +    records = [] + +    httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) +    warcHeaders = {} +    record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), +            warc_headers_dict=warcHeaders, http_headers=httpHeaders) +    records.append (record) + +    httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') +    record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), +            warc_headers_dict=warcHeaders, http_headers=httpHeaders) +    records.append (record) + +    for r in records: +        writer.write_record (r) + +    output = NamedTemporaryFile() +    mergeWarc ([writer.out.name], output) + +    output.seek(0) +    recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def test_different_payload(writer): +    """ +    Duplicate URL, but different payload +    """ + +    records = [] +    for i in range (2): +        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) +        warcHeaders = {} +        record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), +                warc_headers_dict=warcHeaders, http_headers=httpHeaders) +        records.append (record) + +        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') +        record = writer.create_warc_record ('http://example.com/', 'response', +                payload=BytesIO(f'data{i}'.encode ('utf8')), +                warc_headers_dict=warcHeaders, http_headers=httpHeaders) +        records.append (record) + +    for r in records: +        writer.write_record (r) + +    output = NamedTemporaryFile() +    mergeWarc ([writer.out.name], output) + +    output.seek(0) +    recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def makeRevisit(writer, ref, dup): +    """ Make revisit record for reference """ +    dupHeaders = dup.rec_headers +    refHeaders = ref.rec_headers +    record = writer.create_revisit_record (dupHeaders.get_header('WARC-Target-URI'), +            digest=refHeaders.get_header('WARC-Payload-Digest'), +            refers_to_uri=refHeaders.get_header('WARC-Target-URI'), +            refers_to_date=refHeaders.get_header('WARC-Date'), +            http_headers=dup.http_headers) +    record.rec_headers.add_header ('WARC-Refers-To', refHeaders.get_header('WARC-Record-ID')) +    record.rec_headers.add_header ('WARC-Truncated', 'length') +    return record + +def test_resp_revisit_same_url(writer): +    """ +    Duplicate record for the same URL, creates a revisit +    """ + +    records = [] +    for i in range (2): +        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) +        warcHeaders = {} +        record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), +                warc_headers_dict=warcHeaders, http_headers=httpHeaders) +        records.append (record) + +        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') +        record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), +                
warc_headers_dict=warcHeaders, http_headers=httpHeaders) +        records.append (record) + +    for r in records: +        writer.write_record (r) + +    dup = records.pop () +    ref = records[1] +    records.append (makeRevisit (writer, ref, dup)) + +    output = NamedTemporaryFile() +    mergeWarc ([writer.out.name], output) + +    output.seek(0) +    recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def test_resp_revisit_other_url(writer): +    """ +    Duplicate record for different URL, creates a revisit +    """ + +    records = [] + +    httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) +    warcHeaders = {} +    record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), +            warc_headers_dict=warcHeaders, http_headers=httpHeaders) +    records.append (record) + +    httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') +    record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), +            warc_headers_dict=warcHeaders, http_headers=httpHeaders) +    records.append (record) + +    httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) +    warcHeaders = {} +    record = writer.create_warc_record ('http://example.com/one', 'request', payload=BytesIO(b'foobar'), +            warc_headers_dict=warcHeaders, http_headers=httpHeaders) +    records.append (record) + +    httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') +    record = writer.create_warc_record ('http://example.com/one', 'response', payload=BytesIO(b'data'), +            warc_headers_dict=warcHeaders, http_headers=httpHeaders) +    records.append (record) + +    for r in records: +        writer.write_record (r) + +    dup = records.pop () +    ref = records[1] +    records.append (makeRevisit (writer, ref, dup)) + +    output = NamedTemporaryFile() +    mergeWarc ([writer.out.name], output) + +    output.seek(0) +    recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) + +def test_errata_contains(): +    """ Test version matching """ +    e = Errata('some-uuid', 'description', ['a<1.0']) +    assert {'a': parse_version('0.1')} in e +    assert {'a': parse_version('1.0')} not in e +    assert {'b': parse_version('1.0')} not in e + +    e = Errata('some-uuid', 'description', ['a<1.0,>0.1']) +    assert {'a': parse_version('0.1')} not in e +    assert {'a': parse_version('0.2')} in e +    assert {'a': parse_version('1.0')} not in e + +    # a AND b +    e = Errata('some-uuid', 'description', ['a<1.0', 'b>1.0']) +    assert {'a': parse_version('0.1')} not in e +    assert {'b': parse_version('1.1')} not in e +    assert {'a': parse_version('0.1'), 'b': parse_version('1.1')} in e + +def test_errata_fixable (): +    e = Errata('some-uuid', 'description', ['a<1.0', 'b>1.0']) +    assert not e.fixable + +    e = FixableErrata('some-uuid', 'description', ['a<1.0', 'b>1.0']) +    assert e.fixable + diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py new file mode 100644 index 0000000..3ec310c --- /dev/null +++ b/crocoite/test_warc.py @@ -0,0 +1,225 @@ +# Copyright (c) 2018 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of 
the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +from tempfile import NamedTemporaryFile +import json, urllib +from operator import itemgetter + +from warcio.archiveiterator import ArchiveIterator +from yarl import URL +from multidict import CIMultiDict +from hypothesis import given, reproduce_failure +import hypothesis.strategies as st +import pytest + +from .warc import WarcHandler +from .logger import Logger, WarcHandlerConsumer +from .controller import ControllerStart +from .behavior import Script, ScreenshotEvent, DomSnapshotEvent +from .browser import RequestResponsePair, Base64Body, UnicodeBody +from .test_browser import requestResponsePair, urls + +def test_log (): +    logger = Logger () + +    with NamedTemporaryFile() as fd: +        with WarcHandler (fd, logger) as handler: +            warclogger = WarcHandlerConsumer (handler) +            logger.connect (warclogger) +            golden = [] + +            assert handler.log.tell () == 0 +            golden.append (logger.info (foo=1, bar='baz', encoding='äöü⇔ΓΨ')) +            assert handler.log.tell () != 0 + +            handler.maxLogSize = 0 +            golden.append (logger.info (bar=1, baz='baz')) +            # should flush the log +            assert handler.log.tell () == 0 + +        fd.seek (0) +        for it in ArchiveIterator (fd): +            headers = it.rec_headers +            assert headers['warc-type'] == 'metadata' +            assert 'warc-target-uri' not in headers +            assert headers['x-crocoite-type'] == 'log' +            assert headers['content-type'] == f'application/json; charset={handler.logEncoding}' + +            while True: +                l = it.raw_stream.readline () +                if not l: +                    break +                data = json.loads (l.strip ()) +                assert data == golden.pop (0) + +def jsonObject (): +    """ JSON-encodable objects """ +    return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ())) + +def viewport (): +    return st.builds (lambda x, y: f'{x}x{y}', st.integers (), st.integers ()) + +def event (): +    return st.one_of ( +            st.builds (ControllerStart, jsonObject ()), +            st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())), +            st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()), +            st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()), +            requestResponsePair (), +            ) + +@pytest.mark.asyncio +@given (st.lists (event ())) +async def test_push (golden): +    def checkWarcinfoId (headers): +        if lastWarcinfoRecordid is not None: +            assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid + +    lastWarcinfoRecordid = None + +    # null logger +    logger = Logger () +    
with open('/tmp/test.warc.gz', 'w+b') as fd: +        with WarcHandler (fd, logger) as handler: +            for g in golden: +                await handler.push (g) + +        fd.seek (0) +        it = iter (ArchiveIterator (fd)) +        for g in golden: +            if isinstance (g, ControllerStart): +                rec = next (it) + +                headers = rec.rec_headers +                assert headers['warc-type'] == 'warcinfo' +                assert 'warc-target-uri' not in headers +                assert 'x-crocoite-type' not in headers + +                data = json.load (rec.raw_stream) +                assert data == g.payload + +                lastWarcinfoRecordid = headers['warc-record-id'] +                assert lastWarcinfoRecordid +            elif isinstance (g, Script): +                rec = next (it) + +                headers = rec.rec_headers +                assert headers['warc-type'] == 'resource' +                assert headers['content-type'] == 'application/javascript; charset=utf-8' +                assert headers['x-crocoite-type'] == 'script' +                checkWarcinfoId (headers) +                if g.path: +                    assert URL (headers['warc-target-uri']) == URL ('file://' + g.abspath) +                else: +                    assert 'warc-target-uri' not in headers + +                data = rec.raw_stream.read ().decode ('utf-8') +                assert data == g.data +            elif isinstance (g, ScreenshotEvent): +                # XXX: check refers-to header +                rec = next (it) + +                headers = rec.rec_headers +                assert headers['warc-type'] == 'conversion' +                assert headers['x-crocoite-type'] == 'screenshot' +                checkWarcinfoId (headers) +                assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url) +                assert headers['warc-refers-to'] is None +                assert int (headers['X-Crocoite-Screenshot-Y-Offset']) == g.yoff + +                assert rec.raw_stream.read () == g.data +            elif isinstance (g, DomSnapshotEvent): +                rec = next (it) + +                headers = rec.rec_headers +                assert headers['warc-type'] == 'conversion' +                assert headers['x-crocoite-type'] == 'dom-snapshot' +                checkWarcinfoId (headers) +                assert URL (headers['warc-target-uri']) == g.url +                assert headers['warc-refers-to'] is None + +                assert rec.raw_stream.read () == g.document +            elif isinstance (g, RequestResponsePair): +                rec = next (it) + +                # request +                headers = rec.rec_headers +                assert headers['warc-type'] == 'request' +                assert 'x-crocoite-type' not in headers +                checkWarcinfoId (headers) +                assert URL (headers['warc-target-uri']) == g.url +                assert headers['x-chrome-request-id'] == g.id +                 +                assert CIMultiDict (rec.http_headers.headers) == g.request.headers +                if g.request.hasPostData: +                    if g.request.body is not None: +                        assert rec.raw_stream.read () == g.request.body +                    else: +                        # body fetch failed +                        assert headers['warc-truncated'] == 'unspecified' +                        assert not rec.raw_stream.read () +                else: +                   
 assert not rec.raw_stream.read () + +                # response +                if g.response: +                    rec = next (it) +                    headers = rec.rec_headers +                    httpheaders = rec.http_headers +                    assert headers['warc-type'] == 'response' +                    checkWarcinfoId (headers) +                    assert URL (headers['warc-target-uri']) == g.url +                    assert headers['x-chrome-request-id'] == g.id +                    assert 'x-crocoite-type' not in headers + +                    # these are checked separately +                    filteredHeaders = CIMultiDict (httpheaders.headers) +                    for b in {'content-type', 'content-length'}: +                        if b in g.response.headers: +                            g.response.headers.popall (b) +                        if b in filteredHeaders: +                            filteredHeaders.popall (b) +                    assert filteredHeaders == g.response.headers + +                    expectedContentType = g.response.mimeType +                    if expectedContentType is not None: +                        assert httpheaders['content-type'].startswith (expectedContentType) + +                    if g.response.body is not None: +                        assert rec.raw_stream.read () == g.response.body +                        assert httpheaders['content-length'] == str (len (g.response.body)) +                        # body is never truncated if it exists +                        assert headers['warc-truncated'] is None + +                        # unencoded strings are converted to utf8 +                        if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None: +                            assert httpheaders['content-type'].endswith ('; charset=utf-8') +                    else: +                        # body fetch failed +                        assert headers['warc-truncated'] == 'unspecified' +                        assert not rec.raw_stream.read () +                        # content-length header should be kept intact +            else: +                assert False, f"invalid golden type {type(g)}" # pragma: no cover + +        # no further records +        with pytest.raises (StopIteration): +            next (it) + diff --git a/crocoite/tools.py b/crocoite/tools.py index 3aeaaad..a2ddaa3 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,64 +22,117 @@  Misc tools  """ -import shutil, sys, re, os, logging, argparse +import shutil, sys, os, logging, argparse, json +from io import BytesIO +  from warcio.archiveiterator import ArchiveIterator  from warcio.warcwriter import WARCWriter +from yarl import URL -def mergeWarc (): -    """ -    Merge multiple WARC files into a single file, writing revisit records for -    items which occur multiple times -    """ - -    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') -    parser.add_argument('--verbose', '-v', action='store_true') -    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') +from pkg_resources import parse_version, parse_requirements -    args = parser.parse_args() -    loglevel = logging.DEBUG if args.verbose else logging.INFO -    logging.basicConfig (level=loglevel) +from .util import getSoftwareInfo, StrJsonEncoder +from .warc import jsonMime, makeContentType +def mergeWarc (files, output): +    # stats      unique = 0      revisit = 0 +    uniqueLength = 0 +    
revisitLength = 0
+
     payloadMap = {}
-    writer = WARCWriter (args.output, gzip=True)
-    for l in sys.stdin:
-        l = l.strip ()
+    writer = WARCWriter (output, gzip=True)
+
+    # Add an additional warcinfo record, describing the transformations. This
+    # is not ideal, since
+    #   “A ‘warcinfo’ record describes the records that
+    #   follow it […] until next ‘warcinfo’”
+    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+    # A warcinfo record is expected at the beginning of every file. But it
+    # might have been written by different software, so we don’t want to
+    # strip/replace that information, but supplement it.
+    warcinfo = {
+            'software': getSoftwareInfo (),
+            'tool': 'crocoite-merge', # not the name of the cli tool
+            'parameters': {'inputs': files},
+            }
+    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+    record = writer.create_warc_record ('', 'warcinfo',
+            payload=payload,
+            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+    writer.write_record (record)
+
+    for l in files:
         with open (l, 'rb') as fd:
             for record in ArchiveIterator (fd):
                 if record.rec_type in {'resource', 'response'}:
                     headers = record.rec_headers
                     rid = headers.get_header('WARC-Record-ID')
                     csum = headers.get_header('WARC-Payload-Digest')
+                    length = int (headers.get_header ('Content-Length'))
 
                     dup = payloadMap.get (csum, None)
                     if dup is None:
                         payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
                                 'id': rid, 'date': headers.get_header('WARC-Date')}
                         unique += 1
+                        uniqueLength += length
                     else:
-                        logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
-                        record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date'])
+                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
+                        # Payload may be identical, but HTTP headers are
+                        # (probably) not. Include them.
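+                        # A revisit record stores only a pointer
+                        # (WARC-Refers-To*) to the first copy of the payload
+                        # instead of the payload itself.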
+                        record = writer.create_revisit_record ( +                                headers.get_header('WARC-Target-URI'), digest=csum, +                                refers_to_uri=dup['uri'], refers_to_date=dup['date'], +                                http_headers=record.http_headers)                          record.rec_headers.add_header ('WARC-Truncated', 'length')                          record.rec_headers.add_header ('WARC-Refers-To', dup['id'])                          revisit += 1 +                        revisitLength += length                  else:                      unique += 1                  writer.write_record (record) -    logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit)) +    json.dump (dict ( +            unique=dict (records=unique, bytes=uniqueLength), +            revisit=dict (records=revisit, bytes=revisitLength), +            ratio=dict ( +                    records=unique/(unique+revisit), +                    bytes=uniqueLength/(uniqueLength+revisitLength) +                    ), +            ), +            sys.stdout, +            cls=StrJsonEncoder) +    sys.stdout.write ('\n') + +def mergeWarcCli(): +    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') +    parser.add_argument('--verbose', '-v', action='store_true') +    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') + +    args = parser.parse_args() +    loglevel = logging.DEBUG if args.verbose else logging.INFO +    logging.basicConfig (level=loglevel) + +    mergeWarc([l.strip() for l in sys.stdin], args.output)  def extractScreenshot ():      """      Extract page screenshots from a WARC generated by crocoite into files      """ -    parser = argparse.ArgumentParser(description='Extract screenshots.') -    parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files') -    parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') +    parser = argparse.ArgumentParser(description='Extract screenshots from ' +            'WARC, write JSON info to stdout.') +    parser.add_argument('-f', '--force', action='store_true', +            help='Overwrite existing files') +    parser.add_argument('-1', '--one', action='store_true', +            help='Only extract the first screenshot into a file named prefix') +    parser.add_argument('input', type=argparse.FileType ('rb'), +            help='Input WARC')      parser.add_argument('prefix', help='Output file prefix')      args = parser.parse_args() +    i = 0      with args.input:          for record in ArchiveIterator (args.input):              headers = record.rec_headers @@ -88,13 +141,177 @@ def extractScreenshot ():                      'X-Crocoite-Screenshot-Y-Offset' not in headers:                  continue -            urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') -            xoff = 0 +            url = URL (headers.get_header ('WARC-Target-URI'))              yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) -            outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff) +            outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix              if args.force or not os.path.exists (outpath): +                json.dump ({'file': outpath, 'url': url, 'yoff': yoff}, +                        sys.stdout, cls=StrJsonEncoder) +                sys.stdout.write ('\n')                  with open (outpath, 'wb') 
as out:
+                    shutil.copyfileobj (record.raw_stream, out)
+                i += 1
             else:
-                print ('not overwriting {}'.format (outpath))
+                print (f'not overwriting {outpath}', file=sys.stderr)
+
+            if args.one:
+                break
+
+class Errata:
+    __slots__ = ('uuid', 'description', 'url', 'affects')
+
+    def __init__ (self, uuid, description, affects, url=None):
+        self.uuid = uuid
+        self.description = description
+        self.url = url
+        # slightly abusing setuptools’ version parsing/matching here
+        self.affects = list (parse_requirements(affects))
+
+    def __contains__ (self, pkg):
+        """
+        Return True if the versions in pkg are affected by this errata
+
+        pkg must be a mapping from project_name to version
+        """
+        matchedAll = []
+        for a in self.affects:
+            haveVersion = pkg.get (a.project_name, None)
+            matchedAll.append (haveVersion is not None and haveVersion in a)
+        return all (matchedAll)
+
+    def __repr__ (self):
+        return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})'
+
+    @property
+    def fixable (self):
+        return getattr (self, 'applyFix', None) is not None
+
+    def toDict (self):
+        return {'uuid': self.uuid,
+                'description': self.description,
+                'url': self.url,
+                'affects': list (map (str, self.affects)),
+                'fixable': self.fixable}
+
+class FixableErrata(Errata):
+    __slots__ = ('stats', )
+
+    def __init__ (self, uuid, description, affects, url=None):
+        super().__init__ (uuid, description, affects, url)
+        # statistics for fixable erratas
+        self.stats = dict (records=dict (fixed=0, processed=0))
+
+    def applyFix (self, record):
+        raise NotImplementedError () # pragma: no cover
+
+class ContentTypeErrata (FixableErrata):
+    def __init__ (self):
+        super().__init__ (
+            uuid='552c13dc-56e5-4539-9ad8-184ccae60930',
+            description='Content-Type header uses wrong argument name encoding instead of charset.',
+            url='https://github.com/PromyLOPh/crocoite/issues/19',
+            affects=['crocoite==1.0.0'])
+
+    def applyFix (self, record):
+        # XXX: this is ugly. warcio’s write_record replaces any Content-Type
+        # header we’re setting with this one. But printing rec_headers shows
+        # the header, not .content_type.
+        contentType = record.content_type
+        if '; encoding=' in contentType:
+            contentType = contentType.replace ('; encoding=', '; charset=')
+            record.content_type = contentType
+            self.stats['records']['fixed'] += 1
+
+        self.stats['records']['processed'] += 1
+        return record
+
+bugs = [
+    Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572',
+            description='Generated by version < 1.0. 
No erratas are supported for this version.',
+            affects=['crocoite<1.0'],
+            ),
+    ContentTypeErrata (),
+    ]
+
+def makeReport (fd):
+    alreadyFixed = set ()
+
+    for record in ArchiveIterator (fd):
+        if record.rec_type == 'warcinfo':
+            try:
+                data = json.load (record.raw_stream)
+                # errata records precede everything else and indicate which
+                # ones were fixed already
+                if data['tool'] == 'crocoite-errata':
+                    alreadyFixed.update (data['parameters']['errata'])
+                else:
+                    haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+                    yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
+            except json.decoder.JSONDecodeError:
+                pass
+
+def errataCheck (args):
+    hasErrata = False
+    for item in makeReport (args.input):
+        json.dump (item.toDict (), sys.stdout)
+        sys.stdout.write ('\n')
+        sys.stdout.flush ()
+        hasErrata = True
+    return int (hasErrata)
+
+def errataFix (args):
+    errata = args.errata
+
+    with args.input as infd, args.output as outfd:
+        writer = WARCWriter (outfd, gzip=True)
+
+        warcinfo = {
+                'software': getSoftwareInfo (),
+                'tool': 'crocoite-errata', # not the name of the cli tool
+                'parameters': {'errata': [errata.uuid]},
+                }
+        payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+        record = writer.create_warc_record ('', 'warcinfo',
+                payload=payload,
+                warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+        writer.write_record (record)
+
+        for record in ArchiveIterator (infd):
+            fixedRecord = errata.applyFix (record)
+            writer.write_record (fixedRecord)
+    json.dump (errata.stats, sys.stdout)
+    sys.stdout.write ('\n')
+    sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+    try:
+        e = next (filter (lambda x: x.uuid == uuid, bugs))
+    except StopIteration:
+        raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+    if not isinstance (e, FixableErrata):
+        raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+    return e
+
+def errata ():
+    parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+    parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+    # XXX: required argument does not work here?! 
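+    # (Most likely because the required= keyword of add_subparsers() only
+    # exists on Python 3.7 and newer; the hasattr() check below compensates.)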
+    subparsers = parser.add_subparsers() + +    checkparser = subparsers.add_parser('check', help='Show erratas') +    checkparser.set_defaults (func=errataCheck) + +    fixparser = subparsers.add_parser('fix', help='Fix erratas') +    fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata') +    fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC') +    fixparser.set_defaults (func=errataFix) + +    args = parser.parse_args() + +    if not hasattr (args, 'func'): +        parser.print_usage () +        parser.exit () + +    return args.func (args) diff --git a/crocoite/util.py b/crocoite/util.py index 3a62533..da377a3 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -22,31 +22,42 @@  Random utility functions  """ -import random, sys -import hashlib, os, pkg_resources -from urllib.parse import urlsplit, urlunsplit +import random, sys, platform, os, json, urllib +from datetime import datetime +import hashlib, pkg_resources -def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): -    if length is None: -        length = random.randint (16, 32) -    return ''.join (map (lambda x: random.choice (chars), range (length))) +from yarl import URL -def packageUrl (path): -    """ -    Create URL for package data stored into WARC -    """ -    return 'urn:' + __package__ + ':' + path +class StrJsonEncoder (json.JSONEncoder): +    """ JSON encoder that turns unknown classes into a string and thus never +    fails """ +    def default (self, obj): +        if isinstance (obj, datetime): +            return obj.isoformat () + +        # make sure serialization always succeeds +        try: +            return json.JSONEncoder.default(self, obj) +        except TypeError: +            return str (obj) -def getFormattedViewportMetrics (tab): -    layoutMetrics = tab.Page.getLayoutMetrics () +async def getFormattedViewportMetrics (tab): +    layoutMetrics = await tab.Page.getLayoutMetrics ()      # XXX: I’m not entirely sure which one we should use here -    return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], -                layoutMetrics['layoutViewport']['clientHeight']) +    viewport = layoutMetrics['layoutViewport'] +    return f"{viewport['clientWidth']}x{viewport['clientHeight']}" -def removeFragment (u): -    """ Remove fragment from url (i.e. #hashvalue) """ -    s = urlsplit (u) -    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) +def getSoftwareInfo (): +    """ Get software info for inclusion into warcinfo """ +    return { +            'platform': platform.platform (), +            'python': { +                'implementation': platform.python_implementation(), +                'version': platform.python_version (), +                'build': platform.python_build () +                }, +            'self': getRequirements (__package__) +        }  def getRequirements (dist):      """ Get dependencies of a package. @@ -72,7 +83,7 @@ def getRequirements (dist):              pkg = getattr (m, '__package__', None)              # is loaded?              
if pkg in modules: -                if f: +                if f and os.path.isfile (f):                      with open (f, 'rb') as fd:                          contents = fd.read ()                          h = hashlib.new ('sha512') diff --git a/crocoite/warc.py b/crocoite/warc.py index 9b97e75..415b487 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -24,28 +24,38 @@ Classes writing data to WARC files  import json, threading  from io import BytesIO -from warcio.statusandheaders import StatusAndHeaders -from urllib.parse import urlsplit  from datetime import datetime +from http.server import BaseHTTPRequestHandler  from warcio.timeutils import datetime_to_iso_date  from warcio.warcwriter import WARCWriter +from warcio.statusandheaders import StatusAndHeaders +from yarl import URL -from .util import packageUrl -from .controller import defaultSettings, EventHandler, ControllerStart +from .util import StrJsonEncoder +from .controller import EventHandler, ControllerStart  from .behavior import Script, DomSnapshotEvent, ScreenshotEvent -from .browser import Item +from .browser import RequestResponsePair, UnicodeBody + +# the official mimetype for json, according to https://tools.ietf.org/html/rfc8259 +jsonMime = 'application/json' +# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2 +jsMime = 'application/javascript' + +def makeContentType (mime, charset=None): +    """ Create value of Content-Type WARC header with optional charset """ +    s = [mime] +    if charset: +        s.extend (['; charset=', charset]) +    return ''.join (s)  class WarcHandler (EventHandler): -    __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log', +    __slots__ = ('logger', 'writer', 'documentRecords', 'log',              'maxLogSize', 'logEncoding', 'warcinfoRecordId') -    def __init__ (self, fd, -            logger, -            maxBodySize=defaultSettings.maxBodySize): +    def __init__ (self, fd, logger):          self.logger = logger          self.writer = WARCWriter (fd, gzip=True) -        self.maxBodySize = maxBodySize          self.logEncoding = 'utf-8'          self.log = BytesIO () @@ -70,6 +80,7 @@ class WarcHandler (EventHandler):          Adds default WARC headers.          """ +        assert url is None or isinstance (url, URL)          d = {}          if self.warcinfoRecordId: @@ -77,8 +88,11 @@ class WarcHandler (EventHandler):          d.update (warc_headers_dict)          warc_headers_dict = d -        record = self.writer.create_warc_record (url, kind, payload=payload, -                warc_headers_dict=warc_headers_dict, http_headers=http_headers) +        record = self.writer.create_warc_record (str (url) if url else '', +                kind, +                payload=payload, +                warc_headers_dict=warc_headers_dict, +                http_headers=http_headers)          self.writer.write_record (record)          return record @@ -87,84 +101,52 @@ class WarcHandler (EventHandler):          logger = self.logger.bind (reqId=item.id)          req = item.request -        resp = item.response -        url = urlsplit (resp['url']) - -        path = url.path -        if url.query: -            path += '?' 
+ url.query -        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path), -                item.requestHeaders, protocol='HTTP/1.1', is_http_request=True) -        initiator = item.initiator +        url = item.url + +        path = url.relative().with_fragment(None) +        httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1', +                req.headers, protocol='HTTP/1.1', is_http_request=True)          warcHeaders = { -                'X-Chrome-Initiator': json.dumps (initiator), +                # required to correlate request with log entries                  'X-Chrome-Request-ID': item.id, -                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])), +                'WARC-Date': datetime_to_iso_date (req.timestamp),                  } -        try: -            bodyTruncated = None -            payload, payloadBase64Encoded = item.requestBody -        except ValueError: + +        body = item.request.body +        if item.request.hasPostData and body is None:              # oops, don’t know what went wrong here -            bodyTruncated = 'unspecified' -            logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') - -        if bodyTruncated: -            warcHeaders['WARC-Truncated'] = bodyTruncated -            payload = None - -        if payload: -            payload = BytesIO (payload) -            warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) -        record = self.writeRecord (req['url'], 'request', -                payload=payload, http_headers=httpHeaders, +            logger.error ('requestBody missing', +                    uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') +            warcHeaders['WARC-Truncated'] = 'unspecified' +        else: +            body = BytesIO (body) +        record = self.writeRecord (url, 'request', +                payload=body, http_headers=httpHeaders,                  warc_headers_dict=warcHeaders)          return record.rec_headers['WARC-Record-ID']      def _writeResponse (self, item, concurrentTo):          # fetch the body          reqId = item.id -        rawBody = None -        base64Encoded = False -        bodyTruncated = None -        if item.isRedirect: -            # redirects reuse the same request, thus we cannot safely retrieve -            # the body (i.e getResponseBody may return the new location’s -            # body). 
-            bodyTruncated = 'unspecified' -        elif item.encodedDataLength > self.maxBodySize: -            bodyTruncated = 'length' -            # check body size first, since we’re loading everything into memory -            self.logger.error ('body for {} too large {} vs {}'.format (reqId, -                    item.encodedDataLength, self.maxBodySize)) -        else: -            try: -                rawBody, base64Encoded = item.body -            except ValueError: -                # oops, don’t know what went wrong here -                bodyTruncated = 'unspecified'          # now the response          resp = item.response          warcHeaders = {                  'WARC-Concurrent-To': concurrentTo, -                'WARC-IP-Address': resp.get ('remoteIPAddress', ''), -                'X-Chrome-Protocol': resp.get ('protocol', ''), -                'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), -                'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), +                # required to correlate request with log entries                  'X-Chrome-Request-ID': item.id, -                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( -                        item.chromeRequest['wallTime']+ -                        (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), +                'WARC-Date': datetime_to_iso_date (resp.timestamp),                  } -        if bodyTruncated: -            warcHeaders['WARC-Truncated'] = bodyTruncated -        else: -            warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) +        # conditional WARC headers +        if item.remoteIpAddress: +            warcHeaders['WARC-IP-Address'] = item.remoteIpAddress -        httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], -                item.statusText), item.responseHeaders, -                protocol='HTTP/1.1') +        # HTTP headers +        statusText = resp.statusText or \ +                BaseHTTPRequestHandler.responses.get ( +                resp.status, ('No status text available', ))[0] +        httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}', +                resp.headers, protocol='HTTP/1.1')          # Content is saved decompressed and decoded, remove these headers          blacklistedHeaders = {'transfer-encoding', 'content-encoding'} @@ -174,20 +156,21 @@ class WarcHandler (EventHandler):          # chrome sends nothing but utf8 encoded text. Fortunately HTTP          # headers take precedence over the document’s <meta>, thus we can          # easily override those. 
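+        # (Only text bodies, which Chrome hands over as UTF-8, get a charset
+        # parameter; decoded binary bodies keep the bare mime type.)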
-        contentType = resp.get ('mimeType') -        if contentType: -            if not base64Encoded: -                contentType += '; charset=utf-8' -            httpHeaders.replace_header ('content-type', contentType) - -        if rawBody is not None: -            httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) -            bodyIo = BytesIO (rawBody) +        if resp.mimeType: +            charset = 'utf-8' if isinstance (resp.body, UnicodeBody) else None +            contentType = makeContentType (resp.mimeType, charset=charset) +            httpHeaders.replace_header ('Content-Type', contentType) + +        # response body +        body = resp.body +        if body is None: +            warcHeaders['WARC-Truncated'] = 'unspecified'          else: -            bodyIo = BytesIO () +            httpHeaders.replace_header ('Content-Length', str (len (body))) +            body = BytesIO (body) -        record = self.writeRecord (resp['url'], 'response', -                warc_headers_dict=warcHeaders, payload=bodyIo, +        record = self.writeRecord (item.url, 'response', +                warc_headers_dict=warcHeaders, payload=body,                  http_headers=httpHeaders)          if item.resourceType == 'Document': @@ -196,32 +179,38 @@ class WarcHandler (EventHandler):      def _writeScript (self, item):          writer = self.writer          encoding = 'utf-8' -        self.writeRecord (packageUrl ('script/{}'.format (item.path)), 'metadata', +        # XXX: yes, we’re leaking information about the user here, but this is +        # the one and only source URL of the scripts. +        uri = URL(f'file://{item.abspath}') if item.path else None +        self.writeRecord (uri, 'resource',                  payload=BytesIO (str (item).encode (encoding)), -                warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)}) +                warc_headers_dict={ +                    'Content-Type': makeContentType (jsMime, encoding), +                    'X-Crocoite-Type': 'script', +                    })      def _writeItem (self, item): -        if item.failed: -            # should have been handled by the logger already -            return - +        assert item.request          concurrentTo = self._writeRequest (item) -        self._writeResponse (item, concurrentTo) +        # items that failed loading don’t have a response +        if item.response: +            self._writeResponse (item, concurrentTo)      def _addRefersTo (self, headers, url):          refersTo = self.documentRecords.get (url)          if refersTo:              headers['WARC-Refers-To'] = refersTo          else: -            self.logger.error ('No document record found for {}'.format (url)) +            self.logger.error (f'No document record found for {url}')          return headers      def _writeDomSnapshot (self, item):          writer = self.writer -        warcHeaders = {'X-DOM-Snapshot': str (True), +        warcHeaders = { +                'X-Crocoite-Type': 'dom-snapshot',                  'X-Chrome-Viewport': item.viewport, -                'Content-Type': 'text/html; charset=utf-8', +                'Content-Type': makeContentType ('text/html', 'utf-8')                  }          self._addRefersTo (warcHeaders, item.url) @@ -232,53 +221,53 @@ class WarcHandler (EventHandler):      def _writeScreenshot (self, item):          writer = self.writer -        warcHeaders = {'Content-Type': 'image/png', -                
'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} +        warcHeaders = { +                'Content-Type': makeContentType ('image/png'), +                'X-Crocoite-Screenshot-Y-Offset': str (item.yoff), +                'X-Crocoite-Type': 'screenshot', +                }          self._addRefersTo (warcHeaders, item.url)          self.writeRecord (item.url, 'conversion',                  payload=BytesIO (item.data), warc_headers_dict=warcHeaders) -    def _writeControllerStart (self, item): -        payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8')) +    def _writeControllerStart (self, item, encoding='utf-8'): +        payload = BytesIO (json.dumps (item.payload, indent=2, cls=StrJsonEncoder).encode (encoding))          writer = self.writer -        warcinfo = self.writeRecord (packageUrl ('warcinfo'), 'warcinfo', -                warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}, +        warcinfo = self.writeRecord (None, 'warcinfo', +                warc_headers_dict={'Content-Type': makeContentType (jsonMime, encoding)},                  payload=payload)          self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']      def _flushLogEntries (self): -        writer = self.writer -        self.log.seek (0) -        # XXX: we should use the type continuation here -        self.writeRecord (packageUrl ('log'), 'resource', payload=self.log, -                warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)}) -        self.log = BytesIO () +        if self.log.tell () > 0: +            writer = self.writer +            self.log.seek (0) +            warcHeaders = { +                    'Content-Type': makeContentType (jsonMime, self.logEncoding), +                    'X-Crocoite-Type': 'log', +                    } +            self.writeRecord (None, 'metadata', payload=self.log, +                    warc_headers_dict=warcHeaders) +            self.log = BytesIO ()      def _writeLog (self, item):          """ Handle log entries, called by .logger.WarcHandlerConsumer only """          self.log.write (item.encode (self.logEncoding))          self.log.write (b'\n') -        # instead of locking, check we’re running in the main thread -        if self.log.tell () > self.maxLogSize and \ -                threading.current_thread () is threading.main_thread (): +        if self.log.tell () > self.maxLogSize:              self._flushLogEntries ()      route = {Script: _writeScript, -            Item: _writeItem, +            RequestResponsePair: _writeItem,              DomSnapshotEvent: _writeDomSnapshot,              ScreenshotEvent: _writeScreenshot,              ControllerStart: _writeControllerStart,              } -    def push (self, item): -        processed = False +    async def push (self, item):          for k, v in self.route.items ():              if isinstance (item, k):                  v (self, item) -                processed = True                  break -        if not processed: -            self.logger.debug ('unknown event {}'.format (repr (item))) - diff --git a/doc/_ext/clicklist.py b/doc/_ext/clicklist.py new file mode 100644 index 0000000..a69452c --- /dev/null +++ b/doc/_ext/clicklist.py @@ -0,0 +1,45 @@ +""" +Render click.yaml config file into human-readable list of supported sites +""" + +import pkg_resources, yaml +from docutils import nodes +from docutils.parsers.rst import Directive +from yarl import URL + +class ClickList (Directive): +    def run(self): +        # XXX: do 
this once only +        fd = pkg_resources.resource_stream ('crocoite', 'data/click.yaml') +        config = list (yaml.safe_load_all (fd)) + +        l = nodes.definition_list () +        for site in config: +            urls = set () +            v = nodes.definition () +            vl = nodes.bullet_list () +            v += vl +            for s in site['selector']: +                i = nodes.list_item () +                i += nodes.paragraph (text=s['description']) +                vl += i +                urls.update (map (lambda x: URL(x).with_path ('/'), s.get ('urls', []))) + +            item = nodes.definition_list_item () +            term = ', '.join (map (lambda x: x.host, urls)) if urls else site['match'] +            k = nodes.term (text=term) +            item += k + +            item += v +            l += item +        return [l] + +def setup(app): +    app.add_directive ("clicklist", ClickList) + +    return { +        'version': '0.1', +        'parallel_read_safe': True, +        'parallel_write_safe': True, +    } + diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..8336c27 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +import os, sys + +# -- Project information ----------------------------------------------------- + +project = 'crocoite' +copyright = '2019 crocoite contributors' +author = 'crocoite contributors' + +# -- General configuration --------------------------------------------------- + +sys.path.append(os.path.abspath("./_ext")) +extensions = [ +    'sphinx.ext.viewcode', +    'sphinx.ext.autodoc', +    'clicklist', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +source_suffix = '.rst' +master_doc = 'index' +language = 'en' +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +pygments_style = 'tango' + +# -- Options for HTML output ------------------------------------------------- + +html_theme = 'alabaster' +html_theme_options = { +    "description": "Preservation for the modern web", +    "github_user": "PromyLOPh", +    "github_repo": "crocoite", +    "travis_button": True, +    "github_button": True, +    "codecov_button": True, +    "fixed_sidebar": True, +} +#html_static_path = ['_static'] +html_sidebars = { +   '**': ['about.html', 'navigation.html', 'searchbox.html'], +} + diff --git a/doc/develop.rst b/doc/develop.rst new file mode 100644 index 0000000..801ab21 --- /dev/null +++ b/doc/develop.rst @@ -0,0 +1,39 @@ +Development +----------- + +Generally crocoite provides reasonable defaults for Google Chrome via +:py:mod:`crocoite.devtools`. When debugging this software it might be necessary +to open a non-headless instance of the browser by running + +.. code:: bash + +   google-chrome-stable --remote-debugging-port=9222 --auto-open-devtools-for-tabs + +and then passing the option :option:`--browser=http://localhost:9222` to +:program:`crocoite-single`. This allows human intervention through the +browser’s builtin console. + +Release guide +^^^^^^^^^^^^^ + +crocoite uses `semantic versioning`_. To create a new release, bump the version +number in ``setup.py`` according to the linked guide, create distribution +packages:: + +    python setup.py sdist bdist_wheel + +Verify them:: + +    twine check dist/* + +Try to install and use them in a separate sandbox. 
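+For example, in a throw-away virtualenv (paths and names below are
+illustrative):
+
+.. code:: bash
+
+    python3 -m venv /tmp/crocoite-release-test
+    /tmp/crocoite-release-test/bin/pip install dist/*.whl
+    /tmp/crocoite-release-test/bin/crocoite --help
+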
+And finally sign and upload a new version to pypi_::
+
+    gpg --detach-sign --armor dist/*.tar.gz
+    twine upload dist/*
+
+Then update the documentation using :program:`sphinx-build` and upload it as well.
+
+.. _semantic versioning: https://semver.org/spec/v2.0.0.html
+.. _pypi: https://pypi.org
+
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..53f5f77
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,36 @@
+crocoite
+========
+
+Preservation for the modern web, powered by `headless Google
+Chrome`_.
+
+.. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   usage.rst
+   plugins.rst
+   rationale.rst
+   develop.rst
+   related.rst
+
+Features
+--------
+
+Google Chrome-powered
+    HTML renderer, JavaScript engine and network stack, supporting modern web
+    technologies and protocols
+WARC output
+    Includes all network requests made by the browser
+Site interaction
+    :ref:`Auto-expand on-click content <click>`, infinite-scrolling
+DOM snapshot
+    Contains the page’s state, renderable without JavaScript
+Image screenshot
+    Entire page
+Machine-readable interface
+    Easy integration into custom tools/scripts
+
diff --git a/doc/plugins.rst b/doc/plugins.rst
new file mode 100644
index 0000000..062e1bf
--- /dev/null
+++ b/doc/plugins.rst
@@ -0,0 +1,16 @@
+Plugins
+=======
+
+crocoite comes with plug-ins that modify loaded sites or interact with them.
+
+.. _click:
+
+click
+-----
+
+The following sites are currently supported. Note that this is an ongoing
+battle against layout changes and thus older software versions will stop
+working very soon.
+
+.. clicklist::
+
diff --git a/doc/rationale.rst b/doc/rationale.rst
new file mode 100644
index 0000000..f37db7c
--- /dev/null
+++ b/doc/rationale.rst
@@ -0,0 +1,76 @@
+Rationale
+---------
+
+Most modern websites depend heavily on executing code, usually JavaScript, on
+the user’s machine. They also make use of new and emerging Web technologies
+like HTML5, WebSockets, service workers and more. Even worse from the
+preservation point of view, they also require some form of user interaction to
+dynamically load more content (infinite scrolling, dynamic comment loading,
+etc).
+
+The naive approach of fetching an HTML page, parsing it and extracting
+links to referenced resources therefore is not sufficient to create a faithful
+snapshot of these web applications. A full browser, capable of running scripts and
+providing modern Web APIs, is absolutely required for this task. Thankfully
+Google Chrome runs without a display (headless mode) and can be controlled by
+external programs, allowing them to navigate and extract or inject data.
+This section describes the solutions crocoite offers and explains design
+decisions taken.
+
+crocoite captures resources by listening to Chrome’s `network events`_ and
+requesting the response body using `Network.getResponseBody`_. This approach
+has caveats: The original HTTP requests and responses, as sent over the wire,
+are not available. They are reconstructed from parsed data. The character
+encoding for text documents is changed to UTF-8. And the content body of HTTP
+redirects cannot be retrieved due to a race condition.
+
+.. _network events: https://chromedevtools.github.io/devtools-protocol/1-3/Network
+.. _Network.getResponseBody: https://chromedevtools.github.io/devtools-protocol/1-3/Network#method-getResponseBody
diff --git a/doc/related.rst b/doc/related.rst
new file mode 100644
index 0000000..62e2569
--- /dev/null
+++ b/doc/related.rst
@@ -0,0 +1,14 @@
+Related projects
+----------------
+
+brozzler_
+    Uses Google Chrome as well, but intercepts traffic using a proxy. Supports
+    distributed crawling and immediate playback.
+Squidwarc_
+    Communicates with headless Google Chrome and, like crocoite, uses the
+    Network API to retrieve requests. Supports recursive crawls and page
+    scrolling, but neither custom JavaScript nor distributed crawling.
+
+.. _brozzler: https://github.com/internetarchive/brozzler
+.. _Squidwarc: https://github.com/N0taN3rd/Squidwarc
+
diff --git a/doc/usage.rst b/doc/usage.rst
new file mode 100644
index 0000000..34a3e7b
--- /dev/null
+++ b/doc/usage.rst
@@ -0,0 +1,162 @@
+Usage
+-----
+
+Quick start using pywb_; Google Chrome must be installed already:
+
+.. code:: bash
+
+    pip install crocoite pywb
+    crocoite http://example.com/ example.com.warc.gz
+    wb-manager init test && wb-manager add test example.com.warc.gz
+    wayback &
+    $BROWSER http://localhost:8080
+
+.. _pywb: https://github.com/ikreymer/pywb
+
+It is recommended to install at least Microsoft’s Corefonts_ as well as
+DejaVu_, Liberation_ or a similar font family covering a wide range of
+character sets. Otherwise page screenshots may be unusable due to missing
+glyphs.
+
+.. _Corefonts: http://corefonts.sourceforge.net/
+.. _DejaVu: https://dejavu-fonts.github.io/
+.. _Liberation: https://pagure.io/liberation-fonts
+
+Recursion
+^^^^^^^^^
+
+.. program:: crocoite
+
+By default crocoite will only retrieve the URL specified on the command line.
+However, it can follow links as well. There are currently two recursion
+strategies available, depth- and prefix-based.
+
+.. code:: bash
+
+   crocoite -r 1 https://example.com/ example.com.warc.gz
+
+will retrieve ``example.com`` and all pages directly referred to by it.
+Increasing the number increases the depth, so a value of :samp:`2` first grabs
+``example.com``, then every page linked there, and finally every reference on
+each of those pages.
+
+On the other hand
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/dir/ example.com.warc.gz
+
+will retrieve the URL specified and all pages referenced which have the same
+URL prefix. The trailing slash is significant: without it crocoite would also
+grab ``/dir-something`` or ``/dir.html``, for example.
+
+If an output file template is used, each page is written to an individual
+file. For example,
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com/ '{host}-{date}-{seqnum}.warc.gz'
+
+will write one file per page, named like
+:file:`example.com-2019-09-09T15:15:15+02:00-1.warc.gz`. ``seqnum`` is unique
+to each page of a single job and should always be used.
+
+When running a recursive job, increasing the concurrency (i.e. how many pages
+are fetched at the same time) can speed up the process. For example, you can
+pass :option:`-j` :samp:`4` to retrieve four pages at the same time. Keep in
+mind that each process starts a full browser, which requires a lot of
+resources (one to two GB of RAM and one or two CPU cores).
+
+Customizing
+^^^^^^^^^^^
+
+.. program:: crocoite-single
+
+Under the hood :program:`crocoite` starts one instance of
+:program:`crocoite-single` to fetch each page. You can customize its options
+by appending a command template like this:
+
+.. code:: bash
+
+   crocoite -r prefix https://example.com example.com.warc.gz -- \
+        crocoite-single --timeout 5 -k '{url}' '{dest}'
+
+This reduces the global timeout to 5 seconds and ignores TLS errors. If an
+option is prefixed with an exclamation mark (``!``) it will not be expanded.
+This is useful for passing :option:`--warcinfo`, which expects JSON-encoded
+data.
+
+Command line options
+^^^^^^^^^^^^^^^^^^^^
+
+Below is a list of all command line arguments available:
+
+.. program:: crocoite
+
+crocoite
+++++++++
+
+Front-end with recursion support and simple job management.
+
+.. option:: -j N, --concurrency N
+
+   Maximum number of concurrent fetch jobs.
+
+.. option:: -r POLICY, --recursion POLICY
+
+   Enables recursion based on POLICY, which can be a positive integer
+   (recursion depth) or the string :kbd:`prefix`.
+
+.. option:: --tempdir DIR
+
+   Directory for temporary WARC files.
+
+.. program:: crocoite-single
+
+crocoite-single
++++++++++++++++
+
+Back-end to fetch a single page.
+
+.. option:: -b SET-COOKIE, --cookie SET-COOKIE
+
+   Add cookie to browser’s cookie jar. This option always *appends* cookies,
+   replacing those provided by :option:`-c`.
+
+   .. versionadded:: 1.1
+
+.. option:: -c FILE, --cookie-jar FILE
+
+   Load cookies from FILE. :program:`crocoite` provides a default cookie file,
+   which contains cookies to, for example, circumvent age restrictions. This
+   option *replaces* that default file.
+
+   .. versionadded:: 1.1
+
+.. option:: --idle-timeout SEC
+
+   Time after which a page is considered “idle”.
+
+.. option:: -k, --insecure
+
+   Allow insecure connections, i.e. self-signed or expired HTTPS certificates.
+
+.. option:: --timeout SEC
+
+   Global archiving timeout.
+
+.. option:: --warcinfo JSON
+
+   Inject additional JSON-encoded information into the resulting WARC.
+
+IRC bot
+^^^^^^^
+
+A simple IRC bot (“chromebot”) is provided with the command
+:program:`crocoite-irc`. It reads its configuration from a config file like
+the example provided in :file:`contrib/chromebot.json` and supports the
+following commands:
+
+a <url> -j <concurrency> -r <policy> -k -b <set-cookie>
+    Archive <url> with <concurrency> processes according to recursion <policy>
+s <uuid>
+    Get job status for <uuid>
+r <uuid>
+    Revoke or abort running job with <uuid>
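To illustrate why the trailing slash matters for prefix recursion, the
following sketch shows plausible matching logic using yarl (a dependency of
crocoite); it is illustrative only, not necessarily the exact rule crocoite
implements:

.. code:: python

    from yarl import URL

    def in_prefix (url, prefix):
        # A candidate URL matches if it shares the host and its path
        # starts with the prefix path as a plain string prefix.
        url, prefix = URL (url), URL (prefix)
        return url.host == prefix.host and url.path.startswith (prefix.path)

    # With the trailing slash only pages below /dir/ match:
    assert in_prefix ('https://example.com/dir/a.html', 'https://example.com/dir/')
    assert not in_prefix ('https://example.com/dir.html', 'https://example.com/dir/')
    # Without it, sibling paths sharing the string prefix match as well:
    assert in_prefix ('https://example.com/dir.html', 'https://example.com/dir')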
diff --git a/setup.cfg b/setup.cfg
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,8 @@
 [aliases]
 test=pytest
+[tool:pytest]
+addopts=--cov-report=html --cov-report=xml --cov=crocoite --cov-config=setup.cfg
+[coverage:run]
+branch=True
+[build_sphinx]
+builder=dirhtml
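The ``[build_sphinx]`` section above wires ``python setup.py build_sphinx`` to
the ``dirhtml`` builder. For illustration, the same build expressed through
Sphinx’s Python API might look like this; the directory layout is an
assumption:

.. code:: python

    from sphinx.application import Sphinx

    # Roughly equivalent to `setup.py build_sphinx` with builder=dirhtml
    app = Sphinx (srcdir='doc', confdir='doc',
                  outdir='doc/_build/dirhtml',
                  doctreedir='doc/_build/doctrees',
                  buildername='dirhtml')
    app.build ()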
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -2,29 +2,56 @@ from setuptools import setup
 setup(
     name='crocoite',
-    version='0.1.0',
+    version='1.1.1',
     author='Lars-Dominik Braun',
     author_email='lars+crocoite@6xq.net',
+    url='https://6xq.net/crocoite/',
     packages=['crocoite'],
     license='LICENSE.txt',
     description='Save website to WARC using Google Chrome.',
     long_description=open('README.rst').read(),
+    long_description_content_type='text/x-rst',
     install_requires=[
-        'pychrome',
         'warcio',
         'html5lib>=0.999999999',
-        'Celery',
+        'bottom',
+        'pytz',
+        'websockets',
+        'aiohttp',
+        'PyYAML',
+        'yarl>=1.4,<1.5',
+        'multidict',
     ],
+    extras_require={
+        'manhole': ['manhole>=1.6'],
+    },
     entry_points={
     'console_scripts': [
-            'crocoite-grab = crocoite.cli:main',
-            'crocoite-merge-warc = crocoite.tools:mergeWarc',
+            # the main executable
+            'crocoite = crocoite.cli:recursive',
+            # backend helper
+            'crocoite-single = crocoite.cli:single',
+            # irc bot and dashboard
+            'crocoite-irc = crocoite.cli:irc',
+            'crocoite-irc-dashboard = crocoite.cli:dashboard',
+            # misc tools
+            'crocoite-merge-warc = crocoite.tools:mergeWarcCli',
             'crocoite-extract-screenshot = crocoite.tools:extractScreenshot',
+            'crocoite-errata = crocoite.tools:errata',
             ],
     },
     package_data={
             'crocoite': ['data/*'],
     },
-    setup_requires=["pytest-runner"],
-    tests_require=["pytest"],
+    setup_requires=['pytest-runner'],
+    tests_require=["pytest", 'pytest-asyncio', 'pytest-cov', 'hypothesis'],
+    python_requires='>=3.6',
+    classifiers=[
+        'Development Status :: 5 - Production/Stable',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: POSIX',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Topic :: Internet :: WWW/HTTP',
+    ],
 )
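For reference, each ``console_scripts`` entry above makes setuptools generate
a small executable wrapper at install time; this equivalence is standard
setuptools behaviour, not crocoite-specific code. The installed ``crocoite``
command behaves roughly like:

.. code:: python

    # Sketch of the wrapper setuptools generates for
    # 'crocoite = crocoite.cli:recursive'
    import sys
    from crocoite.cli import recursive

    if __name__ == '__main__':
        sys.exit (recursive ())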
