diff options
-rw-r--r-- | crocoite/cli.py | 33 | ||||
-rw-r--r-- | crocoite/controller.py | 16 | ||||
-rw-r--r-- | crocoite/data/cookies.txt | 9 | ||||
-rw-r--r-- | crocoite/devtools.py | 25 | ||||
-rw-r--r-- | crocoite/irc.py | 21 | ||||
-rw-r--r-- | crocoite/test_controller.py | 50 | ||||
-rw-r--r-- | doc/usage.rst | 80 | ||||
-rw-r--r-- | setup.cfg | 2 |
8 files changed, 214 insertions, 22 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index 93b742b..53a0b32 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -26,6 +26,8 @@ import argparse, sys, signal, asyncio, os, json from traceback import TracebackException from enum import IntEnum from yarl import URL +from http.cookies import SimpleCookie +import pkg_resources try: import manhole manhole.install (patch_fork=False, oneshot_on='USR1') @@ -49,6 +51,29 @@ def absurl (s): return u raise argparse.ArgumentTypeError ('Must be absolute') +def cookie (s): + """ argparse: Cookie """ + c = SimpleCookie (s) + # for some reason the constructor does not raise an exception if the cookie + # supplied is invalid. It’ll simply be empty. + if len (c) != 1: + raise argparse.ArgumentTypeError ('Invalid cookie') + # we want a single Morsel + return next (iter (c.values ())) + +def cookiejar (f): + """ argparse: Cookies from file """ + cookies = [] + try: + with open (f, 'r') as fd: + for l in fd: + l = l.lstrip () + if l and not l.startswith ('#'): + cookies.append (cookie (l)) + except FileNotFoundError: + raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist') + return cookies + class SingleExitStatus(IntEnum): """ Exit status for single-shot command line """ Ok = 0 @@ -68,9 +93,16 @@ def single (): metavar='NAME', nargs='*') parser.add_argument('--warcinfo', help='Add extra information to warcinfo record', metavar='JSON', type=json.loads) + # re-using curl’s short/long switch names whenever possible parser.add_argument('-k', '--insecure', action='store_true', help='Disable certificate validation') + parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE', + action='append', default=[], help='Cookies in Set-Cookie format.') + parser.add_argument ('-c', '--cookie-jar', dest='cookieJar', + type=cookiejar, metavar='FILE', + default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'), + help='Cookie jar file, read-only.') parser.add_argument('url', help='Website URL', 
type=absurl, metavar='URL') parser.add_argument('output', help='WARC filename', metavar='FILE') @@ -86,6 +118,7 @@ def single (): idleTimeout=args.idleTimeout, timeout=args.timeout, insecure=args.insecure, + cookies=args.cookieJar + args.cookie, ) with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: logger.connect (WarcHandlerConsumer (warcHandler)) diff --git a/crocoite/controller.py b/crocoite/controller.py index 2a848e8..4c9c4b3 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -33,21 +33,19 @@ from . import behavior as cbehavior from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated from .util import getFormattedViewportMetrics, getSoftwareInfo from .behavior import ExtractLinksEvent +from .devtools import toCookieParam class ControllerSettings: - __slots__ = ('idleTimeout', 'timeout', 'insecure') + __slots__ = ('idleTimeout', 'timeout', 'insecure', 'cookies') - def __init__ (self, idleTimeout=2, timeout=10, insecure=False): + def __init__ (self, idleTimeout=2, timeout=10, insecure=False, cookies=None): self.idleTimeout = idleTimeout self.timeout = timeout self.insecure = insecure + self.cookies = cookies or [] - def toDict (self): - return dict ( - idleTimeout=self.idleTimeout, - timeout=self.timeout, - insecure=self.insecure, - ) + def __repr__ (self): + return f'<ControllerSetting idleTimeout={self.idleTimeout!r}, timeout={self.timeout!r}, insecure={self.insecure!r}, cookies={self.cookies!r}>' defaultSettings = ControllerSettings () @@ -212,6 +210,7 @@ class SinglePageController: # configure browser tab = l.tab await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure) + await tab.Network.setCookies (cookies=list (map (toCookieParam, self.settings.cookies))) # not all behavior scripts are allowed for every URL, filter them self._enabledBehavior = list (filter (lambda x: self.url in x, @@ -232,6 +231,7 @@ class SinglePageController: 'timeout': self.settings.timeout, 
'behavior': list (map (attrgetter('name'), self._enabledBehavior)), 'insecure': self.settings.insecure, + 'cookies': list (map (lambda x: x.OutputString(), self.settings.cookies)), }, } if self.warcinfo: diff --git a/crocoite/data/cookies.txt b/crocoite/data/cookies.txt new file mode 100644 index 0000000..6ac62c3 --- /dev/null +++ b/crocoite/data/cookies.txt @@ -0,0 +1,9 @@ +# Default cookies for crocoite. This file does *not* use Netscape’s cookie +# file format. Lines are expected to be in Set-Cookie format. +# And this line is a comment. + +# Reddit: +# skip over 18 prompt +over18=1; Domain=www.reddit.com +# skip quarantined subreddit prompt +_options={%22pref_quarantine_optin%22:true}; Domain=www.reddit.com diff --git a/crocoite/devtools.py b/crocoite/devtools.py index 412ab08..05680f1 100644 --- a/crocoite/devtools.py +++ b/crocoite/devtools.py @@ -25,6 +25,8 @@ Communication with Google Chrome through its DevTools protocol. import json, asyncio, logging, os from tempfile import mkdtemp import shutil +from http.cookies import Morsel + import aiohttp, websockets from yarl import URL @@ -366,3 +368,26 @@ class Passthrough: async def __aexit__ (self, *exc): return False +def toCookieParam (m): + """ + Convert Python’s http.cookies.Morsel to Chrome’s CookieParam, see + https://chromedevtools.github.io/devtools-protocol/1-3/Network#type-CookieParam + """ + + assert isinstance (m, Morsel) + + out = {'name': m.key, 'value': m.value} + + # unsupported by chrome + for k in ('max-age', 'comment', 'version'): + if m[k]: + raise ValueError (f'Unsupported cookie attribute {k} set, cannot convert') + + for mname, cname in [('expires', None), ('path', None), ('domain', None), ('secure', None), ('httponly', 'httpOnly')]: + value = m[mname] + if value: + cname = cname or mname + out[cname] = value + + return out + diff --git a/crocoite/irc.py b/crocoite/irc.py index bd13831..d9c0634 100644 --- a/crocoite/irc.py +++ b/crocoite/irc.py @@ -22,7 +22,7 @@ IRC bot “chromebot” """ 
-import asyncio, argparse, json, tempfile, time, random, os +import asyncio, argparse, json, tempfile, time, random, os, shlex from datetime import datetime from urllib.parse import urlsplit from enum import IntEnum, unique @@ -33,6 +33,7 @@ import bottom import websockets from .util import StrJsonEncoder +from .cli import cookie ### helper functions ### def prettyTimeDelta (seconds): @@ -366,12 +367,13 @@ class ArgparseBot (bottom.Client): async def onMessage (self, nick, target, message, **kwargs): """ Message received """ - if target in self.channels and message.startswith (self.nick + ':'): + msgPrefix = self.nick + ':' + if target in self.channels and message.startswith (msgPrefix): user = self.users[target].get (nick, User (nick)) reply = ReplyContext (client=self, target=target, user=user) - # channel message that starts with our nick - command = message.split (' ')[1:] + # shlex.split supports quoting arguments, which str.split() does not + command = shlex.split (message[len (msgPrefix):]) try: args = self.parser.parse_args (command) except Exception as e: @@ -439,13 +441,14 @@ class Chromebot (ArgparseBot): subparsers = parser.add_subparsers(help='Sub-commands') archiveparser = subparsers.add_parser('a', help='Archive a site', add_help=False) - #archiveparser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC', choices=[60, 1*60*60, 2*60*60]) - #archiveparser.add_argument('--idle-timeout', default=10, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC', choices=[1, 10, 20, 30, 60]) - #archiveparser.add_argument('--max-body-size', default=None, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, 100*1024*1024]) archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5)) archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0') archiveparser.add_argument('--insecure', '-k', help='Disable certificate checking', action='store_true') + # parsing the cookie here, so we can give an early feedback without + # waiting for the job to crash on invalid arguments. + archiveparser.add_argument('--cookie', '-b', type=cookie, + help='Add a cookie', action='append', default=[]) archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL') archiveparser.set_defaults (func=self.handleArchive, minPriv=NickMode.voice if self.needVoice else None) @@ -501,12 +504,14 @@ class Chromebot (ArgparseBot): 'concurrency': args.concurrency, }} grabCmd = ['crocoite-single'] + # prefix warcinfo with !, so it won’t get expanded grabCmd.extend (['--warcinfo', '!' 
+ json.dumps (warcinfo, cls=StrJsonEncoder)]) + for v in args.cookie: + grabCmd.extend (['--cookie', v.OutputString ()]) if args.insecure: grabCmd.append ('--insecure') grabCmd.extend (['{url}', '{dest}']) - # prefix warcinfo with !, so it won’t get expanded cmdline = ['crocoite', '--tempdir', self.tempdir, '--recursion', args.recursive, diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py index 7e79dbe..7216a42 100644 --- a/crocoite/test_controller.py +++ b/crocoite/test_controller.py @@ -151,3 +151,53 @@ async def test_idle_state_tracker (): end = loop.time () assert (timeout-delta) < (end-start) < (timeout+delta) +@pytest.fixture +async def recordingServer (): + """ Simple HTTP server that records raw requests """ + url = URL ('http://localhost:8080') + reqs = [] + async def record (request): + reqs.append (request) + return web.Response(text='ok', content_type='text/plain') + app = web.Application() + app.add_routes([web.get(url.path, record)]) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite (runner, url.host, url.port) + await site.start() + yield url, reqs + await runner.cleanup () + +from .test_devtools import tab, browser +from http.cookies import Morsel, SimpleCookie + +@pytest.mark.asyncio +async def test_set_cookies (tab, recordingServer): + """ Make sure cookies are set properly and only affect the domain they were + set for """ + + logger = Logger () + + url, reqs = recordingServer + + cookies = [] + c = Morsel () + c.set ('foo', 'bar', '') + c['domain'] = 'localhost' + cookies.append (c) + c = Morsel () + c.set ('buz', 'beef', '') + c['domain'] = 'nonexistent.example' + + settings = ControllerSettings (idleTimeout=1, timeout=60, cookies=cookies) + controller = SinglePageController (url=url, logger=logger, + service=Process (), behavior=[], settings=settings) + await asyncio.wait_for (controller.run (), settings.timeout*2) + + assert len (reqs) == 1 + req = reqs[0] + reqCookies = SimpleCookie 
(req.headers['cookie']) + assert len (reqCookies) == 1 + c = next (iter (reqCookies.values ())) + assert c.key == cookies[0].key + assert c.value == cookies[0].value diff --git a/doc/usage.rst b/doc/usage.rst index 9bba693..c18f9fb 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -24,6 +24,8 @@ Otherwise page screenshots may be unusable due to missing glyphs. Recursion ^^^^^^^^^ +.. program:: crocoite + By default crocoite will only retrieve the URL specified on the command line. However it can follow links as well. There’s currently two recursion strategies available, depth- and prefix-based. @@ -59,16 +61,18 @@ each page of a single job and should always be used. When running a recursive job, increasing the concurrency (i.e. how many pages are fetched at the same time) can speed up the process. For example you can -pass :option:`-j 4` to retrieve four pages at the same time. Keep in mind that each -process starts a full browser that requires a lot of resources (one to two GB -of RAM and one or two CPU cores). +pass :option:`-j` :samp:`4` to retrieve four pages at the same time. Keep in mind +that each process starts a full browser that requires a lot of resources (one +to two GB of RAM and one or two CPU cores). Customizing ^^^^^^^^^^^ -Under the hood crocoite starts one instance of :program:`crocoite-single` to fetch -each page. You can customize its options by appending a command template like -this: +.. program:: crocoite-single + +Under the hood :program:`crocoite` starts one instance of +:program:`crocoite-single` to fetch each page. You can customize its options by +appending a command template like this: .. code:: bash @@ -79,6 +83,70 @@ This reduces the global timeout to 5 seconds and ignores TLS errors. If an option is prefixed with an exclamation mark (``!``) it will not be expanded. This is useful for passing :option:`--warcinfo`, which expects JSON-encoded data. 
+Command line options +^^^^^^^^^^^^^^^^^^^^ + +Below is a list of all command line arguments available: + +.. program:: crocoite + +crocoite ++++++++ + +Front-end with recursion support and simple job management. + +.. option:: -j N, --concurrency N + + Maximum number of concurrent fetch jobs. + +.. option:: -r POLICY, --recursion POLICY + + Enables recursion based on POLICY, which can be a positive integer + (recursion depth) or the string :kbd:`prefix`. + +.. option:: --tempdir DIR + + Directory for temporary WARC files. + +.. program:: crocoite-single + +crocoite-single ++++++++++++++++ + +Back-end to fetch a single page. + +.. option:: -b SET-COOKIE, --cookie SET-COOKIE + + Add cookie to browser’s cookie jar. This option always *appends* cookies, + overriding identically-named cookies provided by :option:`-c`. + + .. versionadded:: 1.1 + +.. option:: -c FILE, --cookie-jar FILE + + Load cookies from FILE. :program:`crocoite` provides a default cookie file, + which contains cookies to, for example, circumvent age restrictions. This + option *replaces* that default file. + + .. versionadded:: 1.1 + +.. option:: --idle-timeout SEC + + Time after which a page is considered “idle”. + +.. option:: -k, --insecure + + Allow insecure connections, i.e. self-signed or expired HTTPS certificates. + +.. option:: --timeout SEC + + Global archiving timeout. + + +.. option:: --warcinfo JSON + + Inject additional JSON-encoded information into the resulting WARC. + IRC bot ^^^^^^^ @@ -4,3 +4,5 @@ test=pytest addopts=--cov-report=html --cov-report=xml --cov=crocoite --cov-config=setup.cfg [coverage:run] branch=True +[build_sphinx] +builder=dirhtml |