diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2019-07-11 10:59:05 +0200 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2019-07-13 10:32:56 +0200 | 
| commit | 4905ac083b5f570988446a2b9dde3a8747020f1a (patch) | |
| tree | c9866b6831835526e5dbee038cf48df3de628a82 /crocoite | |
| parent | 8761275f1f569b747cb26578e1c3411e108fb8dd (diff) | |
| download | crocoite-4905ac083b5f570988446a2b9dde3a8747020f1a.tar.gz crocoite-4905ac083b5f570988446a2b9dde3a8747020f1a.tar.bz2 crocoite-4905ac083b5f570988446a2b9dde3a8747020f1a.zip  | |
Cookie injection support
Add command-line options for injecting individual cookies or a cookie file
into Chrome. Provide a default cookie file.
This changes the IRC bot’s command splitting to shlex.split, which
allows shell-like argument quoting.
Fixes #7.
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/cli.py | 33 | ||||
| -rw-r--r-- | crocoite/controller.py | 16 | ||||
| -rw-r--r-- | crocoite/data/cookies.txt | 9 | ||||
| -rw-r--r-- | crocoite/devtools.py | 25 | ||||
| -rw-r--r-- | crocoite/irc.py | 21 | ||||
| -rw-r--r-- | crocoite/test_controller.py | 50 | 
6 files changed, 138 insertions, 16 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index 93b742b..53a0b32 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -26,6 +26,8 @@ import argparse, sys, signal, asyncio, os, json  from traceback import TracebackException  from enum import IntEnum  from yarl import URL +from http.cookies import SimpleCookie +import pkg_resources  try:      import manhole      manhole.install (patch_fork=False, oneshot_on='USR1') @@ -49,6 +51,29 @@ def absurl (s):          return u      raise argparse.ArgumentTypeError ('Must be absolute') +def cookie (s): +    """ argparse: Cookie """ +    c = SimpleCookie (s) +    # for some reason the constructor does not raise an exception if the cookie +    # supplied is invalid. It’ll simply be empty. +    if len (c) != 1: +        raise argparse.ArgumentTypeError ('Invalid cookie') +    # we want a single Morsel +    return next (iter (c.values ())) + +def cookiejar (f): +    """ argparse: Cookies from file """ +    cookies = [] +    try: +        with open (f, 'r') as fd: +            for l in fd: +                l = l.lstrip () +                if l and not l.startswith ('#'): +                    cookies.append (cookie (l)) +    except FileNotFoundError: +        raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist') +    return cookies +  class SingleExitStatus(IntEnum):      """ Exit status for single-shot command line """      Ok = 0 @@ -68,9 +93,16 @@ def single ():              metavar='NAME', nargs='*')      parser.add_argument('--warcinfo', help='Add extra information to warcinfo record',              metavar='JSON', type=json.loads) +    # re-using curl’s short/long switch names whenever possible      parser.add_argument('-k', '--insecure',              action='store_true',              help='Disable certificate validation') +    parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE', +            action='append', default=[], help='Cookies in Set-Cookie format.') +    parser.add_argument 
('-c', '--cookie-jar', dest='cookieJar', +            type=cookiejar, metavar='FILE', +            default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'), +            help='Cookie jar file, read-only.')      parser.add_argument('url', help='Website URL', type=absurl, metavar='URL')      parser.add_argument('output', help='WARC filename', metavar='FILE') @@ -86,6 +118,7 @@ def single ():              idleTimeout=args.idleTimeout,              timeout=args.timeout,              insecure=args.insecure, +            cookies=args.cookieJar + args.cookie,              )      with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:          logger.connect (WarcHandlerConsumer (warcHandler)) diff --git a/crocoite/controller.py b/crocoite/controller.py index 2a848e8..4c9c4b3 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -33,21 +33,19 @@ from . import behavior as cbehavior  from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated  from .util import getFormattedViewportMetrics, getSoftwareInfo  from .behavior import ExtractLinksEvent +from .devtools import toCookieParam  class ControllerSettings: -    __slots__ = ('idleTimeout', 'timeout', 'insecure') +    __slots__ = ('idleTimeout', 'timeout', 'insecure', 'cookies') -    def __init__ (self, idleTimeout=2, timeout=10, insecure=False): +    def __init__ (self, idleTimeout=2, timeout=10, insecure=False, cookies=None):          self.idleTimeout = idleTimeout          self.timeout = timeout          self.insecure = insecure +        self.cookies = cookies or [] -    def toDict (self): -        return dict ( -                idleTimeout=self.idleTimeout, -                timeout=self.timeout, -                insecure=self.insecure, -                ) +    def __repr__ (self): +        return f'<ControllerSetting idleTimeout={self.idleTimeout!r}, timeout={self.timeout!r}, insecure={self.insecure!r}, cookies={self.cookies!r}>'  defaultSettings = 
ControllerSettings () @@ -212,6 +210,7 @@ class SinglePageController:              # configure browser              tab = l.tab              await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure) +            await tab.Network.setCookies (cookies=list (map (toCookieParam, self.settings.cookies)))              # not all behavior scripts are allowed for every URL, filter them              self._enabledBehavior = list (filter (lambda x: self.url in x, @@ -232,6 +231,7 @@ class SinglePageController:                          'timeout': self.settings.timeout,                          'behavior': list (map (attrgetter('name'), self._enabledBehavior)),                          'insecure': self.settings.insecure, +                        'cookies': list (map (lambda x: x.OutputString(), self.settings.cookies)),                          },                      }              if self.warcinfo: diff --git a/crocoite/data/cookies.txt b/crocoite/data/cookies.txt new file mode 100644 index 0000000..6ac62c3 --- /dev/null +++ b/crocoite/data/cookies.txt @@ -0,0 +1,9 @@ +# Default cookies for crocoite. This file does *not* use Netscape’s cookie +# file format. Lines are expected to be in Set-Cookie format. +# And this line is a comment. + +# Reddit: +# skip over 18 prompt +over18=1; Domain=www.reddit.com +# skip quarantined subreddit prompt +_options={%22pref_quarantine_optin%22:true}; Domain=www.reddit.com diff --git a/crocoite/devtools.py b/crocoite/devtools.py index 412ab08..05680f1 100644 --- a/crocoite/devtools.py +++ b/crocoite/devtools.py @@ -25,6 +25,8 @@ Communication with Google Chrome through its DevTools protocol.  
import json, asyncio, logging, os  from tempfile import mkdtemp  import shutil +from http.cookies import Morsel +  import aiohttp, websockets  from yarl import URL @@ -366,3 +368,26 @@ class Passthrough:      async def __aexit__ (self, *exc):          return False +def toCookieParam (m): +    """ +    Convert Python’s http.cookies.Morsel to Chrome’s CookieParam, see +    https://chromedevtools.github.io/devtools-protocol/1-3/Network#type-CookieParam +    """ + +    assert isinstance (m, Morsel) + +    out = {'name': m.key, 'value': m.value} + +    # unsupported by chrome +    for k in ('max-age', 'comment', 'version'): +        if m[k]: +            raise ValueError (f'Unsupported cookie attribute {k} set, cannot convert') + +    for mname, cname in [('expires', None), ('path', None), ('domain', None), ('secure', None), ('httponly', 'httpOnly')]: +        value = m[mname] +        if value: +            cname = cname or mname +            out[cname] = value + +    return out + diff --git a/crocoite/irc.py b/crocoite/irc.py index bd13831..d9c0634 100644 --- a/crocoite/irc.py +++ b/crocoite/irc.py @@ -22,7 +22,7 @@  IRC bot “chromebot”  """ -import asyncio, argparse, json, tempfile, time, random, os +import asyncio, argparse, json, tempfile, time, random, os, shlex  from datetime import datetime  from urllib.parse import urlsplit  from enum import IntEnum, unique @@ -33,6 +33,7 @@ import bottom  import websockets  from .util import StrJsonEncoder +from .cli import cookie  ### helper functions ###  def prettyTimeDelta (seconds): @@ -366,12 +367,13 @@ class ArgparseBot (bottom.Client):      async def onMessage (self, nick, target, message, **kwargs):          """ Message received """ -        if target in self.channels and message.startswith (self.nick + ':'): +        msgPrefix = self.nick + ':' +        if target in self.channels and message.startswith (msgPrefix):              user = self.users[target].get (nick, User (nick))              reply = ReplyContext 
(client=self, target=target, user=user) -            # channel message that starts with our nick -            command = message.split (' ')[1:] +            # shlex.split supports quoting arguments, which str.split() does not +            command = shlex.split (message[len (msgPrefix):])              try:                  args = self.parser.parse_args (command)              except Exception as e: @@ -439,13 +441,14 @@ class Chromebot (ArgparseBot):          subparsers = parser.add_subparsers(help='Sub-commands')          archiveparser = subparsers.add_parser('a', help='Archive a site', add_help=False) -        #archiveparser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC', choices=[60, 1*60*60, 2*60*60]) -        #archiveparser.add_argument('--idle-timeout', default=10, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC', choices=[1, 10, 20, 30, 60]) -        #archiveparser.add_argument('--max-body-size', default=None, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, 100*1024*1024])          archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5))          archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0')          archiveparser.add_argument('--insecure', '-k',                  help='Disable certificate checking', action='store_true') +        # parsing the cookie here, so we can give an early feedback without +        # waiting for the job to crash on invalid arguments. 
+        archiveparser.add_argument('--cookie', '-b', type=cookie, +                help='Add a cookie', action='append', default=[])          archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL')          archiveparser.set_defaults (func=self.handleArchive,                  minPriv=NickMode.voice if self.needVoice else None) @@ -501,12 +504,14 @@ class Chromebot (ArgparseBot):                  'concurrency': args.concurrency,                  }}          grabCmd = ['crocoite-single'] +        # prefix warcinfo with !, so it won’t get expanded          grabCmd.extend (['--warcinfo',                  '!' + json.dumps (warcinfo, cls=StrJsonEncoder)]) +        for v in args.cookie: +            grabCmd.extend (['--cookie', v.OutputString ()])          if args.insecure:              grabCmd.append ('--insecure')          grabCmd.extend (['{url}', '{dest}']) -        # prefix warcinfo with !, so it won’t get expanded          cmdline = ['crocoite',                  '--tempdir', self.tempdir,                  '--recursion', args.recursive, diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py index 7e79dbe..7216a42 100644 --- a/crocoite/test_controller.py +++ b/crocoite/test_controller.py @@ -151,3 +151,53 @@ async def test_idle_state_tracker ():      end = loop.time ()      assert (timeout-delta) < (end-start) < (timeout+delta) +@pytest.fixture +async def recordingServer (): +    """ Simple HTTP server that records raw requests """ +    url = URL ('http://localhost:8080') +    reqs = [] +    async def record (request): +        reqs.append (request) +        return web.Response(text='ok', content_type='text/plain') +    app = web.Application() +    app.add_routes([web.get(url.path, record)]) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite (runner, url.host, url.port) +    await site.start() +    yield url, reqs +    await runner.cleanup () + +from .test_devtools import tab, browser +from 
http.cookies import Morsel, SimpleCookie + +@pytest.mark.asyncio +async def test_set_cookies (tab, recordingServer): +    """ Make sure cookies are set properly and only affect the domain they were +    set for """ + +    logger = Logger () + +    url, reqs = recordingServer + +    cookies = [] +    c = Morsel () +    c.set ('foo', 'bar', '') +    c['domain'] = 'localhost' +    cookies.append (c) +    c = Morsel () +    c.set ('buz', 'beef', '') +    c['domain'] = 'nonexistent.example' + +    settings = ControllerSettings (idleTimeout=1, timeout=60, cookies=cookies) +    controller = SinglePageController (url=url, logger=logger, +            service=Process (), behavior=[], settings=settings) +    await asyncio.wait_for (controller.run (), settings.timeout*2) +     +    assert len (reqs) == 1 +    req = reqs[0] +    reqCookies = SimpleCookie (req.headers['cookie']) +    assert len (reqCookies) == 1 +    c = next (iter (reqCookies.values ())) +    assert c.key == cookies[0].key +    assert c.value == cookies[0].value  | 
