summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--crocoite/cli.py33
-rw-r--r--crocoite/controller.py16
-rw-r--r--crocoite/data/cookies.txt9
-rw-r--r--crocoite/devtools.py25
-rw-r--r--crocoite/irc.py21
-rw-r--r--crocoite/test_controller.py50
-rw-r--r--doc/usage.rst80
-rw-r--r--setup.cfg2
8 files changed, 214 insertions, 22 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 93b742b..53a0b32 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -26,6 +26,8 @@ import argparse, sys, signal, asyncio, os, json
from traceback import TracebackException
from enum import IntEnum
from yarl import URL
+from http.cookies import SimpleCookie
+import pkg_resources
try:
import manhole
manhole.install (patch_fork=False, oneshot_on='USR1')
@@ -49,6 +51,29 @@ def absurl (s):
return u
raise argparse.ArgumentTypeError ('Must be absolute')
+def cookie (s):
+ """ argparse: Cookie """
+ c = SimpleCookie (s)
+ # for some reason the constructor does not raise an exception if the cookie
+ # supplied is invalid. It’ll simply be empty.
+ if len (c) != 1:
+ raise argparse.ArgumentTypeError ('Invalid cookie')
+ # we want a single Morsel
+ return next (iter (c.values ()))
+
+def cookiejar (f):
+ """ argparse: Cookies from file """
+ cookies = []
+ try:
+ with open (f, 'r') as fd:
+ for l in fd:
+ l = l.lstrip ()
+ if l and not l.startswith ('#'):
+ cookies.append (cookie (l))
+ except FileNotFoundError:
+ raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist')
+ return cookies
+
class SingleExitStatus(IntEnum):
""" Exit status for single-shot command line """
Ok = 0
@@ -68,9 +93,16 @@ def single ():
metavar='NAME', nargs='*')
parser.add_argument('--warcinfo', help='Add extra information to warcinfo record',
metavar='JSON', type=json.loads)
+ # re-using curl’s short/long switch names whenever possible
parser.add_argument('-k', '--insecure',
action='store_true',
help='Disable certificate validation')
+ parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE',
+ action='append', default=[], help='Cookies in Set-Cookie format.')
+ parser.add_argument ('-c', '--cookie-jar', dest='cookieJar',
+ type=cookiejar, metavar='FILE',
+ default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'),
+ help='Cookie jar file, read-only.')
parser.add_argument('url', help='Website URL', type=absurl, metavar='URL')
parser.add_argument('output', help='WARC filename', metavar='FILE')
@@ -86,6 +118,7 @@ def single ():
idleTimeout=args.idleTimeout,
timeout=args.timeout,
insecure=args.insecure,
+ cookies=args.cookieJar + args.cookie,
)
with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
logger.connect (WarcHandlerConsumer (warcHandler))
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 2a848e8..4c9c4b3 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -33,21 +33,19 @@ from . import behavior as cbehavior
from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated
from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
+from .devtools import toCookieParam
class ControllerSettings:
- __slots__ = ('idleTimeout', 'timeout', 'insecure')
+ __slots__ = ('idleTimeout', 'timeout', 'insecure', 'cookies')
- def __init__ (self, idleTimeout=2, timeout=10, insecure=False):
+ def __init__ (self, idleTimeout=2, timeout=10, insecure=False, cookies=None):
self.idleTimeout = idleTimeout
self.timeout = timeout
self.insecure = insecure
+ self.cookies = cookies or []
- def toDict (self):
- return dict (
- idleTimeout=self.idleTimeout,
- timeout=self.timeout,
- insecure=self.insecure,
- )
+ def __repr__ (self):
+ return f'<ControllerSettings idleTimeout={self.idleTimeout!r}, timeout={self.timeout!r}, insecure={self.insecure!r}, cookies={self.cookies!r}>'
defaultSettings = ControllerSettings ()
@@ -212,6 +210,7 @@ class SinglePageController:
# configure browser
tab = l.tab
await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure)
+ await tab.Network.setCookies (cookies=list (map (toCookieParam, self.settings.cookies)))
# not all behavior scripts are allowed for every URL, filter them
self._enabledBehavior = list (filter (lambda x: self.url in x,
@@ -232,6 +231,7 @@ class SinglePageController:
'timeout': self.settings.timeout,
'behavior': list (map (attrgetter('name'), self._enabledBehavior)),
'insecure': self.settings.insecure,
+ 'cookies': list (map (lambda x: x.OutputString(), self.settings.cookies)),
},
}
if self.warcinfo:
diff --git a/crocoite/data/cookies.txt b/crocoite/data/cookies.txt
new file mode 100644
index 0000000..6ac62c3
--- /dev/null
+++ b/crocoite/data/cookies.txt
@@ -0,0 +1,9 @@
+# Default cookies for crocoite. This file does *not* use Netscape’s cookie
+# file format. Lines are expected to be in Set-Cookie format.
+# And this line is a comment.
+
+# Reddit:
+# skip over 18 prompt
+over18=1; Domain=www.reddit.com
+# skip quarantined subreddit prompt
+_options={%22pref_quarantine_optin%22:true}; Domain=www.reddit.com
diff --git a/crocoite/devtools.py b/crocoite/devtools.py
index 412ab08..05680f1 100644
--- a/crocoite/devtools.py
+++ b/crocoite/devtools.py
@@ -25,6 +25,8 @@ Communication with Google Chrome through its DevTools protocol.
import json, asyncio, logging, os
from tempfile import mkdtemp
import shutil
+from http.cookies import Morsel
+
import aiohttp, websockets
from yarl import URL
@@ -366,3 +368,26 @@ class Passthrough:
async def __aexit__ (self, *exc):
return False
+def toCookieParam (m):
+ """
+ Convert Python’s http.cookies.Morsel to Chrome’s CookieParam, see
+ https://chromedevtools.github.io/devtools-protocol/1-3/Network#type-CookieParam
+ """
+
+ assert isinstance (m, Morsel)
+
+ out = {'name': m.key, 'value': m.value}
+
+ # unsupported by chrome
+ for k in ('max-age', 'comment', 'version'):
+ if m[k]:
+ raise ValueError (f'Unsupported cookie attribute {k} set, cannot convert')
+
+ for mname, cname in [('expires', None), ('path', None), ('domain', None), ('secure', None), ('httponly', 'httpOnly')]:
+ value = m[mname]
+ if value:
+ cname = cname or mname
+ out[cname] = value
+
+ return out
+
diff --git a/crocoite/irc.py b/crocoite/irc.py
index bd13831..d9c0634 100644
--- a/crocoite/irc.py
+++ b/crocoite/irc.py
@@ -22,7 +22,7 @@
IRC bot “chromebot”
"""
-import asyncio, argparse, json, tempfile, time, random, os
+import asyncio, argparse, json, tempfile, time, random, os, shlex
from datetime import datetime
from urllib.parse import urlsplit
from enum import IntEnum, unique
@@ -33,6 +33,7 @@ import bottom
import websockets
from .util import StrJsonEncoder
+from .cli import cookie
### helper functions ###
def prettyTimeDelta (seconds):
@@ -366,12 +367,13 @@ class ArgparseBot (bottom.Client):
async def onMessage (self, nick, target, message, **kwargs):
""" Message received """
- if target in self.channels and message.startswith (self.nick + ':'):
+ msgPrefix = self.nick + ':'
+ if target in self.channels and message.startswith (msgPrefix):
user = self.users[target].get (nick, User (nick))
reply = ReplyContext (client=self, target=target, user=user)
- # channel message that starts with our nick
- command = message.split (' ')[1:]
+ # shlex.split supports quoting arguments, which str.split() does not
+ command = shlex.split (message[len (msgPrefix):])
try:
args = self.parser.parse_args (command)
except Exception as e:
@@ -439,13 +441,14 @@ class Chromebot (ArgparseBot):
subparsers = parser.add_subparsers(help='Sub-commands')
archiveparser = subparsers.add_parser('a', help='Archive a site', add_help=False)
- #archiveparser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC', choices=[60, 1*60*60, 2*60*60])
- #archiveparser.add_argument('--idle-timeout', default=10, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC', choices=[1, 10, 20, 30, 60])
- #archiveparser.add_argument('--max-body-size', default=None, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, 100*1024*1024])
archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5))
archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0')
archiveparser.add_argument('--insecure', '-k',
help='Disable certificate checking', action='store_true')
+ # parsing the cookie here, so we can give an early feedback without
+ # waiting for the job to crash on invalid arguments.
+ archiveparser.add_argument('--cookie', '-b', type=cookie,
+ help='Add a cookie', action='append', default=[])
archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL')
archiveparser.set_defaults (func=self.handleArchive,
minPriv=NickMode.voice if self.needVoice else None)
@@ -501,12 +504,14 @@ class Chromebot (ArgparseBot):
'concurrency': args.concurrency,
}}
grabCmd = ['crocoite-single']
+ # prefix warcinfo with !, so it won’t get expanded
grabCmd.extend (['--warcinfo',
'!' + json.dumps (warcinfo, cls=StrJsonEncoder)])
+ for v in args.cookie:
+ grabCmd.extend (['--cookie', v.OutputString ()])
if args.insecure:
grabCmd.append ('--insecure')
grabCmd.extend (['{url}', '{dest}'])
- # prefix warcinfo with !, so it won’t get expanded
cmdline = ['crocoite',
'--tempdir', self.tempdir,
'--recursion', args.recursive,
diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py
index 7e79dbe..7216a42 100644
--- a/crocoite/test_controller.py
+++ b/crocoite/test_controller.py
@@ -151,3 +151,54 @@ async def test_idle_state_tracker ():
end = loop.time ()
assert (timeout-delta) < (end-start) < (timeout+delta)
+@pytest.fixture
+async def recordingServer ():
+ """ Simple HTTP server that records raw requests """
+ url = URL ('http://localhost:8080')
+ reqs = []
+ async def record (request):
+ reqs.append (request)
+ return web.Response(text='ok', content_type='text/plain')
+ app = web.Application()
+ app.add_routes([web.get(url.path, record)])
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite (runner, url.host, url.port)
+ await site.start()
+ yield url, reqs
+ await runner.cleanup ()
+
+from .test_devtools import tab, browser
+from http.cookies import Morsel, SimpleCookie
+
+@pytest.mark.asyncio
+async def test_set_cookies (tab, recordingServer):
+ """ Make sure cookies are set properly and only affect the domain they were
+ set for """
+
+ logger = Logger ()
+
+ url, reqs = recordingServer
+
+ cookies = []
+ c = Morsel ()
+ c.set ('foo', 'bar', '')
+ c['domain'] = 'localhost'
+ cookies.append (c)
+ c = Morsel ()
+ c.set ('buz', 'beef', '')
+ c['domain'] = 'nonexistent.example'
+ cookies.append (c)
+
+ settings = ControllerSettings (idleTimeout=1, timeout=60, cookies=cookies)
+ controller = SinglePageController (url=url, logger=logger,
+ service=Process (), behavior=[], settings=settings)
+ await asyncio.wait_for (controller.run (), settings.timeout*2)
+
+ assert len (reqs) == 1
+ req = reqs[0]
+ reqCookies = SimpleCookie (req.headers['cookie'])
+ assert len (reqCookies) == 1
+ c = next (iter (reqCookies.values ()))
+ assert c.key == cookies[0].key
+ assert c.value == cookies[0].value
diff --git a/doc/usage.rst b/doc/usage.rst
index 9bba693..c18f9fb 100644
--- a/doc/usage.rst
+++ b/doc/usage.rst
@@ -24,6 +24,8 @@ Otherwise page screenshots may be unusable due to missing glyphs.
Recursion
^^^^^^^^^
+.. program:: crocoite
+
By default crocoite will only retrieve the URL specified on the command line.
However it can follow links as well. There’s currently two recursion strategies
available, depth- and prefix-based.
@@ -59,16 +61,18 @@ each page of a single job and should always be used.
When running a recursive job, increasing the concurrency (i.e. how many pages
are fetched at the same time) can speed up the process. For example you can
-pass :option:`-j 4` to retrieve four pages at the same time. Keep in mind that each
-process starts a full browser that requires a lot of resources (one to two GB
-of RAM and one or two CPU cores).
+pass :option:`-j` :samp:`4` to retrieve four pages at the same time. Keep in mind
+that each process starts a full browser that requires a lot of resources (one
+to two GB of RAM and one or two CPU cores).
Customizing
^^^^^^^^^^^
-Under the hood crocoite starts one instance of :program:`crocoite-single` to fetch
-each page. You can customize its options by appending a command template like
-this:
+.. program:: crocoite-single
+
+Under the hood :program:`crocoite` starts one instance of
+:program:`crocoite-single` to fetch each page. You can customize its options by
+appending a command template like this:
.. code:: bash
@@ -79,6 +83,70 @@ This reduces the global timeout to 5 seconds and ignores TLS errors. If an
option is prefixed with an exclamation mark (``!``) it will not be expanded.
This is useful for passing :option:`--warcinfo`, which expects JSON-encoded data.
+Command line options
+^^^^^^^^^^^^^^^^^^^^
+
+Below is a list of all command line arguments available:
+
+.. program:: crocoite
+
+crocoite
+++++++++
+
+Front-end with recursion support and simple job management.
+
+.. option:: -j N, --concurrency N
+
+ Maximum number of concurrent fetch jobs.
+
+.. option:: -r POLICY, --recursion POLICY
+
+ Enables recursion based on POLICY, which can be a positive integer
+ (recursion depth) or the string :kbd:`prefix`.
+
+.. option:: --tempdir DIR
+
+ Directory for temporary WARC files.
+
+.. program:: crocoite-single
+
+crocoite-single
++++++++++++++++
+
+Back-end to fetch a single page.
+
+.. option:: -b SET-COOKIE, --cookie SET-COOKIE
+
+ Add cookie to browser’s cookie jar. This option always *appends* cookies
+ after those provided by :option:`-c`, overriding duplicates of the same name.
+
+ .. versionadded:: 1.1
+
+.. option:: -c FILE, --cookie-jar FILE
+
+ Load cookies from FILE. :program:`crocoite` provides a default cookie file,
+ which contains cookies to, for example, circumvent age restrictions. This
+ option *replaces* that default file.
+
+ .. versionadded:: 1.1
+
+.. option:: --idle-timeout SEC
+
+ Time after which a page is considered “idle”.
+
+.. option:: -k, --insecure
+
+ Allow insecure connections, i.e. self-signed or expired HTTPS certificates.
+
+.. option:: --timeout SEC
+
+ Global archiving timeout.
+
+
+.. option:: --warcinfo JSON
+
+ Inject additional JSON-encoded information into the resulting WARC.
+
IRC bot
^^^^^^^
diff --git a/setup.cfg b/setup.cfg
index 32dfadf..ec7d730 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,3 +4,5 @@ test=pytest
addopts=--cov-report=html --cov-report=xml --cov=crocoite --cov-config=setup.cfg
[coverage:run]
branch=True
+[build_sphinx]
+builder=dirhtml