From 94be61a303127335eb39b26302392a0f237c3ac1 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Wed, 26 Jun 2019 16:44:55 +0200 Subject: Allow turning off cert validation Add --insecure switch (shamelessly stolen from CURL) to both, -grab and -irc. --- crocoite/cli.py | 9 ++++++++- crocoite/controller.py | 22 ++++++++++++++++------ crocoite/irc.py | 17 +++++++++++++---- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/crocoite/cli.py b/crocoite/cli.py index cccf728..3294061 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -74,6 +74,9 @@ def single (): metavar='NAME', nargs='*') parser.add_argument('--warcinfo', help='Add extra information to warcinfo record', metavar='JSON', type=json.loads) + parser.add_argument('-k', '--insecure', + action='store_true', + help='Disable certificate validation') parser.add_argument('url', help='Website URL', type=URL, metavar='URL') parser.add_argument('output', help='WARC filename', metavar='FILE') @@ -85,7 +88,11 @@ def single (): service = Process () if args.browser: service = Passthrough (args.browser) - settings = ControllerSettings (idleTimeout=args.idleTimeout, timeout=args.timeout) + settings = ControllerSettings ( + idleTimeout=args.idleTimeout, + timeout=args.timeout, + insecure=args.insecure, + ) with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: logger.connect (WarcHandlerConsumer (warcHandler)) handler = [StatsHandler (), LogHandler (logger), warcHandler] diff --git a/crocoite/controller.py b/crocoite/controller.py index 1bcca0f..02017c3 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -35,14 +35,19 @@ from .util import getFormattedViewportMetrics, getSoftwareInfo from .behavior import ExtractLinksEvent class ControllerSettings: - __slots__ = ('idleTimeout', 'timeout') + __slots__ = ('idleTimeout', 'timeout', 'insecure') - def __init__ (self, idleTimeout=2, timeout=10): + def __init__ (self, idleTimeout=2, timeout=10, insecure=False): self.idleTimeout = idleTimeout self.timeout = timeout + self.insecure = insecure def toDict (self): - return dict (idleTimeout=self.idleTimeout, timeout=self.timeout) + return dict ( + idleTimeout=self.idleTimeout, + timeout=self.timeout, + insecure=self.insecure, + ) defaultSettings = ControllerSettings () @@ -204,17 +209,21 @@ class SinglePageController: handle = asyncio.ensure_future (processQueue ()) timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout)) + # configure browser + tab = l.tab + await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure) + # not all behavior scripts are allowed for every URL, filter them self._enabledBehavior = list (filter (lambda x: self.url in x, map (lambda x: x (l, logger), self.behavior))) - version = await l.tab.Browser.getVersion () + version = await tab.Browser.getVersion () payload = { 'software': getSoftwareInfo (), 'browser': { 'product': version['product'], 'useragent': version['userAgent'], - 'viewport': await getFormattedViewportMetrics (l.tab), + 'viewport': await getFormattedViewportMetrics (tab), }, 'tool': 'crocoite-single', # not the name of the cli utility 'parameters': { @@ -222,6 +231,7 @@ class SinglePageController: 'idleTimeout': self.settings.idleTimeout, 'timeout': self.settings.timeout, 'behavior': list (map (attrgetter('name'), self._enabledBehavior)), + 'insecure': self.settings.insecure, }, } if self.warcinfo: @@ -264,7 +274,7 @@ class SinglePageController: break await behavior.stop () - await l.tab.Page.stopLoading () + await tab.Page.stopLoading () await asyncio.sleep (1) await behavior.finish () diff --git a/crocoite/irc.py b/crocoite/irc.py index 8e7061a..d0b5bb9 100644 --- a/crocoite/irc.py +++ b/crocoite/irc.py @@ -444,6 +444,8 @@ class Chromebot (ArgparseBot): #archiveparser.add_argument('--max-body-size', default=None, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, 100*1024*1024]) archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5)) archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0') + archiveparser.add_argument('--insecure', '-k', + help='Disable certificate checking', action='store_true') archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL') archiveparser.set_defaults (func=self.handleArchive, minPriv=NickMode.voice if self.needVoice else None) @@ -488,20 +490,27 @@ class Chromebot (ArgparseBot): 'recursive': args.recursive, 'concurrency': args.concurrency, } + if args.insecure: + showargs['insecure'] = args.insecure warcinfo = {'chromebot': { 'jobid': j.id, 'user': user.name, 'queued': j.started, 'url': args.url, + 'recursive': args.recursive, + 'concurrency': args.concurrency, }} - warcinfo['chromebot'].update (showargs) + grabCmd = ['crocoite-grab'] + grabCmd.extend (['--warcinfo', + '!' + json.dumps (warcinfo, cls=StrJsonEncoder)]) + if args.insecure: + grabCmd.append ('--insecure') + grabCmd.extend (['{url}', '{dest}']) # prefix warcinfo with !, so it won’t get expanded cmdline = ['crocoite-recursive', args.url, '--tempdir', self.tempdir, '--prefix', j.id + '-{host}-{date}-', '--policy', args.recursive, '--concurrency', str (args.concurrency), - self.destdir, '--', 'crocoite-grab', '--warcinfo', - '!' + json.dumps (warcinfo, cls=StrJsonEncoder), '{url}', - '{dest}'] + self.destdir, '--'] + grabCmd strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) reply (f'{args.url} has been queued as {j.id} with {strargs}') -- cgit v1.2.3