summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- crocoite/cli.py 9
-rw-r--r-- crocoite/controller.py 22
-rw-r--r-- crocoite/irc.py 17
3 files changed, 37 insertions, 11 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index cccf728..3294061 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -74,6 +74,9 @@ def single ():
metavar='NAME', nargs='*')
parser.add_argument('--warcinfo', help='Add extra information to warcinfo record',
metavar='JSON', type=json.loads)
+ parser.add_argument('-k', '--insecure',
+ action='store_true',
+ help='Disable certificate validation')
parser.add_argument('url', help='Website URL', type=URL, metavar='URL')
parser.add_argument('output', help='WARC filename', metavar='FILE')
@@ -85,7 +88,11 @@ def single ():
service = Process ()
if args.browser:
service = Passthrough (args.browser)
- settings = ControllerSettings (idleTimeout=args.idleTimeout, timeout=args.timeout)
+ settings = ControllerSettings (
+ idleTimeout=args.idleTimeout,
+ timeout=args.timeout,
+ insecure=args.insecure,
+ )
with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
logger.connect (WarcHandlerConsumer (warcHandler))
handler = [StatsHandler (), LogHandler (logger), warcHandler]
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 1bcca0f..02017c3 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -35,14 +35,19 @@ from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
class ControllerSettings:
- __slots__ = ('idleTimeout', 'timeout')
+ __slots__ = ('idleTimeout', 'timeout', 'insecure')
- def __init__ (self, idleTimeout=2, timeout=10):
+ def __init__ (self, idleTimeout=2, timeout=10, insecure=False):
self.idleTimeout = idleTimeout
self.timeout = timeout
+ self.insecure = insecure
def toDict (self):
- return dict (idleTimeout=self.idleTimeout, timeout=self.timeout)
+ return dict (
+ idleTimeout=self.idleTimeout,
+ timeout=self.timeout,
+ insecure=self.insecure,
+ )
defaultSettings = ControllerSettings ()
@@ -204,17 +209,21 @@ class SinglePageController:
handle = asyncio.ensure_future (processQueue ())
timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout))
+ # configure browser
+ tab = l.tab
+ await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure)
+
# not all behavior scripts are allowed for every URL, filter them
self._enabledBehavior = list (filter (lambda x: self.url in x,
map (lambda x: x (l, logger), self.behavior)))
- version = await l.tab.Browser.getVersion ()
+ version = await tab.Browser.getVersion ()
payload = {
'software': getSoftwareInfo (),
'browser': {
'product': version['product'],
'useragent': version['userAgent'],
- 'viewport': await getFormattedViewportMetrics (l.tab),
+ 'viewport': await getFormattedViewportMetrics (tab),
},
'tool': 'crocoite-single', # not the name of the cli utility
'parameters': {
@@ -222,6 +231,7 @@ class SinglePageController:
'idleTimeout': self.settings.idleTimeout,
'timeout': self.settings.timeout,
'behavior': list (map (attrgetter('name'), self._enabledBehavior)),
+ 'insecure': self.settings.insecure,
},
}
if self.warcinfo:
@@ -264,7 +274,7 @@ class SinglePageController:
break
await behavior.stop ()
- await l.tab.Page.stopLoading ()
+ await tab.Page.stopLoading ()
await asyncio.sleep (1)
await behavior.finish ()
diff --git a/crocoite/irc.py b/crocoite/irc.py
index 8e7061a..d0b5bb9 100644
--- a/crocoite/irc.py
+++ b/crocoite/irc.py
@@ -444,6 +444,8 @@ class Chromebot (ArgparseBot):
#archiveparser.add_argument('--max-body-size', default=None, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, 100*1024*1024])
archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5))
archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0')
+ archiveparser.add_argument('--insecure', '-k',
+ help='Disable certificate checking', action='store_true')
archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL')
archiveparser.set_defaults (func=self.handleArchive,
minPriv=NickMode.voice if self.needVoice else None)
@@ -488,20 +490,27 @@ class Chromebot (ArgparseBot):
'recursive': args.recursive,
'concurrency': args.concurrency,
}
+ if args.insecure:
+ showargs['insecure'] = args.insecure
warcinfo = {'chromebot': {
'jobid': j.id,
'user': user.name,
'queued': j.started,
'url': args.url,
+ 'recursive': args.recursive,
+ 'concurrency': args.concurrency,
}}
- warcinfo['chromebot'].update (showargs)
+ grabCmd = ['crocoite-grab']
+ grabCmd.extend (['--warcinfo',
+ '!' + json.dumps (warcinfo, cls=StrJsonEncoder)])
+ if args.insecure:
+ grabCmd.append ('--insecure')
+ grabCmd.extend (['{url}', '{dest}'])
# prefix warcinfo with !, so it won’t get expanded
cmdline = ['crocoite-recursive', args.url, '--tempdir', self.tempdir,
'--prefix', j.id + '-{host}-{date}-', '--policy',
args.recursive, '--concurrency', str (args.concurrency),
- self.destdir, '--', 'crocoite-grab', '--warcinfo',
- '!' + json.dumps (warcinfo, cls=StrJsonEncoder), '{url}',
- '{dest}']
+ self.destdir, '--'] + grabCmd
strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ()))
reply (f'{args.url} has been queued as {j.id} with {strargs}')