diff options
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/cli.py | 9 | ||||
| -rw-r--r-- | crocoite/controller.py | 22 | ||||
| -rw-r--r-- | crocoite/irc.py | 17 | 
3 files changed, 37 insertions, 11 deletions
| diff --git a/crocoite/cli.py b/crocoite/cli.py index cccf728..3294061 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -74,6 +74,9 @@ def single ():              metavar='NAME', nargs='*')      parser.add_argument('--warcinfo', help='Add extra information to warcinfo record',              metavar='JSON', type=json.loads) +    parser.add_argument('-k', '--insecure', +            action='store_true', +            help='Disable certificate validation')      parser.add_argument('url', help='Website URL', type=URL, metavar='URL')      parser.add_argument('output', help='WARC filename', metavar='FILE') @@ -85,7 +88,11 @@ def single ():      service = Process ()      if args.browser:          service = Passthrough (args.browser) -    settings = ControllerSettings (idleTimeout=args.idleTimeout, timeout=args.timeout) +    settings = ControllerSettings ( +            idleTimeout=args.idleTimeout, +            timeout=args.timeout, +            insecure=args.insecure, +            )      with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:          logger.connect (WarcHandlerConsumer (warcHandler))          handler = [StatsHandler (), LogHandler (logger), warcHandler] diff --git a/crocoite/controller.py b/crocoite/controller.py index 1bcca0f..02017c3 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -35,14 +35,19 @@ from .util import getFormattedViewportMetrics, getSoftwareInfo  from .behavior import ExtractLinksEvent  class ControllerSettings: -    __slots__ = ('idleTimeout', 'timeout') +    __slots__ = ('idleTimeout', 'timeout', 'insecure') -    def __init__ (self, idleTimeout=2, timeout=10): +    def __init__ (self, idleTimeout=2, timeout=10, insecure=False):          self.idleTimeout = idleTimeout          self.timeout = timeout +        self.insecure = insecure      def toDict (self): -        return dict (idleTimeout=self.idleTimeout, timeout=self.timeout) +        return dict ( +                idleTimeout=self.idleTimeout, +                timeout=self.timeout, +                insecure=self.insecure, +                )  defaultSettings = ControllerSettings () @@ -204,17 +209,21 @@ class SinglePageController:              handle = asyncio.ensure_future (processQueue ())              timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout)) +            # configure browser +            tab = l.tab +            await tab.Security.setIgnoreCertificateErrors (ignore=self.settings.insecure) +              # not all behavior scripts are allowed for every URL, filter them              self._enabledBehavior = list (filter (lambda x: self.url in x,                      map (lambda x: x (l, logger), self.behavior))) -            version = await l.tab.Browser.getVersion () +            version = await tab.Browser.getVersion ()              payload = {                      'software': getSoftwareInfo (),                      'browser': {                          'product': version['product'],                          'useragent': version['userAgent'], -                        'viewport': await getFormattedViewportMetrics (l.tab), +                        'viewport': await getFormattedViewportMetrics (tab),                          },                      'tool': 'crocoite-single', # not the name of the cli utility                      'parameters': { @@ -222,6 +231,7 @@ class SinglePageController:                          'idleTimeout': self.settings.idleTimeout,                          'timeout': self.settings.timeout,                          'behavior': list (map (attrgetter('name'), self._enabledBehavior)), +                        'insecure': self.settings.insecure,                          },                      }              if self.warcinfo: @@ -264,7 +274,7 @@ class SinglePageController:                      break              await behavior.stop () -            await l.tab.Page.stopLoading () +            await tab.Page.stopLoading ()              await asyncio.sleep (1)              await behavior.finish () diff --git a/crocoite/irc.py b/crocoite/irc.py index 8e7061a..d0b5bb9 100644 --- a/crocoite/irc.py +++ b/crocoite/irc.py @@ -444,6 +444,8 @@ class Chromebot (ArgparseBot):          #archiveparser.add_argument('--max-body-size', default=None, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, 100*1024*1024])          archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (1, 5))          archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0') +        archiveparser.add_argument('--insecure', '-k', +                help='Disable certificate checking', action='store_true')          archiveparser.add_argument('url', help='Website URL', type=isValidUrl, metavar='URL')          archiveparser.set_defaults (func=self.handleArchive,                  minPriv=NickMode.voice if self.needVoice else None) @@ -488,20 +490,27 @@ class Chromebot (ArgparseBot):                  'recursive': args.recursive,                  'concurrency': args.concurrency,                  } +        if args.insecure: +            showargs['insecure'] = args.insecure          warcinfo = {'chromebot': {                  'jobid': j.id,                  'user': user.name,                  'queued': j.started,                  'url': args.url, +                'recursive': args.recursive, +                'concurrency': args.concurrency,                  }} -        warcinfo['chromebot'].update (showargs) +        grabCmd = ['crocoite-grab'] +        grabCmd.extend (['--warcinfo', +                '!' + json.dumps (warcinfo, cls=StrJsonEncoder)]) +        if args.insecure: +            grabCmd.append ('--insecure') +        grabCmd.extend (['{url}', '{dest}'])          # prefix warcinfo with !, so it won’t get expanded          cmdline = ['crocoite-recursive', args.url, '--tempdir', self.tempdir,                  '--prefix', j.id + '-{host}-{date}-', '--policy',                  args.recursive, '--concurrency', str (args.concurrency), -                self.destdir, '--', 'crocoite-grab', '--warcinfo', -                '!' + json.dumps (warcinfo, cls=StrJsonEncoder), '{url}', -                '{dest}'] +                self.destdir, '--'] + grabCmd          strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ()))          reply (f'{args.url} has been queued as {j.id} with {strargs}') | 
