From 5ad1cc9ef693e4832fc3be7617efccc782a37e3f Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 4 May 2019 19:05:54 +0300 Subject: cli: Allow adding extra data to warcinfo record --- crocoite/cli.py | 7 +++++-- crocoite/controller.py | 9 +++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/crocoite/cli.py b/crocoite/cli.py index d9ebc4d..4e64b97 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,7 +22,7 @@ Command line interface """ -import argparse, sys, signal, asyncio, os +import argparse, sys, signal, asyncio, os, json from traceback import TracebackException from enum import IntEnum from yarl import URL @@ -72,6 +72,8 @@ def single (): default=list (behavior.availableMap.keys ()), choices=list (behavior.availableMap.keys ()), metavar='NAME', nargs='*') + parser.add_argument('--warcinfo', help='Add extra information to warcinfo record', + metavar='JSON', type=json.loads) parser.add_argument('url', help='Website URL', type=URL, metavar='URL') parser.add_argument('output', help='WARC filename', metavar='FILE') @@ -89,7 +91,8 @@ def single (): handler = [StatsHandler (), LogHandler (logger), warcHandler] b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) controller = SinglePageController (url=args.url, settings=settings, - service=service, handler=handler, behavior=b, logger=logger) + service=service, handler=handler, behavior=b, logger=logger, + warcinfo=args.warcinfo) try: loop = asyncio.get_event_loop() run = asyncio.ensure_future (controller.run ()) diff --git a/crocoite/controller.py b/crocoite/controller.py index 772bf44..9105997 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -103,17 +103,20 @@ class SinglePageController: (stats, warc writer). """ - __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler') + __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler', + 'warcinfo') def __init__ (self, url, logger, \ service, behavior=cbehavior.available, \ - settings=defaultSettings, handler=None): + settings=defaultSettings, handler=None, \ + warcinfo=None): self.url = url self.service = service self.behavior = behavior self.settings = settings self.logger = logger.bind (context=type (self).__name__, url=url) self.handler = handler or [] + self.warcinfo = warcinfo def processItem (self, item): for h in self.handler: @@ -150,6 +153,8 @@ class SinglePageController: 'behavior': list (map (attrgetter('name'), enabledBehavior)), }, } + if self.warcinfo: + payload['extra'] = self.warcinfo self.processItem (ControllerStart (payload)) await l.navigate (self.url) -- cgit v1.2.3