summary refs log tree commit diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-05-04 19:05:54 +0300
committer Lars-Dominik Braun <lars@6xq.net> 2019-05-05 09:33:01 +0300
commit 5ad1cc9ef693e4832fc3be7617efccc782a37e3f (patch)
tree f77d92e99e014bd8fef0d2c7d623322df52fa361
parent 002b9f5a766699aa280ee1e96b308752f0fd557b (diff)
download crocoite-5ad1cc9ef693e4832fc3be7617efccc782a37e3f.tar.gz
crocoite-5ad1cc9ef693e4832fc3be7617efccc782a37e3f.tar.bz2
crocoite-5ad1cc9ef693e4832fc3be7617efccc782a37e3f.zip
cli: Allow adding extra data to warcinfo record
-rw-r--r-- crocoite/cli.py 7
-rw-r--r-- crocoite/controller.py 9
2 files changed, 12 insertions, 4 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index d9ebc4d..4e64b97 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,7 +22,7 @@
Command line interface
"""
-import argparse, sys, signal, asyncio, os
+import argparse, sys, signal, asyncio, os, json
from traceback import TracebackException
from enum import IntEnum
from yarl import URL
@@ -72,6 +72,8 @@ def single ():
default=list (behavior.availableMap.keys ()),
choices=list (behavior.availableMap.keys ()),
metavar='NAME', nargs='*')
+ parser.add_argument('--warcinfo', help='Add extra information to warcinfo record',
+ metavar='JSON', type=json.loads)
parser.add_argument('url', help='Website URL', type=URL, metavar='URL')
parser.add_argument('output', help='WARC filename', metavar='FILE')
@@ -89,7 +91,8 @@ def single ():
handler = [StatsHandler (), LogHandler (logger), warcHandler]
b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
controller = SinglePageController (url=args.url, settings=settings,
- service=service, handler=handler, behavior=b, logger=logger)
+ service=service, handler=handler, behavior=b, logger=logger,
+ warcinfo=args.warcinfo)
try:
loop = asyncio.get_event_loop()
run = asyncio.ensure_future (controller.run ())
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 772bf44..9105997 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -103,17 +103,20 @@ class SinglePageController:
(stats, warc writer).
"""
- __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler')
+ __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler',
+ 'warcinfo')
def __init__ (self, url, logger, \
service, behavior=cbehavior.available, \
- settings=defaultSettings, handler=None):
+ settings=defaultSettings, handler=None, \
+ warcinfo=None):
self.url = url
self.service = service
self.behavior = behavior
self.settings = settings
self.logger = logger.bind (context=type (self).__name__, url=url)
self.handler = handler or []
+ self.warcinfo = warcinfo
def processItem (self, item):
for h in self.handler:
@@ -150,6 +153,8 @@ class SinglePageController:
'behavior': list (map (attrgetter('name'), enabledBehavior)),
},
}
+ if self.warcinfo:
+ payload['extra'] = self.warcinfo
self.processItem (ControllerStart (payload))
await l.navigate (self.url)