summaryrefslogtreecommitdiff
path: root/crocoite/tools.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-12-08 09:05:12 +0100
committerLars-Dominik Braun <lars@6xq.net>2018-12-08 09:05:45 +0100
commit6ccd72ab96cfba36c217a77641b3b8a91906c512 (patch)
tree0a96f837e8ef4776af1b87aa7dd43edc6c55be3a /crocoite/tools.py
parentaec7a8c583c8228e9538c923d39ef80862bafdde (diff)
downloadcrocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.gz
crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.bz2
crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.zip
tools: Add version info to merged WARCs
In preparation for #9. I was hoping to reuse one of schema.org’s microdata schemas, but neither Action (archival action) nor SoftwareApplication (version information) seems to be suitable.
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r--crocoite/tools.py24
1 files changed, 23 insertions, 1 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 843270e..e2dc6a7 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -22,15 +22,37 @@
Misc tools
"""
-import shutil, sys, os, logging, argparse
+import shutil, sys, os, logging, argparse, json
+from io import BytesIO
from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
+from .util import packageUrl, getSoftwareInfo
def mergeWarc (files, output):
unique = 0
revisit = 0
payloadMap = {}
writer = WARCWriter (output, gzip=True)
+
+ # Add an additional warcinfo record, describing the transformations. This
+ # is not ideal, since
+ # “A ‘warcinfo’ record describes the records that
+ # follow it […] until next ‘warcinfo’”
+ # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+ # A warcinfo record is expected at the beginning of every file. But it
+ # might have been written by a different software, so we don’t want to
+ # strip/replace that information, but supplement it.
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-merge', # not the name of the cli tool
+ 'parameters': {'inputs': files},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ writer.write_record (record)
+
for l in files:
with open (l, 'rb') as fd:
for record in ArchiveIterator (fd):