tools: Add version info to merged WARCs

In preparation for #9. I was hoping to reuse one of schema.org’s microdata schemas, but neither Action (archival action) nor SoftwareApplication (version information) seems to be suitable.
author: Lars-Dominik Braun <lars@6xq.net> 2018-12-08 09:05:12 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2018-12-08 09:05:45 +0100
commit: 6ccd72ab96cfba36c217a77641b3b8a91906c512 (patch)
tree: 0a96f837e8ef4776af1b87aa7dd43edc6c55be3a /crocoite/tools.py
parent: aec7a8c583c8228e9538c923d39ef80862bafdde (diff)
download: crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.gz
crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.bz2
crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.zip
1 files changed, 23 insertions, 1 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 843270e..e2dc6a7 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -22,15 +22,37 @@
 Misc tools
 """
 
-import shutil, sys, os, logging, argparse
+import shutil, sys, os, logging, argparse, json
+from io import BytesIO
 from warcio.archiveiterator import ArchiveIterator
 from warcio.warcwriter import WARCWriter
+from .util import packageUrl, getSoftwareInfo
 
 def mergeWarc (files, output):
     unique = 0
     revisit = 0
     payloadMap = {}
     writer = WARCWriter (output, gzip=True)
+
+    # Add an additional warcinfo record, describing the transformations. This
+    # is not ideal, since
+    #   “A ‘warcinfo’ record describes the records that
+    #   follow it […] until next ‘warcinfo’”
+    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+    # A warcinfo record is expected at the beginning of every file. But it
+    # might have been written by different software, so we don’t want to
+    # strip/replace that information, but supplement it.
+    warcinfo = {
+            'software': getSoftwareInfo (),
+            'tool': 'crocoite-merge', # not the name of the cli tool
+            'parameters': {'inputs': files},
+            }
+    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+    record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+            payload=payload,
+            warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+    writer.write_record (record)
+
     for l in files:
         with open (l, 'rb') as fd:
             for record in ArchiveIterator (fd):
author	Lars-Dominik Braun <lars@6xq.net>	2018-12-08 09:05:12 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2018-12-08 09:05:45 +0100
commit	6ccd72ab96cfba36c217a77641b3b8a91906c512 (patch)
tree	0a96f837e8ef4776af1b87aa7dd43edc6c55be3a /crocoite/tools.py
parent	aec7a8c583c8228e9538c923d39ef80862bafdde (diff)
download	crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.gz crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.bz2 crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.zip