summaryrefslogtreecommitdiff
path: root/crocoite/tools.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r--crocoite/tools.py24
1 files changed, 23 insertions, 1 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 843270e..e2dc6a7 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -22,15 +22,37 @@
Misc tools
"""
-import shutil, sys, os, logging, argparse
+import shutil, sys, os, logging, argparse, json
+from io import BytesIO
from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
+from .util import packageUrl, getSoftwareInfo
def mergeWarc (files, output):
unique = 0
revisit = 0
payloadMap = {}
writer = WARCWriter (output, gzip=True)
+
+ # Add an additional warcinfo record, describing the transformations. This
+ # is not ideal, since
+ # “A ‘warcinfo’ record describes the records that
+ # follow it […] until next ‘warcinfo’”
+ # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+ # A warcinfo record is expected at the beginning of every file. But it
+ # might have written by a different software, so we don’t want to
+ # strip/replace that information, but supplement it.
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-merge', # not the name of the cli tool
+ 'parameters': {'inputs': files},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ writer.write_record (record)
+
for l in files:
with open (l, 'rb') as fd:
for record in ArchiveIterator (fd):