From 6ccd72ab96cfba36c217a77641b3b8a91906c512 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 8 Dec 2018 09:05:12 +0100 Subject: tools: Add version info to merged WARCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for #9. I was hoping to reuse one of schema.org’s microdata schemas, but neither Action (archival action) nor SoftwareApplication (version information) seems to be suitable. --- crocoite/tools.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'crocoite/tools.py') diff --git a/crocoite/tools.py b/crocoite/tools.py index 843270e..e2dc6a7 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,15 +22,37 @@ Misc tools """ -import shutil, sys, os, logging, argparse +import shutil, sys, os, logging, argparse, json +from io import BytesIO from warcio.archiveiterator import ArchiveIterator from warcio.warcwriter import WARCWriter +from .util import packageUrl, getSoftwareInfo def mergeWarc (files, output): unique = 0 revisit = 0 payloadMap = {} writer = WARCWriter (output, gzip=True) + + # Add an additional warcinfo record, describing the transformations. This + # is not ideal, since + # “A ‘warcinfo’ record describes the records that + # follow it […] until next ‘warcinfo’” + # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo + # A warcinfo record is expected at the beginning of every file. But it + # might have been written by different software, so we don’t want to + # strip/replace that information, but supplement it. 
+ warcinfo = { + 'software': getSoftwareInfo (), + 'tool': 'crocoite-merge', # not the name of the cli tool + 'parameters': {'inputs': files}, + } + payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) + record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + payload=payload, + warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + writer.write_record (record) + for l in files: with open (l, 'rb') as fd: for record in ArchiveIterator (fd): -- cgit v1.2.3