diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-11-10 11:21:11 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-11-10 11:21:11 +0100 |
commit | 1d9c607207b49d62f5f853312bb808da47699398 (patch) | |
tree | e2fcf6c0934856b17f3389ad1892268747dd7b13 /crocoite/tools.py | |
parent | f30ab5515a2775d35e66da9d5dfc52a29a68bf9a (diff) | |
download | crocoite-1d9c607207b49d62f5f853312bb808da47699398.tar.gz crocoite-1d9c607207b49d62f5f853312bb808da47699398.tar.bz2 crocoite-1d9c607207b49d62f5f853312bb808da47699398.zip |
tools: Fix WARC merging
WARC-Target-URI was taken from the previous record, even if the URI was
different. This essentially removes the revisited URL from the archive.
Also add a few tests. And boy, warcio is a mess.
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r-- | crocoite/tools.py | 35 |
1 file changed, 17 insertions, 18 deletions
```diff
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 3aeaaad..8541ca2 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -26,26 +26,12 @@ import shutil, sys, re, os, logging, argparse
 from warcio.archiveiterator import ArchiveIterator
 from warcio.warcwriter import WARCWriter
 
-def mergeWarc ():
-    """
-    Merge multiple WARC files into a single file, writing revisit records for
-    items which occur multiple times
-    """
-
-    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
-    parser.add_argument('--verbose', '-v', action='store_true')
-    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
-
-    args = parser.parse_args()
-    loglevel = logging.DEBUG if args.verbose else logging.INFO
-    logging.basicConfig (level=loglevel)
-
+def mergeWarc (files, output):
     unique = 0
     revisit = 0
     payloadMap = {}
-    writer = WARCWriter (args.output, gzip=True)
-    for l in sys.stdin:
-        l = l.strip ()
+    writer = WARCWriter (output, gzip=True)
+    for l in files:
         with open (l, 'rb') as fd:
             for record in ArchiveIterator (fd):
                 if record.rec_type in {'resource', 'response'}:
@@ -59,7 +45,9 @@ def mergeWarc ():
                         unique += 1
                     else:
                         logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
-                        record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date'])
+                        record = writer.create_revisit_record (
+                                headers.get_header('WARC-Target-URI'), digest=csum,
+                                refers_to_uri=dup['uri'], refers_to_date=dup['date'])
                         record.rec_headers.add_header ('WARC-Truncated', 'length')
                         record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
                         revisit += 1
@@ -68,6 +56,17 @@ def mergeWarc ():
                 writer.write_record (record)
     logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit))
 
+def mergeWarcCli():
+    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
+    parser.add_argument('--verbose', '-v', action='store_true')
+    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
+
+    args = parser.parse_args()
+    loglevel = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig (level=loglevel)
+
+    mergeWarc([l.strip() for l in sys.stdin], args.output)
+
 def extractScreenshot ():
     """
     Extract page screenshots from a WARC generated by crocoite into files
```