From 1d9c607207b49d62f5f853312bb808da47699398 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 10 Nov 2018 11:21:11 +0100 Subject: tools: Fix WARC merging WARC-Target-URI was taken from the previous record, even if the URI was different. This essentially removes the revisited URL from the archive. Also add a few tests. And boy, warcio is a mess. --- crocoite/test_tools.py | 188 +++++++++++++++++++++++++++++++++++++++++++++++++ crocoite/tools.py | 35 +++++---- 2 files changed, 205 insertions(+), 18 deletions(-) create mode 100644 crocoite/test_tools.py (limited to 'crocoite') diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py new file mode 100644 index 0000000..09886b2 --- /dev/null +++ b/crocoite/test_tools.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +from tempfile import NamedTemporaryFile +from operator import itemgetter +from io import BytesIO +import pytest +from warcio.archiveiterator import ArchiveIterator +from warcio.warcwriter import WARCWriter +from warcio.statusandheaders import StatusAndHeaders + +from .tools import mergeWarc + +@pytest.fixture +def writer(): + return WARCWriter (NamedTemporaryFile(), gzip=True) + +def recordsEqual(golden, underTest): + for a, b in zip (golden, underTest): + # record ids are not predictable, so we cannot compare them + a.rec_headers.remove_header('WARC-Record-Id') + a.rec_headers.remove_header('WARC-Block-Digest') + b.rec_headers.remove_header('WARC-Record-Id') + b.rec_headers.remove_header('WARC-Block-Digest') + aheader = sorted(a.rec_headers.headers, key=itemgetter(0)) + bheader = sorted(b.rec_headers.headers, key=itemgetter(0)) + assert aheader == bheader + +def test_unmodified(writer): + """ + Single request/response pair, no revisits + """ + + records = [] + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (records, ArchiveIterator (output)) + +def test_different_payload(writer): + """ + Duplicate URL, but different payload + """ + + records = [] + for i in range (2): + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', + payload=BytesIO('data{}'.format(i).encode ('utf8')), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (records, ArchiveIterator (output)) + +def makeRevisit(writer, ref, dup): + """ Make revisit record for reference """ + dupHeaders = dup.rec_headers + refHeaders = ref.rec_headers + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_revisit_record (dupHeaders.get_header('WARC-Target-URI'), + digest=refHeaders.get_header('WARC-Payload-Digest'), + refers_to_uri=refHeaders.get_header('WARC-Target-URI'), + refers_to_date=refHeaders.get_header('WARC-Date')) + record.rec_headers.add_header ('WARC-Refers-To', refHeaders.get_header('WARC-Record-ID')) + record.rec_headers.add_header ('WARC-Truncated', 'length') + record.rec_headers.add_header ('Content-Length', '0') + # XXX: added by warcio, but this seems wrong + record.rec_headers.add_header ('Content-Type', 'application/http; msgtype=response') + return record + +def test_resp_revisit_same_url(writer): + """ + Duplicate record for the same URL, creates a revisit + """ + + records = [] + for i in range (2): + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + dup = records.pop () + ref = records[1] + records.append (makeRevisit (writer, ref, dup)) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (records, ArchiveIterator (output)) + +def test_resp_revisit_other_url(writer): + """ + Duplicate record for different URL, creates a revisit + """ + + records = [] + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) + warcHeaders = {} + record = writer.create_warc_record ('http://example.com/one', 'request', payload=BytesIO(b'foobar'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record ('http://example.com/one', 'response', payload=BytesIO(b'data'), + warc_headers_dict=warcHeaders, http_headers=httpHeaders) + records.append (record) + + for r in records: + writer.write_record (r) + + dup = records.pop () + ref = records[1] + records.append (makeRevisit (writer, ref, dup)) + + output = NamedTemporaryFile() + mergeWarc ([writer.out.name], output) + + output.seek(0) + recordsEqual (records, ArchiveIterator (output)) + diff --git a/crocoite/tools.py b/crocoite/tools.py index 3aeaaad..8541ca2 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -26,26 +26,12 @@ import shutil, sys, re, os, logging, argparse from warcio.archiveiterator import ArchiveIterator from warcio.warcwriter import WARCWriter -def mergeWarc (): - """ - Merge multiple WARC files into a single file, writing revisit records for - items which occur multiple times - """ - - parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') - parser.add_argument('--verbose', '-v', action='store_true') - parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') - - args = parser.parse_args() - loglevel = logging.DEBUG if args.verbose else logging.INFO - logging.basicConfig (level=loglevel) - +def mergeWarc (files, output): unique = 0 revisit = 0 payloadMap = {} - writer = WARCWriter (args.output, gzip=True) - for l in sys.stdin: - l = l.strip () + writer = WARCWriter (output, gzip=True) + for l in files: with open (l, 'rb') as fd: for record in ArchiveIterator (fd): if record.rec_type in {'resource', 'response'}: @@ -59,7 +45,9 @@ def mergeWarc (): unique += 1 else: logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id'])) - record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date']) + record = writer.create_revisit_record ( + headers.get_header('WARC-Target-URI'), digest=csum, + refers_to_uri=dup['uri'], refers_to_date=dup['date']) record.rec_headers.add_header ('WARC-Truncated', 'length') record.rec_headers.add_header ('WARC-Refers-To', dup['id']) revisit += 1 @@ -68,6 +56,17 @@ def mergeWarc (): writer.write_record (record) logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit)) +def mergeWarcCli(): + parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') + parser.add_argument('--verbose', '-v', action='store_true') + parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') + + args = parser.parse_args() + loglevel = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig (level=loglevel) + + mergeWarc([l.strip() for l in sys.stdin], args.output) + def extractScreenshot (): """ Extract page screenshots from a WARC generated by crocoite into files -- cgit v1.2.3