From 16f12498a18119fcee6d3278aea9d7a7cfdbd7c9 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 17 Nov 2018 10:35:49 +0100 Subject: tools: Add original HTTP header to revisit record The payloads may be the same, but the headers are usually not. --- crocoite/test_tools.py | 19 +++++++++---------- crocoite/tools.py | 5 ++++- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'crocoite') diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py index 09886b2..5e980d0 100644 --- a/crocoite/test_tools.py +++ b/crocoite/test_tools.py @@ -34,14 +34,16 @@ def writer(): def recordsEqual(golden, underTest): for a, b in zip (golden, underTest): - # record ids are not predictable, so we cannot compare them - a.rec_headers.remove_header('WARC-Record-Id') - a.rec_headers.remove_header('WARC-Block-Digest') - b.rec_headers.remove_header('WARC-Record-Id') - b.rec_headers.remove_header('WARC-Block-Digest') + # record ids are not predictable, so we cannot compare them. Ditto for + # dates. Content-* seems to be added when writing to file. 
+ for x in {'WARC-Record-Id', 'WARC-Block-Digest', 'WARC-Date', + 'Content-Length', 'Content-Type'}: + a.rec_headers.remove_header(x) + b.rec_headers.remove_header(x) aheader = sorted(a.rec_headers.headers, key=itemgetter(0)) bheader = sorted(b.rec_headers.headers, key=itemgetter(0)) assert aheader == bheader + assert a.http_headers == b.http_headers def test_unmodified(writer): """ @@ -101,16 +103,13 @@ def makeRevisit(writer, ref, dup): """ Make revisit record for reference """ dupHeaders = dup.rec_headers refHeaders = ref.rec_headers - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') record = writer.create_revisit_record (dupHeaders.get_header('WARC-Target-URI'), digest=refHeaders.get_header('WARC-Payload-Digest'), refers_to_uri=refHeaders.get_header('WARC-Target-URI'), - refers_to_date=refHeaders.get_header('WARC-Date')) + refers_to_date=refHeaders.get_header('WARC-Date'), + http_headers=dup.http_headers) record.rec_headers.add_header ('WARC-Refers-To', refHeaders.get_header('WARC-Record-ID')) record.rec_headers.add_header ('WARC-Truncated', 'length') - record.rec_headers.add_header ('Content-Length', '0') - # XXX: added by warcio, but this seems wrong - record.rec_headers.add_header ('Content-Type', 'application/http; msgtype=response') return record def test_resp_revisit_same_url(writer): diff --git a/crocoite/tools.py b/crocoite/tools.py index 8541ca2..da32f85 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -45,9 +45,12 @@ def mergeWarc (files, output): unique += 1 else: logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id'])) + # Payload may be identical, but HTTP headers are + # (probably) not. Include them. 
record = writer.create_revisit_record ( headers.get_header('WARC-Target-URI'), digest=csum, - refers_to_uri=dup['uri'], refers_to_date=dup['date']) + refers_to_uri=dup['uri'], refers_to_date=dup['date'], + http_headers=record.http_headers) record.rec_headers.add_header ('WARC-Truncated', 'length') record.rec_headers.add_header ('WARC-Refers-To', dup['id']) revisit += 1 -- cgit v1.2.3