summary refs log tree commit diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2018-11-17 10:35:49 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2018-11-17 10:37:00 +0100
commit 16f12498a18119fcee6d3278aea9d7a7cfdbd7c9 (patch)
tree 75030819ca40f76c0c7ffd088ae56231c1e9683d
parent 25b5cc982dc1e5c12a3c2ec528e3b4cf518fd780 (diff)
download crocoite-16f12498a18119fcee6d3278aea9d7a7cfdbd7c9.tar.gz
crocoite-16f12498a18119fcee6d3278aea9d7a7cfdbd7c9.tar.bz2
crocoite-16f12498a18119fcee6d3278aea9d7a7cfdbd7c9.zip
tools: Add original HTTP header to revisit record
The payloads may be the same, but the headers are usually not.
-rw-r--r-- crocoite/test_tools.py 19
-rw-r--r-- crocoite/tools.py 5
2 files changed, 13 insertions, 11 deletions
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py
index 09886b2..5e980d0 100644
--- a/crocoite/test_tools.py
+++ b/crocoite/test_tools.py
@@ -34,14 +34,16 @@ def writer():
def recordsEqual(golden, underTest):
for a, b in zip (golden, underTest):
- # record ids are not predictable, so we cannot compare them
- a.rec_headers.remove_header('WARC-Record-Id')
- a.rec_headers.remove_header('WARC-Block-Digest')
- b.rec_headers.remove_header('WARC-Record-Id')
- b.rec_headers.remove_header('WARC-Block-Digest')
+ # record ids are not predictable, so we cannot compare them. Ditto for
+ # dates. Content-* seems to be added when writing to file.
+ for x in {'WARC-Record-Id', 'WARC-Block-Digest', 'WARC-Date',
+ 'Content-Length', 'Content-Type'}:
+ a.rec_headers.remove_header(x)
+ b.rec_headers.remove_header(x)
aheader = sorted(a.rec_headers.headers, key=itemgetter(0))
bheader = sorted(b.rec_headers.headers, key=itemgetter(0))
assert aheader == bheader
+ assert a.http_headers == b.http_headers
def test_unmodified(writer):
"""
@@ -101,16 +103,13 @@ def makeRevisit(writer, ref, dup):
""" Make revisit record for reference """
dupHeaders = dup.rec_headers
refHeaders = ref.rec_headers
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
record = writer.create_revisit_record (dupHeaders.get_header('WARC-Target-URI'),
digest=refHeaders.get_header('WARC-Payload-Digest'),
refers_to_uri=refHeaders.get_header('WARC-Target-URI'),
- refers_to_date=refHeaders.get_header('WARC-Date'))
+ refers_to_date=refHeaders.get_header('WARC-Date'),
+ http_headers=dup.http_headers)
record.rec_headers.add_header ('WARC-Refers-To', refHeaders.get_header('WARC-Record-ID'))
record.rec_headers.add_header ('WARC-Truncated', 'length')
- record.rec_headers.add_header ('Content-Length', '0')
- # XXX: added by warcio, but this seems wrong
- record.rec_headers.add_header ('Content-Type', 'application/http; msgtype=response')
return record
def test_resp_revisit_same_url(writer):
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 8541ca2..da32f85 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -45,9 +45,12 @@ def mergeWarc (files, output):
unique += 1
else:
logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
+ # Payload may be identical, but HTTP headers are
+ # (probably) not. Include them.
record = writer.create_revisit_record (
headers.get_header('WARC-Target-URI'), digest=csum,
- refers_to_uri=dup['uri'], refers_to_date=dup['date'])
+ refers_to_uri=dup['uri'], refers_to_date=dup['date'],
+ http_headers=record.http_headers)
record.rec_headers.add_header ('WARC-Truncated', 'length')
record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
revisit += 1