diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-11-17 10:35:49 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-11-17 10:37:00 +0100 |
commit | 16f12498a18119fcee6d3278aea9d7a7cfdbd7c9 (patch) | |
tree | 75030819ca40f76c0c7ffd088ae56231c1e9683d /crocoite/tools.py | |
parent | 25b5cc982dc1e5c12a3c2ec528e3b4cf518fd780 (diff) | |
download | crocoite-16f12498a18119fcee6d3278aea9d7a7cfdbd7c9.tar.gz crocoite-16f12498a18119fcee6d3278aea9d7a7cfdbd7c9.tar.bz2 crocoite-16f12498a18119fcee6d3278aea9d7a7cfdbd7c9.zip |
tools: Add original HTTP header to revisit record
The payloads may be the same, but the headers are usually not.
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r-- | crocoite/tools.py | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 8541ca2..da32f85 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -45,9 +45,12 @@ def mergeWarc (files, output):
         unique += 1
     else:
         logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
+        # Payload may be identical, but HTTP headers are
+        # (probably) not. Include them.
         record = writer.create_revisit_record (
                 headers.get_header('WARC-Target-URI'), digest=csum,
-                refers_to_uri=dup['uri'], refers_to_date=dup['date'])
+                refers_to_uri=dup['uri'], refers_to_date=dup['date'],
+                http_headers=record.http_headers)
         record.rec_headers.add_header ('WARC-Truncated', 'length')
         record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
         revisit += 1
```