diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-07-29 09:19:06 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-08-04 14:11:49 +0200 |
commit | b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3 (patch) | |
tree | 5f41c03124adc3f3a0010b890977dbcb2271002b /crocoite/warc.py | |
parent | 3deded13df1339ef59a760c188804adffd9ed902 (diff) | |
download | crocoite-b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3.tar.gz crocoite-b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3.tar.bz2 crocoite-b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3.zip |
Add package information to warcinfo
Change warcinfo record format to JSON (this is permitted by the specs)
and add Python version, dependencies and their versions as well as file
hashes.
This should give us enough information to figure out the exact
environment used to create the WARC.
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- | crocoite/warc.py | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/crocoite/warc.py b/crocoite/warc.py
index a4a70ac..324d161 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -212,8 +212,12 @@ class WarcHandler (EventHandler):
         writer.write_record (record)
 
     def _writeControllerStart (self, item):
+        payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+
         writer = self.writer
-        warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload)
+        warcinfo = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+                warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
+                payload=payload)
         writer.write_record (warcinfo)
 
     def _flushLogEntries (self):
```