diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-12-08 09:05:12 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-12-08 09:05:45 +0100 |
commit | 6ccd72ab96cfba36c217a77641b3b8a91906c512 (patch) | |
tree | 0a96f837e8ef4776af1b87aa7dd43edc6c55be3a /crocoite | |
parent | aec7a8c583c8228e9538c923d39ef80862bafdde (diff) | |
download | crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.gz crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.bz2 crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.zip |
tools: Add version info to merged WARCs
In preparation for #9.
I was hoping to reuse one of schema.org’s microdata schemas, but
neither Action (archival action) nor SoftwareApplication (version
information) seems to be suitable.
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/controller.py | 15 | ||||
-rw-r--r-- | crocoite/test_tools.py | 18 | ||||
-rw-r--r-- | crocoite/tools.py | 24 | ||||
-rw-r--r-- | crocoite/util.py | 14 |
4 files changed, 54 insertions, 17 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py index 22e7e28..1ee1943 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -22,7 +22,7 @@ Controller classes, handling actions required for archival """ -import time, platform +import time import tempfile, asyncio, json, os from itertools import islice from datetime import datetime @@ -31,7 +31,7 @@ from operator import attrgetter from . import behavior as cbehavior from .browser import SiteLoader, Item -from .util import getFormattedViewportMetrics, getRequirements, removeFragment +from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment from .behavior import ExtractLinksEvent class ControllerSettings: @@ -143,20 +143,13 @@ class SinglePageController: version = await l.tab.Browser.getVersion () payload = { - 'software': { - 'platform': platform.platform (), - 'python': { - 'implementation': platform.python_implementation(), - 'version': platform.python_version (), - 'build': platform.python_build () - }, - 'self': getRequirements (__package__) - }, + 'software': getSoftwareInfo (), 'browser': { 'product': version['product'], 'useragent': version['userAgent'], 'viewport': await getFormattedViewportMetrics (l.tab), }, + 'tool': 'crocoite-single', # not the name of the cli utility 'parameters': { 'url': self.url, 'idleTimeout': self.settings.idleTimeout, diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py index 5e980d0..947d020 100644 --- a/crocoite/test_tools.py +++ b/crocoite/test_tools.py @@ -27,6 +27,7 @@ from warcio.warcwriter import WARCWriter from warcio.statusandheaders import StatusAndHeaders from .tools import mergeWarc +from .util import packageUrl @pytest.fixture def writer(): @@ -45,12 +46,21 @@ def recordsEqual(golden, underTest): assert aheader == bheader assert a.http_headers == b.http_headers +def makeGolden(writer, records): + # additional warcinfo is written. Content does not matter. 
+ record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + payload=b'', + warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + records.insert (0, record) + return records + def test_unmodified(writer): """ Single request/response pair, no revisits """ records = [] + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) warcHeaders = {} record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), @@ -69,7 +79,7 @@ def test_unmodified(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) def test_different_payload(writer): """ @@ -97,7 +107,7 @@ def test_different_payload(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) def makeRevisit(writer, ref, dup): """ Make revisit record for reference """ @@ -141,7 +151,7 @@ def test_resp_revisit_same_url(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) def test_resp_revisit_other_url(writer): """ @@ -183,5 +193,5 @@ def test_resp_revisit_other_url(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) diff --git a/crocoite/tools.py b/crocoite/tools.py index 843270e..e2dc6a7 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,15 +22,37 @@ Misc tools """ -import shutil, sys, os, logging, argparse +import shutil, sys, os, logging, argparse, json +from io import BytesIO from warcio.archiveiterator import ArchiveIterator from warcio.warcwriter import WARCWriter +from .util import packageUrl, getSoftwareInfo 
def mergeWarc (files, output): unique = 0 revisit = 0 payloadMap = {} writer = WARCWriter (output, gzip=True) + + # Add an additional warcinfo record, describing the transformations. This + # is not ideal, since + # “A ‘warcinfo’ record describes the records that + # follow it […] until next ‘warcinfo’” + # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo + # A warcinfo record is expected at the beginning of every file. But it + # might have written by a different software, so we don’t want to + # strip/replace that information, but supplement it. + warcinfo = { + 'software': getSoftwareInfo (), + 'tool': 'crocoite-merge', # not the name of the cli tool + 'parameters': {'inputs': files}, + } + payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) + record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + payload=payload, + warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + writer.write_record (record) + for l in files: with open (l, 'rb') as fd: for record in ArchiveIterator (fd): diff --git a/crocoite/util.py b/crocoite/util.py index da6d54a..bd26909 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -22,7 +22,7 @@ Random utility functions """ -import random, sys +import random, sys, platform import hashlib, pkg_resources from urllib.parse import urlsplit, urlunsplit @@ -43,6 +43,18 @@ def removeFragment (u): s = urlsplit (u) return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) +def getSoftwareInfo (): + """ Get software info for inclusion into warcinfo """ + return { + 'platform': platform.platform (), + 'python': { + 'implementation': platform.python_implementation(), + 'version': platform.python_version (), + 'build': platform.python_build () + }, + 'self': getRequirements (__package__) + } + def getRequirements (dist): """ Get dependencies of a package. |