From 6ccd72ab96cfba36c217a77641b3b8a91906c512 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 8 Dec 2018 09:05:12 +0100 Subject: tools: Add version info to merged WARCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for #9. I was hoping to reuse one of schema.org’s microdata schemas, but neither Action (archival action) nor SoftwareApplication (version information) seems to be suitable. --- crocoite/controller.py | 15 ++++----------- crocoite/test_tools.py | 18 ++++++++++++++---- crocoite/tools.py | 24 +++++++++++++++++++++++- crocoite/util.py | 14 +++++++++++++- 4 files changed, 54 insertions(+), 17 deletions(-) (limited to 'crocoite') diff --git a/crocoite/controller.py b/crocoite/controller.py index 22e7e28..1ee1943 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -22,7 +22,7 @@ Controller classes, handling actions required for archival """ -import time, platform +import time import tempfile, asyncio, json, os from itertools import islice from datetime import datetime @@ -31,7 +31,7 @@ from operator import attrgetter from . 
import behavior as cbehavior from .browser import SiteLoader, Item -from .util import getFormattedViewportMetrics, getRequirements, removeFragment +from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment from .behavior import ExtractLinksEvent class ControllerSettings: @@ -143,20 +143,13 @@ class SinglePageController: version = await l.tab.Browser.getVersion () payload = { - 'software': { - 'platform': platform.platform (), - 'python': { - 'implementation': platform.python_implementation(), - 'version': platform.python_version (), - 'build': platform.python_build () - }, - 'self': getRequirements (__package__) - }, + 'software': getSoftwareInfo (), 'browser': { 'product': version['product'], 'useragent': version['userAgent'], 'viewport': await getFormattedViewportMetrics (l.tab), }, + 'tool': 'crocoite-single', # not the name of the cli utility 'parameters': { 'url': self.url, 'idleTimeout': self.settings.idleTimeout, diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py index 5e980d0..947d020 100644 --- a/crocoite/test_tools.py +++ b/crocoite/test_tools.py @@ -27,6 +27,7 @@ from warcio.warcwriter import WARCWriter from warcio.statusandheaders import StatusAndHeaders from .tools import mergeWarc +from .util import packageUrl @pytest.fixture def writer(): @@ -45,12 +46,21 @@ def recordsEqual(golden, underTest): assert aheader == bheader assert a.http_headers == b.http_headers +def makeGolden(writer, records): + # additional warcinfo is written. Content does not matter. 
+ record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + payload=b'', + warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + records.insert (0, record) + return records + def test_unmodified(writer): """ Single request/response pair, no revisits """ records = [] + httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True) warcHeaders = {} record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'), @@ -69,7 +79,7 @@ def test_unmodified(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) def test_different_payload(writer): """ @@ -97,7 +107,7 @@ def test_different_payload(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) def makeRevisit(writer, ref, dup): """ Make revisit record for reference """ @@ -141,7 +151,7 @@ def test_resp_revisit_same_url(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) def test_resp_revisit_other_url(writer): """ @@ -183,5 +193,5 @@ def test_resp_revisit_other_url(writer): mergeWarc ([writer.out.name], output) output.seek(0) - recordsEqual (records, ArchiveIterator (output)) + recordsEqual (makeGolden (writer, records), ArchiveIterator (output)) diff --git a/crocoite/tools.py b/crocoite/tools.py index 843270e..e2dc6a7 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,15 +22,37 @@ Misc tools """ -import shutil, sys, os, logging, argparse +import shutil, sys, os, logging, argparse, json +from io import BytesIO from warcio.archiveiterator import ArchiveIterator from warcio.warcwriter import WARCWriter +from .util import packageUrl, getSoftwareInfo 
def mergeWarc (files, output): unique = 0 revisit = 0 payloadMap = {} writer = WARCWriter (output, gzip=True) + + # Add an additional warcinfo record, describing the transformations. This + # is not ideal, since + # “A ‘warcinfo’ record describes the records that + # follow it […] until next ‘warcinfo’” + # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo + # A warcinfo record is expected at the beginning of every file. But it + # might have been written by different software, so we don’t want to + # strip/replace that information, but supplement it. + warcinfo = { + 'software': getSoftwareInfo (), + 'tool': 'crocoite-merge', # not the name of the cli tool + 'parameters': {'inputs': files}, + } + payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) + record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + payload=payload, + warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + writer.write_record (record) + for l in files: with open (l, 'rb') as fd: for record in ArchiveIterator (fd): diff --git a/crocoite/util.py b/crocoite/util.py index da6d54a..bd26909 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -22,7 +22,7 @@ Random utility functions """ -import random, sys +import random, sys, platform import hashlib, pkg_resources from urllib.parse import urlsplit, urlunsplit @@ -43,6 +43,18 @@ def removeFragment (u): s = urlsplit (u) return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) +def getSoftwareInfo (): + """ Get software info for inclusion into warcinfo """ + return { + 'platform': platform.platform (), + 'python': { + 'implementation': platform.python_implementation(), + 'version': platform.python_version (), + 'build': platform.python_build () + }, + 'self': getRequirements (__package__) + } + def getRequirements (dist): """ Get dependencies of a package. -- cgit v1.2.3