summaryrefslogtreecommitdiff
path: root/crocoite
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-12-08 09:05:12 +0100
committerLars-Dominik Braun <lars@6xq.net>2018-12-08 09:05:45 +0100
commit6ccd72ab96cfba36c217a77641b3b8a91906c512 (patch)
tree0a96f837e8ef4776af1b87aa7dd43edc6c55be3a /crocoite
parentaec7a8c583c8228e9538c923d39ef80862bafdde (diff)
downloadcrocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.gz
crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.tar.bz2
crocoite-6ccd72ab96cfba36c217a77641b3b8a91906c512.zip
tools: Add version info to merged WARCs
In preparation for #9. I was hoping to reuse one of schema.org’s microdata schemas, but neither Action (archival action) nor SoftwareApplication (version information) seems to be suitable.
Diffstat (limited to 'crocoite')
-rw-r--r--crocoite/controller.py15
-rw-r--r--crocoite/test_tools.py18
-rw-r--r--crocoite/tools.py24
-rw-r--r--crocoite/util.py14
4 files changed, 54 insertions, 17 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 22e7e28..1ee1943 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -22,7 +22,7 @@
Controller classes, handling actions required for archival
"""
-import time, platform
+import time
import tempfile, asyncio, json, os
from itertools import islice
from datetime import datetime
@@ -31,7 +31,7 @@ from operator import attrgetter
from . import behavior as cbehavior
from .browser import SiteLoader, Item
-from .util import getFormattedViewportMetrics, getRequirements, removeFragment
+from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment
from .behavior import ExtractLinksEvent
class ControllerSettings:
@@ -143,20 +143,13 @@ class SinglePageController:
version = await l.tab.Browser.getVersion ()
payload = {
- 'software': {
- 'platform': platform.platform (),
- 'python': {
- 'implementation': platform.python_implementation(),
- 'version': platform.python_version (),
- 'build': platform.python_build ()
- },
- 'self': getRequirements (__package__)
- },
+ 'software': getSoftwareInfo (),
'browser': {
'product': version['product'],
'useragent': version['userAgent'],
'viewport': await getFormattedViewportMetrics (l.tab),
},
+ 'tool': 'crocoite-single', # not the name of the cli utility
'parameters': {
'url': self.url,
'idleTimeout': self.settings.idleTimeout,
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py
index 5e980d0..947d020 100644
--- a/crocoite/test_tools.py
+++ b/crocoite/test_tools.py
@@ -27,6 +27,7 @@ from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
from .tools import mergeWarc
+from .util import packageUrl
@pytest.fixture
def writer():
@@ -45,12 +46,21 @@ def recordsEqual(golden, underTest):
assert aheader == bheader
assert a.http_headers == b.http_headers
+def makeGolden(writer, records):
+ # additional warcinfo is written. Content does not matter.
+ record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ payload=b'',
+ warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ records.insert (0, record)
+ return records
+
def test_unmodified(writer):
"""
Single request/response pair, no revisits
"""
records = []
+
httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
warcHeaders = {}
record = writer.create_warc_record ('http://example.com/', 'request', payload=BytesIO(b'foobar'),
@@ -69,7 +79,7 @@ def test_unmodified(writer):
mergeWarc ([writer.out.name], output)
output.seek(0)
- recordsEqual (records, ArchiveIterator (output))
+ recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
def test_different_payload(writer):
"""
@@ -97,7 +107,7 @@ def test_different_payload(writer):
mergeWarc ([writer.out.name], output)
output.seek(0)
- recordsEqual (records, ArchiveIterator (output))
+ recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
def makeRevisit(writer, ref, dup):
""" Make revisit record for reference """
@@ -141,7 +151,7 @@ def test_resp_revisit_same_url(writer):
mergeWarc ([writer.out.name], output)
output.seek(0)
- recordsEqual (records, ArchiveIterator (output))
+ recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
def test_resp_revisit_other_url(writer):
"""
@@ -183,5 +193,5 @@ def test_resp_revisit_other_url(writer):
mergeWarc ([writer.out.name], output)
output.seek(0)
- recordsEqual (records, ArchiveIterator (output))
+ recordsEqual (makeGolden (writer, records), ArchiveIterator (output))
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 843270e..e2dc6a7 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -22,15 +22,37 @@
Misc tools
"""
-import shutil, sys, os, logging, argparse
+import shutil, sys, os, logging, argparse, json
+from io import BytesIO
from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
+from .util import packageUrl, getSoftwareInfo
def mergeWarc (files, output):
unique = 0
revisit = 0
payloadMap = {}
writer = WARCWriter (output, gzip=True)
+
+ # Add an additional warcinfo record, describing the transformations. This
+ # is not ideal, since
+ # “A ‘warcinfo’ record describes the records that
+ # follow it […] until next ‘warcinfo’”
+ # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+ # A warcinfo record is expected at the beginning of every file. But it
+ # might have been written by different software, so we don’t want to
+ # strip/replace that information, but supplement it.
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-merge', # not the name of the cli tool
+ 'parameters': {'inputs': files},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ writer.write_record (record)
+
for l in files:
with open (l, 'rb') as fd:
for record in ArchiveIterator (fd):
diff --git a/crocoite/util.py b/crocoite/util.py
index da6d54a..bd26909 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,7 +22,7 @@
Random utility functions
"""
-import random, sys
+import random, sys, platform
import hashlib, pkg_resources
from urllib.parse import urlsplit, urlunsplit
@@ -43,6 +43,18 @@ def removeFragment (u):
s = urlsplit (u)
return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+def getSoftwareInfo ():
+ """ Get software info for inclusion into warcinfo """
+ return {
+ 'platform': platform.platform (),
+ 'python': {
+ 'implementation': platform.python_implementation(),
+ 'version': platform.python_version (),
+ 'build': platform.python_build ()
+ },
+ 'self': getRequirements (__package__)
+ }
+
def getRequirements (dist):
""" Get dependencies of a package.