summary refs log tree commit diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2018-07-29 09:19:06 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2018-08-04 14:11:49 +0200
commit b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3 (patch)
tree 5f41c03124adc3f3a0010b890977dbcb2271002b
parent 3deded13df1339ef59a760c188804adffd9ed902 (diff)
download crocoite-b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3.tar.gz
crocoite-b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3.tar.bz2
crocoite-b25c4cccafbd9572fe3e3c9c83c48c19b714a6c3.zip
Add package information to warcinfo
Change warcinfo record format to JSON (this is permitted by the specs) and add Python version, dependencies and their versions as well as file hashes. This should give us enough information to figure out the exact environment used to create the WARC.
-rw-r--r-- crocoite/controller.py 22
-rw-r--r-- crocoite/util.py 45
-rw-r--r-- crocoite/warc.py 6
3 files changed, 65 insertions, 8 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index cbf0037..178d11c 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -70,11 +70,11 @@ class StatsHandler (EventHandler):
elif isinstance (item, BrowserCrashed):
self.stats['crashed'] += 1
-import time
+import time, platform
from . import behavior as cbehavior
from .browser import ChromeService, SiteLoader, Item
-from .util import getFormattedViewportMetrics, removeFragment
+from .util import getFormattedViewportMetrics, removeFragment, getRequirements
class ControllerStart:
__slots__ = ('payload')
@@ -163,10 +163,20 @@ class SinglePageController:
version = l.tab.Browser.getVersion ()
payload = {
- 'software': __package__,
- 'browser': version['product'],
- 'useragent': version['userAgent'],
- 'viewport': getFormattedViewportMetrics (l.tab),
+ 'software': {
+ 'platform': platform.platform (),
+ 'python': {
+ 'implementation': platform.python_implementation(),
+ 'version': platform.python_version (),
+ 'build': platform.python_build ()
+ },
+ 'self': getRequirements (__package__)
+ },
+ 'browser': {
+ 'product': version['product'],
+ 'useragent': version['userAgent'],
+ 'viewport': getFormattedViewportMetrics (l.tab),
+ },
}
self.processItem (ControllerStart (payload))
diff --git a/crocoite/util.py b/crocoite/util.py
index fe43f01..3a62533 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,7 +22,8 @@
Random utility functions
"""
-import random
+import random, sys
+import hashlib, os, pkg_resources
from urllib.parse import urlsplit, urlunsplit
def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
@@ -47,3 +48,45 @@ def removeFragment (u):
s = urlsplit (u)
return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+def getRequirements (dist):
+ """ Get dependencies of a package.
+
+ Figure out packages’ dependencies based on setup/distutils, then look at
+ modules loaded and compute hashes of each loaded dependency.
+
+ This does not and cannot protect against malicious people. Its only
+ purpose is to recreate this exact environment.
+ """
+
+ pending = {dist}
+ have = set ()
+ packages = []
+ while pending:
+ d = pkg_resources.get_distribution (pending.pop ())
+
+ modules = list (filter (lambda x: x, d.get_metadata ('top_level.txt').split ('\n')))
+ modhashes = {}
+ # hash loaded modules
+ for m in sys.modules.values ():
+ f = getattr (m, '__file__', None)
+ pkg = getattr (m, '__package__', None)
+ # is loaded?
+ if pkg in modules:
+ if f:
+ with open (f, 'rb') as fd:
+ contents = fd.read ()
+ h = hashlib.new ('sha512')
+ h.update (contents)
+ modhashes[m.__name__] = {'sha512': h.hexdigest (), 'len': len (contents)}
+ else:
+ modhashes[m.__name__] = {}
+
+ # only if one of the packages’ modules is actually loaded
+ if modhashes:
+ packages.append ({'projectName': d.project_name, 'modules': modhashes, 'version': d.version})
+
+ have.add (dist)
+ pending.update (d.requires ())
+ pending.difference_update (have)
+ return packages
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index a4a70ac..324d161 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -212,8 +212,12 @@ class WarcHandler (EventHandler):
writer.write_record (record)
def _writeControllerStart (self, item):
+ payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+
writer = self.writer
- warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload)
+ warcinfo = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
+ payload=payload)
writer.write_record (warcinfo)
def _flushLogEntries (self):