summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--crocoite/controller.py22
-rw-r--r--crocoite/util.py45
-rw-r--r--crocoite/warc.py6
3 files changed, 65 insertions, 8 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index cbf0037..178d11c 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -70,11 +70,11 @@ class StatsHandler (EventHandler):
elif isinstance (item, BrowserCrashed):
self.stats['crashed'] += 1
-import time
+import time, platform
from . import behavior as cbehavior
from .browser import ChromeService, SiteLoader, Item
-from .util import getFormattedViewportMetrics, removeFragment
+from .util import getFormattedViewportMetrics, removeFragment, getRequirements
class ControllerStart:
__slots__ = ('payload')
@@ -163,10 +163,20 @@ class SinglePageController:
version = l.tab.Browser.getVersion ()
payload = {
- 'software': __package__,
- 'browser': version['product'],
- 'useragent': version['userAgent'],
- 'viewport': getFormattedViewportMetrics (l.tab),
+ 'software': {
+ 'platform': platform.platform (),
+ 'python': {
+ 'implementation': platform.python_implementation(),
+ 'version': platform.python_version (),
+ 'build': platform.python_build ()
+ },
+ 'self': getRequirements (__package__)
+ },
+ 'browser': {
+ 'product': version['product'],
+ 'useragent': version['userAgent'],
+ 'viewport': getFormattedViewportMetrics (l.tab),
+ },
}
self.processItem (ControllerStart (payload))
diff --git a/crocoite/util.py b/crocoite/util.py
index fe43f01..3a62533 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -22,7 +22,8 @@
Random utility functions
"""
-import random
+import random, sys
+import hashlib, os, pkg_resources
from urllib.parse import urlsplit, urlunsplit
def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
@@ -47,3 +48,45 @@ def removeFragment (u):
s = urlsplit (u)
return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+def getRequirements (dist):
+    """ Get dependencies of a package.
+
+    Figure out a package’s dependencies based on its setuptools/distutils
+    metadata, then look at the modules that are currently loaded and compute
+    a hash of each loaded module belonging to one of these distributions.
+
+    This does not and cannot protect against malicious people. Its only
+    purpose is to recreate this exact environment.
+
+    :param dist: Name (or pkg_resources requirement) of the root distribution
+    :return: List of dicts with the keys projectName, version and modules,
+        one for every visited distribution that has at least one module
+        loaded. modules maps a module name to {'sha512': hex digest,
+        'len': size in bytes}, or to an empty dict if the module is not
+        backed by a regular file.
+    :raises pkg_resources.DistributionNotFound: raised by
+        pkg_resources.get_distribution if a distribution is not installed
+    """
+
+    pending = {dist}
+    # keys of distributions already processed; prevents reporting a shared
+    # dependency twice and looping forever on circular requirements
+    have = set ()
+    packages = []
+    # snapshot, the getattr calls below may cause sys.modules to change
+    # while we are iterating
+    loaded = list (sys.modules.values ())
+    while pending:
+        d = pkg_resources.get_distribution (pending.pop ())
+        if d.key in have:
+            continue
+        have.add (d.key)
+
+        # names of the top-level modules/packages this distribution provides;
+        # not every distribution ships a top_level.txt
+        if d.has_metadata ('top_level.txt'):
+            modules = [name for name in d.get_metadata ('top_level.txt').split ('\n') if name]
+        else:
+            modules = []
+
+        modhashes = {}
+        # hash loaded modules
+        for m in loaded:
+            f = getattr (m, '__file__', None)
+            pkg = getattr (m, '__package__', None)
+            # Does the loaded module belong to this distribution?
+            # NOTE(review): only modules whose __package__ equals a top-level
+            # name match; modules of nested subpackages do not.
+            if pkg not in modules:
+                continue
+            # __file__ may be unset or point to something that is not a
+            # plain file; record the module without a hash in that case
+            if f and os.path.isfile (f):
+                with open (f, 'rb') as fd:
+                    contents = fd.read ()
+                modhashes[m.__name__] = {
+                        'sha512': hashlib.sha512 (contents).hexdigest (),
+                        'len': len (contents),
+                        }
+            else:
+                modhashes[m.__name__] = {}
+
+        # only if one of the package’s modules is actually loaded
+        if modhashes:
+            packages.append ({'projectName': d.project_name, 'modules': modhashes, 'version': d.version})
+
+        # already processed distributions are filtered when popped above
+        pending.update (d.requires ())
+    return packages
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index a4a70ac..324d161 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -212,8 +212,12 @@ class WarcHandler (EventHandler):
writer.write_record (record)
def _writeControllerStart (self, item):
+        """ Write the controller’s start payload as a JSON warcinfo record """
+        # item.payload is serialized as indented JSON and UTF-8 encoded
+        payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8'))
+
writer = self.writer
- warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload)
+        warcinfo = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+                # the body is JSON, so label it as such; the media type
+                # parameter naming the character encoding is charset
+                warc_headers_dict={'Content-Type': 'application/json; charset=utf-8'},
+                payload=payload)
writer.write_record (warcinfo)
def _flushLogEntries (self):