diff options
-rw-r--r-- | crocoite/controller.py | 22 | ||||
-rw-r--r-- | crocoite/util.py | 45 | ||||
-rw-r--r-- | crocoite/warc.py | 6 |
3 files changed, 65 insertions, 8 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py index cbf0037..178d11c 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -70,11 +70,11 @@ class StatsHandler (EventHandler): elif isinstance (item, BrowserCrashed): self.stats['crashed'] += 1 -import time +import time, platform from . import behavior as cbehavior from .browser import ChromeService, SiteLoader, Item -from .util import getFormattedViewportMetrics, removeFragment +from .util import getFormattedViewportMetrics, removeFragment, getRequirements class ControllerStart: __slots__ = ('payload') @@ -163,10 +163,20 @@ class SinglePageController: version = l.tab.Browser.getVersion () payload = { - 'software': __package__, - 'browser': version['product'], - 'useragent': version['userAgent'], - 'viewport': getFormattedViewportMetrics (l.tab), + 'software': { + 'platform': platform.platform (), + 'python': { + 'implementation': platform.python_implementation(), + 'version': platform.python_version (), + 'build': platform.python_build () + }, + 'self': getRequirements (__package__) + }, + 'browser': { + 'product': version['product'], + 'useragent': version['userAgent'], + 'viewport': getFormattedViewportMetrics (l.tab), + }, } self.processItem (ControllerStart (payload)) diff --git a/crocoite/util.py b/crocoite/util.py index fe43f01..3a62533 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -22,7 +22,8 @@ Random utility functions """ -import random +import random, sys +import hashlib, os, pkg_resources from urllib.parse import urlsplit, urlunsplit def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): @@ -47,3 +48,45 @@ def removeFragment (u): s = urlsplit (u) return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) +def getRequirements (dist): + """ Get dependencies of a package. + + Figure out packages’ dependencies based on setup/distutils, then look at + modules loaded and compute hashes of each loaded dependency. 
+ + This does not and cannot protect against malicious people. Its only + purpose is to recreate this exact environment. + """ + + pending = {dist} + have = set () + packages = [] + while pending: + d = pkg_resources.get_distribution (pending.pop ()) + + modules = list (filter (lambda x: x, d.get_metadata ('top_level.txt').split ('\n'))) + modhashes = {} + # hash loaded modules + for m in sys.modules.values (): + f = getattr (m, '__file__', None) + pkg = getattr (m, '__package__', None) + # is loaded? + if pkg in modules: + if f: + with open (f, 'rb') as fd: + contents = fd.read () + h = hashlib.new ('sha512') + h.update (contents) + modhashes[m.__name__] = {'sha512': h.hexdigest (), 'len': len (contents)} + else: + modhashes[m.__name__] = {} + + # only if one of the packages’ modules is actually loaded + if modhashes: + packages.append ({'projectName': d.project_name, 'modules': modhashes, 'version': d.version}) + + have.add (dist) + pending.update (d.requires ()) + pending.difference_update (have) + return packages + diff --git a/crocoite/warc.py b/crocoite/warc.py index a4a70ac..324d161 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -212,8 +212,12 @@ class WarcHandler (EventHandler): writer.write_record (record) def _writeControllerStart (self, item): + payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8')) + writer = self.writer - warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload) + warcinfo = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}, + payload=payload) writer.write_record (warcinfo) def _flushLogEntries (self): |