diff options
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/controller.py | 22 |
| -rw-r--r-- | crocoite/util.py | 45 |
| -rw-r--r-- | crocoite/warc.py | 6 |
3 files changed, 65 insertions, 8 deletions
| diff --git a/crocoite/controller.py b/crocoite/controller.py index cbf0037..178d11c 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -70,11 +70,11 @@ class StatsHandler (EventHandler):          elif isinstance (item, BrowserCrashed):              self.stats['crashed'] += 1 -import time +import time, platform  from . import behavior as cbehavior  from .browser import ChromeService, SiteLoader, Item -from .util import getFormattedViewportMetrics, removeFragment +from .util import getFormattedViewportMetrics, removeFragment, getRequirements  class ControllerStart:      __slots__ = ('payload') @@ -163,10 +163,20 @@ class SinglePageController:              version = l.tab.Browser.getVersion ()              payload = { -                    'software': __package__, -                    'browser': version['product'], -                    'useragent': version['userAgent'], -                    'viewport': getFormattedViewportMetrics (l.tab), +                    'software': { +                        'platform': platform.platform (), +                        'python': { +                            'implementation': platform.python_implementation(), +                            'version': platform.python_version (), +                            'build': platform.python_build () +                            }, +                        'self': getRequirements (__package__) +                        }, +                    'browser': { +                        'product': version['product'], +                        'useragent': version['userAgent'], +                        'viewport': getFormattedViewportMetrics (l.tab), +                        },                      }              self.processItem (ControllerStart (payload)) diff --git a/crocoite/util.py b/crocoite/util.py index fe43f01..3a62533 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -22,7 +22,8 @@  Random utility functions  """ -import random +import random, sys +import hashlib, os, 
pkg_resources  from urllib.parse import urlsplit, urlunsplit  def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): @@ -47,3 +48,45 @@ def removeFragment (u):      s = urlsplit (u)      return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) +def getRequirements (dist): +    """ Get dependencies of a package. + +    Figure out packages’ dependencies based on setup/distutils, then look at +    modules loaded and compute hashes of each loaded dependency. + +    This does not and cannot protect against malicious people. It’s only +    purpose is to recreate this exact environment. +    """ + +    pending = {dist} +    have = set () +    packages = [] +    while pending: +        d = pkg_resources.get_distribution (pending.pop ()) + +        modules = list (filter (lambda x: x, d.get_metadata ('top_level.txt').split ('\n'))) +        modhashes = {} +        # hash loaded modules +        for m in sys.modules.values (): +            f = getattr (m, '__file__', None) +            pkg = getattr (m, '__package__', None) +            # is loaded? 
+            if pkg in modules: +                if f: +                    with open (f, 'rb') as fd: +                        contents = fd.read () +                        h = hashlib.new ('sha512') +                        h.update (contents) +                        modhashes[m.__name__] = {'sha512': h.hexdigest (), 'len': len (contents)} +                else: +                    modhashes[m.__name__] = {} + +        # only if one of the packages’ modules is actually loaded +        if modhashes: +            packages.append ({'projectName': d.project_name, 'modules': modhashes, 'version': d.version}) + +        have.add (dist) +        pending.update (d.requires ()) +        pending.difference_update (have) +    return packages + diff --git a/crocoite/warc.py b/crocoite/warc.py index a4a70ac..324d161 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -212,8 +212,12 @@ class WarcHandler (EventHandler):          writer.write_record (record)      def _writeControllerStart (self, item): +        payload = BytesIO (json.dumps (item.payload, indent=2).encode ('utf-8')) +          writer = self.writer -        warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload) +        warcinfo = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', +                warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}, +                payload=payload)          writer.write_record (warcinfo)      def _flushLogEntries (self): | 
