diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2019-06-18 16:57:29 +0200 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2019-06-18 16:57:29 +0200 | 
| commit | 9c1de8ebb7b9e00bc6afc9f2ed2978a003d9abd8 (patch) | |
| tree | 4ad0c2a78e23896739d6dadbfbea9cfd1bb4d722 /crocoite | |
| parent | b4669705fa8e581c17bbe0ca0c7cf4fadbd3deb8 (diff) | |
| download | crocoite-9c1de8ebb7b9e00bc6afc9f2ed2978a003d9abd8.tar.gz crocoite-9c1de8ebb7b9e00bc6afc9f2ed2978a003d9abd8.tar.bz2 crocoite-9c1de8ebb7b9e00bc6afc9f2ed2978a003d9abd8.zip | |
Re-inject behavior scripts on site reload
Fixes #13. Event handler’s push() is async now.
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/browser.py | 23 | ||||
| -rw-r--r-- | crocoite/controller.py | 74 | ||||
| -rw-r--r-- | crocoite/test_behavior.py | 2 | ||||
| -rw-r--r-- | crocoite/test_browser.py | 54 | ||||
| -rw-r--r-- | crocoite/test_controller.py | 6 | ||||
| -rw-r--r-- | crocoite/test_warc.py | 5 | ||||
| -rw-r--r-- | crocoite/warc.py | 2 | 
7 files changed, 114 insertions, 52 deletions
| diff --git a/crocoite/browser.py b/crocoite/browser.py index 577e77a..dc67c51 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -252,12 +252,22 @@ class NavigateError (IOError):  class PageIdle:      """ Page idle event """ +    __slots__ = ('idle', ) +      def __init__ (self, idle):          self.idle = idle      def __bool__ (self):          return self.idle +class FrameNavigated: +    __slots__ = ('id', 'url', 'mimeType') + +    def __init__ (self, id, url, mimeType): +        self.id = id +        self.url = URL (url) +        self.mimeType = mimeType +  class SiteLoader:      """      Load site in Chrome and monitor network requests @@ -266,7 +276,7 @@ class SiteLoader:      """      __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', -            '_framesLoading') +            '_framesLoading', '_rootFrame')      allowedSchemes = {'http', 'https'}      def __init__ (self, browser, logger): @@ -276,6 +286,7 @@ class SiteLoader:          self._iterRunning = []          self._framesLoading = set () +        self._rootFrame = None      async def __aenter__ (self):          tab = self.tab = await self.browser.__aenter__ () @@ -317,6 +328,7 @@ class SiteLoader:                  tab.Page.javascriptDialogOpening: self._javascriptDialogOpening,                  tab.Page.frameStartedLoading: self._frameStartedLoading,                  tab.Page.frameStoppedLoading: self._frameStoppedLoading, +                tab.Page.frameNavigated: self._frameNavigated,                  }          # The implementation is a little advanced. Why? The goal here is to @@ -356,6 +368,7 @@ class SiteLoader:                  uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret)          if 'errorText' in ret:              raise NavigateError (ret['errorText']) +        self._rootFrame = ret['frameId']      # internal chrome callbacks      async def _requestWillBeSent (self, **kwargs): @@ -489,3 +502,11 @@ class SiteLoader:          if not self._framesLoading:              return PageIdle (True) +    async def _frameNavigated (self, **kwargs): +        self.logger.debug ('frameNavigated', +                uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs) +        frame = kwargs['frame'] +        if self._rootFrame == frame['id']: +            assert frame.get ('parentId', None) is None, "root frame must not have a parent" +            return FrameNavigated (frame['id'], frame['url'], frame['mimeType']) + diff --git a/crocoite/controller.py b/crocoite/controller.py index 08482af..1bcca0f 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -26,10 +26,11 @@ import time, tempfile, asyncio, json, os  from itertools import islice  from datetime import datetime  from operator import attrgetter +from abc import ABC, abstractmethod  from yarl import URL  from . import behavior as cbehavior -from .browser import SiteLoader, RequestResponsePair, PageIdle +from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated  from .util import getFormattedViewportMetrics, getSoftwareInfo  from .behavior import ExtractLinksEvent @@ -45,12 +46,13 @@ class ControllerSettings:  defaultSettings = ControllerSettings () -class EventHandler: +class EventHandler (ABC):      """ Abstract base class for event handler """      __slots__ = () -    def push (self, item): +    @abstractmethod +    async def push (self, item):          raise NotImplementedError ()  class StatsHandler (EventHandler): @@ -59,7 +61,7 @@ class StatsHandler (EventHandler):      def __init__ (self):          self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} -    def push (self, item): +    async def push (self, item):          if isinstance (item, RequestResponsePair):              self.stats['requests'] += 1              if not item.response: @@ -76,7 +78,7 @@ class LogHandler (EventHandler):      def __init__ (self, logger):          self.logger = logger.bind (context=type (self).__name__) -    def push (self, item): +    async def push (self, item):          if isinstance (item, ExtractLinksEvent):              # limit number of links per message, so json blob won’t get too big              it = iter (item.links) @@ -106,7 +108,7 @@ class IdleStateTracker (EventHandler):          self._idleSince = self._loop.time () -    def push (self, item): +    async def push (self, item):          if isinstance (item, PageIdle):              self._idle = bool (item)              if self._idle: @@ -129,6 +131,37 @@ class IdleStateTracker (EventHandler):                  sleep = timeout              await asyncio.sleep (sleep) +class InjectBehaviorOnload (EventHandler): +    """ Control behavior script injection based on frame navigation messages. +    When a page is reloaded (for whatever reason), the scripts need to be +    reinjected. """ + +    __slots__ = ('controller', '_loaded') + +    def __init__ (self, controller): +        self.controller = controller +        self._loaded = False + +    async def push (self, item): +        if isinstance (item, FrameNavigated): +            await self._runon ('load') +            self._loaded = True + +    async def stop (self): +        if self._loaded: +            await self._runon ('stop') + +    async def finish (self): +        if self._loaded: +            await self._runon ('finish') + +    async def _runon (self, method): +        controller = self.controller +        for b in controller._enabledBehavior: +            f = getattr (b, 'on' + method) +            async for item in f (): +                await controller.processItem (item) +  class SinglePageController:      """      Archive a single page url. @@ -138,7 +171,7 @@ class SinglePageController:      """      __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler', -            'warcinfo') +            'warcinfo', '_enabledBehavior')      def __init__ (self, url, logger, \              service, behavior=cbehavior.available, \ @@ -152,25 +185,27 @@ class SinglePageController:          self.handler = handler or []          self.warcinfo = warcinfo -    def processItem (self, item): +    async def processItem (self, item):          for h in self.handler: -            h.push (item) +            await h.push (item)      async def run (self):          logger = self.logger          async def processQueue ():              async for item in l: -                self.processItem (item) +                await self.processItem (item)          idle = IdleStateTracker (asyncio.get_event_loop ())          self.handler.append (idle) +        behavior = InjectBehaviorOnload (self) +        self.handler.append (behavior)          async with self.service as browser, SiteLoader (browser, logger=logger) as l:              handle = asyncio.ensure_future (processQueue ())              timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout))              # not all behavior scripts are allowed for every URL, filter them -            enabledBehavior = list (filter (lambda x: self.url in x, +            self._enabledBehavior = list (filter (lambda x: self.url in x,                      map (lambda x: x (l, logger), self.behavior)))              version = await l.tab.Browser.getVersion () @@ -186,17 +221,14 @@ class SinglePageController:                          'url': self.url,                          'idleTimeout': self.settings.idleTimeout,                          'timeout': self.settings.timeout, -                        'behavior': list (map (attrgetter('name'), enabledBehavior)), +                        'behavior': list (map (attrgetter('name'), self._enabledBehavior)),                          },                      }              if self.warcinfo:                  payload['extra'] = self.warcinfo -            self.processItem (ControllerStart (payload)) +            await self.processItem (ControllerStart (payload))              await l.navigate (self.url) -            for b in enabledBehavior: -                async for item in b.onload (): -                    self.processItem (item)              idleProc = asyncio.ensure_future (idle.wait (self.settings.idleTimeout))              while True: @@ -231,16 +263,10 @@ class SinglePageController:                      timeoutProc.cancel ()                      break -            for b in enabledBehavior: -                async for item in b.onstop (): -                    self.processItem (item) +            await behavior.stop ()              await l.tab.Page.stopLoading () -              await asyncio.sleep (1) - -            for b in enabledBehavior: -                async for item in b.onfinish (): -                    self.processItem (item) +            await behavior.finish ()              # wait until loads from behavior scripts are done and browser is              # idle for at least 1 second diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py index 9a13c65..c76a267 100644 --- a/crocoite/test_behavior.py +++ b/crocoite/test_behavior.py @@ -110,7 +110,7 @@ class AccumHandler (EventHandler):          super().__init__ ()          self.data = [] -    def push (self, item): +    async def push (self, item):          self.data.append (item)  async def simpleServer (url, response): diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 713c434..7084214 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -34,7 +34,7 @@ import pytest  from .browser import RequestResponsePair, SiteLoader, Request, \          UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \ -        Response, NavigateError, PageIdle +        Response, NavigateError, PageIdle, FrameNavigated  from .logger import Logger, Consumer  from .devtools import Crashed, Process @@ -336,33 +336,47 @@ async def test_integration_item (loader, golden):      runner = web.AppRunner(app)      await runner.setup()      site = web.TCPSite(runner, serverUrl.host, serverUrl.port) -    await site.start() +    try: +        await site.start() +    except Exception as e: +        pytest.skip (e) +    haveReqResp = False +    haveNavigated = False      try:          await loader.navigate (golden.url)          it = loader.__aiter__ ()          while True: -            item = await it.__anext__ () -            if isinstance (item, RequestResponsePair): +            try: +                item = await asyncio.wait_for (it.__anext__ (), timeout=1) +            except asyncio.TimeoutError:                  break - -        # we do not know this in advance -        item.request.initiator = None -        item.request.headers = None -        item.remoteIpAddress = None -        item.protocol = None -        item.resourceType = None - -        if item.response: -            assert item.response.statusText is not None -            item.response.statusText = None - -            del item.response.headers['server'] -            del item.response.headers['content-length'] -            del item.response.headers['date'] -        assert item == golden +            # XXX: can only check the first req/resp right now (due to redirect) +            if isinstance (item, RequestResponsePair) and not haveReqResp: +                # we do not know this in advance +                item.request.initiator = None +                item.request.headers = None +                item.remoteIpAddress = None +                item.protocol = None +                item.resourceType = None + +                if item.response: +                    assert item.response.statusText is not None +                    item.response.statusText = None + +                    del item.response.headers['server'] +                    del item.response.headers['content-length'] +                    del item.response.headers['date'] +                assert item == golden +                haveReqResp = True +            elif isinstance (item, FrameNavigated): +                # XXX: can’t check this, because of the redirect +                #assert item.url == golden.url +                haveNavigated = True      finally: +        assert haveReqResp +        assert haveNavigated          await runner.cleanup ()  def test_page_idle (): diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py index fa478a1..7e79dbe 100644 --- a/crocoite/test_controller.py +++ b/crocoite/test_controller.py @@ -130,11 +130,11 @@ async def test_idle_state_tracker ():      assert idle._idle      # idle change -    idle.push (PageIdle (False)) +    await idle.push (PageIdle (False))      assert not idle._idle      # nothing happens for other objects -    idle.push ({}) +    await idle.push ({})      assert not idle._idle      # no state change -> wait does not return @@ -144,7 +144,7 @@ async def test_idle_state_tracker ():      # wait at least timeout      delta = 0.2      timeout = 1 -    idle.push (PageIdle (True)) +    await idle.push (PageIdle (True))      assert idle._idle      start = loop.time ()      await idle.wait (timeout) diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py index 954e8c8..478892a 100644 --- a/crocoite/test_warc.py +++ b/crocoite/test_warc.py @@ -84,8 +84,9 @@ def event ():              requestResponsePair (),              ) +@pytest.mark.asyncio  @given (st.lists (event ())) -def test_push (golden): +async def test_push (golden):      def checkWarcinfoId (headers):          if lastWarcinfoRecordid is not None:              assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid @@ -97,7 +98,7 @@ def test_push (golden):      with open('/tmp/test.warc.gz', 'w+b') as fd:          with WarcHandler (fd, logger) as handler:              for g in golden: -                handler.push (g) +                await handler.push (g)          fd.seek (0)          it = iter (ArchiveIterator (fd)) diff --git a/crocoite/warc.py b/crocoite/warc.py index 4106995..3a084a1 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -241,7 +241,7 @@ class WarcHandler (EventHandler):              ControllerStart: _writeControllerStart,              } -    def push (self, item): +    async def push (self, item):          for k, v in self.route.items ():              if isinstance (item, k):                  v (self, item) | 
