diff options
-rw-r--r-- | crocoite/browser.py         | 23
-rw-r--r-- | crocoite/controller.py      | 74
-rw-r--r-- | crocoite/test_behavior.py   |  2
-rw-r--r-- | crocoite/test_browser.py    | 54
-rw-r--r-- | crocoite/test_controller.py |  6
-rw-r--r-- | crocoite/test_warc.py       |  5
-rw-r--r-- | crocoite/warc.py            |  2
7 files changed, 114 insertions, 52 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py index 577e77a..dc67c51 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -252,12 +252,22 @@ class NavigateError (IOError): class PageIdle: """ Page idle event """ + __slots__ = ('idle', ) + def __init__ (self, idle): self.idle = idle def __bool__ (self): return self.idle +class FrameNavigated: + __slots__ = ('id', 'url', 'mimeType') + + def __init__ (self, id, url, mimeType): + self.id = id + self.url = URL (url) + self.mimeType = mimeType + class SiteLoader: """ Load site in Chrome and monitor network requests @@ -266,7 +276,7 @@ class SiteLoader: """ __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', - '_framesLoading') + '_framesLoading', '_rootFrame') allowedSchemes = {'http', 'https'} def __init__ (self, browser, logger): @@ -276,6 +286,7 @@ class SiteLoader: self._iterRunning = [] self._framesLoading = set () + self._rootFrame = None async def __aenter__ (self): tab = self.tab = await self.browser.__aenter__ () @@ -317,6 +328,7 @@ class SiteLoader: tab.Page.javascriptDialogOpening: self._javascriptDialogOpening, tab.Page.frameStartedLoading: self._frameStartedLoading, tab.Page.frameStoppedLoading: self._frameStoppedLoading, + tab.Page.frameNavigated: self._frameNavigated, } # The implementation is a little advanced. Why? 
The goal here is to @@ -356,6 +368,7 @@ class SiteLoader: uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) if 'errorText' in ret: raise NavigateError (ret['errorText']) + self._rootFrame = ret['frameId'] # internal chrome callbacks async def _requestWillBeSent (self, **kwargs): @@ -489,3 +502,11 @@ class SiteLoader: if not self._framesLoading: return PageIdle (True) + async def _frameNavigated (self, **kwargs): + self.logger.debug ('frameNavigated', + uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs) + frame = kwargs['frame'] + if self._rootFrame == frame['id']: + assert frame.get ('parentId', None) is None, "root frame must not have a parent" + return FrameNavigated (frame['id'], frame['url'], frame['mimeType']) + diff --git a/crocoite/controller.py b/crocoite/controller.py index 08482af..1bcca0f 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -26,10 +26,11 @@ import time, tempfile, asyncio, json, os from itertools import islice from datetime import datetime from operator import attrgetter +from abc import ABC, abstractmethod from yarl import URL from . 
import behavior as cbehavior -from .browser import SiteLoader, RequestResponsePair, PageIdle +from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated from .util import getFormattedViewportMetrics, getSoftwareInfo from .behavior import ExtractLinksEvent @@ -45,12 +46,13 @@ class ControllerSettings: defaultSettings = ControllerSettings () -class EventHandler: +class EventHandler (ABC): """ Abstract base class for event handler """ __slots__ = () - def push (self, item): + @abstractmethod + async def push (self, item): raise NotImplementedError () class StatsHandler (EventHandler): @@ -59,7 +61,7 @@ class StatsHandler (EventHandler): def __init__ (self): self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} - def push (self, item): + async def push (self, item): if isinstance (item, RequestResponsePair): self.stats['requests'] += 1 if not item.response: @@ -76,7 +78,7 @@ class LogHandler (EventHandler): def __init__ (self, logger): self.logger = logger.bind (context=type (self).__name__) - def push (self, item): + async def push (self, item): if isinstance (item, ExtractLinksEvent): # limit number of links per message, so json blob won’t get too big it = iter (item.links) @@ -106,7 +108,7 @@ class IdleStateTracker (EventHandler): self._idleSince = self._loop.time () - def push (self, item): + async def push (self, item): if isinstance (item, PageIdle): self._idle = bool (item) if self._idle: @@ -129,6 +131,37 @@ class IdleStateTracker (EventHandler): sleep = timeout await asyncio.sleep (sleep) +class InjectBehaviorOnload (EventHandler): + """ Control behavior script injection based on frame navigation messages. + When a page is reloaded (for whatever reason), the scripts need to be + reinjected. 
""" + + __slots__ = ('controller', '_loaded') + + def __init__ (self, controller): + self.controller = controller + self._loaded = False + + async def push (self, item): + if isinstance (item, FrameNavigated): + await self._runon ('load') + self._loaded = True + + async def stop (self): + if self._loaded: + await self._runon ('stop') + + async def finish (self): + if self._loaded: + await self._runon ('finish') + + async def _runon (self, method): + controller = self.controller + for b in controller._enabledBehavior: + f = getattr (b, 'on' + method) + async for item in f (): + await controller.processItem (item) + class SinglePageController: """ Archive a single page url. @@ -138,7 +171,7 @@ class SinglePageController: """ __slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler', - 'warcinfo') + 'warcinfo', '_enabledBehavior') def __init__ (self, url, logger, \ service, behavior=cbehavior.available, \ @@ -152,25 +185,27 @@ class SinglePageController: self.handler = handler or [] self.warcinfo = warcinfo - def processItem (self, item): + async def processItem (self, item): for h in self.handler: - h.push (item) + await h.push (item) async def run (self): logger = self.logger async def processQueue (): async for item in l: - self.processItem (item) + await self.processItem (item) idle = IdleStateTracker (asyncio.get_event_loop ()) self.handler.append (idle) + behavior = InjectBehaviorOnload (self) + self.handler.append (behavior) async with self.service as browser, SiteLoader (browser, logger=logger) as l: handle = asyncio.ensure_future (processQueue ()) timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout)) # not all behavior scripts are allowed for every URL, filter them - enabledBehavior = list (filter (lambda x: self.url in x, + self._enabledBehavior = list (filter (lambda x: self.url in x, map (lambda x: x (l, logger), self.behavior))) version = await l.tab.Browser.getVersion () @@ -186,17 +221,14 @@ class 
SinglePageController: 'url': self.url, 'idleTimeout': self.settings.idleTimeout, 'timeout': self.settings.timeout, - 'behavior': list (map (attrgetter('name'), enabledBehavior)), + 'behavior': list (map (attrgetter('name'), self._enabledBehavior)), }, } if self.warcinfo: payload['extra'] = self.warcinfo - self.processItem (ControllerStart (payload)) + await self.processItem (ControllerStart (payload)) await l.navigate (self.url) - for b in enabledBehavior: - async for item in b.onload (): - self.processItem (item) idleProc = asyncio.ensure_future (idle.wait (self.settings.idleTimeout)) while True: @@ -231,16 +263,10 @@ class SinglePageController: timeoutProc.cancel () break - for b in enabledBehavior: - async for item in b.onstop (): - self.processItem (item) + await behavior.stop () await l.tab.Page.stopLoading () - await asyncio.sleep (1) - - for b in enabledBehavior: - async for item in b.onfinish (): - self.processItem (item) + await behavior.finish () # wait until loads from behavior scripts are done and browser is # idle for at least 1 second diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py index 9a13c65..c76a267 100644 --- a/crocoite/test_behavior.py +++ b/crocoite/test_behavior.py @@ -110,7 +110,7 @@ class AccumHandler (EventHandler): super().__init__ () self.data = [] - def push (self, item): + async def push (self, item): self.data.append (item) async def simpleServer (url, response): diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 713c434..7084214 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -34,7 +34,7 @@ import pytest from .browser import RequestResponsePair, SiteLoader, Request, \ UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \ - Response, NavigateError, PageIdle + Response, NavigateError, PageIdle, FrameNavigated from .logger import Logger, Consumer from .devtools import Crashed, Process @@ -336,33 +336,47 @@ async def test_integration_item (loader, golden): 
runner = web.AppRunner(app) await runner.setup() site = web.TCPSite(runner, serverUrl.host, serverUrl.port) - await site.start() + try: + await site.start() + except Exception as e: + pytest.skip (e) + haveReqResp = False + haveNavigated = False try: await loader.navigate (golden.url) it = loader.__aiter__ () while True: - item = await it.__anext__ () - if isinstance (item, RequestResponsePair): + try: + item = await asyncio.wait_for (it.__anext__ (), timeout=1) + except asyncio.TimeoutError: break - - # we do not know this in advance - item.request.initiator = None - item.request.headers = None - item.remoteIpAddress = None - item.protocol = None - item.resourceType = None - - if item.response: - assert item.response.statusText is not None - item.response.statusText = None - - del item.response.headers['server'] - del item.response.headers['content-length'] - del item.response.headers['date'] - assert item == golden + # XXX: can only check the first req/resp right now (due to redirect) + if isinstance (item, RequestResponsePair) and not haveReqResp: + # we do not know this in advance + item.request.initiator = None + item.request.headers = None + item.remoteIpAddress = None + item.protocol = None + item.resourceType = None + + if item.response: + assert item.response.statusText is not None + item.response.statusText = None + + del item.response.headers['server'] + del item.response.headers['content-length'] + del item.response.headers['date'] + assert item == golden + haveReqResp = True + elif isinstance (item, FrameNavigated): + # XXX: can’t check this, because of the redirect + #assert item.url == golden.url + haveNavigated = True finally: + assert haveReqResp + assert haveNavigated await runner.cleanup () def test_page_idle (): diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py index fa478a1..7e79dbe 100644 --- a/crocoite/test_controller.py +++ b/crocoite/test_controller.py @@ -130,11 +130,11 @@ async def test_idle_state_tracker (): assert 
idle._idle # idle change - idle.push (PageIdle (False)) + await idle.push (PageIdle (False)) assert not idle._idle # nothing happens for other objects - idle.push ({}) + await idle.push ({}) assert not idle._idle # no state change -> wait does not return @@ -144,7 +144,7 @@ async def test_idle_state_tracker (): # wait at least timeout delta = 0.2 timeout = 1 - idle.push (PageIdle (True)) + await idle.push (PageIdle (True)) assert idle._idle start = loop.time () await idle.wait (timeout) diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py index 954e8c8..478892a 100644 --- a/crocoite/test_warc.py +++ b/crocoite/test_warc.py @@ -84,8 +84,9 @@ def event (): requestResponsePair (), ) +@pytest.mark.asyncio @given (st.lists (event ())) -def test_push (golden): +async def test_push (golden): def checkWarcinfoId (headers): if lastWarcinfoRecordid is not None: assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid @@ -97,7 +98,7 @@ def test_push (golden): with open('/tmp/test.warc.gz', 'w+b') as fd: with WarcHandler (fd, logger) as handler: for g in golden: - handler.push (g) + await handler.push (g) fd.seek (0) it = iter (ArchiveIterator (fd)) diff --git a/crocoite/warc.py b/crocoite/warc.py index 4106995..3a084a1 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -241,7 +241,7 @@ class WarcHandler (EventHandler): ControllerStart: _writeControllerStart, } - def push (self, item): + async def push (self, item): for k, v in self.route.items (): if isinstance (item, k): v (self, item) |