summaryrefslogtreecommitdiff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r--crocoite/browser.py23
-rw-r--r--crocoite/controller.py74
-rw-r--r--crocoite/test_behavior.py2
-rw-r--r--crocoite/test_browser.py54
-rw-r--r--crocoite/test_controller.py6
-rw-r--r--crocoite/test_warc.py5
-rw-r--r--crocoite/warc.py2
7 files changed, 114 insertions, 52 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 577e77a..dc67c51 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -252,12 +252,22 @@ class NavigateError (IOError):
class PageIdle:
""" Page idle event """
+ __slots__ = ('idle', )
+
def __init__ (self, idle):
self.idle = idle
def __bool__ (self):
return self.idle
+class FrameNavigated:
+ __slots__ = ('id', 'url', 'mimeType')
+
+ def __init__ (self, id, url, mimeType):
+ self.id = id
+ self.url = URL (url)
+ self.mimeType = mimeType
+
class SiteLoader:
"""
Load site in Chrome and monitor network requests
@@ -266,7 +276,7 @@ class SiteLoader:
"""
__slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
- '_framesLoading')
+ '_framesLoading', '_rootFrame')
allowedSchemes = {'http', 'https'}
def __init__ (self, browser, logger):
@@ -276,6 +286,7 @@ class SiteLoader:
self._iterRunning = []
self._framesLoading = set ()
+ self._rootFrame = None
async def __aenter__ (self):
tab = self.tab = await self.browser.__aenter__ ()
@@ -317,6 +328,7 @@ class SiteLoader:
tab.Page.javascriptDialogOpening: self._javascriptDialogOpening,
tab.Page.frameStartedLoading: self._frameStartedLoading,
tab.Page.frameStoppedLoading: self._frameStoppedLoading,
+ tab.Page.frameNavigated: self._frameNavigated,
}
# The implementation is a little advanced. Why? The goal here is to
@@ -356,6 +368,7 @@ class SiteLoader:
uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret)
if 'errorText' in ret:
raise NavigateError (ret['errorText'])
+ self._rootFrame = ret['frameId']
# internal chrome callbacks
async def _requestWillBeSent (self, **kwargs):
@@ -489,3 +502,11 @@ class SiteLoader:
if not self._framesLoading:
return PageIdle (True)
+ async def _frameNavigated (self, **kwargs):
+ self.logger.debug ('frameNavigated',
+ uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs)
+ frame = kwargs['frame']
+ if self._rootFrame == frame['id']:
+ assert frame.get ('parentId', None) is None, "root frame must not have a parent"
+ return FrameNavigated (frame['id'], frame['url'], frame['mimeType'])
+
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 08482af..1bcca0f 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -26,10 +26,11 @@ import time, tempfile, asyncio, json, os
from itertools import islice
from datetime import datetime
from operator import attrgetter
+from abc import ABC, abstractmethod
from yarl import URL
from . import behavior as cbehavior
-from .browser import SiteLoader, RequestResponsePair, PageIdle
+from .browser import SiteLoader, RequestResponsePair, PageIdle, FrameNavigated
from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
@@ -45,12 +46,13 @@ class ControllerSettings:
defaultSettings = ControllerSettings ()
-class EventHandler:
+class EventHandler (ABC):
""" Abstract base class for event handler """
__slots__ = ()
- def push (self, item):
+ @abstractmethod
+ async def push (self, item):
raise NotImplementedError ()
class StatsHandler (EventHandler):
@@ -59,7 +61,7 @@ class StatsHandler (EventHandler):
def __init__ (self):
self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
- def push (self, item):
+ async def push (self, item):
if isinstance (item, RequestResponsePair):
self.stats['requests'] += 1
if not item.response:
@@ -76,7 +78,7 @@ class LogHandler (EventHandler):
def __init__ (self, logger):
self.logger = logger.bind (context=type (self).__name__)
- def push (self, item):
+ async def push (self, item):
if isinstance (item, ExtractLinksEvent):
# limit number of links per message, so json blob won’t get too big
it = iter (item.links)
@@ -106,7 +108,7 @@ class IdleStateTracker (EventHandler):
self._idleSince = self._loop.time ()
- def push (self, item):
+ async def push (self, item):
if isinstance (item, PageIdle):
self._idle = bool (item)
if self._idle:
@@ -129,6 +131,37 @@ class IdleStateTracker (EventHandler):
sleep = timeout
await asyncio.sleep (sleep)
+class InjectBehaviorOnload (EventHandler):
+ """ Control behavior script injection based on frame navigation messages.
+ When a page is reloaded (for whatever reason), the scripts need to be
+ reinjected. """
+
+ __slots__ = ('controller', '_loaded')
+
+ def __init__ (self, controller):
+ self.controller = controller
+ self._loaded = False
+
+ async def push (self, item):
+ if isinstance (item, FrameNavigated):
+ await self._runon ('load')
+ self._loaded = True
+
+ async def stop (self):
+ if self._loaded:
+ await self._runon ('stop')
+
+ async def finish (self):
+ if self._loaded:
+ await self._runon ('finish')
+
+ async def _runon (self, method):
+ controller = self.controller
+ for b in controller._enabledBehavior:
+ f = getattr (b, 'on' + method)
+ async for item in f ():
+ await controller.processItem (item)
+
class SinglePageController:
"""
Archive a single page url.
@@ -138,7 +171,7 @@ class SinglePageController:
"""
__slots__ = ('url', 'service', 'behavior', 'settings', 'logger', 'handler',
- 'warcinfo')
+ 'warcinfo', '_enabledBehavior')
def __init__ (self, url, logger, \
service, behavior=cbehavior.available, \
@@ -152,25 +185,27 @@ class SinglePageController:
self.handler = handler or []
self.warcinfo = warcinfo
- def processItem (self, item):
+ async def processItem (self, item):
for h in self.handler:
- h.push (item)
+ await h.push (item)
async def run (self):
logger = self.logger
async def processQueue ():
async for item in l:
- self.processItem (item)
+ await self.processItem (item)
idle = IdleStateTracker (asyncio.get_event_loop ())
self.handler.append (idle)
+ behavior = InjectBehaviorOnload (self)
+ self.handler.append (behavior)
async with self.service as browser, SiteLoader (browser, logger=logger) as l:
handle = asyncio.ensure_future (processQueue ())
timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout))
# not all behavior scripts are allowed for every URL, filter them
- enabledBehavior = list (filter (lambda x: self.url in x,
+ self._enabledBehavior = list (filter (lambda x: self.url in x,
map (lambda x: x (l, logger), self.behavior)))
version = await l.tab.Browser.getVersion ()
@@ -186,17 +221,14 @@ class SinglePageController:
'url': self.url,
'idleTimeout': self.settings.idleTimeout,
'timeout': self.settings.timeout,
- 'behavior': list (map (attrgetter('name'), enabledBehavior)),
+ 'behavior': list (map (attrgetter('name'), self._enabledBehavior)),
},
}
if self.warcinfo:
payload['extra'] = self.warcinfo
- self.processItem (ControllerStart (payload))
+ await self.processItem (ControllerStart (payload))
await l.navigate (self.url)
- for b in enabledBehavior:
- async for item in b.onload ():
- self.processItem (item)
idleProc = asyncio.ensure_future (idle.wait (self.settings.idleTimeout))
while True:
@@ -231,16 +263,10 @@ class SinglePageController:
timeoutProc.cancel ()
break
- for b in enabledBehavior:
- async for item in b.onstop ():
- self.processItem (item)
+ await behavior.stop ()
await l.tab.Page.stopLoading ()
-
await asyncio.sleep (1)
-
- for b in enabledBehavior:
- async for item in b.onfinish ():
- self.processItem (item)
+ await behavior.finish ()
# wait until loads from behavior scripts are done and browser is
# idle for at least 1 second
diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py
index 9a13c65..c76a267 100644
--- a/crocoite/test_behavior.py
+++ b/crocoite/test_behavior.py
@@ -110,7 +110,7 @@ class AccumHandler (EventHandler):
super().__init__ ()
self.data = []
- def push (self, item):
+ async def push (self, item):
self.data.append (item)
async def simpleServer (url, response):
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 713c434..7084214 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -34,7 +34,7 @@ import pytest
from .browser import RequestResponsePair, SiteLoader, Request, \
UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \
- Response, NavigateError, PageIdle
+ Response, NavigateError, PageIdle, FrameNavigated
from .logger import Logger, Consumer
from .devtools import Crashed, Process
@@ -336,33 +336,47 @@ async def test_integration_item (loader, golden):
runner = web.AppRunner(app)
await runner.setup()
site = web.TCPSite(runner, serverUrl.host, serverUrl.port)
- await site.start()
+ try:
+ await site.start()
+ except Exception as e:
+ pytest.skip (e)
+ haveReqResp = False
+ haveNavigated = False
try:
await loader.navigate (golden.url)
it = loader.__aiter__ ()
while True:
- item = await it.__anext__ ()
- if isinstance (item, RequestResponsePair):
+ try:
+ item = await asyncio.wait_for (it.__anext__ (), timeout=1)
+ except asyncio.TimeoutError:
break
-
- # we do not know this in advance
- item.request.initiator = None
- item.request.headers = None
- item.remoteIpAddress = None
- item.protocol = None
- item.resourceType = None
-
- if item.response:
- assert item.response.statusText is not None
- item.response.statusText = None
-
- del item.response.headers['server']
- del item.response.headers['content-length']
- del item.response.headers['date']
- assert item == golden
+ # XXX: can only check the first req/resp right now (due to redirect)
+ if isinstance (item, RequestResponsePair) and not haveReqResp:
+ # we do not know this in advance
+ item.request.initiator = None
+ item.request.headers = None
+ item.remoteIpAddress = None
+ item.protocol = None
+ item.resourceType = None
+
+ if item.response:
+ assert item.response.statusText is not None
+ item.response.statusText = None
+
+ del item.response.headers['server']
+ del item.response.headers['content-length']
+ del item.response.headers['date']
+ assert item == golden
+ haveReqResp = True
+ elif isinstance (item, FrameNavigated):
+ # XXX: can’t check this, because of the redirect
+ #assert item.url == golden.url
+ haveNavigated = True
finally:
+ assert haveReqResp
+ assert haveNavigated
await runner.cleanup ()
def test_page_idle ():
diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py
index fa478a1..7e79dbe 100644
--- a/crocoite/test_controller.py
+++ b/crocoite/test_controller.py
@@ -130,11 +130,11 @@ async def test_idle_state_tracker ():
assert idle._idle
# idle change
- idle.push (PageIdle (False))
+ await idle.push (PageIdle (False))
assert not idle._idle
# nothing happens for other objects
- idle.push ({})
+ await idle.push ({})
assert not idle._idle
# no state change -> wait does not return
@@ -144,7 +144,7 @@ async def test_idle_state_tracker ():
# wait at least timeout
delta = 0.2
timeout = 1
- idle.push (PageIdle (True))
+ await idle.push (PageIdle (True))
assert idle._idle
start = loop.time ()
await idle.wait (timeout)
diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py
index 954e8c8..478892a 100644
--- a/crocoite/test_warc.py
+++ b/crocoite/test_warc.py
@@ -84,8 +84,9 @@ def event ():
requestResponsePair (),
)
+@pytest.mark.asyncio
@given (st.lists (event ()))
-def test_push (golden):
+async def test_push (golden):
def checkWarcinfoId (headers):
if lastWarcinfoRecordid is not None:
assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid
@@ -97,7 +98,7 @@ def test_push (golden):
with open('/tmp/test.warc.gz', 'w+b') as fd:
with WarcHandler (fd, logger) as handler:
for g in golden:
- handler.push (g)
+ await handler.push (g)
fd.seek (0)
it = iter (ArchiveIterator (fd))
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 4106995..3a084a1 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -241,7 +241,7 @@ class WarcHandler (EventHandler):
ControllerStart: _writeControllerStart,
}
- def push (self, item):
+ async def push (self, item):
for k, v in self.route.items ():
if isinstance (item, k):
v (self, item)