summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2019-06-18 13:41:53 +0200
committerLars-Dominik Braun <lars@6xq.net>2019-06-18 13:41:53 +0200
commitb4669705fa8e581c17bbe0ca0c7cf4fadbd3deb8 (patch)
treeb64aa972023caed27ab5158e4e49aecb008a4bdf
parentc33431e6c5ccf5c0b274e2ed9c21ddf776759b67 (diff)
downloadcrocoite-b4669705fa8e581c17bbe0ca0c7cf4fadbd3deb8.tar.gz
crocoite-b4669705fa8e581c17bbe0ca0c7cf4fadbd3deb8.tar.bz2
crocoite-b4669705fa8e581c17bbe0ca0c7cf4fadbd3deb8.zip
Fix idle state tracking race condition
Closes #16. Expose SiteLoader’s page idle changes through events and move state tracking into controller event handler. Relies on tracking time instead of asyncio event, which is more reliable.
-rw-r--r--crocoite/browser.py49
-rw-r--r--crocoite/controller.py87
-rw-r--r--crocoite/test_browser.py38
-rw-r--r--crocoite/test_controller.py40
4 files changed, 121 insertions, 93 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 10eaaff..577e77a 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -246,31 +246,17 @@ class RequestResponsePair:
except TabException:
self.response.body = None
-class VarChangeEvent:
- """ Notify when variable is changed """
-
- __slots__ = ('_value', 'event')
-
- def __init__ (self, value):
- self._value = value
- self.event = asyncio.Event()
+class NavigateError (IOError):
+ pass
- def set (self, value):
- if value != self._value:
- self._value = value
- # unblock waiting threads
- self.event.set ()
- self.event.clear ()
+class PageIdle:
+ """ Page idle event """
- def get (self):
- return self._value
+ def __init__ (self, idle):
+ self.idle = idle
- async def wait (self):
- await self.event.wait ()
- return self._value
-
-class NavigateError (IOError):
- pass
+ def __bool__ (self):
+ return self.idle
class SiteLoader:
"""
@@ -280,7 +266,7 @@ class SiteLoader:
"""
__slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
- 'idle', '_framesLoading')
+ '_framesLoading')
allowedSchemes = {'http', 'https'}
def __init__ (self, browser, logger):
@@ -289,7 +275,6 @@ class SiteLoader:
self.logger = logger.bind (context=type (self).__name__)
self._iterRunning = []
- self.idle = VarChangeEvent (True)
self._framesLoading = set ()
async def __aenter__ (self):
@@ -343,22 +328,24 @@ class SiteLoader:
# we need to block (yield) for every item completed, but not
# handled by the consumer (caller).
running = self._iterRunning
- running.append (asyncio.ensure_future (self.tab.get ()))
+ tabGetTask = asyncio.ensure_future (self.tab.get ())
+ running.append (tabGetTask)
while True:
done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED)
for t in done:
result = t.result ()
if result is None:
pass
- elif isinstance (result, RequestResponsePair):
- yield result
- else:
+ elif t == tabGetTask:
method, data = result
f = handler.get (method, None)
if f is not None:
task = asyncio.ensure_future (f (**data))
pending.add (task)
- pending.add (asyncio.ensure_future (self.tab.get ()))
+ tabGetTask = asyncio.ensure_future (self.tab.get ())
+ pending.add (tabGetTask)
+ else:
+ yield result
running = pending
self._iterRunning = running
@@ -492,7 +479,7 @@ class SiteLoader:
uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs)
self._framesLoading.add (kwargs['frameId'])
- self.idle.set (False)
+ return PageIdle (False)
async def _frameStoppedLoading (self, **kwargs):
self.logger.debug ('frameStoppedLoading',
@@ -500,5 +487,5 @@ class SiteLoader:
self._framesLoading.remove (kwargs['frameId'])
if not self._framesLoading:
- self.idle.set (True)
+ return PageIdle (True)
diff --git a/crocoite/controller.py b/crocoite/controller.py
index b531491..08482af 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -29,7 +29,7 @@ from operator import attrgetter
from yarl import URL
from . import behavior as cbehavior
-from .browser import SiteLoader, RequestResponsePair
+from .browser import SiteLoader, RequestResponsePair, PageIdle
from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
@@ -95,6 +95,40 @@ class ControllerStart:
def __init__ (self, payload):
self.payload = payload
+class IdleStateTracker (EventHandler):
+ """ Track SiteLoader’s idle state by listening to PageIdle events """
+
+ __slots__ = ('_idle', '_loop', '_idleSince')
+
+ def __init__ (self, loop):
+ self._idle = True
+ self._loop = loop
+
+ self._idleSince = self._loop.time ()
+
+ def push (self, item):
+ if isinstance (item, PageIdle):
+ self._idle = bool (item)
+ if self._idle:
+ self._idleSince = self._loop.time ()
+
+ async def wait (self, timeout):
+ """ Wait until page has been idle for at least timeout seconds. If the
+ page has been idle before calling this function it may return
+ immediately. """
+
+ assert timeout > 0
+ while True:
+ if self._idle:
+ now = self._loop.time ()
+ sleep = timeout-(now-self._idleSince)
+ if sleep <= 0:
+ break
+ else:
+ # not idle, check again after timeout expires
+ sleep = timeout
+ await asyncio.sleep (sleep)
+
class SinglePageController:
"""
Archive a single page url.
@@ -128,10 +162,12 @@ class SinglePageController:
async for item in l:
self.processItem (item)
+ idle = IdleStateTracker (asyncio.get_event_loop ())
+ self.handler.append (idle)
+
async with self.service as browser, SiteLoader (browser, logger=logger) as l:
handle = asyncio.ensure_future (processQueue ())
-
- start = time.time ()
+ timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout))
# not all behavior scripts are allowed for every URL, filter them
enabledBehavior = list (filter (lambda x: self.url in x,
@@ -162,30 +198,17 @@ class SinglePageController:
async for item in b.onload ():
self.processItem (item)
- # wait until the browser has a) been idle for at least
- # settings.idleTimeout or b) settings.timeout is exceeded
- timeoutProc = asyncio.ensure_future (asyncio.sleep (self.settings.timeout))
- # the browser might have changed to idle from .navigate until here
- # due to awaits inbetween. Thus, idleProc may never be triggered.
- idleTimeout = None if not l.idle.get() else self.settings.idleTimeout
+ idleProc = asyncio.ensure_future (idle.wait (self.settings.idleTimeout))
while True:
- idleProc = asyncio.ensure_future (l.idle.wait ())
try:
finished, pending = await asyncio.wait([idleProc, timeoutProc, handle],
- return_when=asyncio.FIRST_COMPLETED, timeout=idleTimeout)
+ return_when=asyncio.FIRST_COMPLETED)
except asyncio.CancelledError:
idleProc.cancel ()
timeoutProc.cancel ()
break
- if not finished:
- # idle timeout
- logger.debug ('idle timeout',
- uuid='90702590-94c4-44ef-9b37-02a16de444c3')
- idleProc.cancel ()
- timeoutProc.cancel ()
- break
- elif handle in finished:
+ if handle in finished:
# something went wrong while processing the data
logger.error ('fetch failed',
uuid='43a0686a-a3a9-4214-9acd-43f6976f8ff3')
@@ -201,16 +224,12 @@ class SinglePageController:
timeoutProc.result ()
break
elif idleProc in finished:
- # idle state change
- isIdle = idleProc.result ()
- logger.debug ('idle state',
- uuid='e3eaff79-7b56-4d17-aa42-d32fa1ec268b',
- idle=isIdle)
- if isIdle:
- # browser is idle, start the clock
- idleTimeout = self.settings.idleTimeout
- else:
- idleTimeout = None
+ # idle timeout
+ logger.debug ('idle timeout',
+ uuid='90702590-94c4-44ef-9b37-02a16de444c3')
+ idleProc.result ()
+ timeoutProc.cancel ()
+ break
for b in enabledBehavior:
async for item in b.onstop ():
@@ -223,10 +242,12 @@ class SinglePageController:
async for item in b.onfinish ():
self.processItem (item)
- # wait until loads from behavior scripts are done
- await asyncio.sleep (1)
- if not l.idle.get ():
- while not await l.idle.wait (): pass
+ # wait until loads from behavior scripts are done and browser is
+ # idle for at least 1 second
+ try:
+ await asyncio.wait_for (idle.wait (1), timeout=1)
+ except (asyncio.TimeoutError, asyncio.CancelledError):
+ pass
if handle.done ():
handle.result ()
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 6341bd6..713c434 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -32,9 +32,9 @@ import hypothesis.strategies as st
from hypothesis.provisional import domains
import pytest
-from .browser import RequestResponsePair, SiteLoader, VarChangeEvent, Request, \
+from .browser import RequestResponsePair, SiteLoader, Request, \
UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \
- Response, NavigateError
+ Response, NavigateError, PageIdle
from .logger import Logger, Consumer
from .devtools import Crashed, Process
@@ -79,29 +79,6 @@ async def test_invalidurl (loader):
else:
pytest.skip (f'host {host} resolved to {resolved}')
-@pytest.mark.asyncio
-async def test_varchangeevent ():
- e = VarChangeEvent (True)
- assert e.get () == True
-
- # no change at all
- w = asyncio.ensure_future (e.wait ())
- finished, pending = await asyncio.wait ([w], timeout=0.1)
- assert not finished and pending
-
- # no change
- e.set (True)
- finished, pending = await asyncio.wait ([w], timeout=0.1)
- assert not finished and pending
-
- # changed
- e.set (False)
- await asyncio.sleep (0.1) # XXX: is there a yield() ?
- assert w.done ()
- ret = w.result ()
- assert ret == False
- assert e.get () == ret
-
timestamp = st.one_of (
st.integers(min_value=0, max_value=2**32-1),
st.floats (min_value=0, max_value=2**32-1),
@@ -365,7 +342,10 @@ async def test_integration_item (loader, golden):
await loader.navigate (golden.url)
it = loader.__aiter__ ()
- item = await it.__anext__ ()
+ while True:
+ item = await it.__anext__ ()
+ if isinstance (item, RequestResponsePair):
+ break
# we do not know this in advance
item.request.initiator = None
@@ -385,3 +365,9 @@ async def test_integration_item (loader, golden):
finally:
await runner.cleanup ()
+def test_page_idle ():
+ for v in (True, False):
+ idle = PageIdle (v)
+ assert bool (idle) == v
+
+
diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py
index 6f92e23..fa478a1 100644
--- a/crocoite/test_controller.py
+++ b/crocoite/test_controller.py
@@ -26,7 +26,9 @@ from aiohttp import web
import pytest
from .logger import Logger
-from .controller import ControllerSettings, SinglePageController, SetEntry
+from .controller import ControllerSettings, SinglePageController, SetEntry, \
+ IdleStateTracker
+from .browser import PageIdle
from .devtools import Process
from .test_browser import loader
@@ -63,10 +65,13 @@ window.setInterval (function () { fetch('/').then (function (e) { console.log (e
# hard-coded asyncio.sleep calls in there right now.
# XXX fix this
before = loop.time ()
- await asyncio.wait_for (controller.run (), settings.timeout*2)
+ await asyncio.wait_for (controller.run (), timeout=settings.timeout*2)
after = loop.time ()
- assert after-before >= settings.timeout
+ assert after-before >= settings.timeout, (settings.timeout*2, after-before)
finally:
+ # give the browser some time to close before interrupting the
+ # connection by destroying the HTTP server
+ await asyncio.sleep (1)
await runner.cleanup ()
@pytest.mark.asyncio
@@ -117,3 +122,32 @@ def test_set_entry ():
assert a != c
assert hash (a) != hash (c)
+@pytest.mark.asyncio
+async def test_idle_state_tracker ():
+ # default is idle
+ loop = asyncio.get_event_loop ()
+ idle = IdleStateTracker (loop)
+ assert idle._idle
+
+ # idle change
+ idle.push (PageIdle (False))
+ assert not idle._idle
+
+ # nothing happens for other objects
+ idle.push ({})
+ assert not idle._idle
+
+ # no state change -> wait does not return
+ with pytest.raises (asyncio.TimeoutError):
+ await asyncio.wait_for (idle.wait (0.1), timeout=1)
+
+ # wait at least timeout
+ delta = 0.2
+ timeout = 1
+ idle.push (PageIdle (True))
+ assert idle._idle
+ start = loop.time ()
+ await idle.wait (timeout)
+ end = loop.time ()
+ assert (timeout-delta) < (end-start) < (timeout+delta)
+