summary refs log tree commit diff
path: root/crocoite/browser.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/browser.py')
-rw-r--r-- crocoite/browser.py 913
1 files changed, 396 insertions, 517 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index e58ebcf..3518789 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -22,91 +22,198 @@
Chrome browser interactions.
"""
-import logging
-from urllib.parse import urlsplit
-from base64 import b64decode
-import pychrome
+import asyncio
+from base64 import b64decode, b64encode
+from datetime import datetime, timedelta
+from http.server import BaseHTTPRequestHandler
-class Item:
- """
- Simple wrapper containing Chrome request and response
- """
+from yarl import URL
+from multidict import CIMultiDict
- def __init__ (self, tab):
- self.tab = tab
- self.chromeRequest = None
- self.chromeResponse = None
- self.chromeFinished = None
+from .logger import Level
+from .devtools import Browser, TabException
- def __repr__ (self):
- return '<Item {}>'.format (self.request['url'])
+# These two classes’ only purpose is so we can later tell whether a body was
+# base64-encoded or a unicode string
+class Base64Body (bytes):
+ """ Body that arrived base64-encoded; the instance holds the decoded bytes """
+ def __new__ (cls, value):
+ # value is the base64 text, the resulting bytes object is its decoded form
+ return bytes.__new__ (cls, b64decode (value))
- @property
- def request (self):
- return self.chromeRequest['request']
+ @classmethod
+ def fromBytes (cls, b):
+ """ For testing: build an instance from raw bytes by base64-encoding them first """
+ return cls (b64encode (b))
- @property
- def response (self):
- return self.chromeResponse['response']
+class UnicodeBody (bytes):
+ """ Body that arrived as a unicode string; the instance holds its UTF-8 encoding """
+ def __new__ (cls, value):
+ # exact type check: anything that is not precisely str raises TypeError
+ if type (value) is not str:
+ raise TypeError ('expecting unicode string')
- @property
- def initiator (self):
- return self.chromeRequest['initiator']
+ return bytes.__new__ (cls, value.encode ('utf-8'))
- @property
- def id (self):
- return self.chromeRequest['requestId']
+class Request:
+ """ Plain container for the request half of a request/response pair """
+ __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp')
- @property
- def encodedDataLength (self):
- return self.chromeFinished['encodedDataLength']
+ def __init__ (self, method=None, headers=None, body=None):
+ # initiator, hasPostData and timestamp are not constructor arguments;
+ # they start out as None/False/None and are assigned afterwards
+ self.headers = headers
+ self.body = body
+ self.hasPostData = False
+ self.initiator = None
+ # HTTP method
+ self.method = method
+ self.timestamp = None
- @property
- def url (self):
- return self.response['url']
+ def __repr__ (self):
+ return f'Request({self.method!r}, {self.headers!r}, {self.body!r})'
+
+ def __eq__ (self, b):
+ """
+ Compare headers, body, initiator and method.
+
+ Returns False for None and raises TypeError for any other
+ non-Request operand.
+ """
+ if b is None:
+ return False
+
+ if not isinstance (b, Request):
+ raise TypeError ('Can only compare equality with Request.')
+
+ # do not compare hasPostData (only required to fetch body) and
+ # timestamp (depends on time)
+ return self.headers == b.headers and \
+ self.body == b.body and \
+ self.initiator == b.initiator and \
+ self.method == b.method
+
+class Response:
+ """ Plain container for the response half of a request/response pair """
+ __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived',
+ 'timestamp', 'mimeType')
+
+ def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None):
+ self.status = status
+ self.statusText = statusText
+ self.headers = headers
+ self.body = body
+ # bytes received over the network (not body size!)
+ self.bytesReceived = 0
+ self.timestamp = None
+ self.mimeType = mimeType
- @property
- def parsedUrl (self):
- return urlsplit (self.url)
+ def __repr__ (self):
+ return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+ def __eq__ (self, b):
+ """
+ Compare status, statusText, headers, body and mimeType.
+
+ Returns False for None and raises TypeError for any other
+ non-Response operand.
+ """
+ if b is None:
+ return False
+
+ if not isinstance (b, Response):
+ raise TypeError ('Can only compare equality with Response.')
+
+ # do not compare bytesReceived (depends on network) and timestamp
+ # (depends on time).
+ # NOTE(review): this comment used to list statusText as ignored too
+ # ("does not matter"), but the expression below does compare it.
+ # Confirm which of the two is intended.
+ return self.status == b.status and \
+ self.statusText == b.statusText and \
+ self.headers == b.headers and \
+ self.body == b.body and \
+ self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+ """
+ Map relative timestamp to absolute timestamp
+
+ Built from one pair of readings taken at the same instant: a relative
+ one in seconds and an absolute one in seconds since the Unix epoch.
+ Calling the object with another relative reading (seconds or timedelta)
+ returns the matching naive UTC datetime.
+ """
+
+ def __init__ (self, relative, absolute):
+ self.relative = timedelta (seconds=relative)
+ # Same naive UTC value datetime.utcfromtimestamp returned, computed
+ # from the epoch instead, since utcfromtimestamp is deprecated as of
+ # Python 3.12.
+ self.absolute = datetime (1970, 1, 1) + timedelta (seconds=absolute)
+
+ def __call__ (self, relative):
+ """ Return the absolute datetime for a relative timestamp """
+ if not isinstance (relative, timedelta):
+ relative = timedelta (seconds=relative)
+ # offset from the reference reading, applied to the reference datetime
+ return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+ __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+ 'protocol', 'resourceType', '_time')
+
+ def __init__ (self, id=None, url=None, request=None, response=None):
+ """ Create a pair; everything not passed in starts out as None """
+ self.request = request
+ self.response = response
+ self.id = id
+ self.url = url
+ self.remoteIpAddress = None
+ self.protocol = None
+ self.resourceType = None
+ # ReferenceTimestamp, assigned by fromRequestWillBeSent
+ self._time = None
- @property
- def body (self):
- """ Return response body or None """
- try:
- body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=60)
- rawBody = body['body']
- base64Encoded = body['base64Encoded']
- if base64Encoded:
- rawBody = b64decode (rawBody)
- else:
- rawBody = rawBody.encode ('utf8')
- return rawBody, base64Encoded
- except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException):
- raise ValueError ('Cannot fetch response body')
-
- @property
- def requestBody (self):
- """ Get request/POST body """
- req = self.request
- postData = req.get ('postData')
- if postData:
- return postData.encode ('utf8'), False
- elif req.get ('hasPostData', False):
- try:
- return b64decode (self.tab.Network.getRequestPostData (requestId=self.id, _timeout=60)['postData']), True
- except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException):
- raise ValueError ('Cannot fetch request body')
- return None, False
+ def __repr__ (self):
+ # shows id, url, request and response only
+ return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+ def __eq__ (self, b):
+ """
+ Compare request, response, url, remoteIpAddress, protocol and
+ resourceType. Raises TypeError for any non-RequestResponsePair.
+ """
+ # NOTE(review): unlike Request.__eq__ and Response.__eq__ there is no
+ # "b is None" shortcut here, so comparing with None raises as well
+ if not isinstance (b, RequestResponsePair):
+ raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+ # do not compare id and _time. These depend on external factors and do
+ # not influence the request/response *content*
+ return self.request == b.request and \
+ self.response == b.response and \
+ self.url == b.url and \
+ self.remoteIpAddress == b.remoteIpAddress and \
+ self.protocol == b.protocol and \
+ self.resourceType == b.resourceType
+
+ def fromRequestWillBeSent (self, req):
+ """
+ Set request data from Chrome Network.requestWillBeSent event
+
+ Fills in id, url, resourceType and the time reference, then creates
+ the Request. Must only be called while self.request is still None.
+ """
+ r = req['request']
+
+ self.id = req['requestId']
+ self.url = URL (r['url'])
+ self.resourceType = req.get ('type')
+ # this event carries both a relative timestamp and a wall-clock time,
+ # which anchors later relative timestamps
+ self._time = ReferenceTimestamp (req['timestamp'], req['wallTime'])
+
+ assert self.request is None, req
+ self.request = Request ()
+ self.request.initiator = req['initiator']
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.request.hasPostData = r.get ('hasPostData', False)
+ self.request.method = r['method']
+ self.request.timestamp = self._time (req['timestamp'])
+ # the body is only taken from the event if it is included inline;
+ # otherwise it stays None and prefetchRequestBody fetches it later
+ if self.request.hasPostData:
+ postData = r.get ('postData')
+ if postData is not None:
+ self.request.body = UnicodeBody (postData)
+
+ def fromResponse (self, r, timestamp=None, resourceType=None):
+ """
+ Set response data from Chrome’s Response object.
+
+ Request must exist. Updates if response was set before. Sometimes
+ fromResponseReceived is triggered twice by Chrome. No idea why.
+
+ A falsy timestamp falls back to the request timestamp; a falsy
+ resourceType leaves the current resourceType untouched.
+ """
+ assert self.request is not None, (self.request, r)
+
+ if not timestamp:
+ timestamp = self.request.timestamp
+
+ self.remoteIpAddress = r.get ('remoteIPAddress')
+ self.protocol = r.get ('protocol')
+ if resourceType:
+ self.resourceType = resourceType
+
+ # a response may contain updated request headers (i.e. those actually
+ # sent over the wire)
+ if 'requestHeaders' in r:
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders']))
+
+ # always a fresh Response: body and bytesReceived of an earlier one are dropped
+ self.response = Response ()
+ self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.response.status = r['status']
+ self.response.statusText = r['statusText']
+ self.response.timestamp = timestamp
+ self.response.mimeType = r['mimeType']
- @property
- def requestHeaders (self):
- # the response object may contain refined headers, which were
- # *actually* sent over the wire
- return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers']))
+ def fromResponseReceived (self, resp):
+ """ Set response data from Chrome Network.responseReceived """
+ # converts the event's relative timestamp with the reference stored
+ # by fromRequestWillBeSent, which therefore must have run before
+ return self.fromResponse (resp['response'],
+ self._time (resp['timestamp']), resp['type'])
- @property
- def responseHeaders (self):
- return self._unfoldHeaders (self.response['headers'])
+ def fromLoadingFinished (self, data):
+ """ Record encodedDataLength of the event as bytesReceived; needs an existing response """
+ self.response.bytesReceived = data['encodedDataLength']
+
+ def fromLoadingFailed (self, data):
+ """ Drop any response recorded so far; data itself is not inspected """
+ self.response = None
@staticmethod
def _unfoldHeaders (headers):
@@ -120,14 +227,46 @@ class Item:
items.append ((k, v))
return items
- def setRequest (self, req):
- self.chromeRequest = req
+ async def prefetchRequestBody (self, tab):
+ """
+ Fetch the request body through tab, but only if the request has
+ post data and no body was stored yet. Body stays None on TabException.
+ """
+ if self.request.hasPostData and self.request.body is None:
+ try:
+ postData = await tab.Network.getRequestPostData (requestId=self.id)
+ self.request.body = UnicodeBody (postData['postData'])
+ except TabException:
+ self.request.body = None
+
+ async def prefetchResponseBody (self, tab):
+ """
+ Fetch response body
+
+ Stored as Base64Body or UnicodeBody depending on the base64Encoded
+ flag of the reply; None on TabException. Needs an existing response.
+ """
+ try:
+ body = await tab.Network.getResponseBody (requestId=self.id)
+ if body['base64Encoded']:
+ self.response.body = Base64Body (body['body'])
+ else:
+ self.response.body = UnicodeBody (body['body'])
+ except TabException:
+ self.response.body = None
- def setResponse (self, resp):
- self.chromeResponse = resp
+class NavigateError (IOError):
+ """ Raised by SiteLoader.navigate when the navigate result contains an errorText """
+ pass
- def setFinished (self, finished):
- self.chromeFinished = finished
+class PageIdle:
+ """ Page idle event """
+
+ __slots__ = ('idle', )
+
+ def __init__ (self, idle):
+ self.idle = idle
+
+ def __bool__ (self):
+ # returned as-is; Python requires __bool__ to return a bool, so idle
+ # is presumably always True or False (both uses in this file are)
+ return self.idle
+
+class FrameNavigated:
+ """ Event object carrying id, url and mimeType of a navigated frame """
+ __slots__ = ('id', 'url', 'mimeType')
+
+ def __init__ (self, id, url, mimeType):
+ self.id = id
+ # url is wrapped in a yarl URL
+ self.url = URL (url)
+ self.mimeType = mimeType
class SiteLoader:
"""
@@ -136,505 +275,245 @@ class SiteLoader:
XXX: track popup windows/new tabs and close them
"""
+ __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
+ '_framesLoading', '_rootFrame')
allowedSchemes = {'http', 'https'}
- def __init__ (self, browser, url, logger=logging.getLogger(__name__)):
+ def __init__ (self, browser, logger):
+ """
+ browser is passed to Browser as its url; logger must provide bind().
+ No tab exists until the object is entered with "async with".
+ """
self.requests = {}
- self.browser = browser
- self.url = url
- self.logger = logger
+ self.browser = Browser (url=browser)
+ self.logger = logger.bind (context=type (self).__name__)
+ # tasks owned by __aiter__; cancelled in __aexit__
+ self._iterRunning = []
- self.tab = browser.new_tab()
+ # tab is a slot and must be initialised, otherwise reading it before
+ # __aenter__ raises AttributeError instead of tripping the
+ # "tab is not None" assertion in __aiter__. None is also the value
+ # __aexit__ resets it to.
+ self.tab = None
+ # ids of frames that started but did not yet stop loading
+ self._framesLoading = set ()
+ # frame id returned by the last successful navigate()
+ self._rootFrame = None
- def __enter__ (self):
- tab = self.tab
- # setup callbacks
- tab.Network.requestWillBeSent = self._requestWillBeSent
- tab.Network.responseReceived = self._responseReceived
- tab.Network.loadingFinished = self._loadingFinished
- tab.Network.loadingFailed = self._loadingFailed
- tab.Log.entryAdded = self._entryAdded
- #tab.Page.loadEventFired = loadEventFired
- tab.Page.javascriptDialogOpening = self._javascriptDialogOpening
-
- # start the tab
- tab.start()
+ async def __aenter__ (self):
+ """ Enter the browser context, keep its result as self.tab and set the tab up """
+ tab = self.tab = await self.browser.__aenter__ ()
# enable events
- tab.Log.enable ()
- tab.Network.enable()
- tab.Page.enable ()
- tab.Network.clearBrowserCache ()
- if tab.Network.canClearBrowserCookies ()['result']:
- tab.Network.clearBrowserCookies ()
-
+ # all six calls are issued at once and awaited together; cache and
+ # cookies are cleared as part of the same batch
+ await asyncio.gather (*[
+ tab.Log.enable (),
+ tab.Network.enable(),
+ tab.Page.enable (),
+ tab.Inspector.enable (),
+ tab.Network.clearBrowserCache (),
+ tab.Network.clearBrowserCookies (),
+ ])
return self
+ async def __aexit__ (self, exc_type, exc_value, traceback):
+ """
+ Cancel unfinished tasks left over from __aiter__, leave the browser
+ context and forget the tab. Returns False, so exceptions propagate.
+ """
+ for task in self._iterRunning:
+ # ignore any results from stuff we did not end up using anyway
+ if not task.done ():
+ task.cancel ()
+ self._iterRunning = []
+ await self.browser.__aexit__ (exc_type, exc_value, traceback)
+ self.tab = None
+ return False
+
def __len__ (self):
+ # entries are added on requestWillBeSent and popped on loadingFinished/loadingFailed
return len (self.requests)
- def start (self):
- self.tab.Page.navigate(url=self.url)
-
- def wait (self, timeout=1):
- self.tab.wait (timeout)
-
- def waitIdle (self, idleTimeout=1, maxTimeout=60):
- step = 0
- for i in range (0, maxTimeout):
- self.wait (1)
- if len (self) == 0:
- step += 1
- if step > idleTimeout:
- break
- else:
- step = 0
-
- def stop (self):
- """
- Stop loading site
-
- XXX: stop executing scripts
- """
-
+ async def __aiter__ (self):
+ """ Retrieve network items """
+ # NOTE(review): indentation is lost in this rendering of the patch, so
+ # the nesting of the loop body below cannot be read off the page;
+ # comments here only describe what the individual statements do.
tab = self.tab
+ assert tab is not None
+ # event -> coroutine handling it; events not listed here are ignored
+ handler = {
+ tab.Network.requestWillBeSent: self._requestWillBeSent,
+ tab.Network.responseReceived: self._responseReceived,
+ tab.Network.loadingFinished: self._loadingFinished,
+ tab.Network.loadingFailed: self._loadingFailed,
+ tab.Log.entryAdded: self._entryAdded,
+ tab.Page.javascriptDialogOpening: self._javascriptDialogOpening,
+ tab.Page.frameStartedLoading: self._frameStartedLoading,
+ tab.Page.frameStoppedLoading: self._frameStoppedLoading,
+ tab.Page.frameNavigated: self._frameNavigated,
+ }
+
+ # The implementation is a little advanced. Why? The goal here is to
+ # process events from the tab as quickly as possible (i.e.
+ # asynchronously). We need to make sure that JavaScript dialogs are
+ # handled immediately for instance. Otherwise they stall every
+ # other request. Also, we don’t want to use an unbounded queue,
+ # since the items yielded can get quite big (response body). Thus
+ # we need to block (yield) for every item completed, but not
+ # handled by the consumer (caller).
+ # running holds the task waiting for the next tab event plus all
+ # handler tasks that have not completed yet
+ running = self._iterRunning
+ tabGetTask = asyncio.ensure_future (self.tab.get ())
+ running.append (tabGetTask)
+ while True:
+ done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED)
+ for t in done:
+ # result() re-raises whatever exception the task ended with
+ result = t.result ()
+ if result is None:
+ # nothing to dispatch or yield for this task
+ pass
+ elif t == tabGetTask:
+ # a new tab event: start its handler (if any) as a task and
+ # schedule another tab.get()
+ method, data = result
+ f = handler.get (method, None)
+ if f is not None:
+ task = asyncio.ensure_future (f (**data))
+ pending.add (task)
+ tabGetTask = asyncio.ensure_future (self.tab.get ())
+ pending.add (tabGetTask)
+ else:
+ # a handler returned something: hand it to the consumer
+ yield result
- tab.Page.stopLoading ()
- tab.Network.disable ()
- tab.Page.disable ()
- tab.Log.disable ()
- # XXX: we can’t drain the event queue directly, so insert (yet another) wait
- tab.wait (1)
- tab.Network.requestWillBeSent = None
- tab.Network.responseReceived = None
- tab.Network.loadingFinished = None
- tab.Network.loadingFailed = None
- tab.Page.loadEventFired = None
- tab.Page.javascriptDialogOpening = None
- tab.Log.entryAdded = None
-
- def __exit__ (self, exc_type, exc_value, traceback):
- self.tab.stop ()
- self.browser.close_tab(self.tab)
- return False
-
- # overrideable callbacks
- def loadingFinished (self, item, redirect=False):
- pass
+ # keep the still-pending tasks on self so __aexit__ can cancel them
+ running = pending
+ self._iterRunning = running
- def loadingFailed (self, item):
- pass
+ async def navigate (self, url):
+ """
+ Navigate the tab to url and remember the returned frameId as root
+ frame. Raises NavigateError if the result contains an errorText.
+ """
+ ret = await self.tab.Page.navigate(url=url)
+ self.logger.debug ('navigate',
+ uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret)
+ if 'errorText' in ret:
+ raise NavigateError (ret['errorText'])
+ self._rootFrame = ret['frameId']
# internal chrome callbacks
- def _requestWillBeSent (self, **kwargs):
+ async def _requestWillBeSent (self, **kwargs):
+ """
+ Handle Network.requestWillBeSent: record a new RequestResponsePair
+ under its request id. Requests with a scheme outside allowedSchemes
+ are ignored. ret is only ever set to the previously recorded item in
+ the redirect branch; otherwise it stays None.
+ """
+ self.logger.debug ('requestWillBeSent',
+ uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs)
+
reqId = kwargs['requestId']
req = kwargs['request']
+ url = URL (req['url'])
+ logger = self.logger.bind (reqId=reqId, reqUrl=url)
- url = urlsplit (req['url'])
if url.scheme not in self.allowedSchemes:
return
+ ret = None
item = self.requests.get (reqId)
if item:
# redirects never “finish” loading, but yield another requestWillBeSent with this key set
redirectResp = kwargs.get ('redirectResponse')
if redirectResp:
- # create fake responses
- resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']}
- item.setResponse (resp)
- resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
- item.setFinished (resp)
- self.loadingFinished (item, redirect=True)
- self.logger.info ('redirected request {} has url {}'.format (reqId, req['url']))
+ if item.url != url:
+ # this happens for unknown reasons. the docs simply state
+ # it can differ in case of a redirect. Fix it and move on.
+ logger.warning ('redirect url differs',
+ uuid='558a7df7-2258-4fe4-b16d-22b6019cc163',
+ expected=item.url)
+ redirectResp['url'] = str (item.url)
+ # no timestamp is passed, so fromResponse falls back to the request timestamp
+ item.fromResponse (redirectResp)
+ logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
+ # XXX: queue this? no need to wait for it
+ await item.prefetchRequestBody (self.tab)
+ # cannot fetch response body due to race condition (item id reused)
+ ret = item
else:
- self.logger.warning ('request {} already exists, overwriting.'.format (reqId))
+ logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
- item = Item (self.tab)
- item.setRequest (kwargs)
+ # a fresh pair for this event replaces whatever was stored under reqId
+ item = RequestResponsePair ()
+ item.fromRequestWillBeSent (kwargs)
self.requests[reqId] = item
- def _responseReceived (self, **kwargs):
+ return ret
+
+ async def _responseReceived (self, **kwargs):
+ """
+ Handle Network.responseReceived: attach the response to the recorded
+ item. Events for unknown request ids are ignored. There is no return
+ statement with a value, so nothing is yielded to the consumer.
+ """
+ self.logger.debug ('responseReceived',
+ uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs)
+
reqId = kwargs['requestId']
item = self.requests.get (reqId)
if item is None:
return
resp = kwargs['response']
- url = urlsplit (resp['url'])
+ url = URL (resp['url'])
+ logger = self.logger.bind (reqId=reqId, respUrl=url)
+ # a differing url is only logged; processing continues regardless
+ if item.url != url:
+ logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
- self.logger.info ('response {} {}'.format (reqId, resp['url']))
- item.setResponse (kwargs)
+ item.fromResponseReceived (kwargs)
else:
- self.logger.warning ('response: ignoring scheme {}'.format (url.scheme))
+ logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
- def _loadingFinished (self, **kwargs):
+ async def _loadingFinished (self, **kwargs):
"""
Item was fully loaded. For some items the request body is not available
when responseReceived is fired, thus move everything here.
"""
+ self.logger.debug ('loadingFinished',
+ uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs)
+
reqId = kwargs['requestId']
+ # the item is removed from self.requests here, whatever happens next
item = self.requests.pop (reqId, None)
if item is None:
# we never recorded this request (blacklisted scheme, for example)
return
+ if not item.response:
+ # chrome failed to send us a responseReceived event for this item,
+ # so we can’t record it (missing request/response headers)
+ self.logger.error ('response missing',
+ uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id)
+ return
+
+ # NOTE(review): req is not used by any added line below
req = item.request
- resp = item.response
- assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])
- url = urlsplit (resp['url'])
- if url.scheme in self.allowedSchemes:
- self.logger.info ('finished {} {}'.format (reqId, req['url']))
- item.setFinished (kwargs)
- self.loadingFinished (item)
+ if item.url.scheme in self.allowedSchemes:
+ item.fromLoadingFinished (kwargs)
+ # XXX queue both
+ # both bodies are fetched concurrently before the item is returned
+ await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
+ return item
+
+ async def _loadingFailed (self, **kwargs):
+ """
+ Handle Network.loadingFailed: remove the item from self.requests,
+ clear its response and return it. Unknown request ids produce nothing.
+ """
+ self.logger.info ('loadingFailed',
+ uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs)
- def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
+ # NOTE(review): the bound logger is not used in this method
+ logger = self.logger.bind (reqId=reqId)
item = self.requests.pop (reqId, None)
- self.loadingFailed (item)
+ if item is not None:
+ item.fromLoadingFailed (kwargs)
+ return item
- def _entryAdded (self, **kwargs):
+ async def _entryAdded (self, **kwargs):
""" Log entry added """
entry = kwargs['entry']
- level = {'verbose': logging.DEBUG, 'info': logging.INFO,
- 'warning': logging.WARNING,
- 'error': logging.ERROR}[entry['level']]
- self.logger.log (level, 'console: {}: {}'.format (entry['source'], entry['text']), extra={'raw': entry})
+ # 'level' is popped from the entry dict (which is modified in place) and
+ # mapped to a logger Level; unknown levels fall back to Level.INFO
+ level = {'verbose': Level.DEBUG, 'info': Level.INFO,
+ 'warning': Level.WARNING,
+ 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO)
+ entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2'
+ # the remaining entry fields are passed on as keyword arguments
+ self.logger (level, 'console', **entry)
- def _javascriptDialogOpening (self, **kwargs):
+ async def _javascriptDialogOpening (self, **kwargs):
t = kwargs.get ('type')
if t in {'alert', 'confirm', 'prompt'}:
- self.logger.info ('javascript opened a dialog: {}, {}, canceling'.format (t, kwargs.get ('message')))
- self.tab.Page.handleJavaScriptDialog (accept=False)
+ self.logger.info ('js dialog',
+ uuid='d6f07ce2-648e-493b-a1df-f353bed27c84',
+ action='cancel', type=t, message=kwargs.get ('message'))
+ await self.tab.Page.handleJavaScriptDialog (accept=False)
elif t == 'beforeunload':
# we must accept this one, otherwise the page will not unload/close
- self.logger.info ('javascript opened a dialog: {}, {}, procceeding'.format (t, kwargs.get ('message')))
- self.tab.Page.handleJavaScriptDialog (accept=True)
- else:
- self.logger.warning ('unknown javascript dialog type {}'.format (t))
-
-class AccountingSiteLoader (SiteLoader):
- """
- SiteLoader that keeps basic statistics about retrieved pages.
- """
-
- def __init__ (self, browser, url, logger=logging.getLogger(__name__)):
- super ().__init__ (browser, url, logger)
-
- self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
-
- def loadingFinished (self, item, redirect=False):
- super ().loadingFinished (item, redirect)
-
- self.stats['finished'] += 1
- self.stats['bytesRcv'] += item.encodedDataLength
-
- def loadingFailed (self, item):
- super ().loadingFailed (item)
-
- self.stats['failed'] += 1
-
- def _requestWillBeSent (self, **kwargs):
- super ()._requestWillBeSent (**kwargs)
-
- self.stats['requests'] += 1
-
-import subprocess
-from tempfile import mkdtemp
-import socket, shutil
-
-class ChromeService:
- """
- Start Chrome with socket activation (i.e. pass listening socket). Polling
- is not required with this method, since reads will block until Chrome is
- ready.
- """
-
- def __init__ (self, binary='google-chrome-stable', host='localhost', port=9222, windowSize=(1920, 1080)):
- self.binary = binary
- self.host = host
- self.port = port
- self.windowSize = windowSize
- self.p = None
-
- def __enter__ (self):
- assert self.p is None
-
- port = self.port
- while True:
- s = socket.socket ()
- s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
- try:
- s.bind ((self.host, port))
- break
- except OSError:
- # try different port
- if port < 65000:
- port += 1
- else:
- raise
- s.listen (10)
- self.userDataDir = mkdtemp ()
- args = [self.binary,
- '--window-size={},{}'.format (*self.windowSize),
- '--user-data-dir={}'.format (self.userDataDir), # use temporory user dir
- '--no-default-browser-check',
- '--no-first-run', # don’t show first run screen
- '--disable-breakpad', # no error reports
- '--disable-extensions',
- '--disable-infobars',
- '--disable-notifications', # no libnotify
- '--headless',
- '--disable-gpu',
- '--hide-scrollbars', # hide scrollbars on screenshots
- '--mute-audio', # don’t play any audio
- '--remote-debugging-socket-fd={}'.format (s.fileno ()),
- '--homepage=about:blank',
- 'about:blank']
- # start new session, so ^C does not affect subprocess
- self.p = subprocess.Popen (args, pass_fds=[s.fileno()], start_new_session=True,
- stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL,
- stderr=subprocess.DEVNULL)
- s.close ()
-
- return 'http://{}:{}'.format (self.host, port)
-
- def __exit__ (self, *exc):
- self.p.terminate ()
- self.p.wait ()
- shutil.rmtree (self.userDataDir)
- self.p = None
-
-class NullService:
- def __init__ (self, url):
- self.url = url
-
- def __enter__ (self):
- return self.url
-
- def __exit__ (self, *exc):
- pass
-
-### tests ###
-
-import unittest, time
-from http.server import BaseHTTPRequestHandler
-
-class TestHTTPRequestHandler (BaseHTTPRequestHandler):
- encodingTestString = {
- 'latin1': 'äöü',
- 'utf-8': 'äöü',
- 'ISO-8859-1': 'äöü',
- }
- binaryTestData = b'\x00\x01\x02'
- # 1×1 pixel PNG
- imageTestData = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'
- htmlTestData = '<html><body><img src="/image"><img src="/nonexistent"></body></html>'
- alertData = '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><img src="/image"></body></html>'
-
- def do_GET(self):
- path = self.path
- if path.startswith ('/redirect/301'):
- self.send_response(301)
- self.send_header ('Location', path[13:])
- self.end_headers()
- elif path == '/empty':
- self.send_response (200)
- self.end_headers ()
- elif path.startswith ('/encoding'):
- # send text data with different encodings
- _, _, encoding = path.split ('/', 3)
- self.send_response (200)
- self.send_header ('Content-Type', 'text/plain; charset={}'.format (encoding))
- self.end_headers ()
- self.wfile.write (self.encodingTestString[encoding].encode (encoding))
- elif path == '/binary':
- # send binary data
- self.send_response (200)
- self.send_header ('Content-Type', 'application/octet-stream')
- self.send_header ('Content-Length', len (self.binaryTestData))
- self.end_headers ()
- self.wfile.write (self.binaryTestData)
- elif path == '/image':
- # send binary data
- self.send_response (200)
- self.send_header ('Content-Type', 'image/png')
- self.end_headers ()
- self.wfile.write (self.imageTestData)
- elif path == '/attachment':
- self.send_response (200)
- self.send_header ('Content-Type', 'text/plain; charset=utf-8')
- self.send_header ('Content-Disposition', 'attachment; filename="attachment.txt"')
- self.end_headers ()
- self.wfile.write (self.encodingTestString['utf-8'].encode ('utf-8'))
- elif path == '/html':
- self.send_response (200)
- self.send_header ('Content-Type', 'text/html; charset=utf-8')
- self.end_headers ()
- self.wfile.write (self.htmlTestData.encode ('utf-8'))
- elif path == '/alert':
- self.send_response (200)
- self.send_header ('Content-Type', 'text/html; charset=utf-8')
- self.end_headers ()
- self.wfile.write (self.alertData.encode ('utf-8'))
- else:
- self.send_response (404)
- self.end_headers ()
-
- def log_message (self, format, *args):
- pass
-
-def startServer ():
- import http.server
- PORT = 8000
- httpd = http.server.HTTPServer (("localhost", PORT), TestHTTPRequestHandler)
- httpd.serve_forever()
-
-class TestSiteLoaderAdapter (SiteLoader):
- def __init__ (self, browser, url):
- SiteLoader.__init__ (self, browser, url)
- self.finished = []
-
- def loadingFinished (self, item, redirect=False):
- self.finished.append (item)
-
-class TestSiteLoader (unittest.TestCase):
- def setUp (self):
- from multiprocessing import Process
- self.server = Process (target=startServer)
- self.server.start ()
- self.baseurl = 'http://localhost:8000/'
- self.service = ChromeService ()
- browserUrl = self.service.__enter__ ()
- self.browser = pychrome.Browser(url=browserUrl)
-
- def buildAdapter (self, path):
- return TestSiteLoaderAdapter (self.browser, '{}{}'.format (self.baseurl, path))
-
- def assertUrls (self, l, expect):
- urls = set (map (lambda x: x.parsedUrl.path, l.finished))
- expect = set (expect)
- self.assertEqual (urls, expect)
-
- def test_wait (self):
- waittime = 2
- with self.buildAdapter ('empty') as l:
- l.start ()
- before = time.time ()
- l.wait (waittime)
- after = time.time ()
- self.assertTrue ((after-before) >= waittime)
-
- def test_empty (self):
- with self.buildAdapter ('empty') as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 1)
-
- def test_redirect301 (self):
- with self.buildAdapter ('redirect/301/empty') as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 2)
- self.assertUrls (l, ['/redirect/301/empty', '/empty'])
- for item in l.finished:
- if item.parsedUrl.path == '/empty':
- self.assertEqual (item.response['status'], 200)
- self.assertEqual (item.body[0], b'')
- elif item.parsedUrl.path == '/redirect/301/empty':
- self.assertEqual (item.response['status'], 301)
- else:
- self.fail ('unknown url')
-
- def test_redirect301multi (self):
- with self.buildAdapter ('redirect/301/redirect/301/empty') as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 3)
- self.assertUrls (l, ['/redirect/301/redirect/301/empty', '/redirect/301/empty', '/empty'])
- for item in l.finished:
- if item.parsedUrl.path == '/empty':
- self.assertEqual (item.response['status'], 200)
- self.assertEqual (item.body[0], b'')
- elif item.parsedUrl.path in {'/redirect/301/empty', \
- '/redirect/301/redirect/301/empty'}:
- self.assertEqual (item.response['status'], 301)
- else:
- self.fail ('unknown url')
-
- def test_encoding (self):
- """ Text responses are transformed to UTF-8. Make sure this works
- correctly. """
- for encoding, expected in TestHTTPRequestHandler.encodingTestString.items ():
- with self.buildAdapter ('encoding/{}'.format (encoding)) as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 1)
- self.assertUrls (l, ['/encoding/{}'.format (encoding)])
- self.assertEqual (l.finished[0].body[0], expected.encode ('utf8'))
-
- def test_binary (self):
- """ Browser should ignore content it cannot display (i.e. octet-stream) """
- with self.buildAdapter ('binary') as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 0)
-
- def test_image (self):
- """ Images should be displayed inline """
- with self.buildAdapter ('image') as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 1)
- self.assertUrls (l, ['/image'])
- self.assertEqual (l.finished[0].body[0], TestHTTPRequestHandler.imageTestData)
-
- def test_attachment (self):
- """ And downloads won’t work in headless mode """
- with self.buildAdapter ('attachment') as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 0)
-
- def test_html (self):
- with self.buildAdapter ('html') as l:
- l.start ()
- l.waitIdle ()
- self.assertEqual (len (l.finished), 3)
- self.assertUrls (l, ['/html', '/image', '/nonexistent'])
- for item in l.finished:
- if item.parsedUrl.path == '/html':
- self.assertEqual (item.response['status'], 200)
- self.assertEqual (item.body[0], TestHTTPRequestHandler.htmlTestData.encode ('utf-8'))
- elif item.parsedUrl.path == '/image':
- self.assertEqual (item.response['status'], 200)
- self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData)
- elif item.parsedUrl.path == '/nonexistent':
- self.assertEqual (item.response['status'], 404)
- else:
- self.fail ('unknown url')
-
- def test_alert (self):
- with self.buildAdapter ('alert') as l:
- l.start ()
- l.waitIdle ()
- self.assertUrls (l, ['/alert', '/image'])
- for item in l.finished:
- if item.parsedUrl.path == '/alert':
- self.assertEqual (item.response['status'], 200)
- self.assertEqual (item.body[0], TestHTTPRequestHandler.alertData.encode ('utf-8'))
- elif item.parsedUrl.path == '/image':
- self.assertEqual (item.response['status'], 200)
- self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData)
- else:
- self.fail ('unknown url')
-
- def tearDown (self):
- self.service.__exit__ (None, None, None)
- self.server.terminate ()
- self.server.join ()
-
-if __name__ == '__main__':
- import sys
- if sys.argv[1] == 'server':
- startServer ()
+ self.logger.info ('js dialog',
+ uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd',
+ action='proceed', type=t, message=kwargs.get ('message'))
+ await self.tab.Page.handleJavaScriptDialog (accept=True)
+ else: # pragma: no cover
+ self.logger.warning ('js dialog unknown',
+ uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs)
+
+ async def _frameStartedLoading (self, **kwargs):
+ """ Remember the frame as loading and report the page as not idle """
+ self.logger.debug ('frameStartedLoading',
+ uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs)
+
+ self._framesLoading.add (kwargs['frameId'])
+ return PageIdle (False)
+
+ async def _frameStoppedLoading (self, **kwargs):
+ """
+ Forget the frame and report the page as idle once no frame is left
+ loading. Produces nothing while other frames are still loading.
+ """
+ self.logger.debug ('frameStoppedLoading',
+ uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs)
+
+ # discard instead of remove: a stop event for a frame whose start event
+ # was never recorded must not raise KeyError, which would otherwise
+ # surface through t.result() in __aiter__ and end the iteration
+ self._framesLoading.discard (kwargs['frameId'])
+ if not self._framesLoading:
+ return PageIdle (True)
+
+ async def _frameNavigated (self, **kwargs):
+ """ Report navigation of the root frame as a FrameNavigated event """
+ self.logger.debug ('frameNavigated',
+ uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs)
+ frame = kwargs['frame']
+ # _rootFrame is the frame id stored by navigate()
+ # NOTE(review): indentation is lost in this rendering; the return is
+ # presumably part of the if branch, i.e. other frames yield nothing
+ if self._rootFrame == frame['id']:
+ assert frame.get ('parentId', None) is None, "root frame must not have a parent"
+ return FrameNavigated (frame['id'], frame['url'], frame['mimeType'])