summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2019-01-07 10:41:45 +0100
committerLars-Dominik Braun <lars@6xq.net>2019-01-07 10:41:45 +0100
commit3a2fcc69a8eb4237b2862b3e291971d38748f115 (patch)
treef7918781bc17ee4abea810271e1ee50a4b2b3b80
parent73c0647fd35a6ca07bf53a987d6be7ef1055cf43 (diff)
downloadcrocoite-3a2fcc69a8eb4237b2862b3e291971d38748f115.tar.gz
crocoite-3a2fcc69a8eb4237b2862b3e291971d38748f115.tar.bz2
crocoite-3a2fcc69a8eb4237b2862b3e291971d38748f115.zip
Log Chrome’s responses to WARC by default
We may not be able to reproduce every failure, so logging as much as possible is important to figure out what went wrong. Also, in case a bug is uncovered in the future, we can check the logs and possibly fix it with -errata.
-rw-r--r--crocoite/behavior.py9
-rw-r--r--crocoite/browser.py30
-rw-r--r--crocoite/cli.py3
-rw-r--r--crocoite/logger.py4
-rw-r--r--crocoite/warc.py5
5 files changed, 32 insertions, 19 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 2c94bb8..3ecc92f 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -131,6 +131,8 @@ class JsOnload (Behavior):
# parameter.
# XXX: is there a better way to do this?
result = await tab.Runtime.evaluate (expression=str (self.script))
+ self.logger.debug ('behavior onload inject',
+ uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result)
exception = result.get ('exceptionDetails', None)
result = result['result']
assert result['type'] == 'function', result
@@ -143,6 +145,8 @@ class JsOnload (Behavior):
functionDeclaration='function(options){return new this(options);}',
objectId=constructor,
arguments=[{'value': self.options}])
+ self.logger.debug ('behavior onload start',
+ uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result)
result = result['result']
assert result['type'] == 'object', result
assert result.get ('subtype') != 'error', result
@@ -160,8 +164,7 @@ class JsOnload (Behavior):
self.logger.error ('jsonload onstop failed',
uuid='1786726f-c8ec-4f79-8769-30954d4e32f5',
exception=e.args,
- objectId=self.context,
- context=self.__class__.__name__)
+ objectId=self.context)
return
yield # pragma: no cover
@@ -238,6 +241,8 @@ class DomSnapshot (Behavior):
viewport = await getFormattedViewportMetrics (tab)
dom = await tab.DOM.getDocument (depth=-1, pierce=True)
+ self.logger.debug ('dom snapshot document',
+ uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom)
haveUrls = set ()
for doc in ChromeTreeWalker (dom['root']).split ():
url = URL (doc['documentURL'])
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 902a068..6ad305c 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -361,10 +361,15 @@ class SiteLoader:
self._iterRunning = running
async def navigate (self, url):
- await self.tab.Page.navigate(url=url)
+ ret = await self.tab.Page.navigate(url=url)
+ self.logger.debug ('navigate',
+ uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret)
# internal chrome callbacks
async def _requestWillBeSent (self, **kwargs):
+ self.logger.debug ('requestWillBeSent',
+ uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs)
+
reqId = kwargs['requestId']
req = kwargs['request']
url = URL (req['url'])
@@ -398,11 +403,13 @@ class SiteLoader:
item = RequestResponsePair ()
item.fromRequestWillBeSent (kwargs)
self.requests[reqId] = item
- logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
return ret
async def _responseReceived (self, **kwargs):
+ self.logger.debug ('responseReceived',
+ uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs)
+
reqId = kwargs['requestId']
item = self.requests.get (reqId)
if item is None:
@@ -414,7 +421,6 @@ class SiteLoader:
if item.url != url:
logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
- logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
item.fromResponseReceived (kwargs)
else:
logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
@@ -424,27 +430,27 @@ class SiteLoader:
Item was fully loaded. For some items the request body is not available
when responseReceived is fired, thus move everything here.
"""
+ self.logger.debug ('loadingFinished',
+ uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs)
+
reqId = kwargs['requestId']
item = self.requests.pop (reqId, None)
if item is None:
# we never recorded this request (blacklisted scheme, for example)
return
req = item.request
- logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
if item.url.scheme in self.allowedSchemes:
- logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
item.fromLoadingFinished (kwargs)
# XXX queue both
await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
return item
async def _loadingFailed (self, **kwargs):
+ self.logger.info ('loadingFailed',
+ uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs)
+
reqId = kwargs['requestId']
logger = self.logger.bind (reqId=reqId)
- logger.warning ('loading failed',
- uuid='68410f13-6eea-453e-924e-c1af4601748b',
- errorText=kwargs['errorText'],
- blockedReason=kwargs.get ('blockedReason'))
item = self.requests.pop (reqId, None)
if item is not None:
item.fromLoadingFailed (kwargs)
@@ -477,10 +483,16 @@ class SiteLoader:
uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs)
async def _frameStartedLoading (self, **kwargs):
+ self.logger.debug ('frameStartedLoading',
+ uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs)
+
self._framesLoading.add (kwargs['frameId'])
self.idle.set (False)
async def _frameStoppedLoading (self, **kwargs):
+ self.logger.debug ('frameStoppedLoading',
+ uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs)
+
self._framesLoading.remove (kwargs['frameId'])
if not self._framesLoading:
self.idle.set (True)
diff --git a/crocoite/cli.py b/crocoite/cli.py
index f9ef52c..a8de73b 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -32,7 +32,8 @@ from .controller import SinglePageController, \
RecursiveController, DepthLimit, PrefixLimit
from .devtools import Passthrough, Process
from .warc import WarcHandler
-from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
+from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \
+ WarcHandlerConsumer, Level
from .devtools import Crashed
class SingleExitStatus(IntEnum):
diff --git a/crocoite/logger.py b/crocoite/logger.py
index d882eaf..ac389ca 100644
--- a/crocoite/logger.py
+++ b/crocoite/logger.py
@@ -105,7 +105,7 @@ class PrintConsumer (Consumer):
return kwargs
class JsonPrintConsumer (Consumer):
- def __init__ (self, minLevel=Level.INFO):
+ def __init__ (self, minLevel=Level.DEBUG):
self.minLevel = minLevel
def __call__ (self, **kwargs):
@@ -121,7 +121,7 @@ class DatetimeConsumer (Consumer):
return kwargs
class WarcHandlerConsumer (Consumer):
- def __init__ (self, warc, minLevel=Level.INFO):
+ def __init__ (self, warc, minLevel=Level.DEBUG):
self.warc = warc
self.minLevel = minLevel
diff --git a/crocoite/warc.py b/crocoite/warc.py
index cb1f2f7..4106995 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -242,13 +242,8 @@ class WarcHandler (EventHandler):
}
def push (self, item):
- processed = False
for k, v in self.route.items ():
if isinstance (item, k):
v (self, item)
- processed = True
break
- if not processed:
- self.logger.debug (f'unknown event {item!r}')
-