diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-01-07 10:41:45 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-01-07 10:41:45 +0100 |
commit | 3a2fcc69a8eb4237b2862b3e291971d38748f115 (patch) | |
tree | f7918781bc17ee4abea810271e1ee50a4b2b3b80 /crocoite | |
parent | 73c0647fd35a6ca07bf53a987d6be7ef1055cf43 (diff) | |
download | crocoite-3a2fcc69a8eb4237b2862b3e291971d38748f115.tar.gz crocoite-3a2fcc69a8eb4237b2862b3e291971d38748f115.tar.bz2 crocoite-3a2fcc69a8eb4237b2862b3e291971d38748f115.zip |
Log Chrome’s responses to WARC by default
We may not be able to reproduce every failure, so logging as much as
possible is important to figure out what went wrong. Also, in case a bug
is uncovered in the future, we can check the logs and possibly fix it
with -errata.
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/behavior.py | 9 | ||||
-rw-r--r-- | crocoite/browser.py | 30 | ||||
-rw-r--r-- | crocoite/cli.py | 3 | ||||
-rw-r--r-- | crocoite/logger.py | 4 | ||||
-rw-r--r-- | crocoite/warc.py | 5 |
5 files changed, 32 insertions, 19 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 2c94bb8..3ecc92f 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -131,6 +131,8 @@ class JsOnload (Behavior): # parameter. # XXX: is there a better way to do this? result = await tab.Runtime.evaluate (expression=str (self.script)) + self.logger.debug ('behavior onload inject', + uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result) exception = result.get ('exceptionDetails', None) result = result['result'] assert result['type'] == 'function', result @@ -143,6 +145,8 @@ class JsOnload (Behavior): functionDeclaration='function(options){return new this(options);}', objectId=constructor, arguments=[{'value': self.options}]) + self.logger.debug ('behavior onload start', + uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result) result = result['result'] assert result['type'] == 'object', result assert result.get ('subtype') != 'error', result @@ -160,8 +164,7 @@ class JsOnload (Behavior): self.logger.error ('jsonload onstop failed', uuid='1786726f-c8ec-4f79-8769-30954d4e32f5', exception=e.args, - objectId=self.context, - context=self.__class__.__name__) + objectId=self.context) return yield # pragma: no cover @@ -238,6 +241,8 @@ class DomSnapshot (Behavior): viewport = await getFormattedViewportMetrics (tab) dom = await tab.DOM.getDocument (depth=-1, pierce=True) + self.logger.debug ('dom snapshot document', + uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom) haveUrls = set () for doc in ChromeTreeWalker (dom['root']).split (): url = URL (doc['documentURL']) diff --git a/crocoite/browser.py b/crocoite/browser.py index 902a068..6ad305c 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -361,10 +361,15 @@ class SiteLoader: self._iterRunning = running async def navigate (self, url): - await self.tab.Page.navigate(url=url) + ret = await self.tab.Page.navigate(url=url) + self.logger.debug ('navigate', + uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) # internal chrome callbacks async def _requestWillBeSent (self, **kwargs): + self.logger.debug ('requestWillBeSent', + uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs) + reqId = kwargs['requestId'] req = kwargs['request'] url = URL (req['url']) @@ -398,11 +403,13 @@ class SiteLoader: item = RequestResponsePair () item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item - logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') return ret async def _responseReceived (self, **kwargs): + self.logger.debug ('responseReceived', + uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.get (reqId) if item is None: @@ -414,7 +421,6 @@ class SiteLoader: if item.url != url: logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: - logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') item.fromResponseReceived (kwargs) else: logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') @@ -424,27 +430,27 @@ class SiteLoader: Item was fully loaded. For some items the request body is not available when responseReceived is fired, thus move everything here. """ + self.logger.debug ('loadingFinished', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.pop (reqId, None) if item is None: # we never recorded this request (blacklisted scheme, for example) return req = item.request - logger = self.logger.bind (reqId=reqId, reqUrl=item.url) if item.url.scheme in self.allowedSchemes: - logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') item.fromLoadingFinished (kwargs) # XXX queue both await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) return item async def _loadingFailed (self, **kwargs): + self.logger.info ('loadingFailed', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] logger = self.logger.bind (reqId=reqId) - logger.warning ('loading failed', - uuid='68410f13-6eea-453e-924e-c1af4601748b', - errorText=kwargs['errorText'], - blockedReason=kwargs.get ('blockedReason')) item = self.requests.pop (reqId, None) if item is not None: item.fromLoadingFailed (kwargs) @@ -477,10 +483,16 @@ class SiteLoader: uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) async def _frameStartedLoading (self, **kwargs): + self.logger.debug ('frameStartedLoading', + uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs) + self._framesLoading.add (kwargs['frameId']) self.idle.set (False) async def _frameStoppedLoading (self, **kwargs): + self.logger.debug ('frameStoppedLoading', + uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs) + self._framesLoading.remove (kwargs['frameId']) if not self._framesLoading: self.idle.set (True) diff --git a/crocoite/cli.py b/crocoite/cli.py index f9ef52c..a8de73b 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -32,7 +32,8 @@ from .controller import SinglePageController, \ RecursiveController, DepthLimit, PrefixLimit from .devtools import Passthrough, Process from .warc import WarcHandler -from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \ + WarcHandlerConsumer, Level from .devtools import Crashed class SingleExitStatus(IntEnum): diff --git a/crocoite/logger.py b/crocoite/logger.py index d882eaf..ac389ca 100644 --- a/crocoite/logger.py +++ b/crocoite/logger.py @@ -105,7 +105,7 @@ class PrintConsumer (Consumer): return kwargs class JsonPrintConsumer (Consumer): - def __init__ (self, minLevel=Level.INFO): + def __init__ (self, minLevel=Level.DEBUG): self.minLevel = minLevel def __call__ (self, **kwargs): @@ -121,7 +121,7 @@ class DatetimeConsumer (Consumer): return kwargs class WarcHandlerConsumer (Consumer): - def __init__ (self, warc, minLevel=Level.INFO): + def __init__ (self, warc, minLevel=Level.DEBUG): self.warc = warc self.minLevel = minLevel diff --git a/crocoite/warc.py b/crocoite/warc.py index cb1f2f7..4106995 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -242,13 +242,8 @@ class WarcHandler (EventHandler): } def push (self, item): - processed = False for k, v in self.route.items (): if isinstance (item, k): v (self, item) - processed = True break - if not processed: - self.logger.debug (f'unknown event {item!r}') - |