From 3a2fcc69a8eb4237b2862b3e291971d38748f115 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 7 Jan 2019 10:41:45 +0100 Subject: Log Chrome’s responses to WARC by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We may not be able to reproduce every failure, so logging as much as possible is important to figure out what went wrong. Also, in case a bug is uncovered in the future, we can check the logs and possibly fix it with -errata. --- crocoite/behavior.py | 9 +++++++-- crocoite/browser.py | 30 +++++++++++++++++++++--------- crocoite/cli.py | 3 ++- crocoite/logger.py | 4 ++-- crocoite/warc.py | 5 ----- 5 files changed, 32 insertions(+), 19 deletions(-) (limited to 'crocoite') diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 2c94bb8..3ecc92f 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -131,6 +131,8 @@ class JsOnload (Behavior): # parameter. # XXX: is there a better way to do this? result = await tab.Runtime.evaluate (expression=str (self.script)) + self.logger.debug ('behavior onload inject', + uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result) exception = result.get ('exceptionDetails', None) result = result['result'] assert result['type'] == 'function', result @@ -143,6 +145,8 @@ class JsOnload (Behavior): functionDeclaration='function(options){return new this(options);}', objectId=constructor, arguments=[{'value': self.options}]) + self.logger.debug ('behavior onload start', + uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result) result = result['result'] assert result['type'] == 'object', result assert result.get ('subtype') != 'error', result @@ -160,8 +164,7 @@ class JsOnload (Behavior): self.logger.error ('jsonload onstop failed', uuid='1786726f-c8ec-4f79-8769-30954d4e32f5', exception=e.args, - objectId=self.context, - context=self.__class__.__name__) + objectId=self.context) return yield # pragma: no cover @@ -238,6 +241,8 @@ class DomSnapshot (Behavior): viewport = await getFormattedViewportMetrics (tab) dom = await tab.DOM.getDocument (depth=-1, pierce=True) + self.logger.debug ('dom snapshot document', + uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom) haveUrls = set () for doc in ChromeTreeWalker (dom['root']).split (): url = URL (doc['documentURL']) diff --git a/crocoite/browser.py b/crocoite/browser.py index 902a068..6ad305c 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -361,10 +361,15 @@ class SiteLoader: self._iterRunning = running async def navigate (self, url): - await self.tab.Page.navigate(url=url) + ret = await self.tab.Page.navigate(url=url) + self.logger.debug ('navigate', + uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) # internal chrome callbacks async def _requestWillBeSent (self, **kwargs): + self.logger.debug ('requestWillBeSent', + uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs) + reqId = kwargs['requestId'] req = kwargs['request'] url = URL (req['url']) @@ -398,11 +403,13 @@ class SiteLoader: item = RequestResponsePair () item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item - logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') return ret async def _responseReceived (self, **kwargs): + self.logger.debug ('responseReceived', + uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.get (reqId) if item is None: @@ -414,7 +421,6 @@ class SiteLoader: if item.url != url: logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: - logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') item.fromResponseReceived (kwargs) else: logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') @@ -424,27 +430,27 @@ class SiteLoader: Item was fully loaded. For some items the request body is not available when responseReceived is fired, thus move everything here. """ + self.logger.debug ('loadingFinished', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.pop (reqId, None) if item is None: # we never recorded this request (blacklisted scheme, for example) return req = item.request - logger = self.logger.bind (reqId=reqId, reqUrl=item.url) if item.url.scheme in self.allowedSchemes: - logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') item.fromLoadingFinished (kwargs) # XXX queue both await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) return item async def _loadingFailed (self, **kwargs): + self.logger.info ('loadingFailed', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] logger = self.logger.bind (reqId=reqId) - logger.warning ('loading failed', - uuid='68410f13-6eea-453e-924e-c1af4601748b', - errorText=kwargs['errorText'], - blockedReason=kwargs.get ('blockedReason')) item = self.requests.pop (reqId, None) if item is not None: item.fromLoadingFailed (kwargs) @@ -477,10 +483,16 @@ class SiteLoader: uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) async def _frameStartedLoading (self, **kwargs): + self.logger.debug ('frameStartedLoading', + uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs) + self._framesLoading.add (kwargs['frameId']) self.idle.set (False) async def _frameStoppedLoading (self, **kwargs): + self.logger.debug ('frameStoppedLoading', + uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs) + self._framesLoading.remove (kwargs['frameId']) if not self._framesLoading: self.idle.set (True) diff --git a/crocoite/cli.py b/crocoite/cli.py index f9ef52c..a8de73b 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -32,7 +32,8 @@ from .controller import SinglePageController, \ RecursiveController, DepthLimit, PrefixLimit from .devtools import Passthrough, Process from .warc import WarcHandler -from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \ + WarcHandlerConsumer, Level from .devtools import Crashed class SingleExitStatus(IntEnum): diff --git a/crocoite/logger.py b/crocoite/logger.py index d882eaf..ac389ca 100644 --- a/crocoite/logger.py +++ b/crocoite/logger.py @@ -105,7 +105,7 @@ class PrintConsumer (Consumer): return kwargs class JsonPrintConsumer (Consumer): - def __init__ (self, minLevel=Level.INFO): + def __init__ (self, minLevel=Level.DEBUG): self.minLevel = minLevel def __call__ (self, **kwargs): @@ -121,7 +121,7 @@ class DatetimeConsumer (Consumer): return kwargs class WarcHandlerConsumer (Consumer): - def __init__ (self, warc, minLevel=Level.INFO): + def __init__ (self, warc, minLevel=Level.DEBUG): self.warc = warc self.minLevel = minLevel diff --git a/crocoite/warc.py b/crocoite/warc.py index cb1f2f7..4106995 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -242,13 +242,8 @@ class WarcHandler (EventHandler): } def push (self, item): - processed = False for k, v in self.route.items (): if isinstance (item, k): v (self, item) - processed = True break - if not processed: - self.logger.debug (f'unknown event {item!r}') - -- cgit v1.2.3