summary | refs | log | tree | commit | diff
path: root/crocoite/behavior.py
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2018-07-28 20:25:49 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2018-08-04 14:11:31 +0200
commit: 3deded13df1339ef59a760c188804adffd9ed902 (patch)
tree: 5eaf69ee38389073e7323585c6afdbbf5eeab487 /crocoite/behavior.py
parent: 33a137f2d7c04468038d689b53a70fb534297f55 (diff)
download: crocoite-3deded13df1339ef59a760c188804adffd9ed902.tar.gz
crocoite-3deded13df1339ef59a760c188804adffd9ed902.tar.bz2
crocoite-3deded13df1339ef59a760c188804adffd9ed902.zip
Reintroduce WARC logging
Commit 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 removed logging to WARC files. Add it again, but with a different implementation. Credits to structlog for inspiration.
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r-- crocoite/behavior.py 34
1 files changed, 17 insertions, 17 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index b34d3d9..8c24c59 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -22,7 +22,7 @@
Generic and per-site behavior scripts
"""
-import logging, time
+import time
from urllib.parse import urlsplit
import os.path
import pkg_resources
@@ -36,8 +36,6 @@ from .util import randomString, getFormattedViewportMetrics, removeFragment
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-logger = logging.getLogger(__name__)
-
class Script:
""" A JavaScript resource """
@@ -61,14 +59,15 @@ class Script:
return s
class Behavior:
- __slots__ = ('loader')
+ __slots__ = ('loader', 'logger')
# unique behavior name
name = None
- def __init__ (self, loader):
+ def __init__ (self, loader, logger):
assert self.name is not None
self.loader = loader
+ self.logger = logger.bind (context=type (self).__name__)
def __contains__ (self, url):
"""
@@ -108,8 +107,8 @@ class JsOnload (Behavior):
scriptPath = None
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
self.script = Script (self.scriptPath)
self.scriptHandle = None
@@ -129,8 +128,8 @@ class Scroll (JsOnload):
name = 'scroll'
scriptPath = 'scroll.js'
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
stopVarname = '__' + __package__ + '_stop__'
newStopVarname = randomString ()
self.script.data = self.script.data.replace (stopVarname, newStopVarname)
@@ -198,8 +197,8 @@ class DomSnapshot (Behavior):
name = 'domSnapshot'
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
self.script = Script ('canvas-snapshot.js')
def onfinish (self):
@@ -216,11 +215,11 @@ class DomSnapshot (Behavior):
if rawUrl in haveUrls:
# ignore duplicate URLs. they are usually caused by
# javascript-injected iframes (advertising) with no(?) src
- logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+ self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
continue
url = urlsplit (rawUrl)
if url.scheme in ('http', 'https'):
- logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+ self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
haveUrls.add (rawUrl)
walker = ChromeTreeWalker (doc)
# remove script, to make the page static and noscript, because at the
@@ -249,10 +248,11 @@ class Screenshot (Behavior):
def onfinish (self):
tab = self.loader.tab
+ tree = tab.Page.getFrameTree ()
try:
- url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url'])
+ url = removeFragment (tree['frameTree']['frame']['url'])
except KeyError:
- logger.error ('frame has no url')
+ self.logger.error ('frame without url', tree=tree)
url = None
# see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
@@ -293,8 +293,8 @@ class ExtractLinks (Behavior):
name = 'extractLinks'
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
self.script = Script ('extract-links.js')
def onfinish (self):