diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
commit | 785ef19736cc9a21746e00a022b76fd756c162de (patch) | |
tree | 041a8696c852294fe9573485831398933e26ee13 /crocoite/behavior.py | |
parent | 344a6b449075a8fb42054801144c40760f791366 (diff) | |
download | crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2 crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip |
warc: Save DOM-/image screenshot as WARC conversion
Judging from the docs this is the proper way to store these resources.
Enable both for the IRC bot by default, since they won’t interfere with
IA’s wayback machine.
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r-- | crocoite/behavior.py | 17 |
1 files changed, 12 insertions, 5 deletions
```diff
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index ab859f8..b34d3d9 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -32,7 +32,7 @@ from collections import OrderedDict
 from html5lib.serializer import HTMLSerializer
 from pychrome.exceptions import TimeoutException
 
-from .util import randomString, getFormattedViewportMetrics
+from .util import randomString, getFormattedViewportMetrics, removeFragment
 from . import html
 from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
 
@@ -229,12 +229,13 @@ class DomSnapshot (Behavior):
             disallowedAttributes = html.eventAttributes
             stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
             serializer = HTMLSerializer ()
-            yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport)
+            yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
 
 class ScreenshotEvent:
-    __slots__ = ('yoff', 'data')
+    __slots__ = ('yoff', 'data', 'url')
 
-    def __init__ (self, yoff, data):
+    def __init__ (self, url, yoff, data):
+        self.url = url
         self.yoff = yoff
         self.data = data
 
@@ -248,6 +249,12 @@ class Screenshot (Behavior):
     def onfinish (self):
         tab = self.loader.tab
 
+        try:
+            url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url'])
+        except KeyError:
+            logger.error ('frame has no url')
+            url = None
+
         # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
         # Hardcoded max texture size of 16,384 (crbug.com/770769)
         maxDim = 16*1024
@@ -260,7 +267,7 @@ class Screenshot (Behavior):
             height = min (contentSize['height'] - yoff, maxDim)
             clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
             data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data'])
-            yield ScreenshotEvent (yoff, data)
+            yield ScreenshotEvent (url, yoff, data)
 
 class Click (JsOnload):
     """ Generic link clicking """
```