summary | refs | log | tree | commit | diff
path: root/crocoite/behavior.py
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2018-06-25 19:55:48 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2018-06-25 19:55:48 +0200
commit: 785ef19736cc9a21746e00a022b76fd756c162de (patch)
tree: 041a8696c852294fe9573485831398933e26ee13 /crocoite/behavior.py
parent: 344a6b449075a8fb42054801144c40760f791366 (diff)
download: crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip
warc: Save DOM-/image screenshot as WARC conversion
Judging from the docs this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s wayback machine.
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r-- crocoite/behavior.py 17
1 files changed, 12 insertions, 5 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index ab859f8..b34d3d9 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -32,7 +32,7 @@ from collections import OrderedDict
from html5lib.serializer import HTMLSerializer
from pychrome.exceptions import TimeoutException
-from .util import randomString, getFormattedViewportMetrics
+from .util import randomString, getFormattedViewportMetrics, removeFragment
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
@@ -229,12 +229,13 @@ class DomSnapshot (Behavior):
disallowedAttributes = html.eventAttributes
stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
- yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport)
+ yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
class ScreenshotEvent:
- __slots__ = ('yoff', 'data')
+ __slots__ = ('yoff', 'data', 'url')
- def __init__ (self, yoff, data):
+ def __init__ (self, url, yoff, data):
+ self.url = url
self.yoff = yoff
self.data = data
@@ -248,6 +249,12 @@ class Screenshot (Behavior):
def onfinish (self):
tab = self.loader.tab
+ try:
+ url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url'])
+ except KeyError:
+ logger.error ('frame has no url')
+ url = None
+
# see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
# Hardcoded max texture size of 16,384 (crbug.com/770769)
maxDim = 16*1024
@@ -260,7 +267,7 @@ class Screenshot (Behavior):
height = min (contentSize['height'] - yoff, maxDim)
clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data'])
- yield ScreenshotEvent (yoff, data)
+ yield ScreenshotEvent (url, yoff, data)
class Click (JsOnload):
""" Generic link clicking """