summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2018-06-25 19:55:48 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2018-06-25 19:55:48 +0200
commit: 785ef19736cc9a21746e00a022b76fd756c162de (patch)
tree: 041a8696c852294fe9573485831398933e26ee13
parent: 344a6b449075a8fb42054801144c40760f791366 (diff)
download: crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip
warc: Save DOM snapshot and image screenshot as WARC conversion records
Judging from the docs, this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s Wayback Machine.
-rw-r--r-- contrib/celerycrocoite.py | 3
-rw-r--r-- crocoite/behavior.py | 17
-rw-r--r-- crocoite/browser.py | 10
-rw-r--r-- crocoite/controller.py | 8
-rw-r--r-- crocoite/tools.py | 29
-rw-r--r-- crocoite/util.py | 6
-rw-r--r-- crocoite/warc.py | 39
7 files changed, 73 insertions, 39 deletions
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
index d0a02e9..3da43d9 100644
--- a/contrib/celerycrocoite.py
+++ b/contrib/celerycrocoite.py
@@ -192,12 +192,11 @@ def archive (bot, trigger):
if not args:
bot.reply ('Sorry, I don’t understand {}'.format (trigger.group (2)))
return
- blacklistedBehavior = {'domSnapshot', 'screenshot'}
settings = dict (maxBodySize=args.maxBodySize,
logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout,
timeout=args.timeout)
args = dict (url=args.url,
- enabledBehaviorNames=list (set (behavior.availableMap.keys())-blacklistedBehavior),
+ enabledBehaviorNames=list (behavior.availableMap.keys ()),
settings=settings, recursive=args.recursive,
concurrency=args.concurrency)
q = bot.memory['crocoite']['q']
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index ab859f8..b34d3d9 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -32,7 +32,7 @@ from collections import OrderedDict
from html5lib.serializer import HTMLSerializer
from pychrome.exceptions import TimeoutException
-from .util import randomString, getFormattedViewportMetrics
+from .util import randomString, getFormattedViewportMetrics, removeFragment
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
@@ -229,12 +229,13 @@ class DomSnapshot (Behavior):
disallowedAttributes = html.eventAttributes
stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
- yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport)
+ yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
class ScreenshotEvent:
- __slots__ = ('yoff', 'data')
+ __slots__ = ('yoff', 'data', 'url')
- def __init__ (self, yoff, data):
+ def __init__ (self, url, yoff, data):
+ self.url = url
self.yoff = yoff
self.data = data
@@ -248,6 +249,12 @@ class Screenshot (Behavior):
def onfinish (self):
tab = self.loader.tab
+ try:
+ url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url'])
+ except KeyError:
+ logger.error ('frame has no url')
+ url = None
+
# see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
# Hardcoded max texture size of 16,384 (crbug.com/770769)
maxDim = 16*1024
@@ -260,7 +267,7 @@ class Screenshot (Behavior):
height = min (contentSize['height'] - yoff, maxDim)
clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data'])
- yield ScreenshotEvent (yoff, data)
+ yield ScreenshotEvent (url, yoff, data)
class Click (JsOnload):
""" Generic link clicking """
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 1c09598..6a4bee2 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -41,9 +41,9 @@ class Item:
def __init__ (self, tab):
self.tab = tab
- self.chromeRequest = None
- self.chromeResponse = None
- self.chromeFinished = None
+ self.chromeRequest = {}
+ self.chromeResponse = {}
+ self.chromeFinished = {}
self.isRedirect = False
self.failed = False
@@ -128,6 +128,10 @@ class Item:
return text[0]
return 'No status text available'
+ @property
+ def resourceType (self):
+ return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None))
+
@staticmethod
def _unfoldHeaders (headers):
"""
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 84001b7..ef042cc 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -71,11 +71,10 @@ class StatsHandler (EventHandler):
self.stats['crashed'] += 1
import logging, time
-from urllib.parse import urlsplit, urlunsplit
from . import behavior as cbehavior
from .browser import ChromeService, SiteLoader, Item
-from .util import getFormattedViewportMetrics
+from .util import getFormattedViewportMetrics, removeFragment
class ControllerStart:
__slots__ = ('payload')
@@ -238,11 +237,6 @@ class PrefixLimit (RecursionPolicy):
def __call__ (self, urls):
return set (filter (lambda u: u.startswith (self.prefix), urls))
-def removeFragment (u):
- """ Remove fragment from url (i.e. #hashvalue) """
- s = urlsplit (u)
- return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
-
from .behavior import ExtractLinksEvent
class RecursiveController (EventHandler):
diff --git a/crocoite/tools.py b/crocoite/tools.py
index bc92f8f..3aeaaad 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -80,18 +80,21 @@ def extractScreenshot ():
args = parser.parse_args()
- screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I)
with args.input:
- for record in ArchiveIterator(args.input):
- uri = record.rec_headers.get_header('WARC-Target-URI')
- if record.rec_type == 'resource':
- m = screenshotRe.match (uri)
- xoff, yoff = m.groups ()
- if m:
- outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff)
- if args.force or not os.path.exists (outpath):
- with open (outpath, 'wb') as out:
- shutil.copyfileobj (record.raw_stream, out)
- else:
- print ('not overwriting {}'.format (outpath))
+ for record in ArchiveIterator (args.input):
+ headers = record.rec_headers
+ if record.rec_type != 'conversion' or \
+ headers['Content-Type'] != 'image/png' or \
+ 'X-Crocoite-Screenshot-Y-Offset' not in headers:
+ continue
+
+ urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
+ xoff = 0
+ yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
+ outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff)
+ if args.force or not os.path.exists (outpath):
+ with open (outpath, 'wb') as out:
+ shutil.copyfileobj (record.raw_stream, out)
+ else:
+ print ('not overwriting {}'.format (outpath))
diff --git a/crocoite/util.py b/crocoite/util.py
index ec257f1..fe43f01 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -23,6 +23,7 @@ Random utility functions
"""
import random
+from urllib.parse import urlsplit, urlunsplit
def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
if length is None:
@@ -41,3 +42,8 @@ def getFormattedViewportMetrics (tab):
return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
layoutMetrics['layoutViewport']['clientHeight'])
+def removeFragment (u):
+ """ Remove fragment from url (i.e. #hashvalue) """
+ s = urlsplit (u)
+ return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index af04cf9..e472f16 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
class WarcHandler (EventHandler):
- __slots__ = ('logger', 'writer', 'maxBodySize')
+ __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords')
def __init__ (self, fd,
logger=logging.getLogger(__name__),
@@ -46,6 +46,9 @@ class WarcHandler (EventHandler):
self.logger = logger
self.writer = WARCWriter (fd, gzip=True)
self.maxBodySize = maxBodySize
+ # maps document urls to WARC record ids, required for DomSnapshotEvent
+ # and ScreenshotEvent
+ self.documentRecords = {}
def _writeRequest (self, item):
writer = self.writer
@@ -135,6 +138,9 @@ class WarcHandler (EventHandler):
http_headers=httpHeaders)
writer.write_record(record)
+ if item.resourceType == 'Document':
+ self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID')
+
def _writeScript (self, item):
writer = self.writer
encoding = 'utf-8'
@@ -155,21 +161,36 @@ class WarcHandler (EventHandler):
except ValueError as e:
self.logger.error (e.args[0])
+ def _addRefersTo (self, headers, url):
+ refersTo = self.documentRecords.get (url)
+ if refersTo:
+ headers['WARC-Refers-To'] = refersTo
+ else:
+ self.logger.error ('No document record found for {}'.format (url))
+ return headers
+
def _writeDomSnapshot (self, item):
writer = self.writer
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (item.url, 'response',
+
+ warcHeaders = {'X-DOM-Snapshot': str (True),
+ 'X-Chrome-Viewport': item.viewport,
+ 'Content-Type': 'text/html; charset=utf-8',
+ }
+
+ self._addRefersTo (warcHeaders, item.url)
+
+ record = writer.create_warc_record (item.url, 'conversion',
payload=BytesIO (item.document),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': item.viewport})
+ warc_headers_dict=warcHeaders)
writer.write_record (record)
def _writeScreenshot (self, item):
writer = self.writer
- url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff))
- record = writer.create_warc_record (url, 'resource',
- payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'})
+ warcHeaders = {'Content-Type': 'image/png',
+ 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)}
+ self._addRefersTo (warcHeaders, item.url)
+ record = writer.create_warc_record (item.url, 'conversion',
+ payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
writer.write_record (record)
def _writeControllerStart (self, item):