summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- contrib/celerycrocoite.py 3
-rw-r--r-- crocoite/behavior.py 17
-rw-r--r-- crocoite/browser.py 10
-rw-r--r-- crocoite/controller.py 8
-rw-r--r-- crocoite/tools.py 29
-rw-r--r-- crocoite/util.py 6
-rw-r--r-- crocoite/warc.py 39
7 files changed, 73 insertions, 39 deletions
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
index d0a02e9..3da43d9 100644
--- a/contrib/celerycrocoite.py
+++ b/contrib/celerycrocoite.py
@@ -192,12 +192,11 @@ def archive (bot, trigger):
if not args:
bot.reply ('Sorry, I don’t understand {}'.format (trigger.group (2)))
return
- blacklistedBehavior = {'domSnapshot', 'screenshot'}
settings = dict (maxBodySize=args.maxBodySize,
logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout,
timeout=args.timeout)
args = dict (url=args.url,
- enabledBehaviorNames=list (set (behavior.availableMap.keys())-blacklistedBehavior),
+ enabledBehaviorNames=list (behavior.availableMap.keys ()),
settings=settings, recursive=args.recursive,
concurrency=args.concurrency)
q = bot.memory['crocoite']['q']
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index ab859f8..b34d3d9 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -32,7 +32,7 @@ from collections import OrderedDict
from html5lib.serializer import HTMLSerializer
from pychrome.exceptions import TimeoutException
-from .util import randomString, getFormattedViewportMetrics
+from .util import randomString, getFormattedViewportMetrics, removeFragment
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
@@ -229,12 +229,13 @@ class DomSnapshot (Behavior):
disallowedAttributes = html.eventAttributes
stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
- yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport)
+ yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
class ScreenshotEvent:
- __slots__ = ('yoff', 'data')
+ __slots__ = ('yoff', 'data', 'url')
- def __init__ (self, yoff, data):
+ def __init__ (self, url, yoff, data):
+ self.url = url
self.yoff = yoff
self.data = data
@@ -248,6 +249,12 @@ class Screenshot (Behavior):
def onfinish (self):
tab = self.loader.tab
+ try:
+ url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url'])
+ except KeyError:
+ logger.error ('frame has no url')
+ url = None
+
# see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
# Hardcoded max texture size of 16,384 (crbug.com/770769)
maxDim = 16*1024
@@ -260,7 +267,7 @@ class Screenshot (Behavior):
height = min (contentSize['height'] - yoff, maxDim)
clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data'])
- yield ScreenshotEvent (yoff, data)
+ yield ScreenshotEvent (url, yoff, data)
class Click (JsOnload):
""" Generic link clicking """
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 1c09598..6a4bee2 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -41,9 +41,9 @@ class Item:
def __init__ (self, tab):
self.tab = tab
- self.chromeRequest = None
- self.chromeResponse = None
- self.chromeFinished = None
+ self.chromeRequest = {}
+ self.chromeResponse = {}
+ self.chromeFinished = {}
self.isRedirect = False
self.failed = False
@@ -128,6 +128,10 @@ class Item:
return text[0]
return 'No status text available'
+ @property
+ def resourceType (self):
+ return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None))
+
@staticmethod
def _unfoldHeaders (headers):
"""
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 84001b7..ef042cc 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -71,11 +71,10 @@ class StatsHandler (EventHandler):
self.stats['crashed'] += 1
import logging, time
-from urllib.parse import urlsplit, urlunsplit
from . import behavior as cbehavior
from .browser import ChromeService, SiteLoader, Item
-from .util import getFormattedViewportMetrics
+from .util import getFormattedViewportMetrics, removeFragment
class ControllerStart:
__slots__ = ('payload')
@@ -238,11 +237,6 @@ class PrefixLimit (RecursionPolicy):
def __call__ (self, urls):
return set (filter (lambda u: u.startswith (self.prefix), urls))
-def removeFragment (u):
- """ Remove fragment from url (i.e. #hashvalue) """
- s = urlsplit (u)
- return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
-
from .behavior import ExtractLinksEvent
class RecursiveController (EventHandler):
diff --git a/crocoite/tools.py b/crocoite/tools.py
index bc92f8f..3aeaaad 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -80,18 +80,21 @@ def extractScreenshot ():
args = parser.parse_args()
- screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I)
with args.input:
- for record in ArchiveIterator(args.input):
- uri = record.rec_headers.get_header('WARC-Target-URI')
- if record.rec_type == 'resource':
- m = screenshotRe.match (uri)
- xoff, yoff = m.groups ()
- if m:
- outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff)
- if args.force or not os.path.exists (outpath):
- with open (outpath, 'wb') as out:
- shutil.copyfileobj (record.raw_stream, out)
- else:
- print ('not overwriting {}'.format (outpath))
+ for record in ArchiveIterator (args.input):
+ headers = record.rec_headers
+ if record.rec_type != 'conversion' or \
+ headers['Content-Type'] != 'image/png' or \
+ 'X-Crocoite-Screenshot-Y-Offset' not in headers:
+ continue
+
+ urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
+ xoff = 0
+ yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
+ outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff)
+ if args.force or not os.path.exists (outpath):
+ with open (outpath, 'wb') as out:
+ shutil.copyfileobj (record.raw_stream, out)
+ else:
+ print ('not overwriting {}'.format (outpath))
diff --git a/crocoite/util.py b/crocoite/util.py
index ec257f1..fe43f01 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -23,6 +23,7 @@ Random utility functions
"""
import random
+from urllib.parse import urlsplit, urlunsplit
def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
if length is None:
@@ -41,3 +42,8 @@ def getFormattedViewportMetrics (tab):
return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
layoutMetrics['layoutViewport']['clientHeight'])
+def removeFragment (u):
+ """ Remove fragment from url (i.e. #hashvalue) """
+ s = urlsplit (u)
+ return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index af04cf9..e472f16 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
class WarcHandler (EventHandler):
- __slots__ = ('logger', 'writer', 'maxBodySize')
+ __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords')
def __init__ (self, fd,
logger=logging.getLogger(__name__),
@@ -46,6 +46,9 @@ class WarcHandler (EventHandler):
self.logger = logger
self.writer = WARCWriter (fd, gzip=True)
self.maxBodySize = maxBodySize
+ # maps document urls to WARC record ids, required for DomSnapshotEvent
+ # and ScreenshotEvent
+ self.documentRecords = {}
def _writeRequest (self, item):
writer = self.writer
@@ -135,6 +138,9 @@ class WarcHandler (EventHandler):
http_headers=httpHeaders)
writer.write_record(record)
+ if item.resourceType == 'Document':
+ self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID')
+
def _writeScript (self, item):
writer = self.writer
encoding = 'utf-8'
@@ -155,21 +161,36 @@ class WarcHandler (EventHandler):
except ValueError as e:
self.logger.error (e.args[0])
+ def _addRefersTo (self, headers, url):
+ refersTo = self.documentRecords.get (url)
+ if refersTo:
+ headers['WARC-Refers-To'] = refersTo
+ else:
+ self.logger.error ('No document record found for {}'.format (url))
+ return headers
+
def _writeDomSnapshot (self, item):
writer = self.writer
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (item.url, 'response',
+
+ warcHeaders = {'X-DOM-Snapshot': str (True),
+ 'X-Chrome-Viewport': item.viewport,
+ 'Content-Type': 'text/html; charset=utf-8',
+ }
+
+ self._addRefersTo (warcHeaders, item.url)
+
+ record = writer.create_warc_record (item.url, 'conversion',
payload=BytesIO (item.document),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': item.viewport})
+ warc_headers_dict=warcHeaders)
writer.write_record (record)
def _writeScreenshot (self, item):
writer = self.writer
- url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff))
- record = writer.create_warc_record (url, 'resource',
- payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'})
+ warcHeaders = {'Content-Type': 'image/png',
+ 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)}
+ self._addRefersTo (warcHeaders, item.url)
+ record = writer.create_warc_record (item.url, 'conversion',
+ payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
writer.write_record (record)
def _writeControllerStart (self, item):