summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2017-11-29 13:07:08 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2017-11-29 13:07:08 +0100
commit 6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b (patch)
tree d822bf4e43baf23819a54f6c14dc3736c26e39d9
parent 6b1d593ec841ebe18dcbdd18902f7faad0868fd4 (diff)
download crocoite-6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b.tar.gz
crocoite-6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b.tar.bz2
crocoite-6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b.zip
Refactoring
Reusable browser communication and WARC writing.
-rw-r--r-- crocoite/__init__.py 32
-rw-r--r-- crocoite/browser.py 209
-rw-r--r-- crocoite/cli.py 452
-rw-r--r-- crocoite/html.py 107
-rw-r--r-- crocoite/warc.py 174
5 files changed, 571 insertions, 403 deletions
diff --git a/crocoite/__init__.py b/crocoite/__init__.py
index 066c83e..e23cd60 100644
--- a/crocoite/__init__.py
+++ b/crocoite/__init__.py
@@ -1,3 +1,33 @@
-#!/usr/bin/env python3
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+import os
+def packageData (path):
+    """
+    Return the location of a package data file (see setup.py’s data_files),
+    i.e. path below the data directory next to this module
+    """
+    packageDir = os.path.dirname (__file__)
+    return os.path.join (packageDir, 'data', path)
+
+def packageUrl (path):
+    """
+    Build the urn: URL under which package data is stored into the WARC
+    """
+    prefix = 'urn:' + __package__
+    return prefix + ':' + path
diff --git a/crocoite/browser.py b/crocoite/browser.py
new file mode 100644
index 0000000..756fd64
--- /dev/null
+++ b/crocoite/browser.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Chrome browser interactions.
+"""
+
+import logging
+from urllib.parse import urlsplit
+
+class Item:
+    """
+    Simple wrapper containing Chrome request and response
+
+    Both are stored as the keyword arguments handed to setRequest and
+    setResponse and stay None until those have been called.
+    """
+
+    def __init__ (self):
+        self._chromeRequest = None
+        self._chromeResponse = None
+        self.encodedDataLength = 0
+
+    def __repr__ (self):
+        # must not raise for an item without request, otherwise logging or
+        # debugging such an item fails
+        if self._chromeRequest is None:
+            return '<Item None>'
+        return '<Item {}>'.format (self.request['url'])
+
+    @property
+    def request (self):
+        """ The 'request' entry of the stored request event """
+        return self._chromeRequest['request']
+
+    @property
+    def response (self):
+        """ The 'response' entry of the stored response event """
+        return self._chromeResponse['response']
+
+    @property
+    def initiator (self):
+        """ The 'initiator' entry of the stored request event """
+        return self._chromeRequest['initiator']
+
+    @property
+    def id (self):
+        """ The 'requestId' entry of the stored request event """
+        return self._chromeRequest['requestId']
+
+    def setRequest (self, req):
+        self._chromeRequest = req
+
+    def setResponse (self, resp):
+        self._chromeResponse = resp
+
+class SiteLoader:
+    """
+    Load site in Chrome and monitor network requests
+
+    Used as context manager: entering registers the network callbacks and
+    starts the tab, leaving stops and closes the tab. Subclasses override
+    loadingFinished() to process completed items.
+
+    XXX: track popup windows/new tabs and close them
+    """
+
+    allowedSchemes = {'http', 'https'}
+
+    def __init__ (self, browser, url, logger=logging.getLogger(__name__)):
+        # items that neither finished nor failed yet, keyed by requestId
+        self.requests = {}
+        self.browser = browser
+        self.url = url
+        self.logger = logger
+
+        self.tab = browser.new_tab()
+
+    def __enter__ (self):
+        tab = self.tab
+        # setup callbacks
+        tab.Network.requestWillBeSent = self._requestWillBeSent
+        tab.Network.responseReceived = self._responseReceived
+        tab.Network.loadingFinished = self._loadingFinished
+        tab.Network.loadingFailed = self._loadingFailed
+        #tab.Page.loadEventFired = loadEventFired
+
+        # start the tab
+        tab.start()
+
+        # enable events
+        tab.Network.enable()
+        tab.Page.enable ()
+        tab.Network.clearBrowserCache ()
+        if tab.Network.canClearBrowserCookies ()['result']:
+            tab.Network.clearBrowserCookies ()
+
+        return self
+
+    def __len__ (self):
+        """
+        Number of requests that neither finished nor failed yet
+        """
+        return len (self.requests)
+
+    def start (self):
+        """
+        Navigate the tab to the site’s URL
+        """
+        self.tab.Page.navigate(url=self.url)
+
+    def wait (self, timeout=1):
+        self.tab.wait (timeout)
+
+    def waitIdle (self, idleTimeout=1, maxTimeout=60):
+        """
+        Wait until no request was pending for more than idleTimeout
+        consecutive wait (1) calls, but give up after maxTimeout calls
+        """
+        step = 0
+        for _ in range (maxTimeout):
+            self.wait (1)
+            if len (self) == 0:
+                step += 1
+                if step > idleTimeout:
+                    break
+            else:
+                step = 0
+
+    def stop (self):
+        """
+        Stop loading site
+
+        XXX: stop executing scripts
+        """
+
+        tab = self.tab
+
+        tab.Page.stopLoading ()
+        tab.Network.disable ()
+        tab.Page.disable ()
+        tab.Network.requestWillBeSent = None
+        tab.Network.responseReceived = None
+        tab.Network.loadingFinished = None
+        tab.Network.loadingFailed = None
+        tab.Page.loadEventFired = None
+
+    def __exit__ (self, exc_type, exc_value, traceback):
+        self.tab.stop ()
+        self.browser.close_tab(self.tab)
+        # never swallow exceptions raised inside the with block
+        return False
+
+    def loadingFinished (self, item, redirect=False):
+        """
+        Called for every item that finished loading, override to process it
+
+        redirect is True if the item’s response is the redirectResponse
+        delivered with the follow-up request, see _requestWillBeSent.
+        """
+        self.logger.debug ('item finished {}'.format (item))
+
+    # internal chrome callbacks
+    def _requestWillBeSent (self, **kwargs):
+        reqId = kwargs['requestId']
+        req = kwargs['request']
+
+        url = urlsplit (req['url'])
+        if url.scheme not in self.allowedSchemes:
+            return
+
+        item = self.requests.get (reqId)
+        if item:
+            # redirects never “finish” loading, but yield another requestWillBeSent with this key set
+            redirectResp = kwargs.get ('redirectResponse')
+            if redirectResp:
+                resp = {'requestId': reqId, 'response': redirectResp}
+                item.setResponse (resp)
+                self.loadingFinished (item, redirect=True)
+                self.logger.debug ('redirected request {} has url {}'.format (reqId, req['url']))
+            else:
+                self.logger.warning ('request {} already exists, overwriting.'.format (reqId))
+
+        item = Item ()
+        item.setRequest (kwargs)
+        self.requests[reqId] = item
+
+    def _responseReceived (self, **kwargs):
+        reqId = kwargs['requestId']
+        item = self.requests.get (reqId)
+        if item is None:
+            return
+
+        resp = kwargs['response']
+        url = urlsplit (resp['url'])
+        if url.scheme in self.allowedSchemes:
+            self.logger.debug ('response {} {}'.format (reqId, resp['url']))
+            item.setResponse (kwargs)
+        else:
+            self.logger.warning ('response: ignoring scheme {}'.format (url.scheme))
+
+    def _loadingFinished (self, **kwargs):
+        """
+        Item was fully loaded. For some items the request body is not available
+        when responseReceived is fired, thus move everything here.
+        """
+        reqId = kwargs['requestId']
+        item = self.requests.pop (reqId, None)
+        if item is None:
+            # we never recorded this request (blacklisted scheme, for example)
+            return
+        if item._chromeResponse is None:
+            # no response was stored, for example because _responseReceived
+            # ignored its scheme. There is nothing to process then.
+            self.logger.warning ('finished {} without response, ignoring.'.format (reqId))
+            return
+        req = item.request
+        resp = item.response
+        assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])
+        url = urlsplit (resp['url'])
+        if url.scheme in self.allowedSchemes:
+            self.logger.debug ('finished {} {}'.format (reqId, req['url']))
+            item.encodedDataLength = kwargs['encodedDataLength']
+            self.loadingFinished (item)
+
+    def _loadingFailed (self, **kwargs):
+        reqId = kwargs['requestId']
+        self.logger.debug ('failed {} {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
+        # failed requests are not pending any more
+        self.requests.pop (reqId, None)
+
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 640d207..a97f0ce 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -18,327 +18,23 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
-import os, sys, json
-import pychrome
-from urllib.parse import urlsplit
-from warcio.warcwriter import WARCWriter
-from warcio.statusandheaders import StatusAndHeaders
-from base64 import b64decode
-import logging
-from logging.handlers import BufferingHandler
-from http.server import BaseHTTPRequestHandler
-from io import BytesIO
-import argparse
-import tempfile, random
-
-from html5lib.treewalkers.base import TreeWalker
-from html5lib.filters.base import Filter
-from html5lib.serializer import HTMLSerializer
-from html5lib import constants
-
-from . import html
-
-logger = logging.getLogger(__name__)
-
-# 10 MB, encoded! (i.e. actual data can be larger due to compression)
-maxBodySize = 10*1024*1024
-
-def packageData (path):
- """
- Locate package data, see setup.py’s data_files
- """
- return os.path.join (os.path.dirname (__file__), 'data', path)
-
-def packageUrl (path):
- """
- Create URL for package data stored into WARC
- """
- return 'urn:' + __package__ + ':' + path
-
-def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
- if length is None:
- length = random.randint (16, 32)
- return ''.join (map (lambda x: random.choice (chars), range (length)))
-
-class WARCLogHandler (BufferingHandler):
- """
- Buffered log handler, flushing to warcio
- """
-
- contentType = 'text/plain; charset=utf-8'
-
- def __init__ (self, capacity, warcfile):
- BufferingHandler.__init__ (self, capacity)
- self.warcfile = warcfile
-
- def flush (self):
- self.acquire ()
- try:
- buf = ''
- for record in self.buffer:
- buf += self.format (record)
- buf += '\n'
- # XXX: record type?
- record = self.warcfile.create_warc_record (
- packageUrl ('log'), 'metadata',
- payload=BytesIO (buf.encode ('utf8')),
- warc_headers_dict={'Content-Type': self.contentType})
- self.warcfile.write_record(record)
- self.buffer = []
- finally:
- self.release ()
-
-class ChromeTreeWalker (TreeWalker):
- """
- Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
- """
-
- def recurse (self, node):
- name = node['nodeName']
- if name.startswith ('#'):
- if name == '#text':
- yield from self.text (node['nodeValue'])
- elif name == '#comment':
- yield self.comment (node['nodeValue'])
- elif name == '#document':
- for child in node.get ('children', []):
- yield from self.recurse (child)
- else:
- assert False, name
- else:
- default_namespace = constants.namespaces["html"]
-
- attributes = node.get ('attributes', [])
- convertedAttr = {}
- for i in range (0, len (attributes), 2):
- convertedAttr[(default_namespace, attributes[i])] = attributes[i+1]
-
- children = node.get ('children', [])
- if name.lower() in html.voidTags and not children:
- yield from self.emptyTag (default_namespace, name, convertedAttr)
- else:
- yield self.startTag (default_namespace, name, convertedAttr)
- for child in node.get ('children', []):
- yield from self.recurse (child)
- yield self.endTag ('', name)
-
- def __iter__ (self):
- assert self.tree['nodeName'] == '#document'
- return self.recurse (self.tree)
-
- def split (self):
- """
- Split response returned by DOM.getDocument(pierce=True) into independent documents
- """
- def recurse (node):
- contentDocument = node.get ('contentDocument')
- if contentDocument:
- assert contentDocument['nodeName'] == '#document'
- yield contentDocument
- yield from recurse (contentDocument)
-
- for child in node.get ('children', []):
- yield from recurse (child)
-
- if self.tree['nodeName'] == '#document':
- yield self.tree
- yield from recurse (self.tree)
-
-class StripTagFilter (Filter):
- """
- Remove arbitrary tags
- """
-
- def __init__ (self, source, tags):
- Filter.__init__ (self, source)
- self.tags = set (map (str.lower, tags))
-
- def __iter__(self):
- delete = 0
- for token in Filter.__iter__(self):
- tokenType = token['type']
- if tokenType in {'StartTag', 'EmptyTag'}:
- if delete > 0 or token['name'].lower () in self.tags:
- delete += 1
- if delete == 0:
- yield token
- if tokenType == 'EndTag' and delete > 0:
- delete -= 1
-
-class StripAttributeFilter (Filter):
- """
- Remove arbitrary HTML attributes
- """
-
- def __init__ (self, source, attributes):
- Filter.__init__ (self, source)
- self.attributes = set (map (str.lower, attributes))
-
- def __iter__(self):
- default_namespace = constants.namespaces["html"]
- for token in Filter.__iter__(self):
- data = token.get ('data')
- if data and token['type'] in {'StartTag', 'EmptyTag'}:
- newdata = {}
- for (namespace, k), v in data.items ():
- if k.lower () not in self.attributes:
- newdata[(namespace, k)] = v
- token['data'] = newdata
- yield token
-
def main ():
- def getStatusText (response):
- text = response.get ('statusText')
- if text:
- return text
- text = BaseHTTPRequestHandler.responses.get (response['status'])
- if text:
- return text[0]
- return 'No status text available'
-
- def requestWillBeSent (**kwargs):
- req = kwargs.get ('request')
- reqId = kwargs['requestId']
- url = urlsplit (req['url'])
- if url.scheme in ('http', 'https'):
- logger.debug ('sending {} {}'.format (reqId, req['url']))
- if reqId in requests:
- redirectResp = kwargs.get ('redirectResponse')
- if redirectResp:
- requests[reqId]['response'] = redirectResp
- # XXX: can we retrieve the original response body right now?
- itemToWarc (reqId, requests[reqId], ignoreBody=True)
- else:
- logger.warn ('request {} already exists, overwriting.'.format (reqId))
- requests[reqId] = {}
- requests[reqId]['request'] = req
- requests[reqId]['initiator'] = kwargs['initiator']
- else:
- logger.warn ('request: ignoring scheme {}'.format (url.scheme))
-
- def responseReceived (**kwargs):
- resp = kwargs['response']
- reqId = kwargs['requestId']
- url = urlsplit (resp['url'])
- if url.scheme in ('http', 'https') and reqId in requests:
- logger.debug ('response {} {}'.format (reqId, resp['url']))
- requests[reqId]['response'] = resp
- else:
- logger.warn ('response: ignoring scheme {}'.format (url.scheme))
-
- def loadEventFired (**kwargs):
- """
- Equivalent to DOM ready JavaScript event
- """
- root = tab.DOM.getDocument ()
- rootId = root['root']['nodeId']
- links = tab.DOM.querySelectorAll (nodeId=rootId, selector='a')
- #for i in links['nodeIds']:
- # print ('link', tab.DOM.getAttributes (nodeId=i))
-
- def loadingFinished (**kwargs):
- """
- Item was fully loaded. For some items the request body is not available
- when responseReceived is fired, thus move everything here.
- """
- reqId = kwargs['requestId']
- # we never recorded this request
- if reqId not in requests:
- return
- item = requests[reqId]
- req = item['request']
- resp = item['response']
- url = urlsplit (resp['url'])
- if url.scheme in ('http', 'https'):
- logger.debug ('finished {} {}'.format (reqId, req['url']))
- itemToWarc (reqId, item, kwargs['encodedDataLength'])
- del requests[reqId]
-
- def itemToWarc (reqId, item, encodedDataLength=0, ignoreBody=False):
- req = item['request']
- resp = item['response']
- url = urlsplit (resp['url'])
-
- # overwrite request headers with those actually sent
- newReqHeaders = resp.get ('requestHeaders')
- if newReqHeaders:
- req['headers'] = newReqHeaders
-
- postData = req.get ('postData')
- if postData:
- postData = BytesIO (postData.encode ('utf8'))
- path = url.path
- if url.query:
- path += '?' + url.query
- httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
- req['headers'], protocol='HTTP/1.1', is_http_request=True)
- initiator = item['initiator']
- warcHeaders = {
- 'X-Chrome-Initiator': json.dumps (initiator),
- }
- record = writer.create_warc_record(req['url'], 'request',
- payload=postData, http_headers=httpHeaders,
- warc_headers_dict=warcHeaders)
- writer.write_record(record)
- concurrentTo = record.rec_headers['WARC-Record-ID']
-
- # check body size first, since we’re loading everything into memory
- if encodedDataLength < maxBodySize:
- try:
- if ignoreBody:
- rawBody = b''
- base64Encoded = True
- else:
- body = tab.Network.getResponseBody (requestId=reqId)
- rawBody = body['body']
- base64Encoded = body['base64Encoded']
- if base64Encoded:
- rawBody = b64decode (rawBody)
- else:
- rawBody = rawBody.encode ('utf8')
-
- httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
- getStatusText (resp)), resp['headers'], protocol='HTTP/1.1')
-
- # Content is saved decompressed and decoded, remove these headers
- blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
- for h in blacklistedHeaders:
- httpHeaders.remove_header (h)
-
- # chrome sends nothing but utf8 encoded text. Fortunately HTTP
- # headers take precedence over the document’s <meta>, thus we can
- # easily override those.
- contentType = resp['mimeType']
- if not base64Encoded:
- contentType += '; charset=utf-8'
- httpHeaders.replace_header ('content-type', contentType)
-
- httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
-
- warcHeaders = {
- 'WARC-Concurrent-To': concurrentTo,
- 'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
- 'X-Chrome-Protocol': resp.get ('protocol', ''),
- 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
- 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
- 'X-Chrome-Base64Body': str (base64Encoded),
- }
- record = writer.create_warc_record(resp['url'], 'response',
- warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
- http_headers=httpHeaders)
- writer.write_record(record)
- except pychrome.exceptions.CallMethodException:
- logger.error ('no data for {} {} {}'.format (resp['url'],
- resp['status'], reqId))
- else:
- logger.warn ('body for {} is too large, {} bytes'.format (resp['url'], kwargs['encodedDataLength']))
-
- def loadingFailed (**kwargs):
- reqId = kwargs['requestId']
- logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
- if reqId in requests:
- del requests[reqId]
-
+ import os, random, logging, argparse
+ from io import BytesIO
+ import pychrome
+ from urllib.parse import urlsplit
+ from warcio.warcwriter import WARCWriter
+ from warcio.statusandheaders import StatusAndHeaders
+ from html5lib.serializer import HTMLSerializer
+ from . import html, packageData, packageUrl
+ from .warc import WarcLoader
+ from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
+
+    def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
+        """
+        Return a string of length characters randomly picked from chars.
+        Without length a random length from 16 to 32 is used.
+        """
+        if length is None:
+            length = random.randint (16, 32)
+        return ''.join (random.choice (chars) for _ in range (length))
+
def getFormattedViewportMetrics (tab):
layoutMetrics = tab.Page.getLayoutMetrics ()
# XXX: I’m not entirely sure which one we should use here
@@ -384,7 +80,7 @@ def main ():
'X-Chrome-Viewport': viewport})
writer.write_record (record)
- def emulateScreenMetrics (tab):
+ def emulateScreenMetrics (l):
"""
Emulate different screen sizes, causing the site to fetch assets (img
srcset and css, for example) for different screen resolutions.
@@ -404,13 +100,12 @@ def main ():
{'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
]
for s in sizes:
- tab.Emulation.setDeviceMetricsOverride (**s)
- tab.wait (1)
+ l.tab.Emulation.setDeviceMetricsOverride (**s)
+ l.wait (1)
# XXX: this seems to be broken, it does not clear the override
#tab.Emulation.clearDeviceMetricsOverride ()
# wait until assets finished loading
- while len (requests) != 0:
- tab.wait (1)
+ l.waitIdle (2, 10)
def loadScripts (paths, scripts=[]):
for p in paths:
@@ -427,6 +122,7 @@ def main ():
warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
writer.write_record (record)
+ logger = logging.getLogger(__name__)
logging.basicConfig (level=logging.DEBUG)
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
@@ -434,7 +130,8 @@ def main ():
parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival')
parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout')
parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer')
- parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
+ parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size in bytes')
+ #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page')
parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot')
parser.add_argument('url', help='Website URL')
@@ -448,93 +145,44 @@ def main ():
onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
stopVarname = newStopVarname
- # temporary store for requests
- requests = {}
-
- # create a browser instance
browser = pychrome.Browser(url=args.browser)
- # create a tab
- tab = browser.new_tab()
-
- # setup callbacks
- tab.Network.requestWillBeSent = requestWillBeSent
- tab.Network.responseReceived = responseReceived
- tab.Network.loadingFinished = loadingFinished
- tab.Network.loadingFailed = loadingFailed
- tab.Page.loadEventFired = loadEventFired
-
- # start the tab
- tab.start()
-
fd = open (args.output, 'wb')
writer = WARCWriter (fd, gzip=True)
- version = tab.Browser.getVersion ()
- payload = {
- 'software': __package__,
- 'browser': version['product'],
- 'useragent': version['userAgent'],
- 'viewport': getFormattedViewportMetrics (tab),
- }
- warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
- writer.write_record (warcinfo)
-
- warcLogger = WARCLogHandler (args.logBuffer, writer)
- logger.addHandler (warcLogger)
-
- # save onload script
- writeScript ('onload', onload, writer)
- # enable events
- tab.Network.enable()
- tab.Page.enable ()
- tab.Network.clearBrowserCache ()
- if tab.Network.canClearBrowserCookies ()['result']:
- tab.Network.clearBrowserCookies ()
- # inject our custom javascript to the page before loading
- tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
-
- tab.Page.navigate(url=args.url)
-
- idleTimeout = 0
- for i in range (0, args.timeout):
- tab.wait (1)
- if len (requests) == 0:
- idleTimeout += 1
- if idleTimeout > args.idleTimeout:
- break
- else:
- idleTimeout = 0
+ with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
+ maxBodySize=args.maxBodySize) as l:
+ version = l.tab.Browser.getVersion ()
+ payload = {
+ 'software': __package__,
+ 'browser': version['product'],
+ 'useragent': version['userAgent'],
+ 'viewport': getFormattedViewportMetrics (l.tab),
+ }
+ warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
+ writer.write_record (warcinfo)
+ # save onload script as well
+ writeScript ('onload', onload, writer)
- # get ready for snapshot: stop loading and scripts, disable events
- tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
- # if we stopped due to timeout, wait for remaining assets
- while len (requests) != 0:
- tab.wait (1)
+ # inject our custom javascript to the page before loading
+ l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
+ l.start ()
- emulateScreenMetrics (tab)
+ l.waitIdle (args.idleTimeout, args.timeout)
- tab.Page.stopLoading ()
- tab.Network.disable ()
- tab.Page.disable ()
- tab.Network.requestWillBeSent = None
- tab.Network.responseReceived = None
- tab.Network.loadingFinished = None
- tab.Network.loadingFailed = None
- tab.Page.loadEventFired = None
+ # get ready for snapshot: stop loading and scripts, disable events
+ l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
+ # if we stopped due to timeout, wait for remaining assets
+ l.waitIdle (2, 10)
- script = loadScripts (args.onsnapshot)
- writeScript ('onsnapshot', script, writer)
- tab.Runtime.evaluate (expression=script, returnByValue=True)
- writeDOMSnapshot (tab, writer)
+ emulateScreenMetrics (l)
- tab.stop()
- if not args.keepTab:
- browser.close_tab(tab)
+ l.stop ()
- logger.removeHandler (warcLogger)
- warcLogger.flush ()
- fd.close ()
+ script = loadScripts (args.onsnapshot)
+ writeScript ('onsnapshot', script, writer)
+ l.tab.Runtime.evaluate (expression=script, returnByValue=True)
+ writeDOMSnapshot (l.tab, writer)
return True
diff --git a/crocoite/html.py b/crocoite/html.py
index 34fe26b..f891101 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -18,6 +18,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
+"""
+HTML helper
+"""
+
# HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
voidTags = {'area',
'base',
@@ -99,3 +103,106 @@ eventAttributes = {'onabort',
'onvolumechange',
'onwaiting'}
+from html5lib.treewalkers.base import TreeWalker
+from html5lib.filters.base import Filter
+from html5lib.serializer import HTMLSerializer
+from html5lib import constants
+
+class ChromeTreeWalker (TreeWalker):
+    """
+    html5lib TreeWalker recursing through the node dictionaries returned by
+    the Google Chrome method DOM.getDocument
+    """
+
+    def recurse (self, node):
+        """
+        Yield the tokens for node, followed depth-first by its children
+        """
+        name = node['nodeName']
+        children = node.get ('children', [])
+
+        if name == '#text':
+            yield from self.text (node['nodeValue'])
+            return
+        if name == '#comment':
+            yield self.comment (node['nodeValue'])
+            return
+        if name == '#document':
+            for child in children:
+                yield from self.recurse (child)
+            return
+        if name.startswith ('#'):
+            # any other special node is unexpected
+            assert False, name
+            return
+
+        # everything else is a regular element
+        namespace = constants.namespaces["html"]
+
+        # attributes are a flat list of alternating names and values
+        flatAttributes = node.get ('attributes', [])
+        convertedAttr = {}
+        for pos in range (0, len (flatAttributes), 2):
+            key = (namespace, flatAttributes[pos])
+            convertedAttr[key] = flatAttributes[pos+1]
+
+        isEmpty = name.lower() in voidTags and not children
+        if isEmpty:
+            yield from self.emptyTag (namespace, name, convertedAttr)
+            return
+
+        yield self.startTag (namespace, name, convertedAttr)
+        for child in children:
+            yield from self.recurse (child)
+        yield self.endTag ('', name)
+
+    def __iter__ (self):
+        root = self.tree
+        assert root['nodeName'] == '#document'
+        return self.recurse (root)
+
+    def split (self):
+        """
+        Yield the independent documents contained in a response returned by
+        DOM.getDocument(pierce=True): the tree itself, if it is a document,
+        followed by every contentDocument found below it
+        """
+        def embedded (node):
+            document = node.get ('contentDocument')
+            if document:
+                assert document['nodeName'] == '#document'
+                yield document
+                yield from embedded (document)
+
+            for child in node.get ('children', []):
+                yield from embedded (child)
+
+        root = self.tree
+        if root['nodeName'] == '#document':
+            yield root
+        yield from embedded (root)
+
+class StripTagFilter (Filter):
+    """
+    Remove arbitrary tags, including everything they contain
+    """
+
+    def __init__ (self, source, tags):
+        Filter.__init__ (self, source)
+        # tag names are compared case-insensitively
+        self.tags = set (map (str.lower, tags))
+
+    def __iter__(self):
+        # nesting depth inside a removed element, 0 while outside of one
+        delete = 0
+        for token in Filter.__iter__(self):
+            tokenType = token['type']
+            if tokenType == 'EmptyTag':
+                # an EmptyTag has no matching EndTag, thus it must not change
+                # the depth. Otherwise the depth never returns to 0 and the
+                # remaining document is dropped.
+                if delete == 0 and token['name'].lower () not in self.tags:
+                    yield token
+                continue
+            if tokenType == 'StartTag':
+                if delete > 0 or token['name'].lower () in self.tags:
+                    delete += 1
+            if delete == 0:
+                yield token
+            if tokenType == 'EndTag' and delete > 0:
+                delete -= 1
+
+class StripAttributeFilter (Filter):
+    """
+    Remove arbitrary HTML attributes
+    """
+
+    def __init__ (self, source, attributes):
+        Filter.__init__ (self, source)
+        # attribute names are compared case-insensitively
+        self.attributes = set (map (str.lower, attributes))
+
+    def __iter__(self):
+        for token in Filter.__iter__(self):
+            data = token.get ('data')
+            # only tags carry attributes, their data maps (namespace, name)
+            # to the attribute’s value
+            if data and token['type'] in {'StartTag', 'EmptyTag'}:
+                token['data'] = {(namespace, k): v
+                        for (namespace, k), v in data.items ()
+                        if k.lower () not in self.attributes}
+            yield token
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
new file mode 100644
index 0000000..e06b1c7
--- /dev/null
+++ b/crocoite/warc.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Classes writing data to WARC files
+"""
+
+import logging
+import json
+from .browser import SiteLoader
+from . import packageUrl
+from http.server import BaseHTTPRequestHandler
+from base64 import b64decode
+from io import BytesIO
+from warcio.statusandheaders import StatusAndHeaders
+from urllib.parse import urlsplit
+from logging.handlers import BufferingHandler
+import pychrome
+
+class WARCLogHandler (BufferingHandler):
+    """
+    Log handler buffering its records and flushing them to warcio in bulk
+    """
+
+    contentType = 'text/plain; charset=utf-8'
+
+    def __init__ (self, capacity, warcfile):
+        super ().__init__ (capacity)
+        self.warcfile = warcfile
+
+    def flush (self):
+        """
+        Write all buffered messages, one per line, into a single metadata
+        record and empty the buffer. Nothing is written for an empty buffer.
+        """
+        self.acquire ()
+        try:
+            if not self.buffer:
+                return
+            lines = [self.format (entry) + '\n' for entry in self.buffer]
+            text = ''.join (lines)
+            # XXX: record type?
+            record = self.warcfile.create_warc_record (
+                    packageUrl ('log'), 'metadata',
+                    payload=BytesIO (text.encode ('utf8')),
+                    warc_headers_dict={'Content-Type': self.contentType})
+            self.warcfile.write_record(record)
+            self.buffer = []
+        finally:
+            self.release ()
+
+class WarcLoader (SiteLoader):
+    """
+    SiteLoader writing every finished item as request and response record
+    to a WARC file
+
+    Messages of the loader’s logger are buffered by a WARCLogHandler and
+    written to the same file.
+    """
+
+    def __init__ (self, browser, url, writer,
+            logger=logging.getLogger(__name__), logBuffer=1000,
+            maxBodySize=10*1024*1024):
+        SiteLoader.__init__ (self, browser, url, logger)
+        self.writer = writer
+        # bodies whose encoded length reaches this limit are not stored
+        self.maxBodySize = maxBodySize
+        self.warcLogger = WARCLogHandler (logBuffer, writer)
+        self.logger.addHandler (self.warcLogger)
+
+    def __exit__ (self, exc_type, exc_value, traceback):
+        # write the remaining log messages before the tab is closed
+        self.logger.removeHandler (self.warcLogger)
+        self.warcLogger.flush ()
+        return SiteLoader.__exit__ (self, exc_type, exc_value, traceback)
+
+    @staticmethod
+    def getStatusText (response):
+        """
+        Return the response’s statusText, falling back to the standard
+        reason phrase for its status code and finally to a placeholder
+        """
+        text = response.get ('statusText')
+        if text:
+            return text
+        text = BaseHTTPRequestHandler.responses.get (response['status'])
+        if text:
+            return text[0]
+        return 'No status text available'
+
+    def loadingFinished (self, item, redirect=False):
+        """
+        Write request and response of a finished item to the WARC
+
+        item -- the finished item
+        redirect -- True if the item’s response is a redirect response. No
+            body is requested in this case and an empty one is stored.
+        """
+        writer = self.writer
+
+        req = item.request
+        reqId = item.id
+        resp = item.response
+        url = urlsplit (resp['url'])
+
+        # overwrite request headers with those actually sent
+        newReqHeaders = resp.get ('requestHeaders')
+        if newReqHeaders:
+            req['headers'] = newReqHeaders
+
+        postData = req.get ('postData')
+        if postData:
+            postData = BytesIO (postData.encode ('utf8'))
+        path = url.path
+        if url.query:
+            path += '?' + url.query
+        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
+                req['headers'], protocol='HTTP/1.1', is_http_request=True)
+        initiator = item.initiator
+        warcHeaders = {
+                'X-Chrome-Initiator': json.dumps (initiator),
+                }
+        record = writer.create_warc_record(req['url'], 'request',
+                payload=postData, http_headers=httpHeaders,
+                warc_headers_dict=warcHeaders)
+        writer.write_record(record)
+        # links the response record to the request record written above
+        concurrentTo = record.rec_headers['WARC-Record-ID']
+
+        # now the response
+        warcHeaders = {
+                'WARC-Concurrent-To': concurrentTo,
+                'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
+                'X-Chrome-Protocol': resp.get ('protocol', ''),
+                'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
+                'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
+                }
+
+        rawBody = b''
+        base64Encoded = False
+        # The request id of a redirect is reused by the request following
+        # it, thus do not ask for a body and store an empty one instead.
+        if not redirect:
+            try:
+                # check body size first, since we’re loading everything into memory
+                if item.encodedDataLength < self.maxBodySize:
+                    body = self.tab.Network.getResponseBody (requestId=reqId)
+                    rawBody = body['body']
+                    base64Encoded = body['base64Encoded']
+                    if base64Encoded:
+                        rawBody = b64decode (rawBody)
+                        warcHeaders['X-Chrome-Base64Body'] = str (True)
+                    else:
+                        rawBody = rawBody.encode ('utf8')
+                else:
+                    self.logger.error ('body for {} too large {} vs {}'.format (reqId,
+                            item.encodedDataLength, self.maxBodySize))
+            except pychrome.exceptions.CallMethodException:
+                self.logger.error ('no data for {} {} {}'.format (resp['url'],
+                        resp['status'], reqId))
+
+        httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
+                self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1')
+
+        # Content is saved decompressed and decoded, remove these headers
+        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
+        for h in blacklistedHeaders:
+            httpHeaders.remove_header (h)
+
+        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
+        # headers take precedence over the document’s <meta>, thus we can
+        # easily override those.
+        contentType = resp.get ('mimeType')
+        if contentType:
+            if not base64Encoded:
+                contentType += '; charset=utf-8'
+            httpHeaders.replace_header ('content-type', contentType)
+
+        # the stored body is decoded, its length differs from the original one
+        httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+
+        record = writer.create_warc_record(resp['url'], 'response',
+                warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
+                http_headers=httpHeaders)
+        writer.write_record(record)
+