# Copyright (c) 2017 crocoite contributors
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

def main ():
    """
    Command line entry point.

    Parses the command line, connects to the browser at the given DevTools
    URL, drives a WarcLoader for the requested website and writes a warcinfo
    record, the injected scripts and a DOM snapshot to a gzip-compressed WARC
    file.

    Returns True after the WARC file has been written and closed.
    """
    import os, random, logging, argparse
    from io import BytesIO
    import pychrome
    from urllib.parse import urlsplit
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    from html5lib.serializer import HTMLSerializer
    from . import html, packageData, packageUrl
    from .warc import WarcLoader
    from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker

    def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
        """
        Return a random string made of characters from chars.

        If length is None a random length between 16 and 32 (inclusive) is
        picked.
        """
        if length is None:
            length = random.randint (16, 32)
        return ''.join (random.choice (chars) for _ in range (length))

    def getFormattedViewportMetrics (tab):
        """
        Return the tab’s layout viewport size formatted as 'WIDTHxHEIGHT'.
        """
        layoutMetrics = tab.Page.getLayoutMetrics ()
        # XXX: I’m not entirely sure which one we should use here
        return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
                layoutMetrics['layoutViewport']['clientHeight'])

    def writeDOMSnapshot (tab, writer):
        """
        Get a DOM snapshot of tab and write it to WARC.

        We could use DOMSnapshot.getSnapshot here, but the API is not stable
        yet. Also computed styles are not really necessary here.

        Only documents with an http or https URL are written and every URL is
        written at most once.

        XXX: Currently writes a response, when it should use “resource”. pywb
        can’t handle that though.
        """
        viewport = getFormattedViewportMetrics (tab)
        dom = tab.DOM.getDocument (depth=-1, pierce=True)
        haveUrls = set ()
        for doc in ChromeTreeWalker (dom['root']).split ():
            rawUrl = doc['documentURL']
            if rawUrl in haveUrls:
                # ignore duplicate URLs. they are usually caused by
                # javascript-injected iframes (advertising) with no(?) src
                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
                continue
            url = urlsplit (rawUrl)
            if url.scheme in ('http', 'https'):
                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
                haveUrls.add (rawUrl)
                walker = ChromeTreeWalker (doc)
                # remove script, to make the page static and noscript, because at the
                # time we took the snapshot scripts were enabled
                disallowedTags = ['script', 'noscript']
                disallowedAttributes = html.eventAttributes
                stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
                serializer = HTMLSerializer ()
                httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
                record = writer.create_warc_record (doc['documentURL'], 'response',
                        payload=BytesIO (serializer.render (stream, 'utf8')),
                        http_headers=httpHeaders,
                        warc_headers_dict={'X-DOM-Snapshot': str (True),
                                'X-Chrome-Viewport': viewport})
                writer.write_record (record)

    def emulateScreenMetrics (l):
        """
        Emulate different screen sizes, causing the site to fetch assets (img
        srcset and css, for example) for different screen resolutions.

        The last entry resets the override to a plain 1920x1080 desktop
        display.
        """
        cssPpi = 96
        sizes = [
                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
                # very dense display
                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
                # just a few samples:
                # 1st gen iPhone (portrait mode)
                {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
                # 6th gen iPhone (portrait mode)
                {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
                # and reset
                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
                ]
        for s in sizes:
            l.tab.Emulation.setDeviceMetricsOverride (**s)
            l.wait (1)
        # XXX: this seems to be broken, it does not clear the override
        #tab.Emulation.clearDeviceMetricsOverride ()
        # wait until assets finished loading
        l.waitIdle (2, 10)

    def loadScripts (paths, scripts=None):
        """
        Read all script files in paths and return them joined by newlines.

        Paths that do not exist on disk are looked up through packageData.
        scripts is an optional list of script sources that is put in front of
        the loaded files; it is not modified.
        """
        # work on a copy: a mutable default would be shared between calls and
        # a list handed in by the caller must not grow behind its back
        scripts = [] if scripts is None else list (scripts)
        for p in paths:
            if not os.path.exists (p):
                # search for defaults scripts in package data directory
                p = packageData (p)
            # scripts are stored in the WARC as UTF-8, so read them as UTF-8
            # instead of relying on the locale’s default encoding
            with open (p, 'r', encoding='utf-8') as fd:
                scripts.append (fd.read ())
        return '\n'.join (scripts)

    def writeScript (path, source, writer):
        """
        Write the JavaScript source as a metadata record to the WARC, using
        packageUrl (path) as the record’s URL.
        """
        record = writer.create_warc_record (packageUrl (path), 'metadata',
                payload=BytesIO (source.encode ('utf8')),
                warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
        writer.write_record (record)

    logger = logging.getLogger(__name__)
    logging.basicConfig (level=logging.DEBUG)

    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
    parser.add_argument('--browser', default='http://127.0.0.1:9222', help='DevTools URL', metavar='URL')
    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
    parser.add_argument('url', help='Website URL')
    parser.add_argument('output', help='WARC filename')

    args = parser.parse_args ()

    stopVarname = '__' + __package__ + '_stop__'
    # avoid sites messing with our scripts by using a random stop variable name
    newStopVarname = randomString ()
    onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
    stopVarname = newStopVarname

    browser = pychrome.Browser(url=args.browser)

    # the context manager makes sure the output file is flushed and closed,
    # even if archiving fails half-way through
    with open (args.output, 'wb') as fd:
        writer = WARCWriter (fd, gzip=True)

        with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
                maxBodySize=args.maxBodySize) as l:
            version = l.tab.Browser.getVersion ()
            payload = {
                    'software': __package__,
                    'browser': version['product'],
                    'useragent': version['userAgent'],
                    'viewport': getFormattedViewportMetrics (l.tab),
                    }
            warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
            writer.write_record (warcinfo)
            # save onload script as well
            writeScript ('onload', onload, writer)

            # inject our custom javascript to the page before loading
            l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
            l.start ()

            l.waitIdle (args.idleTimeout, args.timeout)

            # get ready for snapshot: stop loading and scripts, disable events
            l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
            # if we stopped due to timeout, wait for remaining assets
            l.waitIdle (2, 10)

            emulateScreenMetrics (l)

            l.stop ()

            script = loadScripts (args.onsnapshot)
            writeScript ('onsnapshot', script, writer)
            l.tab.Runtime.evaluate (expression=script, returnByValue=True)
            writeDOMSnapshot (l.tab, writer)

    return True