From f816319081d5253974ddb70b655d55f4a880a77a Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Wed, 6 Dec 2017 19:37:53 +0100 Subject: Start Chrome browser instance Unless --browser argument is given. Uses sane settings and a temporary profile directory. --- crocoite/cli.py | 93 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 44 deletions(-) (limited to 'crocoite/cli.py') diff --git a/crocoite/cli.py b/crocoite/cli.py index 62f488a..8a55269 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -27,7 +27,7 @@ def main (): from warcio.warcwriter import WARCWriter from warcio.statusandheaders import StatusAndHeaders from html5lib.serializer import HTMLSerializer - from . import html, packageData, packageUrl + from . import html, packageData, packageUrl, browser from .warc import WarcLoader from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker @@ -138,7 +138,7 @@ def main (): logging.basicConfig (level=logging.DEBUG) parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') - parser.add_argument('--browser', default='http://127.0.0.1:9222', help='DevTools URL', metavar='URL') + parser.add_argument('--browser', help='DevTools URL', metavar='URL') parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES') @@ -159,48 +159,53 @@ def main (): onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname) stopVarname = newStopVarname - browser = pychrome.Browser(url=args.browser) - - fd = open (args.output, 'wb') - writer = WARCWriter (fd, gzip=True) - - with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer, - maxBodySize=args.maxBodySize) as l: - version = l.tab.Browser.getVersion () - payload = { - 'software': __package__, - 'browser': version['product'], - 'useragent': version['userAgent'], - 'viewport': getFormattedViewportMetrics (l.tab), - } - warcinfo = writer.create_warcinfo_record (filename=None, info=payload) - writer.write_record (warcinfo) - # save onload script as well - writeScript ('onload', onload, writer) - - # inject our custom javascript to the page before loading - l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload) - l.start () - - l.waitIdle (args.idleTimeout, args.timeout) - - # get ready for snapshot: stop loading and scripts, disable events - l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) - # if we stopped due to timeout, wait for remaining assets - l.waitIdle (2, 10) - - emulateScreenMetrics (l) - - l.stop () - - if args.domSnapshot: - script = loadScripts (args.onsnapshot) - writeScript ('onsnapshot', script, writer) - l.tab.Runtime.evaluate (expression=script, returnByValue=True) - writeDOMSnapshot (l.tab, writer) - - if args.screenshot: - writeScreenshot (l.tab, writer) + service = browser.ChromeService () + if args.browser: + service = browser.NullService (args.browser) + + with service as browserUrl: + browser = pychrome.Browser(url=browserUrl) + + fd = open (args.output, 'wb') + writer = WARCWriter (fd, gzip=True) + + with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer, + maxBodySize=args.maxBodySize) as l: + version = l.tab.Browser.getVersion () + payload = { + 'software': __package__, + 'browser': version['product'], + 'useragent': version['userAgent'], + 'viewport': getFormattedViewportMetrics (l.tab), + } + warcinfo = writer.create_warcinfo_record (filename=None, info=payload) + writer.write_record (warcinfo) + # save onload script as well + writeScript ('onload', onload, writer) + + # inject our custom javascript to the page before loading + l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload) + l.start () + + l.waitIdle (args.idleTimeout, args.timeout) + + # get ready for snapshot: stop loading and scripts, disable events + l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) + # if we stopped due to timeout, wait for remaining assets + l.waitIdle (2, 10) + + emulateScreenMetrics (l) + + l.stop () + + if args.domSnapshot: + script = loadScripts (args.onsnapshot) + writeScript ('onsnapshot', script, writer) + l.tab.Runtime.evaluate (expression=script, returnByValue=True) + writeDOMSnapshot (l.tab, writer) + + if args.screenshot: + writeScreenshot (l.tab, writer) return True -- cgit v1.2.3