From f816319081d5253974ddb70b655d55f4a880a77a Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Wed, 6 Dec 2017 19:37:53 +0100
Subject: Start Chrome browser instance

Unless --browser argument is given. Uses sane settings and a temporary
profile directory.
---
 crocoite/browser.py | 52 ++++++++++++++++++++++++++++++
 crocoite/cli.py     | 93 ++++++++++++++++++++++++++++-------------------------
 2 files changed, 101 insertions(+), 44 deletions(-)

(limited to 'crocoite')

diff --git a/crocoite/browser.py b/crocoite/browser.py
index e58fce8..765acbb 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -207,3 +207,55 @@ class SiteLoader:
         self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
         item = self.requests.pop (reqId, None)
 
+import subprocess
+from tempfile import mkdtemp
+from contextlib import contextmanager
+import socket, shutil
+
+@contextmanager
+def ChromeService (binary='google-chrome-stable', host='localhost', port=9222, windowSize=(1920, 1080)):
+    """
+    Start Chrome with socket activation (i.e. pass listening socket). Polling
+    is not required with this method, since reads will block until Chrome is
+    ready.
+    """
+    s = socket.socket ()
+    s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    s.bind ((host, port))
+    s.listen (10)
+    userDataDir = mkdtemp ()
+    args = [binary,
+            '--window-size={},{}'.format (*windowSize),
+            '--user-data-dir={}'.format (userDataDir), # use temporary user dir
+            '--no-default-browser-check',
+            '--no-first-run', # don’t show first run screen
+            '--disable-breakpad', # no error reports
+            '--disable-extensions',
+            '--disable-infobars',
+            '--disable-notifications', # no libnotify
+            '--headless',
+            '--disable-gpu',
+            '--hide-scrollbars', # hide scrollbars on screenshots
+            '--mute-audio', # don’t play any audio
+            '--remote-debugging-socket-fd={}'.format (s.fileno ()),
+            '--homepage=about:blank',
+            'about:blank']
+    # start new session, so ^C does not affect subprocess
+    p = subprocess.Popen (args, pass_fds=[s.fileno()], start_new_session=True,
+            stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL)
+
+    # must be wrapped in try-finally, otherwise code in __exit__/finally is not
+    # executed
+    try:
+        yield 'http://{}:{}'.format (host, port)
+    finally:
+        p.terminate ()
+        p.wait ()
+        shutil.rmtree (userDataDir)
+
+@contextmanager
+def NullService (url):
+    yield url
+
+
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 62f488a..8a55269 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -27,7 +27,7 @@ def main ():
     from warcio.warcwriter import WARCWriter
     from warcio.statusandheaders import StatusAndHeaders
     from html5lib.serializer import HTMLSerializer
-    from . import html, packageData, packageUrl
+    from . import html, packageData, packageUrl, browser
     from .warc import WarcLoader
     from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
 
@@ -138,7 +138,7 @@ def main ():
     logging.basicConfig (level=logging.DEBUG)
 
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
-    parser.add_argument('--browser', default='http://127.0.0.1:9222', help='DevTools URL', metavar='URL')
+    parser.add_argument('--browser', help='DevTools URL', metavar='URL')
     parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
     parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
     parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
@@ -159,48 +159,53 @@ def main ():
     onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
     stopVarname = newStopVarname
 
-    browser = pychrome.Browser(url=args.browser)
-
-    fd = open (args.output, 'wb')
-    writer = WARCWriter (fd, gzip=True)
-
-    with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
-            maxBodySize=args.maxBodySize) as l:
-        version = l.tab.Browser.getVersion ()
-        payload = {
-                'software': __package__,
-                'browser': version['product'],
-                'useragent': version['userAgent'],
-                'viewport': getFormattedViewportMetrics (l.tab),
-                }
-        warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
-        writer.write_record (warcinfo)
-        # save onload script as well
-        writeScript ('onload', onload, writer)
-
-        # inject our custom javascript to the page before loading
-        l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
-        l.start ()
-
-        l.waitIdle (args.idleTimeout, args.timeout)
-
-        # get ready for snapshot: stop loading and scripts, disable events
-        l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
-        # if we stopped due to timeout, wait for remaining assets
-        l.waitIdle (2, 10)
-
-        emulateScreenMetrics (l)
-
-        l.stop ()
-
-        if args.domSnapshot:
-            script = loadScripts (args.onsnapshot)
-            writeScript ('onsnapshot', script, writer)
-            l.tab.Runtime.evaluate (expression=script, returnByValue=True)
-            writeDOMSnapshot (l.tab, writer)
-
-        if args.screenshot:
-            writeScreenshot (l.tab, writer)
+    service = browser.ChromeService ()
+    if args.browser:
+        service = browser.NullService (args.browser)
+
+    with service as browserUrl:
+        browser = pychrome.Browser(url=browserUrl)
+
+        fd = open (args.output, 'wb')
+        writer = WARCWriter (fd, gzip=True)
+
+        with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
+                maxBodySize=args.maxBodySize) as l:
+            version = l.tab.Browser.getVersion ()
+            payload = {
+                    'software': __package__,
+                    'browser': version['product'],
+                    'useragent': version['userAgent'],
+                    'viewport': getFormattedViewportMetrics (l.tab),
+                    }
+            warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
+            writer.write_record (warcinfo)
+            # save onload script as well
+            writeScript ('onload', onload, writer)
+
+            # inject our custom javascript to the page before loading
+            l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
+            l.start ()
+
+            l.waitIdle (args.idleTimeout, args.timeout)
+
+            # get ready for snapshot: stop loading and scripts, disable events
+            l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
+            # if we stopped due to timeout, wait for remaining assets
+            l.waitIdle (2, 10)
+
+            emulateScreenMetrics (l)
+
+            l.stop ()
+
+            if args.domSnapshot:
+                script = loadScripts (args.onsnapshot)
+                writeScript ('onsnapshot', script, writer)
+                l.tab.Runtime.evaluate (expression=script, returnByValue=True)
+                writeDOMSnapshot (l.tab, writer)
+
+            if args.screenshot:
+                writeScreenshot (l.tab, writer)
 
     return True
 
-- 
cgit v1.2.3