summary | refs | log | tree | commit | diff
path: root/crocoite
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2017-12-06 19:37:53 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2017-12-06 19:37:53 +0100
commit: f816319081d5253974ddb70b655d55f4a880a77a (patch)
tree: 2188686ad3fd606f5bcab05fc8da271b6e106713 /crocoite
parent: 1b3710f9586acca56c8384afe60e51b5ae16fe36 (diff)
download: crocoite-f816319081d5253974ddb70b655d55f4a880a77a.tar.gz
crocoite-f816319081d5253974ddb70b655d55f4a880a77a.tar.bz2
crocoite-f816319081d5253974ddb70b655d55f4a880a77a.zip
Start Chrome browser instance
Unless --browser argument is given. Uses sane settings and a temporary profile directory.
Diffstat (limited to 'crocoite')
-rw-r--r-- crocoite/browser.py 52
-rw-r--r-- crocoite/cli.py 93
2 files changed, 101 insertions, 44 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index e58fce8..765acbb 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -207,3 +207,55 @@ class SiteLoader:
self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
item = self.requests.pop (reqId, None)
+import subprocess
+from tempfile import mkdtemp
+from contextlib import contextmanager
+import socket, shutil
+
+@contextmanager
+def ChromeService (binary='google-chrome-stable', host='localhost', port=9222, windowSize=(1920, 1080)):
+ """
+ Start Chrome with socket activation (i.e. pass listening socket). Polling
+ is not required with this method, since reads will block until Chrome is
+ ready.
+ """
+ s = socket.socket ()
+ s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ s.bind ((host, port))
+ s.listen (10)
+ userDataDir = mkdtemp ()
+ args = [binary,
+ '--window-size={},{}'.format (*windowSize),
+ '--user-data-dir={}'.format (userDataDir), # use temporary user dir
+ '--no-default-browser-check',
+ '--no-first-run', # don’t show first run screen
+ '--disable-breakpad', # no error reports
+ '--disable-extensions',
+ '--disable-infobars',
+ '--disable-notifications', # no libnotify
+ '--headless',
+ '--disable-gpu',
+ '--hide-scrollbars', # hide scrollbars on screenshots
+ '--mute-audio', # don’t play any audio
+ '--remote-debugging-socket-fd={}'.format (s.fileno ()),
+ '--homepage=about:blank',
+ 'about:blank']
+ # start new session, so ^C does not affect subprocess
+ p = subprocess.Popen (args, pass_fds=[s.fileno()], start_new_session=True,
+ stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL)
+
+ # must be wrapped in try-finally, otherwise code in __exit__/finally is not
+ # executed
+ try:
+ yield 'http://{}:{}'.format (host, port)
+ finally:
+ p.terminate ()
+ p.wait ()
+ shutil.rmtree (userDataDir)
+
+@contextmanager
+def NullService (url):
+ yield url
+
+
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 62f488a..8a55269 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -27,7 +27,7 @@ def main ():
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders
from html5lib.serializer import HTMLSerializer
- from . import html, packageData, packageUrl
+ from . import html, packageData, packageUrl, browser
from .warc import WarcLoader
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
@@ -138,7 +138,7 @@ def main ():
logging.basicConfig (level=logging.DEBUG)
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
- parser.add_argument('--browser', default='http://127.0.0.1:9222', help='DevTools URL', metavar='URL')
+ parser.add_argument('--browser', help='DevTools URL', metavar='URL')
parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
@@ -159,48 +159,53 @@ def main ():
onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
stopVarname = newStopVarname
- browser = pychrome.Browser(url=args.browser)
-
- fd = open (args.output, 'wb')
- writer = WARCWriter (fd, gzip=True)
-
- with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
- maxBodySize=args.maxBodySize) as l:
- version = l.tab.Browser.getVersion ()
- payload = {
- 'software': __package__,
- 'browser': version['product'],
- 'useragent': version['userAgent'],
- 'viewport': getFormattedViewportMetrics (l.tab),
- }
- warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
- writer.write_record (warcinfo)
- # save onload script as well
- writeScript ('onload', onload, writer)
-
- # inject our custom javascript to the page before loading
- l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
- l.start ()
-
- l.waitIdle (args.idleTimeout, args.timeout)
-
- # get ready for snapshot: stop loading and scripts, disable events
- l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
- # if we stopped due to timeout, wait for remaining assets
- l.waitIdle (2, 10)
-
- emulateScreenMetrics (l)
-
- l.stop ()
-
- if args.domSnapshot:
- script = loadScripts (args.onsnapshot)
- writeScript ('onsnapshot', script, writer)
- l.tab.Runtime.evaluate (expression=script, returnByValue=True)
- writeDOMSnapshot (l.tab, writer)
-
- if args.screenshot:
- writeScreenshot (l.tab, writer)
+ service = browser.ChromeService ()
+ if args.browser:
+ service = browser.NullService (args.browser)
+
+ with service as browserUrl:
+ browser = pychrome.Browser(url=browserUrl)
+
+ fd = open (args.output, 'wb')
+ writer = WARCWriter (fd, gzip=True)
+
+ with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
+ maxBodySize=args.maxBodySize) as l:
+ version = l.tab.Browser.getVersion ()
+ payload = {
+ 'software': __package__,
+ 'browser': version['product'],
+ 'useragent': version['userAgent'],
+ 'viewport': getFormattedViewportMetrics (l.tab),
+ }
+ warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
+ writer.write_record (warcinfo)
+ # save onload script as well
+ writeScript ('onload', onload, writer)
+
+ # inject our custom javascript to the page before loading
+ l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
+ l.start ()
+
+ l.waitIdle (args.idleTimeout, args.timeout)
+
+ # get ready for snapshot: stop loading and scripts, disable events
+ l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
+ # if we stopped due to timeout, wait for remaining assets
+ l.waitIdle (2, 10)
+
+ emulateScreenMetrics (l)
+
+ l.stop ()
+
+ if args.domSnapshot:
+ script = loadScripts (args.onsnapshot)
+ writeScript ('onsnapshot', script, writer)
+ l.tab.Runtime.evaluate (expression=script, returnByValue=True)
+ writeDOMSnapshot (l.tab, writer)
+
+ if args.screenshot:
+ writeScreenshot (l.tab, writer)
return True