author Lars-Dominik Braun <> 2017-11-25 12:03:17 +0100
committer Lars-Dominik Braun <> 2017-11-25 12:03:17 +0100
commit de94e6bc320ddc38f4b0baf006c254378be5d845 (patch)
parent c1fda1e1899d12d6a582b07a6a69b4e2743867ac (diff)
Rename --run-before-snapshot and document --on* options
2 files changed, 20 insertions, 4 deletions
diff --git a/README.rst b/README.rst
index 760742b..262258b 100644
--- a/README.rst
+++ b/README.rst
@@ -10,10 +10,11 @@ Dependencies
- Python 3
- pychrome_
- warcio_
-- html5lib
+- html5lib_
.. _pychrome:
.. _warcio:
+.. _html5lib:
@@ -31,6 +32,21 @@ For `headless Google Chrome`_ add the parameters ``--headless --disable-gpu``.
.. _pywb:
.. _headless Google Chrome:
+Injecting JavaScript
+A lot of sites need some form of interaction to load more content. Twitter for
+instance continuously loads new posts when scrolling to the bottom of the page.
+crocoite can emulate these user interactions by injecting JavaScript into the
+page before loading it. For instance ``--onload=scroll.js`` scrolls the page to
+the bottom.
+If extra work is required before taking a DOM snapshot, additional scripts can
+be run with ``--onsnapshot=canvas-snapshot.js``, which replaces all HTML
+``<canvas>`` elements with a static picture of their current contents.
+Example scripts can be found in the directory ``crocoite/data/``.
diff --git a/crocoite/ b/crocoite/
index 60d6661..f070625 100644
--- a/crocoite/
+++ b/crocoite/
@@ -391,10 +391,10 @@ def main ():
parser.add_argument('--browser', default='', help='DevTools URL')
parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival')
parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout')
- parser.add_argument('--onload', action='append', help='')
parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer')
parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
- parser.add_argument('--run-before-snapshot', default=[], action='append', dest='runBeforeSnapshot', help='Run JavaScript files before creating DOM snapshot')
+ parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page')
+ parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot')
parser.add_argument('url', help='Website URL')
parser.add_argument('output', help='WARC filename')
@@ -481,7 +481,7 @@ def main ():
tab.Network.loadingFailed = None
tab.Page.loadEventFired = None
- script = loadScripts (args.runBeforeSnapshot)
+ script = loadScripts (args.onsnapshot)
writeScript ('onsnapshot', script, writer)
tab.Runtime.evaluate (expression=script, returnByValue=True)
writeDOMSnapshot (tab, writer)