diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2017-11-25 12:03:17 +0100 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2017-11-25 12:03:17 +0100 | 
| commit | de94e6bc320ddc38f4b0baf006c254378be5d845 (patch) | |
| tree | 03d816ff1cf01464dce5e046ce811fb649540824 | |
| parent | c1fda1e1899d12d6a582b07a6a69b4e2743867ac (diff) | |
| download | crocoite-de94e6bc320ddc38f4b0baf006c254378be5d845.tar.gz crocoite-de94e6bc320ddc38f4b0baf006c254378be5d845.tar.bz2 crocoite-de94e6bc320ddc38f4b0baf006c254378be5d845.zip | |
Rename --run-before-snapshot and document --on* options
| -rw-r--r-- | README.rst | 18 | ||||
| -rw-r--r-- | crocoite/cli.py | 6 | 
2 files changed, 20 insertions, 4 deletions
| @@ -10,10 +10,11 @@ Dependencies  - Python 3  - pychrome_   - warcio_ -- html5lib +- html5lib_  .. _pychrome: https://github.com/fate0/pychrome  .. _warcio: https://github.com/webrecorder/warcio +.. _html5lib: https://github.com/html5lib/html5lib-python  Usage  ----- @@ -31,6 +32,21 @@ For `headless Google Chrome`_ add the parameters ``--headless --disable-gpu``.  .. _pywb: https://github.com/ikreymer/pywb  .. _headless Google Chrome: https://developers.google.com/web/updates/2017/04/headless-chrome +Injecting JavaScript +^^^^^^^^^^^^^^^^^^^^ + +A lot of sites need some form of interaction to load more content. Twitter for +instance continuously loads new posts when scrolling to the bottom of the page. +crocoite can emulate these user interactions by injecting JavaScript into the +page before loading it. For instance ``--onload=scroll.js`` scrolls the page to +the bottom. + +If extra work is required before taking a DOM snapshot, additional scripts can +be run with ``--onsnapshot=canvas-snapshot.js``, which replaces all HTML +``<canvas>`` elements with a static picture of their current contents. + +Example scripts can be found in the directory ``crocoite/data/``. +  Caveats  ------- diff --git a/crocoite/cli.py b/crocoite/cli.py index 60d6661..f070625 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -391,10 +391,10 @@ def main ():      parser.add_argument('--browser', default='http://127.0.0.1:9222', help='DevTools URL')      parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival')      parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout') -    parser.add_argument('--onload', action='append', help='')      parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer')      parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') -    parser.add_argument('--run-before-snapshot', default=[], action='append', dest='runBeforeSnapshot', help='Run JavaScript files before creating DOM snapshot') +    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page') +    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot')      parser.add_argument('url', help='Website URL')      parser.add_argument('output', help='WARC filename') @@ -481,7 +481,7 @@ def main ():      tab.Network.loadingFailed = None      tab.Page.loadEventFired = None -    script = loadScripts (args.runBeforeSnapshot) +    script = loadScripts (args.onsnapshot)      writeScript ('onsnapshot', script, writer)      tab.Runtime.evaluate (expression=script, returnByValue=True)      writeDOMSnapshot (tab, writer) | 
