From 22adde79940d32c5f094f26f3e18b7160e7ccafc Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 1 Dec 2018 13:14:06 +0100 Subject: behavior: Move click script data to external file First step of issue #3 --- README.rst | 2 + crocoite/behavior.py | 27 +++++- crocoite/data/click.js | 217 ++++++++++++++++------------------------------- crocoite/data/click.yaml | 70 +++++++++++++++ crocoite/data/scroll.js | 4 +- setup.py | 1 + 6 files changed, 172 insertions(+), 149 deletions(-) create mode 100644 crocoite/data/click.yaml diff --git a/README.rst b/README.rst index 428744f..459f201 100644 --- a/README.rst +++ b/README.rst @@ -18,6 +18,7 @@ Quick start These dependencies must be present to run crocoite: - Python ≥3.6 +- PyYAML_ - aiohttp_ - websockets_ - warcio_ @@ -25,6 +26,7 @@ These dependencies must be present to run crocoite: - bottom_ (IRC client) - `Google Chrome`_ +.. _PyYAML: https://pyyaml.org/wiki/PyYAML .. _aiohttp: https://aiohttp.readthedocs.io/ .. _websockets: https://websockets.readthedocs.io/ .. _warcio: https://github.com/webrecorder/warcio diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 5f66538..8cc7ab4 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -30,6 +30,7 @@ from collections import OrderedDict import pkg_resources from html5lib.serializer import HTMLSerializer +import yaml from .util import randomString, getFormattedViewportMetrics, removeFragment from . import html @@ -107,7 +108,7 @@ class HostnameFilter: class JsOnload (Behavior): """ Execute JavaScript on page load """ - __slots__ = ('script', 'context') + __slots__ = ('script', 'context', 'options') scriptPath = None @@ -115,15 +116,32 @@ class JsOnload (Behavior): super ().__init__ (loader, logger) self.script = Script (self.scriptPath) self.context = None + # options passed to constructor + self.options = {} async def onload (self): tab = self.loader.tab yield self.script + + # This is slightly awkward, since we cannot compile the class into an + # objectId and then reference it. Therefore the script must return a + # class constructor, which is then called with a generic options + # parameter. + # XXX: is there a better way to do this? result = await tab.Runtime.evaluate (expression=str (self.script)) exception = result.get ('exceptionDetails', None) result = result['result'] - assert result['type'] == 'object' + assert result['type'] == 'function', result assert result.get ('subtype') != 'error', exception + constructor = result['objectId'] + + result = await tab.Runtime.callFunctionOn ( + functionDeclaration='function(options){return new this(options);}', + objectId=constructor, + arguments=[{'value': self.options}]) + result = result['result'] + assert result['type'] == 'object', result + assert result.get ('subtype') != 'error', result self.context = result['objectId'] async def onstop (self): @@ -278,6 +296,11 @@ class Click (JsOnload): name = 'click' scriptPath = 'click.js' + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: + self.options['sites'] = list (yaml.load_all (fd)) + class ExtractLinksEvent: __slots__ = ('links', ) diff --git a/crocoite/data/click.js b/crocoite/data/click.js index b3e425b..ae189da 100644 --- a/crocoite/data/click.js +++ b/crocoite/data/click.js @@ -4,129 +4,7 @@ * like navigating to a different location. Thus whitelist known elements. */ -(function(){ -const selectorFlag = Object.freeze ({ - none: 0, - multi: 1, /* click item multiple times */ -}); -const defaultClickThrottle = 50; /* in ms */ -const discoverInterval = 1000; /* 1 second */ -const sites = Object.freeze ([ - { - hostname: /^www\.facebook\.com$/i, - selector: [ - /* show more comments */ - {s: 'a.UFIPagerLink[role=button]', flags: selectorFlag.none}, - /* show nested comments*/ - {s: 'a.UFICommentLink[role=button]', flags: selectorFlag.none}, - /* initially show comments below a single post/video, i.e. /user/post/123 */ - {s: 'form.commentable_item a[data-comment-prelude-ref=action_link_bling][rel=ignore]', flags: selectorFlag.none}, - /* close the “register now” nag screen. for better screen shots */ - {s: 'a#expanding_cta_close_button[role=button]', flags: selectorFlag.none}, - ], - }, { - hostname: /^twitter\.com$/i, - selector: [ - /* expand threads */ - {s: 'a.ThreadedConversation-moreRepliesLink', flags: selectorFlag.none}, - /* show hidden profiles */ - {s: 'button.ProfileWarningTimeline-button', flags: selectorFlag.none}, - /* show hidden/sensitive media */ - {s: 'button.Tombstone-action.js-display-this-media', flags: selectorFlag.none}, - ], - }, { - hostname: /^disqus\.com$/i, - selector: [ - /* load more comments */ - {s: 'a.load-more__button', flags: selectorFlag.multi}, - ], - }, { - hostname: /^(www|np)\.reddit\.com$/i, - selector: [ - /* show more comments, reddit’s javascript ignores events if too - * frequent */ - {s: 'span.morecomments a', flags: selectorFlag.none, throttle: 500}, - ], - }, { - hostname: /^www\.instagram\.com$/i, - selector: [ - /* posts may have multiple images that load dynamically, click the arrow */ - {s: 'a[role=button].coreSpriteRightChevron', flags: selectorFlag.multi, throttle: 500}, - /* load more comments */ - {s: 'article div ul li a[role=button]', flags: selectorFlag.multi}, - ], - }, { - hostname: /^www\.youtube\.com$/i, - selector: [ - /* expand comment thread */ - {s: 'ytd-comment-thread-renderer div.more-button', flags: selectorFlag.none}, - ], - }, { - hostname: /^www\.patreon\.com$/i, - selector: [ - /* load more content */ - {s: 'div[display=flex] div[display=block] button[color=gray][type=button]', flags: selectorFlag.multi}, - /* load more comments */ - {s: 'div.stackable[display=block] > div > div > a[color=dark][target=_self]', flags: selectorFlag.none}, - /* load more replies */ - {s: 'div > a[scale="0"][color=blue][size="1"]', flags: selectorFlag.none}, - ], - }, { - hostname: /^(www\.)?gab\.ai$/i, - selector: [ - /* post comments */ - {s: 'post-detail post-comment .post-comment__replies__count a', flags: selectorFlag.none}, - /* more comments */ - {s: 'post-detail .post-comment-list__loading a', flags: selectorFlag.none}, - /* more posts */ - {s: 'post-list a.post-list__load-more', flags: selectorFlag.multi}, - ], - }, { - hostname: /^(www\.)?github\.com$/i, - selector: [ - /* show hidden issue items, see https://github.com/dominictarr/event-stream/issues/116 */ - {s: 'div#discussion_bucket form.ajax-pagination-form button.ajax-pagination-btn', flags: selectorFlag.none}, - ], - } - ]); - -/* pick selectors matching current location */ -let hostname = document.location.hostname; -let selector = []; -for (let s of sites) { - if (s.hostname.test (hostname)) { - selector = selector.concat (s.selector); - } -} - -function makeClickEvent () { - return new MouseEvent('click', { - view: window, - bubbles: true, - cancelable: true - }); -} - -/* throttle clicking */ -let queue = []; -let clickTimeout = null; -function click () { - if (queue.length > 0) { - const item = queue.shift (); - const o = item.o; - const selector = item.selector; - o.dispatchEvent (makeClickEvent ()); - - if (queue.length > 0) { - const nextTimeout = 'throttle' in selector ? - selector.throttle : defaultClickThrottle; - clickTimeout = window.setTimeout (click, nextTimeout); - } else { - clickTimeout = null; - } - } -} - +(function() { /* Element is visible if itself and all of its parents are */ function isVisible (o) { @@ -148,33 +26,82 @@ function isClickable (o) { return !o.hasAttribute ('disabled') && isVisible (o); } -/* some sites don’t remove/replace the element immediately, so keep track of - * which ones we already clicked */ -let have = new Set (); -function discover () { - for (let s of selector) { - let obj = document.querySelectorAll (s.s); - for (let o of obj) { - if (!have.has (o) && isClickable (o)) { - queue.push ({o: o, selector: s}); - if (!(s.flags & selectorFlag.multi)) { - have.add (o); - } +const defaultClickThrottle = 50; /* in ms */ +const discoverInterval = 1000; /* 1 second */ + +class Click { + constructor(options) { + /* pick selectors matching current location */ + let hostname = document.location.hostname; + this.selector = []; + for (let s of options['sites']) { + let r = new RegExp (s.match, 'i'); + if (r.test (hostname)) { + this.selector = this.selector.concat (s.selector); } } + /* throttle clicking */ + this.queue = []; + this.clickTimeout = null; + + /* some sites don’t remove/replace the element immediately, so keep track of + * which ones we already clicked */ + this.have = new Set (); + + /* XXX: can we use a mutation observer instead? */ + this.interval = window.setInterval (this.discover.bind (this), discoverInterval); } - if (queue.length > 0 && clickTimeout === null) { - /* start clicking immediately */ - clickTimeout = window.setTimeout (click, 0); + + makeClickEvent () { + return new MouseEvent('click', { + view: window, + bubbles: true, + cancelable: true + }); + } + + click () { + if (this.queue.length > 0) { + const item = this.queue.shift (); + const o = item.o; + const selector = item.selector; + o.dispatchEvent (this.makeClickEvent ()); + + if (this.queue.length > 0) { + const nextTimeout = 'throttle' in selector ? + selector.throttle : defaultClickThrottle; + this.clickTimeout = window.setTimeout (this.click.bind (this), nextTimeout); + } else { + this.clickTimeout = null; + } + } + } + + discover () { + for (let s of this.selector) { + let obj = document.querySelectorAll (s.selector); + for (let o of obj) { + if (!this.have.has (o) && isClickable (o)) { + this.queue.push ({o: o, selector: s}); + if (!s.multi) { + this.have.add (o); + } + } + } + } + if (this.queue.length > 0 && this.clickTimeout === null) { + /* start clicking immediately */ + this.clickTimeout = window.setTimeout (this.click.bind (this), 0); + } + return true; } - return true; -} -/* XXX: can we use a mutation observer instead? */ -let interval = window.setInterval (discover, discoverInterval); -function stop() { - window.clearInterval (interval); + stop () { + window.clearInterval (this.interval); + window.clearTimeout (this.clickTimeout); + } } -return {'stop': stop}; + +return Click; }()) diff --git a/crocoite/data/click.yaml b/crocoite/data/click.yaml new file mode 100644 index 0000000..d9557eb --- /dev/null +++ b/crocoite/data/click.yaml @@ -0,0 +1,70 @@ +# configuration for behavior.py:Click +match: ^www\.facebook\.com$ +selector: + - description: show more comments + selector: a.UFIPagerLink[role=button] + - description: show nested comments + selector: a.UFICommentLink[role=button] + - description: initially show comments below a single post/video, i.e. /user/post/123 + selector: form.commentable_item a[data-comment-prelude-ref=action_link_bling][rel=ignore] + - description: close the “register now” nag screen. for better screen shots + selector: a#expanding_cta_close_button[role=button] +--- +match: ^twitter\.com$ +selector: + - description: expand threads + selector: a.ThreadedConversation-moreRepliesLink + - description: show hidden profiles + selector: button.ProfileWarningTimeline-button + - description: show hidden/sensitive media + selector: button.Tombstone-action.js-display-this-media +--- +match: ^disqus\.com$ +selector: + - description: load more comments + selector: a.load-more__button + multi: True +--- +match: ^(www|np)\.reddit\.com$ +selector: + - description: show more comments, reddit’s javascript ignores events if too frequent + selector: span.morecomments a + throttle: 500 +--- +match: ^www\.instagram\.com$ +selector: + - description: load more comments + selector: article div ul li button[type=button] + multi: True + urls: ["https://www.instagram.com/p/BqvAm_XnmdJ/"] +--- +match: ^www\.youtube\.com$ +selector: + - description: expand comment thread + selector: ytd-comment-thread-renderer div.more-button +--- +match: ^www\.patreon\.com$ +selector: + - description: load more content + selector: div[display=flex] div[display=block] button[color=gray][type=button] + multi: True + - description: load more comments + selector: div.stackable[display=block] > div > div > a[color=dark][target=_self] + - description: load more replies + selector: div > a[scale="0"][color=blue][size="1"] +--- +match: ^(www\.)?gab\.ai$ +selector: + - description: post comments + selector: post-detail post-comment .post-comment__replies__count a + - description: more comments + selector: post-detail .post-comment-list__loading a + - description: more posts + selector: post-list a.post-list__load-more + multi: True +--- +match: ^(www\.)?github\.com$ +selector: + - description: show hidden issue items + urls: ["https://github.com/dominictarr/event-stream/issues/116"] + selector: div#discussion_bucket form.ajax-pagination-form button.ajax-pagination-btn diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js index aacfe83..be88edf 100644 --- a/crocoite/data/scroll.js +++ b/crocoite/data/scroll.js @@ -2,7 +2,7 @@ */ (function(){ class Scroll { - constructor () { + constructor (options) { this.scrolled = new Map (); this.interval = window.setInterval (this.scroll.bind (this), 200); } @@ -34,5 +34,5 @@ class Scroll { } } -return new Scroll(); +return Scroll; }()) diff --git a/setup.py b/setup.py index 8b21983..5ae7e65 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ setup( 'pytz', 'websockets', 'aiohttp', + 'PyYAML', ], entry_points={ 'console_scripts': [ -- cgit v1.2.3