summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2018-12-01 13:14:06 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2018-12-01 13:56:44 +0100
commit: 22adde79940d32c5f094f26f3e18b7160e7ccafc (patch)
tree: 8fb59939a8281e33c2c05c140409430c30ed5c58
parent: 6176991ac7ff0e6dcb4612b43da89abd350e3aa5 (diff)
download: crocoite-22adde79940d32c5f094f26f3e18b7160e7ccafc.tar.gz
crocoite-22adde79940d32c5f094f26f3e18b7160e7ccafc.tar.bz2
crocoite-22adde79940d32c5f094f26f3e18b7160e7ccafc.zip
behavior: Move click script data to external file
First step of issue #3
-rw-r--r-- README.rst 2
-rw-r--r-- crocoite/behavior.py 27
-rw-r--r-- crocoite/data/click.js 217
-rw-r--r-- crocoite/data/click.yaml 70
-rw-r--r-- crocoite/data/scroll.js 4
-rw-r--r-- setup.py 1
6 files changed, 172 insertions, 149 deletions
diff --git a/README.rst b/README.rst
index 428744f..459f201 100644
--- a/README.rst
+++ b/README.rst
@@ -18,6 +18,7 @@ Quick start
These dependencies must be present to run crocoite:
- Python ≥3.6
+- PyYAML_
- aiohttp_
- websockets_
- warcio_
@@ -25,6 +26,7 @@ These dependencies must be present to run crocoite:
- bottom_ (IRC client)
- `Google Chrome`_
+.. _PyYAML: https://pyyaml.org/wiki/PyYAML
.. _aiohttp: https://aiohttp.readthedocs.io/
.. _websockets: https://websockets.readthedocs.io/
.. _warcio: https://github.com/webrecorder/warcio
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 5f66538..8cc7ab4 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -30,6 +30,7 @@ from collections import OrderedDict
import pkg_resources
from html5lib.serializer import HTMLSerializer
+import yaml
from .util import randomString, getFormattedViewportMetrics, removeFragment
from . import html
@@ -107,7 +108,7 @@ class HostnameFilter:
class JsOnload (Behavior):
""" Execute JavaScript on page load """
- __slots__ = ('script', 'context')
+ __slots__ = ('script', 'context', 'options')
scriptPath = None
@@ -115,15 +116,32 @@ class JsOnload (Behavior):
super ().__init__ (loader, logger)
self.script = Script (self.scriptPath)
self.context = None
+ # options passed to constructor
+ self.options = {}
async def onload (self):
tab = self.loader.tab
yield self.script
+
+ # This is slightly awkward, since we cannot compile the class into an
+ # objectId and then reference it. Therefore the script must return a
+ # class constructor, which is then called with a generic options
+ # parameter.
+ # XXX: is there a better way to do this?
result = await tab.Runtime.evaluate (expression=str (self.script))
exception = result.get ('exceptionDetails', None)
result = result['result']
- assert result['type'] == 'object'
+ assert result['type'] == 'function', result
assert result.get ('subtype') != 'error', exception
+ constructor = result['objectId']
+
+ result = await tab.Runtime.callFunctionOn (
+ functionDeclaration='function(options){return new this(options);}',
+ objectId=constructor,
+ arguments=[{'value': self.options}])
+ result = result['result']
+ assert result['type'] == 'object', result
+ assert result.get ('subtype') != 'error', result
self.context = result['objectId']
async def onstop (self):
@@ -278,6 +296,11 @@ class Click (JsOnload):
name = 'click'
scriptPath = 'click.js'
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
+ with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd:
+ self.options['sites'] = list (yaml.load_all (fd))
+
class ExtractLinksEvent:
__slots__ = ('links', )
diff --git a/crocoite/data/click.js b/crocoite/data/click.js
index b3e425b..ae189da 100644
--- a/crocoite/data/click.js
+++ b/crocoite/data/click.js
@@ -4,129 +4,7 @@
* like navigating to a different location. Thus whitelist known elements.
*/
-(function(){
-const selectorFlag = Object.freeze ({
- none: 0,
- multi: 1, /* click item multiple times */
-});
-const defaultClickThrottle = 50; /* in ms */
-const discoverInterval = 1000; /* 1 second */
-const sites = Object.freeze ([
- {
- hostname: /^www\.facebook\.com$/i,
- selector: [
- /* show more comments */
- {s: 'a.UFIPagerLink[role=button]', flags: selectorFlag.none},
- /* show nested comments*/
- {s: 'a.UFICommentLink[role=button]', flags: selectorFlag.none},
- /* initially show comments below a single post/video, i.e. /user/post/123 */
- {s: 'form.commentable_item a[data-comment-prelude-ref=action_link_bling][rel=ignore]', flags: selectorFlag.none},
- /* close the “register now” nag screen. for better screen shots */
- {s: 'a#expanding_cta_close_button[role=button]', flags: selectorFlag.none},
- ],
- }, {
- hostname: /^twitter\.com$/i,
- selector: [
- /* expand threads */
- {s: 'a.ThreadedConversation-moreRepliesLink', flags: selectorFlag.none},
- /* show hidden profiles */
- {s: 'button.ProfileWarningTimeline-button', flags: selectorFlag.none},
- /* show hidden/sensitive media */
- {s: 'button.Tombstone-action.js-display-this-media', flags: selectorFlag.none},
- ],
- }, {
- hostname: /^disqus\.com$/i,
- selector: [
- /* load more comments */
- {s: 'a.load-more__button', flags: selectorFlag.multi},
- ],
- }, {
- hostname: /^(www|np)\.reddit\.com$/i,
- selector: [
- /* show more comments, reddit’s javascript ignores events if too
- * frequent */
- {s: 'span.morecomments a', flags: selectorFlag.none, throttle: 500},
- ],
- }, {
- hostname: /^www\.instagram\.com$/i,
- selector: [
- /* posts may have multiple images that load dynamically, click the arrow */
- {s: 'a[role=button].coreSpriteRightChevron', flags: selectorFlag.multi, throttle: 500},
- /* load more comments */
- {s: 'article div ul li a[role=button]', flags: selectorFlag.multi},
- ],
- }, {
- hostname: /^www\.youtube\.com$/i,
- selector: [
- /* expand comment thread */
- {s: 'ytd-comment-thread-renderer div.more-button', flags: selectorFlag.none},
- ],
- }, {
- hostname: /^www\.patreon\.com$/i,
- selector: [
- /* load more content */
- {s: 'div[display=flex] div[display=block] button[color=gray][type=button]', flags: selectorFlag.multi},
- /* load more comments */
- {s: 'div.stackable[display=block] > div > div > a[color=dark][target=_self]', flags: selectorFlag.none},
- /* load more replies */
- {s: 'div > a[scale="0"][color=blue][size="1"]', flags: selectorFlag.none},
- ],
- }, {
- hostname: /^(www\.)?gab\.ai$/i,
- selector: [
- /* post comments */
- {s: 'post-detail post-comment .post-comment__replies__count a', flags: selectorFlag.none},
- /* more comments */
- {s: 'post-detail .post-comment-list__loading a', flags: selectorFlag.none},
- /* more posts */
- {s: 'post-list a.post-list__load-more', flags: selectorFlag.multi},
- ],
- }, {
- hostname: /^(www\.)?github\.com$/i,
- selector: [
- /* show hidden issue items, see https://github.com/dominictarr/event-stream/issues/116 */
- {s: 'div#discussion_bucket form.ajax-pagination-form button.ajax-pagination-btn', flags: selectorFlag.none},
- ],
- }
- ]);
-
-/* pick selectors matching current location */
-let hostname = document.location.hostname;
-let selector = [];
-for (let s of sites) {
- if (s.hostname.test (hostname)) {
- selector = selector.concat (s.selector);
- }
-}
-
-function makeClickEvent () {
- return new MouseEvent('click', {
- view: window,
- bubbles: true,
- cancelable: true
- });
-}
-
-/* throttle clicking */
-let queue = [];
-let clickTimeout = null;
-function click () {
- if (queue.length > 0) {
- const item = queue.shift ();
- const o = item.o;
- const selector = item.selector;
- o.dispatchEvent (makeClickEvent ());
-
- if (queue.length > 0) {
- const nextTimeout = 'throttle' in selector ?
- selector.throttle : defaultClickThrottle;
- clickTimeout = window.setTimeout (click, nextTimeout);
- } else {
- clickTimeout = null;
- }
- }
-}
-
+(function() {
/* Element is visible if itself and all of its parents are
*/
function isVisible (o) {
@@ -148,33 +26,82 @@ function isClickable (o) {
return !o.hasAttribute ('disabled') && isVisible (o);
}
-/* some sites don’t remove/replace the element immediately, so keep track of
- * which ones we already clicked */
-let have = new Set ();
-function discover () {
- for (let s of selector) {
- let obj = document.querySelectorAll (s.s);
- for (let o of obj) {
- if (!have.has (o) && isClickable (o)) {
- queue.push ({o: o, selector: s});
- if (!(s.flags & selectorFlag.multi)) {
- have.add (o);
- }
+const defaultClickThrottle = 50; /* in ms */
+const discoverInterval = 1000; /* 1 second */
+
+class Click {
+ constructor(options) {
+ /* pick selectors matching current location */
+ let hostname = document.location.hostname;
+ this.selector = [];
+ for (let s of options['sites']) {
+ let r = new RegExp (s.match, 'i');
+ if (r.test (hostname)) {
+ this.selector = this.selector.concat (s.selector);
}
}
+ /* throttle clicking */
+ this.queue = [];
+ this.clickTimeout = null;
+
+ /* some sites don’t remove/replace the element immediately, so keep track of
+ * which ones we already clicked */
+ this.have = new Set ();
+
+ /* XXX: can we use a mutation observer instead? */
+ this.interval = window.setInterval (this.discover.bind (this), discoverInterval);
}
- if (queue.length > 0 && clickTimeout === null) {
- /* start clicking immediately */
- clickTimeout = window.setTimeout (click, 0);
+
+ makeClickEvent () {
+ return new MouseEvent('click', {
+ view: window,
+ bubbles: true,
+ cancelable: true
+ });
+ }
+
+ click () {
+ if (this.queue.length > 0) {
+ const item = this.queue.shift ();
+ const o = item.o;
+ const selector = item.selector;
+ o.dispatchEvent (this.makeClickEvent ());
+
+ if (this.queue.length > 0) {
+ const nextTimeout = 'throttle' in selector ?
+ selector.throttle : defaultClickThrottle;
+ this.clickTimeout = window.setTimeout (this.click.bind (this), nextTimeout);
+ } else {
+ this.clickTimeout = null;
+ }
+ }
+ }
+
+ discover () {
+ for (let s of this.selector) {
+ let obj = document.querySelectorAll (s.selector);
+ for (let o of obj) {
+ if (!this.have.has (o) && isClickable (o)) {
+ this.queue.push ({o: o, selector: s});
+ if (!s.multi) {
+ this.have.add (o);
+ }
+ }
+ }
+ }
+ if (this.queue.length > 0 && this.clickTimeout === null) {
+ /* start clicking immediately */
+ this.clickTimeout = window.setTimeout (this.click.bind (this), 0);
+ }
+ return true;
}
- return true;
-}
-/* XXX: can we use a mutation observer instead? */
-let interval = window.setInterval (discover, discoverInterval);
-function stop() {
- window.clearInterval (interval);
+ stop () {
+ window.clearInterval (this.interval);
+ window.clearTimeout (this.clickTimeout);
+ }
}
-return {'stop': stop};
+
+return Click;
}())
diff --git a/crocoite/data/click.yaml b/crocoite/data/click.yaml
new file mode 100644
index 0000000..d9557eb
--- /dev/null
+++ b/crocoite/data/click.yaml
@@ -0,0 +1,70 @@
+# configuration for behavior.py:Click
+match: ^www\.facebook\.com$
+selector:
+ - description: show more comments
+ selector: a.UFIPagerLink[role=button]
+ - description: show nested comments
+ selector: a.UFICommentLink[role=button]
+ - description: initially show comments below a single post/video, i.e. /user/post/123
+ selector: form.commentable_item a[data-comment-prelude-ref=action_link_bling][rel=ignore]
+ - description: close the “register now” nag screen. for better screen shots
+ selector: a#expanding_cta_close_button[role=button]
+---
+match: ^twitter\.com$
+selector:
+ - description: expand threads
+ selector: a.ThreadedConversation-moreRepliesLink
+ - description: show hidden profiles
+ selector: button.ProfileWarningTimeline-button
+ - description: show hidden/sensitive media
+ selector: button.Tombstone-action.js-display-this-media
+---
+match: ^disqus\.com$
+selector:
+ - description: load more comments
+ selector: a.load-more__button
+ multi: True
+---
+match: ^(www|np)\.reddit\.com$
+selector:
+ - description: show more comments, reddit’s javascript ignores events if too frequent
+ selector: span.morecomments a
+ throttle: 500
+---
+match: ^www\.instagram\.com$
+selector:
+ - description: load more comments
+ selector: article div ul li button[type=button]
+ multi: True
+ urls: ["https://www.instagram.com/p/BqvAm_XnmdJ/"]
+---
+match: ^www\.youtube\.com$
+selector:
+ - description: expand comment thread
+ selector: ytd-comment-thread-renderer div.more-button
+---
+match: ^www\.patreon\.com$
+selector:
+ - description: load more content
+ selector: div[display=flex] div[display=block] button[color=gray][type=button]
+ multi: True
+ - description: load more comments
+ selector: div.stackable[display=block] > div > div > a[color=dark][target=_self]
+ - description: load more replies
+ selector: div > a[scale="0"][color=blue][size="1"]
+---
+match: ^(www\.)?gab\.ai$
+selector:
+ - description: post comments
+ selector: post-detail post-comment .post-comment__replies__count a
+ - description: more comments
+ selector: post-detail .post-comment-list__loading a
+ - description: more posts
+ selector: post-list a.post-list__load-more
+ multi: True
+---
+match: ^(www\.)?github\.com$
+selector:
+ - description: show hidden issue items
+ urls: ["https://github.com/dominictarr/event-stream/issues/116"]
+ selector: div#discussion_bucket form.ajax-pagination-form button.ajax-pagination-btn
diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js
index aacfe83..be88edf 100644
--- a/crocoite/data/scroll.js
+++ b/crocoite/data/scroll.js
@@ -2,7 +2,7 @@
*/
(function(){
class Scroll {
- constructor () {
+ constructor (options) {
this.scrolled = new Map ();
this.interval = window.setInterval (this.scroll.bind (this), 200);
}
@@ -34,5 +34,5 @@ class Scroll {
}
}
-return new Scroll();
+return Scroll;
}())
diff --git a/setup.py b/setup.py
index 8b21983..5ae7e65 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@ setup(
'pytz',
'websockets',
'aiohttp',
+ 'PyYAML',
],
entry_points={
'console_scripts': [