def prettyTimeDelta (seconds):
    """
    Pretty-print seconds to human readable string 1d 1h 1m 1s

    The input is truncated to whole seconds with int(). Units whose value is
    zero are left out, so 3600 becomes '1h' and 61 becomes '1m 1s'. A duration
    that truncates to zero is rendered as '0s' instead of an empty string, so
    callers embedding the result in a message never print a blank value.

    :param seconds: duration in seconds (int or float)
    :return: space-separated string of non-zero units
    """
    seconds = int (seconds)
    days, seconds = divmod (seconds, 86400)
    hours, seconds = divmod (seconds, 3600)
    minutes, seconds = divmod (seconds, 60)
    parts = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')]
    text = ' '.join ('{}{}'.format (value, unit) for value, unit in parts if value != 0)
    # every unit is zero -> nothing survived the filter above
    return text or '0s'
= jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'state': {}} - bot.reply ('{} has been queued as {}'.format (url, handle.id)) + + # pretty-print a few selected args + showargs = { + 'onload': ','.join (args['onload']), + 'idleTimeout': prettyTimeDelta (args['idleTimeout']), + 'timeout': prettyTimeDelta (args['timeout']), + } + strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) + bot.reply ('{} has been queued as {} with {}'.format (url, handle.id, strargs)) + try: result = handle.get (on_message=lambda x: updateState (j, x)) bot.reply ('{} ({}) finished'.format (url, handle.id)) diff --git a/crocoite/behavior.py b/crocoite/behavior.py new file mode 100644 index 0000000..13530fe --- /dev/null +++ b/crocoite/behavior.py @@ -0,0 +1,41 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
def getByUrl (url):
    """
    Get site-specific onload behavior scripts

    The hostname of the URL is matched by its registered domain, so
    subdomains such as www.twitter.com select the same script as twitter.com.

    :param url: URL of the page, as a string
    :return: list of per-site script names, empty if the site has no
        dedicated script or the URL carries no hostname
    """
    hostname = urlsplit (url).hostname
    if not hostname:
        # urlsplit yields None for URLs without a network location, for
        # example when the scheme is missing
        return []

    # labels in reverse order (TLD first); a trailing dot of a fully
    # qualified name would otherwise show up as an empty first label
    labels = hostname.rstrip ('.').split ('.')[::-1]
    if len (labels) < 2 or labels[0] != 'com':
        return []

    scripts = {
        'twitter': ['per-site/twitter.js'],
        'instagram': ['per-site/instagram.js'],
        }
    # hand out a copy, so callers may modify the result freely
    return list (scripts.get (labels[1], []))
dest='screenshot') parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot') @@ -258,14 +260,21 @@ def main (): parser.add_argument('output', help='WARC filename') args = parser.parse_args () + if args.behavior: + args.onload.extend (['scroll.js'] + behavior.getByUrl (args.url)) + + # prepare args for function distributed = args.distributed passArgs = vars (args) del passArgs['distributed'] + del passArgs['behavior'] if distributed: result = archive.delay (**passArgs) result.get (on_message=stateCallback) else: + # XXX: local evaluation does not init celery logging? + logging.basicConfig (level=logging.INFO) archive (**passArgs) return True diff --git a/crocoite/data/fixups.instagram.js b/crocoite/data/fixups.instagram.js deleted file mode 100644 index da7b5ea..0000000 --- a/crocoite/data/fixups.instagram.js +++ /dev/null @@ -1,21 +0,0 @@ -/* Fixups for instagram: searches for the “show more” button and clicks it - */ -(function(){ -function fixup () { - var links = document.querySelectorAll ("main a"); - for (var i = 0; i < links.length; i++) { - var href = links[i].getAttribute ("href"); - if (href.search (/\?max_id=\d+$/) != -1) { - var click = new MouseEvent('click', { - view: window, - bubbles: true, - cancelable: true - }); - console.log ('clicking', href); - links[i].dispatchEvent (click); - break; - } - } -} -window.addEventListener("load", fixup); -}()); diff --git a/crocoite/data/fixups.twitter.js b/crocoite/data/fixups.twitter.js deleted file mode 100644 index 330370a..0000000 --- a/crocoite/data/fixups.twitter.js +++ /dev/null @@ -1,17 +0,0 @@ -/* Fixups for twitter: Some accounts are hidden behind a “suspicious activity” - * message, click that. 
/*	Fixups for instagram: searches for the “show more” button and clicks it
 */
(function(){
/* Click the first link inside <main> whose href ends in ?max_id=<digits>.
 * Registered as load event handler below.
 */
function fixup () {
	var links = document.querySelectorAll ("main a");
	for (var i = 0; i < links.length; i++) {
		var href = links[i].getAttribute ("href");
		/* getAttribute returns null for anchors without href; skip them
		 * instead of throwing and aborting the search */
		if (href !== null && href.search (/\?max_id=\d+$/) != -1) {
			var click = new MouseEvent('click', {
				view: window,
				bubbles: true,
				cancelable: true
				});
			console.log ('clicking', href);
			links[i].dispatchEvent (click);
			/* only the first matching link is clicked */
			break;
		}
	}
}
window.addEventListener("load", fixup);
}());