summary refs log tree commit diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2017-12-19 09:14:43 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2017-12-19 09:14:43 +0100
commit 8e939f8922815bd917f4dd750aa5f8a17a8f750c (patch)
tree f30d630a33c1aba991b723da4fcb139852bac674
parent 6a771cf9df0e9446feab59b50cea2998d42a459f (diff)
download crocoite-8e939f8922815bd917f4dd750aa5f8a17a8f750c.tar.gz
crocoite-8e939f8922815bd917f4dd750aa5f8a17a8f750c.tar.bz2
crocoite-8e939f8922815bd917f4dd750aa5f8a17a8f750c.zip
Select default behavior scripts by site URL
-rw-r--r-- contrib/celerycrocoite.py 26
-rw-r--r-- crocoite/behavior.py 41
-rw-r--r-- crocoite/cli.py 11
-rw-r--r-- crocoite/data/per-site/instagram.js (renamed from crocoite/data/fixups.instagram.js) 0
-rw-r--r-- crocoite/data/per-site/twitter.js (renamed from crocoite/data/fixups.twitter.js) 0
5 files changed, 75 insertions, 3 deletions
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
index 8fab046..ede58be 100644
--- a/contrib/celerycrocoite.py
+++ b/contrib/celerycrocoite.py
@@ -29,6 +29,19 @@ import celery
from urllib.parse import urlsplit
import crocoite.cli
+from crocoite import behavior
+
+def prettyTimeDelta (seconds):
+    """
+    Pretty-print seconds to human readable string 1d 1h 1m 1s
+
+    Zero-valued units are omitted; a total of zero yields '0s'.
+    """
+    minutes, seconds = divmod (int (seconds), 60)
+    hours, minutes = divmod (minutes, 60)
+    days, hours = divmod (hours, 24)
+    s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')]
+    return ' '.join ('{}{}'.format (*x) for x in s if x[0] != 0) or '0s'
def setup (bot):
m = bot.memory['crocoite'] = SopelMemory ()
@@ -62,7 +75,7 @@ def archive (bot, trigger):
args = {
'url': url,
'output': None,
- 'onload': ['scroll.js'],
+ 'onload': ['scroll.js'] + behavior.getByUrl (url),
'onsnapshot': [],
'browser': None,
'logBuffer': 1000,
@@ -80,7 +93,16 @@ def archive (bot, trigger):
# XXX: for some reason we cannot access the job’s state through handle,
# instead use a callback quirk
j = jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'state': {}}
- bot.reply ('{} has been queued as {}'.format (url, handle.id))
+
+ # pretty-print a few selected args
+ showargs = {
+ 'onload': ','.join (args['onload']),
+ 'idleTimeout': prettyTimeDelta (args['idleTimeout']),
+ 'timeout': prettyTimeDelta (args['timeout']),
+ }
+ strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ()))
+ bot.reply ('{} has been queued as {} with {}'.format (url, handle.id, strargs))
+
try:
result = handle.get (on_message=lambda x: updateState (j, x))
bot.reply ('{} ({}) finished'.format (url, handle.id))
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
new file mode 100644
index 0000000..13530fe
--- /dev/null
+++ b/crocoite/behavior.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Per-site JavaScript injections
+"""
+
+from urllib.parse import urlsplit
+
+def getByUrl (url):
+    """
+    Get site-specific onload behavior scripts
+
+    Matches on the last two labels of the URL's hostname, so subdomains such
+    as www.twitter.com are included. Returns a list of script paths, which
+    is empty for unknown sites and for URLs without a usable hostname.
+    """
+    # hostname is None for URLs without netloc; short names have <2 labels
+    labels = (urlsplit (url).hostname or '').split ('.')[::-1]
+    if len (labels) >= 2 and labels[0] == 'com' and \
+            labels[1] in ('twitter', 'instagram'):
+        return ['per-site/{}.js'.format (labels[1])]
+    return []
+
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 3527ceb..c085326 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -143,7 +143,6 @@ def writeScreenshot (tab, writer):
'X-Chrome-Viewport': viewport})
writer.write_record (record)
-# XXX: rabbitmq is hardcoded
app = Celery ('crocoite.distributed')
app.config_from_object('celeryconfig')
logger = get_task_logger('crocoite.distributed.archive')
@@ -242,6 +241,8 @@ def stateCallback (data):
print (data['task_id'], result['step'])
def main ():
+ from crocoite import behavior
+
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
parser.add_argument('--browser', help='DevTools URL', metavar='URL')
parser.add_argument('--distributed', help='Use celery worker', action='store_true')
@@ -251,6 +252,7 @@ def main ():
parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
#parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
+ parser.add_argument('--no-behavior', default=True, action='store_false', help='Do not inject default behavior scripts', dest='behavior')
parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
@@ -258,14 +260,21 @@ def main ():
parser.add_argument('output', help='WARC filename')
args = parser.parse_args ()
+ if args.behavior:
+ args.onload.extend (['scroll.js'] + behavior.getByUrl (args.url))
+
+ # prepare args for function
distributed = args.distributed
passArgs = vars (args)
del passArgs['distributed']
+ del passArgs['behavior']
if distributed:
result = archive.delay (**passArgs)
result.get (on_message=stateCallback)
else:
+ # XXX: local evaluation does not init celery logging?
+ logging.basicConfig (level=logging.INFO)
archive (**passArgs)
return True
diff --git a/crocoite/data/fixups.instagram.js b/crocoite/data/per-site/instagram.js
index da7b5ea..da7b5ea 100644
--- a/crocoite/data/fixups.instagram.js
+++ b/crocoite/data/per-site/instagram.js
diff --git a/crocoite/data/fixups.twitter.js b/crocoite/data/per-site/twitter.js
index 330370a..330370a 100644
--- a/crocoite/data/fixups.twitter.js
+++ b/crocoite/data/per-site/twitter.js