diff options
-rw-r--r-- | crocoite/cli.py | 19 | ||||
-rw-r--r-- | crocoite/irc.py | 254 | ||||
-rw-r--r-- | setup.py | 2 |
3 files changed, 275 insertions, 0 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index 3f5904c..dadfc45 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -107,3 +107,22 @@ def recursive (): loop.run_until_complete(controller.run ()) loop.close() +def irc (): + from configparser import ConfigParser + from .irc import Bot + + config = ConfigParser () + config.read ('chromebot.ini') + s = config['irc'] + + bot = Bot ( + host=s.get ('host'), + port=s.getint ('port'), + ssl=s.getboolean ('ssl'), + nick=s.get ('nick'), + channels=[s.get ('channel')], + tempdir=s.get ('tempdir'), + destdir=s.get ('destdir')) + bot.loop.create_task(bot.connect()) + bot.loop.run_forever() + diff --git a/crocoite/irc.py b/crocoite/irc.py new file mode 100644 index 0000000..fea09b3 --- /dev/null +++ b/crocoite/irc.py @@ -0,0 +1,254 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +IRC bot “chromebot” +""" + +import asyncio, argparse, uuid, json, tempfile +from datetime import datetime +from urllib.parse import urlsplit +from enum import IntEnum +import bottom + +### helper functions ### +def prettyTimeDelta (seconds): + """ + Pretty-print seconds to human readable string 1d 1h 1m 1s + """ + seconds = int(seconds) + days, seconds = divmod(seconds, 86400) + hours, seconds = divmod(seconds, 3600) + minutes, seconds = divmod(seconds, 60) + s = [(days, 'd'), (hours, 'h'), (minutes, 'm'), (seconds, 's')] + s = filter (lambda x: x[0] != 0, s) + return ' '.join (map (lambda x: '{}{}'.format (*x), s)) + +def prettyBytes (b): + """ + Pretty-print bytes + """ + prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB'] + while b >= 1024 and len (prefixes) > 1: + b /= 1024 + prefixes.pop (0) + return '{:.1f} {}'.format (b, prefixes[0]) + +def isValidUrl (s): + url = urlsplit (s) + if url.scheme and url.netloc and url.scheme in {'http', 'https'}: + return s + raise TypeError () + +class NonExitingArgumentParser (argparse.ArgumentParser): + """ Argument parser that does not call exit(), suitable for interactive use """ + def exit (self, status=0, message=None): + # should never be called + pass + + def error (self, message): + raise Exception (message) + + def format_usage (self): + return super().format_usage ().replace ('\n', ' ') + +class Status(IntEnum): + undefined = 0 + pending = 1 + running = 2 + aborted = 3 + finished = 4 + +class Job: + __slots__ = ('id', 'stats', 'rstats', 'started', 'finished', 'nick', 'status', 'process', 'url') + + def __init__ (self, url, nick): + self.id = str (uuid.uuid4 ()) + self.stats = {} + self.rstats = {} + self.started = datetime.utcnow () + self.finished = None + self.url = url + # user who scheduled this job + self.nick = nick + self.status = Status.pending + self.process = None + + def formatStatus (self): + stats = self.stats + rstats = self.rstats + return '{} ({}) {}. {} pages finished, {} pending; {} crashed, {} requests, {} failed, {} received.'.format ( + self.url, + self.id, + self.status.name, + rstats.get ('have', 0), + rstats.get ('pending', 0), + stats.get ('crashed', 0), + stats.get ('requests', 0), + stats.get ('failed', 0), + prettyBytes (stats.get ('bytesRcv', 0))) + +class Bot(bottom.Client): + __slots__ = ('jobs', 'channels', 'nick', 'tempdir', 'destdir', 'parser') + + def __init__ (self, host, port, ssl, nick, channels=[], tempdir=tempfile.gettempdir(), destdir='.'): + super().__init__ (host=host, port=port, ssl=ssl) + self.jobs = {} + self.channels = channels + self.nick = nick + self.tempdir = tempdir + self.destdir = destdir + + self.parser = NonExitingArgumentParser (prog=self.nick + ': ', add_help=False) + subparsers = self.parser.add_subparsers(help='Sub-commands') + + archiveparser = subparsers.add_parser('a', help='Archive a site') + #archiveparser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC', choices=[60, 1*60*60, 2*60*60]) + #archiveparser.add_argument('--idle-timeout', default=10, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC', choices=[1, 10, 20, 30, 60]) + #archiveparser.add_argument('--max-body-size', default=None, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES', choices=[1*1024*1024, 10*1024*1024, 100*1024*1024]) + archiveparser.add_argument('--concurrency', '-j', default=1, type=int, help='Parallel workers for this job', choices=range (9)) + archiveparser.add_argument('--recursive', '-r', help='Enable recursion', choices=['0', '1', 'prefix'], default='0') + archiveparser.add_argument('url', help='Website URL', type=isValidUrl) + archiveparser.set_defaults (func=self.handleArchive) + + statusparser = subparsers.add_parser ('s', help='Get job status') + statusparser.add_argument('id', help='Job id', metavar='UUID') + statusparser.set_defaults (func=self.handleStatus) + + abortparser = subparsers.add_parser ('r', help='Revoke/abort job') + abortparser.add_argument('id', help='Job id', metavar='UUID') + abortparser.set_defaults (func=self.handleAbort) + + # register bottom event handler + self.on('CLIENT_CONNECT', self.onConnect) + self.on('PING', self.onKeepalive) + self.on('PRIVMSG', self.onMessage) + self.on('CLIENT_DISCONNECT', self.onDisconnect) + + async def handleArchive (self, args, nick, target, message, **kwargs): + """ Handle the archive command """ + + j = Job (args.url, nick) + assert j.id not in self.jobs, 'duplicate job id' + self.jobs[j.id] = j + + cmdline = ['crocoite-recursive', args.url, '--tempdir', self.tempdir, + '--prefix', j.id + '-{host}-{date}-', '--policy', + args.recursive, '--concurrency', str (args.concurrency), + self.destdir] + + showargs = { + 'recursive': args.recursive, + 'concurrency': args.concurrency, + } + strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ())) + self.send ('PRIVMSG', target=target, message='{}: {} has been queued as {} with {}'.format ( + nick, args.url, j.id, strargs)) + + j.process = await asyncio.create_subprocess_exec (*cmdline, stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, stdin=asyncio.subprocess.DEVNULL) + while True: + data = await j.process.stdout.readline () + if not data: + break + + # job is marked running after the first message is received from it + if j.status == Status.pending: + j.status = Status.running + + data = json.loads (data) + msgid = data.get ('uuid') + if msgid == '24d92d16-770e-4088-b769-4020e127a7ff': + j.stats = data + elif msgid == '5b8498e4-868d-413c-a67e-004516b8452c': + j.rstats = data + code = await j.process.wait () + + if j.status == Status.running: + j.status = Status.finished + j.finished = datetime.utcnow () + + stats = j.stats + rstats = j.rstats + self.send ('PRIVMSG', target=target, message='{}: {}'.format (nick, j.formatStatus ())) + + async def handleStatus (self, args, nick, target, message, **kwargs): + """ Handle status command """ + + j = self.jobs.get (args.id, None) + if not j: + self.send ('PRIVMSG', target=target, message='{}: Job {} is unknown'.format (nick, args.id)) + else: + rstats = j.rstats + self.send ('PRIVMSG', target=target, message='{}: {}'.format (nick, j.formatStatus ())) + + async def handleAbort (self, args, nick, target, message, **kwargs): + """ Handle abort command """ + + j = self.jobs.get (args.id, None) + if not j: + self.send ('PRIVMSG', target=target, message='{}: Job {} is unknown'.format (nick, args.id)) + else: + j.status = Status.aborted + j.process.terminate () + + async def onConnect (self, **kwargs): + + self.send('NICK', nick=self.nick) + self.send('USER', user=self.nick, realname='https://github.com/PromyLOPh/crocoite') + + # Don't try to join channels until the server has + # sent the MOTD, or signaled that there's no MOTD. + done, pending = await asyncio.wait( + [self.wait('RPL_ENDOFMOTD'), self.wait('ERR_NOMOTD')], + loop=self.loop, return_when=asyncio.FIRST_COMPLETED) + + # Cancel whichever waiter's event didn't come in. + for future in pending: + future.cancel() + + for c in self.channels: + self.send('JOIN', channel=c) + + async def onKeepalive (self, message, **kwargs): + """ Ping received """ + self.send('PONG', message=message) + + async def onMessage (self, nick, target, message, **kwargs): + """ Message received """ + if target in self.channels and message.startswith (self.nick): + # channel message that starts with our nick + command = message.split (' ')[1:] + try: + args = self.parser.parse_args (command) + except Exception as e: + self.send ('PRIVMSG', target=target, message='{} -- {}'.format (e.args[0], self.parser.format_usage ())) + return + if not args: + self.send ('PRIVMSG', target=target, message='Sorry, I don’t understand {}'.format (command)) + return + + await args.func (args, nick, target, message, **kwargs) + + async def onDisconnect (**kwargs): + """ Auto-reconnect """ + await asynio.sleep (10, loop=self.loop) + await self.connect () + @@ -13,11 +13,13 @@ setup( 'pychrome', 'warcio', 'html5lib>=0.999999999', + 'bottom', ], entry_points={ 'console_scripts': [ 'crocoite-grab = crocoite.cli:single', 'crocoite-recursive = crocoite.cli:recursive', + 'crocoite-irc = crocoite.cli:irc', 'crocoite-merge-warc = crocoite.tools:mergeWarc', 'crocoite-extract-screenshot = crocoite.tools:extractScreenshot', ], |