diff options
| -rw-r--r-- | crocoite/browser.py | 6 | ||||
| -rw-r--r-- | crocoite/cli.py | 13 | ||||
| -rw-r--r-- | crocoite/controller.py | 36 | ||||
| -rw-r--r-- | crocoite/devtools.py | 14 | ||||
| -rw-r--r-- | crocoite/html.py | 8 | ||||
| -rw-r--r-- | crocoite/irc.py | 8 | ||||
| -rw-r--r-- | crocoite/logger.py | 2 | ||||
| -rw-r--r-- | crocoite/test_browser.py | 5 | ||||
| -rw-r--r-- | crocoite/test_devtools.py | 2 | ||||
| -rw-r--r-- | crocoite/tools.py | 2 | ||||
| -rw-r--r-- | crocoite/util.py | 2 | ||||
| -rw-r--r-- | crocoite/warc.py | 4 | 
12 files changed, 44 insertions, 58 deletions
| diff --git a/crocoite/browser.py b/crocoite/browser.py index 91f0a0a..44b94e1 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -25,8 +25,6 @@ Chrome browser interactions.  import asyncio  from urllib.parse import urlsplit  from base64 import b64decode -from collections import deque -from threading import Event  from http.server import BaseHTTPRequestHandler  from .logger import Level @@ -40,7 +38,7 @@ class Item:      __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished',              'isRedirect', 'failed', 'body', 'requestBody') -    def __init__ (self, tab): +    def __init__ (self):          self.chromeRequest = {}          self.chromeResponse = {}          self.chromeFinished = {} @@ -274,7 +272,7 @@ class SiteLoader:              else:                  logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') -        item = Item (self.tab) +        item = Item ()          item.setRequest (kwargs)          self.requests[reqId] = item          logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') diff --git a/crocoite/cli.py b/crocoite/cli.py index 8ebf557..e4a46ee 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,12 +22,13 @@  Command line interface  """ -import argparse, json, sys, signal +import argparse, sys, signal, asyncio, os  from enum import IntEnum  from . import behavior -from .controller import SinglePageController, defaultSettings, \ -        ControllerSettings, StatsHandler, LogHandler +from .controller import SinglePageController, \ +        ControllerSettings, StatsHandler, LogHandler, \ +        RecursiveController, DepthLimit, PrefixLimit  from .devtools import Passthrough, Process  from .warc import WarcHandler  from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer @@ -79,9 +80,6 @@ def single ():      return ret -import asyncio, os -from .controller import RecursiveController, DepthLimit, PrefixLimit -  def parsePolicy (recursive, url):      if recursive is None:          return DepthLimit (0) @@ -89,8 +87,7 @@ def parsePolicy (recursive, url):          return DepthLimit (int (recursive))      elif recursive == 'prefix':          return PrefixLimit (url) -    else: -        raise ValueError ('Unsupported') +    raise ValueError ('Unsupported')  def recursive ():      logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) diff --git a/crocoite/controller.py b/crocoite/controller.py index 3acbf26..1a41117 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -22,6 +22,17 @@  Controller classes, handling actions required for archival  """ +import time, platform +import tempfile, asyncio, json, os +from itertools import islice +from datetime import datetime +from urllib.parse import urlparse + +from . import behavior as cbehavior +from .browser import SiteLoader, Item +from .util import getFormattedViewportMetrics, getRequirements, removeFragment +from .behavior import ExtractLinksEvent +  class ControllerSettings:      __slots__ = ('idleTimeout', 'timeout') @@ -47,7 +58,7 @@ class EventHandler:          raise NotImplementedError ()  class StatsHandler (EventHandler): -    __slots__ = ('stats') +    __slots__ = ('stats', )      acceptException = True @@ -63,13 +74,10 @@ class StatsHandler (EventHandler):                  self.stats['finished'] += 1                  self.stats['bytesRcv'] += item.encodedDataLength -from .behavior import ExtractLinksEvent -from itertools import islice -  class LogHandler (EventHandler):      """ Handle items by logging information about them """ -    __slots__ = ('logger') +    __slots__ = ('logger', )      def __init__ (self, logger):          self.logger = logger.bind (context=type (self).__name__) @@ -86,15 +94,9 @@ class LogHandler (EventHandler):                  self.logger.info ('extracted links', context=type (item).__name__,                          uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks) -import time, platform - -from . import behavior as cbehavior -from .browser import SiteLoader, Item -from .devtools import Process -from .util import getFormattedViewportMetrics, getRequirements  class ControllerStart: -    __slots__ = ('payload') +    __slots__ = ('payload', )      def __init__ (self, payload):          self.payload = payload @@ -214,7 +216,7 @@ class DepthLimit (RecursionPolicy):      depth==0 means no recursion, depth==1 is the page and outgoing links      """ -    __slots__ = ('maxdepth') +    __slots__ = ('maxdepth', )      def __init__ (self, maxdepth=0):          if maxdepth < 0 or maxdepth > 1: @@ -240,7 +242,7 @@ class PrefixLimit (RecursionPolicy):      accepted: http://example.com/foobar http://example.com/foo/bar      """ -    __slots__ = ('prefix') +    __slots__ = ('prefix', )      def __init__ (self, prefix):          self.prefix = prefix @@ -248,12 +250,6 @@ class PrefixLimit (RecursionPolicy):      def __call__ (self, urls):          return set (filter (lambda u: u.startswith (self.prefix), urls)) -import tempfile, asyncio, json, os -from datetime import datetime -from urllib.parse import urlparse -from .behavior import ExtractLinksEvent -from .util import removeFragment -  class RecursiveController:      """      Simple recursive controller diff --git a/crocoite/devtools.py b/crocoite/devtools.py index 0bf2255..b071d2e 100644 --- a/crocoite/devtools.py +++ b/crocoite/devtools.py @@ -22,7 +22,10 @@  Communication with Google Chrome through its DevTools protocol.  """ -import aiohttp, websockets, json, asyncio, logging +import json, asyncio, logging, os +from tempfile import mkdtemp +import shutil +import aiohttp, websockets  logger = logging.getLogger (__name__) @@ -228,8 +231,7 @@ class Tab:              if '.' in name:                  n, ext = name.split ('.', 1)                  return getattrRecursive (getattr (obj, n), ext) -            else: -                return getattr (obj, name) +            return getattr (obj, name)          if self.crashed:              raise Crashed () @@ -252,10 +254,6 @@ class Tab:          await ret.run ()          return ret -import os, time -from tempfile import mkdtemp -import shutil -  class Process:      """ Start Google Chrome listening on a random port """ @@ -327,7 +325,7 @@ class Process:          return False  class Passthrough: -    __slots__ = ('url') +    __slots__ = ('url', )      def __init__ (self, url):          self.url = url diff --git a/crocoite/html.py b/crocoite/html.py index c929a10..fec9760 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -22,6 +22,10 @@  HTML helper  """ +from html5lib.treewalkers.base import TreeWalker +from html5lib.filters.base import Filter +from html5lib import constants +  # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements  voidTags = {'area',          'base', @@ -103,10 +107,6 @@ eventAttributes = {'onabort',          'onvolumechange',          'onwaiting'} -from html5lib.treewalkers.base import TreeWalker -from html5lib.filters.base import Filter -from html5lib import constants -  class ChromeTreeWalker (TreeWalker):      """      Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument diff --git a/crocoite/irc.py b/crocoite/irc.py index 095c55f..99485e4 100644 --- a/crocoite/irc.py +++ b/crocoite/irc.py @@ -29,8 +29,8 @@ from enum import IntEnum, Enum  from collections import defaultdict  from abc import abstractmethod  from functools import wraps -from io import BytesIO  import bottom +import websockets  ### helper functions ###  def prettyTimeDelta (seconds): @@ -333,11 +333,11 @@ class ArgparseBot (bottom.Client):                  with self._quit:                      await args.func (user=user, args=args, reply=reply) -    async def onDisconnect (**kwargs): +    async def onDisconnect (self, **kwargs):          """ Auto-reconnect """          self.logger.info ('disconnect', uuid='4c74b2c8-2403-4921-879d-2279ad85db72')          if not self._quit.armed: -            await asynio.sleep (10, loop=self.loop) +            await asyncio.sleep (10, loop=self.loop)              self.logger.info ('reconnect', uuid='c53555cb-e1a4-4b69-b1c9-3320269c19d7')              await self.connect () @@ -492,8 +492,6 @@ class Chromebot (ArgparseBot):          if job.process and job.process.returncode is None:              job.process.terminate () -import websockets -  class Dashboard:      __slots__ = ('fd', 'clients', 'loop', 'log', 'maxLog', 'pingInterval', 'pingTimeout')      # these messages will not be forwarded to the browser diff --git a/crocoite/logger.py b/crocoite/logger.py index e69df5e..cddc42d 100644 --- a/crocoite/logger.py +++ b/crocoite/logger.py @@ -85,7 +85,7 @@ class Logger:          self.consumer.remove (consumer)  class Consumer: -    def __call__ (self, level, *args, **kwargs): # pragma: no cover +    def __call__ (self, **kwargs): # pragma: no cover          raise NotImplementedError ()  class NullConsumer (Consumer): diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index f72d899..8adf0cd 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -18,7 +18,6 @@  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN  # THE SOFTWARE. -import logging  import asyncio  import pytest  from operator import itemgetter @@ -26,7 +25,7 @@ from aiohttp import web  from http.server import BaseHTTPRequestHandler  from .browser import Item, SiteLoader -from .logger import Logger, Consumer, JsonPrintConsumer +from .logger import Logger, Consumer  from .devtools import Crashed, Process  # if you want to know what’s going on: @@ -39,7 +38,7 @@ class TItem (Item):      base = 'http://localhost:8000/'      def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False): -        super ().__init__ (tab=None) +        super ().__init__ ()          self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}}          self.body = bodyReceive, False          self.bodySend = bodyReceive if not bodySend else bodySend diff --git a/crocoite/test_devtools.py b/crocoite/test_devtools.py index 8676e6c..74d223f 100644 --- a/crocoite/test_devtools.py +++ b/crocoite/test_devtools.py @@ -103,7 +103,7 @@ async def test_tab_crash (tab):      # caling anything else now should fail as well      with pytest.raises (Crashed): -        version = await tab.Browser.getVersion () +        await tab.Browser.getVersion ()  @pytest.mark.asyncio  async def test_load (tab, server): diff --git a/crocoite/tools.py b/crocoite/tools.py index da32f85..843270e 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,7 +22,7 @@  Misc tools  """ -import shutil, sys, re, os, logging, argparse +import shutil, sys, os, logging, argparse  from warcio.archiveiterator import ArchiveIterator  from warcio.warcwriter import WARCWriter diff --git a/crocoite/util.py b/crocoite/util.py index daa60db..18a051a 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -23,7 +23,7 @@ Random utility functions  """  import random, sys -import hashlib, os, pkg_resources +import hashlib, pkg_resources  from urllib.parse import urlsplit, urlunsplit  def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): diff --git a/crocoite/warc.py b/crocoite/warc.py index c1cbff2..ebc460d 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -24,15 +24,15 @@ Classes writing data to WARC files  import json, threading  from io import BytesIO -from warcio.statusandheaders import StatusAndHeaders  from urllib.parse import urlsplit  from datetime import datetime  from warcio.timeutils import datetime_to_iso_date  from warcio.warcwriter import WARCWriter +from warcio.statusandheaders import StatusAndHeaders  from .util import packageUrl -from .controller import defaultSettings, EventHandler, ControllerStart +from .controller import EventHandler, ControllerStart  from .behavior import Script, DomSnapshotEvent, ScreenshotEvent  from .browser import Item | 
