summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- crocoite/browser.py 6
-rw-r--r-- crocoite/cli.py 13
-rw-r--r-- crocoite/controller.py 36
-rw-r--r-- crocoite/devtools.py 14
-rw-r--r-- crocoite/html.py 8
-rw-r--r-- crocoite/irc.py 8
-rw-r--r-- crocoite/logger.py 2
-rw-r--r-- crocoite/test_browser.py 5
-rw-r--r-- crocoite/test_devtools.py 2
-rw-r--r-- crocoite/tools.py 2
-rw-r--r-- crocoite/util.py 2
-rw-r--r-- crocoite/warc.py 4
12 files changed, 44 insertions, 58 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 91f0a0a..44b94e1 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -25,8 +25,6 @@ Chrome browser interactions.
import asyncio
from urllib.parse import urlsplit
from base64 import b64decode
-from collections import deque
-from threading import Event
from http.server import BaseHTTPRequestHandler
from .logger import Level
@@ -40,7 +38,7 @@ class Item:
__slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished',
'isRedirect', 'failed', 'body', 'requestBody')
- def __init__ (self, tab):
+ def __init__ (self):
self.chromeRequest = {}
self.chromeResponse = {}
self.chromeFinished = {}
@@ -274,7 +272,7 @@ class SiteLoader:
else:
logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
- item = Item (self.tab)
+ item = Item ()
item.setRequest (kwargs)
self.requests[reqId] = item
logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 8ebf557..e4a46ee 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,12 +22,13 @@
Command line interface
"""
-import argparse, json, sys, signal
+import argparse, sys, signal, asyncio, os
from enum import IntEnum
from . import behavior
-from .controller import SinglePageController, defaultSettings, \
- ControllerSettings, StatsHandler, LogHandler
+from .controller import SinglePageController, \
+ ControllerSettings, StatsHandler, LogHandler, \
+ RecursiveController, DepthLimit, PrefixLimit
from .devtools import Passthrough, Process
from .warc import WarcHandler
from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
@@ -79,9 +80,6 @@ def single ():
return ret
-import asyncio, os
-from .controller import RecursiveController, DepthLimit, PrefixLimit
-
def parsePolicy (recursive, url):
if recursive is None:
return DepthLimit (0)
@@ -89,8 +87,7 @@ def parsePolicy (recursive, url):
return DepthLimit (int (recursive))
elif recursive == 'prefix':
return PrefixLimit (url)
- else:
- raise ValueError ('Unsupported')
+ raise ValueError ('Unsupported')
def recursive ():
logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 3acbf26..1a41117 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -22,6 +22,17 @@
Controller classes, handling actions required for archival
"""
+import time, platform
+import tempfile, asyncio, json, os
+from itertools import islice
+from datetime import datetime
+from urllib.parse import urlparse
+
+from . import behavior as cbehavior
+from .browser import SiteLoader, Item
+from .util import getFormattedViewportMetrics, getRequirements, removeFragment
+from .behavior import ExtractLinksEvent
+
class ControllerSettings:
__slots__ = ('idleTimeout', 'timeout')
@@ -47,7 +58,7 @@ class EventHandler:
raise NotImplementedError ()
class StatsHandler (EventHandler):
- __slots__ = ('stats')
+ __slots__ = ('stats', )
acceptException = True
@@ -63,13 +74,10 @@ class StatsHandler (EventHandler):
self.stats['finished'] += 1
self.stats['bytesRcv'] += item.encodedDataLength
-from .behavior import ExtractLinksEvent
-from itertools import islice
-
class LogHandler (EventHandler):
""" Handle items by logging information about them """
- __slots__ = ('logger')
+ __slots__ = ('logger', )
def __init__ (self, logger):
self.logger = logger.bind (context=type (self).__name__)
@@ -86,15 +94,9 @@ class LogHandler (EventHandler):
self.logger.info ('extracted links', context=type (item).__name__,
uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks)
-import time, platform
-
-from . import behavior as cbehavior
-from .browser import SiteLoader, Item
-from .devtools import Process
-from .util import getFormattedViewportMetrics, getRequirements
class ControllerStart:
- __slots__ = ('payload')
+ __slots__ = ('payload', )
def __init__ (self, payload):
self.payload = payload
@@ -214,7 +216,7 @@ class DepthLimit (RecursionPolicy):
depth==0 means no recursion, depth==1 is the page and outgoing links
"""
- __slots__ = ('maxdepth')
+ __slots__ = ('maxdepth', )
def __init__ (self, maxdepth=0):
if maxdepth < 0 or maxdepth > 1:
@@ -240,7 +242,7 @@ class PrefixLimit (RecursionPolicy):
accepted: http://example.com/foobar http://example.com/foo/bar
"""
- __slots__ = ('prefix')
+ __slots__ = ('prefix', )
def __init__ (self, prefix):
self.prefix = prefix
@@ -248,12 +250,6 @@ class PrefixLimit (RecursionPolicy):
def __call__ (self, urls):
return set (filter (lambda u: u.startswith (self.prefix), urls))
-import tempfile, asyncio, json, os
-from datetime import datetime
-from urllib.parse import urlparse
-from .behavior import ExtractLinksEvent
-from .util import removeFragment
-
class RecursiveController:
"""
Simple recursive controller
diff --git a/crocoite/devtools.py b/crocoite/devtools.py
index 0bf2255..b071d2e 100644
--- a/crocoite/devtools.py
+++ b/crocoite/devtools.py
@@ -22,7 +22,10 @@
Communication with Google Chrome through its DevTools protocol.
"""
-import aiohttp, websockets, json, asyncio, logging
+import json, asyncio, logging, os
+from tempfile import mkdtemp
+import shutil
+import aiohttp, websockets
logger = logging.getLogger (__name__)
@@ -228,8 +231,7 @@ class Tab:
if '.' in name:
n, ext = name.split ('.', 1)
return getattrRecursive (getattr (obj, n), ext)
- else:
- return getattr (obj, name)
+ return getattr (obj, name)
if self.crashed:
raise Crashed ()
@@ -252,10 +254,6 @@ class Tab:
await ret.run ()
return ret
-import os, time
-from tempfile import mkdtemp
-import shutil
-
class Process:
""" Start Google Chrome listening on a random port """
@@ -327,7 +325,7 @@ class Process:
return False
class Passthrough:
- __slots__ = ('url')
+ __slots__ = ('url', )
def __init__ (self, url):
self.url = url
diff --git a/crocoite/html.py b/crocoite/html.py
index c929a10..fec9760 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -22,6 +22,10 @@
HTML helper
"""
+from html5lib.treewalkers.base import TreeWalker
+from html5lib.filters.base import Filter
+from html5lib import constants
+
# HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
voidTags = {'area',
'base',
@@ -103,10 +107,6 @@ eventAttributes = {'onabort',
'onvolumechange',
'onwaiting'}
-from html5lib.treewalkers.base import TreeWalker
-from html5lib.filters.base import Filter
-from html5lib import constants
-
class ChromeTreeWalker (TreeWalker):
"""
Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
diff --git a/crocoite/irc.py b/crocoite/irc.py
index 095c55f..99485e4 100644
--- a/crocoite/irc.py
+++ b/crocoite/irc.py
@@ -29,8 +29,8 @@ from enum import IntEnum, Enum
from collections import defaultdict
from abc import abstractmethod
from functools import wraps
-from io import BytesIO
import bottom
+import websockets
### helper functions ###
def prettyTimeDelta (seconds):
@@ -333,11 +333,11 @@ class ArgparseBot (bottom.Client):
with self._quit:
await args.func (user=user, args=args, reply=reply)
- async def onDisconnect (**kwargs):
+ async def onDisconnect (self, **kwargs):
""" Auto-reconnect """
self.logger.info ('disconnect', uuid='4c74b2c8-2403-4921-879d-2279ad85db72')
if not self._quit.armed:
- await asynio.sleep (10, loop=self.loop)
+ await asyncio.sleep (10, loop=self.loop)
self.logger.info ('reconnect', uuid='c53555cb-e1a4-4b69-b1c9-3320269c19d7')
await self.connect ()
@@ -492,8 +492,6 @@ class Chromebot (ArgparseBot):
if job.process and job.process.returncode is None:
job.process.terminate ()
-import websockets
-
class Dashboard:
__slots__ = ('fd', 'clients', 'loop', 'log', 'maxLog', 'pingInterval', 'pingTimeout')
# these messages will not be forwarded to the browser
diff --git a/crocoite/logger.py b/crocoite/logger.py
index e69df5e..cddc42d 100644
--- a/crocoite/logger.py
+++ b/crocoite/logger.py
@@ -85,7 +85,7 @@ class Logger:
self.consumer.remove (consumer)
class Consumer:
- def __call__ (self, level, *args, **kwargs): # pragma: no cover
+ def __call__ (self, **kwargs): # pragma: no cover
raise NotImplementedError ()
class NullConsumer (Consumer):
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index f72d899..8adf0cd 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -18,7 +18,6 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
-import logging
import asyncio
import pytest
from operator import itemgetter
@@ -26,7 +25,7 @@ from aiohttp import web
from http.server import BaseHTTPRequestHandler
from .browser import Item, SiteLoader
-from .logger import Logger, Consumer, JsonPrintConsumer
+from .logger import Logger, Consumer
from .devtools import Crashed, Process
# if you want to know what’s going on:
@@ -39,7 +38,7 @@ class TItem (Item):
base = 'http://localhost:8000/'
def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False):
- super ().__init__ (tab=None)
+ super ().__init__ ()
self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}}
self.body = bodyReceive, False
self.bodySend = bodyReceive if not bodySend else bodySend
diff --git a/crocoite/test_devtools.py b/crocoite/test_devtools.py
index 8676e6c..74d223f 100644
--- a/crocoite/test_devtools.py
+++ b/crocoite/test_devtools.py
@@ -103,7 +103,7 @@ async def test_tab_crash (tab):
# caling anything else now should fail as well
with pytest.raises (Crashed):
- version = await tab.Browser.getVersion ()
+ await tab.Browser.getVersion ()
@pytest.mark.asyncio
async def test_load (tab, server):
diff --git a/crocoite/tools.py b/crocoite/tools.py
index da32f85..843270e 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -22,7 +22,7 @@
Misc tools
"""
-import shutil, sys, re, os, logging, argparse
+import shutil, sys, os, logging, argparse
from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
diff --git a/crocoite/util.py b/crocoite/util.py
index daa60db..18a051a 100644
--- a/crocoite/util.py
+++ b/crocoite/util.py
@@ -23,7 +23,7 @@ Random utility functions
"""
import random, sys
-import hashlib, os, pkg_resources
+import hashlib, pkg_resources
from urllib.parse import urlsplit, urlunsplit
def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
diff --git a/crocoite/warc.py b/crocoite/warc.py
index c1cbff2..ebc460d 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,15 +24,15 @@ Classes writing data to WARC files
import json, threading
from io import BytesIO
-from warcio.statusandheaders import StatusAndHeaders
from urllib.parse import urlsplit
from datetime import datetime
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
from .util import packageUrl
-from .controller import defaultSettings, EventHandler, ControllerStart
+from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item