summary | refs | log | tree | commit | diff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r-- crocoite/behavior.py 5
-rw-r--r-- crocoite/browser.py 9
-rw-r--r-- crocoite/html.py 2
-rw-r--r-- crocoite/task.py 12
-rw-r--r-- crocoite/warc.py 4
5 files changed, 10 insertions, 22 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index ffdc50c..ab859f8 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -23,7 +23,6 @@ Generic and per-site behavior scripts
"""
import logging, time
-from io import BytesIO
from urllib.parse import urlsplit
import os.path
import pkg_resources
@@ -31,13 +30,11 @@ from base64 import b64decode
from collections import OrderedDict
from html5lib.serializer import HTMLSerializer
-from warcio.statusandheaders import StatusAndHeaders
from pychrome.exceptions import TimeoutException
-from .util import randomString, packageUrl, getFormattedViewportMetrics
+from .util import randomString, getFormattedViewportMetrics
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-from .browser import SiteLoader
logger = logging.getLogger(__name__)
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 3b9f7ab..1c09598 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -27,6 +27,7 @@ from urllib.parse import urlsplit
from base64 import b64decode
from collections import deque
from threading import Event
+from http.server import BaseHTTPRequestHandler
import pychrome
@@ -323,14 +324,10 @@ class SiteLoader:
import subprocess, os, time
from tempfile import mkdtemp
-import socket, shutil
+import shutil
class ChromeService:
- """
- Start Chrome with socket activation (i.e. pass listening socket). Polling
- is not required with this method, since reads will block until Chrome is
- ready.
- """
+ """ Start Google Chrome listening on a random port """
__slots__ = ('binary', 'windowSize', 'p', 'userDataDir')
diff --git a/crocoite/html.py b/crocoite/html.py
index f891101..c929a10 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -105,7 +105,6 @@ eventAttributes = {'onabort',
from html5lib.treewalkers.base import TreeWalker
from html5lib.filters.base import Filter
-from html5lib.serializer import HTMLSerializer
from html5lib import constants
class ChromeTreeWalker (TreeWalker):
@@ -195,7 +194,6 @@ class StripAttributeFilter (Filter):
self.attributes = set (map (str.lower, attributes))
def __iter__(self):
- default_namespace = constants.namespaces["html"]
for token in Filter.__iter__(self):
data = token.get ('data')
if data and token['type'] in {'StartTag', 'EmptyTag'}:
diff --git a/crocoite/task.py b/crocoite/task.py
index 9054627..6b3c9d1 100644
--- a/crocoite/task.py
+++ b/crocoite/task.py
@@ -39,7 +39,7 @@ from celery import Celery
from celery.utils.log import get_task_logger
from .browser import ChromeService, BrowserCrashed
-from .controller import SinglePageController, ControllerSettings, RecursiveController, defaultSettings, DepthLimit, PrefixLimit, StatsHandler
+from .controller import SinglePageController, ControllerSettings, RecursiveController, defaultSettings, DepthLimit, StatsHandler
from . import behavior
from .cli import parseRecursive
from .warc import WarcHandler
@@ -82,9 +82,9 @@ def archive (self, url, settings, enabledBehaviorNames):
enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))
settings = ControllerSettings (**settings)
try:
- controller = SinglePageController (url, fd, behavior=enabledBehavior,
+ c = SinglePageController (url, fd, behavior=enabledBehavior,
settings=settings, handler=handler)
- controller.run ()
+ c.run ()
except BrowserCrashed:
# nothing we can do about that
logger.error ('browser crashed for {}'.format (url))
@@ -137,9 +137,9 @@ def controller (self, url, settings, enabledBehaviorNames, recursive, concurrenc
recursionPolicy = parseRecursive (recursive, url)
enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))
settings = ControllerSettings (**settings)
- controller = DistributedRecursiveController (url, None, behavior=enabledBehavior,
+ c = DistributedRecursiveController (url, None, behavior=enabledBehavior,
settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency)
- controller.run ()
- return dict (controller.stats)
+ c.run ()
+ return dict (c.stats)
diff --git a/crocoite/warc.py b/crocoite/warc.py
index b45bcbe..af04cf9 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -27,10 +27,7 @@ import json
from io import BytesIO
from warcio.statusandheaders import StatusAndHeaders
from urllib.parse import urlsplit
-from logging.handlers import BufferingHandler
from datetime import datetime
-from threading import Thread
-from queue import Queue
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
@@ -98,7 +95,6 @@ class WarcHandler (EventHandler):
def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded):
writer = self.writer
- reqId = item.id
resp = item.response
# now the response