diff options
-rw-r--r-- | crocoite/behavior.py | 5 | ||||
-rw-r--r-- | crocoite/browser.py | 9 | ||||
-rw-r--r-- | crocoite/html.py | 2 | ||||
-rw-r--r-- | crocoite/task.py | 12 | ||||
-rw-r--r-- | crocoite/warc.py | 4 |
5 files changed, 10 insertions, 22 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py index ffdc50c..ab859f8 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -23,7 +23,6 @@ Generic and per-site behavior scripts """ import logging, time -from io import BytesIO from urllib.parse import urlsplit import os.path import pkg_resources @@ -31,13 +30,11 @@ from base64 import b64decode from collections import OrderedDict from html5lib.serializer import HTMLSerializer -from warcio.statusandheaders import StatusAndHeaders from pychrome.exceptions import TimeoutException -from .util import randomString, packageUrl, getFormattedViewportMetrics +from .util import randomString, getFormattedViewportMetrics from . import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -from .browser import SiteLoader logger = logging.getLogger(__name__) diff --git a/crocoite/browser.py b/crocoite/browser.py index 3b9f7ab..1c09598 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -27,6 +27,7 @@ from urllib.parse import urlsplit from base64 import b64decode from collections import deque from threading import Event +from http.server import BaseHTTPRequestHandler import pychrome @@ -323,14 +324,10 @@ class SiteLoader: import subprocess, os, time from tempfile import mkdtemp -import socket, shutil +import shutil class ChromeService: - """ - Start Chrome with socket activation (i.e. pass listening socket). Polling - is not required with this method, since reads will block until Chrome is - ready. - """ + """ Start Google Chrome listening on a random port """ __slots__ = ('binary', 'windowSize', 'p', 'userDataDir') diff --git a/crocoite/html.py b/crocoite/html.py index f891101..c929a10 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -105,7 +105,6 @@ eventAttributes = {'onabort', from html5lib.treewalkers.base import TreeWalker from html5lib.filters.base import Filter -from html5lib.serializer import HTMLSerializer from html5lib import constants class ChromeTreeWalker (TreeWalker): @@ -195,7 +194,6 @@ class StripAttributeFilter (Filter): self.attributes = set (map (str.lower, attributes)) def __iter__(self): - default_namespace = constants.namespaces["html"] for token in Filter.__iter__(self): data = token.get ('data') if data and token['type'] in {'StartTag', 'EmptyTag'}: diff --git a/crocoite/task.py b/crocoite/task.py index 9054627..6b3c9d1 100644 --- a/crocoite/task.py +++ b/crocoite/task.py @@ -39,7 +39,7 @@ from celery import Celery from celery.utils.log import get_task_logger from .browser import ChromeService, BrowserCrashed -from .controller import SinglePageController, ControllerSettings, RecursiveController, defaultSettings, DepthLimit, PrefixLimit, StatsHandler +from .controller import SinglePageController, ControllerSettings, RecursiveController, defaultSettings, DepthLimit, StatsHandler from . import behavior from .cli import parseRecursive from .warc import WarcHandler @@ -82,9 +82,9 @@ def archive (self, url, settings, enabledBehaviorNames): enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) settings = ControllerSettings (**settings) try: - controller = SinglePageController (url, fd, behavior=enabledBehavior, + c = SinglePageController (url, fd, behavior=enabledBehavior, settings=settings, handler=handler) - controller.run () + c.run () except BrowserCrashed: # nothing we can do about that logger.error ('browser crashed for {}'.format (url)) @@ -137,9 +137,9 @@ def controller (self, url, settings, enabledBehaviorNames, recursive, concurrenc recursionPolicy = parseRecursive (recursive, url) enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) settings = ControllerSettings (**settings) - controller = DistributedRecursiveController (url, None, behavior=enabledBehavior, + c = DistributedRecursiveController (url, None, behavior=enabledBehavior, settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency) - controller.run () - return dict (controller.stats) + c.run () + return dict (c.stats) diff --git a/crocoite/warc.py b/crocoite/warc.py index b45bcbe..af04cf9 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -27,10 +27,7 @@ import json from io import BytesIO from warcio.statusandheaders import StatusAndHeaders from urllib.parse import urlsplit -from logging.handlers import BufferingHandler from datetime import datetime -from threading import Thread -from queue import Queue from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter @@ -98,7 +95,6 @@ class WarcHandler (EventHandler): def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded): writer = self.writer - reqId = item.id resp = item.response # now the response |