From 785ef19736cc9a21746e00a022b76fd756c162de Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 25 Jun 2018 19:55:48 +0200 Subject: warc: Save DOM-/image screenshot as WARC conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Judging from the docs this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s wayback machine. --- crocoite/controller.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'crocoite/controller.py') diff --git a/crocoite/controller.py b/crocoite/controller.py index 84001b7..ef042cc 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -71,11 +71,10 @@ class StatsHandler (EventHandler): self.stats['crashed'] += 1 import logging, time -from urllib.parse import urlsplit, urlunsplit from . import behavior as cbehavior from .browser import ChromeService, SiteLoader, Item -from .util import getFormattedViewportMetrics +from .util import getFormattedViewportMetrics, removeFragment class ControllerStart: __slots__ = ('payload') @@ -238,11 +237,6 @@ class PrefixLimit (RecursionPolicy): def __call__ (self, urls): return set (filter (lambda u: u.startswith (self.prefix), urls)) -def removeFragment (u): - """ Remove fragment from url (i.e. #hashvalue) """ - s = urlsplit (u) - return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) - from .behavior import ExtractLinksEvent class RecursiveController (EventHandler): -- cgit v1.2.3