diff options
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/behavior.py | 20 | ||||
-rw-r--r-- | crocoite/browser.py | 11 | ||||
-rw-r--r-- | crocoite/controller.py | 22 | ||||
-rw-r--r-- | crocoite/task.py | 2 | ||||
-rw-r--r-- | crocoite/warc.py | 2 |
5 files changed, 56 insertions, 1 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py index fd8fff8..ffdc50c 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -43,6 +43,9 @@ logger = logging.getLogger(__name__) class Script: """ A JavaScript resource """ + + __slots__ = ('path', 'data') + def __init__ (self, path=None, encoding='utf-8'): self.path = path if path: @@ -61,6 +64,8 @@ class Script: return s class Behavior: + __slots__ = ('loader') + # unique behavior name name = None @@ -102,6 +107,8 @@ class HostnameFilter: class JsOnload (Behavior): """ Execute JavaScript on page load """ + __slots__ = ('script', 'scriptHandle') + scriptPath = None def __init__ (self, loader): @@ -120,6 +127,8 @@ class JsOnload (Behavior): ### Generic scripts ### class Scroll (JsOnload): + __slots__ = ('stopVarname') + name = 'scroll' scriptPath = 'scroll.js' @@ -170,6 +179,8 @@ class EmulateScreenMetrics (Behavior): yield from () class DomSnapshotEvent: + __slots__ = ('url', 'document', 'viewport') + def __init__ (self, url, document, viewport): self.url = url self.document = document @@ -186,6 +197,8 @@ class DomSnapshot (Behavior): can’t handle that though. """ + __slots__ = ('script') + name = 'domSnapshot' def __init__ (self, loader): @@ -222,6 +235,8 @@ class DomSnapshot (Behavior): yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport) class ScreenshotEvent: + __slots__ = ('yoff', 'data') + def __init__ (self, yoff, data): self.yoff = yoff self.data = data @@ -257,6 +272,8 @@ class Click (JsOnload): scriptPath = 'click.js' class ExtractLinksEvent: + __slots__ = ('links') + def __init__ (self, links): self.links = links @@ -268,12 +285,13 @@ class ExtractLinks (Behavior): manually resolve relative links. """ + __slots__ = ('script') + name = 'extractLinks' def __init__ (self, loader): super ().__init__ (loader) self.script = Script ('extract-links.js') - self.links = None def onfinish (self): tab = self.loader.tab diff --git a/crocoite/browser.py b/crocoite/browser.py index 57d0dd0..f583c9b 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -36,6 +36,9 @@ class Item: Simple wrapper containing Chrome request and response """ + __slots__ = ('tab', 'chromeRequest', 'chromeResponse', 'chromeFinished', + 'isRedirect', 'failed') + def __init__ (self, tab): self.tab = tab self.chromeRequest = None @@ -164,6 +167,7 @@ class SiteLoader: XXX: track popup windows/new tabs and close them """ + __slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab') allowedSchemes = {'http', 'https'} def __init__ (self, browser, url, logger=logging.getLogger(__name__)): @@ -329,6 +333,8 @@ class ChromeService: ready. """ + __slots__ = ('binary', 'windowSize', 'p', 'userDataDir') + def __init__ (self, binary='google-chrome-stable', windowSize=(1920, 1080)): self.binary = binary self.windowSize = windowSize @@ -380,6 +386,8 @@ class ChromeService: self.p = None class NullService: + __slots__ = ('url') + def __init__ (self, url): self.url = url @@ -397,6 +405,7 @@ from operator import itemgetter class TestItem (Item): """ This should be as close to Item as possible """ + __slots__ = ('bodySend', '_body') base = 'http://localhost:8000/' def __init__ (self, path, status, headers, bodyReceive, bodySend=None): @@ -461,6 +470,8 @@ def startServer (): httpd.serve_forever() class TestSiteLoader (unittest.TestCase): + __slots__ = ('server', 'baseurl', 'service', 'browser') + def setUp (self): from multiprocessing import Process self.server = Process (target=startServer) diff --git a/crocoite/controller.py b/crocoite/controller.py index cdae268..84001b7 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -23,6 +23,8 @@ Controller classes, handling actions required for archival """ class ControllerSettings: + __slots__ = ('logBuffer', 'maxBodySize', 'idleTimeout', 'timeout') + def __init__ (self, logBuffer=1000, maxBodySize=50*1024*1024, idleTimeout=2, timeout=10): self.logBuffer = logBuffer self.maxBodySize = maxBodySize @@ -38,6 +40,8 @@ defaultSettings = ControllerSettings () class EventHandler: """ Abstract base class for event handler """ + __slots__ = () + # this handler wants to know about exceptions before they are reraised by # the controller acceptException = False @@ -48,6 +52,8 @@ class EventHandler: from .browser import BrowserCrashed class StatsHandler (EventHandler): + __slots__ = ('stats') + acceptException = True def __init__ (self): @@ -72,6 +78,8 @@ from .browser import ChromeService, SiteLoader, Item from .util import getFormattedViewportMetrics class ControllerStart: + __slots__ = ('payload') + def __init__ (self, payload): self.payload = payload @@ -83,6 +91,8 @@ class SinglePageController: (stats, warc writer). """ + __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'handler') + def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ logger=logging.getLogger(__name__), settings=defaultSettings, handler=[]): self.url = url @@ -183,6 +193,9 @@ class SinglePageController: class RecursionPolicy: """ Abstract recursion policy """ + + __slots__ = () + def __call__ (self, urls): raise NotImplementedError @@ -192,6 +205,9 @@ class DepthLimit (RecursionPolicy): depth==0 means no recursion, depth==1 is the page and outgoing links, … """ + + __slots__ = ('maxdepth') + def __init__ (self, maxdepth=0): self.maxdepth = maxdepth @@ -213,6 +229,9 @@ class PrefixLimit (RecursionPolicy): ignored: http://example.com/bar http://offsite.example/foo accepted: http://example.com/foobar http://example.com/foo/bar """ + + __slots__ = ('prefix') + def __init__ (self, prefix): self.prefix = prefix @@ -233,6 +252,9 @@ class RecursiveController (EventHandler): Visits links acording to recursionPolicy """ + __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', + 'recursionPolicy', 'handler', 'urls', 'have') + def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ logger=logging.getLogger(__name__), settings=defaultSettings, recursionPolicy=DepthLimit (0), handler=[]): diff --git a/crocoite/task.py b/crocoite/task.py index 48fe5d8..9054627 100644 --- a/crocoite/task.py +++ b/crocoite/task.py @@ -111,6 +111,8 @@ class IntegerDict (UserDict): class DistributedRecursiveController (RecursiveController): """ Distributed, recursive controller using celery """ + __slots__ = ('concurrency', 'stats') + def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \ logger=logging.getLogger(__name__), settings=defaultSettings, recursionPolicy=DepthLimit (0), concurrency=1): diff --git a/crocoite/warc.py b/crocoite/warc.py index fd8ce8d..b45bcbe 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -41,6 +41,8 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): + __slots__ = ('logger', 'writer', 'maxBodySize') + def __init__ (self, fd, logger=logging.getLogger(__name__), maxBodySize=defaultSettings.maxBodySize): |