summary | refs | log | tree | commit | diff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r-- crocoite/behavior.py 20
-rw-r--r-- crocoite/browser.py 11
-rw-r--r-- crocoite/controller.py 22
-rw-r--r-- crocoite/task.py 2
-rw-r--r-- crocoite/warc.py 2
5 files changed, 56 insertions, 1 deletion
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index fd8fff8..ffdc50c 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -43,6 +43,9 @@ logger = logging.getLogger(__name__)
class Script:
""" A JavaScript resource """
+
+ __slots__ = ('path', 'data')
+
def __init__ (self, path=None, encoding='utf-8'):
self.path = path
if path:
@@ -61,6 +64,8 @@ class Script:
return s
class Behavior:
+ __slots__ = ('loader')
+
# unique behavior name
name = None
@@ -102,6 +107,8 @@ class HostnameFilter:
class JsOnload (Behavior):
""" Execute JavaScript on page load """
+ __slots__ = ('script', 'scriptHandle')
+
scriptPath = None
def __init__ (self, loader):
@@ -120,6 +127,8 @@ class JsOnload (Behavior):
### Generic scripts ###
class Scroll (JsOnload):
+ __slots__ = ('stopVarname')
+
name = 'scroll'
scriptPath = 'scroll.js'
@@ -170,6 +179,8 @@ class EmulateScreenMetrics (Behavior):
yield from ()
class DomSnapshotEvent:
+ __slots__ = ('url', 'document', 'viewport')
+
def __init__ (self, url, document, viewport):
self.url = url
self.document = document
@@ -186,6 +197,8 @@ class DomSnapshot (Behavior):
can’t handle that though.
"""
+ __slots__ = ('script')
+
name = 'domSnapshot'
def __init__ (self, loader):
@@ -222,6 +235,8 @@ class DomSnapshot (Behavior):
yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport)
class ScreenshotEvent:
+ __slots__ = ('yoff', 'data')
+
def __init__ (self, yoff, data):
self.yoff = yoff
self.data = data
@@ -257,6 +272,8 @@ class Click (JsOnload):
scriptPath = 'click.js'
class ExtractLinksEvent:
+ __slots__ = ('links')
+
def __init__ (self, links):
self.links = links
@@ -268,12 +285,13 @@ class ExtractLinks (Behavior):
manually resolve relative links.
"""
+ __slots__ = ('script')
+
name = 'extractLinks'
def __init__ (self, loader):
super ().__init__ (loader)
self.script = Script ('extract-links.js')
- self.links = None
def onfinish (self):
tab = self.loader.tab
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 57d0dd0..f583c9b 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -36,6 +36,9 @@ class Item:
Simple wrapper containing Chrome request and response
"""
+ __slots__ = ('tab', 'chromeRequest', 'chromeResponse', 'chromeFinished',
+ 'isRedirect', 'failed')
+
def __init__ (self, tab):
self.tab = tab
self.chromeRequest = None
@@ -164,6 +167,7 @@ class SiteLoader:
XXX: track popup windows/new tabs and close them
"""
+ __slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab')
allowedSchemes = {'http', 'https'}
def __init__ (self, browser, url, logger=logging.getLogger(__name__)):
@@ -329,6 +333,8 @@ class ChromeService:
ready.
"""
+ __slots__ = ('binary', 'windowSize', 'p', 'userDataDir')
+
def __init__ (self, binary='google-chrome-stable', windowSize=(1920, 1080)):
self.binary = binary
self.windowSize = windowSize
@@ -380,6 +386,8 @@ class ChromeService:
self.p = None
class NullService:
+ __slots__ = ('url')
+
def __init__ (self, url):
self.url = url
@@ -397,6 +405,7 @@ from operator import itemgetter
class TestItem (Item):
""" This should be as close to Item as possible """
+ __slots__ = ('bodySend', '_body')
base = 'http://localhost:8000/'
def __init__ (self, path, status, headers, bodyReceive, bodySend=None):
@@ -461,6 +470,8 @@ def startServer ():
httpd.serve_forever()
class TestSiteLoader (unittest.TestCase):
+ __slots__ = ('server', 'baseurl', 'service', 'browser')
+
def setUp (self):
from multiprocessing import Process
self.server = Process (target=startServer)
diff --git a/crocoite/controller.py b/crocoite/controller.py
index cdae268..84001b7 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -23,6 +23,8 @@ Controller classes, handling actions required for archival
"""
class ControllerSettings:
+ __slots__ = ('logBuffer', 'maxBodySize', 'idleTimeout', 'timeout')
+
def __init__ (self, logBuffer=1000, maxBodySize=50*1024*1024, idleTimeout=2, timeout=10):
self.logBuffer = logBuffer
self.maxBodySize = maxBodySize
@@ -38,6 +40,8 @@ defaultSettings = ControllerSettings ()
class EventHandler:
""" Abstract base class for event handler """
+ __slots__ = ()
+
# this handler wants to know about exceptions before they are reraised by
# the controller
acceptException = False
@@ -48,6 +52,8 @@ class EventHandler:
from .browser import BrowserCrashed
class StatsHandler (EventHandler):
+ __slots__ = ('stats')
+
acceptException = True
def __init__ (self):
@@ -72,6 +78,8 @@ from .browser import ChromeService, SiteLoader, Item
from .util import getFormattedViewportMetrics
class ControllerStart:
+ __slots__ = ('payload')
+
def __init__ (self, payload):
self.payload = payload
@@ -83,6 +91,8 @@ class SinglePageController:
(stats, warc writer).
"""
+ __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'handler')
+
def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
logger=logging.getLogger(__name__), settings=defaultSettings, handler=[]):
self.url = url
@@ -183,6 +193,9 @@ class SinglePageController:
class RecursionPolicy:
""" Abstract recursion policy """
+
+ __slots__ = ()
+
def __call__ (self, urls):
raise NotImplementedError
@@ -192,6 +205,9 @@ class DepthLimit (RecursionPolicy):
depth==0 means no recursion, depth==1 is the page and outgoing links, …
"""
+
+ __slots__ = ('maxdepth')
+
def __init__ (self, maxdepth=0):
self.maxdepth = maxdepth
@@ -213,6 +229,9 @@ class PrefixLimit (RecursionPolicy):
ignored: http://example.com/bar http://offsite.example/foo
accepted: http://example.com/foobar http://example.com/foo/bar
"""
+
+ __slots__ = ('prefix')
+
def __init__ (self, prefix):
self.prefix = prefix
@@ -233,6 +252,9 @@ class RecursiveController (EventHandler):
Visits links acording to recursionPolicy
"""
+ __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger',
+ 'recursionPolicy', 'handler', 'urls', 'have')
+
def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
logger=logging.getLogger(__name__), settings=defaultSettings,
recursionPolicy=DepthLimit (0), handler=[]):
diff --git a/crocoite/task.py b/crocoite/task.py
index 48fe5d8..9054627 100644
--- a/crocoite/task.py
+++ b/crocoite/task.py
@@ -111,6 +111,8 @@ class IntegerDict (UserDict):
class DistributedRecursiveController (RecursiveController):
""" Distributed, recursive controller using celery """
+ __slots__ = ('concurrency', 'stats')
+
def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \
logger=logging.getLogger(__name__), settings=defaultSettings,
recursionPolicy=DepthLimit (0), concurrency=1):
diff --git a/crocoite/warc.py b/crocoite/warc.py
index fd8ce8d..b45bcbe 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -41,6 +41,8 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
class WarcHandler (EventHandler):
+ __slots__ = ('logger', 'writer', 'maxBodySize')
+
def __init__ (self, fd,
logger=logging.getLogger(__name__),
maxBodySize=defaultSettings.maxBodySize):