diff options
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/behavior.py | 20 | ||||
| -rw-r--r-- | crocoite/browser.py | 11 | ||||
| -rw-r--r-- | crocoite/controller.py | 22 | ||||
| -rw-r--r-- | crocoite/task.py | 2 | ||||
| -rw-r--r-- | crocoite/warc.py | 2 | 
5 files changed, 56 insertions, 1 deletion
| diff --git a/crocoite/behavior.py b/crocoite/behavior.py index fd8fff8..ffdc50c 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -43,6 +43,9 @@ logger = logging.getLogger(__name__)  class Script:      """ A JavaScript resource """ + +    __slots__ = ('path', 'data') +      def __init__ (self, path=None, encoding='utf-8'):          self.path = path          if path: @@ -61,6 +64,8 @@ class Script:          return s  class Behavior: +    __slots__ = ('loader') +      # unique behavior name      name = None @@ -102,6 +107,8 @@ class HostnameFilter:  class JsOnload (Behavior):      """ Execute JavaScript on page load """ +    __slots__ = ('script', 'scriptHandle') +      scriptPath = None      def __init__ (self, loader): @@ -120,6 +127,8 @@ class JsOnload (Behavior):  ### Generic scripts ###  class Scroll (JsOnload): +    __slots__ = ('stopVarname') +      name = 'scroll'      scriptPath = 'scroll.js' @@ -170,6 +179,8 @@ class EmulateScreenMetrics (Behavior):          yield from ()  class DomSnapshotEvent: +    __slots__ = ('url', 'document', 'viewport') +      def __init__ (self, url, document, viewport):          self.url = url          self.document = document @@ -186,6 +197,8 @@ class DomSnapshot (Behavior):      can’t handle that though.      """ +    __slots__ = ('script') +      name = 'domSnapshot'      def __init__ (self, loader): @@ -222,6 +235,8 @@ class DomSnapshot (Behavior):                  yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport)  class ScreenshotEvent: +    __slots__ = ('yoff', 'data') +      def __init__ (self, yoff, data):          self.yoff = yoff          self.data = data @@ -257,6 +272,8 @@ class Click (JsOnload):      scriptPath = 'click.js'  class ExtractLinksEvent: +    __slots__ = ('links') +      def __init__ (self, links):          self.links = links @@ -268,12 +285,13 @@ class ExtractLinks (Behavior):      manually resolve relative links.      
""" +    __slots__ = ('script') +      name = 'extractLinks'      def __init__ (self, loader):          super ().__init__ (loader)          self.script = Script ('extract-links.js') -        self.links = None      def onfinish (self):          tab = self.loader.tab diff --git a/crocoite/browser.py b/crocoite/browser.py index 57d0dd0..f583c9b 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -36,6 +36,9 @@ class Item:      Simple wrapper containing Chrome request and response      """ +    __slots__ = ('tab', 'chromeRequest', 'chromeResponse', 'chromeFinished', +            'isRedirect', 'failed') +      def __init__ (self, tab):          self.tab = tab          self.chromeRequest = None @@ -164,6 +167,7 @@ class SiteLoader:      XXX: track popup windows/new tabs and close them      """ +    __slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab')      allowedSchemes = {'http', 'https'}      def __init__ (self, browser, url, logger=logging.getLogger(__name__)): @@ -329,6 +333,8 @@ class ChromeService:      ready.      
""" +    __slots__ = ('binary', 'windowSize', 'p', 'userDataDir') +      def __init__ (self, binary='google-chrome-stable', windowSize=(1920, 1080)):          self.binary = binary          self.windowSize = windowSize @@ -380,6 +386,8 @@ class ChromeService:          self.p = None  class NullService: +    __slots__ = ('url') +      def __init__ (self, url):          self.url = url @@ -397,6 +405,7 @@ from operator import itemgetter  class TestItem (Item):      """ This should be as close to Item as possible """ +    __slots__ = ('bodySend', '_body')      base = 'http://localhost:8000/'      def __init__ (self, path, status, headers, bodyReceive, bodySend=None): @@ -461,6 +470,8 @@ def startServer ():      httpd.serve_forever()  class TestSiteLoader (unittest.TestCase): +    __slots__ = ('server', 'baseurl', 'service', 'browser') +      def setUp (self):          from multiprocessing import Process          self.server = Process (target=startServer) diff --git a/crocoite/controller.py b/crocoite/controller.py index cdae268..84001b7 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -23,6 +23,8 @@ Controller classes, handling actions required for archival  """  class ControllerSettings: +    __slots__ = ('logBuffer', 'maxBodySize', 'idleTimeout', 'timeout') +      def __init__ (self, logBuffer=1000, maxBodySize=50*1024*1024, idleTimeout=2, timeout=10):          self.logBuffer = logBuffer          self.maxBodySize = maxBodySize @@ -38,6 +40,8 @@ defaultSettings = ControllerSettings ()  class EventHandler:      """ Abstract base class for event handler """ +    __slots__ = () +      # this handler wants to know about exceptions before they are reraised by      # the controller      acceptException = False @@ -48,6 +52,8 @@ class EventHandler:  from .browser import BrowserCrashed  class StatsHandler (EventHandler): +    __slots__ = ('stats') +      acceptException = True      def __init__ (self): @@ -72,6 +78,8 @@ from .browser import ChromeService, 
SiteLoader, Item  from .util import getFormattedViewportMetrics  class ControllerStart: +    __slots__ = ('payload') +      def __init__ (self, payload):          self.payload = payload @@ -83,6 +91,8 @@ class SinglePageController:      (stats, warc writer).      """ +    __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'handler') +      def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \              logger=logging.getLogger(__name__), settings=defaultSettings, handler=[]):          self.url = url @@ -183,6 +193,9 @@ class SinglePageController:  class RecursionPolicy:      """ Abstract recursion policy """ + +    __slots__ = () +      def __call__ (self, urls):          raise NotImplementedError @@ -192,6 +205,9 @@ class DepthLimit (RecursionPolicy):      depth==0 means no recursion, depth==1 is the page and outgoing links, …      """ + +    __slots__ = ('maxdepth') +      def __init__ (self, maxdepth=0):          self.maxdepth = maxdepth @@ -213,6 +229,9 @@ class PrefixLimit (RecursionPolicy):      ignored: http://example.com/bar http://offsite.example/foo      accepted: http://example.com/foobar http://example.com/foo/bar      """ + +    __slots__ = ('prefix') +      def __init__ (self, prefix):          self.prefix = prefix @@ -233,6 +252,9 @@ class RecursiveController (EventHandler):      Visits links acording to recursionPolicy      """ +    __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', +            'recursionPolicy', 'handler', 'urls', 'have') +      def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \              logger=logging.getLogger(__name__), settings=defaultSettings,              recursionPolicy=DepthLimit (0), handler=[]): diff --git a/crocoite/task.py b/crocoite/task.py index 48fe5d8..9054627 100644 --- a/crocoite/task.py +++ b/crocoite/task.py @@ -111,6 +111,8 @@ class IntegerDict (UserDict):  class 
DistributedRecursiveController (RecursiveController):      """ Distributed, recursive controller using celery """ +    __slots__ = ('concurrency', 'stats') +      def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \              logger=logging.getLogger(__name__), settings=defaultSettings,              recursionPolicy=DepthLimit (0), concurrency=1): diff --git a/crocoite/warc.py b/crocoite/warc.py index fd8ce8d..b45bcbe 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -41,6 +41,8 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent  from .browser import Item  class WarcHandler (EventHandler): +    __slots__ = ('logger', 'writer', 'maxBodySize') +      def __init__ (self, fd,              logger=logging.getLogger(__name__),              maxBodySize=defaultSettings.maxBodySize): | 
