summaryrefslogtreecommitdiff
path: root/crocoite/controller.py
blob: a338559bad7b71aaac58a7433e853e2e54869c87 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Copyright (c) 2017–2018 crocoite contributors
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""
Controller classes, handling actions required for archival
"""

class ControllerSettings:
    """
    Plain container for the tunables used by the controllers in this module.

    logBuffer and maxBodySize are handed to the loader unchanged,
    idleTimeout and timeout are the two arguments of the first idle wait.
    Units are not stated in this module; presumably bytes for maxBodySize and
    seconds for both timeouts (confirm against the loader implementation).
    """

    def __init__ (
            self,
            logBuffer=1000,
            maxBodySize=50*1024*1024,
            idleTimeout=2,
            timeout=10):
        # loader-related limits
        self.logBuffer = logBuffer
        self.maxBodySize = maxBodySize
        # arguments for waiting until the page has settled
        self.idleTimeout = idleTimeout
        self.timeout = timeout

# Module-wide settings instance built from the constructor defaults; it is the
# default value of SinglePageController's `settings` argument, so controllers
# created without explicit settings all refer to this one object.
defaultSettings = ControllerSettings ()

import logging

import pychrome

from . import behavior as cbehavior
from .browser import ChromeService
from .warc import WarcLoader, SerializingWARCWriter
from .util import getFormattedViewportMetrics

class SinglePageController:
    """
    Archive a single page url to file output.

    url -- page to archive, passed to the loader and used to select behaviors
    output -- destination handed to SerializingWARCWriter
    service -- context manager whose __enter__ value is passed to
        pychrome.Browser as url; a new ChromeService is created when omitted
    behavior -- iterable of behavior classes, each is called with the loader
    logger -- logger receiving debug messages about behavior stages
    settings -- ControllerSettings instance providing loader and wait limits
    """

    def __init__ (self, url, output, service=None, behavior=cbehavior.available, \
            logger=logging.getLogger(__name__), settings=defaultSettings):
        self.url = url
        self.output = output
        # A default of ``ChromeService ()`` in the signature would be evaluated
        # only once, at class creation: it would start at import time and be
        # shared by every controller. Create one per controller instead.
        self.service = ChromeService () if service is None else service
        self.behavior = behavior
        self.settings = settings
        self.logger = logger

    def run (self):
        """
        Load the page, run the enabled behaviors and write the WARC output.

        Returns a dict whose 'stats' entry holds the loader's stats attribute.
        """
        ret = {'stats': None}

        with self.service as browser:
            # whatever the service yields is used as the browser's url
            browser = pychrome.Browser (url=browser)
            writer = SerializingWARCWriter (self.output, gzip=True)

            with WarcLoader (browser, self.url, writer,
                    logBuffer=self.settings.logBuffer,
                    maxBodySize=self.settings.maxBodySize) as l:
                # record software/browser metadata first
                version = l.tab.Browser.getVersion ()
                payload = {
                        'software': __package__,
                        'browser': version['product'],
                        'useragent': version['userAgent'],
                        'viewport': getFormattedViewportMetrics (l.tab),
                        }
                warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
                writer.write_record (warcinfo)

                # not all behavior scripts are allowed for every URL, filter them
                # (each behavior decides via its containment check on the url)
                instances = (cls (l) for cls in self.behavior)
                enabledBehavior = [b for b in instances if self.url in b]

                # onload hooks run before the loader is started
                for b in enabledBehavior:
                    self.logger.debug ('starting onload behavior {}'.format (b.name))
                    b.onload ()
                l.start ()

                l.waitIdle (self.settings.idleTimeout, self.settings.timeout)

                for b in enabledBehavior:
                    self.logger.debug ('starting onstop behavior {}'.format (b.name))
                    b.onstop ()

                # if we stopped due to timeout, wait for remaining assets
                # NOTE(review): these two limits are hard-coded and not part of
                # ControllerSettings
                l.waitIdle (2, 60)
                l.stop ()

                # onfinish hooks run after the loader has been stopped
                for b in enabledBehavior:
                    self.logger.debug ('starting onfinish behavior {}'.format (b.name))
                    b.onfinish ()

                ret['stats'] = l.stats
            # flush after the loader context has exited, service still active
            writer.flush ()
        return ret