# Copyright (c) 2017 crocoite contributors
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

def main ():
    import os, random, logging, argparse
    from io import BytesIO
    import pychrome
    from urllib.parse import urlsplit
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    from html5lib.serializer import HTMLSerializer
    from . import html, packageData, packageUrl
    from .warc import WarcLoader
    from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker

    def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
        """
        Build a random string made of characters drawn from chars.

        length -- number of characters; if None, a random length between 16
            and 32 (both inclusive) is chosen
        chars -- pool of characters to pick from
        """
        if length is None:
            length = random.randint (16, 32)
        picked = [random.choice (chars) for _ in range (length)]
        return ''.join (picked)
    
    def getFormattedViewportMetrics (tab):
        """
        Return the size of the tab's layout viewport formatted as
        "<clientWidth>x<clientHeight>".
        """
        # XXX: I’m not entirely sure which one we should use here
        layoutViewport = tab.Page.getLayoutMetrics ()['layoutViewport']
        width = layoutViewport['clientWidth']
        height = layoutViewport['clientHeight']
        return '{}x{}'.format (width, height)

    def writeDOMSnapshot (tab, writer):
        """
        Serialize the current DOM of tab and store it in the WARC.

        The DOM tree is split into documents by ChromeTreeWalker. Every
        document with a distinct http(s) URL is rendered to HTML, with script
        and noscript tags and event attributes stripped, and written as its
        own record.

        DOMSnapshot.getSnapshot would be an alternative, but the API is not
        stable yet. Also computed styles are not really necessary here.

        XXX: Currently writes a response, when it should use “resource”. pywb
        can’t handle that though.
        """
        viewport = getFormattedViewportMetrics (tab)
        dom = tab.DOM.getDocument (depth=-1, pierce=True)
        seenUrls = set ()
        for doc in ChromeTreeWalker (dom['root']).split ():
            documentUrl = doc['documentURL']
            if documentUrl in seenUrls:
                # ignore duplicate URLs. they are usually caused by
                # javascript-injected iframes (advertising) with no(?) src
                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (documentUrl))
                continue
            if urlsplit (documentUrl).scheme not in ('http', 'https'):
                # only documents fetched via http(s) are stored
                continue

            logger.debug ('saving DOM snapshot for url {}, base {}'.format (documentUrl, doc['baseURL']))
            seenUrls.add (documentUrl)

            # remove script, to make the page static and noscript, because at
            # the time we took the snapshot scripts were enabled
            withoutTags = StripTagFilter (ChromeTreeWalker (doc), ['script', 'noscript'])
            stream = StripAttributeFilter (withoutTags, html.eventAttributes)
            serializer = HTMLSerializer ()
            httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
            body = BytesIO (serializer.render (stream, 'utf8'))
            warcHeaders = {
                    'X-DOM-Snapshot': str (True),
                    'X-Chrome-Viewport': viewport,
                    }
            record = writer.create_warc_record (documentUrl, 'response',
                    payload=body,
                    http_headers=httpHeaders,
                    warc_headers_dict=warcHeaders)
            writer.write_record (record)

    def emulateScreenMetrics (l):
        """
        Cycle the tab through a series of emulated screens, causing the site
        to fetch assets (img srcset and css, for example) for different
        screen resolutions.

        l -- loader object providing tab, wait and waitIdle
        """
        cssPpi = 96
        # columns: width, height, deviceScaleFactor, mobile
        screens = (
                (1920, 1080, 1.5, False),
                (1920, 1080, 2, False),
                # very dense display
                (1920, 1080, 4, False),
                # just a few samples:
                # 1st gen iPhone (portrait mode)
                (320, 480, 163/cssPpi, True),
                # 6th gen iPhone (portrait mode)
                (750, 1334, 326/cssPpi, True),
                # and reset
                (1920, 1080, 1, False),
                )
        emulation = l.tab.Emulation
        for width, height, scaleFactor, mobile in screens:
            emulation.setDeviceMetricsOverride (width=width, height=height,
                    deviceScaleFactor=scaleFactor, mobile=mobile)
            # give the page a moment to react to the new metrics
            l.wait (1)
        # XXX: this seems to be broken, it does not clear the override
        #tab.Emulation.clearDeviceMetricsOverride ()
        # wait until assets finished loading
        l.waitIdle (2, 10)

    def loadScripts (paths, scripts=None):
        """
        Read the script files named by paths and join them with newlines.

        paths -- iterable of file names; a name that does not exist on disk is
            resolved through packageData instead
        scripts -- optional sequence of source strings that are placed before
            the file contents; it is copied, never modified

        Returns one string consisting of the entries of scripts followed by
        the contents of every file, separated by newlines.
        """
        # Work on a private copy: a mutable default (or appending to the
        # caller's list) would make sources pile up across calls.
        sources = [] if scripts is None else list (scripts)
        for p in paths:
            if not os.path.exists (p):
                # search for defaults scripts in package data directory
                p = packageData (p)
            # the scripts are stored in the WARC as UTF-8, so read them as
            # UTF-8 instead of depending on the locale's default encoding
            with open (p, 'r', encoding='utf-8') as fd:
                sources.append (fd.read ())
        return '\n'.join (sources)

    def writeScript (path, source, writer):
        """
        Store JavaScript source in the WARC as a metadata record.

        path -- name passed to packageUrl to build the record's URL
        source -- script text; written UTF-8 encoded
        writer -- WARC writer the record is created with and written to
        """
        url = packageUrl (path)
        payload = BytesIO (source.encode ('utf8'))
        warcHeaders = {'Content-Type': 'application/javascript; charset=utf-8'}
        record = writer.create_warc_record (url, 'metadata', payload=payload,
                warc_headers_dict=warcHeaders)
        writer.write_record (record)

    logger = logging.getLogger(__name__)
    logging.basicConfig (level=logging.DEBUG)

    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
    parser.add_argument('--browser', default='http://127.0.0.1:9222', help='DevTools URL')
    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival')
    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout')
    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer')
    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size in bytes')
    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page')
    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot')
    parser.add_argument('url', help='Website URL')
    parser.add_argument('output', help='WARC filename')

    args = parser.parse_args ()

    stopVarname = '__' + __package__ + '_stop__'
    # avoid sites messing with our scripts by using a random stop variable name
    newStopVarname = randomString ()
    onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
    stopVarname = newStopVarname

    browser = pychrome.Browser(url=args.browser)

    fd = open (args.output, 'wb')
    writer = WARCWriter (fd, gzip=True)

    with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
            maxBodySize=args.maxBodySize) as l:
        version = l.tab.Browser.getVersion ()
        payload = {
                'software': __package__,
                'browser': version['product'],
                'useragent': version['userAgent'],
                'viewport': getFormattedViewportMetrics (l.tab),
                }
        warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
        writer.write_record (warcinfo)
        # save onload script as well
        writeScript ('onload', onload, writer)

        # inject our custom javascript to the page before loading
        l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
        l.start ()

        l.waitIdle (args.idleTimeout, args.timeout)

        # get ready for snapshot: stop loading and scripts, disable events
        l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
        # if we stopped due to timeout, wait for remaining assets
        l.waitIdle (2, 10)

        emulateScreenMetrics (l)

        l.stop ()

        script = loadScripts (args.onsnapshot)
        writeScript ('onsnapshot', script, writer)
        l.tab.Runtime.evaluate (expression=script, returnByValue=True)
        writeDOMSnapshot (l.tab, writer)

    return True