1 files changed, 243 insertions, 26 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 3aeaaad..a2ddaa3 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -22,64 +22,117 @@
 Misc tools
 """
 
-import shutil, sys, re, os, logging, argparse
+import shutil, sys, os, logging, argparse, json
+from io import BytesIO
+
 from warcio.archiveiterator import ArchiveIterator
 from warcio.warcwriter import WARCWriter
+from yarl import URL
 
-def mergeWarc ():
-    """
-    Merge multiple WARC files into a single file, writing revisit records for
-    items which occur multiple times
-    """
-
-    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
-    parser.add_argument('--verbose', '-v', action='store_true')
-    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
+from pkg_resources import parse_version, parse_requirements
 
-    args = parser.parse_args()
-    loglevel = logging.DEBUG if args.verbose else logging.INFO
-    logging.basicConfig (level=loglevel)
+from .util import getSoftwareInfo, StrJsonEncoder
+from .warc import jsonMime, makeContentType
 
+def mergeWarc (files, output):
+    # stats
     unique = 0
     revisit = 0
+    uniqueLength = 0
+    revisitLength = 0
+
     payloadMap = {}
-    writer = WARCWriter (args.output, gzip=True)
-    for l in sys.stdin:
-        l = l.strip ()
+    writer = WARCWriter (output, gzip=True)
+
+    # Add an additional warcinfo record, describing the transformations. This
+    # is not ideal, since
+    #   “A ‘warcinfo’ record describes the records that
+    #   follow it […] until next ‘warcinfo’”
+    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+    # A warcinfo record is expected at the beginning of every file. But it
+    # might have written by a different software, so we don’t want to
+    # strip/replace that information, but supplement it.
+    warcinfo = {
+            'software': getSoftwareInfo (),
+            'tool': 'crocoite-merge', # not the name of the cli tool
+            'parameters': {'inputs': files},
+            }
+    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+    record = writer.create_warc_record ('', 'warcinfo',
+            payload=payload,
+            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+    writer.write_record (record)
+
+    for l in files:
         with open (l, 'rb') as fd:
             for record in ArchiveIterator (fd):
                 if record.rec_type in {'resource', 'response'}:
                     headers = record.rec_headers
                     rid = headers.get_header('WARC-Record-ID')
                     csum = headers.get_header('WARC-Payload-Digest')
+                    length = int (headers.get_header ('Content-Length'))
                     dup = payloadMap.get (csum, None)
                     if dup is None:
                         payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
                                 'id': rid, 'date': headers.get_header('WARC-Date')}
                         unique += 1
+                        uniqueLength += length
                     else:
-                        logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
-                        record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date'])
+                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
+                        # Payload may be identical, but HTTP headers are
+                        # (probably) not. Include them.
+                        record = writer.create_revisit_record (
+                                headers.get_header('WARC-Target-URI'), digest=csum,
+                                refers_to_uri=dup['uri'], refers_to_date=dup['date'],
+                                http_headers=record.http_headers)
                         record.rec_headers.add_header ('WARC-Truncated', 'length')
                         record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
                         revisit += 1
+                        revisitLength += length
                 else:
                     unique += 1
                 writer.write_record (record)
-    logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit))
+    json.dump (dict (
+            unique=dict (records=unique, bytes=uniqueLength),
+            revisit=dict (records=revisit, bytes=revisitLength),
+            ratio=dict (
+                    records=unique/(unique+revisit),
+                    bytes=uniqueLength/(uniqueLength+revisitLength)
+                    ),
+            ),
+            sys.stdout,
+            cls=StrJsonEncoder)
+    sys.stdout.write ('\n')
+
+def mergeWarcCli():
+    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
+    parser.add_argument('--verbose', '-v', action='store_true')
+    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
+
+    args = parser.parse_args()
+    loglevel = logging.DEBUG if args.verbose else logging.INFO
+    logging.basicConfig (level=loglevel)
+
+    mergeWarc([l.strip() for l in sys.stdin], args.output)
 
 def extractScreenshot ():
     """
     Extract page screenshots from a WARC generated by crocoite into files
     """
 
-    parser = argparse.ArgumentParser(description='Extract screenshots.')
-    parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files')
-    parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
+    parser = argparse.ArgumentParser(description='Extract screenshots from '
+            'WARC, write JSON info to stdout.')
+    parser.add_argument('-f', '--force', action='store_true',
+            help='Overwrite existing files')
+    parser.add_argument('-1', '--one', action='store_true',
+            help='Only extract the first screenshot into a file named prefix')
+    parser.add_argument('input', type=argparse.FileType ('rb'),
+            help='Input WARC')
     parser.add_argument('prefix', help='Output file prefix')
 
     args = parser.parse_args()
 
+    i = 0
     with args.input:
         for record in ArchiveIterator (args.input):
             headers = record.rec_headers
@@ -88,13 +141,177 @@ def extractScreenshot ():
                     'X-Crocoite-Screenshot-Y-Offset' not in headers:
                 continue
 
-            urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
-            xoff = 0
+            url = URL (headers.get_header ('WARC-Target-URI'))
             yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
-            outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff)
+            outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix
             if args.force or not os.path.exists (outpath):
+                json.dump ({'file': outpath, 'url': url, 'yoff': yoff},
+                        sys.stdout, cls=StrJsonEncoder)
+                sys.stdout.write ('\n')
                 with open (outpath, 'wb') as out:
                     shutil.copyfileobj (record.raw_stream, out)
+                i += 1
             else:
-                print ('not overwriting {}'.format (outpath))
+                print (f'not overwriting {outpath}', file=sys.stderr)
+
+            if args.one:
+                break
+
+class Errata:
+    __slots__ = ('uuid', 'description', 'url', 'affects')
+
+    def __init__ (self, uuid, description, affects, url=None):
+        self.uuid = uuid
+        self.description = description
+        self.url = url
+        # slightly abusing setuptool’s version parsing/matching here
+        self.affects = list (parse_requirements(affects))
+
+    def __contains__ (self, pkg):
+        """
+        Return True if the versions in pkg are affected by this errata
+
+        pkg must be a mapping from project_name to version
+        """
+        matchedAll = []
+        for a in self.affects:
+            haveVersion = pkg.get (a.project_name, None)
+            matchedAll.append (haveVersion is not None and haveVersion in a)
+        return all (matchedAll)
+
+    def __repr__ (self):
+        return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})'
+
+    @property
+    def fixable (self):
+        return getattr (self, 'applyFix', None) is not None
+
+    def toDict (self):
+        return {'uuid': self.uuid,
+                'description': self.description,
+                'url': self.url,
+                'affects': list (map (str, self.affects)),
+                'fixable': self.fixable}
+
+class FixableErrata(Errata):
+    __slots__ = ('stats')
+
+    def __init__ (self, uuid, description, affects, url=None):
+        super().__init__ (uuid, description, affects, url)
+        # statistics for fixable erratas
+        self.stats = dict (records=dict (fixed=0, processed=0))
+
+    def applyFix (self, record):
+        raise NotImplementedError () # pragma: no cover
+
+class ContentTypeErrata (FixableErrata):
+    def __init__ (self):
+        super().__init__ (
+            uuid='552c13dc-56e5-4539-9ad8-184ccae60930',
+            description='Content-Type header uses wrong argument name encoding instead of charset.',
+            url='https://github.com/PromyLOPh/crocoite/issues/19',
+            affects=['crocoite==1.0.0'])
+
+    def applyFix (self, record):
+        # XXX: this is ugly. warcio’s write_record replaces any Content-Type
+        # header we’re setting with this one. But printing rec_headers shows
+        # the header, not .content_type.
+        contentType = record.content_type
+        if '; encoding=' in contentType:
+            contentType = contentType.replace ('; encoding=', '; charset=')
+            record.content_type = contentType
+            self.stats['records']['fixed'] += 1
+
+        self.stats['records']['processed'] += 1
+        return record
+
+bugs = [
+    Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572',
+            description='Generated by version < 1.0. No erratas are supported for this version.',
+            affects=['crocoite<1.0'],
+            ),
+    ContentTypeErrata (),
+    ]
+
+def makeReport (fd):
+    alreadyFixed = set ()
+
+    for record in ArchiveIterator (fd):
+        if record.rec_type == 'warcinfo':
+            try:
+                data = json.load (record.raw_stream)
+                # errata records precceed everything else and indicate which
+                # ones were fixed already
+                if data['tool'] == 'crocoite-errata':
+                    alreadyFixed.update (data['parameters']['errata'])
+                else:
+                    haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+                    yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
+            except json.decoder.JSONDecodeError:
+                pass
+
+def errataCheck (args):
+    hasErrata = False
+    for item in makeReport (args.input):
+        json.dump (item.toDict (), sys.stdout)
+        sys.stdout.write ('\n')
+        sys.stdout.flush ()
+        hasErrata = True
+    return int (hasErrata)
+
+def errataFix (args):
+    errata = args.errata
+
+    with args.input as infd, args.output as outfd:
+        writer = WARCWriter (outfd, gzip=True)
+
+        warcinfo = {
+                'software': getSoftwareInfo (),
+                'tool': 'crocoite-errata', # not the name of the cli tool
+                'parameters': {'errata': [errata.uuid]},
+                }
+        payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+        record = writer.create_warc_record ('', 'warcinfo',
+                payload=payload,
+                warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+        writer.write_record (record)
+
+        for record in ArchiveIterator (infd):
+            fixedRecord = errata.applyFix (record)
+            writer.write_record (fixedRecord)
+    json.dump (errata.stats, sys.stdout)
+    sys.stdout.write ('\n')
+    sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+    try:
+        e = next (filter (lambda x: x.uuid == uuid, bugs))
+    except StopIteration:
+        raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+    if not isinstance (e, FixableErrata):
+        raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+    return e
+
+def errata ():
+    parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+    parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+    # XXX: required argument does not work here?!
+    subparsers = parser.add_subparsers()
+
+    checkparser = subparsers.add_parser('check', help='Show erratas')
+    checkparser.set_defaults (func=errataCheck)
+
+    fixparser = subparsers.add_parser('fix', help='Fix erratas')
+    fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
+    fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
+    fixparser.set_defaults (func=errataFix)
+
+    args = parser.parse_args()
+
+    if not hasattr (args, 'func'):
+        parser.print_usage ()
+        parser.exit ()
+
+    return args.func (args)