diff options
Diffstat (limited to 'crocoite/tools.py')
| -rw-r--r-- | crocoite/tools.py | 269 | 
1 files changed, 243 insertions, 26 deletions
| diff --git a/crocoite/tools.py b/crocoite/tools.py index 3aeaaad..a2ddaa3 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -22,64 +22,117 @@  Misc tools  """ -import shutil, sys, re, os, logging, argparse +import shutil, sys, os, logging, argparse, json +from io import BytesIO +  from warcio.archiveiterator import ArchiveIterator  from warcio.warcwriter import WARCWriter +from yarl import URL -def mergeWarc (): -    """ -    Merge multiple WARC files into a single file, writing revisit records for -    items which occur multiple times -    """ - -    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') -    parser.add_argument('--verbose', '-v', action='store_true') -    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') +from pkg_resources import parse_version, parse_requirements -    args = parser.parse_args() -    loglevel = logging.DEBUG if args.verbose else logging.INFO -    logging.basicConfig (level=loglevel) +from .util import getSoftwareInfo, StrJsonEncoder +from .warc import jsonMime, makeContentType +def mergeWarc (files, output): +    # stats      unique = 0      revisit = 0 +    uniqueLength = 0 +    revisitLength = 0 +      payloadMap = {} -    writer = WARCWriter (args.output, gzip=True) -    for l in sys.stdin: -        l = l.strip () +    writer = WARCWriter (output, gzip=True) + +    # Add an additional warcinfo record, describing the transformations. This +    # is not ideal, since +    #   “A ‘warcinfo’ record describes the records that +    #   follow it […] until next ‘warcinfo’” +    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo +    # A warcinfo record is expected at the beginning of every file. But it +    # might have written by a different software, so we don’t want to +    # strip/replace that information, but supplement it. +    warcinfo = { +            'software': getSoftwareInfo (), +            'tool': 'crocoite-merge', # not the name of the cli tool +            'parameters': {'inputs': files}, +            } +    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) +    record = writer.create_warc_record ('', 'warcinfo', +            payload=payload, +            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')}) +    writer.write_record (record) + +    for l in files:          with open (l, 'rb') as fd:              for record in ArchiveIterator (fd):                  if record.rec_type in {'resource', 'response'}:                      headers = record.rec_headers                      rid = headers.get_header('WARC-Record-ID')                      csum = headers.get_header('WARC-Payload-Digest') +                    length = int (headers.get_header ('Content-Length'))                      dup = payloadMap.get (csum, None)                      if dup is None:                          payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),                                  'id': rid, 'date': headers.get_header('WARC-Date')}                          unique += 1 +                        uniqueLength += length                      else: -                        logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id'])) -                        record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date']) +                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}') +                        # Payload may be identical, but HTTP headers are +                        # (probably) not. Include them. +                        record = writer.create_revisit_record ( +                                headers.get_header('WARC-Target-URI'), digest=csum, +                                refers_to_uri=dup['uri'], refers_to_date=dup['date'], +                                http_headers=record.http_headers)                          record.rec_headers.add_header ('WARC-Truncated', 'length')                          record.rec_headers.add_header ('WARC-Refers-To', dup['id'])                          revisit += 1 +                        revisitLength += length                  else:                      unique += 1                  writer.write_record (record) -    logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit)) +    json.dump (dict ( +            unique=dict (records=unique, bytes=uniqueLength), +            revisit=dict (records=revisit, bytes=revisitLength), +            ratio=dict ( +                    records=unique/(unique+revisit), +                    bytes=uniqueLength/(uniqueLength+revisitLength) +                    ), +            ), +            sys.stdout, +            cls=StrJsonEncoder) +    sys.stdout.write ('\n') + +def mergeWarcCli(): +    parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') +    parser.add_argument('--verbose', '-v', action='store_true') +    parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC') + +    args = parser.parse_args() +    loglevel = logging.DEBUG if args.verbose else logging.INFO +    logging.basicConfig (level=loglevel) + +    mergeWarc([l.strip() for l in sys.stdin], args.output)  def extractScreenshot ():      """      Extract page screenshots from a WARC generated by crocoite into files      """ -    parser = argparse.ArgumentParser(description='Extract screenshots.') -    parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files') -    parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') +    parser = argparse.ArgumentParser(description='Extract screenshots from ' +            'WARC, write JSON info to stdout.') +    parser.add_argument('-f', '--force', action='store_true', +            help='Overwrite existing files') +    parser.add_argument('-1', '--one', action='store_true', +            help='Only extract the first screenshot into a file named prefix') +    parser.add_argument('input', type=argparse.FileType ('rb'), +            help='Input WARC')      parser.add_argument('prefix', help='Output file prefix')      args = parser.parse_args() +    i = 0      with args.input:          for record in ArchiveIterator (args.input):              headers = record.rec_headers @@ -88,13 +141,177 @@ def extractScreenshot ():                      'X-Crocoite-Screenshot-Y-Offset' not in headers:                  continue -            urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') -            xoff = 0 +            url = URL (headers.get_header ('WARC-Target-URI'))              yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) -            outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff) +            outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix              if args.force or not os.path.exists (outpath): +                json.dump ({'file': outpath, 'url': url, 'yoff': yoff}, +                        sys.stdout, cls=StrJsonEncoder) +                sys.stdout.write ('\n')                  with open (outpath, 'wb') as out:                      shutil.copyfileobj (record.raw_stream, out) +                i += 1              else: -                print ('not overwriting {}'.format (outpath)) +                print (f'not overwriting {outpath}', file=sys.stderr) + +            if args.one: +                break + +class Errata: +    __slots__ = ('uuid', 'description', 'url', 'affects') + +    def __init__ (self, uuid, description, affects, url=None): +        self.uuid = uuid +        self.description = description +        self.url = url +        # slightly abusing setuptool’s version parsing/matching here +        self.affects = list (parse_requirements(affects)) + +    def __contains__ (self, pkg): +        """ +        Return True if the versions in pkg are affected by this errata + +        pkg must be a mapping from project_name to version +        """ +        matchedAll = [] +        for a in self.affects: +            haveVersion = pkg.get (a.project_name, None) +            matchedAll.append (haveVersion is not None and haveVersion in a) +        return all (matchedAll) + +    def __repr__ (self): +        return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})' + +    @property +    def fixable (self): +        return getattr (self, 'applyFix', None) is not None + +    def toDict (self): +        return {'uuid': self.uuid, +                'description': self.description, +                'url': self.url, +                'affects': list (map (str, self.affects)), +                'fixable': self.fixable} + +class FixableErrata(Errata): +    __slots__ = ('stats') + +    def __init__ (self, uuid, description, affects, url=None): +        super().__init__ (uuid, description, affects, url) +        # statistics for fixable erratas +        self.stats = dict (records=dict (fixed=0, processed=0)) + +    def applyFix (self, record): +        raise NotImplementedError () # pragma: no cover + +class ContentTypeErrata (FixableErrata): +    def __init__ (self): +        super().__init__ ( +            uuid='552c13dc-56e5-4539-9ad8-184ccae60930', +            description='Content-Type header uses wrong argument name encoding instead of charset.', +            url='https://github.com/PromyLOPh/crocoite/issues/19', +            affects=['crocoite==1.0.0']) + +    def applyFix (self, record): +        # XXX: this is ugly. warcio’s write_record replaces any Content-Type +        # header we’re setting with this one. But printing rec_headers shows +        # the header, not .content_type. +        contentType = record.content_type +        if '; encoding=' in contentType: +            contentType = contentType.replace ('; encoding=', '; charset=') +            record.content_type = contentType +            self.stats['records']['fixed'] += 1 + +        self.stats['records']['processed'] += 1 +        return record + +bugs = [ +    Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572', +            description='Generated by version < 1.0. No erratas are supported for this version.', +            affects=['crocoite<1.0'], +            ), +    ContentTypeErrata (), +    ] + +def makeReport (fd): +    alreadyFixed = set () + +    for record in ArchiveIterator (fd): +        if record.rec_type == 'warcinfo': +            try: +                data = json.load (record.raw_stream) +                # errata records precceed everything else and indicate which +                # ones were fixed already +                if data['tool'] == 'crocoite-errata': +                    alreadyFixed.update (data['parameters']['errata']) +                else: +                    haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']]) +                    yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs) +            except json.decoder.JSONDecodeError: +                pass + +def errataCheck (args): +    hasErrata = False +    for item in makeReport (args.input): +        json.dump (item.toDict (), sys.stdout) +        sys.stdout.write ('\n') +        sys.stdout.flush () +        hasErrata = True +    return int (hasErrata) + +def errataFix (args): +    errata = args.errata + +    with args.input as infd, args.output as outfd: +        writer = WARCWriter (outfd, gzip=True) + +        warcinfo = { +                'software': getSoftwareInfo (), +                'tool': 'crocoite-errata', # not the name of the cli tool +                'parameters': {'errata': [errata.uuid]}, +                } +        payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) +        record = writer.create_warc_record ('', 'warcinfo', +                payload=payload, +                warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')}) +        writer.write_record (record) + +        for record in ArchiveIterator (infd): +            fixedRecord = errata.applyFix (record) +            writer.write_record (fixedRecord) +    json.dump (errata.stats, sys.stdout) +    sys.stdout.write ('\n') +    sys.stdout.flush () + +def uuidToErrata (uuid, onlyFixable=True): +    try: +        e = next (filter (lambda x: x.uuid == uuid, bugs)) +    except StopIteration: +        raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist') +    if not isinstance (e, FixableErrata): +        raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable') +    return e + +def errata (): +    parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.') +    parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC') + +    # XXX: required argument does not work here?! +    subparsers = parser.add_subparsers() + +    checkparser = subparsers.add_parser('check', help='Show erratas') +    checkparser.set_defaults (func=errataCheck) + +    fixparser = subparsers.add_parser('fix', help='Fix erratas') +    fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata') +    fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC') +    fixparser.set_defaults (func=errataFix) + +    args = parser.parse_args() + +    if not hasattr (args, 'func'): +        parser.print_usage () +        parser.exit () + +    return args.func (args) | 
