diff options
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r-- | crocoite/tools.py | 217 |
1 file changed, 205 insertions, 12 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py index e2dc6a7..a2ddaa3 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -24,13 +24,23 @@ Misc tools import shutil, sys, os, logging, argparse, json from io import BytesIO + from warcio.archiveiterator import ArchiveIterator from warcio.warcwriter import WARCWriter -from .util import packageUrl, getSoftwareInfo +from yarl import URL + +from pkg_resources import parse_version, parse_requirements + +from .util import getSoftwareInfo, StrJsonEncoder +from .warc import jsonMime, makeContentType def mergeWarc (files, output): + # stats unique = 0 revisit = 0 + uniqueLength = 0 + revisitLength = 0 + payloadMap = {} writer = WARCWriter (output, gzip=True) @@ -48,9 +58,9 @@ def mergeWarc (files, output): 'parameters': {'inputs': files}, } payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) - record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo', + record = writer.create_warc_record ('', 'warcinfo', payload=payload, - warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')}) writer.write_record (record) for l in files: @@ -60,13 +70,15 @@ def mergeWarc (files, output): headers = record.rec_headers rid = headers.get_header('WARC-Record-ID') csum = headers.get_header('WARC-Payload-Digest') + length = int (headers.get_header ('Content-Length')) dup = payloadMap.get (csum, None) if dup is None: payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'), 'id': rid, 'date': headers.get_header('WARC-Date')} unique += 1 + uniqueLength += length else: - logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id'])) + logging.debug (f'Record {rid} is duplicate of {dup["id"]}') # Payload may be identical, but HTTP headers are # (probably) not. Include them. 
record = writer.create_revisit_record ( @@ -76,10 +88,21 @@ def mergeWarc (files, output): record.rec_headers.add_header ('WARC-Truncated', 'length') record.rec_headers.add_header ('WARC-Refers-To', dup['id']) revisit += 1 + revisitLength += length else: unique += 1 writer.write_record (record) - logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit)) + json.dump (dict ( + unique=dict (records=unique, bytes=uniqueLength), + revisit=dict (records=revisit, bytes=revisitLength), + ratio=dict ( + records=unique/(unique+revisit), + bytes=uniqueLength/(uniqueLength+revisitLength) + ), + ), + sys.stdout, + cls=StrJsonEncoder) + sys.stdout.write ('\n') def mergeWarcCli(): parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') @@ -97,13 +120,19 @@ def extractScreenshot (): Extract page screenshots from a WARC generated by crocoite into files """ - parser = argparse.ArgumentParser(description='Extract screenshots.') - parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files') - parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') + parser = argparse.ArgumentParser(description='Extract screenshots from ' + 'WARC, write JSON info to stdout.') + parser.add_argument('-f', '--force', action='store_true', + help='Overwrite existing files') + parser.add_argument('-1', '--one', action='store_true', + help='Only extract the first screenshot into a file named prefix') + parser.add_argument('input', type=argparse.FileType ('rb'), + help='Input WARC') parser.add_argument('prefix', help='Output file prefix') args = parser.parse_args() + i = 0 with args.input: for record in ArchiveIterator (args.input): headers = record.rec_headers @@ -112,13 +141,177 @@ def extractScreenshot (): 'X-Crocoite-Screenshot-Y-Offset' not in headers: continue - urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') - xoff = 0 + url = URL (headers.get_header ('WARC-Target-URI')) 
yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) - outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff) + outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix if args.force or not os.path.exists (outpath): + json.dump ({'file': outpath, 'url': url, 'yoff': yoff}, + sys.stdout, cls=StrJsonEncoder) + sys.stdout.write ('\n') with open (outpath, 'wb') as out: shutil.copyfileobj (record.raw_stream, out) + i += 1 else: - print ('not overwriting {}'.format (outpath)) + print (f'not overwriting {outpath}', file=sys.stderr) + + if args.one: + break + +class Errata: + __slots__ = ('uuid', 'description', 'url', 'affects') + + def __init__ (self, uuid, description, affects, url=None): + self.uuid = uuid + self.description = description + self.url = url + # slightly abusing setuptool’s version parsing/matching here + self.affects = list (parse_requirements(affects)) + + def __contains__ (self, pkg): + """ + Return True if the versions in pkg are affected by this errata + + pkg must be a mapping from project_name to version + """ + matchedAll = [] + for a in self.affects: + haveVersion = pkg.get (a.project_name, None) + matchedAll.append (haveVersion is not None and haveVersion in a) + return all (matchedAll) + + def __repr__ (self): + return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})' + + @property + def fixable (self): + return getattr (self, 'applyFix', None) is not None + + def toDict (self): + return {'uuid': self.uuid, + 'description': self.description, + 'url': self.url, + 'affects': list (map (str, self.affects)), + 'fixable': self.fixable} + +class FixableErrata(Errata): + __slots__ = ('stats') + + def __init__ (self, uuid, description, affects, url=None): + super().__init__ (uuid, description, affects, url) + # statistics for fixable erratas + self.stats = dict (records=dict (fixed=0, processed=0)) + + def applyFix (self, record): + raise NotImplementedError () # pragma: 
class ContentTypeErrata (FixableErrata):
    """
    Errata for records whose Content-Type uses the parameter name
    "encoding" where "charset" was intended.
    """

    def __init__ (self):
        super().__init__ (
                uuid='552c13dc-56e5-4539-9ad8-184ccae60930',
                description='Content-Type header uses wrong argument name encoding instead of charset.',
                url='https://github.com/PromyLOPh/crocoite/issues/19',
                affects=['crocoite==1.0.0'])

    def applyFix (self, record):
        """
        Replace "; encoding=" with "; charset=" in the record's content type

        Counts every record passed in as processed and every modified record
        as fixed, then returns the (possibly modified) record itself.
        """
        # XXX: this is ugly. warcio’s write_record replaces any Content-Type
        # header we’re setting with this one. But printing rec_headers shows
        # the header, not .content_type.
        contentType = record.content_type
        # NOTE(review): content_type is presumably None for records without
        # a Content-Type header; the truthiness check avoids a TypeError on
        # the membership test in that case. Confirm against warcio.
        if contentType and '; encoding=' in contentType:
            record.content_type = contentType.replace ('; encoding=', '; charset=')
            self.stats['records']['fixed'] += 1

        self.stats['records']['processed'] += 1
        return record

# All known erratas. makeReport reports them in this order and uuidToErrata
# looks them up by UUID.
bugs = [
    Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572',
            description='Generated by version < 1.0. No erratas are supported for this version.',
            affects=['crocoite<1.0'],
            ),
    ContentTypeErrata (),
    ]

def makeReport (fd):
    """
    Yield every entry of bugs that affects the WARC read from fd

    Only warcinfo records with a JSON object payload are looked at. Records
    written by the tool "crocoite-errata" list UUIDs that were fixed already;
    those erratas are not reported for warcinfo records that follow. Any
    other warcinfo record is expected to carry the software versions under
    data['software']['self'] and is matched against all known erratas.
    Warcinfo payloads that are not JSON or lack these keys are skipped.
    """
    alreadyFixed = set ()

    for record in ArchiveIterator (fd):
        if record.rec_type != 'warcinfo':
            continue

        try:
            data = json.load (record.raw_stream)
        except ValueError:
            # JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError; either way this warcinfo was not written by us
            continue
        if not isinstance (data, dict):
            continue

        # errata records precede everything else and indicate which ones
        # were fixed already
        if data.get ('tool') == 'crocoite-errata':
            try:
                alreadyFixed.update (data['parameters']['errata'])
            except (KeyError, TypeError):
                logging.debug ('ignoring malformed crocoite-errata warcinfo record')
            continue

        try:
            haveVersions = {pkg['projectName']: parse_version (pkg['version'])
                    for pkg in data['software']['self']}
        except (KeyError, TypeError):
            # valid JSON, but without the software information we need
            logging.debug ('ignoring warcinfo record without software versions')
            continue

        for bug in bugs:
            if haveVersions in bug and bug.uuid not in alreadyFixed:
                yield bug

def errataCheck (args):
    """
    Print one JSON document per errata found in args.input to stdout

    Returns 1 if at least one errata was reported and 0 otherwise.
    """
    hasErrata = False
    # close the input when done, just like errataFix does
    with args.input as infd:
        for item in makeReport (infd):
            json.dump (item.toDict (), sys.stdout)
            sys.stdout.write ('\n')
            sys.stdout.flush ()
            hasErrata = True
    return int (hasErrata)

def errataFix (args):
    """
    Copy args.input to args.output, applying the fix of args.errata

    The output starts with a new warcinfo record naming the applied errata,
    so makeReport can tell it was fixed already. Every input record is passed
    through the errata's applyFix and written out. The errata's statistics
    are printed to stdout as a single JSON line afterwards.
    """
    errata = args.errata

    with args.input as infd, args.output as outfd:
        writer = WARCWriter (outfd, gzip=True)

        warcinfo = {
                'software': getSoftwareInfo (),
                'tool': 'crocoite-errata', # not the name of the cli tool
                'parameters': {'errata': [errata.uuid]},
                }
        payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
        record = writer.create_warc_record ('', 'warcinfo',
                payload=payload,
                warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
        writer.write_record (record)

        for record in ArchiveIterator (infd):
            fixedRecord = errata.applyFix (record)
            writer.write_record (fixedRecord)
    json.dump (errata.stats, sys.stdout)
    sys.stdout.write ('\n')
    sys.stdout.flush ()

def uuidToErrata (uuid, onlyFixable=True):
    """
    Look up an errata by its UUID, usable as argparse type= callable

    Raises argparse.ArgumentTypeError if no errata has this UUID or, with
    onlyFixable set (the default), if the errata is not a FixableErrata.
    """
    e = next ((b for b in bugs if b.uuid == uuid), None)
    if e is None:
        raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
    # honour the flag; previously the check ran regardless of onlyFixable
    if onlyFixable and not isinstance (e, FixableErrata):
        raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
    return e

def errata ():
    """
    Command line entry point: check a WARC for erratas or fix one of them

    Returns whatever the selected sub-command function returns. Without a
    sub-command the usage is printed and the process exits.
    """
    parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
    parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')

    # XXX: required argument does not work here?!
    subparsers = parser.add_subparsers()

    checkparser = subparsers.add_parser('check', help='Show erratas')
    checkparser.set_defaults (func=errataCheck)

    fixparser = subparsers.add_parser('fix', help='Fix erratas')
    fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
    fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
    fixparser.set_defaults (func=errataFix)

    args = parser.parse_args()

    # no sub-command given, so there is no func default to dispatch to
    if not hasattr (args, 'func'):
        parser.print_usage ()
        parser.exit ()

    return args.func (args)