summaryrefslogtreecommitdiff
path: root/crocoite/tools.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r--crocoite/tools.py114
1 files changed, 102 insertions, 12 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 42ced35..a2ddaa3 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -32,6 +32,7 @@ from yarl import URL
from pkg_resources import parse_version, parse_requirements
from .util import getSoftwareInfo, StrJsonEncoder
+from .warc import jsonMime, makeContentType
def mergeWarc (files, output):
# stats
@@ -59,7 +60,7 @@ def mergeWarc (files, output):
payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
record = writer.create_warc_record ('', 'warcinfo',
payload=payload,
- warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
writer.write_record (record)
for l in files:
@@ -157,11 +158,12 @@ def extractScreenshot ():
break
class Errata:
- __slots__ = ('uuid', 'description', 'affects')
+ __slots__ = ('uuid', 'description', 'url', 'affects')
- def __init__ (self, uuid, description, affects):
+ def __init__ (self, uuid, description, affects, url=None):
self.uuid = uuid
self.description = description
+ self.url = url
# slightly abusing setuptool’s version parsing/matching here
self.affects = list (parse_requirements(affects))
@@ -187,36 +189,68 @@ class Errata:
def toDict (self):
return {'uuid': self.uuid,
'description': self.description,
+ 'url': self.url,
'affects': list (map (str, self.affects)),
'fixable': self.fixable}
class FixableErrata(Errata):
- def applyFix (self, records):
+ __slots__ = ('stats')
+
+ def __init__ (self, uuid, description, affects, url=None):
+ super().__init__ (uuid, description, affects, url)
+ # statistics for fixable erratas
+ self.stats = dict (records=dict (fixed=0, processed=0))
+
+ def applyFix (self, record):
raise NotImplementedError () # pragma: no cover
+class ContentTypeErrata (FixableErrata):
+ def __init__ (self):
+ super().__init__ (
+ uuid='552c13dc-56e5-4539-9ad8-184ccae60930',
+ description='Content-Type header uses wrong argument name encoding instead of charset.',
+ url='https://github.com/PromyLOPh/crocoite/issues/19',
+ affects=['crocoite==1.0.0'])
+
+ def applyFix (self, record):
+ # XXX: this is ugly. warcio’s write_record replaces any Content-Type
+ # header we’re setting with record.content_type. But printing
+ # rec_headers shows the header, not .content_type.
+ contentType = record.content_type
+ if '; encoding=' in contentType:
+ contentType = contentType.replace ('; encoding=', '; charset=')
+ record.content_type = contentType
+ self.stats['records']['fixed'] += 1
+
+ self.stats['records']['processed'] += 1
+ return record
+
bugs = [
Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572',
description='Generated by version < 1.0. No erratas are supported for this version.',
affects=['crocoite<1.0'],
),
+ ContentTypeErrata (),
]
def makeReport (fd):
+ alreadyFixed = set ()
+
for record in ArchiveIterator (fd):
if record.rec_type == 'warcinfo':
try:
data = json.load (record.raw_stream)
- haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
- yield from filter (lambda b: haveVersions in b, bugs)
+ # errata records precede everything else and indicate which
+ # ones were fixed already
+ if data['tool'] == 'crocoite-errata':
+ alreadyFixed.update (data['parameters']['errata'])
+ else:
+ haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+ yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
except json.decoder.JSONDecodeError:
pass
-def errata ():
- parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
- parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
-
- args = parser.parse_args()
-
+def errataCheck (args):
hasErrata = False
for item in makeReport (args.input):
json.dump (item.toDict (), sys.stdout)
@@ -225,3 +259,59 @@ def errata ():
hasErrata = True
return int (hasErrata)
+def errataFix (args):
+ errata = args.errata
+
+ with args.input as infd, args.output as outfd:
+ writer = WARCWriter (outfd, gzip=True)
+
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-errata', # not the name of the cli tool
+ 'parameters': {'errata': [errata.uuid]},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record ('', 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+ writer.write_record (record)
+
+ for record in ArchiveIterator (infd):
+ fixedRecord = errata.applyFix (record)
+ writer.write_record (fixedRecord)
+ json.dump (errata.stats, sys.stdout)
+ sys.stdout.write ('\n')
+ sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+ try:
+ e = next (filter (lambda x: x.uuid == uuid, bugs))
+ except StopIteration:
+ raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+ if not isinstance (e, FixableErrata):
+ raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+ return e
+
+def errata ():
+ parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+ parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+ # XXX: required argument does not work here?!
+ subparsers = parser.add_subparsers()
+
+ checkparser = subparsers.add_parser('check', help='Show erratas')
+ checkparser.set_defaults (func=errataCheck)
+
+ fixparser = subparsers.add_parser('fix', help='Fix erratas')
+ fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
+ fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
+ fixparser.set_defaults (func=errataFix)
+
+ args = parser.parse_args()
+
+ if not hasattr (args, 'func'):
+ parser.print_usage ()
+ parser.exit ()
+
+ return args.func (args)
+