diff options
-rw-r--r-- | crocoite/test_tools.py | 2 | ||||
-rw-r--r-- | crocoite/test_warc.py | 2 | ||||
-rw-r--r-- | crocoite/tools.py | 114 | ||||
-rw-r--r-- | crocoite/warc.py | 24 | ||||
-rw-r--r-- | setup.py | 2 |
5 files changed, 120 insertions, 24 deletions
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py index e8edb98..416b954 100644 --- a/crocoite/test_tools.py +++ b/crocoite/test_tools.py @@ -52,7 +52,7 @@ def makeGolden(writer, records): '', 'warcinfo', payload=b'', - warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + warc_headers_dict={'Content-Type': 'application/json; charset=utf-8'}) records.insert (0, record) return records diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py index 17f3840..3ec310c 100644 --- a/crocoite/test_warc.py +++ b/crocoite/test_warc.py @@ -60,7 +60,7 @@ def test_log (): assert headers['warc-type'] == 'metadata' assert 'warc-target-uri' not in headers assert headers['x-crocoite-type'] == 'log' - assert headers['content-type'] == f'application/json; encoding={handler.logEncoding}' + assert headers['content-type'] == f'application/json; charset={handler.logEncoding}' while True: l = it.raw_stream.readline () diff --git a/crocoite/tools.py b/crocoite/tools.py index 42ced35..a2ddaa3 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -32,6 +32,7 @@ from yarl import URL from pkg_resources import parse_version, parse_requirements from .util import getSoftwareInfo, StrJsonEncoder +from .warc import jsonMime, makeContentType def mergeWarc (files, output): # stats @@ -59,7 +60,7 @@ def mergeWarc (files, output): payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) record = writer.create_warc_record ('', 'warcinfo', payload=payload, - warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'}) + warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')}) writer.write_record (record) for l in files: @@ -157,11 +158,12 @@ def extractScreenshot (): break class Errata: - __slots__ = ('uuid', 'description', 'affects') + __slots__ = ('uuid', 'description', 'url', 'affects') - def __init__ (self, uuid, description, affects): + def __init__ (self, uuid, description, affects, url=None): self.uuid = uuid self.description = description + self.url = url # slightly abusing setuptool’s version parsing/matching here self.affects = list (parse_requirements(affects)) @@ -187,36 +189,68 @@ class Errata: def toDict (self): return {'uuid': self.uuid, 'description': self.description, + 'url': self.url, 'affects': list (map (str, self.affects)), 'fixable': self.fixable} class FixableErrata(Errata): - def applyFix (self, records): + __slots__ = ('stats') + + def __init__ (self, uuid, description, affects, url=None): + super().__init__ (uuid, description, affects, url) + # statistics for fixable erratas + self.stats = dict (records=dict (fixed=0, processed=0)) + + def applyFix (self, record): raise NotImplementedError () # pragma: no cover +class ContentTypeErrata (FixableErrata): + def __init__ (self): + super().__init__ ( + uuid='552c13dc-56e5-4539-9ad8-184ccae60930', + description='Content-Type header uses wrong argument name encoding instead of charset.', + url='https://github.com/PromyLOPh/crocoite/issues/19', + affects=['crocoite==1.0.0']) + + def applyFix (self, record): + # XXX: this is ugly. warcio’s write_record replaces any Content-Type + # header we’re setting with this one. But printing rec_headers shows + # the header, not .content_type. + contentType = record.content_type + if '; encoding=' in contentType: + contentType = contentType.replace ('; encoding=', '; charset=') + record.content_type = contentType + self.stats['records']['fixed'] += 1 + + self.stats['records']['processed'] += 1 + return record + bugs = [ Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572', description='Generated by version < 1.0. No erratas are supported for this version.', affects=['crocoite<1.0'], ), + ContentTypeErrata (), ] def makeReport (fd): + alreadyFixed = set () + for record in ArchiveIterator (fd): if record.rec_type == 'warcinfo': try: data = json.load (record.raw_stream) - haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']]) - yield from filter (lambda b: haveVersions in b, bugs) + # errata records precceed everything else and indicate which + # ones were fixed already + if data['tool'] == 'crocoite-errata': + alreadyFixed.update (data['parameters']['errata']) + else: + haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']]) + yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs) except json.decoder.JSONDecodeError: pass -def errata (): - parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.') - parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') - - args = parser.parse_args() - +def errataCheck (args): hasErrata = False for item in makeReport (args.input): json.dump (item.toDict (), sys.stdout) @@ -225,3 +259,59 @@ def errata (): hasErrata = True return int (hasErrata) +def errataFix (args): + errata = args.errata + + with args.input as infd, args.output as outfd: + writer = WARCWriter (outfd, gzip=True) + + warcinfo = { + 'software': getSoftwareInfo (), + 'tool': 'crocoite-errata', # not the name of the cli tool + 'parameters': {'errata': [errata.uuid]}, + } + payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8')) + record = writer.create_warc_record ('', 'warcinfo', + payload=payload, + warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')}) + writer.write_record (record) + + for record in ArchiveIterator (infd): + fixedRecord = errata.applyFix (record) + writer.write_record (fixedRecord) + json.dump (errata.stats, sys.stdout) + sys.stdout.write ('\n') + sys.stdout.flush () + +def uuidToErrata (uuid, onlyFixable=True): + try: + e = next (filter (lambda x: x.uuid == uuid, bugs)) + except StopIteration: + raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist') + if not isinstance (e, FixableErrata): + raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable') + return e + +def errata (): + parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.') + parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC') + + # XXX: required argument does not work here?! + subparsers = parser.add_subparsers() + + checkparser = subparsers.add_parser('check', help='Show erratas') + checkparser.set_defaults (func=errataCheck) + + fixparser = subparsers.add_parser('fix', help='Fix erratas') + fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata') + fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC') + fixparser.set_defaults (func=errataFix) + + args = parser.parse_args() + + if not hasattr (args, 'func'): + parser.print_usage () + parser.exit () + + return args.func (args) + diff --git a/crocoite/warc.py b/crocoite/warc.py index e1cdf35..415b487 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -42,6 +42,13 @@ jsonMime = 'application/json' # mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2 jsMime = 'application/javascript' +def makeContentType (mime, charset=None): + """ Create value of Content-Type WARC header with optional charset """ + s = [mime] + if charset: + s.extend (['; charset=', charset]) + return ''.join (s) + class WarcHandler (EventHandler): __slots__ = ('logger', 'writer', 'documentRecords', 'log', 'maxLogSize', 'logEncoding', 'warcinfoRecordId') @@ -149,10 +156,9 @@ class WarcHandler (EventHandler): # chrome sends nothing but utf8 encoded text. Fortunately HTTP # headers take precedence over the document’s <meta>, thus we can # easily override those. - contentType = resp.mimeType - if contentType: - if isinstance (resp.body, UnicodeBody): - contentType += '; charset=utf-8' + if resp.mimeType: + charset = 'utf-8' if isinstance (resp.body, UnicodeBody) else None + contentType = makeContentType (resp.mimeType, charset=charset) httpHeaders.replace_header ('Content-Type', contentType) # response body @@ -179,7 +185,7 @@ class WarcHandler (EventHandler): self.writeRecord (uri, 'resource', payload=BytesIO (str (item).encode (encoding)), warc_headers_dict={ - 'Content-Type': f'{jsMime}; charset={encoding}', + 'Content-Type': makeContentType (jsMime, encoding), 'X-Crocoite-Type': 'script', }) @@ -204,7 +210,7 @@ class WarcHandler (EventHandler): warcHeaders = { 'X-Crocoite-Type': 'dom-snapshot', 'X-Chrome-Viewport': item.viewport, - 'Content-Type': 'text/html; charset=utf-8', + 'Content-Type': makeContentType ('text/html', 'utf-8') } self._addRefersTo (warcHeaders, item.url) @@ -216,7 +222,7 @@ class WarcHandler (EventHandler): def _writeScreenshot (self, item): writer = self.writer warcHeaders = { - 'Content-Type': 'image/png', + 'Content-Type': makeContentType ('image/png'), 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff), 'X-Crocoite-Type': 'screenshot', } @@ -229,7 +235,7 @@ class WarcHandler (EventHandler): writer = self.writer warcinfo = self.writeRecord (None, 'warcinfo', - warc_headers_dict={'Content-Type': f'{jsonMime}; encoding={encoding}'}, + warc_headers_dict={'Content-Type': makeContentType (jsonMime, encoding)}, payload=payload) self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID'] @@ -238,7 +244,7 @@ class WarcHandler (EventHandler): writer = self.writer self.log.seek (0) warcHeaders = { - 'Content-Type': f'application/json; encoding={self.logEncoding}', + 'Content-Type': makeContentType (jsonMime, self.logEncoding), 'X-Crocoite-Type': 'log', } self.writeRecord (None, 'metadata', payload=self.log, @@ -2,7 +2,7 @@ from setuptools import setup setup( name='crocoite', - version='1.0.0', + version='1.1.0.dev0', author='Lars-Dominik Braun', author_email='lars+crocoite@6xq.net', url='https://6xq.net/crocoite/', |