summaryrefslogtreecommitdiff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r--crocoite/test_tools.py2
-rw-r--r--crocoite/test_warc.py2
-rw-r--r--crocoite/tools.py114
-rw-r--r--crocoite/warc.py24
4 files changed, 119 insertions, 23 deletions
diff --git a/crocoite/test_tools.py b/crocoite/test_tools.py
index e8edb98..416b954 100644
--- a/crocoite/test_tools.py
+++ b/crocoite/test_tools.py
@@ -52,7 +52,7 @@ def makeGolden(writer, records):
'',
'warcinfo',
payload=b'',
- warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ warc_headers_dict={'Content-Type': 'application/json; charset=utf-8'})
records.insert (0, record)
return records
diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py
index 17f3840..3ec310c 100644
--- a/crocoite/test_warc.py
+++ b/crocoite/test_warc.py
@@ -60,7 +60,7 @@ def test_log ():
assert headers['warc-type'] == 'metadata'
assert 'warc-target-uri' not in headers
assert headers['x-crocoite-type'] == 'log'
- assert headers['content-type'] == f'application/json; encoding={handler.logEncoding}'
+ assert headers['content-type'] == f'application/json; charset={handler.logEncoding}'
while True:
l = it.raw_stream.readline ()
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 42ced35..a2ddaa3 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -32,6 +32,7 @@ from yarl import URL
from pkg_resources import parse_version, parse_requirements
from .util import getSoftwareInfo, StrJsonEncoder
+from .warc import jsonMime, makeContentType
def mergeWarc (files, output):
# stats
@@ -59,7 +60,7 @@ def mergeWarc (files, output):
payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
record = writer.create_warc_record ('', 'warcinfo',
payload=payload,
- warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
writer.write_record (record)
for l in files:
@@ -157,11 +158,12 @@ def extractScreenshot ():
break
class Errata:
- __slots__ = ('uuid', 'description', 'affects')
+ __slots__ = ('uuid', 'description', 'url', 'affects')
- def __init__ (self, uuid, description, affects):
+ def __init__ (self, uuid, description, affects, url=None):
self.uuid = uuid
self.description = description
+ self.url = url
# slightly abusing setuptool’s version parsing/matching here
self.affects = list (parse_requirements(affects))
@@ -187,36 +189,68 @@ class Errata:
def toDict (self):
return {'uuid': self.uuid,
'description': self.description,
+ 'url': self.url,
'affects': list (map (str, self.affects)),
'fixable': self.fixable}
class FixableErrata(Errata):
-    def applyFix (self, records):
+    """
+    Errata that can be repaired by rewriting a WARC record by record.
+
+    Subclasses implement applyFix. Counters about the work done are kept in
+    self.stats.
+    """
+
+    # one-element tuple; ('stats') without the trailing comma is merely the
+    # string 'stats' in parentheses
+    __slots__ = ('stats',)
+
+    def __init__ (self, uuid, description, affects, url=None):
+        super().__init__ (uuid, description, affects, url)
+        # statistics for fixable erratas, kept per instance
+        self.stats = dict (records=dict (fixed=0, processed=0))
+
+    def applyFix (self, record):
+        """ Fix a single record. Must be overridden by subclasses. """
         raise NotImplementedError () # pragma: no cover
+class ContentTypeErrata (FixableErrata):
+    """
+    Content-Type values written with a “; encoding=” parameter where
+    “; charset=” was meant.
+    """
+
+    def __init__ (self):
+        super().__init__ (
+                uuid='552c13dc-56e5-4539-9ad8-184ccae60930',
+                description='Content-Type header uses wrong argument name encoding instead of charset.',
+                url='https://github.com/PromyLOPh/crocoite/issues/19',
+                affects=['crocoite==1.0.0'])
+
+    def applyFix (self, record):
+        """
+        Rename the encoding parameter of the record’s content type to charset.
+
+        Every call counts as one processed record in self.stats; a record
+        that was actually changed additionally counts as fixed.
+
+        :param record: record object exposing a content_type attribute
+        :return: the same record object
+        """
+        # XXX: this is ugly. warcio’s write_record replaces any Content-Type
+        # header we’re setting with this one. But printing rec_headers shows
+        # the header, not .content_type.
+        contentType = record.content_type
+        # NOTE(review): content_type is presumably None for records that
+        # carry no Content-Type header. Skip those instead of failing with a
+        # TypeError on the substring test.
+        if contentType and '; encoding=' in contentType:
+            record.content_type = contentType.replace ('; encoding=', '; charset=')
+            self.stats['records']['fixed'] += 1
+
+        self.stats['records']['processed'] += 1
+        return record
+
+
+# Registry of all known erratas. makeReport filters this list and
+# uuidToErrata looks entries up by their uuid attribute.
bugs = [
Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572',
description='Generated by version < 1.0. No erratas are supported for this version.',
affects=['crocoite<1.0'],
),
+ ContentTypeErrata (),
]
def makeReport (fd):
+ # Generator. Reads the WARC in fd and inspects the JSON payload of every
+ # warcinfo record. Records written by the tool crocoite-errata add their
+ # errata UUIDs to alreadyFixed. For every other warcinfo record, each entry
+ # b of bugs is yielded for which “haveVersions in b” holds and whose uuid
+ # is not in alreadyFixed. Payloads that are not valid JSON are skipped.
+ alreadyFixed = set ()
+
for record in ArchiveIterator (fd):
if record.rec_type == 'warcinfo':
try:
data = json.load (record.raw_stream)
- haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
- yield from filter (lambda b: haveVersions in b, bugs)
+ # errata records precede everything else and indicate which
+ # ones were fixed already
+ if data['tool'] == 'crocoite-errata':
+ alreadyFixed.update (data['parameters']['errata'])
+ else:
+ haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+ yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
except json.decoder.JSONDecodeError:
pass
-def errata ():
- parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
- parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
-
- args = parser.parse_args()
-
+def errataCheck (args):
hasErrata = False
for item in makeReport (args.input):
json.dump (item.toDict (), sys.stdout)
@@ -225,3 +259,59 @@ def errata ():
hasErrata = True
return int (hasErrata)
+def errataFix (args):
+ # Handler of the “fix” subcommand. Passes every record of args.input
+ # through args.errata.applyFix and writes the returned records, gzipped,
+ # to args.output. Afterwards the errata’s stats are printed to stdout as
+ # one line of JSON.
+ errata = args.errata
+
+ with args.input as infd, args.output as outfd:
+ writer = WARCWriter (outfd, gzip=True)
+
+ # Leading warcinfo record that names the errata being applied; its
+ # tool and parameters.errata fields are what makeReport evaluates to
+ # recognize erratas that were fixed already.
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-errata', # not the name of the cli tool
+ 'parameters': {'errata': [errata.uuid]},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record ('', 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+ writer.write_record (record)
+
+ # copy all input records, giving the errata a chance to alter each one
+ for record in ArchiveIterator (infd):
+ fixedRecord = errata.applyFix (record)
+ writer.write_record (fixedRecord)
+ json.dump (errata.stats, sys.stdout)
+ sys.stdout.write ('\n')
+ sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+    """
+    Look up the errata with the given UUID in bugs.
+
+    Intended as argparse type= callback, hence failures are reported as
+    argparse.ArgumentTypeError.
+
+    :param uuid: UUID string identifying the errata
+    :param onlyFixable: if true (default), reject erratas that are not an
+        instance of FixableErrata
+    :return: the matching errata object
+    :raises argparse.ArgumentTypeError: no errata has this UUID, or it is
+        not fixable while onlyFixable is set
+    """
+    e = next (filter (lambda x: x.uuid == uuid, bugs), None)
+    if e is None:
+        raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+    # onlyFixable used to be accepted, but was never looked at
+    if onlyFixable and not isinstance (e, FixableErrata):
+        raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+    return e
+
+def errata ():
+    """
+    Command line entry point for showing (“check”) and fixing (“fix”)
+    erratas of a WARC.
+
+    :return: the return value of the selected subcommand handler
+    """
+    parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+    parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+    # XXX: required argument does not work here?!
+    subparsers = parser.add_subparsers()
+
+    checkparser = subparsers.add_parser('check', help='Show erratas')
+    checkparser.set_defaults (func=errataCheck)
+
+    fixparser = subparsers.add_parser('fix', help='Fix erratas')
+    fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
+    fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
+    fixparser.set_defaults (func=errataFix)
+
+    args = parser.parse_args()
+
+    if not hasattr (args, 'func'):
+        # No subcommand was given. Treat it like every other usage error
+        # (usage message on stderr, exit status 2) rather than exiting with
+        # status 0, which is also what the check handler returns if no errata
+        # was found.
+        parser.error ('missing command, expected one of: check, fix')
+
+    return args.func (args)
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index e1cdf35..415b487 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -42,6 +42,13 @@ jsonMime = 'application/json'
# mime for javascript, according to https://tools.ietf.org/html/rfc4329#section-7.2
jsMime = 'application/javascript'
+def makeContentType (mime, charset=None):
+    """
+    Build the value of a Content-Type WARC header.
+
+    “; charset=<charset>” is appended to mime only if charset is truthy.
+    """
+    parts = (mime, '; charset=', charset) if charset else (mime,)
+    return ''.join (parts)
+
class WarcHandler (EventHandler):
__slots__ = ('logger', 'writer', 'documentRecords', 'log',
'maxLogSize', 'logEncoding', 'warcinfoRecordId')
@@ -149,10 +156,9 @@ class WarcHandler (EventHandler):
# chrome sends nothing but utf8 encoded text. Fortunately HTTP
# headers take precedence over the document’s <meta>, thus we can
# easily override those.
- contentType = resp.mimeType
- if contentType:
- if isinstance (resp.body, UnicodeBody):
- contentType += '; charset=utf-8'
+ if resp.mimeType:
+ charset = 'utf-8' if isinstance (resp.body, UnicodeBody) else None
+ contentType = makeContentType (resp.mimeType, charset=charset)
httpHeaders.replace_header ('Content-Type', contentType)
# response body
@@ -179,7 +185,7 @@ class WarcHandler (EventHandler):
self.writeRecord (uri, 'resource',
payload=BytesIO (str (item).encode (encoding)),
warc_headers_dict={
- 'Content-Type': f'{jsMime}; charset={encoding}',
+ 'Content-Type': makeContentType (jsMime, encoding),
'X-Crocoite-Type': 'script',
})
@@ -204,7 +210,7 @@ class WarcHandler (EventHandler):
warcHeaders = {
'X-Crocoite-Type': 'dom-snapshot',
'X-Chrome-Viewport': item.viewport,
- 'Content-Type': 'text/html; charset=utf-8',
+ 'Content-Type': makeContentType ('text/html', 'utf-8')
}
self._addRefersTo (warcHeaders, item.url)
@@ -216,7 +222,7 @@ class WarcHandler (EventHandler):
def _writeScreenshot (self, item):
writer = self.writer
warcHeaders = {
- 'Content-Type': 'image/png',
+ 'Content-Type': makeContentType ('image/png'),
'X-Crocoite-Screenshot-Y-Offset': str (item.yoff),
'X-Crocoite-Type': 'screenshot',
}
@@ -229,7 +235,7 @@ class WarcHandler (EventHandler):
writer = self.writer
warcinfo = self.writeRecord (None, 'warcinfo',
- warc_headers_dict={'Content-Type': f'{jsonMime}; encoding={encoding}'},
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, encoding)},
payload=payload)
self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']
@@ -238,7 +244,7 @@ class WarcHandler (EventHandler):
writer = self.writer
self.log.seek (0)
warcHeaders = {
- 'Content-Type': f'application/json; encoding={self.logEncoding}',
+ 'Content-Type': makeContentType (jsonMime, self.logEncoding),
'X-Crocoite-Type': 'log',
}
self.writeRecord (None, 'metadata', payload=self.log,