summary refs log tree commit diff
path: root/crocoite/tools.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r-- crocoite/tools.py 217
1 files changed, 205 insertions, 12 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index e2dc6a7..a2ddaa3 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -24,13 +24,23 @@ Misc tools
import shutil, sys, os, logging, argparse, json
from io import BytesIO
+
from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
-from .util import packageUrl, getSoftwareInfo
+from yarl import URL
+
+from pkg_resources import parse_version, parse_requirements
+
+from .util import getSoftwareInfo, StrJsonEncoder
+from .warc import jsonMime, makeContentType
def mergeWarc (files, output):
+ # stats
unique = 0
revisit = 0
+ uniqueLength = 0
+ revisitLength = 0
+
payloadMap = {}
writer = WARCWriter (output, gzip=True)
@@ -48,9 +58,9 @@ def mergeWarc (files, output):
'parameters': {'inputs': files},
}
payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
- record = writer.create_warc_record (packageUrl ('warcinfo'), 'warcinfo',
+ record = writer.create_warc_record ('', 'warcinfo',
payload=payload,
- warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
writer.write_record (record)
for l in files:
@@ -60,13 +70,15 @@ def mergeWarc (files, output):
headers = record.rec_headers
rid = headers.get_header('WARC-Record-ID')
csum = headers.get_header('WARC-Payload-Digest')
+ length = int (headers.get_header ('Content-Length'))
dup = payloadMap.get (csum, None)
if dup is None:
payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
'id': rid, 'date': headers.get_header('WARC-Date')}
unique += 1
+ uniqueLength += length
else:
- logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
+ logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
# Payload may be identical, but HTTP headers are
# (probably) not. Include them.
record = writer.create_revisit_record (
@@ -76,10 +88,21 @@ def mergeWarc (files, output):
record.rec_headers.add_header ('WARC-Truncated', 'length')
record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
revisit += 1
+ revisitLength += length
else:
unique += 1
writer.write_record (record)
- logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit))
+ json.dump (dict (
+ unique=dict (records=unique, bytes=uniqueLength),
+ revisit=dict (records=revisit, bytes=revisitLength),
+ ratio=dict (
+ records=unique/(unique+revisit),
+ bytes=uniqueLength/(uniqueLength+revisitLength)
+ ),
+ ),
+ sys.stdout,
+ cls=StrJsonEncoder)
+ sys.stdout.write ('\n')
def mergeWarcCli():
parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
@@ -97,13 +120,19 @@ def extractScreenshot ():
Extract page screenshots from a WARC generated by crocoite into files
"""
- parser = argparse.ArgumentParser(description='Extract screenshots.')
- parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files')
- parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
+ parser = argparse.ArgumentParser(description='Extract screenshots from '
+ 'WARC, write JSON info to stdout.')
+ parser.add_argument('-f', '--force', action='store_true',
+ help='Overwrite existing files')
+ parser.add_argument('-1', '--one', action='store_true',
+ help='Only extract the first screenshot into a file named prefix')
+ parser.add_argument('input', type=argparse.FileType ('rb'),
+ help='Input WARC')
parser.add_argument('prefix', help='Output file prefix')
args = parser.parse_args()
+ i = 0
with args.input:
for record in ArchiveIterator (args.input):
headers = record.rec_headers
@@ -112,13 +141,177 @@ def extractScreenshot ():
'X-Crocoite-Screenshot-Y-Offset' not in headers:
continue
- urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
- xoff = 0
+ url = URL (headers.get_header ('WARC-Target-URI'))
yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
- outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff)
+ outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix
if args.force or not os.path.exists (outpath):
+ json.dump ({'file': outpath, 'url': url, 'yoff': yoff},
+ sys.stdout, cls=StrJsonEncoder)
+ sys.stdout.write ('\n')
with open (outpath, 'wb') as out:
shutil.copyfileobj (record.raw_stream, out)
+ i += 1
else:
- print ('not overwriting {}'.format (outpath))
+ print (f'not overwriting {outpath}', file=sys.stderr)
+
+ if args.one:
+ break
+
+class Errata:
+ __slots__ = ('uuid', 'description', 'url', 'affects')
+
+ def __init__ (self, uuid, description, affects, url=None):
+ self.uuid = uuid
+ self.description = description
+ self.url = url
+ # slightly abusing setuptools’ version parsing/matching here
+ self.affects = list (parse_requirements(affects))
+
+ def __contains__ (self, pkg):
+ """
+ Return True if the versions in pkg are affected by this errata
+
+ pkg must be a mapping from project_name to version
+ """
+ matchedAll = []
+ for a in self.affects:
+ haveVersion = pkg.get (a.project_name, None)
+ matchedAll.append (haveVersion is not None and haveVersion in a)
+ return all (matchedAll)
+
+ def __repr__ (self):
+ return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})'
+
+ @property
+ def fixable (self):
+ return getattr (self, 'applyFix', None) is not None
+
+ def toDict (self):
+ return {'uuid': self.uuid,
+ 'description': self.description,
+ 'url': self.url,
+ 'affects': list (map (str, self.affects)),
+ 'fixable': self.fixable}
+
+class FixableErrata(Errata):
+ __slots__ = ('stats')
+
+ def __init__ (self, uuid, description, affects, url=None):
+ super().__init__ (uuid, description, affects, url)
+ # statistics for fixable erratas
+ self.stats = dict (records=dict (fixed=0, processed=0))
+
+ def applyFix (self, record):
+ raise NotImplementedError () # pragma: no cover
+
+class ContentTypeErrata (FixableErrata):
+ def __init__ (self):
+ super().__init__ (
+ uuid='552c13dc-56e5-4539-9ad8-184ccae60930',
+ description='Content-Type header uses wrong argument name encoding instead of charset.',
+ url='https://github.com/PromyLOPh/crocoite/issues/19',
+ affects=['crocoite==1.0.0'])
+
+ def applyFix (self, record):
+ # XXX: this is ugly. warcio’s write_record replaces any Content-Type
+ # header we’re setting with this one. But printing rec_headers shows
+ # the header, not .content_type.
+ contentType = record.content_type
+ if '; encoding=' in contentType:
+ contentType = contentType.replace ('; encoding=', '; charset=')
+ record.content_type = contentType
+ self.stats['records']['fixed'] += 1
+
+ self.stats['records']['processed'] += 1
+ return record
+
+bugs = [
+ Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572',
+ description='Generated by version < 1.0. No erratas are supported for this version.',
+ affects=['crocoite<1.0'],
+ ),
+ ContentTypeErrata (),
+ ]
+
+def makeReport (fd):
+ alreadyFixed = set ()
+
+ for record in ArchiveIterator (fd):
+ if record.rec_type == 'warcinfo':
+ try:
+ data = json.load (record.raw_stream)
+ # errata records precede everything else and indicate which
+ # ones were fixed already
+ if data['tool'] == 'crocoite-errata':
+ alreadyFixed.update (data['parameters']['errata'])
+ else:
+ haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+ yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
+ except json.decoder.JSONDecodeError:
+ pass
+
+def errataCheck (args):
+ hasErrata = False
+ for item in makeReport (args.input):
+ json.dump (item.toDict (), sys.stdout)
+ sys.stdout.write ('\n')
+ sys.stdout.flush ()
+ hasErrata = True
+ return int (hasErrata)
+
+def errataFix (args):
+ errata = args.errata
+
+ with args.input as infd, args.output as outfd:
+ writer = WARCWriter (outfd, gzip=True)
+
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-errata', # not the name of the cli tool
+ 'parameters': {'errata': [errata.uuid]},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record ('', 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+ writer.write_record (record)
+
+ for record in ArchiveIterator (infd):
+ fixedRecord = errata.applyFix (record)
+ writer.write_record (fixedRecord)
+ json.dump (errata.stats, sys.stdout)
+ sys.stdout.write ('\n')
+ sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+ try:
+ e = next (filter (lambda x: x.uuid == uuid, bugs))
+ except StopIteration:
+ raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+ if not isinstance (e, FixableErrata):
+ raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+ return e
+
+def errata ():
+ parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+ parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+ # XXX: required argument does not work here?!
+ subparsers = parser.add_subparsers()
+
+ checkparser = subparsers.add_parser('check', help='Show erratas')
+ checkparser.set_defaults (func=errataCheck)
+
+ fixparser = subparsers.add_parser('fix', help='Fix erratas')
+ fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
+ fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
+ fixparser.set_defaults (func=errataFix)
+
+ args = parser.parse_args()
+
+ if not hasattr (args, 'func'):
+ parser.print_usage ()
+ parser.exit ()
+
+ return args.func (args)