Diffstat (limited to 'crocoite/tools.py')
-rw-r--r--  crocoite/tools.py  290
1 files changed, 255 insertions, 35 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index bc92f8f..a2ddaa3 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -22,76 +22,296 @@
Misc tools
"""
-import shutil, sys, re, os, logging, argparse
+import shutil, sys, os, logging, argparse, json
+from io import BytesIO
+
from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
+from yarl import URL
-def mergeWarc ():
- """
- Merge multiple WARC files into a single file, writing revisit records for
- items which occur multiple times
- """
-
- parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
- parser.add_argument('--verbose', '-v', action='store_true')
- parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
+from pkg_resources import parse_version, parse_requirements
- args = parser.parse_args()
- loglevel = logging.DEBUG if args.verbose else logging.INFO
- logging.basicConfig (level=loglevel)
+from .util import getSoftwareInfo, StrJsonEncoder
+from .warc import jsonMime, makeContentType
+def mergeWarc (files, output):
+ # stats
unique = 0
revisit = 0
+ uniqueLength = 0
+ revisitLength = 0
+
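+ # map payload digest -> first record seen with that digest (uri, id, date)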
payloadMap = {}
- writer = WARCWriter (args.output, gzip=True)
- for l in sys.stdin:
- l = l.strip ()
+ writer = WARCWriter (output, gzip=True)
+
+ # Add an additional warcinfo record, describing the transformations. This
+ # is not ideal, since
+ # “A ‘warcinfo’ record describes the records that
+ # follow it […] until next ‘warcinfo’”
+ # -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
+ # A warcinfo record is expected at the beginning of every file. But it
+ # might have been written by different software, so we don’t want to
+ # strip or replace that information, only supplement it.
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-merge', # not the name of the cli tool
+ 'parameters': {'inputs': files},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record ('', 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+ writer.write_record (record)
+
+ for l in files:
with open (l, 'rb') as fd:
for record in ArchiveIterator (fd):
if record.rec_type in {'resource', 'response'}:
headers = record.rec_headers
rid = headers.get_header('WARC-Record-ID')
csum = headers.get_header('WARC-Payload-Digest')
+ length = int (headers.get_header ('Content-Length'))
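+ # deduplicate on the payload digest: the first record with a given digest
+ # is written unchanged, later occurrences become revisit records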
dup = payloadMap.get (csum, None)
if dup is None:
payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
'id': rid, 'date': headers.get_header('WARC-Date')}
unique += 1
+ uniqueLength += length
else:
- logging.debug ('Record {} is duplicate of {}'.format (rid, dup['id']))
- record = writer.create_revisit_record (dup['uri'], csum, dup['uri'], dup['date'])
+ logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
+ # Payload may be identical, but HTTP headers are
+ # (probably) not. Include them.
+ record = writer.create_revisit_record (
+ headers.get_header('WARC-Target-URI'), digest=csum,
+ refers_to_uri=dup['uri'], refers_to_date=dup['date'],
+ http_headers=record.http_headers)
record.rec_headers.add_header ('WARC-Truncated', 'length')
record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
revisit += 1
+ revisitLength += length
else:
unique += 1
writer.write_record (record)
- logging.info ('Wrote {} unique records, {} revisits'.format (unique, revisit))
+ json.dump (dict (
+ unique=dict (records=unique, bytes=uniqueLength),
+ revisit=dict (records=revisit, bytes=revisitLength),
+ ratio=dict (
+ records=unique/(unique+revisit),
+ bytes=uniqueLength/(uniqueLength+revisitLength)
+ ),
+ ),
+ sys.stdout,
+ cls=StrJsonEncoder)
+ sys.stdout.write ('\n')
+
+def mergeWarcCli():
+ parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.')
+ parser.add_argument('--verbose', '-v', action='store_true')
+ parser.add_argument('output', type=argparse.FileType ('wb'), help='Output WARC')
+
+ args = parser.parse_args()
+ loglevel = logging.DEBUG if args.verbose else logging.INFO
+ logging.basicConfig (level=loglevel)
+
+ mergeWarc([l.strip() for l in sys.stdin], args.output)
def extractScreenshot ():
"""
Extract page screenshots from a WARC generated by crocoite into files
"""
- parser = argparse.ArgumentParser(description='Extract screenshots.')
- parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files')
- parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
+ parser = argparse.ArgumentParser(description='Extract screenshots from '
+ 'WARC, write JSON info to stdout.')
+ parser.add_argument('-f', '--force', action='store_true',
+ help='Overwrite existing files')
+ parser.add_argument('-1', '--one', action='store_true',
+ help='Only extract the first screenshot into a file named prefix')
+ parser.add_argument('input', type=argparse.FileType ('rb'),
+ help='Input WARC')
parser.add_argument('prefix', help='Output file prefix')
args = parser.parse_args()
- screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I)
+ i = 0
with args.input:
- for record in ArchiveIterator(args.input):
- uri = record.rec_headers.get_header('WARC-Target-URI')
- if record.rec_type == 'resource':
- m = screenshotRe.match (uri)
- if m:
- xoff, yoff = m.groups ()
- outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff)
- if args.force or not os.path.exists (outpath):
- with open (outpath, 'wb') as out:
- shutil.copyfileobj (record.raw_stream, out)
- else:
- print ('not overwriting {}'.format (outpath))
+ for record in ArchiveIterator (args.input):
+ headers = record.rec_headers
+ if record.rec_type != 'conversion' or \
+ headers['Content-Type'] != 'image/png' or \
+ 'X-Crocoite-Screenshot-Y-Offset' not in headers:
+ continue
+
+ url = URL (headers.get_header ('WARC-Target-URI'))
+ yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
+ outpath = f'{args.prefix}{i:05d}.png' if not args.one else args.prefix
+ if args.force or not os.path.exists (outpath):
+ json.dump ({'file': outpath, 'url': url, 'yoff': yoff},
+ sys.stdout, cls=StrJsonEncoder)
+ sys.stdout.write ('\n')
+ with open (outpath, 'wb') as out:
+ shutil.copyfileobj (record.raw_stream, out)
+ i += 1
+ else:
+ print (f'not overwriting {outpath}', file=sys.stderr)
+
+ if args.one:
+ break
+
+class Errata:
+ __slots__ = ('uuid', 'description', 'url', 'affects')
+
+ def __init__ (self, uuid, description, affects, url=None):
+ self.uuid = uuid
+ self.description = description
+ self.url = url
+ # slightly abusing setuptools’ version parsing/matching here
+ self.affects = list (parse_requirements(affects))
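+ # e.g. 'crocoite==1.0.0' or 'crocoite<1.0' become Requirement objects whose
+ # version specifiers are matched against parsed versions in __contains__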
+
+ def __contains__ (self, pkg):
+ """
+ Return True if the versions in pkg are affected by this errata
+
+ pkg must be a mapping from project_name to version
+ """
+ matchedAll = []
+ for a in self.affects:
+ haveVersion = pkg.get (a.project_name, None)
+ matchedAll.append (haveVersion is not None and haveVersion in a)
+ return all (matchedAll)
+
+ def __repr__ (self):
+ return f'{self.__class__.__name__}({self.uuid!r}, {self.description!r}, {self.affects!r})'
+
+ @property
+ def fixable (self):
+ return getattr (self, 'applyFix', None) is not None
+
+ def toDict (self):
+ return {'uuid': self.uuid,
+ 'description': self.description,
+ 'url': self.url,
+ 'affects': list (map (str, self.affects)),
+ 'fixable': self.fixable}
+
+class FixableErrata(Errata):
+ __slots__ = ('stats', )
+
+ def __init__ (self, uuid, description, affects, url=None):
+ super().__init__ (uuid, description, affects, url)
+ # statistics for fixable erratas
+ self.stats = dict (records=dict (fixed=0, processed=0))
+
+ def applyFix (self, record):
+ raise NotImplementedError () # pragma: no cover
+
+class ContentTypeErrata (FixableErrata):
+ def __init__ (self):
+ super().__init__ (
+ uuid='552c13dc-56e5-4539-9ad8-184ccae60930',
+ description='Content-Type header uses wrong argument name encoding instead of charset.',
+ url='https://github.com/PromyLOPh/crocoite/issues/19',
+ affects=['crocoite==1.0.0'])
+
+ def applyFix (self, record):
+ # XXX: this is ugly. warcio’s write_record replaces any Content-Type
+ # header we set on rec_headers with record.content_type, yet printing
+ # rec_headers still shows the stale header, not .content_type.
+ contentType = record.content_type
+ if '; encoding=' in contentType:
+ contentType = contentType.replace ('; encoding=', '; charset=')
+ record.content_type = contentType
+ self.stats['records']['fixed'] += 1
+
+ self.stats['records']['processed'] += 1
+ return record
+
+bugs = [
+ Errata (uuid='34a176b3-ad3d-430f-a082-68087f304572',
+ description='Generated by version < 1.0. No erratas are supported for this version.',
+ affects=['crocoite<1.0'],
+ ),
+ ContentTypeErrata (),
+ ]
+
+def makeReport (fd):
+ alreadyFixed = set ()
+
+ for record in ArchiveIterator (fd):
+ if record.rec_type == 'warcinfo':
+ try:
+ data = json.load (record.raw_stream)
+ # errata records precede everything else and indicate which
+ # ones were fixed already
+ if data['tool'] == 'crocoite-errata':
+ alreadyFixed.update (data['parameters']['errata'])
+ else:
+ haveVersions = dict ([(pkg['projectName'], parse_version(pkg['version'])) for pkg in data['software']['self']])
+ yield from filter (lambda b: haveVersions in b and b.uuid not in alreadyFixed, bugs)
+ except json.decoder.JSONDecodeError:
+ pass
+
+def errataCheck (args):
+ hasErrata = False
+ for item in makeReport (args.input):
+ json.dump (item.toDict (), sys.stdout)
+ sys.stdout.write ('\n')
+ sys.stdout.flush ()
+ hasErrata = True
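+ # the return value is used as the exit status: 1 if the input is
+ # affected by at least one errata, 0 otherwise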
+ return int (hasErrata)
+
+def errataFix (args):
+ errata = args.errata
+
+ with args.input as infd, args.output as outfd:
+ writer = WARCWriter (outfd, gzip=True)
+
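+ # record which errata were applied, so a later check can skip them
+ # (makeReport reads these crocoite-errata warcinfo records into alreadyFixed)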
+ warcinfo = {
+ 'software': getSoftwareInfo (),
+ 'tool': 'crocoite-errata', # not the name of the cli tool
+ 'parameters': {'errata': [errata.uuid]},
+ }
+ payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
+ record = writer.create_warc_record ('', 'warcinfo',
+ payload=payload,
+ warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
+ writer.write_record (record)
+
+ for record in ArchiveIterator (infd):
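+ # applyFix returns the (possibly rewritten) record and updates errata.stats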
+ fixedRecord = errata.applyFix (record)
+ writer.write_record (fixedRecord)
+ json.dump (errata.stats, sys.stdout)
+ sys.stdout.write ('\n')
+ sys.stdout.flush ()
+
+def uuidToErrata (uuid, onlyFixable=True):
+ try:
+ e = next (filter (lambda x: x.uuid == uuid, bugs))
+ except StopIteration:
+ raise argparse.ArgumentTypeError (f'Errata {uuid} does not exist')
+ if not isinstance (e, FixableErrata):
+ raise argparse.ArgumentTypeError (f'Errata {uuid} is not fixable')
+ return e
+
+def errata ():
+ parser = argparse.ArgumentParser(description=f'Show/fix erratas for WARCs generated by {__package__}.')
+ parser.add_argument('input', metavar='INPUT', type=argparse.FileType ('rb'), help='Input WARC')
+
+ # XXX: required argument does not work here?!
+ subparsers = parser.add_subparsers()
+
+ checkparser = subparsers.add_parser('check', help='Show erratas')
+ checkparser.set_defaults (func=errataCheck)
+
+ fixparser = subparsers.add_parser('fix', help='Fix erratas')
+ fixparser.add_argument('errata', metavar='UUID', type=uuidToErrata, help='Apply fix for this errata')
+ fixparser.add_argument('output', metavar='OUTPUT', type=argparse.FileType ('wb'), help='Output WARC')
+ fixparser.set_defaults (func=errataFix)
+
+ args = parser.parse_args()
+
+ if not hasattr (args, 'func'):
+ parser.print_usage ()
+ parser.exit ()
+
+ return args.func (args)