diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2019-06-27 21:49:57 +0200 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2019-06-27 22:03:02 +0200 | 
| commit | 76811bd3f0b3fc8688939e31fdab2c71c89cc75b (patch) | |
| tree | b3ecc9256377d761a658c4401cdd2f6fac3d2672 | |
| parent | 94be61a303127335eb39b26302392a0f237c3ac1 (diff) | |
| download | crocoite-76811bd3f0b3fc8688939e31fdab2c71c89cc75b.tar.gz crocoite-76811bd3f0b3fc8688939e31fdab2c71c89cc75b.tar.bz2 crocoite-76811bd3f0b3fc8688939e31fdab2c71c89cc75b.zip | |
merge: Dump machine-readable info
| -rw-r--r-- | crocoite/tools.py | 20 | 
1 file changed, 18 insertions, 2 deletions
| diff --git a/crocoite/tools.py b/crocoite/tools.py index f4d506d..0beb884 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -34,8 +34,12 @@ from pkg_resources import parse_version, parse_requirements  from .util import packageUrl, getSoftwareInfo, StrJsonEncoder  def mergeWarc (files, output): +    # stats      unique = 0      revisit = 0 +    uniqueLength = 0 +    revisitLength = 0 +      payloadMap = {}      writer = WARCWriter (output, gzip=True) @@ -65,11 +69,13 @@ def mergeWarc (files, output):                      headers = record.rec_headers                      rid = headers.get_header('WARC-Record-ID')                      csum = headers.get_header('WARC-Payload-Digest') +                    length = int (headers.get_header ('Content-Length'))                      dup = payloadMap.get (csum, None)                      if dup is None:                          payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),                                  'id': rid, 'date': headers.get_header('WARC-Date')}                          unique += 1 +                        uniqueLength += length                      else:                          logging.debug (f'Record {rid} is duplicate of {dup["id"]}')                          # Payload may be identical, but HTTP headers are @@ -81,10 +87,20 @@ def mergeWarc (files, output):                          record.rec_headers.add_header ('WARC-Truncated', 'length')                          record.rec_headers.add_header ('WARC-Refers-To', dup['id'])                          revisit += 1 +                        revisitLength += length                  else:                      unique += 1                  writer.write_record (record) -    logging.info (f'Wrote {unique} unique records, {revisit} revisits') +    json.dump (dict ( +            unique=dict (records=unique, bytes=uniqueLength), +            revisit=dict (records=revisit, bytes=revisitLength), +            ratio=dict ( +                    
records=unique/(unique+revisit), +                    bytes=uniqueLength/(uniqueLength+revisitLength) +                    ), +            ), +            sys.stdout, +            cls=StrJsonEncoder)  def mergeWarcCli():      parser = argparse.ArgumentParser(description='Merge WARCs, reads filenames from stdin.') @@ -103,7 +119,7 @@ def extractScreenshot ():      """      parser = argparse.ArgumentParser(description='Extract screenshots from ' -            'WARC, write TSV-formatted info to stdout.') +            'WARC, write JSON info to stdout.')      parser.add_argument('-f', '--force', action='store_true',              help='Overwrite existing files')      parser.add_argument('input', type=argparse.FileType ('rb'), | 
