diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2018-12-31 19:29:49 +0100 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2018-12-31 19:29:49 +0100 | 
| commit | ad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (patch) | |
| tree | cdbec18406ffa54dc3a7e2a4a0df4b6a9ca745c5 | |
| parent | cc1132a5b4677d089e024bcd0e16e1e817a3581c (diff) | |
| download | crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.gz crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.bz2 crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.zip | |
extract-screenshot: Remove URL from filename
URLs can get quite long, overflowing the file name length limit.
Instead use sequential filenames and output metadata to stdout.
| -rw-r--r-- | crocoite/tools.py | 27 | 
1 files changed, 19 insertions, 8 deletions
```diff
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 9c5d836..f4d506d 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -24,10 +24,14 @@ Misc tools
 
 import shutil, sys, os, logging, argparse, json
 from io import BytesIO
+
 from warcio.archiveiterator import ArchiveIterator
 from warcio.warcwriter import WARCWriter
+from yarl import URL
+
 from pkg_resources import parse_version, parse_requirements
-from .util import packageUrl, getSoftwareInfo
+
+from .util import packageUrl, getSoftwareInfo, StrJsonEncoder
 
 def mergeWarc (files, output):
     unique = 0
@@ -98,13 +102,17 @@ def extractScreenshot ():
     Extract page screenshots from a WARC generated by crocoite into files
     """
 
-    parser = argparse.ArgumentParser(description='Extract screenshots.')
-    parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files')
-    parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
+    parser = argparse.ArgumentParser(description='Extract screenshots from '
+            'WARC, write TSV-formatted info to stdout.')
+    parser.add_argument('-f', '--force', action='store_true',
+            help='Overwrite existing files')
+    parser.add_argument('input', type=argparse.FileType ('rb'),
+            help='Input WARC')
     parser.add_argument('prefix', help='Output file prefix')
 
     args = parser.parse_args()
 
+    i = 0
     with args.input:
         for record in ArchiveIterator (args.input):
             headers = record.rec_headers
@@ -113,15 +121,18 @@ def extractScreenshot ():
                     'X-Crocoite-Screenshot-Y-Offset' not in headers:
                 continue
 
-            urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
-            xoff = 0
+            url = URL (headers.get_header ('WARC-Target-URI'))
             yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
-            outpath = f'{args.prefix}-{urlSanitized}-{xoff}-{yoff}.png'
+            outpath = f'{args.prefix}{i:05d}.png'
             if args.force or not os.path.exists (outpath):
+                json.dump ({'file': outpath, 'url': url, 'yoff': yoff},
+                        sys.stdout, cls=StrJsonEncoder)
+                sys.stdout.write ('\n')
                 with open (outpath, 'wb') as out:
                     shutil.copyfileobj (record.raw_stream, out)
+                i += 1
             else:
-                print (f'not overwriting {outputh}')
+                print (f'not overwriting {outpath}', file=sys.stderr)
 
 class Errata:
     __slots__ = ('uuid', 'description', 'affects')
```
