extract-screenshot: Remove URL from filename

URLs can get quite long, overflowing the file name length limit. Instead, use sequential filenames and output metadata to stdout.
author: Lars-Dominik Braun <lars@6xq.net> 2018-12-31 19:29:49 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2018-12-31 19:29:49 +0100
commit: ad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (patch)
tree: cdbec18406ffa54dc3a7e2a4a0df4b6a9ca745c5
parent: cc1132a5b4677d089e024bcd0e16e1e817a3581c (diff)
download: crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.gz
crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.bz2
crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.zip
1 files changed, 19 insertions, 8 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 9c5d836..f4d506d 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -24,10 +24,14 @@ Misc tools
 
 import shutil, sys, os, logging, argparse, json
 from io import BytesIO
+
 from warcio.archiveiterator import ArchiveIterator
 from warcio.warcwriter import WARCWriter
+from yarl import URL
+
 from pkg_resources import parse_version, parse_requirements
-from .util import packageUrl, getSoftwareInfo
+
+from .util import packageUrl, getSoftwareInfo, StrJsonEncoder
 
 def mergeWarc (files, output):
     unique = 0
@@ -98,13 +102,17 @@ def extractScreenshot ():
     Extract page screenshots from a WARC generated by crocoite into files
     """
 
-    parser = argparse.ArgumentParser(description='Extract screenshots.')
-    parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files')
-    parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
+    parser = argparse.ArgumentParser(description='Extract screenshots from '
+            'WARC, write TSV-formatted info to stdout.')
+    parser.add_argument('-f', '--force', action='store_true',
+            help='Overwrite existing files')
+    parser.add_argument('input', type=argparse.FileType ('rb'),
+            help='Input WARC')
     parser.add_argument('prefix', help='Output file prefix')
 
     args = parser.parse_args()
 
+    i = 0
     with args.input:
         for record in ArchiveIterator (args.input):
             headers = record.rec_headers
@@ -113,15 +121,18 @@ def extractScreenshot ():
                     'X-Crocoite-Screenshot-Y-Offset' not in headers:
                 continue
 
-            urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
-            xoff = 0
+            url = URL (headers.get_header ('WARC-Target-URI'))
             yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
-            outpath = f'{args.prefix}-{urlSanitized}-{xoff}-{yoff}.png'
+            outpath = f'{args.prefix}{i:05d}.png'
             if args.force or not os.path.exists (outpath):
+                json.dump ({'file': outpath, 'url': url, 'yoff': yoff},
+                        sys.stdout, cls=StrJsonEncoder)
+                sys.stdout.write ('\n')
                 with open (outpath, 'wb') as out:
                     shutil.copyfileobj (record.raw_stream, out)
+                i += 1
             else:
-                print (f'not overwriting {outputh}')
+                print (f'not overwriting {outpath}', file=sys.stderr)
 
 class Errata:
     __slots__ = ('uuid', 'description', 'affects')
author	Lars-Dominik Braun <lars@6xq.net>	2018-12-31 19:29:49 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2018-12-31 19:29:49 +0100
commit	ad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (patch)
tree	cdbec18406ffa54dc3a7e2a4a0df4b6a9ca745c5
parent	cc1132a5b4677d089e024bcd0e16e1e817a3581c (diff)
download	crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.gz crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.bz2 crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.zip