summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-12-31 19:29:49 +0100
committerLars-Dominik Braun <lars@6xq.net>2018-12-31 19:29:49 +0100
commitad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (patch)
treecdbec18406ffa54dc3a7e2a4a0df4b6a9ca745c5
parentcc1132a5b4677d089e024bcd0e16e1e817a3581c (diff)
downloadcrocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.gz
crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.bz2
crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.zip
extract-screenshot: Remove URL from filename
URLs can get quite long, overflowing the file name length limit. Instead use sequential filenames and output metadata to stdout.
-rw-r--r--crocoite/tools.py27
1 files changed, 19 insertions, 8 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index 9c5d836..f4d506d 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -24,10 +24,14 @@ Misc tools
import shutil, sys, os, logging, argparse, json
from io import BytesIO
+
from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
+from yarl import URL
+
from pkg_resources import parse_version, parse_requirements
-from .util import packageUrl, getSoftwareInfo
+
+from .util import packageUrl, getSoftwareInfo, StrJsonEncoder
def mergeWarc (files, output):
unique = 0
@@ -98,13 +102,17 @@ def extractScreenshot ():
Extract page screenshots from a WARC generated by crocoite into files
"""
- parser = argparse.ArgumentParser(description='Extract screenshots.')
- parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files')
- parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC')
+ parser = argparse.ArgumentParser(description='Extract screenshots from '
+ 'WARC, write TSV-formatted info to stdout.')
+ parser.add_argument('-f', '--force', action='store_true',
+ help='Overwrite existing files')
+ parser.add_argument('input', type=argparse.FileType ('rb'),
+ help='Input WARC')
parser.add_argument('prefix', help='Output file prefix')
args = parser.parse_args()
+ i = 0
with args.input:
for record in ArchiveIterator (args.input):
headers = record.rec_headers
@@ -113,15 +121,18 @@ def extractScreenshot ():
'X-Crocoite-Screenshot-Y-Offset' not in headers:
continue
- urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
- xoff = 0
+ url = URL (headers.get_header ('WARC-Target-URI'))
yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
- outpath = f'{args.prefix}-{urlSanitized}-{xoff}-{yoff}.png'
+ outpath = f'{args.prefix}{i:05d}.png'
if args.force or not os.path.exists (outpath):
+ json.dump ({'file': outpath, 'url': url, 'yoff': yoff},
+ sys.stdout, cls=StrJsonEncoder)
+ sys.stdout.write ('\n')
with open (outpath, 'wb') as out:
shutil.copyfileobj (record.raw_stream, out)
+ i += 1
else:
- print (f'not overwriting {outputh}')
+ print (f'not overwriting {outpath}', file=sys.stderr)
class Errata:
__slots__ = ('uuid', 'description', 'affects')