#!/usr/bin/env python3
"""
Enrich videobot’s europarl streamdumps with metadata from europarltv, including
topic, speaker and timestamps.

Expects a config file videobot_enricheuroparl.ini in the working directory. It
must provide a [recording] section with the keys ``tz`` (timezone name the
Internet Archive item date is expressed in) and ``length`` (recording length in
seconds).

TODO: Additional speaker info:
http://www.europarl.europa.eu/ep-live/en/speaker/1911?date=27-04-2016
"""

import argparse
import configparser
import json
import math
import sys
from datetime import datetime, timedelta

from dateutil.parser import parse
from pytz import timezone
import requests

brussels = timezone('Europe/Brussels')

# Name of the configuration file, looked up in the working directory.
CONFIG_FILE = 'videobot_enricheuroparl.ini'
# Seconds to wait for a HTTP response before giving up instead of hanging.
REQUEST_TIMEOUT = 60
# Keys holding the start/end timestamps of chapters and speeches.
_TIME_KEYS = ('startTimeCode', 'endTimeCode')


def fromTimestamp(ts):
    """
    Convert timestamp to datetime.

    ts is given in milliseconds; the result is a timezone-aware datetime in
    the Europe/Brussels timezone.

    NOTE(review): the original comment says upstream dates are local time
    (i.e. utc+1), while datetime.fromtimestamp() interprets its argument as a
    POSIX timestamp -- confirm against the data source.
    """
    return datetime.fromtimestamp(ts / 1000, tz=brussels)


def overlap(a, b):
    """
    Do intervals (a0, a1) and (b0, b1) overlap?

    Intervals are treated as closed, i.e. touching end points count as
    overlapping.
    """
    def contains(outer, inner):
        # True if either end point of inner lies within outer
        return (outer[0] <= inner[0] <= outer[1]
                or outer[0] <= inner[1] <= outer[1])

    # Check both directions to also catch one interval enclosing the other
    return contains(a, b) or contains(b, a)


def convertTimestamps(event):
    """
    Convert timestamps to datetime

    Replaces startTimeCode/endTimeCode of every chapter and every speech
    within a chapter in place and returns the same event object.
    """
    for chapter in event['chapters']:
        for key in _TIME_KEYS:
            chapter[key] = fromTimestamp(chapter[key])
        for speech in chapter['speeches']:
            for key in _TIME_KEYS:
                speech[key] = fromTimestamp(speech[key])
    return event


def filterRecording(event, recstart, recend):
    """
    Remove event data not within recording range

    Keeps only chapters, and speeches of the remaining chapters, that overlap
    (recstart, recend). Both lists are sorted by start time. The event is
    modified in place and returned.
    """
    recording = (recstart, recend)

    def inRecording(o):
        return overlap(recording, (o['startTimeCode'], o['endTimeCode']))

    def select(items):
        return sorted(filter(inRecording, items),
                      key=lambda x: x['startTimeCode'])

    event['chapters'] = select(event['chapters'])
    for chapter in event['chapters']:
        chapter['speeches'] = select(chapter['speeches'])
    return event


def formatTimedelta(d, precision=0, minlen=3):
    """
    Convert timedelta to string

    The result has the form [[hh:]mm:]ss.mmm. Units are numbered from the
    smallest one: 1 = milliseconds, 2 = seconds, 3 = minutes, 4 = hours.

    precision -- number of smallest units to leave out (1 drops milliseconds)
    minlen -- units up to this number are always processed, even if zero;
        larger units are only emitted while there is something left to show
    """
    # (size of the unit expressed in the next larger unit, format, separator
    # to the next smaller unit). Hours are the largest unit and take whatever
    # remains, so durations of 24 hours and more do not wrap around.
    units = [
        (1000, '{:03d}', ''),
        (60, '{:02d}', '.'),
        (60, '{:02d}', ':'),
        (None, '{:02d}', ':'),
        ]
    remaining = round(d.total_seconds() * 1000)
    res = ''
    for position, (factor, fmt, sep) in enumerate(units, start=1):
        if factor is None:
            remaining, value = 0, remaining
        else:
            remaining, value = divmod(remaining, factor)
        if position > precision:
            # no separator if nothing is there yet
            res = fmt.format(value) + (sep if res else '') + res
        if remaining < 1 and position >= minlen:
            break
    return res


def formatDescription(event, recstart, fd):
    """
    Create IA description

    Writes an introduction followed by one paragraph per chapter to fd. Each
    chapter paragraph consists of the English chapter title and one line per
    speech with its offset into the recording and the speaker's name.
    Chapters and speeches that started before recstart are marked
    "(continued)"; such speeches are listed at offset zero.

    event -- event as returned by filterRecording
    recstart -- start of the recording (timezone-aware datetime)
    fd -- text file object to write to
    """
    # NOTE(review): `date` is passed to format() but not referenced by the
    # template; presumably it belonged to link markup around "Agenda" and
    # "Minutes" that is no longer part of this string -- confirm.
    intro = ('Recording of European Parliament plenary session on '
             '{humandate}.\nAgenda, Minutes')
    chapters = [intro.format(
        date=recstart.strftime('%Y%m%d'),
        humandate=recstart.strftime('%Y-%m-%d %H:%M %Z'))]

    for c in event['chapters']:
        prefix = '{}'.format(c['title']['en'])
        if c['startTimeCode'] < recstart:
            prefix += ' (continued)'

        speeches = []
        for s in c['speeches']:
            continued = s['startTimeCode'] < recstart
            # speeches already running at recstart are listed at offset zero
            offset = timedelta(0) if continued else s['startTimeCode'] - recstart
            # The template used to have only two placeholders for
            # (externalId, firstName, lastName), which printed the id in
            # place of the name and silently dropped the last name.
            # NOTE(review): externalId was presumably meant for a link to the
            # speaker's page -- confirm before re-adding it.
            line = '{} {} {}'.format(formatTimedelta(offset, 1),
                                     s['firstName'], s['lastName'])
            if continued:
                line += ' (continued)'
            speeches.append(line)
        chapters.append(prefix + '\n' + '\n'.join(speeches))

    print('\n\n'.join(chapters), file=fd)


def formatWebVTT(event, recstart, fd):
    """
    Create WebVTT subtitles with speaker and topic

    Writes one "Topic: ..." cue per chapter, followed by one "Speaker: ..."
    cue for each of its speeches. Cue times are relative to recstart; start
    times before recstart are clamped to the start of the recording.

    event -- event as returned by filterRecording
    recstart -- start of the recording (timezone-aware datetime)
    fd -- text file object to write to
    """
    def formatEvent(start, end, data):
        return '{} --> {}\n{}\n\n'.format(
            formatTimedelta(start), formatTimedelta(end), data)

    def formatCue(o, data):
        # a cue cannot start before the recording does
        start = max(o['startTimeCode'], recstart)
        return formatEvent(start - recstart, o['endTimeCode'] - recstart, data)

    print('WEBVTT\n', file=fd)
    for c in event['chapters']:
        # XXX: assuming the player can show multiple subtitles at the same time
        print(formatCue(c, 'Topic: {}'.format(c['title']['en'])), file=fd)
        for s in c['speeches']:
            speaker = 'Speaker: {} {}'.format(s['firstName'], s['lastName'])
            print(formatCue(s, speaker), file=fd)


def main():
    """
    Fetch metadata for an Internet Archive item and write the enriched files.

    Writes <identifier>.json (raw europarl data), <identifier>.html
    (description) and <identifier>.vtt (subtitles) to the working directory.
    Exits with status 1 if the config file cannot be read or if the Internet
    Archive reports an error for the item.
    """
    parser = argparse.ArgumentParser(
        description='Enrich videobot’s europarl items with metadata.')
    parser.add_argument('identifier', help='Internet Archive identifier')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    # read() silently skips missing files; fail here with a clear message
    # rather than with a KeyError further down
    if not config.read(CONFIG_FILE):
        print('Cannot read config file {}'.format(CONFIG_FILE), file=sys.stderr)
        sys.exit(1)

    iaitem = requests.get(
        'https://archive.org/metadata/{}'.format(args.identifier),
        timeout=REQUEST_TIMEOUT).json()
    if 'error' in iaitem:
        print(iaitem['error'], file=sys.stderr)
        sys.exit(1)

    # The item's date is interpreted in the configured timezone, then
    # converted to Brussels time, which the event timestamps are converted to
    recordingtz = timezone(config['recording']['tz'])
    recordingstart = recordingtz.localize(
        parse(iaitem['metadata']['date'])).astimezone(brussels)
    recordingend = recordingstart + timedelta(
        seconds=int(config['recording']['length']))

    o = requests.get(
        'http://www.europarl.europa.eu/ep-live/en/json/plenary/video?date={}'.format(
            recordingstart.strftime('%d-%m-%Y')),
        timeout=REQUEST_TIMEOUT).json()

    # save raw data
    with open('{}.json'.format(args.identifier), 'w', encoding='utf-8') as fd:
        json.dump(o, fd)

    event = filterRecording(
        convertTimestamps(o['event']), recordingstart, recordingend)

    # Speaker names and titles may contain non-ASCII characters, so do not
    # depend on the locale's default encoding. WebVTT files have to be UTF-8.
    with open('{}.html'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatDescription(event, recordingstart, fd)
    with open('{}.vtt'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatWebVTT(event, recordingstart, fd)


if __name__ == '__main__':
    main()