#!/usr/bin/env python3

# Origin: commit 7c6778e41101a339690eaae0501210b5b637d39f ("Initial import"),
# Lars-Dominik Braun, 2016-05-05, file videobot_enricheuroparl.py.

"""
Enrich videobot’s europarl streamdumps with metadata from europarltv, including
topic, speaker and timestamps.

Expects a config file videobot_enricheuroparl.ini in the working directory.

TODO:
Additional speaker info: http://www.europarl.europa.eu/ep-live/en/speaker/1911?date=27-04-2016
"""


import json
import math
from datetime import datetime, timedelta

from dateutil.parser import parse
from pytz import timezone

brussels = timezone('Europe/Brussels')


def fromTimestamp(ts):
    """
    Convert a millisecond timestamp to an aware datetime in Europe/Brussels.

    ``ts`` is divided by 1000 and handed to ``datetime.fromtimestamp`` together
    with the ``brussels`` time zone.

    NOTE(review): the original comment described the incoming values as
    "local time (i.e. utc+1)"; ``datetime.fromtimestamp`` interprets them as
    seconds since the epoch. Confirm against the europarl API.
    """
    return datetime.fromtimestamp(ts / 1000, tz=brussels)


def overlap(a, b):
    """
    Do intervals (a0, a1) and (b0, b1) overlap?

    Both intervals are treated as closed: touching end points count as an
    overlap. Any mutually comparable values work (numbers, datetimes).
    """
    def contains(outer, inner):
        # True when either end point of ``inner`` lies within ``outer``.
        return (outer[0] <= inner[0] <= outer[1]
                or outer[0] <= inner[1] <= outer[1])

    # Checked in both directions so that one interval fully enclosing the
    # other is detected as well.
    return contains(a, b) or contains(b, a)


def convertTimestamps(event):
    """
    Convert the start/end time codes of all chapters and speeches to datetime.

    The event is modified in place and also returned for chaining.
    """
    for c in event['chapters']:
        c['startTimeCode'] = fromTimestamp(c['startTimeCode'])
        c['endTimeCode'] = fromTimestamp(c['endTimeCode'])

        for s in c['speeches']:
            s['startTimeCode'] = fromTimestamp(s['startTimeCode'])
            s['endTimeCode'] = fromTimestamp(s['endTimeCode'])
    return event


def filterRecording(event, recstart, recend):
    """
    Remove event data not within recording range.

    Chapters and speeches that do not overlap (recstart, recend) are dropped,
    the remaining ones are sorted by start time. The event is modified in
    place and also returned.
    """
    def inRecording(o):
        return overlap((recstart, recend),
                       (o['startTimeCode'], o['endTimeCode']))

    def byStart(o):
        return o['startTimeCode']

    event['chapters'] = sorted(filter(inRecording, event['chapters']),
                               key=byStart)
    for c in event['chapters']:
        c['speeches'] = sorted(filter(inRecording, c['speeches']), key=byStart)
    return event


def formatTimedelta(d, precision=0, minlen=3):
    """
    Convert timedelta to string.

    The duration is split into milliseconds, seconds, minutes and hours
    (units 1..4). Units with an index not greater than ``precision`` are
    omitted from the output; at least ``minlen`` units are processed before
    the loop may stop once nothing larger is left. With the defaults the
    result looks like ``MM:SS.mmm`` (or ``HH:MM:SS.mmm`` for an hour or more);
    ``precision=1`` yields ``MM:SS``. Whole days are not represented.
    """
    remaining = round(d.total_seconds() * 1000)
    # (size of the unit in terms of the previous one, format, separator that
    # follows this unit in the output)
    units = [
        (1000, '{:03d}', ''),
        (60, '{:02d}', '.'),
        (60, '{:02d}', ':'),
        (24, '{:02d}', ':'),
    ]
    res = ''
    for index, (factor, fmt, sep) in enumerate(units, start=1):
        remaining, r = divmod(remaining, factor)
        if index > precision:
            # no separator if nothing is there yet
            if not res:
                sep = ''
            res = fmt.format(r) + sep + res
        if remaining < 1 and index >= minlen:
            break
    return res


def formatDescription(event, recstart, fd):
    """
    Create IA description.

    Writes a header followed by one paragraph per chapter to ``fd``. Each
    paragraph consists of the English chapter title and one line per speech
    with its offset into the recording and the speaker's name. Items that
    started before ``recstart`` are marked "(continued)" and listed at
    offset zero.

    NOTE(review): in the reviewed copy of this file the separators inside the
    string literals appear as raw line breaks and the header passes an unused
    ``date`` argument; markup (e.g. line-break tags and links) may have been
    lost in extraction. Confirm against the repository before relying on the
    exact output.
    """
    header = (
        'Recording of European Parliament plenary session on {humandate}.\n'
        'Agenda, Minutes'
    ).format(humandate=recstart.strftime('%Y-%m-%d %H:%M %Z'))
    chapters = [header]

    for c in event['chapters']:
        title = c['title']['en']
        if c['startTimeCode'] < recstart:
            prefix = '{} (continued)'.format(title)
        else:
            prefix = '{}'.format(title)

        speeches = []
        for s in c['speeches']:
            continued = s['startTimeCode'] < recstart
            # Speeches already running when the recording started are shown
            # at the very beginning of the recording.
            offset = timedelta(0) if continued else s['startTimeCode'] - recstart
            # Previously three arguments (externalId, firstName, lastName)
            # were passed to two placeholders, which printed the id and
            # dropped the last name.
            line = '{} {} {}'.format(formatTimedelta(offset, 1),
                                     s['firstName'], s['lastName'])
            if continued:
                line += ' (continued)'
            speeches.append(line)
        chapters.append(prefix + '\n' + '\n'.join(speeches))

    print('\n\n'.join(chapters), file=fd)


def formatWebVTT(event, recstart, fd):
    """
    Create WebVTT subtitles with speaker and topic.

    One cue is written per chapter ("Topic: ...") and per speech
    ("Speaker: ..."), timed relative to ``recstart``. Start times before the
    recording start are clamped to zero.
    """
    def formatEvent(start, end, data):
        return '{} --> {}\n{}\n\n'.format(formatTimedelta(start),
                                          formatTimedelta(end), data)

    def relativeRange(o):
        # Cue times relative to the recording; never start before zero.
        start = max(o['startTimeCode'], recstart)
        return start - recstart, o['endTimeCode'] - recstart

    print('WEBVTT\n', file=fd)
    for c in event['chapters']:
        # XXX: assuming the player can show multiple subtitles at the same time
        relstart, relend = relativeRange(c)
        print(formatEvent(relstart, relend,
                          'Topic: {}'.format(c['title']['en'])), file=fd)

        for s in c['speeches']:
            relstart, relend = relativeRange(s)
            print(formatEvent(relstart, relend,
                              'Speaker: {} {}'.format(s['firstName'],
                                                      s['lastName'])), file=fd)


def main():
    """
    Fetch metadata for an Internet Archive item and write the enrichment files.

    Reads videobot_enricheuroparl.ini (section ``recording``, keys ``tz`` and
    ``length``), queries archive.org and europarl, and writes
    ``<identifier>.json``, ``<identifier>.html`` and ``<identifier>.vtt`` to
    the working directory. Exits with status 1 if the config file is missing
    or archive.org reports an error.
    """
    import argparse
    import configparser
    import sys

    # Only needed when run as a script; keeps the helpers importable without
    # the HTTP client installed.
    import requests

    configfile = 'videobot_enricheuroparl.ini'
    httptimeout = 60  # seconds; avoid hanging forever on a stalled server

    parser = argparse.ArgumentParser(
        description='Enrich videobot’s europarl items with metadata.')
    parser.add_argument('identifier', help='Internet Archive identifier')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips missing files; fail with a clear
    # message instead of a KeyError further down.
    if not config.read(configfile):
        print('cannot read config file {}'.format(configfile), file=sys.stderr)
        sys.exit(1)

    iaitem = requests.get(
        'https://archive.org/metadata/{}'.format(args.identifier),
        timeout=httptimeout).json()
    if 'error' in iaitem:
        print(iaitem['error'], file=sys.stderr)
        sys.exit(1)

    recordingtz = timezone(config['recording']['tz'])
    recordingstart = recordingtz.localize(
        parse(iaitem['metadata']['date'])).astimezone(brussels)
    recordingend = recordingstart + timedelta(
        seconds=int(config['recording']['length']))

    response = requests.get(
        'http://www.europarl.europa.eu/ep-live/en/json/plenary/video?date={}'.format(
            recordingstart.strftime('%d-%m-%Y')),
        timeout=httptimeout)
    response.raise_for_status()
    o = response.json()

    # save raw data
    with open('{}.json'.format(args.identifier), 'w', encoding='utf-8') as fd:
        json.dump(o, fd)

    event = filterRecording(convertTimestamps(o['event']),
                            recordingstart, recordingend)
    # Explicit UTF-8: speaker names contain non-ASCII characters and WebVTT
    # files are required to be UTF-8 encoded.
    with open('{}.html'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatDescription(event, recordingstart, fd)
    with open('{}.vtt'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatWebVTT(event, recordingstart, fd)


if __name__ == '__main__':
    main()