path: root/
diff options
authorLars-Dominik Braun <>2016-05-05 13:57:00 +0200
committerLars-Dominik Braun <>2016-05-05 13:57:00 +0200
commit7c6778e41101a339690eaae0501210b5b637d39f (patch)
treeb1324b9d321974d4771a03b5eef7c8c3aae7dacb /
Initial importHEADmaster
Diffstat (limited to '')
1 files changed, 169 insertions, 0 deletions
diff --git a/ b/
new file mode 100755
index 0000000..19243d5
--- /dev/null
+++ b/
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
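"""
Enrich videobot’s europarl streamdumps with metadata from europarltv, including
topic, speaker and timestamps.

Expects a config file videobot_enricheuroparl.ini in the working directory,
providing a [recording] section with the keys tz (timezone name) and length
(recording length in seconds).

Additional speaker info: (reference link missing from this copy)
"""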
+import json, math
+from datetime import datetime, timedelta
+from pytz import timezone
+from dateutil.parser import parse
+import requests
# Timezone that all event and recording times are converted to.
brussels = timezone('Europe/Brussels')


def fromTimestamp(ts):
    """
    Turn a millisecond timestamp into a timezone-aware datetime.

    ts is divided by 1000 and passed to datetime.fromtimestamp, so it is
    interpreted as milliseconds since the Unix epoch; the result is
    expressed in the Europe/Brussels timezone.
    """
    seconds = ts / 1000
    return datetime.fromtimestamp(seconds, tz=brussels)
def overlap(a, b):
    """
    Do the closed intervals (a0, a1) and (b0, b1) overlap?

    Both arguments are indexable pairs whose endpoints are mutually
    comparable (numbers, datetimes, ...). Each interval must be ordered,
    i.e. a[0] <= a[1] and b[0] <= b[1]. Intervals that merely touch at an
    endpoint count as overlapping.

    Returns True if the intervals share at least one point, else False.
    """
    # Two ordered closed intervals intersect exactly when each of them
    # starts no later than the other one ends.
    return a[0] <= b[1] and b[0] <= a[1]
def convertTimestamps(event):
    """
    Replace raw time codes with datetime objects.

    The startTimeCode and endTimeCode entries of every chapter in
    event['chapters'], and of every speech in each chapter's 'speeches'
    list, are passed through fromTimestamp. The conversion happens in
    place; the same event object is returned for convenience.
    """
    timeKeys = ('startTimeCode', 'endTimeCode')
    for chapter in event['chapters']:
        for key in timeKeys:
            chapter[key] = fromTimestamp(chapter[key])
        for speech in chapter['speeches']:
            for key in timeKeys:
                speech[key] = fromTimestamp(speech[key])
    return event
def filterRecording(event, recstart, recend):
    """
    Drop chapters and speeches that lie outside the recording.

    An item is kept when its (startTimeCode, endTimeCode) interval
    overlaps (recstart, recend). The surviving chapters, and the surviving
    speeches of each chapter, are ordered by start time. event is modified
    in place and returned.
    """
    recording = (recstart, recend)

    def startOf(item):
        return item['startTimeCode']

    def keepOverlapping(items):
        kept = [
            item for item in items
            if overlap(recording, (item['startTimeCode'], item['endTimeCode']))
        ]
        kept.sort(key=startOf)
        return kept

    event['chapters'] = keepOverlapping(event['chapters'])
    for chapter in event['chapters']:
        chapter['speeches'] = keepOverlapping(chapter['speeches'])
    return event
def formatTimedelta(d, precision=0, minlen=3):
    """
    Convert a timedelta to a string of the form [HH:]MM:SS.mmm.

    d          -- the timedelta to format; it is rounded to whole
                  milliseconds first.
    precision  -- number of fields to omit, counted from the least
                  significant one (0 keeps milliseconds, 1 starts at
                  seconds, ...).
    minlen     -- minimum number of fields to process (milliseconds,
                  seconds, minutes, hours) before the loop may stop because
                  nothing is left to format.

    The hour field is not wrapped at 24, so durations of a day or more come
    out as e.g. '25:00:00.000' instead of silently losing whole days.
    Negative durations are formatted by magnitude and prefixed with '-'.
    """
    remaining = round(d.total_seconds() * 1000)
    sign = ''
    if remaining < 0:
        # divmod on negative numbers would wrap every field; format the
        # magnitude and restore the sign at the end.
        sign = '-'
        remaining = -remaining

    # (divisor, field format, separator towards the next smaller field);
    # a divisor of None marks the last field, which takes whatever is left.
    fields = (
        (1000, '{:03d}', ''),
        (60, '{:02d}', '.'),
        (60, '{:02d}', ':'),
        (None, '{:02d}', ':'),
    )
    result = ''
    for count, (factor, fmt, sep) in enumerate(fields, start=1):
        if factor is None:
            remaining, value = 0, remaining
        else:
            remaining, value = divmod(remaining, factor)
        if count > precision:
            # no separator if nothing is there yet
            result = fmt.format(value) + (sep if result else '') + result
        if remaining < 1 and count >= minlen:
            break
    return sign + result if result else result
def formatDescription(event, recstart, fd):
    """
    Create the IA (HTML) description of a recording and write it to fd.

    event     -- event dict whose time codes are already datetime objects;
                 reads event['chapters'], and per chapter 'startTimeCode',
                 'title'['en'] and 'speeches'; per speech 'startTimeCode',
                 'externalId', 'firstName' and 'lastName'.
    recstart  -- aware datetime at which the recording starts.
    fd        -- writable text file object receiving the description.

    The output is an intro paragraph followed by one paragraph per chapter,
    each listing its speeches with their offset into the recording.
    Chapters and speeches that began before recstart are marked
    '(continued)'; such speeches are listed at offset zero.

    Titles, names and ids are HTML-escaped before being inserted into the
    markup, since they are free text taken from the event data.
    """
    from html import escape

    # NOTE(review): the href values in the intro below and in the speaker
    # link further down carry no scheme/host in this copy of the file --
    # confirm the intended link targets.
    paragraphs = ['Recording of European Parliament plenary session on {humandate}.<br><a href="{date}&secondRef=SIT&language=EN">Agenda</a>, <a href="{date}&secondRef=TOC&language=EN">Minutes</a>'.format(date=recstart.strftime('%Y%m%d'), humandate=recstart.strftime('%Y-%m-%d %H:%M %Z'))]

    for chapter in event['chapters']:
        heading = escape(str(chapter['title']['en']), quote=False)
        if chapter['startTimeCode'] < recstart:
            heading += ' (continued)'

        lines = []
        for speech in chapter['speeches']:
            speechStart = speech['startTimeCode']
            continued = speechStart < recstart
            # a speech already running when the recording starts is listed
            # at the very beginning of the recording
            offset = timedelta(0) if continued else speechStart - recstart
            line = '{} <a href="{}/mep.html">{} {}</a>'.format(
                formatTimedelta(offset, 1),
                escape(str(speech['externalId'])),
                escape(str(speech['firstName']), quote=False),
                escape(str(speech['lastName']), quote=False),
            )
            if continued:
                line += ' (continued)'
            lines.append(line)
        paragraphs.append(heading + '<br>' + '<br>'.join(lines))

    print('<br><br>'.join(paragraphs), file=fd)
def formatWebVTT(event, recstart, fd):
    """
    Write WebVTT subtitles naming the current topic and speaker to fd.

    One cue is emitted per chapter ('Topic: ...') and one per speech
    ('Speaker: ...'). Cue times are offsets from recstart; a start time
    that lies before recstart is clamped to recstart.
    """
    def writeCue(begin, finish, text):
        if begin < recstart:
            begin = recstart
        cue = '{} --> {}\n{}\n\n'.format(
            formatTimedelta(begin - recstart),
            formatTimedelta(finish - recstart),
            text,
        )
        print(cue, file=fd)

    print('WEBVTT\n', file=fd)
    for chapter in event['chapters']:
        # XXX: assuming the player can show multiple subtitles at the same time
        speeches = chapter['speeches']
        writeCue(
            chapter['startTimeCode'],
            chapter['endTimeCode'],
            'Topic: {}'.format(chapter['title']['en']),
        )
        for speech in speeches:
            writeCue(
                speech['startTimeCode'],
                speech['endTimeCode'],
                'Speaker: {} {}'.format(speech['firstName'], speech['lastName']),
            )
if __name__ == '__main__':
    # Script entry point: fetch the item's metadata and the event data for
    # the recording day, then write <identifier>.json (raw event data),
    # <identifier>.html (description) and <identifier>.vtt (subtitles) into
    # the working directory.
    import argparse
    import configparser
    import sys

    parser = argparse.ArgumentParser(description='Enrich videobot’s europarl items with metadata.')
    parser.add_argument('identifier', help='Internet Archive identifier')
    args = parser.parse_args()

    # The config file has to be loaded explicitly; read() returns the list
    # of files it managed to parse, which is empty if the file is missing.
    configfile = 'videobot_enricheuroparl.ini'
    config = configparser.ConfigParser()
    if not config.read(configfile):
        print('Cannot read config file {}'.format(configfile), file=sys.stderr)
        sys.exit(1)

    # NOTE(review): the URL templates of both requests.get calls below
    # consist only of '{}' in this copy of the file -- the scheme/host/path
    # part seems to be missing and must be restored before this can run.
    iaitem = requests.get('{}'.format(args.identifier)).json()
    if 'error' in iaitem:
        print(iaitem['error'], file=sys.stderr)
        sys.exit(1)

    # The item's date is naive; interpret it in the configured recording
    # timezone and express it in Brussels time like the event data.
    recordingtz = timezone(config['recording']['tz'])
    recordingstart = recordingtz.localize(parse(iaitem['metadata']['date'])).astimezone(brussels)
    recordingend = recordingstart + timedelta(seconds=int(config['recording']['length']))

    o = requests.get('{}'.format(recordingstart.strftime('%d-%m-%Y'))).json()

    # save raw data
    with open('{}.json'.format(args.identifier), 'w', encoding='utf-8') as fd:
        json.dump(o, fd)

    event = filterRecording(convertTimestamps(o['event']), recordingstart, recordingend)

    # Speaker names and titles are not limited to ASCII and WebVTT files
    # must be UTF-8, so do not rely on the locale's default encoding.
    with open('{}.html'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatDescription(event, recordingstart, fd)
    with open('{}.vtt'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatWebVTT(event, recordingstart, fd)