#!/usr/bin/env python3
"""
Enrich videobot’s europarl streamdumps with metadata from europarltv, including
topic, speaker and timestamps.
Expects a config file videobot_enricheuroparl.ini in the working directory.
TODO:
Additional speaker info: http://www.europarl.europa.eu/ep-live/en/speaker/1911?date=27-04-2016
"""
import json, math
from datetime import datetime, timedelta
from pytz import timezone
from dateutil.parser import parse
import requests
# Timezone that event timestamps and the recording start are converted to.
brussels = timezone('Europe/Brussels')
def fromTimestamp(ts):
    """
    Turn a timestamp given in milliseconds into a timezone-aware datetime
    expressed in the module-level ``brussels`` timezone.
    """
    seconds = ts/1000
    return datetime.fromtimestamp(seconds, tz=brussels)
def overlap(a, b):
    """
    Tell whether the intervals ``a = (a0, a1)`` and ``b = (b0, b1)`` overlap.

    Both ends are inclusive: the intervals overlap as soon as an endpoint of
    one lies within the other, so intervals that merely touch count as well.
    """
    def inside(point, interval):
        # inclusive on both ends
        return interval[0] <= point <= interval[1]

    return (
        any(inside(b[i], a) for i in (0, 1))
        or any(inside(a[i], b) for i in (0, 1))
    )
def convertTimestamps(event):
    """
    Replace the start and end time codes of every chapter, and of every
    speech within it, by the datetime that ``fromTimestamp`` returns.

    The event is modified in place and also returned.
    """
    keys = ('startTimeCode', 'endTimeCode')

    def convert(entry):
        for key in keys:
            entry[key] = fromTimestamp(entry[key])

    for chapter in event['chapters']:
        convert(chapter)
        for speech in chapter['speeches']:
            convert(speech)
    return event
def filterRecording(event, recstart, recend):
    """
    Drop chapters and speeches that do not overlap the recording.

    Chapters whose time span does not overlap ``(recstart, recend)`` are
    removed; the speeches of every remaining chapter are filtered the same
    way. Both lists end up sorted by start time. The event is modified in
    place and also returned.
    """
    recording = (recstart, recend)

    def keepSorted(entries):
        kept = [
            e for e in entries
            if overlap(recording, (e['startTimeCode'], e['endTimeCode']))
        ]
        kept.sort(key=lambda e: e['startTimeCode'])
        return kept

    event['chapters'] = keepSorted(event['chapters'])
    for chapter in event['chapters']:
        chapter['speeches'] = keepSorted(chapter['speeches'])
    return event
def formatTimedelta(d, precision=0, minlen=3):
    """
    Convert a timedelta to a string such as ``01:02:03.004``.

    The value is rounded to milliseconds and split into four units, from
    least to most significant: milliseconds, seconds, minutes, hours.

    precision -- number of least significant units to leave out (0 keeps
                 milliseconds, 1 starts at seconds, ...)
    minlen    -- number of units (counted from milliseconds, including the
                 ones left out by precision) that are always processed;
                 more significant units are only added while there is
                 something left to show

    Hours are not wrapped at 24, so long durations keep their full hour
    count. Negative durations are formatted by magnitude and prefixed with
    ``-``.
    """
    millis = round(d.total_seconds() * 1000)
    sign = '-' if millis < 0 else ''
    remaining = abs(millis)
    # (divisor, format, separator towards the next less significant unit);
    # the most significant unit has no divisor and takes whatever is left
    units = [
        (1000, '{:03d}', ''),
        (60, '{:02d}', '.'),
        (60, '{:02d}', ':'),
        (None, '{:02d}', ':'),
    ]
    res = ''
    for position, (factor, fmt, sep) in enumerate(units, start=1):
        if factor is None:
            value, remaining = remaining, 0
        else:
            remaining, value = divmod(remaining, factor)
        if position > precision:
            # no separator if nothing is there yet
            res = fmt.format(value) + (sep if res else '') + res
        if remaining < 1 and position >= minlen:
            break
    return sign + res if res else res
def formatDescription(event, recstart, fd):
    """
    Write the Internet Archive item description to ``fd``.

    The description starts with a header naming the recording start, followed
    by one paragraph per chapter: the English chapter title and then one line
    per speech with its offset into the recording and the speaker's name.
    Chapters and speeches that began before ``recstart`` are marked with
    ``(continued)``; such speeches are listed at offset zero.

    event    -- event dict with converted (datetime) time codes
    recstart -- datetime the recording started at
    fd       -- writable text file object
    """
    paragraphs = [
        'Recording of European Parliament plenary session on {humandate}.\nAgenda, Minutes'.format(
            humandate=recstart.strftime('%Y-%m-%d %H:%M %Z'))
    ]
    for chapter in event['chapters']:
        if chapter['startTimeCode'] < recstart:
            prefix = '{} (continued)'.format(chapter['title']['en'])
        else:
            prefix = '{}'.format(chapter['title']['en'])

        speeches = []
        for speech in chapter['speeches']:
            start = speech['startTimeCode']
            continued = start < recstart
            # a speech already running when the recording began starts at 0
            offset = timedelta(0) if continued else start - recstart
            # show the full name; the format string used to have one
            # placeholder too few and silently dropped the last name
            line = '{} {} {}'.format(
                formatTimedelta(offset, 1), speech['firstName'], speech['lastName'])
            if continued:
                line += ' (continued)'
            speeches.append(line)
        paragraphs.append(prefix + '\n' + '\n'.join(speeches))
    print('\n'.join(paragraphs), file=fd)
def formatWebVTT(event, recstart, fd):
    """
    Write WebVTT subtitles naming the current topic and speaker to ``fd``.

    Every chapter yields a ``Topic: ...`` cue, followed by one
    ``Speaker: ...`` cue per speech. Cue times are relative to ``recstart``;
    a start time before the recording began is moved to the beginning.
    """
    cue = '{} --> {}\n{}\n\n'

    def span(entry):
        # start/end of an entry relative to the recording, start clamped to 0
        begin = entry['startTimeCode']
        begin = recstart if begin < recstart else begin
        return begin - recstart, entry['endTimeCode'] - recstart

    print('WEBVTT\n', file=fd)
    for chapter in event['chapters']:
        # XXX: assuming the player can show multiple subtitles at the same time
        speeches = chapter['speeches']
        relstart, relend = span(chapter)
        topic = 'Topic: {}'.format(chapter['title']['en'])
        print(cue.format(formatTimedelta(relstart), formatTimedelta(relend), topic), file=fd)
        for speech in speeches:
            relstart, relend = span(speech)
            speaker = 'Speaker: {} {}'.format(speech['firstName'], speech['lastName'])
            print(cue.format(formatTimedelta(relstart), formatTimedelta(relend), speaker), file=fd)
if __name__ == '__main__':
    # Script entry point: look up the recording's start date on the Internet
    # Archive, fetch that day's plenary metadata from europarltv and write
    # <identifier>.json (raw data), <identifier>.html (description) and
    # <identifier>.vtt (subtitles) into the working directory.
    import argparse
    import configparser
    import sys

    configfile = 'videobot_enricheuroparl.ini'
    # seconds to wait for a remote server before giving up instead of
    # hanging forever
    timeout = 60

    config = configparser.ConfigParser()
    # read() silently skips files it cannot open and returns the list of
    # files it parsed, so an empty result means there is no configuration
    if not config.read(configfile):
        print('Cannot read config file {}'.format(configfile), file=sys.stderr)
        sys.exit(1)

    parser = argparse.ArgumentParser(description='Enrich videobot’s europarl items with metadata.')
    parser.add_argument('identifier', help='Internet Archive identifier')
    args = parser.parse_args()

    iaitem = requests.get('https://archive.org/metadata/{}'.format(args.identifier), timeout=timeout).json()
    if 'error' in iaitem:
        print(iaitem['error'], file=sys.stderr)
        sys.exit(1)

    # the item's date is in the configured recording timezone; everything
    # else in this script works in Brussels time
    recordingtz = timezone(config['recording']['tz'])
    recordingstart = recordingtz.localize(parse(iaitem['metadata']['date'])).astimezone(brussels)
    recordingend = recordingstart + timedelta(seconds=int(config['recording']['length']))

    o = requests.get(
        'http://www.europarl.europa.eu/ep-live/en/json/plenary/video?date={}'.format(
            recordingstart.strftime('%d-%m-%Y')),
        timeout=timeout).json()

    # save raw data
    with open('{}.json'.format(args.identifier), 'w', encoding='utf-8') as fd:
        json.dump(o, fd)

    event = filterRecording(convertTimestamps(o['event']), recordingstart, recordingend)

    # Titles and speaker names contain non-ASCII characters and WebVTT files
    # must be UTF-8, so do not rely on the platform's default encoding.
    with open('{}.html'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatDescription(event, recordingstart, fd)

    with open('{}.vtt'.format(args.identifier), 'w', encoding='utf-8') as fd:
        formatWebVTT(event, recordingstart, fd)