From 7c6778e41101a339690eaae0501210b5b637d39f Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Thu, 5 May 2016 13:57:00 +0200
Subject: Initial import

---
 videobot_enricheuroparl.py | 169 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100755 videobot_enricheuroparl.py

(limited to 'videobot_enricheuroparl.py')
diff --git a/videobot_enricheuroparl.py b/videobot_enricheuroparl.py
new file mode 100755
index 0000000..19243d5
--- /dev/null
+++ b/videobot_enricheuroparl.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+
+"""
+Enrich videobot’s europarl streamdumps with metadata from europarltv, including
+topic, speaker and timestamps.
+
+Expects a config file videobot_enricheuroparl.ini in the working directory.
+
+TODO:
+Additional speaker info: http://www.europarl.europa.eu/ep-live/en/speaker/1911?date=27-04-2016
+"""
+
+
+import json, math
+from datetime import datetime, timedelta
+from pytz import timezone
+from dateutil.parser import parse
+import requests
+
+brussels = timezone ('Europe/Brussels') # zone all event and recording times are converted to
+
+def fromTimestamp (ts):
+    """
+    Convert a POSIX timestamp in milliseconds to an aware Brussels datetime.
+    """
+    seconds = ts/1000
+    return datetime.fromtimestamp (seconds, tz=brussels)
+
+def overlap (a, b):
+    """
+    Tell whether an endpoint of (a0, a1) lies in (b0, b1) or vice versa.
+    """
+    def holds (outer, inner):
+        return any (outer[0] <= p <= outer[1] for p in (inner[0], inner[1]))
+    return holds (a, b) or holds (b, a)
+
+def convertTimestamps (event):
+    """
+    Replace the start/end time codes of every chapter and of each of its
+    speeches by datetimes, in place. Returns the same event object.
+    """
+    def convert (item):
+        for key in ('startTimeCode', 'endTimeCode'):
+            item[key] = fromTimestamp (item[key])
+    for chapter in event['chapters']:
+        convert (chapter)
+        for speech in chapter['speeches']:
+            convert (speech)
+    return event
+
+def filterRecording (event, recstart, recend):
+    """
+    Keep only chapters and speeches overlapping the recording, sorted by start
+    """
+    def select (items):
+        hits = [o for o in items if overlap ((recstart, recend), (o['startTimeCode'], o['endTimeCode']))]
+        return sorted (hits, key = lambda o: o['startTimeCode'])
+    event['chapters'] = select (event['chapters'])
+    for chapter in event['chapters']:
+        chapter['speeches'] = select (chapter['speeches'])
+    return event
+
+def formatTimedelta (d, precision=0, minlen=3):
+    """
+    Format timedelta d as [HH:]MM:SS.mmm, as used for WebVTT cue timings.
+    precision drops that many of the smallest units (1 omits milliseconds);
+    minlen is the number of units always considered before leading zero
+    units are left out. Hours are not wrapped at 24, so spans of a day or
+    more keep their full hour count. Negative d is not supported.
+    """
+    rest = round (d.total_seconds ()*1000)
+    units = [(1000, '{:03d}', ''), (60, '{:02d}', '.'), (60, '{:02d}', ':'), (None, '{:02d}', ':')]
+    res = ""
+    for c, (factor, fmt, sep) in enumerate (units, start=1):
+        # the last unit (hours) has no factor and absorbs everything left
+        rest, r = divmod (rest, factor) if factor else (0, rest)
+        if c > precision:
+            res = fmt.format (r) + (sep if res else '') + res
+        if rest < 1 and c >= minlen:
+            break
+    return res
+
+def formatDescription (event, recstart, fd):
+    """
+    Write the Internet Archive item description (HTML) for event to fd.
+
+    A header with agenda and minutes links for recstart's date is followed
+    by one paragraph per chapter, listing each speech with its offset into
+    the recording. Anything that began before recstart is shown at offset
+    zero and marked "(continued)". Titles, names and ids come from a remote
+    API, so they are HTML-escaped before being embedded in the markup.
+    """
+    from html import escape
+
+    def mark (text, begin):
+        # flag items that were already running when the recording started
+        return text + ' (continued)' if begin < recstart else text
+
+    chapters = ['Recording of European Parliament plenary session on {humandate}.<br><a href="http://www.europarl.europa.eu/sides/getDoc.do?type=AGENDA&reference={date}&secondRef=SIT&language=EN">Agenda</a>, <a href="http://www.europarl.europa.eu/sides/getDoc.do?type=PV&reference={date}&secondRef=TOC&language=EN">Minutes</a>'.format (date=recstart.strftime ('%Y%m%d'), humandate=recstart.strftime ('%Y-%m-%d %H:%M %Z'))]
+    for c in event['chapters']:
+        speeches = []
+        for s in c['speeches']:
+            # offset into the recording, clamped to zero for earlier starts
+            offset = max (s['startTimeCode']-recstart, timedelta (0))
+            who = escape ('{} {}'.format (s['firstName'], s['lastName']))
+            link = '<a href="http://www.europarl.europa.eu/meps/en/{}/mep.html">{}</a>'.format (escape (str (s['externalId'])), who)
+            line = '{} {}'.format (formatTimedelta (offset, 1), link)
+            speeches.append (mark (line, s['startTimeCode']))
+        title = mark (escape (str (c['title']['en'])), c['startTimeCode'])
+        chapters.append (title + '<br>' + '<br>'.join (speeches))
+    print ('<br><br>'.join (chapters), file=fd)
+
+def formatWebVTT (event, recstart, fd):
+    """
+    Write WebVTT subtitles naming the current topic and speaker to fd.
+
+    Each chapter yields a "Topic: ..." cue and each of its speeches a
+    "Speaker: ..." cue. Cue times are relative to recstart; a cue for
+    something that began earlier starts at zero. Cue text has whitespace
+    runs collapsed and &, < and > escaped, since raw "&", "<", "-->" or
+    line breaks would corrupt the cue.
+    """
+    def cue (item, label):
+        # '&' must be replaced first or the other entities get re-escaped
+        text = ' '.join (label.split ()).replace ('&', '&amp;')
+        text = text.replace ('<', '&lt;').replace ('>', '&gt;')
+        relstart = max (item['startTimeCode'], recstart)-recstart
+        relend = item['endTimeCode']-recstart
+        return '{} --> {}\n{}\n\n'.format (formatTimedelta (relstart), formatTimedelta (relend), text)
+
+    print ('WEBVTT\n', file=fd)
+    for c in event['chapters']:
+        # XXX: assuming the player can show multiple subtitles at the same time
+        print (cue (c, 'Topic: {}'.format (c['title']['en'])), file=fd)
+
+        for s in c['speeches']:
+            print (cue (s, 'Speaker: {} {}'.format (s['firstName'], s['lastName'])), file=fd)
+
+if __name__ == '__main__':
+    import argparse, sys, configparser
+
+    config = configparser.ConfigParser()
+    config.read ('videobot_enricheuroparl.ini')
+
+    parser = argparse.ArgumentParser(description='Enrich videobot’s europarl items with metadata.')
+    parser.add_argument ('identifier', help='Internet Archive identifier')
+    args = parser.parse_args()
+
+    iaitem = requests.get ('https://archive.org/metadata/{}'.format (args.identifier), timeout=60).json ()
+    if 'error' in iaitem:
+        print (iaitem['error'], file=sys.stderr)
+        sys.exit (1)
+
+    recordingtz = timezone (config['recording']['tz'])
+    recordingstart = recordingtz.localize (parse (iaitem['metadata']['date'])).astimezone (brussels)
+    recordingend = recordingstart+timedelta (seconds = int (config['recording']['length']))
+
+    o = requests.get ('http://www.europarl.europa.eu/ep-live/en/json/plenary/video?date={}'.format (recordingstart.strftime ('%d-%m-%Y')), timeout=60).json ()
+    # save raw data
+    with open ('{}.json'.format (args.identifier), 'w', encoding='utf-8') as fd:
+        json.dump (o, fd)
+
+    event = filterRecording (convertTimestamps (o['event']), recordingstart, recordingend)
+    # names and titles are not ASCII-only and WebVTT must be UTF-8: do not rely on the locale encoding
+    with open ('{}.html'.format (args.identifier), 'w', encoding='utf-8') as fd:
+        formatDescription (event, recordingstart, fd)
+    with open ('{}.vtt'.format (args.identifier), 'w', encoding='utf-8') as fd:
+        formatWebVTT (event, recordingstart, fd)
+
-- 
cgit v1.2.3