diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2016-05-05 13:57:00 +0200 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2016-05-05 13:57:00 +0200 | 
| commit | 7c6778e41101a339690eaae0501210b5b637d39f (patch) | |
| tree | b1324b9d321974d4771a03b5eef7c8c3aae7dacb | |
| download | videobot_enricheuroparl-7c6778e41101a339690eaae0501210b5b637d39f.tar.gz videobot_enricheuroparl-7c6778e41101a339690eaae0501210b5b637d39f.tar.bz2 videobot_enricheuroparl-7c6778e41101a339690eaae0501210b5b637d39f.zip | |
| -rw-r--r-- | COPYING | 19 | ||||
| -rwxr-xr-x | fixitems.sh | 11 | ||||
| -rw-r--r-- | videobot_enricheuroparl.ini | 4 | ||||
| -rwxr-xr-x | videobot_enricheuroparl.py | 169 | 
4 files changed, 203 insertions, 0 deletions
| @@ -0,0 +1,19 @@ +Copyright (c) 2016 videobot_enricheuroparl authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/fixitems.sh b/fixitems.sh new file mode 100755 index 0000000..653bb2d --- /dev/null +++ b/fixitems.sh @@ -0,0 +1,11 @@ +#!/bin/sh +# Example script fixing all videobot europarl items by adding descriptions, +# subtitles and raw json data. Modify as needed. + +ia search -i 'collection:archiveteam_videobot subject:plenary' | while read -r ident; do +	echo "$ident" +	./videobot_enricheuroparl.py "$ident" || exit 1 +	# ia metadata -m "description:`cat "${ident}.html"`" "$ident" +	# ia upload "$ident" "${ident}.json" "${ident}.vtt" +done + diff --git a/videobot_enricheuroparl.ini b/videobot_enricheuroparl.ini new file mode 100644 index 0000000..21d3bfe --- /dev/null +++ b/videobot_enricheuroparl.ini @@ -0,0 +1,4 @@ +[recording] +tz = US/Eastern +length = 3600 + diff --git a/videobot_enricheuroparl.py b/videobot_enricheuroparl.py new file mode 100755 index 0000000..19243d5 --- /dev/null +++ b/videobot_enricheuroparl.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 + +""" +Enrich videobot’s europarl streamdumps with metadata from europarltv, including +topic, speaker and timestamps. + +Expects a config file videobot_enricheuroparl.ini in the working directory. + +TODO: +Additional speaker info: http://www.europarl.europa.eu/ep-live/en/speaker/1911?date=27-04-2016 +""" + + +import json, math +from datetime import datetime, timedelta +from pytz import timezone +from dateutil.parser import parse +import requests + +brussels = timezone('Europe/Brussels') + +def fromTimestamp (ts): +    """ +    Convert timestamp to datetime. Dates are local time (i.e. utc+1) and in +    milliseconds. +    """ +    return datetime.fromtimestamp (ts/1000, tz=brussels) + +def overlap (a, b): +    """ +    Do intervals (a0, a1) and (b0, b1) overlap? +    """ +    def contains (a, b): +        return a[0] <= b[0] <= a[1] or a[0] <= b[1] <= a[1] +    return contains (a, b) or contains (b, a) + +def convertTimestamps (event): +    """ +    Convert timestamps to datetime +    """ +    chapters = event['chapters'] +    for c in chapters: +        c['startTimeCode'] = fromTimestamp (c['startTimeCode']) +        c['endTimeCode'] = fromTimestamp (c['endTimeCode']) + +        for s in c['speeches']: +            s['startTimeCode'] = fromTimestamp (s['startTimeCode']) +            s['endTimeCode'] = fromTimestamp (s['endTimeCode']) +    return event + +def filterRecording (event, recstart, recend): +    """ +    Remove event data not within recording range +    """ +    def inRecording (o): +        return overlap ((recstart, recend), (o['startTimeCode'], o['endTimeCode'])) +         +    event['chapters'] = sorted (filter (inRecording, event['chapters']), key = lambda x: x['startTimeCode']) +    for c in event['chapters']: +        c['speeches'] = sorted (filter (inRecording, c['speeches']), key = lambda x: x['startTimeCode']) +    return event + +def formatTimedelta (d, precision=0, minlen=3): +    """ +    Convert timedelta to string +    """ +    sec = round (d.total_seconds ()*1000) +    div = [(1000, '{:03d}', ''), (60, '{:02d}', '.'), (60, '{:02d}', ':'), (24, '{:02d}', ':')] +    res = "" +    c = 1 +    for factor, fmt, sep in div: +        sec, r = divmod (sec, factor) +        if c > precision: +            # no separator if nothing is there yet +            if not res: +                sep = '' +            res = fmt.format (r) + sep + res +        if sec < 1 and c >= minlen: +            break +        c += 1 +    return res + +def formatDescription (event, recstart, fd): +    """ +    Create IA description +    """ +    chapters = ['Recording of European Parliament plenary session on {humandate}.<br><a href="http://www.europarl.europa.eu/sides/getDoc.do?type=AGENDA&reference={date}&secondRef=SIT&language=EN">Agenda</a>, <a href="http://www.europarl.europa.eu/sides/getDoc.do?type=PV&reference={date}&secondRef=TOC&language=EN">Minutes</a>'.format (date=recstart.strftime ('%Y%m%d'), humandate=recstart.strftime ('%Y-%m-%d %H:%M %Z'))] +    for c in event['chapters']: +        start = c['startTimeCode'] +        end = c['endTimeCode'] +        if start < recstart: +            prefix = '{} (continued)'.format (c['title']['en']) +        else: +            prefix = '{}'.format (c['title']['en']) + +        speeches = [] +        for s in c['speeches']: +            start = s['startTimeCode'] +            end = s['endTimeCode'] +            continued = False +            if start < recstart: +                continued = True +                line = '{}'.format (formatTimedelta (timedelta (0), 1)) +            else: +                line = '{}'.format (formatTimedelta (start-recstart, 1)) +            line += ' <a href="http://www.europarl.europa.eu/meps/en/{}/mep.html">{} {}</a>'.format (s['externalId'], s['firstName'], s['lastName']) +            if continued: +                line += ' (continued)' +            speeches.append (line) +        chapters.append (prefix + '<br>' + '<br>'.join (speeches)) +    print ('<br><br>'.join (chapters), file=fd) + +def formatWebVTT (event, recstart, fd): +    """ +    Create WebVTT subtitles with speaker and topic +    """ +    def formatEvent (start, end, data): +        return '{} --> {}\n{}\n\n'.format (formatTimedelta (start), formatTimedelta (end), data) +         +    print ('WEBVTT\n', file=fd) +    for c in event['chapters']: +        # XXX: assuming the player can show multiple subtitles at the same time +        speeches = c['speeches'] +        start = c['startTimeCode'] +        if start < recstart: +            start = recstart +        relstart = start-recstart +        relend = c['endTimeCode']-recstart +        print (formatEvent (relstart, relend, 'Topic: {}'.format (c['title']['en'])), file=fd) + +        for s in c['speeches']: +            start = s['startTimeCode'] +            if start < recstart: +                start = recstart +            relstart = start-recstart +            relend = s['endTimeCode']-recstart +            print (formatEvent (relstart, relend, 'Speaker: {} {}'.format (s['firstName'], s['lastName'])), file=fd) + +if __name__ == '__main__': +    import argparse, sys, configparser + +    config = configparser.ConfigParser() +    config.read ('videobot_enricheuroparl.ini') + +    parser = argparse.ArgumentParser(description='Enrich videobot’s europarl items with metadata.') +    parser.add_argument ('identifier', help='Internet Archive identifier') +    args = parser.parse_args() + +    iaitem = requests.get ('https://archive.org/metadata/{}'.format (args.identifier)).json () +    if 'error' in iaitem: +        print (iaitem['error'], file=sys.stderr) +        sys.exit (1) + +    recordingtz = timezone (config['recording']['tz']) +    recordingstart = recordingtz.localize (parse (iaitem['metadata']['date'])).astimezone (brussels) +    recordingend = recordingstart+timedelta (seconds = int (config['recording']['length'])) + +    o = requests.get ('http://www.europarl.europa.eu/ep-live/en/json/plenary/video?date={}'.format (recordingstart.strftime ('%d-%m-%Y'))).json () +    # save raw data +    with open ('{}.json'.format (args.identifier), 'w') as fd: +        json.dump (o, fd) + +    event = filterRecording (convertTimestamps (o['event']), recordingstart, recordingend) +    #event = convertTimestamps (o['event']) +    with open ('{}.html'.format (args.identifier), 'w') as fd: +        formatDescription (event, recordingstart, fd) +    with open ('{}.vtt'.format (args.identifier), 'w') as fd: +        formatWebVTT (event, recordingstart, fd) + | 
