1
1
mirror of https://github.com/ytdl-org/youtube-dl synced 2024-11-14 21:59:29 +01:00
youtube-dl/youtube_dl/extractor/techtalks.py

80 lines
2.4 KiB
Python
Raw Normal View History

2014-09-04 14:48:29 +02:00
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
get_element_by_attribute,
clean_html,
)
class TechTalksIE(InfoExtractor):
    """Information extractor for talk pages on techtalks.tv.

    A talk page yields an RTMP stream for the presenter video and, when the
    page links one, a second RTMP stream for the slides video.
    """

    # The numeric talk id is the last path component; the trailing slash is
    # required by this pattern.
    _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
    _TEST = {
        'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
        'info_dict': {
            'id': '57758',
            'title': 'Learning Topic Models --- Going beyond SVD',
        },
        'playlist': [
            {
                'info_dict': {
                    'id': '57758',
                    'ext': 'flv',
                    'title': 'Learning Topic Models --- Going beyond SVD',
                },
            },
            {
                'info_dict': {
                    'id': '57758-slides',
                    'ext': 'flv',
                    'title': 'Learning Topic Models --- Going beyond SVD',
                },
            },
        ],
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Extract the presenter video (and slides video, if any) of a talk.

        Returns a single video info dict when the page has no slides link,
        otherwise a playlist dict whose entries are the presenter video
        followed by the slides video. Both entries share the same RTMP
        connection URL and differ only in their id and play path.
        """
        talk_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(url, talk_id)

        # Connection URL shared by every stream of the talk.
        rtmp_url = self._search_regex(
            r'netConnectionUrl: \'(.*?)\'', page, 'rtmp url')
        presenter_path = self._search_regex(
            r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
            page, 'presenter play path')
        title = clean_html(get_element_by_attribute('class', 'title', page))

        def stream_entry(entry_id, path):
            # One RTMP stream of this talk, identified by its play path.
            return {
                'id': entry_id,
                'title': title,
                'url': rtmp_url,
                'play_path': path,
                'ext': 'flv',
            }

        presenter = stream_entry(talk_id, presenter_path)

        # The slides link is optional; without it the talk is a single video.
        slides_match = re.search(r'<a class="slides" href=\'(.*?)\'', page)
        if slides_match is None:
            return presenter

        return {
            '_type': 'playlist',
            'id': talk_id,
            'title': title,
            'entries': [
                presenter,
                # The slides video
                stream_entry(talk_id + '-slides', slides_match.group(1)),
            ],
        }