[jamendo] Improve and extract more metadata (closes #11836)

This commit is contained in:
Sergey M․ 2017-01-26 23:25:40 +07:00
parent 15846398ca
commit 3cbecdd111
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D

View File

@ -5,9 +5,27 @@
from ..compat import compat_urlparse from ..compat import compat_urlparse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_duration
class JamendoIE(InfoExtractor): class JamendoBaseIE(InfoExtractor):
def _extract_meta(self, webpage, fatal=True):
    """Derive the title and its two ' - '-separated parts from a page.

    The raw title is taken from ``self._og_search_title`` and, when that
    yields nothing, from the text following a ``<title>`` tag. The part
    preceding ``| Jamendo Music`` is kept as the title. If no title is
    obtained that way, ``self._html_search_meta('name', ...)`` is used,
    and ``fatal`` is forwarded to that last lookup only.

    Returns a ``(title, artist, second)`` tuple. ``artist`` and ``second``
    are the two groups of ``(.+) - (.+)`` matched against the title; the
    first group is greedy, so a single-line title is split at its last
    ' - '. Both are ``None`` when the title is missing or has no ' - '.
    Callers in this file unpack ``second`` as the track name (track pages)
    or the album name (album pages).
    """
    raw_title = self._og_search_title(webpage, default=None)
    if not raw_title:
        raw_title = self._search_regex(
            r'<title>([^<]+)', webpage, 'title', default=None)

    title = None
    if raw_title:
        # Keep only what precedes the "| Jamendo Music" site suffix.
        title = self._search_regex(
            r'(.+?)\s*\|\s*Jamendo Music', raw_title, 'title', default=None)

    if not title:
        # Last resort; this is the only lookup that honours ``fatal``.
        title = self._html_search_meta('name', webpage, 'title', fatal=fatal)

    match = re.search(r'(.+) - (.+)', title or '')
    if match:
        artist, second = match.groups()
    else:
        artist = second = None
    return title, artist, second
class JamendoIE(JamendoBaseIE):
_VALID_URL = r'https?://(?:www\.)?jamendo\.com/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?jamendo\.com/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)'
_TEST = { _TEST = {
'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i',
@ -17,6 +35,9 @@ class JamendoIE(InfoExtractor):
'display_id': 'stories-from-emona-i', 'display_id': 'stories-from-emona-i',
'ext': 'flac', 'ext': 'flac',
'title': 'Maya Filipič - Stories from Emona I', 'title': 'Maya Filipič - Stories from Emona I',
'artist': 'Maya Filipič',
'track': 'Stories from Emona I',
'duration': 210,
'thumbnail': r're:^https?://.*\.jpg' 'thumbnail': r're:^https?://.*\.jpg'
} }
} }
@ -28,7 +49,7 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
title = self._search_regex(r'<title>(.*?)\ \|\ Jamendo\ Music\ .*</title>', webpage, 'title') title, artist, track = self._extract_meta(webpage)
formats = [{ formats = [{
'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
@ -46,17 +67,23 @@ def _real_extract(self, url):
thumbnail = self._html_search_meta( thumbnail = self._html_search_meta(
'image', webpage, 'thumbnail', fatal=False) 'image', webpage, 'thumbnail', fatal=False)
duration = parse_duration(self._search_regex(
r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']',
webpage, 'duration', fatal=False))
return { return {
'id': track_id, 'id': track_id,
'display_id': display_id, 'display_id': display_id,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'title': title, 'title': title,
'duration': duration,
'artist': artist,
'track': track,
'formats': formats 'formats': formats
} }
class JamendoAlbumIE(InfoExtractor): class JamendoAlbumIE(JamendoBaseIE):
_VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)' _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)'
_TEST = { _TEST = {
'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
@ -69,14 +96,18 @@ class JamendoAlbumIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '1032333', 'id': '1032333',
'ext': 'flac', 'ext': 'flac',
'title': 'Shearer - Warmachine' 'title': 'Shearer - Warmachine',
'artist': 'Shearer',
'track': 'Warmachine',
} }
}, { }, {
'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'md5': '1f358d7b2f98edfe90fd55dac0799d50',
'info_dict': { 'info_dict': {
'id': '1032330', 'id': '1032330',
'ext': 'flac', 'ext': 'flac',
'title': 'Shearer - Without Your Ghost' 'title': 'Shearer - Without Your Ghost',
'artist': 'Shearer',
'track': 'Without Your Ghost',
} }
}], }],
'params': { 'params': {
@ -90,18 +121,18 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, mobj.group('display_id')) webpage = self._download_webpage(url, mobj.group('display_id'))
title = self._search_regex(r'<title>(.*?)\ \|\ Jamendo\ Music\ .*</title>', webpage, 'title') title, artist, album = self._extract_meta(webpage, fatal=False)
entries = [ entries = [{
self.url_result( '_type': 'url_transparent',
compat_urlparse.urljoin(url, m.group('path')), 'url': compat_urlparse.urljoin(url, m.group('path')),
ie=JamendoIE.ie_key(), 'ie_key': JamendoIE.ie_key(),
video_id=self._search_regex( 'id': self._search_regex(
r'/track/(\d+)', m.group('path'), r'/track/(\d+)', m.group('path'), 'track id', default=None),
'track id', default=None)) 'artist': artist,
for m in re.finditer( 'album': album,
} for m in re.finditer(
r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link', r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link',
webpage) webpage)]
]
return self.playlist_result(entries, album_id, title) return self.playlist_result(entries, album_id, title)