From 5da6bd00837236cf8a5dc5aeeadae5cfed7f2021 Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Fri, 20 Feb 2015 10:49:45 +0100 Subject: [PATCH 1/9] [chirbit] Add new extractor. --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/chirbit.py | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 youtube_dl/extractor/chirbit.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f225ac654..de08e69bc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,6 +63,7 @@ from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE +from .chirbit import ChirbitIE from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py new file mode 100644 index 000000000..06a3e1a7a --- /dev/null +++ b/youtube_dl/extractor/chirbit.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ChirbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P[^/]+)' + _TEST = { + 'url': 'http://chirb.it/PrIPv5', + 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'info_dict': { + 'id': 'PrIPv5', + 'display_id': 'kukushtv_1423231243', + 'ext': 'mp3', + 'title': 'Фасадстрой', + 'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' + } + } + + def _real_extract(self, url): + audio_linkid = self._match_id(url) + webpage = self._download_webpage(url, audio_linkid) + + audio_title = self._html_search_regex(r'(.*?)', webpage, 'title') + audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') + audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + + return { + 'id': audio_linkid, + 'display_id': audio_id, + 'title': audio_title, + 'url': audio_url + } From 365577f5676d63089cb834855dd4cdce7d0dc8aa Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Fri, 20 Feb 2015 14:48:12 +0100 Subject: [PATCH 2/9] [chirbit] add profile extractor. --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/chirbit.py | 63 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index de08e69bc..94e150826 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,7 +63,7 @@ from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE -from .chirbit import ChirbitIE +from .chirbit import ChirbitIE, ChirbitProfileIE from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 06a3e1a7a..47ce94aa0 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import clean_html class ChirbitIE(InfoExtractor): @@ -32,3 +35,63 @@ def _real_extract(self, url): 'title': audio_title, 'url': audio_url } + +class ChirbitProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)' + _TEST = { + 'url': 'http://chirbit.com/ScarletBeauty', + 'playlist_count': 3, + 'info_dict': { + '_type': 'playlist', + 'title': 'ScarletBeauty', + 'id': 'ScarletBeauty' + } + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + + # Chirbit has a pretty weird "Last Page" navigation behavior. + # We grab the profile's oldest entry to determine when to + # stop fetching entries. + oldestpage = self._download_webpage(url + '/24599', profile_id) + oldest_page_entries = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + oldestpage); + oldestentry = clean_html(oldest_page_entries[-1]); + + ids = [] + titles = [] + n = 0 + while True: + page = self._download_webpage(url + '/' + str(n), profile_id) + page_ids = re.findall( + r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', + page); + page_titles = re.findall( + r'''(.*?)''', + page); + ids += page_ids + titles += page_titles + if oldestentry in page_ids: + break + n += 1 + + entries = [] + i = 0 + for id in ids: + entries.append({ + 'id': id, + 'title': titles[i], + 'url': 'http://audio.chirbit.com/' + id + '.mp3' + }); + i += 1 + + info_dict = { + '_type': 'playlist', + 'id': profile_id, + 'title': profile_id, + 'entries': entries + } + + return info_dict; From ddc369f073fda4ddd429c2d9a104e561cefd417f Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Mon, 23 Feb 2015 12:00:43 +0100 Subject: [PATCH 3/9] [chirbit] fix profile downloader regex. --- youtube_dl/extractor/chirbit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 47ce94aa0..443192f43 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -37,7 +37,7 @@ def _real_extract(self, url): } class ChirbitProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)/?$' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', 'playlist_count': 3, From 93b5071f73738d788c878b38a57f2b6efe0da883 Mon Sep 17 00:00:00 2001 From: "Leslie P. Polzer" Date: Mon, 23 Feb 2015 12:11:19 +0100 Subject: [PATCH 4/9] [soundgasm] add profile IE. --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/soundgasm.py | 36 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94e150826..cf58f0800 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -425,7 +425,10 @@ SoundcloudUserIE, SoundcloudPlaylistIE ) -from .soundgasm import SoundgasmIE +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) from .southpark import ( SouthParkIE, SouthparkDeIE, diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index a4f8ce6c3..e568ff18c 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor +from ..utils import clean_html class SoundgasmIE(InfoExtractor): @@ -38,3 +39,38 @@ def _real_extract(self, url): 'title': audio_title, 'description': description } + +class SoundgasmProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/?$' + _TEST = { + 'url': 'http://soundgasm.net/u/ytdl', + 'playlist_count': 1, + 'info_dict': { + '_type': 'playlist', + 'id': 'ytdl', + 'title': 'ytdl' + } + } + + def _real_extract(self, url): + profile_id = self._match_id(url) + webpage = self._download_webpage(url, profile_id) + + ids = re.findall(r'''''' % re.escape(profile_id), webpage) + ids = [clean_html(id) for id in ids] + + entries = [] + for id in ids: + entries.append({ + '_type': 'url', + 'url': ('http://soundgasm.net/u/%s/%s' % (profile_id, id)) + }) + + info_dict = { + '_type': 'playlist', + 'id': profile_id, + 'title': profile_id, + 'entries': entries + } + + return info_dict; From a65d4e7f1458a681f250d6e2e0190644b50d6793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:15:16 +0600 Subject: [PATCH 5/9] [chirbit] Simplify and extract profile from RSS (#5032) --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/chirbit.py | 113 ++++++++++++++----------------- 2 files changed, 53 insertions(+), 65 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c3088fba2..40fc92cf7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,7 +63,10 @@ from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE -from .chirbit import ChirbitIE, ChirbitProfileIE +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 443192f43..124307b7c 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,97 +1,82 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import clean_html +from ..utils import ( + parse_duration, + int_or_none, +) class ChirbitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?P[^/]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' + _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', 'md5': '9847b0dad6ac3e074568bf2cfb197de8', 'info_dict': { 'id': 'PrIPv5', - 'display_id': 'kukushtv_1423231243', 'ext': 'mp3', 'title': 'Фасадстрой', - 'url': 'http://audio.chirbit.com/kukushtv_1423231243.mp3' + 'duration': 52, + 'view_count': int, + 'comment_count': int, } - } + }, { + 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', + 'only_matching': True, + }] def _real_extract(self, url): - audio_linkid = self._match_id(url) - webpage = self._download_webpage(url, audio_linkid) + audio_id = self._match_id(url) - audio_title = self._html_search_regex(r'(.*?)', webpage, 'title') - audio_id = self._html_search_regex(r'\("setFile",\s+"http://audio.chirbit.com/(.*?).mp3"\)', webpage, 'audio ID') - audio_url = 'http://audio.chirbit.com/' + audio_id + '.mp3'; + webpage = self._download_webpage( + 'http://chirb.it/%s' % audio_id, audio_id) + + audio_url = self._search_regex( + r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + + title = self._search_regex( + r'itemprop="name">([^<]+)', webpage, 'title') + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._search_regex( + r'itemprop="playCount"\s*>(\d+)', webpage, + 'listen count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'>(\d+) Comments?:', webpage, + 'comment count', fatal=False)) return { - 'id': audio_linkid, - 'display_id': audio_id, - 'title': audio_title, - 'url': audio_url + 'id': audio_id, + 'url': audio_url, + 'title': title, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, } + class ChirbitProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?P[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', - 'playlist_count': 3, 'info_dict': { - '_type': 'playlist', - 'title': 'ScarletBeauty', - 'id': 'ScarletBeauty' - } + 'id': 'ScarletBeauty', + 'title': 'Chirbits by ScarletBeauty', + }, + 'playlist_mincount': 3, } def _real_extract(self, url): profile_id = self._match_id(url) - # Chirbit has a pretty weird "Last Page" navigation behavior. - # We grab the profile's oldest entry to determine when to - # stop fetching entries. - oldestpage = self._download_webpage(url + '/24599', profile_id) - oldest_page_entries = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - oldestpage); - oldestentry = clean_html(oldest_page_entries[-1]); + rss = self._download_xml( + 'http://chirbit.com/rss/%s' % profile_id, profile_id) - ids = [] - titles = [] - n = 0 - while True: - page = self._download_webpage(url + '/' + str(n), profile_id) - page_ids = re.findall( - r'''soundFile:\s*"http://audio.chirbit.com/(.*?).mp3"''', - page); - page_titles = re.findall( - r'''(.*?)''', - page); - ids += page_ids - titles += page_titles - if oldestentry in page_ids: - break - n += 1 + entries = [ + self.url_result(audio_url.text, 'Chirbit') + for audio_url in rss.findall('./channel/item/link')] - entries = [] - i = 0 - for id in ids: - entries.append({ - 'id': id, - 'title': titles[i], - 'url': 'http://audio.chirbit.com/' + id + '.mp3' - }); - i += 1 + title = rss.find('./channel/title').text - info_dict = { - '_type': 'playlist', - 'id': profile_id, - 'title': profile_id, - 'entries': entries - } - - return info_dict; + return self.playlist_result(entries, profile_id, title) From 3cc57f96455ce14cc5c72264a25b8d434174f7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:27:24 +0600 Subject: [PATCH 6/9] [soundgasm:profile] Simplify --- youtube_dl/extractor/soundgasm.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e568ff18c..e11d999f3 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -41,36 +41,22 @@ def _real_extract(self, url): } class SoundgasmProfileIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/?$' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[^/]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', - 'playlist_count': 1, 'info_dict': { - '_type': 'playlist', 'id': 'ytdl', - 'title': 'ytdl' - } + }, + 'playlist_count': 1, } def _real_extract(self, url): profile_id = self._match_id(url) + webpage = self._download_webpage(url, profile_id) - ids = re.findall(r'''''' % re.escape(profile_id), webpage) - ids = [clean_html(id) for id in ids] + entries = [ + self.url_result(audio_url, 'Soundgasm') + for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)] - entries = [] - for id in ids: - entries.append({ - '_type': 'url', - 'url': ('http://soundgasm.net/u/%s/%s' % (profile_id, id)) - }) - - info_dict = { - '_type': 'playlist', - 'id': profile_id, - 'title': profile_id, - 'entries': entries - } - - return info_dict; + return self.playlist_result(entries, profile_id) From 80af2b73ab0b51e4416500301948caa71ec39cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 23 Feb 2015 21:27:56 +0600 Subject: [PATCH 7/9] [soundgasm] Clarify extractors' IE_NAMEs --- youtube_dl/extractor/soundgasm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e11d999f3..26e96a120 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -8,6 +8,7 @@ class SoundgasmIE(InfoExtractor): + IE_NAME = 'soundgasm' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P[0-9a-zA-Z_\-]+)/(?P[0-9a-zA-Z_\-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', @@ -41,6 +42,7 @@ def _real_extract(self, url): } class SoundgasmProfileIE(InfoExtractor): + IE_NAME = 'soundgasm:profile' _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', From 04e8c1108023d9fe5c466d16f988a469e04f326e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:28:14 +0600 Subject: [PATCH 8/9] [chirbit] Clarify extractors' IE_NAMEs --- youtube_dl/extractor/chirbit.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index 124307b7c..b1eeaf101 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -9,6 +9,7 @@ class ChirbitIE(InfoExtractor): + IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://chirb.it/PrIPv5', @@ -57,6 +58,7 @@ def _real_extract(self, url): class ChirbitProfileIE(InfoExtractor): + IE_NAME = 'chirbit:profile' _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', From 409693984f0acb8fbbf006c0d7965bc138211ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 23 Feb 2015 21:30:30 +0600 Subject: [PATCH 9/9] [soundgasm:profile] Fix _VALID_URL --- youtube_dl/extractor/soundgasm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index 26e96a120..9e992c9b7 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -43,7 +43,7 @@ def _real_extract(self, url): class SoundgasmProfileIE(InfoExtractor): IE_NAME = 'soundgasm:profile' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$' _TEST = { 'url': 'http://soundgasm.net/u/ytdl', 'info_dict': {