From 74dc105210987c5754e6f9c03c01f02644a0baf3 Mon Sep 17 00:00:00 2001 From: Hugo Alves De Azevedo Date: Tue, 25 Aug 2020 22:39:20 -0300 Subject: [PATCH 1/6] - adding alura support --- youtube_dl/extractor/alura.py | 119 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 120 insertions(+) create mode 100644 youtube_dl/extractor/alura.py diff --git a/youtube_dl/extractor/alura.py b/youtube_dl/extractor/alura.py new file mode 100644 index 000000000..5f75bdaff --- /dev/null +++ b/youtube_dl/extractor/alura.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..compat import ( + compat_str, + compat_urlparse, +) + +from ..utils import ( + dict_get, + ExtractorError, + float_or_none, + int_or_none, + parse_duration, + qualities, + srt_subtitles_timecode, + try_get, + update_url_query, + urlencode_postdata, +) + + +class AluraIE(InfoExtractor): + _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)' + _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' + _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video' + _TEST = { + 'url': 'https://cursos.alura.com.br/course/design-patterns-python/task/9651', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '9651', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': r're:^https?://.*\.jpg$', + # TODO more properties, either as: + } + } + + def _real_extract(self, url): + + video_id = self._match_id(url) + course = self._search_regex(self._VALID_URL, url, 'post url', group='course_name') + video_url = self._VIDEO_URL % (course,video_id) + + video_dict = self._download_json(video_url, None, 'Searching for videos', expected_status=[404,500]) + + if video_dict: + webpage = self._download_webpage(url, video_id) + video_title = self._search_regex(
+ r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)', + webpage, 'title', group='title') + + formats = [] + for video_obj in video_dict: + video_url_m3u8 = video_obj.get('link') + video_format = self._extract_m3u8_formats( + video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + + formats.extend(video_format) + + return { + 'id': video_id, + 'title': video_title, + "formats": formats + } + + def extract_output_format(src): + return { + 'url': src.get('link'), + 'manifest_url': src.get('linkWebm'), + 'format': src.get('quality') + } + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + pass + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'href=[\"|\']?/signout[\"|\']', + r'>Logout<')) + + # already logged in + if is_logged(login_page): + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = self._search_regex( + r'<form[^>]+class=["|\']signin-form["|\'] action=["|\'](?P<url>.+?)["|\']', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1db21529f..1a1b3746b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -36,6 +36,7 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .alura import AluraIE from .amcnetworks import AMCNetworksIE from
.americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE From 4d75c363a725d31a292a0209853b281fc3fd4e6c Mon Sep 17 00:00:00 2001 From: Hugo Alves De Azevedo <hugo.azevedo@mercadolivre.com> Date: Fri, 28 Aug 2020 10:02:31 -0300 Subject: [PATCH 2/6] - adding alura support --- youtube_dl/extractor/alura.py | 99 +++++++++++++++++++++++------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/alura.py b/youtube_dl/extractor/alura.py index 5f75bdaff..7baf7af81 100644 --- a/youtube_dl/extractor/alura.py +++ b/youtube_dl/extractor/alura.py @@ -6,21 +6,15 @@ from .common import InfoExtractor from ..compat import ( - compat_str, compat_urlparse, ) from ..utils import ( - dict_get, - ExtractorError, - float_or_none, - int_or_none, - parse_duration, - qualities, - srt_subtitles_timecode, - try_get, - update_url_query, urlencode_postdata, + urljoin, + int_or_none, + clean_html, + ExtractorError ) @@ -28,17 +22,26 @@ class AluraIE(InfoExtractor): _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)' _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video' - _TEST = { - 'url': 'https://cursos.alura.com.br/course/design-patterns-python/task/9651', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + _NETRC_MACHINE = 'alura' + _TESTS = [{ + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095', 'info_dict': { - 'id': '9651', + 'id': '60095', 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: + 'title': 'Referências, ref-set e alter' + }, + 'skip': 'Requires alura account credentials', + }, + { + # URL without video + 'url':
'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098', + 'only_matching': True, + }, + { + 'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219', + 'only_matching': True, } - } + ] def _real_extract(self, url): @@ -50,9 +53,9 @@ def _real_extract(self, url): if video_dict: webpage = self._download_webpage(url, video_id) - video_title = self._search_regex( + video_title = clean_html(self._search_regex( r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)', - webpage, 'title', group='title') + webpage, 'title', group='title')) formats = [] for video_obj in video_dict: @@ -60,9 +63,15 @@ def _real_extract(self, url): video_format = self._extract_m3u8_formats( video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - + for f in video_format: + m = re.search(r'^[\w \W]*-(?P<res>\w*).mp4[\W \w]*', f['url']) + if m: + if not f.get('height'): + f['height'] = int('720' if m.group('res') == 'hd' else '480') formats.extend(video_format) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) + return { 'id': video_id, 'title': video_title, @@ -111,9 +120,55 @@ def is_logged(webpage): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - self._download_webpage( + response = self._download_webpage( post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + if not is_logged(response): + error = self._html_search_regex( + r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + +class AluraCourseIE(AluraIE): + + _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)' + _LOGIN_URL = 
'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' + _NETRC_MACHINE = 'aluracourse' + _TESTS = [{ + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs', + 'only_matching': True, + }] + + def _real_extract(self, url): + + course_path = self._match_id(url) + webpage = self._download_webpage(url, course_path) + + course_title = self._search_regex( + r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage, + 'course title', default=course_path, group='course_title') + + entries = [] + if webpage: + for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage): + page_url = urljoin(url, path) + section_path = self._download_webpage(page_url, course_path) + for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path): + chapter = clean_html(self._search_regex(r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)',section_path, 'chapter', group='chapter')) + chapter_number = int_or_none(self._search_regex(r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>',section_path, 'chapter number', group='chapter_number')) + video_url = urljoin(url, path_video) + entry = { + '_type': 'url_transparent', + 'id': self._match_id(video_url), + 'url': video_url, + 'id_key': self.ie_key(), + 'chapter': chapter, + 'chapter_number': chapter_number + } + entries.append(entry) + return self.playlist_result(entries, course_path, course_title) \ No newline at end of file diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1a1b3746b..0d34237af 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -36,7 +36,10 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE -from .alura import 
AluraIE +from .alura import ( + AluraIE, + AluraCourseIE +) from .amcnetworks import AMCNetworksIE from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE From 2adf91f94fd9570c670a050e47b7fd4499ed6812 Mon Sep 17 00:00:00 2001 From: Hugo Alves De Azevedo <hugo.azevedo@mercadolivre.com> Date: Fri, 28 Aug 2020 10:03:13 -0300 Subject: [PATCH 3/6] - removing unused method --- youtube_dl/extractor/alura.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/youtube_dl/extractor/alura.py b/youtube_dl/extractor/alura.py index 7baf7af81..623187620 100644 --- a/youtube_dl/extractor/alura.py +++ b/youtube_dl/extractor/alura.py @@ -78,13 +78,6 @@ def _real_extract(self, url): "formats": formats } - def extract_output_format(src): - return { - 'url': src.get('link'), - 'manifest_url': src.get('linkWebm'), - 'format': src.get('quality') - } - def _real_initialize(self): self._login() From 39672cdc792e52dd5acc0770553e1b4dd78163c7 Mon Sep 17 00:00:00 2001 From: Hugo Alves De Azevedo <hugo.azevedo@mercadolivre.com> Date: Fri, 28 Aug 2020 14:25:37 -0300 Subject: [PATCH 4/6] - formatting --- youtube_dl/extractor/alura.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/alura.py b/youtube_dl/extractor/alura.py index 623187620..9ff2bd347 100644 --- a/youtube_dl/extractor/alura.py +++ b/youtube_dl/extractor/alura.py @@ -29,27 +29,24 @@ class AluraIE(InfoExtractor): 'id': '60095', 'ext': 'mp4', 'title': 'Referências, ref-set e alter' - }, - 'skip': 'Requires alura account credentials', }, + 'skip': 'Requires alura account credentials'}, { # URL without video 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098', - 'only_matching': True, - }, + 'only_matching': True}, { 'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219', - 'only_matching': True, - } + 'only_matching': True} ] def _real_extract(self,
url): video_id = self._match_id(url) course = self._search_regex(self._VALID_URL, url, 'post url', group='course_name') - video_url = self._VIDEO_URL % (course,video_id) + video_url = self._VIDEO_URL % (course, video_id) - video_dict = self._download_json(video_url, None, 'Searching for videos', expected_status=[404,500]) + video_dict = self._download_json(video_url, None, 'Searching for videos', expected_status=[404, 500]) if video_dict: webpage = self._download_webpage(url, video_id) @@ -152,9 +149,21 @@ def _real_extract(self, url): page_url = urljoin(url, path) section_path = self._download_webpage(page_url, course_path) for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path): - chapter = clean_html(self._search_regex(r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)',section_path, 'chapter', group='chapter')) - chapter_number = int_or_none(self._search_regex(r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>',section_path, 'chapter number', group='chapter_number')) + chapter = clean_html( + self._search_regex( + r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)', + section_path, + 'chapter', + group='chapter')) + + chapter_number = int_or_none( + self._search_regex( + r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>', + section_path, + 'chapter number', + group='chapter_number')) video_url = urljoin(url, path_video) + entry = { '_type': 'url_transparent', 'id': self._match_id(video_url), @@ -164,4 +173,4 @@ def _real_extract(self, url): 'chapter_number': chapter_number } entries.append(entry) - return self.playlist_result(entries, course_path, course_title) \ No newline at end of file + return self.playlist_result(entries, course_path, course_title) From c2baf165e44f20cb6ae625623d69775b2d5f12f7 Mon Sep 17 
00:00:00 2001 From: Hugo Alves De Azevedo <hugo.azevedo@mercadolivre.com> Date: Fri, 28 Aug 2020 14:40:03 -0300 Subject: [PATCH 5/6] - formatting --- youtube_dl/extractor/alura.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/alura.py b/youtube_dl/extractor/alura.py index 9ff2bd347..f934d4e92 100644 --- a/youtube_dl/extractor/alura.py +++ b/youtube_dl/extractor/alura.py @@ -46,7 +46,7 @@ def _real_extract(self, url): course = self._search_regex(self._VALID_URL, url, 'post url', group='course_name') video_url = self._VIDEO_URL % (course, video_id) - video_dict = self._download_json(video_url, None, 'Searching for videos', expected_status=[404, 500]) + video_dict = self._download_json(video_url, video_id, 'Searching for videos') if video_dict: webpage = self._download_webpage(url, video_id) From 377be41755decfe97358ef586f68c4f9cd06b437 Mon Sep 17 00:00:00 2001 From: Hugo Alves De Azevedo <hugo.azevedo@mercadolivre.com> Date: Fri, 28 Aug 2020 15:21:52 -0300 Subject: [PATCH 6/6] - fixing suitable --- youtube_dl/extractor/alura.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/alura.py b/youtube_dl/extractor/alura.py index f934d4e92..36b4d95b3 100644 --- a/youtube_dl/extractor/alura.py +++ b/youtube_dl/extractor/alura.py @@ -134,6 +134,10 @@ class AluraCourseIE(AluraIE): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if AluraIE.suitable(url) else super(AluraCourseIE, cls).suitable(url) + def _real_extract(self, url): course_path = self._match_id(url)