MegaTVComEmbedIE: Make canonical URL extraction more robust

2024-11-27 10:56:48 +01:00 · 2021-11-13 11:50:05 +02:00 · 2021-11-13 11:50:05 +02:00 · 96a0ad4778
commit 96a0ad4778
parent 28fddc1758
1 changed files with 17 additions and 1 deletions
--- a/youtube_dl/extractor/megatvcom.py
+++ b/youtube_dl/extractor/megatvcom.py
@@ -166,10 +166,26 @@ class MegaTVComEmbedIE(MegaTVComBaseIE):
                url = '%s:%s' % (scheme, url)
            yield url
    def _match_canonical_url(self, webpage):
        """Return the href of the first canonical <link> tag in webpage.

        Each <link ...> tag is located first and its attributes are then
        tokenized one at a time, so attribute order, letter case of the
        tag and attribute names, whitespace around '=' and the quoting
        style (double, single or none) do not matter.  The rel value is
        treated as a whitespace-separated token list.

        webpage -- HTML source of the page, as a string

        Returns the HTML-unescaped href value, or None when no canonical
        link with a non-empty href is present.
        """
        # Quoted values are consumed as a unit so that a '>' inside a value
        # does not end the tag early.  The three alternatives start with
        # disjoint characters, so there is only one way to consume any
        # input and a missing closing '>' cannot trigger exponential
        # backtracking.
        LINK_TAG_RE = r'''(?ix)
            <link(?=[\s/>])
            (?P<attrs>(?:[^>"']|"[^"]*"|'[^']*')*)
            >
        '''
        # One attribute: a name, optionally followed by '=' and a value.
        ATTR_RE = r'''(?x)
            (?P<name>[^\s=/>"']+)
            (?:\s*=\s*(?:
                "(?P<dq>[^"]*)"|
                '(?P<sq>[^']*)'|
                (?P<uq>[^\s"'>]+)
            ))?
        '''
        for tag in re.finditer(LINK_TAG_RE, webpage):
            attrs = {}
            for attr in re.finditer(ATTR_RE, tag.group('attrs')):
                value = (
                    attr.group('dq') or attr.group('sq')
                    or attr.group('uq') or '')
                # If an attribute is repeated, keep its first occurrence
                attrs.setdefault(attr.group('name').lower(), value)
            rel_tokens = attrs.get('rel', '').lower().split()
            href = attrs.get('href')
            if 'canonical' in rel_tokens and href:
                return unescapeHTML(href)
        return None
    def _real_extract(self, url):
        webpage = self._download_webpage(url, 'N/A')
        player_attrs = self._extract_player_attrs(webpage)
-        canonical_url = player_attrs['share_url']
+        canonical_url = player_attrs.get('share_url') or \
            self._match_canonical_url(webpage)
        if not canonical_url:
            raise ExtractorError('canonical URL not found')
        video_id = compat_parse_qs(compat_urllib_parse_urlparse(
            canonical_url).query)['p'][0]