[extractor/anvato] Fix extractor and refactor (#5074)

Authored by: bashonly
This commit is contained in:
bashonly 2022-10-03 16:15:22 +00:00 committed by GitHub
parent 7244895bde
commit 4a61501db9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 116 additions and 113 deletions

View File

@ -74,8 +74,7 @@ offlinetest: codetest
$(PYTHON) -m pytest -k "not download" $(PYTHON) -m pytest -k "not download"
# XXX: This is hard to maintain # XXX: This is hard to maintain
CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat \ CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat
yt_dlp/extractor/anvato_token_generator
yt-dlp: yt_dlp/*.py yt_dlp/*/*.py yt-dlp: yt_dlp/*.py yt_dlp/*/*.py
mkdir -p zip mkdir -p zip
for d in $(CODE_FOLDERS) ; do \ for d in $(CODE_FOLDERS) ; do \

View File

@ -32,7 +32,6 @@ def packages():
return [ return [
'yt_dlp', 'yt_dlp.extractor', 'yt_dlp.downloader', 'yt_dlp.postprocessor', 'yt_dlp.compat', 'yt_dlp', 'yt_dlp.extractor', 'yt_dlp.downloader', 'yt_dlp.postprocessor', 'yt_dlp.compat',
'yt_dlp.extractor.anvato_token_generator',
] ]

View File

@ -5,10 +5,8 @@
import re import re
import time import time
from .anvato_token_generator import NFLTokenGenerator
from .common import InfoExtractor from .common import InfoExtractor
from ..aes import aes_encrypt from ..aes import aes_encrypt
from ..compat import compat_str
from ..utils import ( from ..utils import (
bytes_to_intlist, bytes_to_intlist,
determine_ext, determine_ext,
@ -16,20 +14,61 @@
int_or_none, int_or_none,
join_nonempty, join_nonempty,
strip_jsonp, strip_jsonp,
smuggle_url,
traverse_obj,
unescapeHTML, unescapeHTML,
unsmuggle_url, unsmuggle_url,
) )
def md5_text(s): def md5_text(s):
if not isinstance(s, compat_str): return hashlib.md5(str(s).encode()).hexdigest()
s = compat_str(s)
return hashlib.md5(s.encode('utf-8')).hexdigest()
class AnvatoIE(InfoExtractor): class AnvatoIE(InfoExtractor):
_VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
_API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2'
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js
_TESTS = [{
# from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14
'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441',
'md5': '921919dab3cd0b849ff3d624831ae3e2',
'info_dict': {
'id': '899441',
'ext': 'mp4',
'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14',
'description': 'md5:85e05a3cc163f8c344340f220521136d',
'upload_date': '20201215',
'timestamp': 1608009755,
'thumbnail': r're:^https?://.*\.jpg',
'uploader': 'NFL',
'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights',
'Player Highlights', 'Cleveland Browns', 'league'],
'duration': 157,
'categories': ['Entertainment', 'Game', 'Highlights'],
},
}, {
# from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/
'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455',
'md5': '837718bcfb3a7778d022f857f7a9b19e',
'info_dict': {
'id': '8032455',
'ext': 'mp4',
'title': '99-year-old woman learns to fly plane in Torrance, checks off bucket list dream',
'description': 'md5:0a12bab8159445e78f52a297a35c6609',
'upload_date': '20220928',
'timestamp': 1664408881,
'thumbnail': r're:^https?://.*\.jpg',
'uploader': 'LIN',
'tags': ['video', 'news', '5live'],
'duration': 155,
'categories': ['News'],
},
}]
# Copied from anvplayer.min.js # Copied from anvplayer.min.js
_ANVACK_TABLE = { _ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@ -202,86 +241,74 @@ class AnvatoIE(InfoExtractor):
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
} }
def _generate_nfl_token(self, anvack, mcp_id):
reroute = self._download_json(
'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials',
headers={'X-Domain-Id': 100}, note='Fetching token info')
token_type = reroute.get('token_type') or 'Bearer'
auth_token = f'{token_type} {reroute["access_token"]}'
response = self._download_json(
'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
'query': '''{
viewer {
mediaToken(anvack: "%s", id: %s) {
token
}
}
}''' % (anvack, mcp_id),
}).encode(), headers={
'Authorization': auth_token,
'Content-Type': 'application/json',
}, note='Fetching NFL API token')
return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token'))
_TOKEN_GENERATORS = { _TOKEN_GENERATORS = {
'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token,
} }
_API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
_TESTS = [{
# from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
'info_dict': {
'id': '4465496',
'ext': 'mp4',
'title': 'VIDEO: Humpback whale breaches right next to NH boat',
'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
'duration': 22,
'timestamp': 1534855680,
'upload_date': '20180821',
'uploader': 'ANV',
},
'params': {
'skip_download': True,
},
}, {
# from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
'only_matching': True,
}]
def __init__(self, *args, **kwargs):
super(AnvatoIE, self).__init__(*args, **kwargs)
self.__server_time = None
def _server_time(self, access_key, video_id): def _server_time(self, access_key, video_id):
if self.__server_time is not None: return int_or_none(traverse_obj(self._download_json(
return self.__server_time f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key},
note='Fetching server time', fatal=False), 'server_time')) or int(time.time())
self.__server_time = int(self._download_json( def _get_video_json(self, access_key, video_id, extracted_token):
self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
note='Fetching server time')['server_time'])
return self.__server_time
def _api_prefix(self, access_key):
return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
def _get_video_json(self, access_key, video_id):
# See et() in anvplayer.min.js, which is an alias of getVideoJSON() # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) video_data_url = f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}'
server_time = self._server_time(access_key, video_id) server_time = self._server_time(access_key, video_id)
input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}'
auth_secret = intlist_to_bytes(aes_encrypt( auth_secret = intlist_to_bytes(aes_encrypt(
bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
query = {
video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'),
'rtyp': 'fp',
}
anvrid = md5_text(time.time() * 1000 * random.random())[:30] anvrid = md5_text(time.time() * 1000 * random.random())[:30]
api = { api = {
'anvrid': anvrid, 'anvrid': anvrid,
'anvts': server_time, 'anvts': server_time,
} }
if self._TOKEN_GENERATORS.get(access_key) is not None: if extracted_token is not None:
api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) api['anvstk2'] = extracted_token
elif self._TOKEN_GENERATORS.get(access_key) is not None:
api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id)
elif self._ANVACK_TABLE.get(access_key) is not None:
api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}')
else: else:
api['anvstk'] = md5_text('%s|%s|%d|%s' % ( api['anvstk2'] = 'default'
access_key, anvrid, server_time,
self._ANVACK_TABLE.get(access_key, self._API_KEY)))
return self._download_json( return self._download_json(
video_data_url, video_id, transform_source=strip_jsonp, video_data_url, video_id, transform_source=strip_jsonp, query=query,
data=json.dumps({'api': api}).encode('utf-8')) data=json.dumps({'api': api}, separators=(',', ':')).encode('utf-8'))
def _get_anvato_videos(self, access_key, video_id): def _get_anvato_videos(self, access_key, video_id, token):
video_data = self._get_video_json(access_key, video_id) video_data = self._get_video_json(access_key, video_id, token)
formats = [] formats = []
for published_url in video_data['published_urls']: for published_url in video_data['published_urls']:
video_url = published_url['embed_url'] video_url = published_url.get('embed_url')
if not video_url:
continue
media_format = published_url.get('format') media_format = published_url.get('format')
ext = determine_ext(video_url) ext = determine_ext(video_url)
@ -296,15 +323,27 @@ def _get_anvato_videos(self, access_key, video_id):
'tbr': tbr or None, 'tbr': tbr or None,
} }
if media_format == 'm3u8' and tbr is not None: vtt_subs, hls_subs = {}, {}
if media_format == 'vtt':
_, vtt_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, m3u8_id='vtt', fatal=False)
continue
elif media_format == 'm3u8' and tbr is not None:
a_format.update({ a_format.update({
'format_id': join_nonempty('hls', tbr), 'format_id': join_nonempty('hls', tbr),
'ext': 'mp4', 'ext': 'mp4',
}) })
elif media_format == 'm3u8-variant' or ext == 'm3u8': elif media_format == 'm3u8-variant' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats( # For some videos the initial m3u8 URL returns JSON instead
video_url, video_id, 'mp4', entry_protocol='m3u8_native', manifest_json = self._download_json(
m3u8_id='hls', fatal=False)) video_url, video_id, note='Downloading manifest JSON', errnote=False)
if manifest_json:
video_url = manifest_json.get('master_m3u8')
if not video_url:
continue
hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles(
video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)
formats.extend(hls_fmts)
continue continue
elif ext == 'mp3' or media_format == 'mp3': elif ext == 'mp3' or media_format == 'mp3':
a_format['vcodec'] = 'none' a_format['vcodec'] = 'none'
@ -324,6 +363,7 @@ def _get_anvato_videos(self, access_key, video_id):
'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
} }
subtitles.setdefault(caption['language'], []).append(a_caption) subtitles.setdefault(caption['language'], []).append(a_caption)
subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs)
return { return {
'id': video_id, 'id': video_id,
@ -349,7 +389,10 @@ def _extract_from_webpage(cls, url, webpage):
access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower()) access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower())
if not (video_id or '').isdigit() or not access_key: if not (video_id or '').isdigit() or not access_key:
continue continue
yield cls.url_result(f'anvato:{access_key}:{video_id}', AnvatoIE, video_id) url = f'anvato:{access_key}:{video_id}'
if anvplayer_data.get('token'):
url = smuggle_url(url, {'token': anvplayer_data['token']})
yield cls.url_result(url, AnvatoIE, video_id)
def _extract_anvato_videos(self, webpage, video_id): def _extract_anvato_videos(self, webpage, video_id):
anvplayer_data = self._parse_json( anvplayer_data = self._parse_json(
@ -357,7 +400,7 @@ def _extract_anvato_videos(self, webpage, video_id):
self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
video_id) video_id)
return self._get_anvato_videos( return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video']) anvplayer_data['accessKey'], anvplayer_data['video'], 'default') # cbslocal token = 'default'
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {}) url, smuggled_data = unsmuggle_url(url, {})
@ -365,9 +408,7 @@ def _real_extract(self, url):
'countries': smuggled_data.get('geo_countries'), 'countries': smuggled_data.get('geo_countries'),
}) })
mobj = self._match_valid_url(url) access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id')
access_key, video_id = mobj.group('access_key_or_mcp', 'id')
if access_key not in self._ANVACK_TABLE: if access_key not in self._ANVACK_TABLE:
access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key
access_key) or access_key return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token'))
return self._get_anvato_videos(access_key, video_id)

View File

@ -1,5 +0,0 @@
from .nfl import NFLTokenGenerator
__all__ = [
'NFLTokenGenerator',
]

View File

@ -1,3 +0,0 @@
class TokenGenerator:
def generate(self, anvack, mcp_id):
raise NotImplementedError('This method must be implemented by subclasses')

View File

@ -1,28 +0,0 @@
import json
from .common import TokenGenerator
class NFLTokenGenerator(TokenGenerator):
_AUTHORIZATION = None
def generate(ie, anvack, mcp_id):
if not NFLTokenGenerator._AUTHORIZATION:
reroute = ie._download_json(
'https://api.nfl.com/v1/reroute', mcp_id,
data=b'grant_type=client_credentials',
headers={'X-Domain-Id': 100})
NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token'])
return ie._download_json(
'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
'query': '''{
viewer {
mediaToken(anvack: "%s", id: %s) {
token
}
}
}''' % (anvack, mcp_id),
}).encode(), headers={
'Authorization': NFLTokenGenerator._AUTHORIZATION,
'Content-Type': 'application/json',
})['data']['viewer']['mediaToken']['token']