mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-01-11 21:15:53 +01:00
[ie/facebook:ads] Fix extractor (#10704)
Closes #10701
Authored by: kclauhk
This commit is contained in:
parent
cc88a54bb1
commit
d62fef7e07
@@ -963,6 +963,7 @@ class FacebookAdsIE(InfoExtractor):
|
|||||||
'id': '899206155126718',
|
'id': '899206155126718',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': 'video by Kandao',
|
'title': 'video by Kandao',
|
||||||
|
'description': 'md5:0822724069e3aca97cbed5dabbab282e',
|
||||||
'uploader': 'Kandao',
|
'uploader': 'Kandao',
|
||||||
'uploader_id': '774114102743284',
|
'uploader_id': '774114102743284',
|
||||||
'uploader_url': r're:^https?://.*',
|
'uploader_url': r're:^https?://.*',
|
||||||
@@ -971,6 +972,22 @@ class FacebookAdsIE(InfoExtractor):
|
|||||||
'upload_date': '20231214',
|
'upload_date': '20231214',
|
||||||
'like_count': int,
|
'like_count': int,
|
||||||
},
|
},
|
||||||
|
}, {
|
||||||
|
# key 'watermarked_video_sd_url' missing
|
||||||
|
'url': 'https://www.facebook.com/ads/library/?id=501152689226254',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '501152689226254',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'video by mat.nawrocki',
|
||||||
|
'description': 'md5:02a446ace7ff8c3c37a2892922492490',
|
||||||
|
'uploader': 'mat.nawrocki',
|
||||||
|
'uploader_id': '148586968341456',
|
||||||
|
'uploader_url': r're:^https?://.*',
|
||||||
|
'timestamp': 1723452305,
|
||||||
|
'thumbnail': r're:^https?://.*',
|
||||||
|
'upload_date': '20240812',
|
||||||
|
'like_count': int,
|
||||||
|
},
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://www.facebook.com/ads/library/?id=893637265423481',
|
'url': 'https://www.facebook.com/ads/library/?id=893637265423481',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
@@ -1017,34 +1034,42 @@ def _real_extract(self, url):
|
|||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
post_data = [self._parse_json(j, video_id, fatal=False)
|
post_data = traverse_obj(
|
||||||
for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)]
|
re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage), (..., {json.loads}))
|
||||||
data = traverse_obj(post_data, (
|
data = get_first(post_data, (
|
||||||
..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False)
|
'require', ..., ..., ..., '__bbox', 'require', ..., ..., ...,
|
||||||
|
'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict}))
|
||||||
if not data:
|
if not data:
|
||||||
raise ExtractorError('Unable to extract ad data')
|
raise ExtractorError('Unable to extract ad data')
|
||||||
|
|
||||||
title = data.get('title')
|
title = data.get('title')
|
||||||
if not title or title == '{{product.name}}':
|
if not title or title == '{{product.name}}':
|
||||||
title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data)
|
title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data)
|
||||||
|
markup_id = traverse_obj(data, ('body', '__m', {str}))
|
||||||
|
markup = traverse_obj(post_data, (
|
||||||
|
..., 'require', ..., ..., ..., '__bbox', 'markup', lambda _, v: v[0].startswith(markup_id),
|
||||||
|
..., '__html', {clean_html}, {lambda x: not x.startswith('{{product.') and x}, any))
|
||||||
|
|
||||||
info_dict = traverse_obj(data, {
|
info_dict = merge_dicts({
|
||||||
'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}),
|
'title': title,
|
||||||
|
'description': markup or None,
|
||||||
|
}, traverse_obj(data, {
|
||||||
|
'description': ('link_description', {lambda x: x if not x.startswith('{{product.') else None}),
|
||||||
'uploader': ('page_name', {str}),
|
'uploader': ('page_name', {str}),
|
||||||
'uploader_id': ('page_id', {str_or_none}),
|
'uploader_id': ('page_id', {str_or_none}),
|
||||||
'uploader_url': ('page_profile_uri', {url_or_none}),
|
'uploader_url': ('page_profile_uri', {url_or_none}),
|
||||||
'timestamp': ('creation_time', {int_or_none}),
|
'timestamp': ('creation_time', {int_or_none}),
|
||||||
'like_count': ('page_like_count', {int_or_none}),
|
'like_count': ('page_like_count', {int_or_none}),
|
||||||
})
|
}))
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
for idx, entry in enumerate(traverse_obj(
|
for idx, entry in enumerate(traverse_obj(
|
||||||
data, (('videos', 'cards'), lambda _, v: any(url_or_none(v[f]) for f in self._FORMATS_MAP))), 1,
|
data, (('videos', 'cards'), lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP))), 1,
|
||||||
):
|
):
|
||||||
entries.append({
|
entries.append({
|
||||||
'id': f'{video_id}_{idx}',
|
'id': f'{video_id}_{idx}',
|
||||||
'title': entry.get('title') or title,
|
'title': entry.get('title') or title,
|
||||||
'description': entry.get('link_description') or info_dict.get('description'),
|
'description': traverse_obj(entry, 'body', 'link_description') or info_dict.get('description'),
|
||||||
'thumbnail': url_or_none(entry.get('video_preview_image_url')),
|
'thumbnail': url_or_none(entry.get('video_preview_image_url')),
|
||||||
'formats': self._extract_formats(entry),
|
'formats': self._extract_formats(entry),
|
||||||
})
|
})
|
||||||
|
Loading…
Reference in New Issue
Block a user