import os import re import types import urllib.parse import xml.etree.ElementTree from .common import InfoExtractor from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring from ..utils import ( KNOWN_EXTENSIONS, MEDIA_EXTENSIONS, ExtractorError, UnsupportedError, determine_ext, determine_protocol, dict_get, extract_basic_auth, filter_dict, format_field, int_or_none, is_html, js_to_json, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_resolution, smuggle_url, str_or_none, traverse_obj, try_call, unescapeHTML, unified_timestamp, unsmuggle_url, update_url_query, url_or_none, urlhandle_detect_ext, urljoin, variadic, xpath_attr, xpath_text, xpath_with_ns, ) class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' _NETRC_MACHINE = False # Suppress username warning _TESTS = [ # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', 'md5': '67d406c2bcb6af27fa886f31aa934bbe', 'info_dict': { 'id': 'trailer', 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', 'direct': True, 'timestamp': 1273772943.0, }, }, # Direct link to media delivered compressed (until Accept-Encoding is *) { 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', 'md5': '128c42e68b13950268b648275386fc74', 'info_dict': { 'id': 'FictionJunction-Parallel_Hearts', 'ext': 'flac', 'title': 'FictionJunction-Parallel_Hearts', 'upload_date': '20140522', }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.', ], 'skip': 'URL invalid', }, # Direct download with broken HEAD { 'url': 'http://ai-radio.org:8000/radio.opus', 'info_dict': { 'id': 'radio', 'ext': 'opus', 'title': 'radio', }, 'params': { 'skip_download': True, # infinite live stream }, 'expected_warnings': [ r'501.*Not Implemented', r'400.*Bad Request', ], }, # Direct link with incorrect MIME type { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'md5': '4ccbebe5f36706d85221f204d7eb5913', 'info_dict': { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'id': '5_Lennart_Poettering_-_Systemd', 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', 'direct': True, 'timestamp': 1416498816.0, }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.', ], }, # RSS feed { 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', 'description': 're:.*groundbreaking video review series.*', }, 'playlist_mincount': 11, }, # RSS feed with enclosure { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'title': 'MSNBC Rachel Maddow (video)', 'description': 're:.*her unique approach to storytelling.*', }, 'playlist': [{ 'info_dict': { 'ext': 'mov', 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', 'description': 're:.*her unique approach to storytelling.*', 'upload_date': '20201204', }, }], 'skip': 'Dead link', }, # RSS feed with item with description and thumbnails { 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', 'info_dict': { 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', 'title': 're:.*100% Hydrogen.*', 'description': 're:.*In this episode.*', }, 'playlist': [{ 'info_dict': { 'ext': 'm4a', 'id': '818a5d38-01cd-152f-2231-ee479677fa82', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', 'duration': 423, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, 'age_limit': 0, 'season': 'Season 1', 'direct': True, 'episode': 'Episode 1', }, }], 'params': { 'skip_download': True, }, }, # RSS feed with enclosures and unsupported link URLs { 'url': 'http://www.hellointernet.fm/podcast?format=rss', 'info_dict': { 'id': 'http://www.hellointernet.fm/podcast?format=rss', 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', 'title': 'Hello Internet', }, 'playlist_mincount': 100, }, # RSS feed with guid { 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'info_dict': { 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'description': 'md5:be809a44b63b0c56fb485caf68685520', 'title': 'The Little Red Podcast', }, 'playlist_mincount': 76, }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', 'info_dict': { 'id': 'smil', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, 'params': { 'force_generic_extractor': True, 'skip_download': True, }, }, # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html { 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', 'info_dict': { 'id': 'hds', 'ext': 'flv', 'title': 'hds', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from https://www.restudy.dk/video/play/id/1637 { 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', 'info_dict': { 'id': 'video_1637', 'ext': 'flv', 'title': 'video_1637', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm { 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', 'info_dict': { 'id': 'smil-service', 'ext': 'flv', 'title': 'smil-service', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 { 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', 'info_dict': { 'id': '4719370', 'ext': 'mp4', 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html { 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', 'info_dict': { 'id': 'mZlp2ctYIUEB', 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 33, }, 'params': { 'skip_download': True, }, 'skip': '404 Not Found', }, # MPD from http://dash-mse-test.appspot.com/media.html { 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', 'info_dict': { 'id': 'car-20120827-manifest', 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', 'timestamp': 1378272859.0, }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 { 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', 'info_dict': { 'id': 'content', 'ext': 'mp4', 'title': 'content', 'formats': 'mincount:8', }, 'params': { # m3u8 downloads 'skip_download': True, }, 'skip': 'video gone', }, # m3u8 served with Content-Type: text/plain { 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', 'info_dict': { 'id': 'index', 'ext': 'mp4', 'title': 'index', 'upload_date': '20140720', 'formats': 'mincount:11', }, 'params': { # m3u8 downloads 'skip_download': True, }, 'skip': 'video gone', }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', 'info_dict': { 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': '@TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, 'params': { 'skip_download': False, }, }, { # redirect in Refresh HTTP header 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', 'info_dict': { 'id': 'pO8h3EaFRdo', 'ext': 'mp4', 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', 'upload_date': '20150917', 'uploader_id': 'brtvofficial', 'uploader': 'Boiler Room', }, 'params': { 'skip_download': False, }, }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', 'info_dict': { 'id': '13601338388002', 'ext': 'mp4', 'uploader': 'www.hodiho.fr', 'title': 'R\u00e9gis plante sa Jeep', }, }, # bandcamp page with custom domain { 'add_ie': ['Bandcamp'], 'url': 'http://bronyrock.com/track/the-pony-mash', 'info_dict': { 'id': '3235767654', 'ext': 'mp3', 'title': 'The Pony Mash', 'uploader': 'M_Pallante', }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', 'info_dict': { 'id': '9ODmcdjQcHQ', 'ext': 'mp4', 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second', 'upload_date': '20140225', 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff', 'uploader': 'Tested', 'uploader_id': 'testedcom', }, # No need to test YoutubeIE here 'params': { 'skip_download': True, }, }, # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, # HEAD requests lead to endless 301, while GET is OK 'expected_warnings': ['301'], }, # RUTV embed { 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', 'info_dict': { 'id': '776940', 'ext': 'mp4', 'title': 'Охотское море стало целиком российским', 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43', }, 'params': { # m3u8 download 'skip_download': True, }, }, # TVC embed { 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', 'info_dict': { 'id': '55304', 'ext': 'mp4', 'title': 'Дошкольное воспитание', }, }, # SportBox embed { 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { 'id': '370908', 'title': 'Госзаказ. День 3', 'ext': 'mp4', }, }, { 'info_dict': { 'id': '370905', 'title': 'Госзаказ. День 2', 'ext': 'mp4', }, }, { 'info_dict': { 'id': '370902', 'title': 'Госзаказ. День 1', 'ext': 'mp4', }, }], 'params': { # m3u8 download 'skip_download': True, }, }, # Myvi.ru embed { 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', 'info_dict': { 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', 'ext': 'mp4', 'title': 'Ужастики, русский трейлер (2015)', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 153, }, 'skip': 'Site dead', }, # XHamster embed { 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', 'info_dict': { 'id': 'showthread', 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', }, 'playlist_mincount': 7, # This forum does not allow