yt-dlp/yt_dlp/extractor/common.py

import base64
import collections
import functools
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import subprocess
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_os_name,
    urllib_req_to_req,
)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..downloader.hls import HlsFD
from ..networking import HEADRequest, Request
from ..networking.exceptions import (
    HTTPError,
    IncompleteRead,
    network_exceptions,
)
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    Popen,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    netrc_from_content,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    url_basename,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * request_data  Data to send in POST request to the URL
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
                    * abr        Average audio bitrate in kbps (1000 bits/sec)
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in kbps (1000 bits/sec)
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                  (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    True if the format has DRM and cannot be downloaded.
                                 'maybe' if the format may have DRM and has to be tested before download.
                    * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
                                 with. If it is an HLS stream with an AES-128 decryption key,
                                 the query parameters will be passed to the key URI as well,
                                 unless there is an `extra_param_to_key_url` given,
                                 or unless an external key URI is provided via `hls_aes`.
                                 Only applied by the native HLS/DASH downloaders.
                    * extra_param_to_key_url  A query string to append to the URL
                                 of the format's HLS AES-128 decryption key.
                                 Only applied by the native HLS downloader.
                    * hls_aes    A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
                                 is present in the playlist:
                                 * uri  The URI from which the key will be downloaded
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size Chunk size for HTTP downloads
                                 * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
                                 * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
                    * is_dash_periods  Whether the format is a result of merging
                                 multiple DASH periods.
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creators:       List of creators of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    release_year:   Year (YYYY) as integer when the video or album was released.
                    To be used if no exact release date is known.
                    If not explicitly set, calculated from release_date.
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    channel_is_verified: Whether the channel is verified on the platform.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                                  to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "author_url" - The url to the comment author's page
                        * "author_is_verified" - Whether the author is verified
                                                 on the platform
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "is_pinned" - Whether the comment is pinned to
                                        the top of the comments
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    heatmap:        A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the data point in seconds
                        * "end_time" - The end time of the data point in seconds
                        * "value" - The normalized value of the data point (float between 0 and 1)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artists:        List of artists of the track.
    composers:      List of composers of the piece.
    genres:         List of genres of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artists:  List of all artists that appeared on the album.
                    E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
                    Useful for splits and compilations.
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    The following fields are deprecated and should not be set by new code:
    composer:       Use "composers" instead.
                    Composer(s) of the piece, comma-separated.
    artist:         Use "artists" instead.
                    Artist(s) of the track, comma-separated.
    genre:          Use "genres" instead.
                    Genre(s) of the track, comma-separated.
    album_artist:   Use "album_artists" instead.
                    All artists that appeared on the album, comma-separated.
    creator:        Use "creators" instead.
                    The creator of the video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this should also be added to the list of extractors and
    should define _VALID_URL as a regexp or a Sequence of regexps, and
    re-define the _real_extract() and (optionally) _real_initialize() methods.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults. Subclasses override the upper-case ones as described
    # in the class docstring; the lower-case ones are per-instance state.

    # Set to True at the end of initialize(); while it is set, initialize()
    # skips the pre-login, login and _real_initialize() steps
    _ready = False
    # NOTE(review): presumably the YoutubeDL instance stored by set_downloader() - confirm
    _downloader = None
    # NOTE(review): presumably the IP sent as X-Forwarded-For by the geo bypass
    # mechanism - confirm against _initialize_geo_bypass()
    _x_forwarded_for_ip = None
    # May be set to False to disable geo restriction bypass mechanisms
    _GEO_BYPASS = True
    # May contain a list of presumably geo unrestricted countries
    _GEO_COUNTRIES = None
    # May contain a list of presumably geo unrestricted IP blocks in CIDR notation
    _GEO_IP_BLOCKS = None
    # Should be set to False for broken IEs; returned by working()
    _WORKING = True
    # Should be set to False for IEs that are disabled by default
    _ENABLED = True
    # Must be defined to support username + password (or netrc) login;
    # supports_login() is True only when this is truthy
    _NETRC_MACHINE = None
    # initialize() suppresses its "login not supported" warning when this is False
    IE_DESC = None
    SEARCH_KEY = None
    # A regexp or a Sequence of regexps; False marks an embed-only extractor
    _VALID_URL = None
    # Regexes searched for in the HTML of Generic webpages.
    # NOTE: this list object is shared by every subclass that does not override it
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        """Return a user-facing hint describing how to authenticate.

        method: one of 'any', 'password', 'cookies', or None (empty hint).
                When left unset, 'any' is used if the extractor supports
                login and 'cookies' otherwise. Any other value raises KeyError.
        netrc:  machine name to show next to --netrc; falls back to
                _NETRC_MACHINE when falsy.
        """
        machine = netrc or self._NETRC_MACHINE
        credentials_hint = f'--username and --password, --netrc-cmd, or --netrc ({machine}) to provide account credentials'
        hints = {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {credentials_hint}',
            'password': f'Use {credentials_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
        }

        if method is NO_DEFAULT:
            # No explicit request: pick the broadest hint the extractor can honour
            method = 'any' if self.supports_login() else 'cookies'
        return hints[method]

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        """Pick a fake IP from one of the countries after a geo restriction error.

        Returns True when a fake IP has been selected and extraction should
        be retried, False otherwise. No IP is picked when the user forced a
        country, when geo bypass is disabled for the extractor or by the
        user, when a fake IP is already in use, or when no countries are given.
        """
        may_fake_ip = (
            not self.get_param('geo_bypass_country', None)
            and self._GEO_BYPASS
            and self.get_param('geo_bypass', True)
            and not self._x_forwarded_for_ip
            and countries)
        if not may_fake_ip:
            return False

        country_code = random.choice(countries)
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
        if not self._x_forwarded_for_ip:
            return False

        self.report_warning(
            'Video is geo restricted. Retrying extraction with fake IP '
            f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.')
        return True

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        """Name of the IE: the class name minus its last two characters."""
        class_name = cls.__name__
        return class_name[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        """Tell whether the status of the HTTPError err is allowed by expected_status.

        expected_status may be None (nothing is accepted), a callable taking
        the status code (accepted only if it returns exactly True), or one
        or several status codes.
        """
        assert isinstance(err, HTTPError)
        if expected_status is None:
            return False
        if callable(expected_status):
            return expected_status(err.status) is True
        return err.status in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
        """Return a Request built from a URL or an existing request object.

        A urllib.request.Request is converted (with a deprecation warning),
        anything that is not already a Request is wrapped in one. The
        resulting request is then updated with the data, headers, query and
        extensions given.
        """
        request = url_or_request
        if isinstance(request, urllib.request.Request):
            self._downloader.deprecation_warning(
                'Passing a urllib.request.Request to _create_request() is deprecated. '
                'Use yt_dlp.networking.common.Request instead.')
            request = urllib_req_to_req(request)
        elif not isinstance(request, Request):
            request = Request(request)

        request.update(data=data, headers=headers, query=query, extensions=extensions)
        return request

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
                         headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen(f'Sleeping {sleep_interval} seconds ...')
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        extensions = {}

        if impersonate in (True, ''):
            impersonate = ImpersonateTarget()
        requested_targets = [
            t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
            for t in variadic(impersonate)
        ] if impersonate else []

        available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
        if available_target:
            extensions['impersonate'] = available_target
        elif requested_targets:
            message = 'The extractor is attempting impersonation, but '
            message += (
                'no impersonate target is available' if not str(impersonate)
                else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
            info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
                        'for information on installing the required dependencies')
            if require_impersonation:
                raise ExtractorError(f'{message}; {info_msg}', expected=True)
            self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
        except network_exceptions as err:
            if isinstance(err, HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.response

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {err}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None,
                                 impersonate=None, require_impersonation=False):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a yt_dlp.networking.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        impersonate -- the impersonate target. Can be any of the following entities:
                - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
                - a string in the format of CLIENT[:OS]
                - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
                - a boolean value; True means any impersonate target is sufficient
        require_impersonation -- flag to toggle whether the request should raise an error
            if impersonation is not possible (bool, default: False)
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, str):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
                                     headers=headers, query=query, expected_status=expected_status,
                                     impersonate=impersonate, require_impersonation=require_impersonation)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
                                             encoding=encoding, data=data)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += f' Visit {blocked_iframe} for more details'
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "{}")'.format(block_msg.replace('\n', ' '))
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id, data=None):
        """Return the name of the file a request for url is dumped to.

        The name is made of the video id, the MD5 of the request data (if
        any) and the URL; it is shortened to the "trim_file_name" param (240
        characters by default) by replacing its tail with a hash.
        """
        data_hash = None if data is None else hashlib.md5(data).hexdigest()
        basen = join_nonempty(video_id, data_hash, url, delim='_')

        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            suffix = '___' + hashlib.md5(basen.encode()).hexdigest()
            basen = basen[:trim_length - len(suffix)] + suffix

        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        if compat_os_name != 'nt':
            return filename

        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        absfilepath = os.path.abspath(filename)
        if len(absfilepath) > 259:
            filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
                              prefix=None, encoding=None, data=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            if isinstance(url_or_request, Request):
                data = self._create_request(url_or_request, data).data
            filename = self._request_dump_filename(urlh.url, video_id, data)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        elif errnote:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        """Parse an XML string, after applying transform_source to it if given.

        A parse error is raised as an ExtractorError when fatal; otherwise a
        warning is reported and None is returned.
        """
        source = transform_source(xml_string) if transform_source else xml_string
        try:
            return compat_etree_fromstring(source.encode())
        except xml.etree.ElementTree.ParseError as err:
            message = errnote if errnote is not None else 'Failed to parse XML'
            self.__print_error(message, fatal, video_id, err)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        """Parse a JSON string with the lenient decoder.

        transform_source and any extra keyword arguments are handed to the
        decoder. A parse error is raised as an ExtractorError when fatal;
        otherwise a warning is reported and None is returned.
        """
        decoder_options = {
            'cls': LenientJSONDecoder,
            'strict': False,
            'transform_source': transform_source,
            **parser_kwargs,
        }
        try:
            return json.loads(json_string, **decoder_options)
        except ValueError as err:
            message = errnote if errnote is not None else 'Failed to parse JSON'
            self.__print_error(message, fatal, video_id, err)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):
        """Build the `_download_{name}_handle` and `_download_{name}` method pair.

        This is a plain function called while the class body is executed; it
        is not meant to be called on instances.

        @param name          Name spliced into the generated method names
        @param parser        Name of the method used to parse the content,
                             or None to return the content unparsed
        @param note          Default note of the generated methods
        @param errnote       Default errnote of the generated methods
        @param return_value  Description of the return value, used in the
                             generated docstrings
        @returns             (download_handle, download_content) functions
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            # Without a parser the downloaded content is returned untouched
            if parser is None:
                return content
            # errnote is only forwarded to the parser when it is False;
            # otherwise the parser falls back to its own default errnote
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes `_download_{name}_handle`: returns (parsed content, URL handle),
        # or False when the download failed
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                            impersonate=None, require_impersonation=False):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status,
                impersonate=impersonate, require_impersonation=require_impersonation)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Becomes `_download_{name}`: returns only the parsed content,
        # or False when the download failed
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                             impersonate=None, require_impersonation=False):
            # With "load_pages", a previously dumped response is read from disk
            # instead of being downloaded; when the dump file cannot be read, a
            # warning is reported and the request is made as usual
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
                'impersonate': impersonate,
                'require_impersonation': require_impersonation,
            }
            # transform_source is not passed on when there is no parser
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        # Gives func the name, qualified name and docstring of a method of
        # InfoExtractor (this helper is unrelated to the `impersonate`
        # parameter of the functions above)
        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    # Download methods generated by __create_download_methods(): each call
    # returns a (`_download_{name}_handle`, `_download_{name}`) pair whose
    # content is parsed by the method named in the second argument
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # No parser: only the content-returning function is kept; it is what
    # _download_webpage() calls
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        The download is attempted again when it fails with IncompleteRead,
        until `tries` attempts have been made; the last IncompleteRead is
        then re-raised.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries (5 seconds when not given)

        See _download_webpage_handle docstring for other arguments specification.
        """

        # NB: tries and timeout are unused; should they be deprecated?
        # Until they are, the sentinel must be resolved to a real interval,
        # otherwise NO_DEFAULT itself would be handed to _sleep() on a retry
        if timeout is NO_DEFAULT:
            timeout = 5

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except IncompleteRead:
                try_count += 1
                if try_count >= tries:
                    raise
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        """Report a warning prefixed with '[ie_name]' and the video id (if any).

        With only_once, a message that has already been reported by this
        instance is not reported again.
        """
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            seen_key = f'WARNING: {msg}'
            if seen_key in self._printed_messages:
                return
            self._printed_messages.add(seen_key)
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=NO_DEFAULT):
        """Report through raise_no_formats() that the video is DRM protected.

        The argument partial is no longer accepted; passing it only triggers
        a deprecation warning.
        """
        partial_given = partial is not NO_DEFAULT
        if partial_given:
            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(f'{id_or_name}: Extracting information')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(f'{video_id}: Downloading webpage')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        """Raise an (expected) ExtractorError asking the user to authenticate.

        When metadata is available and either "ignore_no_formats_error" or
        "wait_for_video" is set, only a warning is reported. Otherwise the
        login hint for method (if any) is appended to msg.
        """
        only_warn = metadata_available and (
            self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
        if only_warn:
            self.report_warning(msg)
            return

        hint = format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg + hint, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        """Return a playlist of URL results made from matches.

        getter is applied to every match to obtain its URL; duplicate URLs
        are dropped. video_kwargs are passed to url_result() for each entry
        and the remaining keyword arguments to playlist_result().
        """
        unique_urls = orderedSet(map(getter, matches), lazy=True)
        entries = (
            cls.url_result(video_url, ie, **(video_kwargs or {}))
            for video_url in unique_urls)
        return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.

        @param pattern  A single pattern (str or compiled) or an iterable of
                        patterns which are tried in order
        @param string   The text to search; None is treated as "no match"
        @param name     Field name used in the error/warning message
        @param default  Returned on failure when given (takes precedence over fatal)
        @param fatal    Raise RegexNotFoundError on failure instead of warning
        @param flags    Flags passed to re.search
        @param group    Group to return, or a list/tuple of groups to return as
                        a tuple. By default the first group that is not None
                        is returned
        """
        # Start from "no match" so that an empty iterable of patterns
        # is handled like any other failed search
        mobj = None
        if string is not None:
            if isinstance(pattern, (str, re.Pattern)):
                mobj = re.search(pattern, string, flags)
            else:
                for p in pattern:
                    mobj = re.search(p, string, flags)
                    if mobj:
                        break

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)

        if default is not NO_DEFAULT:
            return default

        # The styled name is only needed for the error/warning message
        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        if fatal:
            raise RegexNotFoundError(f'Unable to extract {_name}')
        self.report_warning(f'unable to extract {_name}' + bug_reports_message())
        return None

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
        """
        Searches string for the JSON object specified by start_pattern

        The text matched by contains_pattern is handed to _parse_json with
        ignore_extra=True; remaining keyword arguments are forwarded to it.
        Passing a default makes the search non-fatal and silences the
        parse-failure warning. Without a default, a failed non-fatal search
        yields an empty dict.
        """
        # NB: end_pattern is only used to reduce the size of the initial match
        has_default = default is not NO_DEFAULT
        if has_default:
            fatal = False
        else:
            default = {}

        search_pattern = rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})'
        json_string = self._search_regex(
            search_pattern, string, name, group='json', fatal=fatal,
            default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            if not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
            return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but the result is passed through clean_html.
        A tuple result (multiple groups) is cleaned element by element.
        """
        found = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if not isinstance(found, tuple):
            return clean_html(found)
        return tuple(clean_html(part) for part in found)

    def _get_netrc_login_info(self, netrc_machine=None):
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        cmd = self.get_param('netrc_cmd')
        if cmd:
            cmd = cmd.replace('{}', netrc_machine)
            self.to_screen(f'Executing command: {cmd}')
            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
            if ret != 0:
                raise OSError(f'Command returned error code {ret}')
            info = netrc_from_content(stdout).authenticators(netrc_machine)

        elif self.get_param('usenetrc', False):
            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
            if os.path.isdir(netrc_file):
                netrc_file = os.path.join(netrc_file, '.netrc')
            info = netrc.netrc(netrc_file).authenticators(netrc_machine)

        else:
            return None, None
        if not info:
            self.to_screen(f'No authenticators for {netrc_machine}')
            return None, None

        self.write_debug(f'Using netrc for {netrc_machine} authentication')
        return info[0], info[2]

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available try the netrc_cmd if it is defined or look in the
        netrc file using the netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            try:
                username, password = self._get_netrc_login_info(netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(f'Failed to parse .netrc: {err}')
                return None, None
        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return getpass.getpass(f'Type {note} and press [Return]: ')

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
        property_re = r'(?:name|property)=(?:\'og{sep}{prop}\'|"og{sep}{prop}"|\s*og{sep}{prop}\b)'.format(
            prop=re.escape(prop), sep='(?:&#x3A;|[:-])')
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return rf'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?){re.escape(prop)}\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2'''

    def _og_search_property(self, prop, html, name=None, **kargs):
        """
        Search html for one or more OpenGraph properties (tried in order)
        and return the unescaped content of the first match.
        name defaults to 'OpenGraph <first property>'; remaining keyword
        arguments are forwarded to _search_regex.
        """
        props = variadic(prop)
        if name is None:
            name = f'OpenGraph {props[0]}'
        og_regexes = [regex for p in props for regex in self._og_regexes(p)]
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        return None if escaped is None else unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, *, fatal=False, **kargs):
        return self._og_search_property('title', html, fatal=fatal, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """
        Search html for the content of a <meta> tag identified by one or
        more names (tried in order). display_name defaults to the first name.
        """
        names = variadic(name)
        if display_name is None:
            display_name = names[0]
        meta_regexes = list(map(self._meta_regex, names))
        return self._html_search_regex(
            meta_regexes, html, display_name,
            fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    @staticmethod
    def _rta_search(html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18

        # And then there are the jokers who advertise that they use RTA, but actually don't.
        AGE_LIMIT_MARKERS = [
            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
            r'>[^<]*you acknowledge you are at least (\d+) years old',
            r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
        ]

        age_limit = 0
        for marker in AGE_LIMIT_MARKERS:
            mobj = re.search(marker, html)
            if mobj:
                age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
        return age_limit

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
        """
        Yield all json ld objects in the html

        Every JSON_LD_RE match is parsed with _parse_json; only dict items
        are yielded. Passing a default makes parsing non-fatal.
        """
        if default is not NO_DEFAULT:
            fatal = False
        for mobj in re.finditer(JSON_LD_RE, html):
            parsed = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
            yield from (item for item in variadic(parsed) if isinstance(item, dict))

    def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
        """
        Search for a video in any json ld in the html

        All JSON-LD objects found by _yield_json_ld are handed to _json_ld.
        If that yields no info: default is returned when given, otherwise
        RegexNotFoundError is raised when fatal, otherwise a warning is
        reported and an empty dict is returned.
        Passing a default implies fatal=False.
        """
        if default is not NO_DEFAULT:
            fatal = False
        info = self._json_ld(
            list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
            video_id, fatal=fatal, expected_type=expected_type)
        if info:
            return info
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            # bug_reports_message() is appended directly, like in _search_regex
            self.report_warning(f'unable to extract JSON-LD{bug_reports_message()}')
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Build an info dict from JSON-LD data.

        @param json_ld        A JSON string (parsed with _parse_json) or
                              already parsed JSON-LD (single object or list)
        @param video_id       Passed to _parse_json when json_ld is a string
        @param fatal          Passed to _parse_json when json_ld is a string
        @param expected_type  If given, only entries whose '@type' matches
                              are considered, and processing stops after the
                              first such entry
        @returns              The collected fields passed through filter_dict,
                              or {} if json_ld is empty
        """
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        # Shared accumulator; the nested helpers below write into it
        info = {}

        # Last path component of the interaction type -> prefix of the '<kind>_count' field
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            """Whether any of expected_types is among the '@type' value(s) of e"""
            type_ = variadic(traverse_obj(e, '@type'))
            return any(x in type_ for x in expected_types)

        def extract_interaction_type(e):
            """Return the 'interactionType' of e (or its '@type' if it is a dict) via str_or_none"""
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            """Fill the '<kind>_count' fields of info from the 'interactionStatistic' of e"""
            interaction_statistic = e.get('interactionStatistic')
            # A single statistic may be given as a bare dict instead of a list
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                # Only the part after the last '/' is looked up, so URL-style types work too
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = f'{count_kind}_count'
                # The first value found for a given count wins
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            """Set info['chapters'] from the 'hasPart' entries of e whose '@type' is 'Clip'"""
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            # Walk (previous, current, next) triples; zip() stops at the shortest
            # input (chapters[1:]), so the last chapter is not visited here
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}, *chapters], chapters, chapters[1:])):
                # Fill a missing end with the next start, and a missing start with the
                # previous end (0 for the first chapter)
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                # The last chapter falls back to the duration stored in info
                # (extract_video_object sets it before calling this helper)
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            """Update info with the fields of e, then its interaction statistics and chapters"""
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            })
            if is_type(e, 'AudioObject'):
                info.update({
                    'vcodec': 'none',
                    'abr': int_or_none(e.get('bitrate')),
                })
            # Must run after the update above: chapter extraction reads info['duration']
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            """Walk the JSON-LD entries and collect the fields of supported types into info"""
            for e in variadic(json_ld):
                if not isinstance(e, dict):
                    continue
                # Top-level entries without '@context' are ignored
                if at_top_level and '@context' not in e:
                    continue
                # A top-level entry consisting only of '@context' and '@graph' is a
                # container: process its '@graph' without the '@context' requirement
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(e['@graph'], at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    # The episode name is only used as title if no title was found so far
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    # Only the first element of 'video' (or else 'subjectOf') is considered
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject', 'AudioObject'):
                    extract_video_object(e)
                    # With an expected_type, stop at the first matching entry;
                    # otherwise keep merging further entries into info
                    if expected_type is None:
                        continue
                    else:
                        break
                # Reached for every entry that is not itself a VideoObject/AudioObject:
                # pick up a nested 'video' object if there is one
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)

    def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
        """
        Search webpage for the JSON inside the __NEXT_DATA__ <script> tag.
        Passing a default implies fatal=False. The legacy string default
        '{}' is still accepted (with a deprecation warning) and is treated
        as an empty dict.
        """
        if default == '{}':
            self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
            default = {}
        if default is not NO_DEFAULT:
            fatal = False

        script_open_tag = r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>'
        return self._search_json(
            script_open_tag, webpage, 'next.js data', video_id,
            end_pattern='</script>', fatal=fatal, default=default, **kw)

    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
        rectx = re.escape(context_name)
        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
        js, arg_keys, arg_vals = self._search_regex(
            (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
            default=NO_DEFAULT if fatal else (None, None, None))
        if js is None:
            return {}

        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
            f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))

        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
        return traverse_obj(ret, traverse) or {}

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input_el)
            if not input_el:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            rf'(?is)<form[^>]+?id=(["\']){form_id}\1[^>]*>(?P<form>.+?)</form>',
            html, f'{form_id} form', group='form')
        return self._hidden_inputs(form)

    @classproperty(cache=True)
    def FormatSort(cls):
        """
        Deprecated: emits a deprecation warning and returns a FormatSorter
        subclass whose constructor passes the instance's _downloader as the
        first argument to FormatSorter.
        """
        class FormatSort(FormatSorter):
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        message = (
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        deprecation_warning(message)
        return FormatSort

    def _sort_formats(self, formats, field_preference=[]):
        if not field_preference:
            self._downloader.deprecation_warning(
                'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
            return
        self._downloader.deprecation_warning(
            'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
            'Return _format_sort_fields in the info_dict instead')
        if formats:
            formats[0]['__sort_fields'] = field_preference

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='{} video format'.format(f.get('format_id')) if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not url.startswith(('http://', 'https://')):
            return True
        try:
            self._request_webpage(url, video_id, f'Checking {item} URL', headers=headers)
            return True
        except ExtractorError as e:
            self.to_screen(
                f'{video_id}: {item} URL is invalid, skipping: {e.cause!s}')
            return False

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        if self.get_param('prefer_insecure', False):
            return 'http:'
        return 'https:'

    def _proto_relative_url(self, url, scheme=None):
        """
        Pass url through sanitize_url using the given scheme
        (e.g. 'https:'), which defaults to http_scheme().
        The scheme must end with ':'; the colon is stripped before the call.
        """
        if not scheme:
            scheme = self.http_scheme()
        assert scheme.endswith(':')
        scheme_name = scheme[:-1]
        return sanitize_url(url, scheme=scheme_name)

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        res = self._download_xml_handle(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return []

        manifest, urlh = res
        manifest_url = urlh.url

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            player_verification_challenge = akamai_pv.text.split(';')[0]
            if player_verification_challenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = join_nonempty(f4m_id, tbr or i)
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith(('http://', 'https://'))
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        """Build a format dict describing the m3u8 URL itself ('Quality selection URL').

        The entry is ranked 100 below ``preference``; when ``preference`` is
        falsy (``None`` or ``0``) it is ranked at ``-100``.
        """
        if preference:
            meta_preference = preference - 100
        else:
            meta_preference = -100

        meta_format = {
            'format_id': join_nonempty(m3u8_id, 'meta'),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': meta_preference,
            'quality': quality,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }
        return meta_format

    def _report_ignoring_subs(self, name):
        """Emit a warning (with ``only_once=True``) that subtitles of the ``name`` manifest are dropped."""
        warning_text = bug_reports_message(
            f'Ignoring subtitle tracks found in the {name} manifest; '
            'if any subtitle tracks are missing,',
        )
        self.report_warning(warning_text, only_once=True)

    def _extract_m3u8_formats(self, *args, **kwargs):
        """Formats-only wrapper around ``_extract_m3u8_formats_and_subtitles``.

        All arguments are forwarded unchanged. Any subtitles that were found
        are discarded after reporting that they are being ignored.
        """
        formats, discarded_subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
        if discarded_subs:
            self._report_ignoring_subs('HLS')
        return formats

    def _extract_m3u8_formats_and_subtitles(
            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, note=None,
            errnote=None, fatal=True, live=False, data=None, headers={},
            query={}):
        """Download an m3u8 playlist and parse it into ``(formats, subtitles)``.

        Returns ``([], {})`` when ``m3u8_url`` is empty or when the download
        fails non-fatally. With a missing URL and ``fatal`` set, an
        ExtractorError is raised unless ``errnote`` is ``False``.
        The ``ignore_no_formats_error`` param forces ``fatal`` off.
        """
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        if not m3u8_url:
            # errnote=False means "stay silent": neither raise nor warn
            if errnote is False:
                return [], {}
            missing_url_msg = errnote or 'Failed to obtain m3u8 URL'
            if fatal:
                raise ExtractorError(missing_url_msg, video_id=video_id)
            self.report_warning(f'{missing_url_msg}{bug_reports_message()}')
            return [], {}

        download_note = 'Downloading m3u8 information' if note is None else note
        download_errnote = 'Failed to download m3u8 information' if errnote is None else errnote
        response = self._download_webpage_handle(
            m3u8_url, video_id, note=download_note, errnote=download_errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if response is False:
            return [], {}

        playlist_text, url_handle = response
        # Continue with the URL reported by the response handle
        final_url = url_handle.url

        return self._parse_m3u8_formats_and_subtitles(
            playlist_text, final_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)

    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """
        Parse the text of an m3u8 playlist into a (formats, subtitles) tuple.

        A document containing #EXT-X-TARGETDURATION is treated as a media
        playlist and returned as is (one format per playlist index). Otherwise
        #EXT-X-MEDIA renditions are collected first and then every URI line is
        combined with the attributes of the preceding #EXT-X-STREAM-INF tag.

        @param m3u8_doc        Playlist text
        @param m3u8_url        URL of the playlist; relative URIs are joined
                               against it. When empty, a media playlist is
                               embedded as a data URI instead
        @param ext             Value for the 'ext' field; when falsy, stream
                               formats fall back to 'm4a' (vcodec 'none') or 'mp4'
        @param entry_protocol  Value for the 'protocol' field of the formats
        @param m3u8_id         Prefix component of the generated format ids
        @param live            When true, stream name/bitrate are left out of
                               the generated format ids
        @param fatal, data, headers, video_id
                               Only used for the extra playlist downloads made
                               when the 'hls_split_discontinuity' param is set
        @param note, errnote, query
                               Accepted but not referenced in this body
        """
        formats, subtitles = [], {}
        has_drm = HlsFD._has_drm(m3u8_doc)

        def format_url(url):
            # Absolute http(s) URLs are kept; everything else is joined against the playlist URL
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # One index per section delimited by #EXT-X-DISCONTINUITY lines;
                # the playlist is downloaded when its text was not supplied
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # No splitting requested: a single format without an index
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset after each URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            # Only renditions that carry their own URI become separate formats
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is taken as the URI of a variant stream
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                        'has_drm': has_drm,
                    }

                    # YouTube-specific
                    if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'):
                        f['language'] = yt_audio_content_id.split('.')[0]

                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        # Add a plain HTTP twin of the HLS format pointing at the progressive URI
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles

    def _extract_m3u8_vod_duration(
            self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
        """Download an m3u8 playlist non-fatally and return its VOD duration.

        The result is whatever ``_parse_m3u8_vod_duration`` yields for the
        downloaded text; a failed download is parsed as an empty document.
        """
        download_note = 'Downloading m3u8 VOD manifest' if note is None else note
        download_errnote = 'Failed to download VOD manifest' if errnote is None else errnote

        playlist_text = self._download_webpage(
            m3u8_vod_url, video_id, note=download_note, errnote=download_errnote,
            fatal=False, data=data, headers=headers, query=query)
        if not playlist_text:
            playlist_text = ''

        return self._parse_m3u8_vod_duration(playlist_text, video_id)

    def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
        if '#EXT-X-ENDLIST' not in m3u8_vod:
            return None

        return int(sum(
            float(line[len('#EXTINF:'):].split(',')[0])
            for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None

    def _extract_mpd_vod_duration(
            self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
        """Download an MPD manifest non-fatally and return its ``mediaPresentationDuration``.

        Returns None when the download did not produce an XML element;
        otherwise the root attribute is run through ``parse_duration`` and
        ``int_or_none``.
        """
        download_note = 'Downloading MPD VOD manifest' if note is None else note
        download_errnote = 'Failed to download VOD manifest' if errnote is None else errnote

        manifest = self._download_xml(
            mpd_url, video_id, note=download_note, errnote=download_errnote,
            fatal=False, data=data, headers=headers, query=query)

        if isinstance(manifest, xml.etree.ElementTree.Element):
            raw_duration = manifest.get('mediaPresentationDuration')
            return int_or_none(parse_duration(raw_duration))
        return None

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append(f'{{{namespace}}}{c}')
        return '/'.join(out)

    def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        """Download a SMIL document and parse it into ``(formats, subtitles)``.

        Returns ``([], {})`` when the download fails non-fatally. The
        ``ignore_no_formats_error`` param forces ``fatal`` off.
        """
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        downloaded = self._download_smil(
            smil_url, video_id, fatal=fatal, transform_source=transform_source)
        if downloaded is False:
            assert not fatal
            return [], {}

        smil_doc, url_handle = downloaded
        # Parse relative to the URL reported by the response handle
        final_url = url_handle.url
        smil_namespace = self._parse_smil_namespace(smil_doc)

        return self._parse_smil_formats_and_subtitles(
            smil_doc, final_url, video_id,
            f4m_params=f4m_params, namespace=smil_namespace)

    def _extract_smil_formats(self, *args, **kwargs):
        """Formats-only wrapper around ``_extract_smil_formats_and_subtitles``.

        Arguments are forwarded unchanged; found subtitles are dropped after
        reporting that they are being ignored.
        """
        formats, discarded_subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
        if discarded_subs:
            self._report_ignoring_subs('SMIL')
        return formats

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        """Download a SMIL document and return the info dict built by ``_parse_smil``.

        Returns an empty dict when the download fails non-fatally.
        """
        downloaded = self._download_smil(smil_url, video_id, fatal=fatal)
        if downloaded is False:
            return {}

        smil_doc, url_handle = downloaded
        # Parse relative to the URL reported by the response handle
        return self._parse_smil(
            smil_doc, url_handle.url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        """Fetch ``smil_url`` through ``_download_xml_handle`` with SMIL-specific messages."""
        note = 'Downloading SMIL file'
        errnote = 'Unable to download SMIL file'
        return self._download_xml_handle(
            smil_url, video_id, note, errnote,
            fatal=fatal, transform_source=transform_source)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict (id, title, description, upload date, thumbnails,
        formats, subtitles) from a parsed SMIL document.

        The passed ``video_id`` is only used while parsing formats; the id in
        the returned dict is the basename of ``smil_url`` without extension.
        """
        namespace = self._parse_smil_namespace(smil)

        formats, subtitles = self._parse_smil_formats_and_subtitles(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

        video_id = os.path.splitext(url_basename(smil_url))[0]

        # The first non-empty value found for each field wins
        title = description = upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name, content = meta.attrib.get('name'), meta.attrib.get('content')
            if not (name and content):
                continue
            if name == 'title' and not title:
                title = content
            elif name in ('description', 'abstract') and not description:
                description = content
            elif name == 'date' and not upload_date:
                upload_date = unified_strdate(content)

        thumbnails = []
        for image in smil.findall(self._xpath_ns('.//image', namespace)):
            if not image.get('src'):
                continue
            thumbnails.append({
                'id': image.get('type'),
                'url': image.get('src'),
                'width': int_or_none(image.get('width')),
                'height': int_or_none(image.get('height')),
            })

        info = {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
        return info

    def _parse_smil_namespace(self, smil):
        """Return the namespace embedded in the root tag (``{ns}smil``), or None if the tag does not match."""
        root_tag = smil.tag
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)

    def _parse_smil_formats(self, *args, **kwargs):
        """Formats-only wrapper around ``_parse_smil_formats_and_subtitles``.

        Arguments are forwarded unchanged; found subtitles are dropped after
        reporting that they are being ignored.
        """
        formats, discarded_subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
        if discarded_subs:
            self._report_ignoring_subs('SMIL')
        return formats

    def _parse_smil_formats_and_subtitles(
            self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """
        Parse a SMIL document into a (formats, subtitles) tuple.

        @param smil                Root element of the SMIL document
        @param smil_url            URL of the document; used as base for relative
                                   sources unless a <meta> in <head> supplies a
                                   'base' or 'httpBase' attribute
        @param video_id            Passed to the nested manifest/HTTP requests
        @param namespace           XML namespace of the document, if any
        @param f4m_params          Query parameters appended to f4m URLs; a
                                   default set is used when falsy
        @param transform_rtmp_url  Optional callable (streamer, src) -> (url, play_path)
                                   applied to RTMP formats

        <video>, <audio> and <media> elements are turned into RTMP, HLS, HDS,
        DASH, MSS or plain HTTP formats depending on their protocol/extension;
        <imagestream> elements become storyboard formats and <textstream>
        elements become subtitles. Each distinct 'src' is processed once.
        """
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats, subtitles = [], {}
        # Counters provide fallback format ids when no bitrate is known
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0
        imgs_count = 0

        srcs = set()
        media = itertools.chain.from_iterable(
            smil.findall(self._xpath_ns(arg, namespace))
            for arg in ['.//video', './/audio', './/media'])
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else urllib.parse.urljoin(f'{base}/', src)
            src_url = src_url.strip()

            # The extension is only needed for non-RTMP sources, so it is
            # determined here rather than before the RTMP branch
            src_ext = determine_ext(src, default_ext=None) or ext
            if not src_ext:
                # Fall back to asking the server. Use the resolved URL (a relative
                # `src` cannot be requested) and tolerate a failed request, which
                # yields a falsy result instead of a response handle
                urlh = self._request_webpage(
                    HEADRequest(src_url), video_id, note='Requesting extension info', fatal=False)
                if urlh:
                    src_ext = urlhandle_detect_ext(urlh)

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                self._merge_subtitles(m3u8_subs, target=subtitles)
                # A lone format carries no quality metadata of its own,
                # so take it from the SMIL element
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += urllib.parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
                    src_url, video_id, mpd_id='dash', fatal=False)
                formats.extend(mpd_formats)
                self._merge_subtitles(mpd_subs, target=subtitles)
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
                    src_url, video_id, ism_id='mss', fatal=False)
                formats.extend(ism_formats)
                self._merge_subtitles(ism_subs, target=subtitles)
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            imgs_count += 1
            formats.append({
                'format_id': f'imagestream-{imgs_count}',
                'url': src,
                'ext': mimetype2ext(medium.get('type')),
                'acodec': 'none',
                'vcodec': 'none',
                'width': int_or_none(medium.get('width')),
                'height': int_or_none(medium.get('height')),
                'format_note': 'SMIL storyboards',
            })

        smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
        self._merge_subtitles(smil_subs, target=subtitles)

        return formats, subtitles

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Collect subtitle tracks from the <textstream> elements of a SMIL document.

        Returns a dict mapping language to a list of ``{'url', 'ext'}`` dicts.
        Each distinct ``src`` is used once; ``subtitles_lang`` is the language
        for elements that declare none.
        """
        seen_srcs = set()
        tracks_by_lang = {}
        for node in smil.findall(self._xpath_ns('.//textstream', namespace)):
            src = node.get('src')
            if not src or src in seen_srcs:
                continue
            seen_srcs.add(src)

            ext = node.get('ext') or mimetype2ext(node.get('type')) or determine_ext(src)
            lang = (
                node.get('systemLanguage')
                or node.get('systemLanguageName')
                or node.get('lang')
                or subtitles_lang)
            tracks_by_lang.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return tracks_by_lang

    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        """Download an XSPF playlist and return its tracks as a list of entry dicts.

        @param xspf_url     URL of the XSPF document
        @param playlist_id  Used for the download messages and as the entry id
        @param fatal        Passed to the download; on a non-fatal failure an
                            empty list is returned
        """
        res = self._download_xml_handle(
            xspf_url, playlist_id, 'Downloading xspf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if res is False:
            return []

        xspf, urlh = res
        # Continue with the URL reported by the response handle
        # (presumably the post-redirect URL) for the manifest URL and its base
        xspf_url = urlh.url

        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))

    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """Turn the tracks of a parsed XSPF document into a list of entry dicts.

        Every entry uses ``playlist_id`` as its id (and as title when the
        track has none). Track locations are joined against
        ``xspf_base_url``; locations that do not yield a URL are skipped.
        """
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        def qualified(path):
            return xpath_with_ns(path, NS_MAP)

        def track_formats(track):
            collected = []
            for location in track.findall(qualified('./xspf:location')):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                collected.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(qualified('s1:label')),
                    'width': int_or_none(location.get(qualified('s1:width'))),
                    'height': int_or_none(location.get(qualified('s1:height'))),
                })
            return collected

        entries = []
        for track in xspf_doc.findall(qualified('./xspf:trackList/xspf:track')):
            title = xpath_text(track, qualified('./xspf:title'), 'title', default=playlist_id)
            description = xpath_text(track, qualified('./xspf:annotation'), 'description')
            thumbnail = xpath_text(track, qualified('./xspf:image'), 'thumbnail')
            raw_duration = xpath_text(track, qualified('./xspf:duration'), 'duration')
            duration = float_or_none(raw_duration, 1000)
            formats = track_formats(track)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries

    def _extract_mpd_formats(self, *args, **kwargs):
        """Formats-only wrapper around ``_extract_mpd_formats_and_subtitles``.

        Arguments are forwarded unchanged; found subtitles are dropped after
        reporting that they are being ignored.
        """
        formats, discarded_subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
        if discarded_subs:
            self._report_ignoring_subs('DASH')
        return formats

    def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
        """Download an MPD manifest and merge its periods into ``(formats, subtitles)``."""
        return self._merge_mpd_periods(self._extract_mpd_periods(*args, **kwargs))

    def _extract_mpd_periods(
            self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
            fatal=True, data=None, headers={}, query={}):
        """Download an MPD manifest and return what ``_parse_mpd_periods`` produces for it.

        Returns an empty list when the download fails non-fatally or yields
        no document. The ``ignore_no_formats_error`` param forces ``fatal`` off.
        """
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        download_note = 'Downloading MPD manifest' if note is None else note
        download_errnote = 'Failed to download MPD manifest' if errnote is None else errnote
        downloaded = self._download_xml_handle(
            mpd_url, video_id, note=download_note, errnote=download_errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if downloaded is False:
            return []

        manifest, url_handle = downloaded
        if manifest is None:
            return []

        # We could have been redirected to a new url when we retrieved our mpd file.
        final_url = url_handle.url
        return self._parse_mpd_periods(manifest, mpd_id, base_url(final_url), final_url)

    def _parse_mpd_formats(self, *args, **kwargs):
        """Formats-only wrapper around ``_parse_mpd_formats_and_subtitles``.

        Arguments are forwarded unchanged; found subtitles are dropped after
        reporting that they are being ignored.
        """
        formats, discarded_subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
        if discarded_subs:
            self._report_ignoring_subs('DASH')
        return formats

    def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
        """Parse an MPD document and merge its periods into ``(formats, subtitles)``."""
        return self._merge_mpd_periods(self._parse_mpd_periods(*args, **kwargs))

    def _merge_mpd_periods(self, periods):
        """
        Combine all formats and subtitles from an MPD manifest into a single list,
        by concatenate streams with similar formats.
        """
        formats, subtitles = {}, {}
        for period in periods:
            for f in period['formats']:
                assert 'is_dash_periods' not in f, 'format already processed'
                f['is_dash_periods'] = True
                format_key = tuple(v for k, v in f.items() if k not in (
                    ('format_id', 'fragments', 'manifest_stream_number')))
                if format_key not in formats:
                    formats[format_key] = f
                elif 'fragments' in f:
                    formats[format_key].setdefault('fragments', []).extend(f['fragments'])

            if subtitles and period['subtitles']:
                self.report_warning(bug_reports_message(
                    'Found subtitles in multiple periods in the DASH manifest; '
                    'if part of the subtitles are missing,',
                ), only_once=True)

            for sub_lang, sub_info in period['subtitles'].items():
                subtitles.setdefault(sub_lang, []).extend(sub_info)

        return list(formats.values()), subtitles

    def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP

        This is a generator: for every Period element of the manifest it yields
        a dict with the keys 'id' (the Period's @id or 'period-<index>'),
        'formats' (list of video/audio/storyboard format dicts) and 'subtitles'
        (mapping of language code, 'und' when unknown, to a list of subtitle
        dicts).

        @param mpd_doc          Root element of the parsed MPD document
        @param mpd_id           Optional prefix prepended to every format_id
        @param mpd_base_url     Base URL that relative BaseURL values are resolved against
        @param mpd_url          URL of the manifest itself; stored as 'manifest_url'
                                and used as 'url' of fragmented formats
        """
        if not self.get_param('dynamic_mpd', True):
            if mpd_doc.get('type') == 'dynamic':
                # NB: inside a generator this value is only carried by StopIteration;
                # callers that iterate simply get no periods
                return [], {}

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Start from a copy of the parent level's info so that values found
            # on this element override the inherited ones without touching them
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements.  We will only extract relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            # Each S element stands for one segment plus @r repetitions
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        # Counts how many formats share the same URL so that each of them
        # gets a distinct 'manifest_stream_number'
        stream_numbers = collections.defaultdict(int)
        for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
            period_entry = {
                'id': period.get('id', f'period-{period_idx}'),
                'formats': [],
                'subtitles': collections.defaultdict(list),
            }
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    # Representation attributes take precedence over the AdaptationSet ones
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                    codec_str = representation_attrib.get('codecs', '')
                    # Some kind of binary subtitle found in some youtube livestreams
                    if mime_type == 'application/x-rawcc':
                        codecs = {'scodec': codec_str}
                    else:
                        codecs = parse_codecs(codec_str)
                    if content_type not in ('video', 'audio', 'text'):
                        # Guess the content type from the MIME type and codecs
                        if mime_type == 'image/jpeg':
                            content_type = mime_type
                        elif codecs.get('vcodec', 'none') != 'none':
                            content_type = 'video'
                        elif codecs.get('acodec', 'none') != 'none':
                            content_type = 'audio'
                        elif codecs.get('scodec', 'none') != 'none':
                            content_type = 'text'
                        elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                            content_type = 'text'
                        else:
                            self.report_warning(f'Unknown MIME type {mime_type} in DASH manifest')
                            continue

                    # Build the base URL from the innermost element outwards,
                    # stopping as soon as it has become absolute
                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if try_call(lambda: base_url_e.text) is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and base_url.startswith('/'):
                        base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                    elif mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                    if representation_id is not None:
                        format_id = representation_id
                    else:
                        format_id = content_type
                    if mpd_id:
                        format_id = mpd_id + '-' + format_id
                    if content_type in ('video', 'audio'):
                        f = {
                            'format_id': format_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': f'DASH {content_type}',
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                            **codecs,
                        }
                    elif content_type == 'text':
                        f = {
                            'ext': mimetype2ext(mime_type),
                            'manifest_url': mpd_url,
                            'filesize': filesize,
                        }
                    elif content_type == 'image/jpeg':
                        # See test case in VikiIE
                        # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                        f = {
                            'format_id': format_id,
                            'ext': 'mhtml',
                            'manifest_url': mpd_url,
                            'format_note': 'DASH storyboards (jpeg)',
                            'acodec': 'none',
                            'vcodec': 'none',
                        }
                    if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                        f['has_drm'] = True
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        # Convert the DASH URL template stored under template_name into
                        # a template for the % operator, translating only the given
                        # $identifiers$
                        tmpl = representation_ms_info[template_name]
                        if representation_id is not None:
                            tmpl = tmpl.replace('$RepresentationID$', representation_id)
                        # First of, % characters outside $...$ templates
                        # must be escaped by doubling for proper processing
                        # by % operator string formatting used further (see
                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
                        t = ''
                        in_template = False
                        for c in tmpl:
                            t += c
                            if c == '$':
                                in_template = not in_template
                            elif c == '%' and not in_template:
                                t += c
                        # Next, $...$ templates are translated to their
                        # %(...) counterparts to be used with % operator
                        t = re.sub(r'\$({})\$'.format('|'.join(identifiers)), r'%(\1)d', t)
                        t = re.sub(r'\$({})%([^$]+)\$'.format('|'.join(identifiers)), r'%(\1)\2', t)
                        # Finally, unescape $$ to a literal $. str.replace returns
                        # a new string, so the result has to be assigned
                        t = t.replace('$$', '$')
                        return t

                    # @initialization is a regular template like @media one
                    # so it should be handled just the same way (see
                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            'initialization',
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            ('Bandwidth', ))
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,
                        }

                    def location_key(location):
                        # Absolute fragment locations are stored as 'url', relative ones as 'path'
                        return 'url' if re.match(r'^https?://', location) else 'path'

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                        media_location_key = location_key(media_template)

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(
                                    float_or_none(period_duration, segment_duration, default=0)))
                            representation_ms_info['fragments'] = [{
                                media_location_key: media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                },
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        else:
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_time = 0
                            segment_d = None
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                # Reads the current segment_time/segment_d/segment_number
                                # of the enclosing loop at call time
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
                                representation_ms_info['fragments'].append({
                                    media_location_key: segment_url,
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                })

                            for s in representation_ms_info['s']:
                                segment_time = s.get('t') or segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
                                for _ in range(s.get('r', 0)):
                                    segment_time += segment_d
                                    add_segment_url()
                                    segment_number += 1
                                segment_time += segment_d
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # No media template,
                        # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for _ in range(s.get('r', 0) + 1):
                                segment_uri = representation_ms_info['segment_urls'][segment_index]
                                fragments.append({
                                    location_key(segment_uri): segment_uri,
                                    'duration': duration,
                                })
                                segment_index += 1
                        representation_ms_info['fragments'] = fragments
                    elif 'segment_urls' in representation_ms_info:
                        # Segment URLs with no SegmentTimeline
                        # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                        # https://github.com/ytdl-org/youtube-dl/pull/14844
                        fragments = []
                        segment_duration = float_or_none(
                            representation_ms_info['segment_duration'],
                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                        for segment_url in representation_ms_info['segment_urls']:
                            fragment = {
                                location_key(segment_url): segment_url,
                            }
                            if segment_duration:
                                fragment['duration'] = segment_duration
                            fragments.append(fragment)
                        representation_ms_info['fragments'] = fragments
                    # If there is a fragments key available then we correctly recognized fragmented media.
                    # Otherwise we will assume unfragmented media with direct access. Technically, such
                    # assumption is not necessarily correct since we may simply have no support for
                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                    if 'fragments' in representation_ms_info:
                        f.update({
                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
                            'url': mpd_url or base_url,
                            'fragment_base_url': base_url,
                            'fragments': [],
                            'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            # The initialization segment always comes first
                            f['fragments'].append({location_key(initialization_url): initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                        if not period_duration:
                            # Fall back to the summed fragment durations
                            period_duration = try_get(
                                representation_ms_info,
                                lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                    else:
                        # Assuming direct URL to unfragmented media.
                        f['url'] = base_url
                    if content_type in ('video', 'audio', 'image/jpeg'):
                        f['manifest_stream_number'] = stream_numbers[f['url']]
                        stream_numbers[f['url']] += 1
                        period_entry['formats'].append(f)
                    elif content_type == 'text':
                        period_entry['subtitles'][lang or 'und'].append(f)
            yield period_entry

    def _extract_ism_formats(self, *args, **kwargs):
        fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('ISM')
        return fmts

    def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        res = self._download_xml_handle(
            ism_url, video_id,
            note='Downloading ISM manifest' if note is None else note,
            errnote='Failed to download ISM manifest' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return [], {}
        ism_doc, urlh = res
        if ism_doc is None:
            return [], {}

        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)

    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx

        @param ism_doc  Root element of the parsed manifest
        @param ism_url  URL of the manifest; fragment URLs are resolved against it
        @param ism_id   Optional prefix for the generated format ids
        @returns        (formats, subtitles) tuple; ([], {}) for manifests
                        with IsLive="TRUE"
        """
        if ism_doc.get('IsLive') == 'TRUE':
            return [], {}

        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        subtitles = {}
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio', 'text'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            stream_language = stream.get('Language', 'und')
            for track in stream.findall('QualityLevel'):
                # Used to derive the codec from @AudioTag when @FourCC is absent
                KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
                fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                    self.report_warning(f'{fourcc} is not a supported codec')
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    # Without @t a fragment starts where the previous one ended
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        # Without @d the duration is derived from the start time of
                        # the following fragment in the stream (NOT a child of the
                        # current <c> element); the manifest duration is used when
                        # there is no following fragment or it has no usable @t
                        next_fragment_time = None
                        if stream_fragment_index + 1 < len(stream_fragments):
                            next_fragment_time = int_or_none(
                                stream_fragments[stream_fragment_index + 1].get('t'))
                        if next_fragment_time is None:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                if stream_type == 'text':
                    subtitles.setdefault(stream_language, []).append({
                        'ext': 'ismt',
                        'protocol': 'ism',
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'fragments': fragments,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                        },
                    })
                elif stream_type in ('video', 'audio'):
                    formats.append({
                        'format_id': join_nonempty(ism_id, stream_name, tbr),
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'ext': 'ismv' if stream_type == 'video' else 'isma',
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'asr': sampling_rate,
                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
                        'acodec': 'none' if stream_type == 'video' else fourcc,
                        'protocol': 'ism',
                        'fragments': fragments,
                        # A Protection element anywhere directly under the root marks DRM
                        'has_drm': ism_doc.find('Protection') is not None,
                        'language': stream_language,
                        'audio_channels': int_or_none(track.get('Channels')),
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'width': width or 0,
                            'height': height or 0,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                            'sampling_rate': sampling_rate,
                            'channels': int_or_none(track.get('Channels', 2)),
                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                        },
                    })
        return formats, subtitles

    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """
        Collect media entries from HTML5-style <video>/<audio> tags (and their
        amp-*/dl8-* variants) found in webpage.

        Returns a list of dicts with 'formats', 'subtitles' and 'thumbnail';
        tags that yield neither formats nor subtitles are left out. Every
        format gets base_url set as its Referer header.
        """
        source_attr_names = ('src', 'data-video-src', 'data-src', 'data-source')

        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Split a "mime/type; codecs=..." value into codec fields plus 'ext'
            type_match = content_type and re.search(
                r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if not type_match:
                return {}
            parsed = parse_codecs(type_match.group('codecs'))
            parsed['ext'] = mimetype2ext(type_match.group('mimetype'))
            return parsed

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats); manifests are expanded, anything
            # else is treated as a direct media URL
            full_url = absolute_url(src)
            ext = (type_info or {}).get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                return False, self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            if ext == 'mpd':
                return False, self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            return True, [{
                'url': full_url,
                'vcodec': 'none' if cur_media_type == 'audio' else None,
                'ext': ext,
            }]

        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no content, so pad them with an empty one
        self_closing_tags = re.findall(rf'(?s)(<({_MEDIA_TAG_NAME_RE})[^>]*/>)', webpage)
        media_tags = [(tag, tag_name, kind, '') for tag, tag_name, kind in self_closing_tags]
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        media_tags += re.findall(
            rf'(?s)(<(?P<tag>{_MEDIA_TAG_NAME_RE})(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)

        entries = []
        for media_tag, _, media_type, media_content in media_tags:
            entry_formats, entry_subtitles = [], {}
            media_info = {
                'formats': entry_formats,
                'subtitles': entry_subtitles,
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(dict_get(media_attributes, source_attr_names))
            if src:
                _, tag_formats = _media_formats(
                    src, media_type, parse_content_type(media_attributes.get('type')))
                entry_formats.extend(tag_formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, source_attr_names))
                    if not src:
                        continue
                    fmt = parse_content_type(s_attr.get('type'))
                    is_plain_url, source_formats = _media_formats(src, media_type, fmt)
                    if not is_plain_url:
                        entry_formats.extend(source_formats)
                        continue
                    # width, height, res, label and title attributes are
                    # all not standard but seen several times in the wild
                    labels = [
                        value for value in map(s_attr.get, ('label', 'title'))
                        if str_or_none(value)
                    ]
                    width = int_or_none(s_attr.get('width'))
                    height = int_or_none(s_attr.get('height')) or int_or_none(s_attr.get('res'))
                    if not (width and height):
                        for label in labels:
                            resolution = parse_resolution(label)
                            if resolution:
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                    # Values of the plain URL format win over the ones derived
                    # from the attributes
                    fmt.update({
                        'width': width,
                        'height': height,
                        # First bitrate that can be parsed out of a label, if any
                        'tbr': next(filter(None, map(parse_bitrate, labels)), None),
                        'format_id': s_attr.get('label') or s_attr.get('title'),
                        **source_formats[0],
                    })
                    entry_formats.append(fmt)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without @kind are treated like subtitles
                    if kind and kind not in ('subtitles', 'captions'):
                        continue
                    src = strip_or_none(track_attributes.get('src'))
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    entry_subtitles.setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
            for fmt in entry_formats:
                fmt.setdefault('http_headers', {})['Referer'] = base_url
            if entry_formats or entry_subtitles:
                entries.append(media_info)
        return entries

    def _extract_akamai_formats(self, *args, **kwargs):
        fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('akamai')
        return fmts

    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        i += 1

        return formats, subtitles

    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        query = urllib.parse.urlparse(url).query
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '{}{}:{}'.format('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            m_url = f'{http_base_url}/{manifest}'
            if query:
                m_url += f'?{query}'
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '{}/{}'.format(rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': f'{protocol}:{url_base}',
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats

    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        """Search the webpage for the data passed to a jwplayer(...).setup(...)
        or jwplayer(...).load([...]) call.

        The lookup is delegated to _search_json with default=None.
        """
        call_start_re = r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)'''
        # must be a {...} or sequence, ending
        payload_re = r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)'
        closing_re = r'(?(load)\]|\))'
        return self._search_json(
            call_start_re, webpage, 'JWPlayer data', video_id,
            contains_pattern=payload_re, end_pattern=closing_re,
            transform_source=transform_source, default=None)

    def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
        """Find the JWPlayer data in the webpage and parse it in one step.

        Extra positional and keyword arguments are forwarded to _parse_jwplayer_data.
        """
        found = self._find_jwplayer_data(webpage, video_id, transform_source=transform_source)
        return self._parse_jwplayer_data(found, video_id, *args, **kwargs)

    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Turn a JWPlayer configuration dict into info dict(s)

        @param jwplayer_data   The JWPlayer config; anything that is not a dict
                               yields an empty list
        @param video_id        ID used for every entry; when falsy, each item's
                               'mediaid' is used instead (KeyError if missing)
        @param require_title   When true, an item without 'title' raises KeyError
        @param m3u8_id, mpd_id, rtmp_params
                               Passed through to _parse_jwplayer_formats
        @param base_url        Base used to resolve relative source, subtitle
                               and thumbnail URLs
        @returns               The single entry if exactly one was built,
                               otherwise a playlist result of all entries
        """
        entries = []
        if not isinstance(jwplayer_data, dict):
            return entries

        playlist_items = jwplayer_data.get('playlist')
        # JWPlayer backward compatibility: single playlist item/flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if not isinstance(playlist_items, list):
            playlist_items = (playlist_items or jwplayer_data, )

        for video_data in playlist_items:
            if not isinstance(video_data, dict):
                continue
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            # NB: this writes the 'sources' key into the caller's dict
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Only caption/subtitle tracks are kept; they are grouped by the
            # track label, falling back to 'en' when there is none
            subtitles = {}
            for track in traverse_obj(video_data, (
                    'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                subtitles.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url),
                })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                # The top-level duration takes precedence over the item's own
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
                'genre': clean_html(video_data.get('genre')),
                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
                'season_number': int_or_none(video_data.get('season')),
                'episode_number': int_or_none(video_data.get('episode')),
                'release_year': int_or_none(video_data.get('releasedate')),
                'age_limit': int_or_none(video_data.get('age_restriction')),
            }
            # A lone YouTube source is handed over as a url_transparent result
            # instead of being treated as a direct format
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)

    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a list of formats from JWPlayer "sources" entries.

        Entries that are not dicts, have no usable 'file' URL or repeat an
        already seen URL are skipped. HLS, DASH and SMIL sources are expanded
        through the respective manifest helpers (non-fatally); everything else
        becomes a single format.
        """
        seen_urls = set()
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in seen_urls:
                continue
            seen_urls.add(source_url)

            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)

            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
                continue
            if source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                continue
            if ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
                continue
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            if source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
                continue

            label = str_or_none(source.get('label'))
            height = int_or_none(source.get('height'))
            if height is None and label:
                # Often no height is provided but there is a label in
                # format like "1080p", "720p SD", or 1080.
                height = parse_resolution(label).get('height')
            a_format = {
                'url': source_url,
                'width': int_or_none(source.get('width')),
                'height': height,
                'tbr': int_or_none(source.get('bitrate'), scale=1000),
                'filesize': int_or_none(source.get('filesize')),
                'ext': ext,
                'format_id': label,
            }
            if source_url.startswith('rtmp'):
                a_format['ext'] = 'flv'
                # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                # of jwplayer.flash.swf
                pieces = re.split(
                    r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                if len(pieces) == 3:
                    a_format['url'] = pieces[0]
                    a_format['play_path'] = pieces[1] + pieces[2]
                if rtmp_params:
                    a_format.update(rtmp_params)
            formats.append(a_format)
        return formats

    def _live_title(self, name):
        self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
        return name

    def _int(self, v, name, fatal=False, **kwargs):
        """Convert v with int_or_none, complaining when that yields None.

        Extra keyword arguments go to int_or_none. On failure an ExtractorError
        is raised if fatal, otherwise a warning is reported and None returned.
        """
        res = int_or_none(v, **kwargs)
        if res is not None:
            return res
        msg = f'Failed to extract {name}: Could not parse value {v!r}'
        if fatal:
            raise ExtractorError(msg)
        self.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        """Convert v with float_or_none, complaining when that yields None.

        Extra keyword arguments go to float_or_none. On failure an ExtractorError
        is raised if fatal, otherwise a warning is reported and None returned.
        """
        res = float_or_none(v, **kwargs)
        if res is not None:
            return res
        msg = f'Failed to extract {name}: Could not parse value {v!r}'
        if fatal:
            raise ExtractorError(msg)
        self.report_warning(msg)
        return res

    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        cookie = http.cookiejar.Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return the cookies the downloader's cookiejar holds for the url, as a LenientSimpleCookie """
        cookie_header = self._downloader.cookiejar.get_cookie_header(url)
        return LenientSimpleCookie(cookie_header)

    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            cookies = cookies.encode('iso-8859-1').decode('utf-8')
            cookie_value = re.search(
                rf'{cookie}=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break

    @classmethod
    def get_testcases(cls, include_onlymatching=False):
        """Yield the test cases declared directly on this class.

        Uses _TEST (a single test) or _TESTS (a list), and sets each yielded
        test's 'name' to the extractor key. Tests flagged 'only_matching' are
        skipped unless include_onlymatching is set. If the class has a
        __wrapped__ class, its test cases are yielded afterwards.
        """
        # Do not look in super classes
        own_attrs = vars(cls)
        single_test = own_attrs.get('_TEST')
        if single_test:
            assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
            tests = [single_test]
        else:
            tests = own_attrs.get('_TESTS', [])

        for test in tests:
            if not include_onlymatching and test.get('only_matching', False):
                continue
            test['name'] = cls.ie_key()
            yield test

        wrapped = getattr(cls, '__wrapped__', None)
        if wrapped:
            yield from wrapped.get_testcases(include_onlymatching)

    @classmethod
    def get_webpage_testcases(cls):
        """Yield the _WEBPAGE_TESTS declared directly on this class.

        Each yielded test has its 'name' set to the extractor key. If the class
        has a __wrapped__ class, its webpage test cases are yielded afterwards.
        """
        for test in vars(cls).get('_WEBPAGE_TESTS', []):
            test['name'] = cls.ie_key()
            yield test

        wrapped = getattr(cls, '__wrapped__', None)
        if wrapped:
            yield from wrapped.get_webpage_testcases()

    @classproperty(cache=True)
    def age_limit(cls):
        """Get age limit from the testcases (0 when none of them declares one)"""
        all_tests = (
            *cls.get_testcases(include_onlymatching=False),
            *cls.get_webpage_testcases(),
        )
        # Look at the test's own info_dict as well as that of its first playlist entry
        declared_limits = traverse_obj(
            all_tests, (..., (('playlist', 0), None), 'info_dict', 'age_limit'))
        return max(declared_limits or [0])

    @classproperty(cache=True)
    def _RETURN_TYPE(cls):
        """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
        tests = tuple(cls.get_testcases(include_onlymatching=False))
        if not tests:
            return None

        # A test counts as a playlist test if any of its keys starts with "playlist"
        is_playlist_test = [
            any(key.startswith('playlist') for key in test)
            for test in tests
        ]
        if not any(is_playlist_test):
            return 'video'
        if all(is_playlist_test):
            return 'playlist'
        return 'any'

    @classmethod
    def is_single_video(cls, url):
        """Returns whether the URL is of a single video, None if unknown"""
        if not cls.suitable(url):
            return None
        known_types = {'video': True, 'playlist': False}
        return known_types.get(cls._RETURN_TYPE)

    @classmethod
    def is_suitable(cls, age_limit):
        """Test whether the extractor is generally suitable for the given age limit"""
        restricted = age_restricted(cls.age_limit, age_limit)
        return not restricted

    @classmethod
    def description(cls, *, markdown=True, search_examples=None):
        """Description of the extractor

        @param markdown         Format the result as a markdown list item
        @param search_examples  Optional queries; one is picked at random to
                                illustrate the search prefix
        """
        pieces = []
        if cls._NETRC_MACHINE:
            if markdown:
                pieces.append(f' [*{cls._NETRC_MACHINE}*](## "netrc machine")')
            else:
                pieces.append(f' [{cls._NETRC_MACHINE}]')
        if cls.IE_DESC is False:
            pieces.append(' [HIDDEN]')
        elif cls.IE_DESC:
            pieces.append(f' {cls.IE_DESC}')
        if cls.SEARCH_KEY:
            pieces.append(f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix')
            if search_examples:
                _COUNTS = ('', '5', '10', 'all')
                pieces.append(f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')
        if not cls.working():
            pieces.append(' (**Currently broken**)' if markdown else ' (Currently broken)')
        desc = ''.join(pieces)

        if markdown:
            # Escape emojis. Ref: https://github.com/github/markup/issues/1153
            escaped_name = re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
            name = ' - **{}**'.format(escaped_name)
        else:
            name = cls.IE_NAME
        if not desc:
            return name
        return f'{name}:{desc}'

    def extract_subtitles(self, *args, **kwargs):
        """Return _get_subtitles(...) if subtitles are to be written or listed, else {}."""
        wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
        if not wanted:
            return {}
        return self._get_subtitles(*args, **kwargs)

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    class CommentsDisabled(Exception):
        """Raise in _get_comments if comments are disabled for the video

        extract_comments catches it and reports both 'comments' and
        'comment_count' as None.
        """

    def extract_comments(self, *args, **kwargs):
        """Prepare the extraction of comments without running it yet.

        Returns None if the 'getcomments' param is not set. Otherwise returns a
        function that, when called, consumes the generator from _get_comments
        and returns a dict with 'comments' and 'comment_count'.
        """
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            # Only cleared once the generator is exhausted normally
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                # Keep whatever was collected so far
                self.to_screen('Interrupted by user')
            except self.CommentsDisabled:
                return {'comments': None, 'comment_count': None}
            except Exception as e:
                # Any other error is re-raised unless 'ignoreerrors' is exactly True
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                # The count is withheld when extraction did not run to completion
                'comment_count': None if interrupted else comment_count,
            }
        return extractor

    def _get_comments(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs/data
        will be dropped. """
        list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
        ret = list(subtitle_list1)
        ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
        return ret

    @classmethod
    def _merge_subtitles(cls, *dicts, target=None):
        """ Merge subtitle dictionaries, language by language. """
        if target is None:
            target = {}
        for d in dicts:
            for lang, subs in d.items():
                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
        return target

    def extract_automatic_captions(self, *args, **kwargs):
        """Return _get_automatic_captions(...) if automatic captions are to be
        written or subtitles listed, else {}."""
        wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
        if not wanted:
            return {}
        return self._get_automatic_captions(*args, **kwargs)

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    @functools.cached_property
    def _cookies_passed(self):
        """Whether cookies have been passed to YoutubeDL"""
        return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None

    def mark_watched(self, *args, **kwargs):
        """Call _mark_watched(...) if the 'mark_watched' param is set and either
        a login username is available or cookies have been passed."""
        if not self.get_param('mark_watched', False):
            return
        has_login = self.supports_login() and self._get_login_info()[0] is not None
        if has_login or self._cookies_passed:
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def geo_verification_headers(self):
        """Return a new dict holding the 'Ytdl-request-proxy' header if the
        'geo_verification_proxy' param is set, otherwise an empty dict."""
        proxy = self.get_param('geo_verification_proxy')
        if not proxy:
            return {}
        return {'Ytdl-request-proxy': proxy}

    @staticmethod
    def _generic_id(url):
        return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

    def _generic_title(self, url='', webpage='', *, default=None):
        return (self._og_search_title(webpage, default=None)
                or self._html_extract_title(webpage, default=None)
                or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
                or default)

    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
        if not duration:
            return
        chapter_list = [{
            'start_time': start_function(chapter),
            'title': title_function(chapter),
        } for chapter in chapter_list or []]
        if strict:
            warn = self.report_warning
        else:
            warn = self.write_debug
            chapter_list.sort(key=lambda c: c['start_time'] or 0)

        chapters = [{'start_time': 0}]
        for idx, chapter in enumerate(chapter_list):
            if chapter['start_time'] is None:
                warn(f'Incomplete chapter {idx}')
            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
                chapters.append(chapter)
            elif chapter not in chapters:
                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
                warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
        return chapters[1:]

    def _extract_chapters_from_description(self, description, duration):
        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
        return self._extract_chapters_helper(
            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
            start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
            duration=duration, strict=False) or self._extract_chapters_helper(
            re.findall(sep_re % (r'.+?', duration_re), description or ''),
            start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
            duration=duration, strict=False)

    @staticmethod
    def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
        all_known = all(
            x is not None for x in
            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
        return (
            'private' if is_private
            else 'premium_only' if needs_premium
            else 'subscriber_only' if needs_subscription
            else 'needs_auth' if needs_auth
            else 'unlisted' if is_unlisted
            else 'public' if all_known
            else None)

    def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
        '''
        @returns            A list of values for the extractor argument given by "key"
                            or "default" if no such key is present
        @param default      The value to return when the key is not present (default: [])
        @param ie_key       Extractor key (string) or extractor whose arguments are
                            looked up; this extractor's own if not given
        @param casesense    When false, the values are converted to lower case
        '''
        if not isinstance(ie_key, str):
            ie_key = (ie_key or self).ie_key()
        val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
        if val is None:
            if default is NO_DEFAULT:
                return []
            return default
        if casesense:
            return list(val)
        return [x.lower() for x in val]

    def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
        if not playlist_id or not video_id:
            return not video_id

        no_playlist = (smuggled_data or {}).get('force_noplaylist')
        if no_playlist is not None:
            return not no_playlist

        video_id = '' if video_id is True else f' {video_id}'
        playlist_id = '' if playlist_id is True else f' {playlist_id}'
        if self.get_param('noplaylist'):
            self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
            return False
        self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
        return True

    def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
        """Report err through RetryManager.report_retry.

        Without _count, int(fatal) is used as the attempt count. When not
        fatal, report_warning is also installed as the error handler. The
        sleep function is the 'extractor' entry of 'retry_sleep_functions'.
        """
        sleep_func = self.get_param('retry_sleep_functions', {}).get('extractor')
        error_handler = None if fatal else self.report_warning
        RetryManager.report_retry(
            err, _count or int(fatal), _retries,
            info=self.to_screen, warn=self.report_warning, error=error_handler,
            sleep_func=sleep_func)

    def RetryManager(self, **kwargs):
        """Create a RetryManager using the 'extractor_retries' param (default 3)
        as retry count and _error_or_warning for reporting."""
        retries = self.get_param('extractor_retries', 3)
        return RetryManager(retries, self._error_or_warning, **kwargs)

    def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
        """Let the Generic extractor look for embeds, with this extractor blocked.

        The note is printed first, prefixed with the display_id/id from
        info_dict when there is one. Remaining arguments are forwarded to the
        Generic extractor's _extract_embeds.
        """
        display_id = traverse_obj(info_dict, 'display_id', 'id')
        prefix = format_field(display_id, None, "%s: ")
        self.to_screen(f'{prefix}{note}')
        generic_ie = self._downloader.get_info_extractor('Generic')
        smuggled_url = smuggle_url(url, {'block_ies': [self.ie_key()]})
        return generic_ie._extract_embeds(smuggled_url, *args, **kwargs)

    @classmethod
    def extract_from_webpage(cls, ydl, url, webpage):
        """Yield the results of _extract_from_webpage, after letting ydl add
        its default extra info to each of them.

        If _extract_from_webpage is not a bound method on the class, an
        extractor instance is obtained from ydl and used instead.
        """
        if isinstance(cls._extract_from_webpage, types.MethodType):
            ie = cls
        else:
            ie = ydl.get_info_extractor(cls.ie_key())
        results = ie._extract_from_webpage(url, webpage) or []
        for info in results:
            # url = None since we do not want to set (webpage/original)_url
            ydl.add_default_extra_info(info, ie, None)
            yield info

    @classmethod
    def _extract_from_webpage(cls, url, webpage):
        """Yield a url_result for every distinct embed URL found on the webpage.

        The result is bound to this extractor unless _VALID_URL is False.
        """
        embed_urls = cls._extract_embed_urls(url, webpage) or []
        for embed_url in orderedSet(embed_urls, lazy=True):
            if cls._VALID_URL is False:
                yield cls.url_result(embed_url, None)
            else:
                yield cls.url_result(embed_url, cls)

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """@returns all the embed urls on the webpage"""
        if '_EMBED_URL_RE' not in cls.__dict__:
            assert isinstance(cls._EMBED_REGEX, (list, tuple))
            for idx, regex in enumerate(cls._EMBED_REGEX):
                assert regex.count('(?P<url>') == 1, \
                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))

        for regex in cls._EMBED_URL_RE:
            for mobj in regex.finditer(webpage):
                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
                if cls._VALID_URL is False or cls.suitable(embed_url):
                    yield embed_url

    # NOTE(review): neither raised nor caught in this part of the file;
    # presumably a signal for aborting an extraction early - confirm against users
    class StopExtraction(Exception):
        pass

    @classmethod
    def _extract_url(cls, webpage):  # TODO: Remove
        """Only for compatibility with some older extractors"""
        return next(iter(cls._extract_embed_urls(None, webpage) or []), None)

    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        """Register a subclass declared with plugin_name=... as an override.

        Such a class takes the place of the class it directly derives from:
        it reuses that class's ie_key, gets the IE_NAME "<original>+<plugin_name>"
        and is rebound to the original class's name in the original's module.
        Subclasses declared without plugin_name are left untouched.
        """
        if plugin_name:
            mro = inspect.getmro(cls)
            # The class directly after cls in the MRO is the one being overridden
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            # Follow the chain of overrides down to the class at its bottom
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            # Rebind the bottom class's name in its module to the new class
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept "URLs" of the form {_SEARCH_KEY}{count}:{query}, where count
    is empty (one result), a positive number, or "all".
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return rf'{cls._SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'

    def _real_extract(self, query):
        """Parse the count prefix and the query, then fetch that many results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            count = 1
        elif prefix == 'all':
            count = self._MAX_RESULTS
        else:
            count = int(prefix)
            if count <= 0:
                raise ExtractorError(f'invalid download number {count} for query "{query}"')
            if count > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
                count = self._MAX_RESULTS
        return self._get_n_results(query, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # An infinite n means no upper bound on the slice
        limit = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY


class UnsupportedURLIE(InfoExtractor):
    """Extractor that matches every URL and always raises UnsupportedError for it."""

    IE_DESC = False
    _ENABLED = False
    # '.*' matches any URL
    _VALID_URL = '.*'

    def _real_extract(self, url):
        # There is nothing to extract: reject the URL unconditionally
        raise UnsupportedError(url)


# Registry keyed by the original (fully unwrapped) extractor class; each value
# is the list of classes registered as overriding it. Entries are appended by
# the `__init_subclass__` hook defined earlier in this module.
_PLUGIN_OVERRIDES = collections.defaultdict(list)
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								import base64
-												[downloader/ffmpeg] Fix for direct videos inside mpd manifests
Closes #1751

											
										
										
											2021-11-22 19:02:14 +01:00
+								import collections
-												[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)

Authored by: bashonly, seproDev, Grub4K

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
											
										
										
											2024-05-26 21:27:21 +02:00
+								import functools
-												[compat] Remove more functions

Removing any more will require changes to a large number of extractors

											
										
										
											2022-06-24 10:10:17 +02:00
+								import getpass
-												[extractor/common] Limit --write-pages filename to 200 chars

This avoids problems with very long URLs.

											
										
										
											2014-01-17 14:47:46 +01:00
+								import hashlib
-												[cleanup] Consistent style for file heads

											
										
										
											2022-06-24 13:06:16 +02:00
+								import http.client
 								import http.cookiejar
 								import http.cookies
-												Allow plugin extractors to replace the built-in ones

This allows easier plugin chaining; e.g.
- https://gist.github.com/pukkandan/24f13ff1ed385c5a390c1d7bd130d8f7
- https://gist.github.com/pukkandan/fcf5ca1785c80f64e471f0ee14f990fb

											
										
										
											2022-09-16 13:07:38 +02:00
+								import inspect
-												[extractor] Simplify search extractors

											
										
										
											2021-10-08 22:39:55 +02:00
+								import itertools
-												[khanacademy] Add support (Fixes #2066)

											
										
										
											2014-01-07 09:35:34 +01:00
+								import json
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								import math
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											2014-07-11 10:57:08 +02:00
+								import netrc
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								import os
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								import random
-												Remove Python 3.6 support

Closes #3764

											
										
										
											2022-07-18 02:20:54 +02:00
+								import re
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								import subprocess
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								import sys
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											2014-07-11 10:57:08 +02:00
+								import time
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								import types
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								import urllib.parse
-												[compat] Remove more functions

Removing any more will require changes to a large number of extractors

											
										
										
											2022-06-24 10:10:17 +02:00
+								import urllib.request
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								import xml.etree.ElementTree
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[networking] Rewrite architecture (#2861)

New networking interface consists of a `RequestDirector` that directs
each `Request` to appropriate `RequestHandler` and returns the
`Response` or raises `RequestError`. The handlers define adapters to
transform its internal Request/Response/Errors to our interfaces.

User-facing changes:
- Fix issues with per request proxies on redirects for urllib
- Support for `ALL_PROXY` environment variable for proxy setting
- Support for `socks5h` proxy
   - Closes https://github.com/yt-dlp/yt-dlp/issues/6325, https://github.com/ytdl-org/youtube-dl/issues/22618, https://github.com/ytdl-org/youtube-dl/pull/28093
- Raise error when using `https` proxy instead of silently converting it to `http`

Authored by: coletdjnz

											
										
										
											2023-07-15 12:25:23 +02:00
+								from ..compat import (
 								    compat_etree_fromstring,
 								    compat_expanduser,
 								    compat_os_name,
 								    urllib_req_to_req,
 								)
-												[cookies] Parse cookies leniently (#4780)

Closes #4776, #3778
Authored by: Grub4K

											
										
										
											2022-09-16 19:02:00 +02:00
+								from ..cookies import LenientSimpleCookie
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								from ..downloader.f4m import get_base_url, remove_encrypted_media
-												[core] Allow extractors to mark formats as potentially DRM (#7396)

This is useful for HLS where detecting whether the format is
actually DRM requires the child manifest to be downloaded.

Makes the error message when using `--test` inconsistent,
but doesn't really matter.

											
										
										
											2023-07-06 15:09:50 +02:00
+								from ..downloader.hls import HlsFD
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								from ..networking import HEADRequest, Request
 								from ..networking.exceptions import (
 								    HTTPError,
 								    IncompleteRead,
 								    network_exceptions,
 								)
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								from ..networking.impersonate import ImpersonateTarget
-												[util] Move compatibility functions out of util

utils is large enough without these compatibility functions.

Everything that is present in newer versions of Python (i.e. with dev Python it's just an import) goes into compat.py .
Everything else (i.e. youtube-dl-specific helpers) goes into utils.py .

											
										
										
											2014-11-02 11:23:40 +01:00
+								from ..utils import (
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    IDENTITY,
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								    JSON_LD_RE,
 								    NO_DEFAULT,
 								    ExtractorError,
-												[utils] Move format sorting code into `utils`

											
										
										
											2022-11-17 06:33:20 +01:00
+								    FormatSorter,
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								    GeoRestrictedError,
 								    GeoUtils,
-												[extractor] Add `_search_json`

All fetching of JSON objects should eventually be done with this function
but only `youtube` is being refactored for now

											
										
										
											2022-06-03 17:32:31 +02:00
+								    LenientJSONDecoder,
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								    Popen,
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								    RegexNotFoundError,
-												Standardize retry mechanism (#1649)

* [utils] Create `RetryManager`
* Migrate all retries to use the manager
* [extractor] Add wrapper methods for convenience
* Standardize console messages for retries
* Add `--retry-sleep` for extractors
											
										
										
											2022-08-01 22:13:18 +02:00
+								    RetryManager,
-												[cleanup] Sort imports

Using https://github.com/PyCQA/isort

    isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc .

											
										
										
											2022-04-12 00:32:57 +02:00
+								    UnsupportedError,
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											2015-01-07 07:20:20 +01:00
+								    age_restricted,
-												[utils] Introduce base_url

											
										
										
											2016-11-01 20:14:01 +01:00
+								    base_url,
-												InfoExtractor._search_regex: Suggest updating when the regex is not found (suggested in #5442)

Reuse the same message from ExtractorError

											
										
										
											2015-04-17 14:55:24 +02:00
+								    bug_reports_message,
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								    classproperty,
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    clean_html,
-												[utils] Move format sorting code into `utils`

											
										
										
											2022-11-17 06:33:20 +01:00
+								    deprecation_warning,
-												[extractor/common] Recursively extract child f4m manifests

											
										
										
											2015-07-15 21:15:15 +02:00
+								    determine_ext,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								    dict_get,
-												[iq.com] Add extractors (#2354)

Closes #704
Authored by: MinePlayersPE
											
										
										
											2022-01-19 23:23:55 +01:00
+								    encode_data_uri,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											2017-03-09 00:13:54 +01:00
+								    extract_attributes,
-												[utils] Add `filter_dict`

											
										
										
											2022-03-28 04:51:45 +02:00
+								    filter_dict,
-												[extractor/common] Handle malformed f4m manifests

											
										
										
											2015-07-15 21:14:08 +02:00
+								    fix_xml_ampersands,
-												[golem] Simplify (#3828)

											
										
										
											2014-09-28 10:34:55 +02:00
+								    float_or_none,
-												[cleanup] Misc

											
										
										
											2021-08-07 13:20:46 +02:00
+								    format_field,
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											2014-07-28 15:25:56 +02:00
+								    int_or_none,
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								    join_nonempty,
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
+								    js_to_json,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											2017-03-09 00:13:54 +01:00
+								    mimetype2ext,
-												[cleanup] Misc

Closes #6288, Closes #7197, Closes #7265, Closes #7353, Closes #5773
Authored by: mikf, freezboltz, pukkandan

											
										
										
											2023-06-21 05:51:20 +02:00
+								    netrc_from_content,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											2017-03-09 00:13:54 +01:00
+								    orderedSet,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								    parse_bitrate,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											2017-03-09 00:13:54 +01:00
+								    parse_codecs,
 								    parse_duration,
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											2016-01-15 19:36:02 +01:00
+								    parse_iso8601,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											2017-03-09 00:13:54 +01:00
+								    parse_m3u8_attributes,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								    parse_resolution,
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											2017-03-09 00:13:54 +01:00
+								    sanitize_filename,
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    sanitize_url,
-												[extractor/generic] Separate embed extraction into own function (#5176)


											
										
										
											2022-10-09 12:39:36 +02:00
+								    smuggle_url,
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								    str_or_none,
-												[extractor/common] Relax interaction count extraction in _json_ld

											
										
										
											2020-09-19 01:33:17 +02:00
+								    str_to_int,
-												[extractor/common] Strip src attribute for HTML5 entries code (closes #18485, closes #21169)

											
										
										
											2019-05-23 18:52:11 +02:00
+								    strip_or_none,
-												Add `--extractor-args` to pass extractor-specific arguments

											
										
										
											2021-06-25 16:05:41 +02:00
+								    traverse_obj,
-												[cleanup] Misc

											
										
										
											2022-11-30 07:04:51 +01:00
+								    truncate_string,
-												[extractor] Fix empty `BaseURL` in MPD

Closes #4113

											
										
										
											2022-06-29 02:37:21 +02:00
+								    try_call,
-												[extractor] Fix for manifests without period duration

Closes #2705
Authored by: dirkf, pukkandan

											
										
										
											2022-02-17 14:36:22 +01:00
+								    try_get,
-												Use unescapeHTML for OpenGraph properties

These are attribute values, so we don't need the more complex and whitespace-destroying cleanHTML - we just need to unescape quotes, that's it.

											
										
										
											2013-07-17 10:38:23 +02:00
+								    unescapeHTML,
-												[extractor/common] Extract upload date from SMIL

											
										
										
											2015-10-01 18:18:59 +02:00
+								    unified_strdate,
-												[extractor/common] Extract more metadata for VideoObject in _json_ld

											
										
										
											2016-07-08 22:27:11 +02:00
+								    unified_timestamp,
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								    url_basename,
-												[extractor/common] Add validation for JSON-LD URLs

											
										
										
											2018-10-28 18:19:08 +01:00
+								    url_or_none,
-												[downloader/hls] Allow extractors to provide AES key (#6158)

and related cleanup

Authored by: bashonly, Grub4K

Co-authored-by: Simon Sawicki <contact@grub4k.xyz>

											
										
										
											2023-02-08 06:33:54 +01:00
+								    urlhandle_detect_ext,
-												[cleanup] Misc

											
										
										
											2021-08-07 13:20:46 +02:00
+								    urljoin,
-												[utils] Add `variadic`

											
										
										
											2021-07-10 23:59:44 +02:00
+								    variadic,
-												[common] Fix <bootstrapInfo> detection in F4M manifests

Regression since 0a5685b26fae0940f14cb063a6e4fc6986f9c124

											
										
										
											2016-06-07 18:19:33 +02:00
+								    xpath_element,
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											2015-08-09 15:07:18 +02:00
+								    xpath_text,
 								    xpath_with_ns,
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								)
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											2015-06-28 18:56:45 +02:00
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								class InfoExtractor:
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    """Information Extractor class.
 								    Information extractors are the classes that, given a URL, extract
 								    information about the video (or videos) the URL refers to. This
 								    information includes the real video URL, the video title, author and
 								    others. The information is stored in a dictionary which is then
-												[extractor/common] Update docstring: replace FileDownloader with YoutubeDL

											
										
										
											2014-12-21 16:58:29 +01:00
+								    passed to the YoutubeDL. The YoutubeDL processes this
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    information possibly downloading the video to the file system, among
 								    other possible outcomes.
-												Typo: twice "the the" to "the"
											
										
										
											2015-04-29 17:03:10 +02:00
+								    The type field determines the type of the result.
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											2014-11-20 16:47:59 +01:00
+								    By far the most common value (and the default if _type is missing) is
 								    "video", which indicates a single video.
 								    For a video, the dictionaries must include the following fields:
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								    id:             Video identifier.
-												Remove warning for videos with an empty title

											
										
										
											2022-05-07 16:15:00 +02:00
+								    title:          Video title, unescaped. Set to an empty string if video has
 								                    no title as opposed to "None" which signifies that the
 								                    extractor failed to obtain a title
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
-												Add a resolution field and improve general --list-formats output

											
										
										
											2013-12-24 11:56:02 +01:00
+								    Additionally, it must contain either a formats entry or a url one:
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
-												Add a resolution field and improve general --list-formats output

											
										
										
											2013-12-24 11:56:02 +01:00
+								    formats:        A list of dictionaries for each format available, ordered
 								                    from worst to best quality.
 								                    Potential fields:
-												[extractor/common] Clarify url and manifest_url meta fields

											
										
										
											2019-03-04 18:39:15 +01:00
+								                    * url        The mandatory URL representing the media:
 								                                   for plain file media - HTTP URL of this file,
 								                                   for RTMP - RTMP URL,
 								                                   for HLS - URL of the M3U8 media playlist,
 								                                   for HDS - URL of the F4M manifest,
-												[extractor/common] Fix url meta field for unfragmented DASH formats (closes #20346)

											
										
										
											2019-03-14 18:42:14 +01:00
+								                                   for DASH
 								                                     - HTTP URL to plain file media (in case of
 								                                       unfragmented media)
 								                                     - URL of the MPD manifest or base URL
 								                                       representing the media if MPD manifest
-												[extractor/common] Fix typo

											
										
										
											2019-05-10 23:53:48 +02:00
+								                                       is parsed from a string (in case of
-												[extractor/common] Fix url meta field for unfragmented DASH formats (closes #20346)

											
										
										
											2019-03-14 18:42:14 +01:00
+								                                       fragmented media)
-												[extractor/common] Clarify url and manifest_url meta fields

											
										
										
											2019-03-04 18:39:15 +01:00
+								                                   for MSS - URL of the ISM manifest.
-												[extractor/youtube] Fix 5038f6d713303e0967d002216e7a88652401c22a

* [fragment] Fix `request_data`
* [youtube] Don't use POST for now. It may be easier to break in future

Authored by: bashonly, coletdjnz

											
										
										
											2023-02-28 19:04:43 +01:00
+								                    * request_data  Data to send in POST request to the URL
-												Refactor fragments interface and dash segments downloader
- Eliminate segment_urls and initialization_url
+ Introduce manifest_url (manifest may contain unfragmented data in this case url will be used for direct media URL and manifest_url for manifest itself correspondingly)
* Rewrite dashsegments downloader to use fragments data
* Improve generic mpd extraction

											
										
										
											2016-09-17 15:35:22 +02:00
+								                    * manifest_url
 								                                 The URL of the manifest file in case of
-												[extractor/common] Clarify url and manifest_url meta fields

											
										
										
											2019-03-04 18:39:15 +01:00
+								                                 fragmented media:
 								                                   for HLS - URL of the M3U8 master playlist,
 								                                   for HDS - URL of the F4M manifest,
 								                                   for DASH - URL of the MPD manifest,
 								                                   for MSS - URL of the ISM manifest.
-												[cleanup] Misc fixes

Closes https://github.com/yt-dlp/yt-dlp/pull/3213, Closes https://github.com/yt-dlp/yt-dlp/pull/3117

Related: https://github.com/yt-dlp/yt-dlp/issues/3146#issuecomment-1077323114, https://github.com/yt-dlp/yt-dlp/pull/3277#discussion_r841019671, https://github.com/yt-dlp/yt-dlp/commit/a825ffbffa0bea322e3ccb44c6f8e01d8d9572fb#commitcomment-68538986, https://github.com/yt-dlp/yt-dlp/issues/2360, https://github.com/yt-dlp/yt-dlp/commit/5fa3c9a88f597625296981a4a26be723e65d4842#r70393519, https://github.com/yt-dlp/yt-dlp/commit/5fa3c9a88f597625296981a4a26be723e65d4842#r70393254

											
										
										
											2022-03-27 04:20:43 +02:00
+								                    * manifest_stream_number  (For internal use only)
 								                                 The index of the stream in the manifest file
-												[extractor/common] Consistent URL spelling

											
										
										
											2015-07-23 19:37:45 +02:00
+								                    * ext        Will be calculated from URL if missing
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
+								                    * format     A human-readable description of the format
 								                                 ("mp4 container with h264/opus").
 								                                 Calculated from the format_id, width, height,
 								                                 and format_note fields if missing.
 								                    * format_id  A short description of the format
-												Document that format_id field should be present

											
										
										
											2013-12-26 21:19:00 +01:00
+								                                 ("mp4_h264_opus" or "19").
 								                                Technically optional, but strongly recommended.
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
+								                    * format_note Additional info about the format
 								                                 ("3D" or "DASH video")
 								                    * width      Width of the video, if known
 								                    * height     Height of the video, if known
-												Add new field `aspect_ratio`

Closes #5402

											
										
										
											2022-11-16 02:22:57 +01:00
+								                    * aspect_ratio  Aspect ratio of the video, if known
 								                                 Automatically calculated from width and height
-												Add a resolution field and improve general --list-formats output

											
										
										
											2013-12-24 11:56:02 +01:00
+								                    * resolution Textual description of width and height
-												Add new field `aspect_ratio`

Closes #5402

											
										
										
											2022-11-16 02:22:57 +01:00
+								                                 Automatically calculated from width and height
-												Add HDR information to formats

											
										
										
											2021-10-18 15:04:21 +02:00
+								                    * dynamic_range The dynamic range of the video. One of:
 								                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
-												[core] Fix `filesize_approx` calculation (#9560)

Reverts 22e4dfacb61f62dfbb3eb41b31c7b69ba1059b80

Despite being documented as `Kbit/s`, the extractors/manifests were returning bitrates in SI units of kilobits/sec.

Authored by: seproDev, pukkandan
											
										
										
											2024-04-01 01:17:24 +02:00
+								                    * tbr        Average bitrate of audio and video in kbps (1000 bits/sec)
 								                    * abr        Average audio bitrate in kbps (1000 bits/sec)
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
+								                    * acodec     Name of the audio codec in use
-												[youtube] Download DASH manifest

If given, download and parse the DASH manifest file, in order to get ultra-HQ formats.
Fixes #2166

											
										
										
											2014-01-19 05:47:20 +01:00
+								                    * asr        Audio sampling rate in Hertz
-												[extractor] Add field `audio_channels`

											
										
										
											2022-08-07 22:05:36 +02:00
+								                    * audio_channels  Number of audio channels
-												[core] Fix `filesize_approx` calculation (#9560)

Reverts 22e4dfacb61f62dfbb3eb41b31c7b69ba1059b80

Despite being documented as `Kbit/s`, the extractors/manifests were returning bitrates in SI units of kilobits/sec.

Authored by: seproDev, pukkandan
											
										
										
											2024-04-01 01:17:24 +02:00
+								                    * vbr        Average video bitrate in kbps (1000 bits/sec)
-												[youtube] Add formats 298, 299 (Fixes #4056)

											
										
										
											2014-10-30 09:34:13 +01:00
+								                    * fps        Frame rate
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
+								                    * vcodec     Name of the video codec in use
-												[youtube] Add new formats (Fixes #2221)

											
										
										
											2014-01-23 23:54:06 +01:00
+								                    * container  Name of the container format
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
+								                    * filesize   The number of bytes, if known in advance
-												[snotr] PEP8 and minor fixes (#3296)

											
										
										
											2014-07-21 12:02:44 +02:00
+								                    * filesize_approx  An estimate for the number of bytes
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
+								                    * player_url SWF Player URL (used for rtmpdump).
-												[zdf] Use centralized sorting

											
										
										
											2013-12-24 23:32:04 +01:00
+								                    * protocol   The protocol that will be used for the actual
-												[dash,youtube] Download live from start to end (#888)

* Add option `--live-from-start` to enable downloading live videos from start
* Add key `is_from_start` in formats to identify formats (of live videos) that downloads from start
* [dash] Create protocol `http_dash_segments_generator` that allows a function to be passed instead of fragments
* [fragment] Allow multiple live dash formats to download simultaneously
* [youtube] Implement fragment re-fetching for the live dash formats
* [youtube] Re-extract dash manifest every 5 hours (manifest expires in 6hrs)
* [postprocessor/ffmpeg] Add `FFmpegFixupDuplicateMoovPP` to fixup duplicated moov atoms

Known issue: Ctrl+C doesn't work on Windows when downloading multiple formats

Closes #1521
Authored by: nao20010128nao, pukkandan

											
										
										
											2021-12-20 07:06:46 +01:00
+								                                 download, lower-case. One of "http", "https" or
 								                                 one of the protocols defined in downloader.PROTOCOL_MAP
-												[extractor/common] Document forgotten fragment base and path interfaces

											
										
										
											2017-01-28 23:56:43 +01:00
+								                    * fragment_base_url
 								                                 Base URL for fragments. Each fragment's path
 								                                 value (if present) will be relative to
 								                                 this URL.
 								                    * fragments  A list of fragments of a fragmented media.
 								                                 Each fragment entry must contain either a URL
 								                                 or a path. If a URL is present it should be
 								                                 considered by a client. Otherwise both path and
 								                                 fragment_base_url must be present. Here is
 								                                 the list of all potential fields:
 								                                 * "url" - fragment's URL
 								                                 * "path" - fragment's path relative to
 								                                            fragment_base_url
-												[extractor/common] Introduce fragments interface

											
										
										
											2016-09-05 20:18:57 +02:00
+								                                 * "duration" (optional, int or float)
 								                                 * "filesize" (optional, int)
-												[dash,youtube] Download live from start to end (#888)

* Add option `--live-from-start` to enable downloading live videos from start
* Add key `is_from_start` in formats to identify formats (of live videos) that downloads from start
* [dash] Create protocol `http_dash_segments_generator` that allows a function to be passed instead of fragments
* [fragment] Allow multiple live dash formats to download simultaneously
* [youtube] Implement fragment re-fetching for the live dash formats
* [youtube] Re-extract dash manifest every 5 hours (manifest expires in 6hrs)
* [postprocessor/ffmpeg] Add `FFmpegFixupDuplicateMoovPP` to fixup duplicated moov atoms

Known issue: Ctrl+C doesn't work on Windows when downloading multiple formats

Closes #1521
Authored by: nao20010128nao, pukkandan

											
										
										
											2021-12-20 07:06:46 +01:00
+								                    * is_from_start  Is a live format that can be downloaded
 								                                from the start. Boolean
-												Add a resolution field and improve general --list-formats output

											
										
										
											2013-12-24 11:56:02 +01:00
+								                    * preference Order number of this format. If this field is
-												[wistia] Prefer original video format above all others

We could also set up a formula which would weigh filesize/bitrate and vcodec/acodec (say, 1GB h264 < 3 GB MPEG2 < 2 GB h264), but that would get really messy real soon.

											
										
										
											2014-01-01 20:23:47 +01:00
+								                                 present and not None, the formats get sorted
-												[extractor/common] Clarify preference key in formats

											
										
										
											2014-03-23 17:41:43 +01:00
+								                                 by this field, regardless of all other values.
-												Add a resolution field and improve general --list-formats output

											
										
										
											2013-12-24 11:56:02 +01:00
+								                                 -1 for default (order by other properties),
 								                                 -2 or smaller for less than default.
-												[youtube] Correct handling when DASH manifest is not necessary to find all formats

											
										
										
											2015-01-03 18:33:38 +01:00
+								                                 < -1000 to hide the format (if there is
 								                                    another one which is strictly better)
-												[ccc] Add language information to formats

											
										
										
											2016-01-01 13:28:45 +01:00
+								                    * language   Language code, e.g. "de" or "en-US".
 								                    * language_preference  Is this in the language mentioned in
 								                                 the URL?
-												[arte] Clean up format sorting mess

We now use our standard sorting facilities. As a side effect, it's finally possible to download German videos from French URLs and vice versa.

											
										
										
											2014-11-20 12:06:33 +01:00
+								                                 10 if it's what the URL is about,
 								                                 -1 for default (don't know),
 								                                 -10 otherwise, other values reserved for now.
-												[orf] Use new extraction method (Fixes #2057)

											
										
										
											2014-01-06 17:15:27 +01:00
+								                    * quality    Order number of the video quality of this
 								                                 format, irrespective of the file format.
 								                                 -1 for default (order by other properties),
 								                                 -2 or smaller for less than default.
-												[viddler] Use API

											
										
										
											2014-10-25 00:10:11 +02:00
+								                    * source_preference  Order number for this video source
 								                                  (quality takes higher priority)
 								                                 -1 for default (order by other properties),
 								                                 -2 or smaller for less than default.
-												[grooveshark,http] Make HTTP POST downloads work

											
										
										
											2014-08-24 01:31:35 +02:00
+								                    * http_headers  A dictionary of additional HTTP headers
 								                                 to add to the request.
-												[youtube|ffmpeg] Automatically correct video with non-square pixels (Fixes #4674)

											
										
										
											2015-01-10 05:45:51 +01:00
+								                    * stretched_ratio  If given and not 1, indicates that the
-												[rtl2] PEP8, simplify, make rtmp tests run (#470)

											
										
										
											2015-01-25 18:09:48 +01:00
+								                                 video's pixels are not square.
 								                                 width : height ratio as float.
 								                    * no_resume  The server does not support resuming the
 								                                 (HTTP or RTMP) download. Boolean.
-												[core] Allow extractors to mark formats as potentially DRM (#7396)

This is useful for HLS where detecting whether the format is
actually DRM requires the child manifest to be downloaded.

Makes the error message when using `--test` inconsistent,
but doesn't really matter.

											
										
										
											2023-07-06 15:09:50 +02:00
+								                    * has_drm    True if the format has DRM and cannot be downloaded.
 								                                 'maybe' if the format may have DRM and has to be tested before download.
-												[downloader/hls] Allow extractors to provide AES key (#6158)

and related cleanup

Authored by: bashonly, Grub4K

Co-authored-by: Simon Sawicki <contact@grub4k.xyz>

											
										
										
											2023-02-08 06:33:54 +01:00
+								                    * extra_param_to_segment_url  A query string to append to each
 								                                 fragment's URL, or to update each existing query string
-												[fd/hls] Apply `extra_param_to_key_url` from info dict

Authored by: bashonly

											
										
										
											2024-06-03 18:22:49 +02:00
+								                                 with. If it is an HLS stream with an AES-128 decryption key,
 								                                 the query parameters will be passed to the key URI as well,
 								                                 unless there is an `extra_param_to_key_url` given,
 								                                 or unless an external key URI is provided via `hls_aes`.
 								                                 Only applied by the native HLS/DASH downloaders.
 								                    * extra_param_to_key_url  A query string to append to the URL
 								                                 of the format's HLS AES-128 decryption key.
 								                                 Only applied by the native HLS downloader.
-												[downloader/hls] Allow extractors to provide AES key (#6158)

and related cleanup

Authored by: bashonly, Grub4K

Co-authored-by: Simon Sawicki <contact@grub4k.xyz>

											
										
										
											2023-02-08 06:33:54 +01:00
+								                    * hls_aes    A dictionary of HLS AES-128 decryption information
 								                                 used by the native HLS downloader to override the
 								                                 values in the media playlist when an '#EXT-X-KEY' tag
 								                                 is present in the playlist:
 								                                 * uri  The URI from which the key will be downloaded
 								                                 * key  The key (as hex) used to decrypt fragments.
 								                                        If `key` is given, any key URI will be ignored
 								                                 * iv   The IV (as hex) used to decrypt fragments
-												Improve `--clean-infojson`

It should not removes fields that may be needed for `--load-infojson`.
Eg: `_ffmpeg_args`, `_has_drm`

											
										
										
											2022-04-27 18:22:57 +02:00
+								                    * downloader_options  A dictionary of downloader options
 								                                 (For internal use only)
 								                                 * http_chunk_size Chunk size for HTTP downloads
-												[fd/ffmpeg] Accept output args from info dict (#9278)

Authored by: bashonly
											
										
										
											2024-03-30 00:16:46 +01:00
+								                                 * ffmpeg_args     Extra arguments for ffmpeg downloader (input)
 								                                 * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								                    * is_dash_periods  Whether the format is a result of merging
 								                                 multiple DASH periods.
-												Release 2021.06.08

											
										
										
											2021-06-08 16:43:41 +02:00
+								                    RTMP formats can also have the additional fields: page_url,
 								                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
 								                    rtmp_protocol, rtmp_real_time
-												[rtl2] PEP8, simplify, make rtmp tests run (#470)

											
										
										
											2015-01-25 18:09:48 +01:00
-												Document duration field

											
										
										
											2013-12-16 04:09:30 +01:00
+								    url:            Final video URL.
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    ext:            Video filename extension.
-												Reorder info_dict documentation

											
										
										
											2013-12-16 14:13:40 +01:00
+								    format:         The video format, defaults to ext (used for --get-format)
 								    player_url:     SWF Player URL (used for rtmpdump).
-												Clarify that url and ext are optional when formats is given (#980)

											
										
										
											2013-10-04 11:09:43 +02:00
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    The following fields are optional:
-												[cleanup, docs] Misc cleanup

Closes #2828, closes #2734, closes #2802, closes #2937

											
										
										
											2022-03-04 15:08:55 +01:00
+								    direct:         True if a direct video file was given (must only be set by GenericIE)
-												[vine] Provide alt_title (Fixes #4448)

											
										
										
											2014-12-12 03:34:28 +01:00
+								    alt_title:      A secondary title of the video.
-												[cleanup] Fix infodict returned fields (#8906)

Authored by: seproDev
											
										
										
											2024-03-08 23:36:41 +01:00
+								    display_id:     An alternative identifier for the video, not necessarily
-												Add display_id field

											
										
										
											2014-03-03 12:06:28 +01:00
+								                    unique, but available before title. Typically, id is
 								                    something like "4234987", title "Dancing naked mole rats",
 								                    and display_id "dancing-naked-mole-rats"
-												[spiegeltv] Simplify and PEP8

											
										
										
											2014-06-07 15:33:45 +02:00
+								    thumbnails:     A list of dictionaries, with the following entries:
-												Add --list-thumbnails

											
										
										
											2015-01-25 02:38:47 +01:00
+								                        * "id" (optional, string) - Thumbnail format ID
-												[spiegeltv] Simplify and PEP8

											
										
										
											2014-06-07 15:33:45 +02:00
+								                        * "url"
-												Add --list-thumbnails

											
										
										
											2015-01-25 02:38:47 +01:00
+								                        * "preference" (optional, int) - quality of the image
-												[spiegeltv] Simplify and PEP8

											
										
										
											2014-06-07 15:33:45 +02:00
+								                        * "width" (optional, int)
 								                        * "height" (optional, int)
-												[extractor/common] Fix typo in thumbnails resolution description (#21817)


											
										
										
											2019-07-17 17:47:53 +02:00
+								                        * "resolution" (optional, string "{width}x{height}",
-												[spiegeltv] Simplify and PEP8

											
										
										
											2014-06-07 15:33:45 +02:00
+								                                        deprecated)
-												[extractor/common] Introduce filesize metafield for thumbnails

											
										
										
											2016-07-08 22:24:36 +02:00
+								                        * "filesize" (optional, int)
-												[extractor] Allow `http_headers` to be specified for `thumbnails`

											
										
										
											2022-02-11 19:00:48 +01:00
+								                        * "http_headers" (dict) - HTTP headers for the request
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    thumbnail:      Full URL to a video thumbnail image.
-												[vine] Provide alt_title (Fixes #4448)

											
										
										
											2014-12-12 03:34:28 +01:00
+								    description:    Full video description.
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    uploader:       Full name of the video uploader.
-												[extractor/common] Document license metafield

											
										
										
											2016-03-02 18:06:39 +01:00
+								    license:        License name the video is licensed under.
-												[ie] Migrate commonly plural fields to lists (#8917)

Authored by: llistochek, pukkandan
Related: #3944
											
										
										
											2024-02-20 08:19:24 +01:00
+								    creators:       List of creators of the video.
-												Update to ytdl-commit-3be0980
https://github.com/ytdl-org/youtube-dl/commit/3be098010f667b14075e3dfad1e74e5e2becc8ea

											
										
										
											2021-03-15 00:22:06 +01:00
+								    timestamp:      UNIX timestamp of the moment the video was uploaded
-												[docs] Minor improvements

Closes #3127, Closes #3081, Closes #3177

											
										
										
											2022-03-24 02:30:22 +01:00
+								    upload_date:    Video upload date in UTC (YYYYMMDD).
-												[youtube:tab] Extract more playlist metadata (#2069)

* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date

Authored by: coletdjnz, pukkandan
											
										
										
											2022-01-07 12:03:02 +01:00
+								                    If not explicitly set, calculated from timestamp
 								    release_timestamp: UNIX timestamp of the moment the video was released.
 								                    If it is not clear whether to use timestamp or this, use the former
-												[docs] Minor improvements

Closes #3127, Closes #3081, Closes #3177

											
										
										
											2022-03-24 02:30:22 +01:00
+								    release_date:   The date (YYYYMMDD) when the video was released in UTC.
-												[youtube:tab] Extract more playlist metadata (#2069)

* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date

Authored by: coletdjnz, pukkandan
											
										
										
											2022-01-07 12:03:02 +01:00
+								                    If not explicitly set, calculated from release_timestamp
-												[core] Parse `release_year` from `release_date` (#8524)

Closes #7263
Authored by: seproDev
											
										
										
											2023-11-26 03:12:05 +01:00
+								    release_year:   Year (YYYY) as integer when the video or album was released.
 								                    To be used if no exact release date is known.
 								                    If not explicitly set, calculated from release_date.
-												[youtube:tab] Extract more playlist metadata (#2069)

* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date

Authored by: coletdjnz, pukkandan
											
										
										
											2022-01-07 12:03:02 +01:00
+								    modified_timestamp: UNIX timestamp of the moment the video was last modified.
-												[docs] Minor improvements

Closes #3127, Closes #3081, Closes #3177

											
										
										
											2022-03-24 02:30:22 +01:00
+								    modified_date:   The date (YYYYMMDD) when the video was last modified in UTC.
-												[youtube:tab] Extract more playlist metadata (#2069)

* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date

Authored by: coletdjnz, pukkandan
											
										
										
											2022-01-07 12:03:02 +01:00
+								                    If not explicitly set, calculated from modified_timestamp
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    uploader_id:    Nickname or id of the video uploader.
-												[extractor/common] Document uploader_url

											
										
										
											2016-03-02 18:31:24 +01:00
+								    uploader_url:   Full URL to a personal webpage of the video uploader.
-												[extractor/common] Introduce channel meta fields

											
										
										
											2018-09-14 20:23:36 +02:00
+								    channel:        Full name of the channel the video is uploaded on.
-												[extractor/common] Fix typos

											
										
										
											2018-09-14 20:53:01 +02:00
+								                    Note that channel fields may or may not repeat uploader
-												[extractor/common] Introduce channel meta fields

											
										
										
											2018-09-14 20:23:36 +02:00
+								                    fields. This depends on a particular extractor.
 								    channel_id:     Id of the channel.
 								    channel_url:    Full URL to a channel webpage.
-												[youtube] Extract channel subscriber count (#2399)

Closes #2350
* Adds `channel_follower_count` field
Authored-by: coletdjnz
											
										
										
											2022-01-21 07:04:36 +01:00
+								    channel_follower_count: Number of followers of the channel.
-												[extractor/youtube] Extract `channel_is_verified` (#7213)

Authored by: coletdjnz

											
										
										
											2023-06-08 09:50:05 +02:00
+								    channel_is_verified: Whether the channel is verified on the platform.
-												[muscivault] Add extractor (Fixes #3593)

											
										
										
											2014-08-27 01:44:47 +02:00
+								    location:       Physical location where the video was filmed.
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											2015-02-15 18:03:41 +01:00
+								    subtitles:      The available subtitles as a dictionary in the format
-												[extractor/common] Allow non-lang in subtitles' keys

See 264e77c406a3b14f15aafcd036524cb6fe86aa20

											
										
										
											2016-12-24 18:50:50 +01:00
+								                    {tag: subformats}. "tag" is usually a language code, and
 								                    "subformats" is a list sorted from lower to higher
 								                    preference, each element is a dictionary with the "ext"
 								                    entry and one of:
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											2015-02-15 18:03:41 +01:00
+								                        * "data": The subtitles file contents
-												[extractor/common] Consistent URL spelling

											
										
										
											2015-07-23 19:37:45 +02:00
+								                        * "url": A URL pointing to the subtitles file
-												Add field `name` for subtitles

Co-authored by: pukkandan, tpikonen

Based on: #310, https://github.com/ytdl-org/youtube-dl/pull/26112

											
										
										
											2021-05-12 21:37:58 +02:00
+								                    It can optionally also have:
 								                        * "name": Name or description of the subtitles
-												[cleanup, docs] Misc cleanup

Closes #2828, closes #2734, closes #2802, closes #2937

											
										
										
											2022-03-04 15:08:55 +01:00
+								                        * "http_headers": A dictionary of additional HTTP headers
-												[extractor] Allow `http_headers` to be specified for `thumbnails`

											
										
										
											2022-02-11 19:00:48 +01:00
+								                                  to add to the request.
-												[YoutubeDL] Autocalculate ext for subtitles when missing

											
										
										
											2015-10-04 16:33:42 +02:00
+								                    "ext" will be calculated from URL if missing
-												[documentation] Add deprecated options and aliases in readme

											
										
										
											2021-03-18 16:34:09 +01:00
+								    automatic_captions: Like 'subtitles'; contains automatically generated
 								                    captions instead of normal subtitles
-												[extractor/common] Clarify duration can be float

											
										
										
											2015-12-03 15:55:02 +01:00
+								    duration:       Length of the video in seconds, as an integer or float.
-												Document view_count (Closes #963)

											
										
										
											2013-06-29 16:32:28 +02:00
+								    view_count:     How many users have watched the video on the platform.
-												[extractor/youtube] Extract concurrent view count for livestreams (#5152)

Adds new field `concurrent_view_count`
Closes https://github.com/yt-dlp/yt-dlp/issues/4843

Authored by: coletdjnz
											
										
										
											2022-10-07 09:00:40 +02:00
+								    concurrent_view_count: How many users are currently watching the video on the platform.
-												[9gag] Like/dislike count (#1895)

											
										
										
											2013-12-05 18:29:07 +01:00
+								    like_count:     Number of positive ratings of the video
 								    dislike_count:  Number of negative ratings of the video
-												[extractor/common] Document repost_count

											
										
										
											2015-10-18 05:34:54 +02:00
+								    repost_count:   Number of reposts of the video
-												[youtube] Extract average rating (closes #2362)

											
										
										
											2015-02-11 18:39:31 +01:00
+								    average_rating: Average rating given by users, the scale used depends on the webpage
-												[9gag] Like/dislike count (#1895)

											
										
										
											2013-12-05 18:29:07 +01:00
+								    comment_count:  Number of comments on the video
-												[netzkino] Add new extractor (Fixes #4669)

											
										
										
											2015-01-09 23:59:18 +01:00
+								    comments:       A list of comments, each with one or more of the following
 								                    properties (all but one of text or html optional):
 								                        * "author" - human-readable name of the comment author
 								                        * "author_id" - user ID of the comment author
-												[Youtube] Rewrite comment extraction (#167)

Closes #121

TODO:
* Add an option for the user to specify newest/popular and max number of comments
* Refactor the download code and generalize with TabIE
* Parse time_text to timestamp

											
										
										
											2021-03-14 23:41:11 +01:00
+								                        * "author_thumbnail" - The thumbnail of the comment author
-												[extractor/youtube] Extract more metadata for comments (#7179)

Adds new comment fields:
* `author_url` - The url to the comment author's page
* `author_is_verified` - Whether the author is verified on the platform
* `is_pinned` - Whether the comment is pinned to the top of the comments

Closes https://github.com/yt-dlp/yt-dlp/issues/5411

Authored by: coletdjnz
											
										
										
											2023-06-01 10:43:32 +02:00
+								                        * "author_url" - The url to the comment author's page
 								                        * "author_is_verified" - Whether the author is verified
 								                                                 on the platform
 								                        * "author_is_uploader" - Whether the comment is made by
 								                                                 the video uploader
-												[netzkino] Add new extractor (Fixes #4669)

											
										
										
											2015-01-09 23:59:18 +01:00
+								                        * "id" - Comment ID
 								                        * "html" - Comment as HTML
 								                        * "text" - Plain text of the comment
 								                        * "timestamp" - UNIX timestamp of comment
 								                        * "parent" - ID of the comment this one is replying to.
 								                                     Set to "root" to indicate that this is a
 								                                     comment to the original video.
-												[Youtube] Rewrite comment extraction (#167)

Closes #121

TODO:
* Add an option for the user to specify newest/popular and max number of comments
* Refactor the download code and generalize with TabIE
* Parse time_text to timestamp

											
										
										
											2021-03-14 23:41:11 +01:00
+								                        * "like_count" - Number of positive ratings of the comment
 								                        * "dislike_count" - Number of negative ratings of the comment
 								                        * "is_favorited" - Whether the comment is marked as
 								                                           favorite by the video uploader
-												[extractor/youtube] Extract more metadata for comments (#7179)

Adds new comment fields:
* `author_url` - The url to the comment author's page
* `author_is_verified` - Whether the author is verified on the platform
* `is_pinned` - Whether the comment is pinned to the top of the comments

Closes https://github.com/yt-dlp/yt-dlp/issues/5411

Authored by: coletdjnz
											
										
										
											2023-06-01 10:43:32 +02:00
+								                        * "is_pinned" - Whether the comment is pinned to
 								                                        the top of the comments
-												Allow users to specify an age limit (fixes #1545)

With these changes, users can now restrict what videos are downloaded by the intented audience, by specifying their age with --age-limit YEARS .
Add rudimentary support in youtube, pornotube, and youporn.

											
										
										
											2013-10-06 06:06:30 +02:00
+								    age_limit:      Age restriction for the video, as an integer (years)
-												Completely change project name to yt-dlp (#85)

* All modules and binary names are changed
* All documentation references changed
* yt-dlp no longer loads youtube-dlc config files
* All URLs changed to point to organization account

Co-authored-by: Pccode66
Co-authored-by: pukkandan
											
										
										
											2021-02-24 19:45:56 +01:00
+								    webpage_url:    The URL to the video webpage, if given to yt-dlp it
-												Add the 'webpage_url' field to info_dict

The url for the video page, it must allow to reproduce the result.
It's automatically set by YoutubeDL if it's missing.

											
										
										
											2013-11-03 12:11:13 +01:00
+								                    should allow to get the same result again. (It will be set
 								                    by YoutubeDL if it's missing)
-												Document and test categories (#2923)

											
										
										
											2014-05-15 12:41:42 +02:00
+								    categories:     A list of categories that the video falls in, for example
 								                    ["Sports", "Berlin"]
-												[extractor/common] Add _meta_regex and clarify tags field

											
										
										
											2015-07-28 23:43:03 +02:00
+								    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
-												[pornhub] Extract `cast`
Closes #406, https://github.com/ytdl-org/youtube-dl/pull/27384

											
										
										
											2021-06-13 18:06:47 +02:00
+								    cast:           A list of the video cast
-												[muenchentv] Add support (Fixes #3507)

											
										
										
											2014-09-19 09:57:53 +02:00
+								    is_live:        True, False, or None (=unknown). Whether this video is a
 								                    live stream that goes on instead of a fixed-length video.
-												[youtube] Show if video was a live stream in info

											
										
										
											2021-02-21 22:41:24 +01:00
+								    was_live:       True, False, or None (=unknown). Whether this video was
 								                    originally a live stream.
-												Minor bugfixes

											
										
										
											2022-08-02 00:10:47 +02:00
+								    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
-												[extractor/youtube] Add `live_status=post_live` (#4495)

Related: https://github.com/yt-dlp/yt-dlp/issues/1564
Authored by: lazypete365
											
										
										
											2022-07-31 22:12:04 +02:00
+								                    or 'post_live' (was live, but VOD is not yet processed)
-												Add field `live_status`

											
										
										
											2021-07-21 17:14:18 +02:00
+								                    If absent, automatically set from is_live, was_live
-												[youtube] Extract start_time

From the 't=*' in the url.
Currently youtube-dl doesn't use the value, but it was requested for the mpv plugin.

											
										
										
											2015-07-20 21:10:28 +02:00
+								    start_time:     Time in seconds where the playback should start, as
-												[extractor/common] Consistent URL spelling

											
										
										
											2015-07-23 19:37:45 +02:00
+								                    specified in the URL.
-												[youtube] Extract end_time

											
										
										
											2015-07-23 13:20:21 +02:00
+								    end_time:       Time in seconds where the playback should end, as
-												[extractor/common] Consistent URL spelling

											
										
										
											2015-07-23 19:37:45 +02:00
+								                    specified in the URL.
-												[common] introduce chapters field

											
										
										
											2016-05-05 22:40:19 +02:00
+								    chapters:       A list of dictionaries, with the following entries:
 								                        * "start_time" - The start time of the chapter in seconds
 								                        * "end_time" - The end time of the chapter in seconds
 								                        * "title" (optional, string)
-												[extractor/youtube] Extract `heatmap` data (#7100)

Closes #3888
Authored by: tntmod54321
											
										
										
											2023-05-26 14:24:39 +02:00
+								    heatmap:        A list of dictionaries, with the following entries:
 								                        * "start_time" - The start time of the data point in seconds
 								                        * "end_time" - The end time of the data point in seconds
 								                        * "value" - The normalized value of the data point (float between 0 and 1)
-												[documentaion] Document `playable_in_embed`

:ci skip all

											
										
										
											2021-01-14 10:07:23 +01:00
+								    playable_in_embed: Whether this video is allowed to play in embedded
 								                    players on other sites. Can be True (=always allowed),
 								                    False (=never allowed), None (=unknown), or a string
-												[docs] Consistent use of `e.g.` (#4643)

Authored by: Lesmiscore
											
										
										
											2022-08-14 14:04:13 +02:00
+								                    specifying the criteria for embedability; e.g. 'whitelist'
-												[youtube] Show if video is `private`, `unlisted` etc in new field `availability` (#188)
Closes: #185, https://github.com/ytdl-org/youtube-dl/issues/25631

Authored by: colethedj, pukkandan

											
										
										
											2021-03-21 22:23:34 +01:00
+								    availability:   Under what condition the video is available. One of
 								                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
 								                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
 								                    to set it
-												[ie] Add `media_type` field

Authored by: trainman261

											
										
										
											2023-12-12 00:52:59 +01:00
+								    media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
-												[extractor] Support multiple archive ids for one video (#4307)

Closes #4352

											
										
										
											2022-07-13 11:33:05 +02:00
+								    _old_archive_ids: A list of old archive ids needed for backward compatibility
-												Implement universal format sorting

Closes #5566

											
										
										
											2022-11-17 06:23:05 +01:00
+								    _format_sort_fields: A list of fields to use for sorting formats
-												Extract comments only when needed #95 (Closes #94)


											
										
										
											2021-02-28 15:56:08 +01:00
+								    __post_extractor: A function to be called just before the metadata is
 								                    written to either disk, logger or console. The function
 								                    must return a dict which will be added to the info_dict.
 								                    This is useful for additional information that is
 								                    time-consuming to extract. Note that the fields thus
 								                    extracted will not be available to output template and
 								                    match_filter. So, only "comments" and "comment_count" are
 								                    currently allowed to be extracted via this method.
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[extractor/common] Document chapter and series fields

											
										
										
											2015-12-30 22:10:44 +01:00
+								    The following fields should only be used when the video belongs to some logical
 								    chapter or section:
 								    chapter:        Name or title of the chapter the video belongs to.
-												[extractor/common] Introduce number fields for chapters and series

											
										
										
											2016-01-01 15:26:56 +01:00
+								    chapter_number: Number of the chapter the video belongs to, as an integer.
 								    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
-												[extractor/common] Document chapter and series fields

											
										
										
											2015-12-30 22:10:44 +01:00
 								    The following fields should only be used when the video is an episode of some
-												[extractor/common] Mention podcast in series fields section

											
										
										
											2016-10-16 13:37:17 +02:00
+								    series, programme or podcast:
-												[extractor/common] Document chapter and series fields

											
										
										
											2015-12-30 22:10:44 +01:00
 								    series:         Title of the series or programme the video episode belongs to.
-												[curiositystream] Add more metadata
Closes #1568

											
										
										
											2021-11-13 19:19:14 +01:00
+								    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
-												[extractor/common] Document chapter and series fields

											
										
										
											2015-12-30 22:10:44 +01:00
+								    season:         Title of the season the video episode belongs to.
-												[extractor/common] Introduce number fields for chapters and series

											
										
										
											2016-01-01 15:26:56 +01:00
+								    season_number:  Number of the season the video episode belongs to, as an integer.
 								    season_id:      Id of the season the video episode belongs to, as a unicode string.
-												[extractor/common] Document chapter and series fields

											
										
										
											2015-12-30 22:10:44 +01:00
+								    episode:        Title of the video episode. Unlike mandatory video title field,
 								                    this field should denote the exact title of the video episode
 								                    without any kind of decoration.
-												[extractor/common] Introduce number fields for chapters and series

											
										
										
											2016-01-01 15:26:56 +01:00
+								    episode_number: Number of the video episode within a season, as an integer.
 								    episode_id:     Id of the video episode, as a unicode string.
-												[extractor/common] Document chapter and series fields

											
										
										
											2015-12-30 22:10:44 +01:00
-												[extractor/common] Introduce music album metafields

											
										
										
											2016-04-06 22:53:53 +02:00
+								    The following fields should only be used when the media is a track or a part of
 								    a music album:
 								    track:          Title of the track.
 								    track_number:   Number of the track within an album or a disc, as an integer.
 								    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 								                    as a unicode string.
-												[ie] Migrate commonly plural fields to lists (#8917)

Authored by: llistochek, pukkandan
Related: #3944
											
										
										
											2024-02-20 08:19:24 +01:00
+								    artists:        List of artists of the track.
 								    composers:      List of composers of the piece.
 								    genres:         List of genres of the track.
-												[extractor/common] Introduce music album metafields

											
										
										
											2016-04-06 22:53:53 +02:00
+								    album:          Title of the album the track belongs to.
 								    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
-												[ie] Migrate commonly plural fields to lists (#8917)

Authored by: llistochek, pukkandan
Related: #3944
											
										
										
											2024-02-20 08:19:24 +01:00
+								    album_artists:  List of all artists that appeared on the album.
 								                    E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
 								                    Useful for splits and compilations.
-												[extractor/common] Introduce music album metafields

											
										
										
											2016-04-06 22:53:53 +02:00
+								    disc_number:    Number of the disc or other physical medium the track belongs to,
 								                    as an integer.
-												Allow extractors to specify section_start/end for clips

											
										
										
											2022-06-21 23:02:14 +02:00
+								    The following fields should only be set for clips that should be cut from the original video:
 								    section_start:  Start time of the section in seconds
 								    section_end:    End time of the section in seconds
-												[extractor/youtube] More metadata for storyboards (#4334)

Authored by: ftk
											
										
										
											2022-07-12 17:16:45 +02:00
+								    The following fields should only be set for storyboards:
 								    rows:           Number of rows in each storyboard fragment, as an integer
 								    columns:        Number of columns in each storyboard fragment, as an integer
-												[ie] Migrate commonly plural fields to lists (#8917)

Authored by: llistochek, pukkandan
Related: #3944
											
										
										
											2024-02-20 08:19:24 +01:00
+								    The following fields are deprecated and should not be set by new code:
 								    composer:       Use "composers" instead.
 								                    Composer(s) of the piece, comma-separated.
 								    artist:         Use "artists" instead.
 								                    Artist(s) of the track, comma-separated.
 								    genre:          Use "genres" instead.
 								                    Genre(s) of the track, comma-separated.
 								    album_artist:   Use "album_artists" instead.
 								                    All artists that appeared on the album, comma-separated.
 								    creator:        Use "creators" instead.
 								                    The creator of the video.
-												Document formats (for #980)

											
										
										
											2013-10-04 10:40:42 +02:00
+								    Unless mentioned otherwise, the fields should be Unicode strings.
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[utils] Default age_limit to None

If we can't parse it, it means we don't have any information, not that the content is unrestricted.

											
										
										
											2014-10-03 20:17:10 +02:00
+								    Unless mentioned otherwise, None is equivalent to absence of information.
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											2014-11-20 16:47:59 +01:00
 								    _type "playlist" indicates multiple videos.
-												Allow iterators for playlist result entries

											
										
										
											2014-12-06 14:02:19 +01:00
+								    There must be a key "entries", which is a list, an iterable, or a PagedList
 								    object, each element of which is a valid dictionary by this specification.
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											2014-11-20 16:47:59 +01:00
-												[cleanup] Fix some typos (#4194)

Authored by: crazymoose77756
											
										
										
											2022-06-27 02:50:06 +02:00
+								    Additionally, playlists can have "id", "title", and any other relevant
-												[youtube] More metadata extraction for channels/playlists

											
										
										
											2021-02-02 17:21:32 +01:00
+								    attributes with the same semantics as videos (see above).
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											2014-11-20 16:47:59 +01:00
-												[youtube:tab] Extract more playlist metadata (#2069)

* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date

Authored by: coletdjnz, pukkandan
											
										
										
											2022-01-07 12:03:02 +01:00
+								    It can also have the following optional fields:
 								    playlist_count: The total number of videos in a playlist. If not given,
 								                    YoutubeDL tries to calculate it from "entries"
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											2014-11-20 16:47:59 +01:00
 								    _type "multi_video" indicates that there are multiple videos that
 								    form a single show, for example multiple acts of an opera or TV episode.
 								    It must have an entries key like a playlist and contain all the keys
 								    required for a video at the same time.
 								    _type "url" indicates that the video must be extracted from another
 								    location, possibly by a different extractor. Its only required key is:
 								    "url" - the next URL to extract.
-												[extractor/common] Document ie_key in url results

											
										
										
											2014-12-09 10:58:06 +01:00
+								    The key "ie_key" can be set to the class name (minus the trailing "IE",
 								    e.g. "Youtube") if the extractor class is known in advance.
 								    Additionally, the dictionary may have any properties of the resolved entity
 								    known in advance, for example "title" if the title of the referred video is
-												[extractor/common] Document _type values (Motivated by #4254)

											
										
										
											2014-11-20 16:47:59 +01:00
+								    known ahead of time.
 								    _type "url_transparent" entities have the same specification as "url", but
 								    indicate that the given additional information is more precise than the one
 								    associated with the resolved URL.
 								    This is useful when a site employs a video service that hosts the video and
 								    its technical metadata, but that video service does not embed a useful
 								    title, description etc.
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    Subclasses of this should also be added to the list of extractors and
-												[extractor] Support multiple `_VALID_URL`s (#5812)

Authored by: nixxo

											
										
										
											2023-06-21 23:27:00 +02:00
+								    should define _VALID_URL as a regexp or a Sequence of regexps, and
 								    re-define the _real_extract() and (optionally) _real_initialize() methods.
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[docs,cleanup] Some minor refactoring and improve docs

											
										
										
											2021-09-17 20:23:55 +02:00
+								    Subclasses may also override suitable() if necessary, but ensure the function
 								    signature is preserved and that this function imports everything it needs
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								    (except other extractors), so that lazy_extractors works correctly.
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
 								    the HTML of Generic webpages. It may also override _extract_embed_urls
 								    or _extract_from_webpage as necessary. While these are normally classmethods,
 								    _extract_from_webpage is allowed to be an instance method.
 								    _extract_from_webpage may raise self.StopExtraction() to stop further
 								    processing of the webpage and obtain exclusive rights to it. This is useful
-												[docs] Consistent use of `e.g.` (#4643)

Authored by: Lesmiscore
											
										
										
											2022-08-14 14:04:13 +02:00
+								    when the extractor cannot reliably be matched using just the URL,
 								    e.g. invidious/peertube instances
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
 								    Embed-only extractors can be defined by setting _VALID_URL = False.
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								    To support username + password (or netrc) login, the extractor must define a
 								    _NETRC_MACHINE and re-define _perform_login(username, password) and
 								    (optionally) _initialize_pre_login() methods. The _perform_login method will
 								    be called between _initialize_pre_login and _real_initialize if credentials
 								    are passed by the user. In cases where it is necessary to have the login
 								    process as part of the extraction rather than initialization, _perform_login
 								    can be left undefined.
-												[docs,cleanup] Some minor refactoring and improve docs

											
										
										
											2021-09-17 20:23:55 +02:00
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								    _GEO_BYPASS attribute may be set to False in order to disable
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								    geo restriction bypass mechanisms for a particular extractor.
 								    Though it won't disable explicit geo restriction bypass based on
-												Remove experimental mark for some options

											
										
										
											2018-05-19 18:53:24 +02:00
+								    country code provided with geo_bypass_country.
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
 								    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 								    countries for this extractor. One of these countries will be used by
 								    geo restriction bypass mechanism right away in order to bypass
-												Remove experimental mark for some options

											
										
										
											2018-05-19 18:53:24 +02:00
+								    geo restriction, of course, if the mechanism is not disabled.
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 								    IP blocks in CIDR notation for this extractor. One of these IP blocks
 								    will be used by geo restriction bypass mechanism similarly
-												Remove experimental mark for some options

											
										
										
											2018-05-19 18:53:24 +02:00
+								    to _GEO_COUNTRIES.
-												[extractor/common] Emphasize geo bypass APIs are experimental

											
										
										
											2017-02-20 17:21:15 +01:00
-												Add option `--use-extractors`

Deprecates `--force-generic-extractor`

Closes #3234, Closes #2044

Related: #4307, #1791

											
										
										
											2022-08-24 02:12:16 +02:00
+								    The _ENABLED attribute should be set to False for IEs that
 								    are disabled by default and must be explicitly enabled.
-												[docs,cleanup] Some minor refactoring and improve docs

											
										
										
											2021-09-17 20:23:55 +02:00
+								    The _WORKING attribute should be set to False for broken IEs
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    in order to warn the users and skip the tests.
 								    """
 								    _ready = False
 								    _downloader = None
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								    _x_forwarded_for_ip = None
-												Improve geo bypass mechanism
* Rename options to preffixly match with --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								    _GEO_BYPASS = True
 								    _GEO_COUNTRIES = None
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								    _GEO_IP_BLOCKS = None
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    _WORKING = True
-												Add option `--use-extractors`

Deprecates `--force-generic-extractor`

Closes #3234, Closes #2044

Related: #4307, #1791

											
										
										
											2022-08-24 02:12:16 +02:00
+								    _ENABLED = True
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								    _NETRC_MACHINE = None
-												Fix bug in 52efa4b31200119adaa8acf33e50b84fcb6948f0

Closes #3173

											
										
										
											2022-03-24 02:23:11 +01:00
+								    IE_DESC = None
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								    SEARCH_KEY = None
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    _VALID_URL = None
 								    _EMBED_REGEX = []
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								    def _login_hint(self, method=NO_DEFAULT, netrc=None):
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								        return {
 								            None: '',
 								            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
 								            'password': f'Use {password_hint}',
 								            'cookies': (
 								                'Use --cookies-from-browser or --cookies for the authentication. '
-												[docs] Improvements

* Move detailed installation instructions to https://github.com/yt-dlp/yt-dlp/wiki/Installation
* Link to wiki where applicable
* Fix some mistakes. Closes #4853, Closes #4855, Closes #4852
* Improve some error messages

											
										
										
											2022-09-07 14:05:45 +02:00
+								                'See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies'),
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
-												[youtube] Better message when login required

											
										
										
											2021-05-19 15:41:44 +02:00
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    def __init__(self, downloader=None):
-												[cleanup] misc

											
										
										
											2021-10-22 22:37:20 +02:00
+								        """Constructor. Receives an optional downloader (a YoutubeDL instance).
 								        If a downloader is not passed during initialization,
 								        it must be set using "set_downloader()" before "extract()" is called"""
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        self._ready = False
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								        self._x_forwarded_for_ip = None
-												[extractor] Reset non-repeating warnings per video

											
										
										
											2021-08-06 00:17:11 +02:00
+								        self._printed_messages = set()
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        self.set_downloader(downloader)
 								    @classmethod
-												[extractor] Common function `_match_valid_url`

											
										
										
											2021-08-19 03:41:24 +02:00
+								    def _match_valid_url(cls, url):
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								        if cls._VALID_URL is False:
 								            return None
-												Cache suitable regular expressions

This speeds up TestAllURLsMatching.test_no_duplicates by about 8000% at the cost of minimal memory overhead.

											
										
										
											2013-08-21 04:06:46 +02:00
+								        # This does not use has/getattr intentionally - we want to know whether
 								        # we have cached the regexp for *this* class, whereas getattr would also
 								        # match the superclass
 								        if '_VALID_URL_RE' not in cls.__dict__:
-												[extractor] Support multiple `_VALID_URL`s (#5812)

Authored by: nixxo

											
										
										
											2023-06-21 23:27:00 +02:00
+								            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
 								        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
-												[extractor] Common function `_match_valid_url`

											
										
										
											2021-08-19 03:41:24 +02:00
 								    @classmethod
 								    def suitable(cls, url):
 								        """Receives a URL and returns True if suitable for this IE."""
-												[lazy_extractors] Fix `suitable` and add flake8 test

											
										
										
											2021-08-22 21:19:23 +02:00
+								        # This function must import everything it needs (except other extractors),
 								        # so that lazy_extractors works correctly
-												[extractor] Common function `_match_valid_url`

											
										
										
											2021-08-19 03:41:24 +02:00
+								        return cls._match_valid_url(url) is not None
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[common] Add new helper function _match_id

											
										
										
											2014-09-28 09:31:58 +02:00
+								    @classmethod
 								    def _match_id(cls, url):
-												[extractor] Common function `_match_valid_url`

											
										
										
											2021-08-19 03:41:24 +02:00
+								        return cls._match_valid_url(url).group('id')
-												[common] Add new helper function _match_id

											
										
										
											2014-09-28 09:31:58 +02:00
-												[extractor] Show video id in error messages if possible

											
										
										
											2021-08-19 03:49:23 +02:00
+								    @classmethod
 								    def get_temp_id(cls, url):
 								        # Best-effort lookup of the video id from the URL alone: returns None
 								        # instead of raising when the id cannot be determined.
 								        try:
 								            return cls._match_id(url)
 								        # AttributeError: _match_valid_url() returned None (URL did not match),
 								        # so .group('id') was called on None.
 								        # IndexError: the matching pattern has no group named 'id'.
 								        except (IndexError, AttributeError):
 								            return None
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    @classmethod
 								    def working(cls):
 								        """Getter method for _WORKING."""
 								        # Classmethod, so the flag can be read without instantiating the extractor.
 								        return cls._WORKING
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								    @classmethod
 								    def supports_login(cls):
 								        # True when the class defines a truthy _NETRC_MACHINE. initialize() uses
 								        # this to decide whether to look up credentials and call _perform_login().
 								        return bool(cls._NETRC_MACHINE)
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    def initialize(self):
 								        """Initializes an instance (authentication, etc)."""
-												[extractor] Reset non-repeating warnings per video

											
										
										
											2021-08-06 00:17:11 +02:00
+								        self._printed_messages = set()
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								        self._initialize_geo_bypass({
 								            'countries': self._GEO_COUNTRIES,
 								            'ip_blocks': self._GEO_IP_BLOCKS,
 								        })
-												Improve geo bypass mechanism
* Rename options so that their prefix matches --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								        if not self._ready:
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								            self._initialize_pre_login()
 								            if self.supports_login():
 								                username, password = self._get_login_info()
 								                if username:
 								                    self._perform_login(username, password)
 								            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
-												Improve geo bypass mechanism
* Rename options so that their prefix matches --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								            self._real_initialize()
 								            self._ready = True
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								    def _initialize_geo_bypass(self, geo_bypass_context):
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											2017-02-21 17:00:43 +01:00
+								        """
 								        Initialize geo restriction bypass mechanism.
 								        This method is used to initialize geo bypass mechanism based on faking
 								        X-Forwarded-For HTTP header. A random country from provided country list
-												[extractor/common] Fix typo

											
										
										
											2017-02-21 17:05:31 +01:00
+								        is selected and a random IP belonging to this country is generated. This
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											2017-02-21 17:00:43 +01:00
+								        IP will be passed as X-Forwarded-For HTTP header in all subsequent
 								        HTTP requests.
 								        This method will be used for initial geo bypass mechanism initialization
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								        during the instance initialization with _GEO_COUNTRIES and
 								        _GEO_IP_BLOCKS.
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											2017-02-21 17:00:43 +01:00
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								        You may also manually call it from extractor's code if geo bypass
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											2017-02-21 17:00:43 +01:00
+								        information is not available beforehand (e.g. obtained during
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								        extraction) or due to some other reason. In this case you should pass
 								        this information in geo bypass context passed as first argument. It may
 								        contain following fields:
 								        countries:  List of geo unrestricted countries (similar
 								                    to _GEO_COUNTRIES)
 								        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 								                    (similar to _GEO_IP_BLOCKS)
-												[extractor/common] Allow calling _initialize_geo_bypass from extractors (#11970)

											
										
										
											2017-02-21 17:00:43 +01:00
+								        """
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								        if not self._x_forwarded_for_ip:
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
 								            # Geo bypass mechanism is explicitly disabled by user
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								            if not self.get_param('geo_bypass', True):
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								                return
 								            if not geo_bypass_context:
 								                geo_bypass_context = {}
 								            # Backward compatibility: previously _initialize_geo_bypass
 								            # expected a list of countries, some 3rd party code may still use
 								            # it this way
 								            if isinstance(geo_bypass_context, (list, tuple)):
 								                geo_bypass_context = {
 								                    'countries': geo_bypass_context,
 								                }
 								            # The whole point of geo bypass mechanism is to fake IP
 								            # as X-Forwarded-For HTTP header based on some IP block or
 								            # country code.
 								            # Path 1: bypassing based on IP block in CIDR notation
 								            # Explicit IP block specified by user, use it right away
 								            # regardless of whether extractor is geo bypassable or not
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								            ip_block = self.get_param('geo_bypass_ip_block', None)
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
 								            # Otherwise use random IP block from geo bypass context but only
 								            # if extractor is known as geo bypassable
 								            if not ip_block:
 								                ip_blocks = geo_bypass_context.get('ip_blocks')
 								                if self._GEO_BYPASS and ip_blocks:
 								                    ip_block = random.choice(ip_blocks)
 								            if ip_block:
 								                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
-												[cleanup] Misc fixes and cleanup

Closes #3780, Closes #3853, Closes #3850

											
										
										
											2022-05-27 01:06:23 +02:00
+								                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
+								                return
 								            # Path 2: bypassing based on country code
 								            # Explicit country code specified by user, use it right away
 								            # regardless of whether extractor is geo bypassable or not
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								            country = self.get_param('geo_bypass_country', None)
-												Improve geo bypass mechanism
* Introduce geo bypass context
* Add ability to bypass based on IP blocks in CIDR notation
* Introduce --geo-bypass-ip-block

											
										
										
											2018-05-02 02:18:01 +02:00
 								            # Otherwise use random country code from geo bypass context but
 								            # only if extractor is known as geo bypassable
 								            if not country:
 								                countries = geo_bypass_context.get('countries')
 								                if self._GEO_BYPASS and countries:
 								                    country = random.choice(countries)
 								            if country:
 								                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
-												Standardize `write_debug`

											
										
										
											2021-05-14 09:45:29 +02:00
+								                self._downloader.write_debug(
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								    def extract(self, url):
 								        """Extracts URL information and returns it in list of dicts."""
-												[extractor/common] Wrap extractor errors (Fixes #1194)

For now, we just wrap some common errors. More may follow. We do not want to catch actual programming errors in the extractors, such as 1 // 0.

											
										
										
											2015-02-10 01:13:57 +01:00
+								        try:
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								            for _ in range(2):
 								                try:
 								                    self.initialize()
-												[cleanup] Misc

											
										
										
											2022-11-30 07:04:51 +01:00
+								                    self.to_screen('Extracting URL: %s' % (
 								                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
-												Add faked X-Forwarded-For to formats' HTTP headers

											
										
										
											2017-02-04 15:06:07 +01:00
+								                    ie_result = self._real_extract(url)
-												[cleanup] linter, code formatting and readme

											
										
										
											2021-05-18 20:20:59 +02:00
+								                    if ie_result is None:
 								                        return None
-												Add faked X-Forwarded-For to formats' HTTP headers

											
										
										
											2017-02-04 15:06:07 +01:00
+								                    if self._x_forwarded_for_ip:
 								                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
-												`--compat-option no-live-chat` should disable danmaku

Closes #4387

											
										
										
											2022-07-19 11:57:11 +02:00
+								                    subtitles = ie_result.get('subtitles') or {}
 								                    if 'no-live-chat' in self.get_param('compat_opts'):
 								                        for lang in ('live_chat', 'comments', 'danmaku'):
 								                            subtitles.pop(lang, None)
-												Add faked X-Forwarded-For to formats' HTTP headers

											
										
										
											2017-02-04 15:06:07 +01:00
+								                    return ie_result
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								                except GeoRestrictedError as e:
-												Improve geo bypass mechanism
* Rename options so that their prefix matches --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								                    if self.__maybe_fake_ip_and_retry(e.countries):
 								                        continue
-												Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

											
										
										
											2017-02-04 12:49:58 +01:00
+								                    raise
-												[extractor] Fix some errors being converted to `ExtractorError`

											
										
										
											2021-10-26 16:47:29 +02:00
+								        except UnsupportedError:
 								            raise
-												[extractor] Show video id in error messages if possible

											
										
										
											2021-08-19 03:49:23 +02:00
+								        except ExtractorError as e:
-												[cleanup] Misc

											
										
										
											2023-07-30 00:06:17 +02:00
+								            e.video_id = e.video_id or self.get_temp_id(url)
-												[cleanup] Fix misc bugs (#8968)

Closes #8816

Authored by: bashonly, seproDev, pukkandan, Grub4k

											
										
										
											2024-03-10 15:22:49 +01:00
+								            e.ie = e.ie or self.IE_NAME
-												[utils] Make `ExtractorError` mutable

											
										
										
											2022-11-30 01:40:26 +01:00
+								            e.traceback = e.traceback or sys.exc_info()[2]
 								            raise
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        except IncompleteRead as e:
-												[extractor] Show video id in error messages if possible

											
										
										
											2021-08-19 03:49:23 +02:00
+								            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
-												[escapist] Filter video differently (Fixes #4919)

											
										
										
											2015-02-10 15:55:51 +01:00
+								        except (KeyError, StopIteration) as e:
-												[extractor] Show video id in error messages if possible

											
										
										
											2021-08-19 03:49:23 +02:00
+								            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												Improve geo bypass mechanism
* Rename options so that their prefix matches --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								    def __maybe_fake_ip_and_retry(self, countries):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        if (not self.get_param('geo_bypass_country', None)
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											2019-05-10 22:56:22 +02:00
+								                and self._GEO_BYPASS
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								                and self.get_param('geo_bypass', True)
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											2019-05-10 22:56:22 +02:00
+								                and not self._x_forwarded_for_ip
 								                and countries):
-												[extractor/common] Print origin country for fake IP

											
										
										
											2017-02-21 17:14:33 +01:00
+								            country_code = random.choice(countries)
 								            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
-												Improve geo bypass mechanism
* Rename options so that their prefix matches --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								            if self._x_forwarded_for_ip:
 								                self.report_warning(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    'Video is geo restricted. Retrying extraction with fake IP '
 								                    f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.')
-												Improve geo bypass mechanism
* Rename options so that their prefix matches --geo-verification-proxy
* Introduce _GEO_COUNTRIES for extractors
* Implement faking IP right away for sites with known geo restriction

											
										
										
											2017-02-18 21:53:23 +01:00
+								                return True
 								        return False
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    def set_downloader(self, downloader):
-												[cleanup, docs] Misc cleanup

Closes #2828, closes #2734, closes #2802, closes #2937

											
										
										
											2022-03-04 15:08:55 +01:00
+								        """Sets a YoutubeDL instance as the downloader for this IE."""
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        self._downloader = downloader
-												[extractor, cleanup] Reduce direct use of `_downloader`

											
										
										
											2022-06-23 06:14:22 +02:00
+								    @property
 								    def cache(self):
 								        # Shortcut to the cache of the downloader stored by set_downloader().
 								        # NOTE: the constructor's default downloader is None, in which case this
 								        # attribute access fails until a real downloader has been set.
 								        return self._downloader.cache
 								    @property
 								    def cookiejar(self):
 								        # Shortcut to the cookiejar of the downloader stored by set_downloader().
 								        # NOTE: the constructor's default downloader is None, in which case this
 								        # attribute access fails until a real downloader has been set.
 								        return self._downloader.cookiejar
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								    def _initialize_pre_login(self):
-												[cleanup] Fix some typos (#4194)

Authored by: crazymoose77756
											
										
										
											2022-06-27 02:50:06 +02:00
+								        """ Initialization before login. Redefine in subclasses."""
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								        pass
 								    def _perform_login(self, username, password):
 								        """ Login with username and password. Redefine in subclasses."""
 								        # Invoked from initialize() only when supports_login() is true and
 								        # _get_login_info() returned a truthy username. The base implementation
 								        # is a no-op.
 								        pass
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    def _real_initialize(self):
 								        """Real initialization process. Redefine in subclasses."""
 								        # initialize() calls this after the pre-login/login step and then sets
 								        # self._ready, so it is skipped on later initialize() calls while
 								        # _ready stays True. The base implementation is a no-op.
 								        pass
 								    def _real_extract(self, url):
 								        """Real extraction process. Redefine in subclasses."""
-												[cleanup, docs] Misc cleanup

Closes #2828, closes #2734, closes #2802, closes #2937

											
										
										
											2022-03-04 15:08:55 +01:00
+								        raise NotImplementedError('This method must be implemented by subclasses')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												YoutubeIE: reuse instances of InfoExtractors (closes #998)

When an IE is added to the list, it's also added to a dictionary. When an IE is requested, it first looks in the dictionary, and if there's no instance, it will create a new one.

That way _real_initialize is only called once for each IE, saving time if it needs to login for example.

											
										
										
											2013-07-08 15:14:27 +02:00
+								    @classmethod
 								    def ie_key(cls):
 								        """A string for getting the InfoExtractor with get_info_extractor"""
-												[lazy_extractors] Fix `suitable` and add flake8 test

											
										
										
											2021-08-22 21:19:23 +02:00
+								        return cls.__name__[:-2]
-												YoutubeIE: reuse instances of InfoExtractors (closes #998)

When an IE is added to the list, it's also added to a dictionary. When an IE is requested, it first looks in the dictionary, and if there's no instance, it will create a new one.

That way _real_initialize is only called once for each IE, saving time if it needs to login for example.

											
										
										
											2013-07-08 15:14:27 +02:00
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								    @classproperty
 								    def IE_NAME(cls):
 								        # Class name minus its last two characters (presumably the "IE"
 								        # suffix - confirm against the naming convention); this is the same
 								        # derivation that ie_key() uses.
 								        return cls.__name__[:-2]
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (non-2xx) HTTP status codes should be considered normal. Previously this required manually catching the corresponding exceptions and reading the response.

											
										
										
											2018-06-17 23:01:48 +02:00
+								    @staticmethod
 								    def __can_accept_status_code(err, expected_status):
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        assert isinstance(err, HTTPError)
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (non-2xx) HTTP status codes should be considered normal. Previously this required manually catching the corresponding exceptions and reading the response.

											
										
										
											2018-06-17 23:01:48 +02:00
+								        if expected_status is None:
 								            return False
 								        elif callable(expected_status):
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								            return expected_status(err.status) is True
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (non-2xx) HTTP status codes should be considered normal. Previously this required manually catching the corresponding exceptions and reading the response.

											
										
										
											2018-06-17 23:01:48 +02:00
+								        else:
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								            return err.status in variadic(expected_status)
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (non-2xx) HTTP status codes should be considered normal. Previously this required manually catching the corresponding exceptions and reading the response.

											
										
										
											2018-06-17 23:01:48 +02:00
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								    def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
-												[compat] Remove more functions

Removing any more will require changes to a large number of extractors

											
										
										
											2022-06-24 10:10:17 +02:00
+								        if isinstance(url_or_request, urllib.request.Request):
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								            self._downloader.deprecation_warning(
 								                'Passing a urllib.request.Request to _create_request() is deprecated. '
 								                'Use yt_dlp.networking.common.Request instead.')
-												[networking] Rewrite architecture (#2861)

New networking interface consists of a `RequestDirector` that directs
each `Request` to appropriate `RequestHandler` and returns the
`Response` or raises `RequestError`. The handlers define adapters to
transform its internal Request/Response/Errors to our interfaces.

User-facing changes:
- Fix issues with per request proxies on redirects for urllib
- Support for `ALL_PROXY` environment variable for proxy setting
- Support for `socks5h` proxy
   - Closes https://github.com/yt-dlp/yt-dlp/issues/6325, https://github.com/ytdl-org/youtube-dl/issues/22618, https://github.com/ytdl-org/youtube-dl/pull/28093
- Raise error when using `https` proxy instead of silently converting it to `http`

Authored by: coletdjnz

											
										
										
											2023-07-15 12:25:23 +02:00
+								            url_or_request = urllib_req_to_req(url_or_request)
 								        elif not isinstance(url_or_request, Request):
 								            url_or_request = Request(url_or_request)
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								        url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
-												[networking] Rewrite architecture (#2861)

New networking interface consists of a `RequestDirector` that directs
each `Request` to appropriate `RequestHandler` and returns the
`Response` or raises `RequestError`. The handlers define adapters to
transform its internal Request/Response/Errors to our interfaces.

User-facing changes:
- Fix issues with per request proxies on redirects for urllib
- Support for `ALL_PROXY` environment variable for proxy setting
- Support for `socks5h` proxy
   - Closes https://github.com/yt-dlp/yt-dlp/issues/6325, https://github.com/ytdl-org/youtube-dl/issues/22618, https://github.com/ytdl-org/youtube-dl/pull/28093
- Raise error when using `https` proxy instead of silently converting it to `http`

Authored by: coletdjnz

											
										
										
											2023-07-15 12:25:23 +02:00
+								        return url_or_request
-												[extractor] Add dev option `--load-pages`

											
										
										
											2022-05-31 22:33:22 +02:00
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
 								                         headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											2018-06-17 23:01:48 +02:00
+								        """
 								        Return the response handle.
 								        See _download_webpage docstring for arguments specification.
 								        """
-												Add option `--sleep-requests` to sleep b/w requests (Closes #106)

* Also fix documentation of `sleep_interval_subtitles`

Related issues:
https://github.com/blackjack4494/yt-dlc/issues/158
https://github.com/blackjack4494/youtube-dlc/issues/195
https://github.com/ytdl-org/youtube-dl/pull/28270
https://github.com/ytdl-org/youtube-dl/pull/28144
https://github.com/ytdl-org/youtube-dl/issues/27767
https://github.com/ytdl-org/youtube-dl/issues/23638
https://github.com/ytdl-org/youtube-dl/issues/26287
https://github.com/ytdl-org/youtube-dl/issues/26319

											
										
										
											2021-02-27 13:41:23 +01:00
+								        if not self._downloader._first_webpage_request:
-												[cleanup] misc

											
										
										
											2021-10-22 22:37:20 +02:00
+								            sleep_interval = self.get_param('sleep_interval_requests') or 0
-												Add option `--sleep-requests` to sleep b/w requests (Closes #106)

* Also fix documentation of `sleep_interval_subtitles`

Related issues:
https://github.com/blackjack4494/yt-dlc/issues/158
https://github.com/blackjack4494/youtube-dlc/issues/195
https://github.com/ytdl-org/youtube-dl/pull/28270
https://github.com/ytdl-org/youtube-dl/pull/28144
https://github.com/ytdl-org/youtube-dl/issues/27767
https://github.com/ytdl-org/youtube-dl/issues/23638
https://github.com/ytdl-org/youtube-dl/issues/26287
https://github.com/ytdl-org/youtube-dl/issues/26319

											
										
										
											2021-02-27 13:41:23 +01:00
+								            if sleep_interval > 0:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                self.to_screen(f'Sleeping {sleep_interval} seconds ...')
-												Add option `--sleep-requests` to sleep b/w requests (Closes #106)

* Also fix documentation of `sleep_interval_subtitles`

Related issues:
https://github.com/blackjack4494/yt-dlc/issues/158
https://github.com/blackjack4494/youtube-dlc/issues/195
https://github.com/ytdl-org/youtube-dl/pull/28270
https://github.com/ytdl-org/youtube-dl/pull/28144
https://github.com/ytdl-org/youtube-dl/issues/27767
https://github.com/ytdl-org/youtube-dl/issues/23638
https://github.com/ytdl-org/youtube-dl/issues/26287
https://github.com/ytdl-org/youtube-dl/issues/26319

											
										
										
											2021-02-27 13:41:23 +01:00
+								                time.sleep(sleep_interval)
 								        else:
 								            self._downloader._first_webpage_request = False
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        if note is None:
 								            self.report_download_webpage(video_id)
 								        elif note is not False:
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											2013-12-09 01:49:01 +01:00
+								            if video_id is None:
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								                self.to_screen(str(note))
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											2013-12-09 01:49:01 +01:00
+								            else:
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								                self.to_screen(f'{video_id}: {note}')
-												[extractor/common] Move X-Forwarded-For setup code into _request_webpage

											
										
										
											2017-12-23 14:57:35 +01:00
 								        # Some sites check X-Forwarded-For HTTP header in order to figure out
 								        # the origin of the client behind proxy. This allows bypassing geo
 								        # restriction by faking this header's value to IP that belongs to some
 								        # geo unrestricted country. We will do so once we encounter any
 								        # geo restriction error.
 								        if self._x_forwarded_for_ip:
-												[extractor] Fix `_create_request` when headers is None

Closes #4164

											
										
										
											2022-06-25 16:11:22 +02:00
+								            headers = (headers or {}).copy()
 								            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
-												[extractor/common] Move X-Forwarded-For setup code into _request_webpage

											
										
										
											2017-12-23 14:57:35 +01:00
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								        extensions = {}
 								        if impersonate in (True, ''):
 								            impersonate = ImpersonateTarget()
 								        requested_targets = [
 								            t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
 								            for t in variadic(impersonate)
 								        ] if impersonate else []
 								        available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
 								        if available_target:
 								            extensions['impersonate'] = available_target
 								        elif requested_targets:
 								            message = 'The extractor is attempting impersonation, but '
 								            message += (
 								                'no impersonate target is available' if not str(impersonate)
 								                else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
 								            info_msg = ('see  https://github.com/yt-dlp/yt-dlp#impersonation  '
 								                        'for information on installing the required dependencies')
 								            if require_impersonation:
 								                raise ExtractorError(f'{message}; {info_msg}', expected=True)
 								            self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        try:
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
-												[utils] Add `network_exceptions`

											
										
										
											2021-05-04 19:06:18 +02:00
+								        except network_exceptions as err:
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								            if isinstance(err, HTTPError):
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											2018-06-17 23:01:48 +02:00
+								                if self.__can_accept_status_code(err, expected_status):
-												[networking] Rewrite architecture (#2861)

New networking interface consists of a `RequestDirector` that directs
each `Request` to appropriate `RequestHandler` and returns the
`Response` or raises `RequestError`. The handlers define adapters to
transform its internal Request/Response/Errors to our interfaces.

User-facing changes:
- Fix issues with per request proxies on redirects for urllib
- Support for `ALL_PROXY` environment variable for proxy setting
- Support for `socks5h` proxy
   - Closes https://github.com/yt-dlp/yt-dlp/issues/6325, https://github.com/ytdl-org/youtube-dl/issues/22618, https://github.com/ytdl-org/youtube-dl/pull/28093
- Raise error when using `https` proxy instead of silently converting it to `http`

Authored by: coletdjnz

											
										
										
											2023-07-15 12:25:23 +02:00
+								                    return err.response
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											2018-06-17 23:01:48 +02:00
-												[aparat] Add support (Fixes #2012)

											
										
										
											2013-12-20 17:05:28 +01:00
+								            if errnote is False:
 								                return False
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								            if errnote is None:
-												[extractor/common] Modernize

											
										
										
											2014-08-28 01:04:43 +02:00
+								                errnote = 'Unable to download webpage'
-												Properly convert errors to strings

											
										
										
											2015-12-20 00:27:38 +01:00
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            errmsg = f'{errnote}: {err}'
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											2013-12-09 01:49:01 +01:00
+								            if fatal:
-												[utils] Better traceback for `ExtractorError`

											
										
										
											2022-03-08 07:34:49 +01:00
+								                raise ExtractorError(errmsg, cause=err)
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											2013-12-09 01:49:01 +01:00
+								            else:
-												Fix inconsistent use of `report_warning`

											
										
										
											2021-04-16 12:01:10 +02:00
+								                self.report_warning(errmsg)
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											2013-12-09 01:49:01 +01:00
+								                return False
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[cleanup] Misc fixes

Cherry-picks from: #3498, #3947
Related: #3949, https://github.com/yt-dlp/yt-dlp/issues/1839#issuecomment-1140313836
Authored by: pukkandan, flashdagger, gamer191

											
										
										
											2022-06-03 17:59:03 +02:00
+								    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								                                 encoding=None, data=None, headers={}, query={}, expected_status=None,
 								                                 impersonate=None, require_impersonation=False):
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											2018-06-17 23:01:48 +02:00
+								        """
 								        Return a tuple (page content as string, URL handle).
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
+								        Arguments:
 								        url_or_request -- plain text URL as a string or
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								            a yt_dlp.networking.Request object
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
+								        video_id -- Video/playlist/item identifier (string)
 								        Keyword arguments:
 								        note -- note printed before downloading (string)
 								        errnote -- note printed in case of an error (string)
 								        fatal -- flag denoting whether error should be considered fatal,
 								            i.e. whether it should cause ExtractionError to be raised,
 								            otherwise a warning will be reported and extraction continued
 								        encoding -- encoding for a page content decoding, guessed automatically
 								            when not explicitly specified
 								        data -- POST data (bytes)
 								        headers -- HTTP headers (dict)
 								        query -- URL query (dict)
 								        expected_status -- allows to accept failed HTTP requests (non 2xx
 								            status code) by explicitly specifying a set of accepted status
 								            codes. Can be any of the following entities:
 								                - an integer type specifying an exact failed status code to
 								                  accept
 								                - a list or a tuple of integer types specifying a list of
 								                  failed status codes to accept
 								                - a callable accepting an actual failed status code and
 								                  returning True if it should be accepted
 								            Note that this argument does not affect success status codes (2xx)
 								            which are always accepted.
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								        impersonate -- the impersonate target. Can be any of the following entities:
 								                - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
 								                - a string in the format of CLIENT[:OS]
 								                - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
 								                - a boolean value; True means any impersonate target is sufficient
 								        require_impersonation -- flag to toggle whether the request should raise an error
 								            if impersonation is not possible (bool, default: False)
-												[extractor/common] Introduce expected_status for convenient accept of failed HTTP requests
Useful when some non-success (2xx) HTTP status codes should be considered normal. Previously this required to manually catch corresponding exceptions and read the response.

											
										
										
											2018-06-17 23:01:48 +02:00
+								        """
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
-												Strip hash info from URL when making requests (Fixes #1038)

											
										
										
											2013-07-13 22:52:12 +02:00
+								        # Strip hashes from the URL (#1038)
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								        if isinstance(url_or_request, str):
-												Strip hash info from URL when making requests (Fixes #1038)

											
										
										
											2013-07-13 22:52:12 +02:00
+								            url_or_request = url_or_request.partition('#')[0]
-												[ie] Add extractor impersonate API (#9474)

Authored by: bashonly, Grub4K, pukkandan
											
										
										
											2024-03-31 00:18:07 +01:00
+								        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
 								                                     headers=headers, query=query, expected_status=expected_status,
 								                                     impersonate=impersonate, require_impersonation=require_impersonation)
-												Add fatal=False parameter to _download_* functions.

This allows us to simplify the calls in the youtube extractor even further.

											
										
										
											2013-12-09 01:49:01 +01:00
+								        if urlh is False:
 								            assert not fatal
 								            return False
-												[ie] Add POST data hash to `--write-pages` filenames (#9879)

Closes #9773
Authored by: minamotorin
											
										
										
											2024-05-17 16:28:36 +02:00
+								        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
 								                                             encoding=encoding, data=data)
-												[generic] Handle audio streams that do not implement HEAD (Fixes #4032)

											
										
										
											2014-10-26 17:05:44 +01:00
+								        return (content, urlh)
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor needs a forced encoding to work correctly.

											
										
										
											2015-03-21 05:21:27 +01:00
+								    @staticmethod
 								    def _guess_encoding_from_content(content_type, webpage_bytes):
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 								        if m:
 								            encoding = m.group(1)
 								        else:
-												Fix detection of the webpage charset if it's declared using ' instead of "

Like in "<meta charset='utf-8'/>"

											
										
										
											2013-08-29 11:35:15 +02:00
+								            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 13:59:08 +02:00
+								                          webpage_bytes[:1024])
 								            if m:
 								                encoding = m.group(1).decode('ascii')
-												Deal with implicitly UTF-16 decoded webpages

These webpages don't specify an encoding and rely on the BOM

											
										
										
											2014-01-21 01:39:39 +01:00
+								            elif webpage_bytes.startswith(b'\xff\xfe'):
 								                encoding = 'utf-16'
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 13:59:08 +02:00
+								            else:
 								                encoding = 'utf-8'
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.

											
										
										
											2015-03-21 05:21:27 +01:00
 								        return encoding
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											2017-04-01 22:56:49 +02:00
+								    def __check_blocked(self, content):
 								        first_block = content[:512]
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											2019-05-10 22:56:22 +02:00
+								        if ('<title>Access to this site is blocked</title>' in content
 								                and 'Websense' in first_block):
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											2017-04-01 22:56:49 +02:00
+								            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 								            blocked_iframe = self._html_search_regex(
 								                r'<iframe src="([^"]+)"', content,
 								                'Websense information URL', default=None)
 								            if blocked_iframe:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                msg += f' Visit {blocked_iframe} for more details'
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											2017-04-01 22:56:49 +02:00
+								            raise ExtractorError(msg, expected=True)
 								        if '<title>The URL you requested has been blocked</title>' in first_block:
 								            msg = (
 								                'Access to this webpage has been blocked by Indian censorship. '
 								                'Use a VPN or proxy server (with --proxy) to route around it.')
 								            block_msg = self._html_search_regex(
 								                r'</h1><p>(.*?)</p>',
 								                content, 'block message', default=None)
 								            if block_msg:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                msg += ' (Message: "{}")'.format(block_msg.replace('\n', ' '))
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											2017-04-01 22:56:49 +02:00
+								            raise ExtractorError(msg, expected=True)
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											2019-05-10 22:56:22 +02:00
+								        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 								                and 'blocklist.rkn.gov.ru' in content):
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											2017-04-01 22:56:49 +02:00
+								            raise ExtractorError(
 								                'Access to this webpage has been blocked by decision of the Russian government. '
 								                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 								                expected=True)
-												[ie] Add POST data hash to `--write-pages` filenames (#9879)

Closes #9773
Authored by: minamotorin
											
										
										
											2024-05-17 16:28:36 +02:00
+								    def _request_dump_filename(self, url, video_id, data=None):
 								        if data is not None:
 								            data = hashlib.md5(data).hexdigest()
 								        basen = join_nonempty(video_id, data, url, delim='_')
-												[extractor] Add dev option `--load-pages`

											
										
										
											2022-05-31 22:33:22 +02:00
+								        trim_length = self.get_param('trim_file_name') or 240
 								        if len(basen) > trim_length:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            h = '___' + hashlib.md5(basen.encode()).hexdigest()
-												[extractor] Add dev option `--load-pages`

											
										
										
											2022-05-31 22:33:22 +02:00
+								            basen = basen[:trim_length - len(h)] + h
 								        filename = sanitize_filename(f'{basen}.dump', restricted=True)
 								        # Working around MAX_PATH limitation on Windows (see
 								        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 								        if compat_os_name == 'nt':
 								            absfilepath = os.path.abspath(filename)
 								            if len(absfilepath) > 259:
 								                filename = fR'\\?\{absfilepath}'
 								        return filename
 								    def __decode_webpage(self, webpage_bytes, encoding, headers):
 								        if not encoding:
 								            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
 								        try:
 								            return webpage_bytes.decode(encoding, 'replace')
 								        except LookupError:
 								            return webpage_bytes.decode('utf-8', 'replace')
-												[ie] Add POST data hash to `--write-pages` filenames (#9879)

Closes #9773
Authored by: minamotorin
											
										
										
											2024-05-17 16:28:36 +02:00
+								    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
 								                              prefix=None, encoding=None, data=None):
-												[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.

											
										
										
											2015-03-21 05:21:27 +01:00
+								        webpage_bytes = urlh.read()
 								        if prefix is not None:
 								            webpage_bytes = prefix + webpage_bytes
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        if self.get_param('dump_intermediate_pages', False):
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								            self.to_screen('Dumping request to ' + urlh.url)
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								            dump = base64.b64encode(webpage_bytes).decode('ascii')
 								            self._downloader.to_screen(dump)
-												[extractor] Add dev option `--load-pages`

											
										
										
											2022-05-31 22:33:22 +02:00
+								        if self.get_param('write_pages'):
-												Bugfix for 61b17437dc14a1c7e90ff48a6198df77828c6df4

Authored by: bashonly

											
										
										
											2024-05-18 06:44:11 +02:00
+								            if isinstance(url_or_request, Request):
 								                data = self._create_request(url_or_request, data).data
 								            filename = self._request_dump_filename(urlh.url, video_id, data)
-												[extractor] Add dev option `--load-pages`

											
										
										
											2022-05-31 22:33:22 +02:00
+								            self.to_screen(f'Saving request to {filename}')
-												New debug option --write-pages

											
										
										
											2013-10-28 10:44:02 +01:00
+								            with open(filename, 'wb') as outf:
 								                outf.write(webpage_bytes)
-												[extractor] Add dev option `--load-pages`

											
										
										
											2022-05-31 22:33:22 +02:00
+								        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
-												[extractor/common] Move censorship checks to a separate method and add check for just another ISP

											
										
										
											2017-04-01 22:56:49 +02:00
+								        self.__check_blocked(content)
-												Detect Websense censorship (Fixes #2670)

											
										
										
											2014-04-03 06:07:35 +02:00
-												[generic] Handle audio streams that do not implement HEAD (Fixes #4032)

											
										
										
											2014-10-26 17:05:44 +01:00
+								        return content
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[extractor] Passthrough `errnote=False` to parsing

											
										
										
											2022-07-15 12:35:00 +02:00
+								    def __print_error(self, errnote, fatal, video_id, err):
 								        if fatal:
-												[cleanup] Misc

											
										
										
											2022-07-18 00:56:50 +02:00
+								            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
-												[extractor] Passthrough `errnote=False` to parsing

											
										
										
											2022-07-15 12:35:00 +02:00
+								        elif errnote:
-												[cleanup] Misc

											
										
										
											2022-07-18 00:56:50 +02:00
+								            self.report_warning(f'{video_id}: {errnote}: {err}')
-												[extractor] Passthrough `errnote=False` to parsing

											
										
										
											2022-07-15 12:35:00 +02:00
 								    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
-												[mtv] Fixup incorrectly encoded XML documents

											
										
										
											2013-12-10 12:45:22 +01:00
+								        if transform_source:
 								            xml_string = transform_source(xml_string)
-												[extractor/common] Introduce _parse_xml

											
										
										
											2017-08-22 19:32:41 +02:00
+								        try:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            return compat_etree_fromstring(xml_string.encode())
-												[cleanup] Mark some compat variables for removal (#2173)

Authored by fstirlitz, pukkandan

											
										
										
											2022-04-11 22:09:26 +02:00
+								        except xml.etree.ElementTree.ParseError as ve:
-												[extractor] Passthrough `errnote=False` to parsing

											
										
										
											2022-07-15 12:35:00 +02:00
+								            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
-												[collegehumor] Encode the xml before calling xml.etree.ElementTree.fromstring (fixes #1822)

Uses a new helper method in InfoExtractor: _download_xml

											
										
										
											2013-11-24 14:59:19 +01:00
-												[extractor] Passthrough `errnote=False` to parsing

											
										
										
											2022-07-15 12:35:00 +02:00
+								    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
-												[khanacademy] Add support (Fixes #2066)

											
										
										
											2014-01-07 09:35:34 +01:00
+								        try:
-												[extractor] Add `_search_json`

All fetching of JSON objects should eventually be done with this function
but only `youtube` is being refactored for now

											
										
										
											2022-06-03 17:32:31 +02:00
+								            return json.loads(
 								                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
-												[khanacademy] Add support (Fixes #2066)

											
										
										
											2014-01-07 09:35:34 +01:00
+								        except ValueError as ve:
-												[extractor] Passthrough `errnote=False` to parsing

											
										
										
											2022-07-15 12:35:00 +02:00
+								            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
-												[khanacademy] Add support (Fixes #2066)

											
										
										
											2014-01-07 09:35:34 +01:00
-												[extractor] Passthrough `errnote=False` to parsing

											
										
										
											2022-07-15 12:35:00 +02:00
+								    def _parse_socket_response_as_json(self, data, *args, **kwargs):
 								        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
-												[extractor] Functions to parse socket.io response as json

Authored by: pukkandan, llacb47

											
										
										
											2021-05-30 10:17:39 +02:00
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
    def __create_download_methods(name, parser, note, errnote, return_value):
        """
        Build a pair of download methods: `_download_<name>_handle` and `_download_<name>`.

        This is called directly from the class body, hence there is no `self`.

        @param name          Suffix used to name the generated methods
        @param parser        Name of the method that parses the downloaded content,
                             or None to return the content as-is
        @param note          Default `note` of the generated methods
        @param errnote       Default `errnote` of the generated methods
        @param return_value  Description of the parsed result; only used in the generated docstrings
        @returns             (download_handle, download_content)
        """

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            """Run the configured parser of `ie` on `content`; return `content` untouched if there is none"""
            if parser is None:
                return content
            # Only `errnote=False` is passed through; any other value is dropped,
            # so the parser falls back to its own default error note
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        # Becomes `_download_<name>_handle`; returns (parsed content, URL handle),
        # or False when _download_webpage_handle returned False
        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                            impersonate=None, require_impersonation=False):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status,
                impersonate=impersonate, require_impersonation=require_impersonation)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        # Becomes `_download_<name>`; returns only the parsed content (or False)
        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
                             impersonate=None, require_impersonation=False):
            # With the `load_pages` param set, first try to read the response from a
            # dump file on disk; if that file cannot be read, warn and download normally
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            # All optional arguments are forwarded by keyword
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
                'impersonate': impersonate,
                'require_impersonation': require_impersonation,
            }
            # Without a parser, the handle method looked up below is not passed `transform_source`
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            """Rename `func` to `name` (as a member of InfoExtractor) and generate its docstring"""
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}
                See _download_webpage_handle docstring for other arguments specification
            '''

        # The renaming must happen before the methods are used:
        # download_content looks the handle method up via download_handle.__name__
        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content
    # Generate the `_download_<name>_handle` / `_download_<name>` method pairs.
    # Arguments: name suffix, parser method name, default note, default errnote,
    # description of the return value (used in the generated docstrings)
    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    # For plain webpages only the content-returning method ([1]) is kept, under a
    # private name; no parser is configured, so the content is returned unparsed
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
-												[extractor] Functions to parse socket.io response as json

Authored by: pukkandan, llacb47

											
										
										
											2021-05-30 10:17:39 +02:00
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
+								    def _download_webpage(
 								            self, url_or_request, video_id, note=None, errnote=None,
 								            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
-												[extractor] Functions to parse socket.io response as json

Authored by: pukkandan, llacb47

											
										
										
											2021-05-30 10:17:39 +02:00
+								        """
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
+								        Return the data of the page as a string.
-												[extractor] Functions to parse socket.io response as json

Authored by: pukkandan, llacb47

											
										
										
											2021-05-30 10:17:39 +02:00
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
+								        Keyword arguments:
 								        tries -- number of tries
 								        timeout -- sleep interval between tries
 								        See _download_webpage_handle docstring for other arguments specification.
-												[extractor] Functions to parse socket.io response as json

Authored by: pukkandan, llacb47

											
										
										
											2021-05-30 10:17:39 +02:00
+								        """
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
 								        R''' # NB: These are unused; should they be deprecated?
 								        if tries != 1:
 								            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
 								        if timeout is NO_DEFAULT:
 								            timeout = 5
 								        else:
 								            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
 								        '''
 								        try_count = 0
 								        while True:
 								            try:
 								                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								            except IncompleteRead as e:
-												[extractor, cleanup] Refactor `_download_...` methods

											
										
										
											2022-05-31 19:43:26 +02:00
+								                try_count += 1
 								                if try_count >= tries:
 								                    raise e
 								                self._sleep(timeout, video_id)
-												[extractor] Functions to parse socket.io response as json

Authored by: pukkandan, llacb47

											
										
										
											2021-05-30 10:17:39 +02:00
-												[extractor] Reset non-repeating warnings per video

											
										
										
											2021-08-06 00:17:11 +02:00
+								    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
-												[cleanup, utils] Don't use kwargs for `format_field`

											
										
										
											2022-06-18 04:00:12 +02:00
+								        idstr = format_field(video_id, None, '%s: ')
-												[extractor] Reset non-repeating warnings per video

											
										
										
											2021-08-06 00:17:11 +02:00
+								        msg = f'[{self.IE_NAME}] {idstr}{msg}'
 								        if only_once:
 								            if f'WARNING: {msg}' in self._printed_messages:
 								                return
 								            self._printed_messages.add(f'WARNING: {msg}')
 								        self._downloader.report_warning(msg, *args, **kwargs)
-												[myvideo] Use RTMP instead of RTMPT (Fixes #2032)

											
										
										
											2013-12-23 15:57:43 +01:00
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								    def to_screen(self, msg, *args, **kwargs):
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        """Print msg to screen, prefixing it with '[ie_name]'"""
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
 								    def write_debug(self, msg, *args, **kwargs):
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
 								    def get_param(self, name, default=None, *args, **kwargs):
 								        if self._downloader:
 								            return self._downloader.params.get(name, default, *args, **kwargs)
 								        return default
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[cleanup Misc

Closes #5162

											
										
										
											2022-10-18 19:58:57 +02:00
+								    def report_drm(self, video_id, partial=NO_DEFAULT):
 								        if partial is not NO_DEFAULT:
 								            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											2021-08-22 22:08:38 +02:00
+								        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    def report_extraction(self, id_or_name):
 								        """Report information extraction."""
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        self.to_screen(f'{id_or_name}: Extracting information')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								    def report_download_webpage(self, video_id):
 								        """Report webpage download."""
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        self.to_screen(f'{video_id}: Downloading webpage')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								    def report_age_confirmation(self):
 								        """Report attempt to confirm age."""
-												[extractor/common] Modernize

											
										
										
											2014-08-28 01:04:43 +02:00
+								        self.to_screen('Confirming age')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											2013-07-07 23:24:34 +02:00
+								    def report_login(self):
 								        """Report attempt to log in."""
-												[extractor/common] Modernize

											
										
										
											2014-08-28 01:04:43 +02:00
+								        self.to_screen('Logging in')
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											2013-07-07 23:24:34 +02:00
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											2021-04-17 02:09:58 +02:00
+								    def raise_login_required(
-												[youtube] Better message when login required

											
										
										
											2021-05-19 15:41:44 +02:00
+								            self, msg='This video is only available for registered users',
-												[extractor] Add `_perform_login` function (#2943)

* Adds new functions `_initialize_pre_login` and `_perform_login` as part of the extractor API
* Adds `ie.supports_login` to the public API
											
										
										
											2022-03-18 21:53:33 +01:00
+								            metadata_available=False, method=NO_DEFAULT):
-												Option `--wait-for-video` to wait for scheduled streams

											
										
										
											2021-11-28 19:57:44 +01:00
+								        if metadata_available and (
 								                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											2021-04-17 02:09:58 +02:00
+								            self.report_warning(msg)
-												Fix doubling of `video_id` in `ExtractorError`

											
										
										
											2022-03-04 15:07:43 +01:00
+								            return
-												[cleanup, utils] Don't use kwargs for `format_field`

											
										
										
											2022-06-18 04:00:12 +02:00
+								        msg += format_field(self._login_hint(method), None, '. %s')
-												[extractor] Minor improvements (See desc)

1. Allow removal of login hint - extractors can set their own login hint as part of `msg`
2. Cleanup `_merge_subtitles` signature

											
										
										
											2021-07-06 22:57:53 +02:00
+								        raise ExtractorError(msg, expected=True)
-												[extractor/common] Add raise_login_required

											
										
										
											2015-08-26 17:24:47 +02:00
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											2021-04-17 02:09:58 +02:00
+								    def raise_geo_restricted(
 								            self, msg='This video is not available from your location due to geo restriction',
 								            countries=None, metadata_available=False):
-												Option `--wait-for-video` to wait for scheduled streams

											
										
										
											2021-11-28 19:57:44 +01:00
+								        if metadata_available and (
 								                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											2021-04-17 02:09:58 +02:00
+								            self.report_warning(msg)
 								        else:
 								            raise GeoRestrictedError(msg, countries=countries)
 								    def raise_no_formats(self, msg, expected=False, video_id=None):
-												Option `--wait-for-video` to wait for scheduled streams

											
										
										
											2021-11-28 19:57:44 +01:00
+								        if expected and (
 								                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											2021-04-17 02:09:58 +02:00
+								            self.report_warning(msg, video_id)
-												[CBS] Add fallback (#579)

Related: https://github.com/ytdl-org/youtube-dl/issues/29564
Authored-by: llacb47, pukkandan
											
										
										
											2021-08-02 04:16:12 +02:00
+								        elif isinstance(msg, ExtractorError):
 								            raise msg
-												Add option `--ignore-no-formats-error`
* Ignores the "no video format" and similar errors
* Experimental - Some extractors may still throw these errors

											
										
										
											2021-04-17 02:09:58 +02:00
+								        else:
 								            raise ExtractorError(msg, expected=expected, video_id=video_id)
-												[extractor/common] Add raise_geo_restricted

											
										
										
											2015-09-22 17:50:20 +02:00
-												PEP8 applied

											
										
										
											2014-11-23 20:41:03 +01:00
+								    # Methods for following #608
-												[generic] Detect ooyala videos (fixes #2013)

											
										
										
											2013-12-19 20:28:52 +01:00
+								    @staticmethod
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
-												[extractor/common] Consistent URL spelling

											
										
										
											2015-07-23 19:37:45 +02:00
+								        """Returns a URL that points to a page that should be processed"""
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								        if ie is not None:
 								            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
-												Match --download-archive during playlist processing (Fixes #1745)

											
										
										
											2013-11-22 22:46:46 +01:00
+								        if video_id is not None:
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								            kwargs['id'] = video_id
-												[utils] Add `video_title` for `url_result`

											
										
										
											2015-04-12 19:11:47 +02:00
+								        if video_title is not None:
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								            kwargs['title'] = video_title
 								        return {
 								            **kwargs,
 								            '_type': 'url_transparent' if url_transparent else 'url',
 								            'url': url,
 								        }
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    @classmethod
 								    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
 								                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
 								        return cls.playlist_result(
 								            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
 								            playlist_id, playlist_title, **kwargs)
-												[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise
pretty much just Brightcove. Except the Globe isn't all Brightcove
videos, so fallback to Generic, too.

Also, abstract playlist_from_matches() from generic.py to common.py, and use
it here.

History of these changes can be found in
51170427d4b1143572a498dedaee61863a5b2c5b.

											
										
										
											2017-03-09 00:13:54 +01:00
-												[generic] Detect ooyala videos (fixes #2013)

											
										
										
											2013-12-19 20:28:52 +01:00
+								    @staticmethod
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        """Returns a playlist"""
 								        if playlist_id:
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								            kwargs['id'] = playlist_id
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        if playlist_title:
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								            kwargs['title'] = playlist_title
-												[youtube] Don't show warning for empty playlist description (Closes #54)

:ci skip dl

											
										
										
											2021-02-07 15:44:44 +01:00
+								        if playlist_description is not None:
-												[extractor] Improve `url_result` and related

											
										
										
											2022-01-20 02:36:42 +01:00
+								            kwargs['description'] = playlist_description
 								        return {
 								            **kwargs,
 								            '_type': 'multi_video' if multi_video else 'playlist',
 								            'entries': entries,
 								        }
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											2015-06-28 18:56:45 +02:00
+								    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        """
 								        Perform a regex search on the given string, using a single or a list of
 								        patterns returning the first matching group.
 								        In case of failure return a default value or raise a WARNING or a
-												[vimeo] Fix pro videos and player.vimeo.com urls

The old process can still be used for those videos.
Added RegexNotFoundError, which is raised by _search_regex if it can't extract the info.

											
										
										
											2013-10-23 14:38:03 +02:00
+								        RegexNotFoundError, depending on fatal, specifying the field name.
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        """
-												[youtube] Fix uploader for collaborative playlists (#3332)

Authored by: coletdjnz
											
										
										
											2022-04-07 10:11:16 +02:00
+								        if string is None:
 								            mobj = None
-												[compat] Split into sub-modules (#2173)

Authored by: fstirlitz, pukkandan

											
										
										
											2022-02-04 14:37:02 +01:00
+								        elif isinstance(pattern, (str, re.Pattern)):
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								            mobj = re.search(pattern, string, flags)
 								        else:
 								            for p in pattern:
 								                mobj = re.search(p, string, flags)
-												[extractor/common] PEP8

											
										
										
											2014-07-25 10:43:03 +02:00
+								                if mobj:
 								                    break
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[minicurses] Add more colors

											
										
										
											2021-10-20 18:37:32 +02:00
+								        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								        if mobj:
-												[heise] Fix description, thumbnail and format ID

											
										
										
											2014-11-04 23:14:16 +01:00
+								            if group is None:
 								                # return the first matching group
 								                return next(g for g in mobj.groups() if g is not None)
-												[extractor] Allow extracting multiple groups in `_search_regex`
From #497, Authored by: fstirlitz

											
										
										
											2021-07-13 09:17:39 +02:00
+								            elif isinstance(group, (list, tuple)):
 								                return tuple(mobj.group(g) for g in group)
-												[heise] Fix description, thumbnail and format ID

											
										
										
											2014-11-04 23:14:16 +01:00
+								            else:
 								                return mobj.group(group)
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											2015-06-28 18:56:45 +02:00
+								        elif default is not NO_DEFAULT:
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								            return default
 								        elif fatal:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            raise RegexNotFoundError(f'Unable to extract {_name}')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        else:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            self.report_warning(f'unable to extract {_name}' + bug_reports_message())
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								            return None
-												[extractor] Add `default` parameter to `_search_json` (#4057)

Authored by: pukkandan, coletdjnz
											
										
										
											2022-06-19 02:55:18 +02:00
+								    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
-												[extractor] Make search_json able to parse lists

Now `contains_pattern` can be set to `\[.+\]`

											
										
										
											2022-10-03 13:20:27 +02:00
+								                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
-												[extractor] Add `_search_json`

All fetching of JSON objects should eventually be done with this function
but only `youtube` is being refactored for now

											
										
										
											2022-06-03 17:32:31 +02:00
+								        """Searches string for the JSON object specified by start_pattern"""
 								        # NB: end_pattern is only used to reduce the size of the initial match
-												[extractor] Add `default` parameter to `_search_json` (#4057)

Authored by: pukkandan, coletdjnz
											
										
										
											2022-06-19 02:55:18 +02:00
+								        if default is NO_DEFAULT:
 								            default, has_default = {}, False
 								        else:
 								            fatal, has_default = False, True
 								        json_string = self._search_regex(
-												[extractor] Make search_json able to parse lists

Now `contains_pattern` can be set to `\[.+\]`

											
										
										
											2022-10-03 13:20:27 +02:00
+								            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
-												[extractor] Add `default` parameter to `_search_json` (#4057)

Authored by: pukkandan, coletdjnz
											
										
										
											2022-06-19 02:55:18 +02:00
+								            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
 								        if not json_string:
 								            return default
 								        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
 								        try:
 								            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
 								        except ExtractorError as e:
 								            if fatal:
 								                raise ExtractorError(
 								                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
 								            elif not has_default:
 								                self.report_warning(
 								                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
 								        return default
-												[extractor] Add `_search_json`

All fetching of JSON objects should eventually be done with this function
but only `youtube` is being refactored for now

											
										
										
											2022-06-03 17:32:31 +02:00
-												[extractor/common] Use NO_DEFAULT from utils

											
										
										
											2015-06-28 18:56:45 +02:00
+								    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								        """
 								        Like _search_regex, but strips HTML tags and unescapes entities.
 								        """
-												[heise] Fix description, thumbnail and format ID

											
										
										
											2014-11-04 23:14:16 +01:00
+								        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
-												[cleanup] Misc

Closes #5576, closes #5887

											
										
										
											2023-01-02 15:09:03 +01:00
+								        if isinstance(res, tuple):
-												[cleanup] Misc

											
										
										
											2023-01-06 22:18:34 +01:00
+								            return tuple(map(clean_html, res))
 								        return clean_html(res)
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
-												[common] add separate method for getting netrc ligin info

											
										
										
											2016-08-14 12:48:13 +02:00
+								    def _get_netrc_login_info(self, netrc_machine=None):
 								        netrc_machine = netrc_machine or self._NETRC_MACHINE
-												Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb

Closes #7367

											
										
										
											2023-06-21 08:42:15 +02:00
+								        cmd = self.get_param('netrc_cmd')
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								        if cmd:
-												Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb

Closes #7367

											
										
										
											2023-06-21 08:42:15 +02:00
+								            cmd = cmd.replace('{}', netrc_machine)
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								            self.to_screen(f'Executing command: {cmd}')
 								            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
 								            if ret != 0:
 								                raise OSError(f'Command returned error code {ret}')
 								            info = netrc_from_content(stdout).authenticators(netrc_machine)
-												[common] add separate method for getting netrc ligin info

											
										
										
											2016-08-14 12:48:13 +02:00
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								        elif self.get_param('usenetrc', False):
 								            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
 								            if os.path.isdir(netrc_file):
 								                netrc_file = os.path.join(netrc_file, '.netrc')
 								            info = netrc.netrc(netrc_file).authenticators(netrc_machine)
 								        else:
 								            return None, None
 								        if not info:
-												[cleanup] Fix misc bugs (#8968)

Closes #8816

Authored by: bashonly, seproDev, pukkandan, Grub4k

											
										
										
											2024-03-10 15:22:49 +01:00
+								            self.to_screen(f'No authenticators for {netrc_machine}')
 								            return None, None
 								        self.write_debug(f'Using netrc for {netrc_machine} authentication')
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								        return info[0], info[2]
-												[common] add separate method for getting netrc ligin info

											
										
										
											2016-08-14 12:48:13 +02:00
-												[adobepass] add specific options for adobe pass authentication

- add --ap-username and --ap-password option to specify
TV provider username and password in the cmd line
- add --ap-retries option to limit the number of retries
- add --list-ap-msi-ids to list the supported TV Providers

											
										
										
											2016-09-13 23:16:01 +02:00
+								    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											2013-07-07 23:24:34 +02:00
+								        """
-												Typo: twice "the the" to "the"
											
										
										
											2015-04-29 17:03:10 +02:00
+								        Get the login info as (username, password)
-												[extractor/common] Update _get_login_info's comment

											
										
										
											2016-09-15 17:34:29 +02:00
+								        First look for the manually specified credentials using username_option
 								        and password_option as keys in params dictionary. If no such credentials
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								        are available try the netrc_cmd if it is defined or look in the
 								        netrc file using the netrc_machine or _NETRC_MACHINE value.
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											2013-07-07 23:24:34 +02:00
+								        If there's no info available, return (None, None)
 								        """
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        username = self.get_param(username_option)
 								        if username is not None:
 								            password = self.get_param(password_option)
-												[common] add separate method for getting netrc ligin info

											
										
										
											2016-08-14 12:48:13 +02:00
+								        else:
-												Add option `--netrc-cmd` (#6682)

Authored by: NDagestad, pukkandan
Closes #1706
											
										
										
											2023-06-21 05:07:42 +02:00
+								            try:
 								                username, password = self._get_netrc_login_info(netrc_machine)
 								            except (OSError, netrc.NetrcParseError) as err:
 								                self.report_warning(f'Failed to parse .netrc: {err}')
 								                return None, None
-												[extractor/common] Simplify _get_login_info

											
										
										
											2016-09-15 17:26:37 +02:00
+								        return username, password
-												VimeoIE: authentication support (closes #885) and add a method in the base InfoExtractor to get the login info

											
										
										
											2013-07-07 23:24:34 +02:00
-												[extractor/common] Interactive TFA code input

											
										
										
											2015-08-15 17:55:07 +02:00
+								    def _get_tfa_info(self, note='two-factor verification code'):
-												[youtube] Add two-factor account signin (TOTP only)

Additional work is required to prompt the user for the SMS or phone call codes, as there is no framework currently to prompt the user during an extraction operation.

Fixes #3533

											
										
										
											2014-08-16 23:28:41 +02:00
+								        """
 								        Get the two-factor authentication info
 								        TODO - asking the user will be required for sms/phone verify
 								        currently just uses the command line option
 								        If there's no info available, return None
 								        """
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        tfa = self.get_param('twofactor')
 								        if tfa is not None:
 								            return tfa
-												[youtube] Add two-factor account signin (TOTP only)

Additional work is required to prompt the user for the SMS or phone call codes, as there is no framework currently to prompt the user during an extraction operation.

Fixes #3533

											
										
										
											2014-08-16 23:28:41 +02:00
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        return getpass.getpass(f'Type {note} and press [Return]: ')
-												[youtube] Add two-factor account signin (TOTP only)

Additional work is required to prompt the user for the SMS or phone call codes, as there is no framework currently to prompt the user during an extraction operation.

Fixes #3533

											
										
										
											2014-08-16 23:28:41 +02:00
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											2013-07-12 19:00:19 +02:00
+								    # Helper functions for extracting OpenGraph info
 								    @staticmethod
-												Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).

											
										
										
											2013-11-15 12:24:54 +01:00
+								    def _og_regexes(prop):
-												Update to ytdl-commit-2dd6c6e

[YouTube] Avoid crash if uploader_id extraction fails
https://github.com/ytdl-org/youtube-dl/commit/2dd6c6edd8e0fc5e45865b8e6d865e35147de772

Except:
    * 295736c9cba714fb5de7d1c3dd31d86e50091cf8 [jsinterp] Improve parsing
    * 384f632e8a9b61e864a26678d85b2b39933b9bae [ITV] Overhaul ITV extractor
    * 33db85c571304bbd6863e3407ad8d08764c9e53b [feat]: Add support to external downloader aria2p

											
										
										
											2023-02-17 12:21:34 +01:00
+								        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        property_re = r'(?:name|property)=(?:\'og{sep}{prop}\'|"og{sep}{prop}"|\s*og{sep}{prop}\b)'.format(
 								            prop=re.escape(prop), sep='(?:&#x3A;|[:-])')
-												Don't accept '>' inside the content attribute in OpenGraph regexes

											
										
										
											2013-11-15 12:54:13 +01:00
+								        template = r'<meta[^>]+?%s[^>]+?%s'
-												Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).

											
										
										
											2013-11-15 12:24:54 +01:00
+								        return [
-												Don't accept '>' inside the content attribute in OpenGraph regexes

											
										
										
											2013-11-15 12:54:13 +01:00
+								            template % (property_re, content_re),
 								            template % (content_re, property_re),
-												Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).

											
										
										
											2013-11-15 12:24:54 +01:00
+								        ]
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											2013-07-12 19:00:19 +02:00
-												[extractor/common] Add _meta_regex and clarify tags field

											
										
										
											2015-07-28 23:43:03 +02:00
+								    @staticmethod
 								    def _meta_regex(prop):
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        return rf'''(?isx)<meta
 								                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?){re.escape(prop)}\1)
 								                    [^>]+?content=(["\'])(?P<content>.*?)\2'''
-												[extractor/common] Add _meta_regex and clarify tags field

											
										
										
											2015-07-28 23:43:03 +02:00
-												Improve OpenGraph property matching

											
										
										
											2013-07-13 20:39:47 +02:00
+								    def _og_search_property(self, prop, html, name=None, **kargs):
-												[utils] Add `variadic`

											
										
										
											2021-07-10 23:59:44 +02:00
+								        prop = variadic(prop)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											2013-07-12 19:00:19 +02:00
+								        if name is None:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            name = f'OpenGraph {prop[0]}'
-												[extractor/common] Support multiple properties in _og_search_property

											
										
										
											2016-08-02 17:55:14 +02:00
+								        og_regexes = []
 								        for p in prop:
 								            og_regexes.extend(self._og_regexes(p))
 								        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
-												[common] Simplify og_search_property

											
										
										
											2013-11-12 10:36:23 +01:00
+								        if escaped is None:
 								            return None
 								        return unescapeHTML(escaped)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											2013-07-12 19:00:19 +02:00
 								    def _og_search_thumbnail(self, html, **kargs):
-												[extractor/common] Consistent URL spelling

											
										
										
											2015-07-23 19:37:45 +02:00
+								        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											2013-07-12 19:00:19 +02:00
 								    def _og_search_description(self, html, **kargs):
 								        return self._og_search_property('description', html, fatal=False, **kargs)
-												[cleanup] Use `_html_extract_title`

											
										
										
											2022-04-04 10:27:35 +02:00
+								    def _og_search_title(self, html, *, fatal=False, **kargs):
 								        return self._og_search_property('title', html, fatal=fatal, **kargs)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											2013-07-12 19:00:19 +02:00
-												[Instagram] get the non-https link, as they are serving Akamai cert from a instagram.com domain

											
										
										
											2013-10-28 07:34:29 +01:00
+								    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
-												[escapist] Add support for og:video:url (Fixes #3557)

											
										
										
											2014-08-21 13:05:24 +02:00
+								        regexes = self._og_regexes('video') + self._og_regexes('video:url')
 								        if secure:
 								            regexes = self._og_regexes('video:secure_url') + regexes
-												[Instagram] get the non-https link, as they are serving Akamai cert from a instagram.com domain

											
										
										
											2013-10-28 07:34:29 +01:00
+								        return self._html_search_regex(regexes, html, name, **kargs)
-												InfoExtractor: add some helper methods to extract OpenGraph info

											
										
										
											2013-07-12 19:00:19 +02:00
-												[livestream:original] Add support for folder urls (closes #2631)

The webpage only contains shortened links for the videos, since the server
doesn't support HEAD requests, we use an specific extractor for them.

											
										
										
											2014-06-26 16:34:36 +02:00
+								    def _og_search_url(self, html, **kargs):
 								        return self._og_search_property('url', html, **kargs)
-												[cleanup] Use `_html_extract_title`

											
										
										
											2022-04-04 10:27:35 +02:00
+								    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
-												[cleanup] Minor fixes

											
										
										
											2022-05-18 05:34:30 +02:00
+								        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
-												[nhk] Add support for NHK for School (#2850)

Authored by: Lesmiscore

											
										
										
											2022-02-22 17:15:08 +01:00
-												[screencast] Add suppot for more video types (#3236)

											
										
										
											2014-07-11 15:38:18 +02:00
+								    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
-												[utils] Add `variadic`

											
										
										
											2021-07-10 23:59:44 +02:00
+								        name = variadic(name)
-												Add support for tou.tv (Fixes #1792)

											
										
										
											2013-11-20 06:13:19 +01:00
+								        if display_name is None:
-												[utils] Add support for name list in _html_search_meta

											
										
										
											2016-06-26 11:57:14 +02:00
+								            display_name = name[0]
-												Add support for tou.tv (Fixes #1792)

											
										
										
											2013-11-20 06:13:19 +01:00
+								        return self._html_search_regex(
-												[utils] Add support for name list in _html_search_meta

											
										
										
											2016-06-26 11:57:14 +02:00
+								            [self._meta_regex(n) for n in name],
-												[heise] Fix description, thumbnail and format ID

											
										
										
											2014-11-04 23:14:16 +01:00
+								            html, display_name, fatal=fatal, group='content', **kwargs)
-												Add support for tou.tv (Fixes #1792)

											
										
										
											2013-11-20 06:13:19 +01:00
 								    def _dc_search_uploader(self, html):
 								        return self._html_search_meta('dc.creator', html, 'uploader')
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    @staticmethod
 								    def _rta_search(html):
-												Allow users to specify an age limit (fixes #1545)

With these changes, users can now restrict what videos are downloaded by the intented audience, by specifying their age with --age-limit YEARS .
Add rudimentary support in youtube, pornotube, and youporn.

											
										
										
											2013-10-06 06:06:30 +02:00
+								        # See http://www.rtalabel.org/index.php?content=howtofaq#single
 								        if re.search(r'(?ix)<meta\s+name="rating"\s+'
 								                     r'     content="RTA-5042-1996-1400-1577-RTA"',
 								                     html):
 								            return 18
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
 								        # And then there are the jokers who advertise that they use RTA, but actually don't.
 								        AGE_LIMIT_MARKERS = [
 								            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								            r'>[^<]*you acknowledge you are at least (\d+) years old',
 								            r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								        ]
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
 								        age_limit = 0
 								        for marker in AGE_LIMIT_MARKERS:
 								            mobj = re.search(marker, html)
 								            if mobj:
 								                age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
 								        return age_limit
-												Allow users to specify an age limit (fixes #1545)

With these changes, users can now restrict what videos are downloaded by the intented audience, by specifying their age with --age-limit YEARS .
Add rudimentary support in youtube, pornotube, and youporn.

											
										
										
											2013-10-06 06:06:30 +02:00
-												Add support for tou.tv (Fixes #1792)

											
										
										
											2013-11-20 06:13:19 +01:00
+								    def _media_rating_search(self, html):
 								        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 								        rating = self._html_search_meta('rating', html)
 								        if not rating:
 								            return None
 								        RATING_TABLE = {
 								            'safe for kids': 0,
 								            'general': 8,
 								            '14 years': 14,
 								            'mature': 17,
 								            'restricted': 19,
 								        }
-												[refactor] Do not specify redundant None as second argument in dict.get()

											
										
										
											2016-02-14 09:25:04 +01:00
+								        return RATING_TABLE.get(rating.lower())
-												Add support for tou.tv (Fixes #1792)

											
										
										
											2013-11-20 06:13:19 +01:00
-												[extractor/common] Add new helper method _family_friendly_search

											
										
										
											2015-02-08 16:39:00 +01:00
+								    def _family_friendly_search(self, html):
-												[extractor/common] Fix link to external documentation

											
										
										
											2015-02-14 22:20:24 +01:00
+								        # See http://schema.org/VideoObject
-												[extractor/common] Make _family_friendly_search optional

											
										
										
											2017-08-12 12:11:35 +02:00
+								        family_friendly = self._html_search_meta(
 								            'isFamilyFriendly', html, default=None)
-												[extractor/common] Add new helper method _family_friendly_search

											
										
										
											2015-02-08 16:39:00 +01:00
 								        if not family_friendly:
 								            return None
 								        RATING_TABLE = {
 								            '1': 0,
 								            'true': 0,
 								            '0': 18,
 								            'false': 18,
 								        }
-												[refactor] Do not specify redundant None as second argument in dict.get()

											
										
										
											2016-02-14 09:25:04 +01:00
+								        return RATING_TABLE.get(family_friendly.lower())
-												[extractor/common] Add new helper method _family_friendly_search

											
										
										
											2015-02-08 16:39:00 +01:00
-												[bloomberg] Fix ooyala url extraction

Added a helper method to InfoExtractor for searching the ‘twitter:player’ meta property.
Now the OoyalaIE also recognizes the ‘ec’ parameter in the url as the embed code.

											
										
										
											2014-01-29 18:03:32 +01:00
+								    def _twitter_search_player(self, html):
 								        return self._html_search_meta('twitter:player', html,
-												PEP8: applied even more rules

											
										
										
											2014-11-23 21:39:15 +01:00
+								                                      'twitter card player')
-												[bloomberg] Fix ooyala url extraction

Added a helper method to InfoExtractor for searching the ‘twitter:player’ meta property.
Now the OoyalaIE also recognizes the ‘ec’ parameter in the url as the embed code.

											
										
										
											2014-01-29 18:03:32 +01:00
-												[extractor/npr] Implement e50c3500b43d80e4492569c4b4523c4379c6fbb2 differently

Closes #4141

											
										
										
											2022-06-22 22:12:39 +02:00
+								    def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
 								        """Yield all json ld objects in the html"""
 								        if default is not NO_DEFAULT:
 								            fatal = False
 								        for mobj in re.finditer(JSON_LD_RE, html):
 								            json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
 								            for json_ld in variadic(json_ld_item):
 								                if isinstance(json_ld, dict):
 								                    yield json_ld
 								    def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
 								        """Search for a video in any json ld in the html"""
 								        if default is not NO_DEFAULT:
 								            fatal = False
 								        info = self._json_ld(
 								            list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
 								            video_id, fatal=fatal, expected_type=expected_type)
 								        if info:
 								            return info
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											2020-05-02 18:40:30 +02:00
+								        if default is not NO_DEFAULT:
 								            return default
 								        elif fatal:
 								            raise RegexNotFoundError('Unable to extract JSON-LD')
 								        else:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            self.report_warning(f'unable to extract JSON-LD {bug_reports_message()}')
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											2020-05-02 18:40:30 +02:00
+								            return {}
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											2016-01-15 19:36:02 +01:00
-												[extractor/common] Add expected_type in json ld routines

											
										
										
											2016-07-08 22:28:04 +02:00
+								    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								        if isinstance(json_ld, str):
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											2016-01-15 19:36:02 +01:00
+								            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 								        if not json_ld:
 								            return {}
 								        info = {}
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								        INTERACTION_TYPE_MAP = {
 								            'CommentAction': 'comment',
 								            'AgreeAction': 'like',
 								            'DisagreeAction': 'dislike',
 								            'LikeAction': 'like',
 								            'DislikeAction': 'dislike',
 								            'ListenAction': 'view',
 								            'WatchAction': 'view',
 								            'ViewAction': 'view',
 								        }
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								        def is_type(e, *expected_types):
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            type_ = variadic(traverse_obj(e, '@type'))
 								            return any(x in type_ for x in expected_types)
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								        def extract_interaction_type(e):
 								            interaction_type = e.get('interactionType')
 								            if isinstance(interaction_type, dict):
 								                interaction_type = interaction_type.get('@type')
 								            return str_or_none(interaction_type)
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								        def extract_interaction_statistic(e):
 								            interaction_statistic = e.get('interactionStatistic')
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								            if isinstance(interaction_statistic, dict):
 								                interaction_statistic = [interaction_statistic]
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								            if not isinstance(interaction_statistic, list):
 								                return
 								            for is_e in interaction_statistic:
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                if not is_type(is_e, 'InteractionCounter'):
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								                    continue
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								                interaction_type = extract_interaction_type(is_e)
 								                if not interaction_type:
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								                    continue
-												[extractor/common] Relax interaction count extraction in _json_ld

											
										
										
											2020-09-19 01:33:17 +02:00
+								                # For interaction count some sites provide string instead of
 								                # an integer (as per spec) with non digit characters (e.g. ",")
 								                # so extracting count with more relaxed str_to_int
 								                interaction_count = str_to_int(is_e.get('userInteractionCount'))
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								                if interaction_count is None:
 								                    continue
 								                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
 								                if not count_kind:
 								                    continue
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                count_key = f'{count_kind}_count'
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								                if info.get(count_key) is not None:
 								                    continue
 								                info[count_key] = interaction_count
-												[extractor] Extract chapters from JSON-LD (#2031)

Authored by: iw0nderhow, pukkandan
											
										
										
											2022-01-01 22:07:00 +01:00
+								        def extract_chapter_information(e):
 								            chapters = [{
 								                'title': part.get('name'),
 								                'start_time': part.get('startOffset'),
 								                'end_time': part.get('endOffset'),
-												[generic] Allow further processing of json_ld URL
Closes #2578

											
										
										
											2022-02-02 02:58:01 +01:00
+								            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
-												[extractor] Extract chapters from JSON-LD (#2031)

Authored by: iw0nderhow, pukkandan
											
										
										
											2022-01-01 22:07:00 +01:00
+								            for idx, (last_c, current_c, next_c) in enumerate(zip(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    [{'end_time': 0}, *chapters], chapters, chapters[1:])):
-												[extractor] Extract chapters from JSON-LD (#2031)

Authored by: iw0nderhow, pukkandan
											
										
										
											2022-01-01 22:07:00 +01:00
+								                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
 								                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
 								                if None in current_c.values():
 								                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
 								                    return
 								            if chapters:
 								                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
 								                info['chapters'] = chapters
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
+								        def extract_video_object(e):
-												Update to ytdl-commit-4fb25ff

[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

											
										
										
											2021-04-10 18:47:11 +02:00
+								            author = e.get('author')
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
+								            info.update({
-												[extractor/npr] Implement e50c3500b43d80e4492569c4b4523c4379c6fbb2 differently

Closes #4141

											
										
										
											2022-06-22 22:12:39 +02:00
+								                'url': url_or_none(e.get('contentUrl')),
-												[extractor] Improve json+ld extraction

Related #5035

											
										
										
											2022-09-26 23:00:50 +02:00
+								                'ext': mimetype2ext(e.get('encodingFormat')),
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
+								                'title': unescapeHTML(e.get('name')),
 								                'description': unescapeHTML(e.get('description')),
-												[extractor/StarTrek] Add extractor (#4191)

Authored by: scy
											
										
										
											2022-07-13 20:29:44 +02:00
+								                'thumbnails': [{'url': unescapeHTML(url)}
-												[cleanup] Minor fixes

											
										
										
											2022-05-18 05:34:30 +02:00
+								                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
 								                               if url_or_none(url)],
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
+								                'duration': parse_duration(e.get('duration')),
 								                'timestamp': unified_timestamp(e.get('uploadDate')),
-												Update to ytdl-commit-4fb25ff

[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

											
										
										
											2021-04-10 18:47:11 +02:00
+								                # author can be an instance of 'Organization' or 'Person' types.
 								                # both types can have 'name' property(inherited from 'Thing' type). [1]
 								                # however some websites are using 'Text' type instead.
 								                # 1. https://schema.org/VideoObject
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
-												[extractor] Improve json+ld extraction

Related #5035

											
										
										
											2022-09-26 23:00:50 +02:00
+								                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
-												[cleanup] Misc fixes

Closes #4027

											
										
										
											2022-06-10 21:03:54 +02:00
+								                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
+								                'tbr': int_or_none(e.get('bitrate')),
 								                'width': int_or_none(e.get('width')),
 								                'height': int_or_none(e.get('height')),
-												[extractor/common] Extract view count from JSON-LD

											
										
										
											2017-04-30 16:11:55 +02:00
+								                'view_count': int_or_none(e.get('interactionCount')),
-												[extractor] Improve json+ld extraction

Related #5035

											
										
										
											2022-09-26 23:00:50 +02:00
+								                'tags': try_call(lambda: e.get('keywords').split(',')),
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
+								            })
-												[extractor] Improve json+ld extraction

Related #5035

											
										
										
											2022-09-26 23:00:50 +02:00
+								            if is_type(e, 'AudioObject'):
 								                info.update({
 								                    'vcodec': 'none',
 								                    'abr': int_or_none(e.get('bitrate')),
 								                })
-												[extractor/common] Extract interaction statistic

											
										
										
											2018-04-27 21:48:03 +02:00
+								            extract_interaction_statistic(e)
-												[extractor] Extract chapters from JSON-LD (#2031)

Authored by: iw0nderhow, pukkandan
											
										
										
											2022-01-01 22:07:00 +01:00
+								            extract_chapter_information(e)
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
-												[extractor] Support default implicit graph in JSON-LD (#1983)

Original PR: https://github.com/ytdl-org/youtube-dl/pull/30229

Per W3C JSON-LD v1.1 §4.9 (non-normative ref):

    When a JSON-LD document's top-level structure is a map that contains
    no other keys than @graph and optionally @context (properties that
    are not mapped to an IRI or a keyword are ignored), @graph is
    considered to express the otherwise implicit default graph.

Authored by: zmousm
											
										
										
											2021-12-16 21:46:30 +01:00
+								        def traverse_json_ld(json_ld, at_top_level=True):
-												[extractor/common] Fix `json_ld` type checks (#5145)

Closes #5144, #5143
Authored by: Grub4K
											
										
										
											2022-10-09 05:17:58 +02:00
+								            for e in variadic(json_ld):
 								                if not isinstance(e, dict):
 								                    continue
-												[extractor] Support default implicit graph in JSON-LD (#1983)

Original PR: https://github.com/ytdl-org/youtube-dl/pull/30229

Per W3C JSON-LD v1.1 §4.9 (non-normative ref):

    When a JSON-LD document's top-level structure is a map that contains
    no other keys than @graph and optionally @context (properties that
    are not mapped to an IRI or a keyword are ignored), @graph is
    considered to express the otherwise implicit default graph.

Authored by: zmousm
											
										
										
											2021-12-16 21:46:30 +01:00
+								                if at_top_level and '@context' not in e:
 								                    continue
 								                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
-												[extractor/common] Fix `json_ld` type checks (#5145)

Closes #5144, #5143
Authored by: Grub4K
											
										
										
											2022-10-09 05:17:58 +02:00
+								                    traverse_json_ld(e['@graph'], at_top_level=False)
-												[extractor/zeenews] Add extractor (#5289)

Closes #4967 
Authored by: m4tu4g, pukkandan
											
										
										
											2022-10-19 23:47:18 +02:00
+								                    continue
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                if expected_type is not None and not is_type(e, expected_type):
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											2020-05-02 18:40:30 +02:00
+								                    continue
-												[extractor] Extract `average_rating` from JSON-LD
Eg: Crunchyroll

											
										
										
											2021-11-23 08:41:28 +01:00
+								                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
 								                if rating is not None:
 								                    info['average_rating'] = rating
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                if is_type(e, 'TVEpisode', 'Episode'):
-												[extractor/common] Use episode name as title in _json_ld

											
										
										
											2019-01-08 04:02:49 +01:00
+								                    episode_name = unescapeHTML(e.get('name'))
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											2016-08-05 18:14:32 +02:00
+								                    info.update({
-												[extractor/common] Use episode name as title in _json_ld

											
										
										
											2019-01-08 04:02:49 +01:00
+								                        'episode': episode_name,
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											2016-08-05 18:14:32 +02:00
+								                        'episode_number': int_or_none(e.get('episodeNumber')),
 								                        'description': unescapeHTML(e.get('description')),
 								                    })
-												[extractor/common] Use episode name as title in _json_ld

											
										
										
											2019-01-08 04:02:49 +01:00
+								                    if not info.get('title') and episode_name:
 								                        info['title'] = episode_name
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											2016-08-05 18:14:32 +02:00
+								                    part_of_season = e.get('partOfSeason')
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
-												[extractor/common] Extract season in _json_ld

											
										
										
											2019-01-26 22:36:58 +01:00
+								                        info.update({
 								                            'season': unescapeHTML(part_of_season.get('name')),
 								                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
 								                        })
-												[common] extract partOfTVSeries info in json-ld

											
										
										
											2016-08-06 19:58:38 +02:00
+								                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											2016-08-05 18:14:32 +02:00
+								                        info['series'] = unescapeHTML(part_of_series.get('name'))
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                elif is_type(e, 'Movie'):
-												[extractor/common] Add support for movies in _json_ld

											
										
										
											2019-01-08 04:02:00 +01:00
+								                    info.update({
 								                        'title': unescapeHTML(e.get('name')),
 								                        'description': unescapeHTML(e.get('description')),
 								                        'duration': parse_duration(e.get('duration')),
 								                        'timestamp': unified_timestamp(e.get('dateCreated')),
 								                    })
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                elif is_type(e, 'Article', 'NewsArticle'):
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											2016-08-05 18:14:32 +02:00
+								                    info.update({
 								                        'timestamp': parse_iso8601(e.get('datePublished')),
 								                        'title': unescapeHTML(e.get('headline')),
-												[extractor] Support default implicit graph in JSON-LD (#1983)

Original PR: https://github.com/ytdl-org/youtube-dl/pull/30229

Per W3C JSON-LD v1.1 §4.9 (non-normative ref):

    When a JSON-LD document's top-level structure is a map that contains
    no other keys than @graph and optionally @context (properties that
    are not mapped to an IRI or a keyword are ignored), @graph is
    considered to express the otherwise implicit default graph.

Authored by: zmousm
											
										
										
											2021-12-16 21:46:30 +01:00
+								                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
-												[extractor/common] Support root JSON-LD lists (Closes #10203)

											
										
										
											2016-08-05 18:14:32 +02:00
+								                    })
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
-												[extractor] Extract video inside `Article` json_ld

Closes #2448

											
										
										
											2022-01-23 21:10:05 +01:00
+								                        extract_video_object(e['video'][0])
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
-												[extractor/npr] Use stream url from json-ld (#3455)

Closes #1934
Authored by: r5d
											
										
										
											2022-06-03 02:51:11 +02:00
+								                        extract_video_object(e['subjectOf'][0])
-												[extractor] Improve json+ld extraction

Related #5035

											
										
										
											2022-09-26 23:00:50 +02:00
+								                elif is_type(e, 'VideoObject', 'AudioObject'):
-												[extractor/common] Add support for video of WebPage context in _json_ld (closes #12778)

											
										
										
											2017-04-18 17:21:38 +02:00
+								                    extract_video_object(e)
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											2020-05-02 18:40:30 +02:00
+								                    if expected_type is None:
 								                        continue
 								                    else:
 								                        break
-												[extractor/common] Improve _json_ld

											
										
										
											2017-06-30 17:19:06 +02:00
+								                video = e.get('video')
-												[extractor] Handle `json_ld` with multiple `@type`s

Closes: #4022

											
										
										
											2022-06-13 15:09:58 +02:00
+								                if is_type(video, 'VideoObject'):
-												[extractor/common] Improve _json_ld

											
										
										
											2017-06-30 17:19:06 +02:00
+								                    extract_video_object(video)
-												[extractor/common] Extract multiple JSON-LD entries

											
										
										
											2020-05-02 18:40:30 +02:00
+								                if expected_type is None:
 								                    continue
 								                else:
 								                    break
-												[extractor] Support default implicit graph in JSON-LD (#1983)

Original PR: https://github.com/ytdl-org/youtube-dl/pull/30229

Per W3C JSON-LD v1.1 §4.9 (non-normative ref):

    When a JSON-LD document's top-level structure is a map that contains
    no other keys than @graph and optionally @context (properties that
    are not mapped to an IRI or a keyword are ignored), @graph is
    considered to express the otherwise implicit default graph.

Authored by: zmousm
											
										
										
											2021-12-16 21:46:30 +01:00
-												[extractor/common] Fix `json_ld` type checks (#5145)

Closes #5144, #5143
Authored by: Grub4K
											
										
										
											2022-10-09 05:17:58 +02:00
+								        traverse_json_ld(json_ld)
-												[utils] Add `filter_dict`

											
										
										
											2022-03-28 04:51:45 +02:00
+								        return filter_dict(info)
-												[extractor/common] Add initial support for JSON-LD metadata extraction into info_dict

											
										
										
											2016-01-15 19:36:02 +01:00
-												[ie] Make `_search_nextjs_data` non fatal (#8937)

Authored by: Grub4K
											
										
										
											2024-04-21 13:40:38 +02:00
+								    def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
 								        if default == '{}':
 								            self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
 								            default = {}
 								        if default is not NO_DEFAULT:
 								            fatal = False
 								        return self._search_json(
 								            r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
 								            video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
-												[extractor] Add `_search_nextjs_data` (#1386)
Authored by: selfisekai

											
										
										
											2021-10-23 04:02:23 +02:00
-												[extractor/BiliIntl] Fix metadata extraction

Closes #4116

											
										
										
											2022-06-19 23:33:19 +02:00
+								    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
 								        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
-												[extractor] Add `_search_nuxt_data` (#1921)

Authored by: nao20010128nao
											
										
										
											2021-12-07 17:38:50 +01:00
+								        rectx = re.escape(context_name)
-												[cleanup] Misc (#8300)

* Simplify nuxt regex
* Fix tmz quotes and tests
* Update test python versions

Authored by: dirkf, gamer191, Grub4K
											
										
										
											2023-10-07 03:02:45 +02:00
+								        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
-												[extractor] Add `_search_nuxt_data` (#1921)

Authored by: nao20010128nao
											
										
										
											2021-12-07 17:38:50 +01:00
+								        js, arg_keys, arg_vals = self._search_regex(
-												[extractor/BiliIntl] Fix metadata extraction

Closes #4116

											
										
										
											2022-06-19 23:33:19 +02:00
+								            (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
-												[extractor] Fix `fatal=False` for `_search_nuxt_data`

Closes #5423

											
										
										
											2022-11-10 23:09:41 +01:00
+								            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
 								            default=NO_DEFAULT if fatal else (None, None, None))
 								        if js is None:
 								            return {}
-												[extractor] Add `_search_nuxt_data` (#1921)

Authored by: nao20010128nao
											
										
										
											2021-12-07 17:38:50 +01:00
-												[extractor/common] Fix `_search_nuxt_data` (#6062)

Authored by: LowSuggestion912
											
										
										
											2023-02-12 08:25:24 +01:00
+								        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
 								            f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
-												[extractor] Add `_search_nuxt_data` (#1921)

Authored by: nao20010128nao
											
										
										
											2021-12-07 17:38:50 +01:00
-												[extractor/BiliIntl] Fix metadata extraction

Closes #4116

											
										
										
											2022-06-19 23:33:19 +02:00
+								        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
 								        return traverse_obj(ret, traverse) or {}
-												[extractor] Add `_search_nuxt_data` (#1921)

Authored by: nao20010128nao
											
										
										
											2021-12-07 17:38:50 +01:00
-												[extractor/common] Add method for extracting form hidden input fields as dict

											
										
										
											2015-07-10 17:49:09 +02:00
+								    @staticmethod
-												[extractor/common] Improve _form_hidden_inputs and rename to _hidden_inputs

											
										
										
											2015-07-14 18:36:30 +02:00
+								    def _hidden_inputs(html):
-												[extractor/common] Skip html comment tags (Closes #6822)

											
										
										
											2015-09-11 17:07:32 +02:00
+								        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
-												[extractor/common] Improve _hidden_inputs

											
										
										
											2015-08-15 17:52:22 +02:00
+								        hidden_inputs = {}
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
 								            attrs = extract_attributes(input_el)
 								            if not input_el:
-												[extractor/common] Improve _hidden_inputs

											
										
										
											2015-08-15 17:52:22 +02:00
+								                continue
-												[utils] Improve _hidden_inputs

											
										
										
											2016-09-15 16:54:48 +02:00
+								            if attrs.get('type') not in ('hidden', 'submit'):
-												[extractor/common] Improve _hidden_inputs

											
										
										
											2015-08-15 17:52:22 +02:00
+								                continue
-												[utils] Improve _hidden_inputs

											
										
										
											2016-09-15 16:54:48 +02:00
+								            name = attrs.get('name') or attrs.get('id')
 								            value = attrs.get('value')
 								            if name and value is not None:
 								                hidden_inputs[name] = value
-												[extractor/common] Improve _hidden_inputs

											
										
										
											2015-08-15 17:52:22 +02:00
+								        return hidden_inputs
-												[extractor/common] Add method for extracting form hidden input fields as dict

											
										
										
											2015-07-10 17:49:09 +02:00
-												[extractor/common] Add _form_hidden_inputs

											
										
										
											2015-07-14 18:38:10 +02:00
+								    def _form_hidden_inputs(self, form_id, html):
 								        form = self._search_regex(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            rf'(?is)<form[^>]+?id=(["\']){form_id}\1[^>]*>(?P<form>.+?)</form>',
 								            html, f'{form_id} form', group='form')
-												[extractor/common] Add _form_hidden_inputs

											
										
										
											2015-07-14 18:38:10 +02:00
+								        return self._hidden_inputs(form)
-												[utils] Move format sorting code into `utils`

											
										
										
											2022-11-17 06:33:20 +01:00
+								    @classproperty(cache=True)
 								    def FormatSort(cls):
 								        class FormatSort(FormatSorter):
 								            def __init__(ie, *args, **kwargs):
 								                super().__init__(ie._downloader, *args, **kwargs)
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now takes priority as string, not number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											2020-10-26 16:50:09 +01:00
-												[utils] Move format sorting code into `utils`

											
										
										
											2022-11-17 06:33:20 +01:00
+								        deprecation_warning(
 								            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
 								            'Use yt_dlp.utils.FormatSorter instead')
 								        return FormatSort
-												Better Format Sorting (Squashed)

* Added --format-sort (-S height,filesize)
* Made fields reversible (-S +height)
* Added --format-sort-force, --no-format-sort-force
* Added limit (-S height:720)
* Added codec preference (-S vcodec,acodec)
* Correct handling of preference<-1000

* Rebased to yt-dlc
* Automatically determine missing bitrates
* aext, vext, protocol, acodec, vcodec can now take a priority as a string, not a number (-S vext:webm)
* Correct handling of None in codec, audio_codec (None means the codec is unknown while 'none' means it doesn't exist)
* Correctly parse filesize (-S filesize:200M)
* Generalized preference calculation

* Rewrote entire code into the class FormatSort
* Correctly handle user input errors
* Combined fields (-S +ext:webm:webm)
* Closest mode (-S filesize~50M)
* Aliases (framerate=fps, br=bitrate etc)

* Documentation

											
										
										
											2020-10-26 16:50:09 +01:00
 								    def _sort_formats(self, formats, field_preference=[]):
-												[extractor] Deprecate `_sort_formats`

											
										
										
											2022-11-17 06:10:03 +01:00
+								        if not field_preference:
 								            self._downloader.deprecation_warning(
 								                'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
 								            return
 								        self._downloader.deprecation_warning(
 								            'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
 								            'Return _format_sort_fields in the info_dict instead')
 								        if formats:
-												Implement universal format sorting

Closes #5566

											
										
										
											2022-11-17 06:23:05 +01:00
+								            formats[0]['__sort_fields'] = field_preference
-												Add support for tou.tv (Fixes #1792)

											
										
										
											2013-11-20 06:13:19 +01:00
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											2015-01-25 19:32:31 +01:00
+								    def _check_formats(self, formats, video_id):
 								        if formats:
 								            formats[:] = filter(
 								                lambda f: self._is_valid_url(
 								                    f['url'], video_id,
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    item='{} video format'.format(f.get('format_id')) if f.get('format_id') else 'video'),
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											2015-01-25 19:32:31 +01:00
+								                formats)
-												[extractor/common] Add _remove_duplicate_formats

											
										
										
											2016-02-21 20:19:39 +01:00
+								    @staticmethod
 								    def _remove_duplicate_formats(formats):
 								        format_urls = set()
 								        unique_formats = []
 								        for f in formats:
 								            if f['url'] not in format_urls:
 								                format_urls.add(f['url'])
 								                unique_formats.append(f)
 								        formats[:] = unique_formats
-												[infoq] Add audio only format if available (#11565)

* [infoq] Add audio only format if available

Refactor cookie code into a function.
Renamed formats to http_video, http_audio, rtmp_video
Renamed extract functions to video instead of videos as they return
one or no video.

* [infoq] Rename to _extract_cookies as it more than one

* [infoq] Remove redundant determine_ext

* [infoq] Add comment about hardcoded URL

* [infoq] Use _hidden_inputs instead of messy regex

* [infoq] Probe if audio URL is valid

Make it possible to pass headers to _is_valid_url

* [infoq] Add audio only test

											
										
										
											2017-02-03 05:10:13 +01:00
+								    def _is_valid_url(self, url, video_id, item='video', headers={}):
-												[extractor/common] Assume non HTTP(S) URLs valid

											
										
										
											2015-03-02 17:38:44 +01:00
+								        url = self._proto_relative_url(url, scheme='http:')
 								        # For now assume non HTTP(S) URLs always valid
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        if not url.startswith(('http://', 'https://')):
-												[extractor/common] Assume non HTTP(S) URLs valid

											
										
										
											2015-03-02 17:38:44 +01:00
+								            return True
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											2015-01-25 19:32:31 +01:00
+								        try:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            self._request_webpage(url, video_id, f'Checking {item} URL', headers=headers)
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											2015-01-25 19:32:31 +01:00
+								            return True
-												Merge 'ytdl-org/youtube-dl/master' release 2020.11.19

Old Extractors left behind:
	VLivePlaylistIE
	YoutubeSearchURLIE
	YoutubeShowIE
	YoutubeFavouritesIE

If removing old extractors, make corresponding changes in
	docs/supportedsites.md
	youtube_dlc/extractor/extractors.py

Not merged:
	.github/ISSUE_TEMPLATE/1_broken_site.md
	.github/ISSUE_TEMPLATE/2_site_support_request.md
	.github/ISSUE_TEMPLATE/3_site_feature_request.md
	.github/ISSUE_TEMPLATE/4_bug_report.md
	.github/ISSUE_TEMPLATE/5_feature_request.md
	test/test_all_urls.py
	youtube_dlc/version.py
	Changelog

											
										
										
											2020-11-19 20:22:59 +01:00
+								        except ExtractorError as e:
-												[extractor/common] Make _is_valid_url more relaxed

											
										
										
											2019-10-02 19:53:07 +02:00
+								            self.to_screen(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                f'{video_id}: {item} URL is invalid, skipping: {e.cause!s}')
-												[extractor/common] Make _is_valid_url more relaxed

											
										
										
											2019-10-02 19:53:07 +02:00
+								            return False
-												[common] Generalize URLs' HTTP errors pre-testing

											
										
										
											2015-01-25 19:32:31 +01:00
-												[soundcloud/generic] Add support for playlists

											
										
										
											2014-05-05 03:12:41 +02:00
+								    def http_scheme(self):
-												[glide] Simplify

											
										
										
											2014-10-24 15:34:19 +02:00
+								        """ Either "http:" or "https:", depending on the user's preferences """
-												[soundcloud/generic] Add support for playlists

											
										
										
											2014-05-05 03:12:41 +02:00
+								        return (
 								            'http:'
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								            if self.get_param('prefer_insecure', False)
-												[soundcloud/generic] Add support for playlists

											
										
										
											2014-05-05 03:12:41 +02:00
+								            else 'https:')
-												[mixcloud] Shed API dependency (#2904)

											
										
										
											2014-05-13 09:42:38 +02:00
+								    def _proto_relative_url(self, url, scheme=None):
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								        scheme = scheme or self.http_scheme()
 								        assert scheme.endswith(':')
 								        return sanitize_url(url, scheme=scheme[:-1])
-												[mixcloud] Shed API dependency (#2904)

											
										
										
											2014-05-13 09:42:38 +02:00
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											2014-07-11 10:57:08 +02:00
+								    def _sleep(self, timeout, video_id, msg_template=None):
 								        if msg_template is None:
-												[extractor/common] Modernize

											
										
										
											2014-08-28 01:04:43 +02:00
+								            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
-												[vodlocker] PEP8, generalization, and simplification (#3223)

											
										
										
											2014-07-11 10:57:08 +02:00
+								        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 								        self.to_screen(msg)
 								        time.sleep(timeout)
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
-												[extractor/common] Add fatal to _extract_f4m_formats

											
										
										
											2015-10-01 19:03:31 +02:00
+								                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											2019-11-15 23:44:14 +01:00
+								                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
-												[extractor] Let `_extract_format` functions obey `--ignore-no-formats`

											
										
										
											2022-12-15 15:28:57 +01:00
+								        if self.get_param('ignore_no_formats_error'):
 								            fatal = False
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        res = self._download_xml_handle(
-												[extractor/common] _extract_f4m_formats: Use more specific messages when downloading the manifest

											
										
										
											2014-07-28 15:42:19 +02:00
+								            manifest_url, video_id, 'Downloading f4m manifest',
-												[extractor/common] Handle malformed f4m manifests

											
										
										
											2015-07-15 21:14:08 +02:00
+								            'Unable to download f4m manifest',
 								            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
-												Start moving to ytdl-org

											
										
										
											2019-03-09 13:14:41 +01:00
+								            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
-												[extractor/common] Add fatal to _extract_f4m_formats

											
										
										
											2015-10-01 19:03:31 +02:00
+								            transform_source=transform_source,
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											2019-11-15 23:44:14 +01:00
+								            fatal=fatal, data=data, headers=headers, query=query)
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        if res is False:
-												[common] simplify the use of _extract_m3u8_formats and _extract_f4m_formats

											
										
										
											2015-12-27 15:33:39 +01:00
+								            return []
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											2014-07-28 15:25:56 +02:00
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        manifest, urlh = res
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        manifest_url = urlh.url
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
-												[extractor/common] Add _parse_f4m_formats routine

											
										
										
											2016-03-12 22:16:08 +01:00
+								        return self._parse_f4m_formats(
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
-												[common] Fix non-bootstrapped support in f4m

											
										
										
											2016-05-26 18:03:03 +02:00
+								            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
-												[extractor/common] Add _parse_f4m_formats routine

											
										
										
											2016-03-12 22:16:08 +01:00
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
-												[extractor/common] Add _parse_f4m_formats routine

											
										
										
											2016-03-12 22:16:08 +01:00
+								                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
-												[common] Fix non-bootstrapped support in f4m

											
										
										
											2016-05-26 18:03:03 +02:00
+								                           fatal=True, m3u8_id=None):
-												[cleanup] Mark some compat variables for removal (#2173)

Authored by fstirlitz, pukkandan

											
										
										
											2022-04-11 22:09:26 +02:00
+								        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
-												[extractor/common] Do not fail on invalid data while parsing F4M manifest in non fatal mode

											
										
										
											2019-03-05 17:45:40 +01:00
+								            return []
-												Completely change project name to yt-dlp (#85)

* All modules and binary names are changed
* All documentation references changed
* yt-dlp no longer loads youtube-dlc config files
* All URLs changed to point to organization account

Co-authored-by: Pccode66
Co-authored-by: pukkandan
											
										
										
											2021-02-24 19:45:56 +01:00
+								        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
-												[extractor/common] do not process f4m manifest that contain akamai playerVerificationChallenge

											
										
										
											2015-07-30 18:34:38 +02:00
+								        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
 								        if akamai_pv is not None and ';' in akamai_pv.text:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            player_verification_challenge = akamai_pv.text.split(';')[0]
 								            if player_verification_challenge.strip() != '':
-												[extractor/common] do not process f4m manifest that contain akamai playerVerificationChallenge

											
										
										
											2015-07-30 18:34:38 +02:00
+								                return []
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											2014-07-28 15:25:56 +02:00
+								        formats = []
-												[extractor/common] href attribute added

											
										
										
											2014-10-24 06:17:39 +02:00
+								        manifest_version = '1.0'
-												[extractor/common] Generate better f4m format IDs

											
										
										
											2014-08-25 13:03:08 +02:00
+								        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
-												[extractor/common] Added support for f4m manifest Version 2.0

											
										
										
											2014-10-23 23:11:10 +02:00
+								        if not media_nodes:
-												[extractor/common] href attribute added

											
										
										
											2014-10-24 06:17:39 +02:00
+								            manifest_version = '2.0'
-												[extractor/common] Added support for f4m manifest Version 2.0

											
										
										
											2014-10-23 23:11:10 +02:00
+								            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
-												[extractor/common] Filter out unsupported encrypted media for f4m formats (Closes #8573)

											
										
										
											2016-03-27 03:42:38 +02:00
+								        # Remove unsupported DRM protected media from final formats
-												Start moving to ytdl-org

											
										
										
											2019-03-09 13:14:41 +01:00
+								        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
-												[extractor/common] Filter out unsupported encrypted media for f4m formats (Closes #8573)

											
										
										
											2016-03-27 03:42:38 +02:00
+								        media_nodes = remove_encrypted_media(media_nodes)
 								        if not media_nodes:
 								            return formats
-												[f4m] Prefer baseURL for relative URLs (closes #14660)

											
										
										
											2017-11-04 16:10:55 +01:00
 								        manifest_base_url = get_base_url(manifest)
-												[common] Support non-bootstraped streams in f4m manifests

Related: #9531

											
										
										
											2016-05-26 15:41:47 +02:00
-												[common] Fix <bootstrapInfo> detection in F4M manifests

Regression since 0a5685b26fae0940f14cb063a6e4fc6986f9c124

											
										
										
											2016-06-07 18:19:33 +02:00
+								        bootstrap_info = xpath_element(
-												[common] Support non-bootstraped streams in f4m manifests

Related: #9531

											
										
										
											2016-05-26 15:41:47 +02:00
+								            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
 								            'bootstrap info', default=None)
-												[extractor/common] detect f4m audio only formats

											
										
										
											2016-10-19 15:42:48 +02:00
+								        vcodec = None
 								        mime_type = xpath_text(
 								            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
 								            'base URL', default=None)
 								        if mime_type and mime_type.startswith('audio/'):
 								            vcodec = 'none'
-												[extractor/common] Generate better f4m format IDs

											
										
										
											2014-08-25 13:03:08 +02:00
+								        for i, media_el in enumerate(media_nodes):
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											2016-05-26 21:47:44 +02:00
+								            tbr = int_or_none(media_el.attrib.get('bitrate'))
 								            width = int_or_none(media_el.attrib.get('width'))
 								            height = int_or_none(media_el.attrib.get('height'))
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								            format_id = join_nonempty(f4m_id, tbr or i)
-												[common] Fix non-bootstrapped support in f4m

											
										
										
											2016-05-26 18:03:03 +02:00
+								            # If <bootstrapInfo> is present, the specified f4m is a
 								            # stream-level manifest, and only set-level manifests may refer to
 								            # external resources.  See section 11.4 and section 4 of F4M spec
 								            if bootstrap_info is None:
 								                media_url = None
 								                # @href is introduced in 2.0, see section 11.6 of F4M spec
 								                if manifest_version == '2.0':
 								                    media_url = media_el.attrib.get('href')
 								                if media_url is None:
 								                    media_url = media_el.attrib.get('url')
-												[extractor/common] Keep going in some media_url is missing

											
										
										
											2015-07-15 21:25:33 +02:00
+								                if not media_url:
 								                    continue
-												[extractor/common] Properly handle full URLs

											
										
										
											2015-07-15 21:14:52 +02:00
+								                manifest_url = (
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    media_url if media_url.startswith(('http://', 'https://'))
-												[f4m] Prefer baseURL for relative URLs (closes #14660)

											
										
										
											2017-11-04 16:10:55 +01:00
+								                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
-												[extractor/common] Recursively extract child f4m manifests

											
										
										
											2015-07-15 21:15:15 +02:00
+								                # If media_url is itself a f4m manifest do the recursive extraction
 								                # since bitrates in parent manifest (this one) and media_url manifest
 								                # may differ leading to inability to resolve the format by requested
 								                # bitrate in f4m downloader
-												[common] Support m3u8 in f4m manifests

Related: #9531

											
										
										
											2016-05-26 15:55:43 +02:00
+								                ext = determine_ext(manifest_url)
 								                if ext == 'f4m':
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											2016-05-26 21:47:44 +02:00
+								                    f4m_formats = self._extract_f4m_formats(
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											2016-05-26 21:47:44 +02:00
+								                        transform_source=transform_source, fatal=fatal)
 								                    # Sometimes stream-level manifest contains single media entry that
 								                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
 								                    # At the same time parent's media entry in set-level manifest may
 								                    # contain it. We will copy it from parent in such cases.
 								                    if len(f4m_formats) == 1:
 								                        f = f4m_formats[0]
 								                        f.update({
 								                            'tbr': f.get('tbr') or tbr,
 								                            'width': f.get('width') or width,
 								                            'height': f.get('height') or height,
 								                            'format_id': f.get('format_id') if not tbr else format_id,
-												[extractor/common] detect f4m audio only formats

											
										
										
											2016-10-19 15:42:48 +02:00
+								                            'vcodec': vcodec,
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											2016-05-26 21:47:44 +02:00
+								                        })
 								                    formats.extend(f4m_formats)
-												[extractor/common] Recursively extract child f4m manifests

											
										
										
											2015-07-15 21:15:15 +02:00
+								                    continue
-												[common] Support m3u8 in f4m manifests

Related: #9531

											
										
										
											2016-05-26 15:55:43 +02:00
+								                elif ext == 'm3u8':
 								                    formats.extend(self._extract_m3u8_formats(
 								                        manifest_url, video_id, 'mp4', preference=preference,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
-												[common] Support m3u8 in f4m manifests

Related: #9531

											
										
										
											2016-05-26 15:55:43 +02:00
+								                    continue
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											2014-07-28 15:25:56 +02:00
+								            formats.append({
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											2016-05-26 21:47:44 +02:00
+								                'format_id': format_id,
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											2014-07-28 15:25:56 +02:00
+								                'url': manifest_url,
-												[extractor/common] Add manifest_url for hls and hds formats

											
										
										
											2016-09-17 16:33:38 +02:00
+								                'manifest_url': manifest_url,
-												[common] Fix <bootstrapInfo> detection in F4M manifests

Regression since 0a5685b26fae0940f14cb063a6e4fc6986f9c124

											
										
										
											2016-06-07 18:19:33 +02:00
+								                'ext': 'flv' if bootstrap_info is not None else None,
-												[extractor/common] Add protocol for f4m formats

											
										
										
											2017-11-04 16:11:39 +01:00
+								                'protocol': 'f4m',
-												[extractor/common] Generate better f4m format IDs

											
										
										
											2014-08-25 13:03:08 +02:00
+								                'tbr': tbr,
-												[extractor/common] Borrow quality metadata from parent set-level manifest for f4m

											
										
										
											2016-05-26 21:47:44 +02:00
+								                'width': width,
 								                'height': height,
-												[extractor/common] detect f4m audio only formats

											
										
										
											2016-10-19 15:42:48 +02:00
+								                'vcodec': vcodec,
-												[extractor/common] Prefix f4m/m3u8 entries with identifier

											
										
										
											2015-02-05 17:16:27 +01:00
+								                'preference': preference,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								                'quality': quality,
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											2014-07-28 15:25:56 +02:00
+								            })
 								        return formats
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
-												[common] Add _m3u8_meta_format() template

For extractors who handle m3u8 manifests by themselves. (eg., AnvatoIE)

Part of #9522

											
										
										
											2016-05-21 07:15:28 +02:00
+								        return {
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								            'format_id': join_nonempty(m3u8_id, 'meta'),
-												[sportdeutschland] add new extractor

											
										
										
											2014-08-26 12:51:13 +02:00
+								            'url': m3u8_url,
 								            'ext': ext,
 								            'protocol': 'm3u8',
-												[common] correctly lower the preference of m3u8 master manifest format

											
										
										
											2016-08-07 11:58:11 +02:00
+								            'preference': preference - 100 if preference else -100,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								            'quality': quality,
-												[sportdeutschland] add new extractor

											
										
										
											2014-08-26 12:51:13 +02:00
+								            'resolution': 'multiple',
 								            'format_note': 'Quality selection URL',
-												[common] Add _m3u8_meta_format() template

For extractors who handle m3u8 manifests by themselves. (eg., AnvatoIE)

Part of #9522

											
										
										
											2016-05-21 07:15:28 +02:00
+								        }
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								    def _report_ignoring_subs(self, name):
 								        self.report_warning(bug_reports_message(
 								            f'Ignoring subtitle tracks found in the {name} manifest; '
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            'if any subtitle tracks are missing,',
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								        ), only_once=True)
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								    def _extract_m3u8_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
 								        if subs:
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								            self._report_ignoring_subs('HLS')
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								        return fmts
 								    def _extract_m3u8_formats_and_subtitles(
-												[extractor] Always prefer native hls downloader by default

When the manifest is not downloadable by native downloader, it already is able to detect it and switch to `ffmpeg`. So there doesn't seem to be a reason anymore to use ffmpeg as the preferred downloader

											
										
										
											2021-05-22 20:28:11 +02:00
+								            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								            preference=None, quality=None, m3u8_id=None, note=None,
 								            errnote=None, fatal=True, live=False, data=None, headers={},
 								            query={}):
-												[extractor] Let `_extract_format` functions obey `--ignore-no-formats`

											
										
										
											2022-12-15 15:28:57 +01:00
+								        if self.get_param('ignore_no_formats_error'):
 								            fatal = False
-												[cleanup] Misc

											
										
										
											2022-11-30 07:04:51 +01:00
+								        if not m3u8_url:
 								            if errnote is not False:
 								                errnote = errnote or 'Failed to obtain m3u8 URL'
 								                if fatal:
 								                    raise ExtractorError(errnote, video_id=video_id)
 								                self.report_warning(f'{errnote}{bug_reports_message()}')
 								            return [], {}
-												[extractor/common] Fix m3u8 extraction on failure

											
										
										
											2015-10-31 19:01:34 +01:00
+								        res = self._download_webpage_handle(
-												[extractor/common] Improve m3u8 output

											
										
										
											2014-10-27 02:28:37 +01:00
+								            m3u8_url, video_id,
-												[extractor] Allow `note=False` when extracting manifests

											
										
										
											2021-05-29 10:52:44 +02:00
+								            note='Downloading m3u8 information' if note is None else note,
 								            errnote='Failed to download m3u8 information' if errnote is None else errnote,
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											2019-11-15 23:44:14 +01:00
+								            fatal=fatal, data=data, headers=headers, query=query)
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
-												[extractor/common] Fix m3u8 extraction on failure

											
										
										
											2015-10-31 19:01:34 +01:00
+								        if res is False:
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								            return [], {}
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
-												[extractor/common] Fix m3u8 extraction on failure

											
										
										
											2015-10-31 19:01:34 +01:00
+								        m3u8_doc, urlh = res
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        m3u8_url = urlh.url
-												[extractor/common] Clarify rationale on media playlist detection

											
										
										
											2016-02-27 02:01:11 +01:00
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								        return self._parse_m3u8_formats_and_subtitles(
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								            preference=preference, quality=quality, m3u8_id=m3u8_id,
 								            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
 								            headers=headers, query=query, video_id=video_id)
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according to stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								    def _parse_m3u8_formats_and_subtitles(
-												[iq.com] Add extractors (#2354)

Closes #704
Authored by: MinePlayersPE
											
										
										
											2022-01-19 23:23:55 +01:00
+								            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								            preference=None, quality=None, m3u8_id=None, live=False, note=None,
 								            errnote=None, fatal=True, data=None, headers={}, query={},
 								            video_id=None):
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								        formats, subtitles = [], {}
-												[core] Allow extractors to mark formats as potentially DRM (#7396)

This is useful for HLS where detecting whether the format is
actually DRM requires the child manifest to be downloaded.

Makes the error message when using `--test` inconsistent,
but doesn't really matter.

											
										
										
											2023-07-06 15:09:50 +02:00
+								        has_drm = HlsFD._has_drm(m3u8_doc)
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								        def format_url(url):
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
 								        if self.get_param('hls_split_discontinuity', False):
 								            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
 								                if not m3u8_doc:
 								                    if not manifest_url:
 								                        return []
 								                    m3u8_doc = self._download_webpage(
 								                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
 								                        note=False, errnote='Failed to download m3u8 playlist information')
 								                    if m3u8_doc is False:
 								                        return []
 								                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
-												[internetvideoarchive] extract all formats

											
										
										
											2016-09-04 12:42:15 +02:00
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								        else:
 								            def _extract_m3u8_playlist_indices(*args, **kwargs):
 								                return [None]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according to stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								        # References:
 								        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
-												Start moving to ytdl-org

											
										
										
											2019-03-09 13:14:41 +01:00
+								        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
 								        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according to stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
 								        # We should try extracting formats only from master playlists [1, 4.3.4],
 								        # i.e. playlists that describe available qualities. On the other hand
 								        # media playlists [1, 4.3.3] should be returned as is since they contain
 								        # just the media without qualities renditions.
-												[extractor/common] Clarify rationale on media playlist detection

											
										
										
											2016-02-27 02:01:11 +01:00
+								        # Fortunately, master playlist can be easily distinguished from media
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according to stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
-												Updated to release 2020.11.21.1

											
										
										
											2020-11-21 15:50:42 +01:00
+								        # master playlist tags MUST NOT appear in a media playlist and vice versa.
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according to stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
 								        # media playlist and MUST NOT appear in master playlist thus we can
 								        # clearly detect media playlist with this criterion.
-												[extractor/common] Clarify rationale on media playlist detection

											
										
										
											2016-02-27 02:01:11 +01:00
+								        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								            formats = [{
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								                'format_id': join_nonempty(m3u8_id, idx),
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                'format_index': idx,
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                'url': m3u8_url or encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'),
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                'ext': ext,
 								                'protocol': entry_protocol,
 								                'preference': preference,
 								                'quality': quality,
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											2021-08-22 22:08:38 +02:00
+								                'has_drm': has_drm,
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								            return formats, subtitles
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according to stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
 								        groups = {}
 								        last_stream_inf = {}
 								        def extract_media(x_media_line):
 								            media = parse_m3u8_attributes(x_media_line)
 								            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
 								            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
 								            if not (media_type and group_id and name):
 								                return
 								            groups.setdefault(group_id, []).append(media)
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
 								            if media_type == 'SUBTITLES':
-												[extractor] Skip subtitles without URI in m3u8 manifests
Closes #339

Authored by: hheimbuerger

											
										
										
											2021-05-23 18:59:28 +02:00
+								                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
 								                # EXT-X-MEDIA tag if the media type is SUBTITLES.
 								                # However, lack of URI has been spotted in the wild.
 								                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
 								                if not media.get('URI'):
 								                    return
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								                url = format_url(media['URI'])
 								                sub_info = {
 								                    'url': url,
 								                    'ext': determine_ext(url),
 								                }
-												[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments

											
										
										
											2021-04-28 12:47:30 +02:00
+								                if sub_info['ext'] == 'm3u8':
 								                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
 								                    # files may contain is WebVTT:
 								                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
 								                    sub_info['ext'] = 'vtt'
 								                    sub_info['protocol'] = 'm3u8_native'
-												[extractor] Allow `note=False` when extracting manifests

											
										
										
											2021-05-29 10:52:44 +02:00
+								                lang = media.get('LANGUAGE') or 'und'
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								                subtitles.setdefault(lang, []).append(sub_info)
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								            if media_type not in ('VIDEO', 'AUDIO'):
 								                return
 								            media_url = media.get('URI')
 								            if media_url:
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                manifest_url = format_url(media_url)
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                formats.extend({
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                    'format_note': name,
 								                    'format_index': idx,
 								                    'url': manifest_url,
 								                    'manifest_url': m3u8_url,
 								                    'language': media.get('LANGUAGE'),
 								                    'ext': ext,
 								                    'protocol': entry_protocol,
 								                    'preference': preference,
 								                    'quality': quality,
-												[extractor] Fix DRM detection in m3u8

Fixes https://github.com/ytdl-org/youtube-dl/issues/31693#issuecomment-1445202857

											
										
										
											2023-02-26 05:46:30 +01:00
+								                    'has_drm': has_drm,
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                    'vcodec': 'none' if media_type == 'AUDIO' else None,
 								                } for idx in _extract_m3u8_playlist_indices(manifest_url))
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
 								        def build_stream_name():
 								            # Despite specification does not mention NAME attribute for
-												[extractor/common] Rephrase comment

											
										
										
											2017-04-23 06:51:53 +02:00
+								            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
 								            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
-												[test_InfoExtractor] Add m3u8 parsing test for NAME attribute in EXT-X-STREAM-INF tag

											
										
										
											2017-04-23 06:49:57 +02:00
+								            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								            stream_name = last_stream_inf.get('NAME')
 								            if stream_name:
 								                return stream_name
 								            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
 								            # from corresponding rendition group
 								            stream_group_id = last_stream_inf.get('VIDEO')
 								            if not stream_group_id:
 								                return
 								            stream_group = groups.get(stream_group_id)
 								            if not stream_group:
 								                return stream_group_id
 								            rendition = stream_group[0]
 								            return rendition.get('NAME') or stream_group_id
-												[extractor/common] fix typo

											
										
										
											2019-01-19 21:35:02 +01:00
+								        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
-												[extractor/common] imporove HLS video only format detection(closes #18923)

											
										
										
											2019-01-19 21:25:15 +01:00
+								        # chance to detect video only formats when EXT-X-STREAM-INF tags
 								        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
 								        for line in m3u8_doc.splitlines():
 								            if line.startswith('#EXT-X-MEDIA:'):
 								                extract_media(line)
-												[sportdeutschland] add new extractor

											
										
										
											2014-08-26 12:51:13 +02:00
+								        for line in m3u8_doc.splitlines():
 								            if line.startswith('#EXT-X-STREAM-INF:'):
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								                last_stream_inf = parse_m3u8_attributes(line)
-												[sportdeutschland] add new extractor

											
										
										
											2014-08-26 12:51:13 +02:00
+								            elif line.startswith('#') or not line.strip():
 								                continue
 								            else:
-												[extractor/common] Use float for scaled tbr

											
										
										
											2017-04-23 06:33:19 +02:00
+								                tbr = float_or_none(
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											2019-05-10 22:56:22 +02:00
+								                    last_stream_inf.get('AVERAGE-BANDWIDTH')
 								                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
-												[extractor/common] Add manifest_url for hls and hds formats

											
										
										
											2016-09-17 16:33:38 +02:00
+								                manifest_url = format_url(line.strip())
-												[dailymotion] improve extraction

- extract http formats included in m3u8 manifest
- fix user extraction(closes #3553)(closes #21415)
- add support for User Authentication(closes #11491)
- fix password protected videos extraction(closes #23176)
- respect age limit option and family filter cookie value(closes #18437)
- handle video url playlist query param
- report allowed countries for geo-restricted videos

											
										
										
											2019-11-26 22:01:34 +01:00
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                for idx in _extract_m3u8_playlist_indices(manifest_url):
 								                    format_id = [m3u8_id, None, idx]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                    # Bandwidth of live streams may differ over time thus making
 								                    # format_id unpredictable. So it's better to keep provided
 								                    # format_id intact.
 								                    if not live:
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                        stream_name = build_stream_name()
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                    f = {
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								                        'format_id': join_nonempty(*format_id),
-												[extractor] Prevent unnecessary download of hls manifests
and refactor `hls_split_discontinuity` code

											
										
										
											2021-07-06 22:54:58 +02:00
+								                        'format_index': idx,
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                        'url': manifest_url,
 								                        'manifest_url': m3u8_url,
 								                        'tbr': tbr,
 								                        'ext': ext,
 								                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
 								                        'protocol': entry_protocol,
 								                        'preference': preference,
 								                        'quality': quality,
-												[extractor] Fix DRM detection in m3u8

Fixes https://github.com/ytdl-org/youtube-dl/issues/31693#issuecomment-1445202857

											
										
										
											2023-02-26 05:46:30 +01:00
+								                        'has_drm': has_drm,
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                    }
-												[ie/youtube] Extract all formats from multi-language m3u8s (#9875)

Authored by: clienthax, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
											
										
										
											2024-06-20 23:54:53 +02:00
 								                    # YouTube-specific
 								                    if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'):
 								                        f['language'] = yt_audio_content_id.split('.')[0]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                    resolution = last_stream_inf.get('RESOLUTION')
 								                    if resolution:
 								                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
 								                        if mobj:
 								                            f['width'] = int(mobj.group('width'))
 								                            f['height'] = int(mobj.group('height'))
 								                    # Unified Streaming Platform
 								                    mobj = re.search(
 								                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
 								                    if mobj:
 								                        abr, vbr = mobj.groups()
 								                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
 								                        f.update({
 								                            'vbr': vbr,
 								                            'abr': abr,
 								                        })
 								                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
 								                    f.update(codecs)
 								                    audio_group_id = last_stream_inf.get('AUDIO')
 								                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
 								                    # references a rendition group MUST have a CODECS attribute.
-												[docs] Consistent use of `e.g.` (#4643)

Authored by: Lesmiscore
											
										
										
											2022-08-14 14:04:13 +02:00
+								                    # However, this is not always respected. E.g. [2]
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                    # contains EXT-X-STREAM-INF tag which references AUDIO
 								                    # rendition group but does not have CODECS and despite
 								                    # referencing an audio group it represents a complete
 								                    # (with audio and video) format. So, for such cases we will
 								                    # ignore references to rendition groups and treat them
 								                    # as complete formats.
 								                    if audio_group_id and codecs and f.get('vcodec') != 'none':
 								                        audio_group = groups.get(audio_group_id)
 								                        if audio_group and audio_group[0].get('URI'):
 								                            # TODO: update acodec for audio only formats with
 								                            # the same GROUP-ID
 								                            f['acodec'] = 'none'
-												Fix some videos downloading with m3u8 extension

											
										
										
											2021-03-07 18:52:12 +01:00
+								                    if not f.get('ext'):
 								                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
-												Better support HLS media discontinuity and fully support media initialization (#105)

* Added options: `--hls-split-discontinuity` and `--no-hls-split-discontinuity`

Authored-by: shirtjs <2660574+shirtjs@users.noreply.github.com>
											
										
										
											2021-02-24 15:47:53 +01:00
+								                    formats.append(f)
 								                    # for DailyMotion
 								                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
 								                    if progressive_uri:
 								                        http_f = f.copy()
 								                        del http_f['manifest_url']
 								                        http_f.update({
 								                            'format_id': f['format_id'].replace('hls-', 'http-'),
 								                            'protocol': 'http',
 								                            'url': progressive_uri,
 								                        })
 								                        formats.append(http_f)
-												[dailymotion] improve extraction

- extract http formats included in m3u8 manifest
- fix user extraction(closes #3553)(closes #21415)
- add support for User Authentication(closes #11491)
- fix password protected videos extraction(closes #23176)
- respect age limit option and family filter cookie value(closes #18437)
- handle video url playlist query param
- report allowed countries for geo-restricted videos

											
										
										
											2019-11-26 22:01:34 +01:00
-												[extractor/common] Improve m3u8 extraction (closes #12211)
* Extract m3u8 parsing to separate method
* Improve rendition groups extraction
* Build stream name according stream GROUP-ID
* Ignore reference to AUDIO group without URI when stream has no CODECS
+ Add test coverage for parsing m3u8 from #11507, #11995, #12211 and twitch vod

											
										
										
											2017-04-22 02:01:00 +02:00
+								                last_stream_inf = {}
-												[extractor/common] Extract HLS subtitle tracks

_extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles
and extended to handle subtitle tracks instead of skipping them;
a wrapper with the old name is provided for compatibility.

_parse_m3u8_formats is likewise renamed and extended, but without adding
the compatibility wrapper; the test suite is adjusted to test the enhanced
method instead.

											
										
										
											2016-11-07 15:45:42 +01:00
+								        return formats, subtitles
-												[sportdeutschland] add new extractor

											
										
										
											2014-08-26 12:51:13 +02:00
-												[SovietsCloset] Add duration from m3u8 (#908)

Authored by: ChillingPepper
											
										
										
											2021-09-27 23:00:41 +02:00
+								    def _extract_m3u8_vod_duration(
 								            self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
 								        m3u8_vod = self._download_webpage(
 								            m3u8_vod_url, video_id,
 								            note='Downloading m3u8 VOD manifest' if note is None else note,
 								            errnote='Failed to download VOD manifest' if errnote is None else errnote,
 								            fatal=False, data=data, headers=headers, query=query)
 								        return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
 								    def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
-												[extractor/slideslive] Fix slides and chapters/duration (#6024)

* Fix slides/thumbnails extraction
* Extract duration to fix issues w/ `--embed-chapters`, `--split-chapters`
* Add `InfoExtractor._extract_mpd_vod_duration` method
* Expand applicability of `InfoExtractor._parse_m3u8_vod_duration` method
Authored by: bashonly
											
										
										
											2023-01-14 20:52:03 +01:00
+								        if '#EXT-X-ENDLIST' not in m3u8_vod:
-												[SovietsCloset] Add duration from m3u8 (#908)

Authored by: ChillingPepper
											
										
										
											2021-09-27 23:00:41 +02:00
+								            return None
 								        return int(sum(
 								            float(line[len('#EXTINF:'):].split(',')[0])
 								            for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
-												[extractor/slideslive] Fix slides and chapters/duration (#6024)

* Fix slides/thumbnails extraction
* Extract duration to fix issues w/ `--embed-chapters`, `--split-chapters`
* Add `InfoExtractor._extract_mpd_vod_duration` method
* Expand applicability of `InfoExtractor._parse_m3u8_vod_duration` method
Authored by: bashonly
											
										
										
											2023-01-14 20:52:03 +01:00
+								    def _extract_mpd_vod_duration(
 								            self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
 								        mpd_doc = self._download_xml(
 								            mpd_url, video_id,
 								            note='Downloading MPD VOD manifest' if note is None else note,
 								            errnote='Failed to download VOD manifest' if errnote is None else errnote,
-												[ie] Do not test truth value of `xml.etree.ElementTree.Element` (#8582)

Testing the truthiness of an `xml.etree.ElementTree.Element` instance is deprecated in py3.12

Authored by: bashonly
											
										
										
											2023-11-14 21:28:18 +01:00
+								            fatal=False, data=data, headers=headers, query=query)
 								        if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
 								            return None
-												[extractor/slideslive] Fix slides and chapters/duration (#6024)

* Fix slides/thumbnails extraction
* Extract duration to fix issues w/ `--embed-chapters`, `--split-chapters`
* Add `InfoExtractor._extract_mpd_vod_duration` method
* Expand applicability of `InfoExtractor._parse_m3u8_vod_duration` method
Authored by: bashonly
											
										
										
											2023-01-14 20:52:03 +01:00
+								        return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								    @staticmethod
 								    def _xpath_ns(path, namespace=None):
 								        if not namespace:
 								            return path
 								        out = []
 								        for c in path.split('/'):
 								            if not c or c == '.':
 								                out.append(c)
 								            else:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                out.append(f'{{{namespace}}}{c}')
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								        return '/'.join(out)
-												[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz 
											
										
										
											2021-07-16 16:22:56 +02:00
+								    def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
-												[extractor] Let `_extract_format` functions obey `--ignore-no-formats`

											
										
										
											2022-12-15 15:28:57 +01:00
+								        if self.get_param('ignore_no_formats_error'):
 								            fatal = False
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
 								        if res is False:
-												[nerdist] Add new extractor (Fixes #4851)

											
										
										
											2015-02-02 23:38:35 +01:00
+								            assert not fatal
-												[npr] Make SMIL extraction non-fatal (#2099)

Closes #1934
Authored by: r5d
											
										
										
											2021-12-24 03:15:48 +01:00
+								            return [], {}
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        smil, urlh = res
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								        return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
 								                                                      namespace=self._parse_smil_namespace(smil))
-												[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz 
											
										
										
											2021-07-16 16:22:56 +02:00
 								    def _extract_smil_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
 								        if subs:
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								            self._report_ignoring_subs('SMIL')
-												[generic] Extract previously missed subtitles (#515)

* [generic] Extract subtitles in cases missed previously
* [common] Detect discarded subtitles in SMIL manifests
* [generic] Extract everything in the SMIL manifest

Authored by: fstirlitz 
											
										
										
											2021-07-16 16:22:56 +02:00
+								        return fmts
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
 								    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        res = self._download_smil(smil_url, video_id, fatal=fatal)
 								        if res is False:
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								            return {}
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
 								        smil, urlh = res
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        smil_url = urlh.url
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
-												[extractor/common] add transform_source to _download_smil and _extract_smil_formats

											
										
										
											2016-03-11 22:37:07 +01:00
+								    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        return self._download_xml_handle(
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								            smil_url, video_id, 'Downloading SMIL file',
-												[extractor/common] add transform_source to _download_smil and _extract_smil_formats

											
										
										
											2016-03-11 22:37:07 +01:00
+								            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
 								    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
 								        """Build an info dict from a parsed SMIL document.
 								        Formats and subtitles come from `_parse_smil_formats_and_subtitles`.
 								        Title, description and upload date are taken from the first usable
 								        `<meta>` entries under `<head>`; thumbnails from every `<image>`
 								        element that has a `src`. The returned `id` is derived from the
 								        basename of `smil_url`, not from the `video_id` argument, and is also
 								        the fallback title.
 								        """
 								        ns = self._parse_smil_namespace(smil)
 								        formats, subtitles = self._parse_smil_formats_and_subtitles(
 								            smil, smil_url, video_id, namespace=ns, f4m_params=f4m_params)
 								        # From here on the id is the manifest's file name without its extension
 								        video_id = os.path.splitext(url_basename(smil_url))[0]
 								        title = description = upload_date = None
 								        for meta in smil.findall(self._xpath_ns('./head/meta', ns)):
 								            name, content = meta.attrib.get('name'), meta.attrib.get('content')
 								            if not (name and content):
 								                continue
 								            # Once a field has a truthy value, later <meta> entries do not replace it
 								            if name == 'title' and not title:
 								                title = content
 								            elif name in ('description', 'abstract') and not description:
 								                description = content
 								            elif name == 'date' and not upload_date:
 								                upload_date = unified_strdate(content)
 								        thumbnails = []
 								        for image in smil.findall(self._xpath_ns('.//image', ns)):
 								            if not image.get('src'):
 								                continue
 								            thumbnails.append({
 								                'id': image.get('type'),
 								                'url': image.get('src'),
 								                'width': int_or_none(image.get('width')),
 								                'height': int_or_none(image.get('height')),
 								            })
 								        info = {
 								            'id': video_id,
 								            'title': title or video_id,
 								            'description': description,
 								            'upload_date': upload_date,
 								            'thumbnails': thumbnails,
 								            'formats': formats,
 								            'subtitles': subtitles,
 								        }
 								        return info
-												[extractor/common] Extract namespace parse routine

											
										
										
											2015-08-01 21:31:17 +02:00
+								    def _parse_smil_namespace(self, smil):
 								        return self._search_regex(
 								            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								    def _parse_smil_formats(self, *args, **kwargs):
 								        fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
 								        if subs:
 								            self._report_ignoring_subs('SMIL')
 								        return fmts
 								    def _parse_smil_formats_and_subtitles(
 								            self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								        base = smil_url
 								        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
 								            b = meta.get('base') or meta.get('httpBase')
 								            if b:
 								                base = b
 								                break
-												[extractor/common] Add generic SMIL formats extraction routine

											
										
										
											2014-12-09 17:28:28 +01:00
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								        formats, subtitles = [], {}
-												[extractor/common] Add generic SMIL formats extraction routine

											
										
										
											2014-12-09 17:28:28 +01:00
+								        rtmp_count = 0
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								        http_count = 0
-												[extractor/common] detect media playlist in _extract_m3u8_formats

											
										
										
											2016-01-26 17:44:44 +01:00
+								        m3u8_count = 0
-												[extractor] Extract storyboards from SMIL manifests (#1128)

Authored by: fstirlitz
											
										
										
											2021-10-02 20:43:42 +02:00
+								        imgs_count = 0
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
-												[extractor] Extract storyboards from SMIL manifests (#1128)

Authored by: fstirlitz
											
										
										
											2021-10-02 20:43:42 +02:00
+								        srcs = set()
-												[ie] Extract from `media` elements in SMIL manifests (#8504)

Authored by: seproDev
											
										
										
											2023-11-18 22:51:18 +01:00
+								        media = itertools.chain.from_iterable(
 								            smil.findall(self._xpath_ns(arg, namespace))
 								            for arg in ['.//video', './/audio', './/media'])
-												[common] Extract audio formats in SMIL

Found in http://www.cbc.ca/player/play/2657631896

Closes #5156

											
										
										
											2016-05-20 13:02:53 +02:00
+								        for medium in media:
 								            src = medium.get('src')
-												[extractor/common] remove duplicate rtmp formats in smil manifest

											
										
										
											2016-02-11 17:58:48 +01:00
+								            if not src or src in srcs:
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								                continue
-												[extractor] Extract storyboards from SMIL manifests (#1128)

Authored by: fstirlitz
											
										
										
											2021-10-02 20:43:42 +02:00
+								            srcs.add(src)
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
-												[common] Extract audio formats in SMIL

Found in http://www.cbc.ca/player/play/2657631896

Closes #5156

											
										
										
											2016-05-20 13:02:53 +02:00
+								            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
 								            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
 								            width = int_or_none(medium.get('width'))
 								            height = int_or_none(medium.get('height'))
 								            proto = medium.get('proto')
 								            ext = medium.get('ext')
-												[extractor/nbc] Fix `NBC` and `NBCStations` extractors (#6033)

Improve `InfoExtractor._parse_smil_formats` extension detection
Closes #6019
Authored by: bashonly
											
										
										
											2023-01-14 17:40:42 +01:00
+								            src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
 								                self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
-												[common] Extract audio formats in SMIL

Found in http://www.cbc.ca/player/play/2657631896

Closes #5156

											
										
										
											2016-05-20 13:02:53 +02:00
+								            streamer = medium.get('streamer') or base
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
 								            if proto == 'rtmp' or streamer.startswith('rtmp'):
 								                rtmp_count += 1
 								                formats.append({
 								                    'url': streamer,
 								                    'play_path': src,
 								                    'ext': 'flv',
 								                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
 								                    'tbr': bitrate,
 								                    'filesize': filesize,
 								                    'width': width,
 								                    'height': height,
 								                })
-												[theplatform] Use InfoExtractor._parse_smil_formats()

											
										
										
											2015-08-19 17:11:25 +02:00
+								                if transform_rtmp_url:
 								                    streamer, src = transform_rtmp_url(streamer, src)
 								                    formats[-1].update({
 								                        'url': streamer,
 								                        'play_path': src,
 								                    })
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								                continue
-												[ie] Fix parsing of base URL in SMIL manifest (#9225)

Authored by: seproDev
											
										
										
											2024-05-27 00:06:34 +02:00
+								            src_url = src if src.startswith('http') else urllib.parse.urljoin(f'{base}/', src)
-												[extractor/common] strip http urls in smil manifest

											
										
										
											2016-02-12 17:38:48 +01:00
+								            src_url = src_url.strip()
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
 								            if proto == 'm3u8' or src_ext == 'm3u8':
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								                m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
-												[extractor/common] detect media playlist in _extract_m3u8_formats

											
										
										
											2016-01-26 17:44:44 +01:00
+								                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								                self._merge_subtitles(m3u8_subs, target=subtitles)
-												[extractor/common] detect media playlist in _extract_m3u8_formats

											
										
										
											2016-01-26 17:44:44 +01:00
+								                if len(m3u8_formats) == 1:
 								                    m3u8_count += 1
 								                    m3u8_formats[0].update({
 								                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
 								                        'tbr': bitrate,
 								                        'width': width,
 								                        'height': height,
 								                    })
 								                formats.extend(m3u8_formats)
-												[extractor/common] add support for DASH and MSS formats extraction in SMIL manifests

											
										
										
											2018-07-18 19:29:18 +02:00
+								            elif src_ext == 'f4m':
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								                f4m_url = src_url
 								                if not f4m_params:
 								                    f4m_params = {
 								                        'hdcore': '3.2.0',
 								                        'plugin': 'flowplayer-3.2.0.1',
 								                    }
 								                f4m_url += '&' if '?' in f4m_url else '?'
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								                f4m_url += urllib.parse.urlencode(f4m_params)
-												Simplify formats accumulation for f4m/m3u8/smil formats

Now all _extract_*_formats routines return a list

											
										
										
											2015-12-28 19:58:24 +01:00
+								                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
-												[extractor/common] add support for DASH and MSS formats extraction in SMIL manifests

											
										
										
											2018-07-18 19:29:18 +02:00
+								            elif src_ext == 'mpd':
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								                mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
 								                    src_url, video_id, mpd_id='dash', fatal=False)
 								                formats.extend(mpd_formats)
 								                self._merge_subtitles(mpd_subs, target=subtitles)
-												[extractor/common] add support for DASH and MSS formats extraction in SMIL manifests

											
										
										
											2018-07-18 19:29:18 +02:00
+								            elif re.search(r'\.ism/[Mm]anifest', src_url):
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								                ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
 								                    src_url, video_id, ism_id='mss', fatal=False)
 								                formats.extend(ism_formats)
 								                self._merge_subtitles(ism_subs, target=subtitles)
-												[extractor/common] add support for DASH and MSS formats extraction in SMIL manifests

											
										
										
											2018-07-18 19:29:18 +02:00
+								            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								                http_count += 1
 								                formats.append({
 								                    'url': src_url,
 								                    'ext': ext or src_ext or 'flv',
 								                    'format_id': 'http-%d' % (bitrate or http_count),
 								                    'tbr': bitrate,
 								                    'filesize': filesize,
 								                    'width': width,
 								                    'height': height,
 								                })
-												[extractor/common] Extract the first of a seq of videos in a .smil file

											
										
										
											2015-02-22 09:16:51 +01:00
-												[extractor] Extract storyboards from SMIL manifests (#1128)

Authored by: fstirlitz
											
										
										
											2021-10-02 20:43:42 +02:00
+								        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
 								            src = medium.get('src')
 								            if not src or src in srcs:
 								                continue
 								            srcs.add(src)
 								            imgs_count += 1
 								            formats.append({
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                'format_id': f'imagestream-{imgs_count}',
-												[extractor] Extract storyboards from SMIL manifests (#1128)

Authored by: fstirlitz
											
										
										
											2021-10-02 20:43:42 +02:00
+								                'url': src,
 								                'ext': mimetype2ext(medium.get('type')),
 								                'acodec': 'none',
 								                'vcodec': 'none',
 								                'width': int_or_none(medium.get('width')),
 								                'height': int_or_none(medium.get('height')),
 								                'format_note': 'SMIL storyboards',
 								            })
-												[ie] Extract subtitles from SMIL manifests (#7667)

Authored by: bashonly, pukkandan
											
										
										
											2023-07-24 02:09:52 +02:00
+								        smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
 								        self._merge_subtitles(smil_subs, target=subtitles)
 								        return formats, subtitles
-												[extractor/common] Add generic SMIL formats extraction routine

											
										
										
											2014-12-09 17:28:28 +01:00
-												[extractor/common] Add default subtitles lang

											
										
										
											2015-08-19 20:56:17 +02:00
+								    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
-												[extractor/common] remove duplicated formats and subtiles in smil manifests

											
										
										
											2016-02-09 17:15:41 +01:00
+								        urls = []
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								        subtitles = {}
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								            src = textstream.get('src')
-												[extractor/common] remove duplicated formats and subtiles in smil manifests

											
										
										
											2016-02-09 17:15:41 +01:00
+								            if not src or src in urls:
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								                continue
-												[extractor/common] remove duplicated formats and subtiles in smil manifests

											
										
										
											2016-02-09 17:15:41 +01:00
+								            urls.append(src)
-												[common] prefer using mime type over ext for smil subtitle extraction

the subtitle ext for http://www.cnet.com/videos/download-amazon-prime-movies-and-tv/
is adb_xml while using the mime type it get tt(application/smptett+xml)

											
										
										
											2016-04-01 20:39:02 +02:00
+								            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
-												[common] _parse_smil_subtitles: accept `lang` as the subtitle language

											
										
										
											2015-08-20 17:18:58 +02:00
+								            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
-												[extractor/common] Extract f4m and m3u8 formats, subtitles and info

											
										
										
											2015-08-01 21:13:21 +02:00
+								            subtitles.setdefault(lang, []).append({
 								                'url': src,
 								                'ext': ext,
 								            })
 								        return subtitles
-												[extractor/common] Extract the first of a seq of videos in a .smil file

											
										
										
											2015-02-22 09:16:51 +01:00
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
+								    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        res = self._download_xml_handle(
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
+								            xspf_url, playlist_id, 'Downloading xpsf playlist',
-												[extractor/common] Extract _parse_xspf

											
										
										
											2015-08-09 15:41:55 +02:00
+								            'Unable to download xspf manifest', fatal=fatal)
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
+								        if res is False:
-												[extractor/common] Extract _parse_xspf

											
										
										
											2015-08-09 15:41:55 +02:00
+								            return []
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
 								        xspf, urlh = res
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        xspf_url = urlh.url
-												[extractor] Update `manifest_url`s after redirect (#3575)

Authored by: elyse0
											
										
										
											2022-04-28 00:50:01 +02:00
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
+								        return self._parse_xspf(
 								            xspf, playlist_id, xspf_url=xspf_url,
 								            xspf_base_url=base_url(xspf_url))
-												[extractor/generic] Add generic support for xspf playist extraction

											
										
										
											2015-08-09 15:07:18 +02:00
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Build one entry per track of a parsed XSPF document.

    xspf_doc      -- parsed XSPF document element
    playlist_id   -- used as the `id` of every entry and as the default title
    xspf_url      -- stored as `manifest_url` on every format
    xspf_base_url -- base against which each track location is joined

    Returns a list of info dicts carrying id, title, description, thumbnail,
    duration and formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    entries = []
    for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
        title = xpath_text(
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
        description = xpath_text(
            track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
        thumbnail = xpath_text(
            track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
        # presumably milliseconds scaled to seconds by the 1000 argument -- confirm
        duration = float_or_none(
            xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

        formats = []
        for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
            format_url = urljoin(xspf_base_url, location.text)
            # Locations that do not resolve to a URL are dropped
            if not format_url:
                continue
            formats.append({
                'url': format_url,
                'manifest_url': xspf_url,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            })

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											2021-04-18 01:49:22 +02:00
+								    def _extract_mpd_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
 								        if subs:
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								            self._report_ignoring_subs('DASH')
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											2021-04-18 01:49:22 +02:00
+								        return fmts
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								    def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
 								        periods = self._extract_mpd_periods(*args, **kwargs)
 								        return self._merge_mpd_periods(periods)
 								    def _extract_mpd_periods(
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											2021-04-18 01:49:22 +02:00
+								            self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
 								            fatal=True, data=None, headers={}, query={}):
-												[extractor] Let `_extract_format` functions obey `--ignore-no-formats`

											
										
										
											2022-12-15 15:28:57 +01:00
 								        if self.get_param('ignore_no_formats_error'):
 								            fatal = False
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
+								        res = self._download_xml_handle(
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								            mpd_url, video_id,
-												[extractor] Allow `note=False` when extracting manifests

											
										
										
											2021-05-29 10:52:44 +02:00
+								            note='Downloading MPD manifest' if note is None else note,
 								            errnote='Failed to download MPD manifest' if errnote is None else errnote,
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											2019-11-15 23:44:14 +01:00
+								            fatal=fatal, data=data, headers=headers, query=query)
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								        if res is False:
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								            return []
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
+								        mpd_doc, urlh = res
-												[vimeo] add support live streams and improve info extraction(closes #19144)

											
										
										
											2019-04-21 18:20:28 +02:00
+								        if mpd_doc is None:
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								            return []
-												[extractor] Update dash `manifest_url` after redirects (#3563)

Closes #2696 
Authored by: elyse0
											
										
										
											2022-04-27 20:01:35 +02:00
 								        # We could have been redirected to a new url when we retrieved our mpd file.
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        mpd_url = urlh.url
-												[extractor] Update dash `manifest_url` after redirects (#3563)

Closes #2696 
Authored by: elyse0
											
										
										
											2022-04-27 20:01:35 +02:00
+								        mpd_base_url = base_url(mpd_url)
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								        return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
-												[common] Add _extract_dash_manifest_formats

											
										
										
											2016-01-30 15:52:23 +01:00
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											2021-04-18 01:49:22 +02:00
+								    def _parse_mpd_formats(self, *args, **kwargs):
 								        fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
 								        if subs:
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								            self._report_ignoring_subs('DASH')
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											2021-04-18 01:49:22 +02:00
+								        return fmts
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								    def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
 								        periods = self._parse_mpd_periods(*args, **kwargs)
 								        return self._merge_mpd_periods(periods)
 								    def _merge_mpd_periods(self, periods):
 								        """
 								        Combine all formats and subtitles from an MPD manifest into a single list,
 								        by concatenate streams with similar formats.
 								        """
 								        formats, subtitles = {}, {}
 								        for period in periods:
 								            for f in period['formats']:
 								                assert 'is_dash_periods' not in f, 'format already processed'
 								                f['is_dash_periods'] = True
 								                format_key = tuple(v for k, v in f.items() if k not in (
 								                    ('format_id', 'fragments', 'manifest_stream_number')))
 								                if format_key not in formats:
 								                    formats[format_key] = f
 								                elif 'fragments' in f:
 								                    formats[format_key].setdefault('fragments', []).extend(f['fragments'])
 								            if subtitles and period['subtitles']:
 								                self.report_warning(bug_reports_message(
 								                    'Found subtitles in multiple periods in the DASH manifest; '
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    'if part of the subtitles are missing,',
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								                ), only_once=True)
 								            for sub_lang, sub_info in period['subtitles'].items():
 								                subtitles.setdefault(sub_lang, []).extend(sub_info)
 								        return list(formats.values()), subtitles
 								    def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
-												[extractor/common] Add support for $ in SegmentTemplate in MPD manifests

											
										
										
											2016-07-24 05:27:16 +02:00
+								        """
 								        Parse formats from MPD manifest.
 								        References:
 . MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
 								            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
 . https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
 								        """
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        if not self.get_param('dynamic_mpd', True):
-												[Core] hls manifests, dynamic mpd


											
										
										
											2020-09-16 13:00:41 +02:00
+								            if mpd_doc.get('type') == 'dynamic':
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											2021-04-18 01:49:22 +02:00
+								                return [], {}
-												[common] Add _extract_dash_manifest_formats

											
										
										
											2016-01-30 15:52:23 +01:00
-												rename _parse_mpd to _parse_mpd_formats and add default value for mpd namespace

											
										
										
											2016-02-06 14:03:48 +01:00
+								        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
-												[common] remove duplicate reference to namespace

											
										
										
											2016-02-02 22:02:08 +01:00
        def _add_ns(path):
            # Shorthand used by every element lookup below: hands `path` to
            # self._xpath_ns together with the namespace read from the MPD root tag
            # (`namespace` is None when the root tag carries no namespace)
            return self._xpath_ns(path, namespace)
-												[common] skip drm protected dash formats

											
										
										
											2016-02-03 18:44:43 +01:00
+								        def is_drm_protected(element):
 								            return element.find(_add_ns('ContentProtection')) is not None
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								        def extract_multisegment_info(element, ms_parent_info):
 								            ms_info = ms_parent_info.copy()
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											2016-09-05 20:21:57 +02:00
 								            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
 								            # common attributes and elements.  We will only extract relevant
 								            # for us.
 								            def extract_common(source):
 								                segment_timeline = source.find(_add_ns('SegmentTimeline'))
 								                if segment_timeline is not None:
 								                    s_e = segment_timeline.findall(_add_ns('S'))
 								                    if s_e:
 								                        ms_info['total_number'] = 0
 								                        ms_info['s'] = []
 								                        for s in s_e:
 								                            r = int(s.get('r', 0))
 								                            ms_info['total_number'] += 1 + r
 								                            ms_info['s'].append({
 								                                't': int(s.get('t', 0)),
 								                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
 								                                'd': int(s.attrib['d']),
 								                                'r': r,
 								                            })
 								                start_number = source.get('startNumber')
 								                if start_number:
 								                    ms_info['start_number'] = int(start_number)
 								                timescale = source.get('timescale')
 								                if timescale:
 								                    ms_info['timescale'] = int(timescale)
 								                segment_duration = source.get('duration')
 								                if segment_duration:
-												[extractor/common] Add support for float durations in _parse_mpd_formats (closes #13919)

											
										
										
											2017-08-15 18:58:00 +02:00
+								                    ms_info['segment_duration'] = float(segment_duration)
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											2016-09-05 20:21:57 +02:00
 								            def extract_Initialization(source):
 								                initialization = source.find(_add_ns('Initialization'))
 								                if initialization is not None:
 								                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
-												[common] remove duplicate reference to namespace

											
										
										
											2016-02-02 22:02:08 +01:00
+								            segment_list = element.find(_add_ns('SegmentList'))
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								            if segment_list is not None:
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											2016-09-05 20:21:57 +02:00
+								                extract_common(segment_list)
 								                extract_Initialization(segment_list)
-												[common] remove duplicate reference to namespace

											
										
										
											2016-02-02 22:02:08 +01:00
+								                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								                if segment_urls_e:
 								                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
 								            else:
-												[common] remove duplicate reference to namespace

											
										
										
											2016-02-02 22:02:08 +01:00
+								                segment_template = element.find(_add_ns('SegmentTemplate'))
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								                if segment_template is not None:
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											2016-09-05 20:21:57 +02:00
+								                    extract_common(segment_template)
-												[extractor/common] Fix initialization template (closes #11605, closes #11825)

											
										
										
											2017-01-29 00:57:39 +01:00
+								                    media = segment_template.get('media')
 								                    if media:
 								                        ms_info['media'] = media
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								                    initialization = segment_template.get('initialization')
 								                    if initialization:
-												[extractor/common] Fix initialization template (closes #11605, closes #11825)

											
										
										
											2017-01-29 00:57:39 +01:00
+								                        ms_info['initialization'] = initialization
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								                    else:
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											2016-09-05 20:21:57 +02:00
+								                        extract_Initialization(segment_template)
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								            return ms_info
-												[common] Modify _parse_dash_manifest for use in Facebook

											
										
										
											2016-01-30 14:27:43 +01:00
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
-												[downloader/ffmpeg] Fix for direct videos inside mpd manifests
Closes #1751

											
										
										
											2021-11-22 19:02:14 +01:00
+								        stream_numbers = collections.defaultdict(int)
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								        for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
 								            period_entry = {
 								                'id': period.get('id', f'period-{period_idx}'),
 								                'formats': [],
 								                'subtitles': collections.defaultdict(list),
 								            }
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								            period_duration = parse_duration(period.get('duration')) or mpd_duration
 								            period_ms_info = extract_multisegment_info(period, {
 								                'start_number': 1,
 								                'timescale': 1,
 								            })
-												[common] remove duplicate reference to namespace

											
										
										
											2016-02-02 22:02:08 +01:00
+								            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
-												[common] remove duplicate reference to namespace

											
										
										
											2016-02-02 22:02:08 +01:00
+								                for representation in adaptation_set.findall(_add_ns('Representation')):
-												[common] add a generic support for mpd manifests

											
										
										
											2016-02-02 18:07:07 +01:00
+								                    representation_attrib = adaptation_set.attrib.copy()
 								                    representation_attrib.update(representation.attrib)
-												[extractor/common] Add support for $ in SegmentTemplate in MPD manifests

											
										
										
											2016-07-24 05:27:16 +02:00
+								                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
-												[common] Use mimeType to determine file extensions (#8766)

											
										
										
											2016-03-11 16:49:55 +01:00
+								                    mime_type = representation_attrib['mimeType']
-												[extractor/common] Extract DASH subtitle tracks

_extract_mpd_formats and _parse_mpd_formats were extended into
_…_formats_and_subtitles; wrappers with old names are provided
for compatibility.

											
										
										
											2021-04-18 01:49:22 +02:00
+								                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
-												[cleanup] Minor fixes

											
										
										
											2022-05-18 05:34:30 +02:00
+								                    codec_str = representation_attrib.get('codecs', '')
 								                    # Some kind of binary subtitle found in some youtube livestreams
 								                    if mime_type == 'application/x-rawcc':
 								                        codecs = {'scodec': codec_str}
 								                    else:
 								                        codecs = parse_codecs(codec_str)
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                    if content_type not in ('video', 'audio', 'text'):
 								                        if mime_type == 'image/jpeg':
-												minor bugfixes
bugs due to be2fc5b212338d89d9c139cb463f785e797d1ad3, e9f4ccd19eb92621970b518fb5984b8aef52bdc8

											
										
										
											2021-08-11 10:12:23 +02:00
+								                            content_type = mime_type
-												[cleanup] Minor fixes

											
										
										
											2022-05-18 05:34:30 +02:00
+								                        elif codecs.get('vcodec', 'none') != 'none':
-												[extractor] Detect more subtitle codecs in MPD manifests (#2174)

Authored by: fstirlitz
											
										
										
											2021-12-31 21:06:45 +01:00
+								                            content_type = 'video'
-												[cleanup] Minor fixes

											
										
										
											2022-05-18 05:34:30 +02:00
+								                        elif codecs.get('acodec', 'none') != 'none':
-												[extractor] Detect more subtitle codecs in MPD manifests (#2174)

Authored by: fstirlitz
											
										
										
											2021-12-31 21:06:45 +01:00
+								                            content_type = 'audio'
-												[cleanup] Misc fixes (see desc)

* Do not warn when fixup is skipped for existing file
* [fragment] Fix `--skip-unavailable-fragments` for HTTP Errors
* [utils] write_string: Fix bug in 59f943cd5097e9bdbc3cb3e6b5675e43d369341a
* [utils] parse_codecs: Subtitle codec is generally referred to as `scodec`. https://github.com/yt-dlp/yt-dlp/pull/2174#discussion_r790156048
* [docs] Remove note about permissions. Closes #3597

											
										
										
											2022-04-29 18:02:31 +02:00
+								                        elif codecs.get('scodec', 'none') != 'none':
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                            content_type = 'text'
-												[extractor,utils] Detect more codecs/mimetypes
Fixes: https://github.com/ytdl-org/youtube-dl/issues/29943

											
										
										
											2021-10-13 01:33:40 +02:00
+								                        elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
 								                            content_type = 'text'
-												[downloader/mhtml] Add new downloader (#343)

This downloader is intended to be used for streams that consist of a
timed sequence of stand-alone images, such as slideshows or thumbnail
streams

This can be used for implementing:

https://github.com/ytdl-org/youtube-dl/issues/4974#issue-58006762
https://github.com/ytdl-org/youtube-dl/issues/4540#issuecomment-69574231
https://github.com/ytdl-org/youtube-dl/pull/11185#issuecomment-335554239

https://github.com/ytdl-org/youtube-dl/issues/9868
https://github.com/ytdl-org/youtube-dl/pull/14951


Authored by: fstirlitz

											
										
										
											2021-05-23 18:34:49 +02:00
+								                        else:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                            self.report_warning(f'Unknown MIME type {mime_type} in DASH manifest')
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                            continue
 								                    base_url = ''
 								                    for element in (representation, adaptation_set, period, mpd_doc):
 								                        base_url_e = element.find(_add_ns('BaseURL'))
-												[extractor] Fix empty `BaseURL` in MPD

Closes #4113

											
										
										
											2022-06-29 02:37:21 +02:00
+								                        if try_call(lambda: base_url_e.text) is not None:
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                            base_url = base_url_e.text + base_url
 								                            if re.match(r'^https?://', base_url):
 								                                break
-												[extractor] Fix root-relative URLs in MPD (#1006)

Authored by: DigitalDJ
											
										
										
											2021-09-19 10:37:57 +02:00
+								                    if mpd_base_url and base_url.startswith('/'):
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								                        base_url = urllib.parse.urljoin(mpd_base_url, base_url)
-												[extractor] Fix root-relative URLs in MPD (#1006)

Authored by: DigitalDJ
											
										
										
											2021-09-19 10:37:57 +02:00
+								                    elif mpd_base_url and not re.match(r'^https?://', base_url):
 								                        if not mpd_base_url.endswith('/'):
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                            mpd_base_url += '/'
 								                        base_url = mpd_base_url + base_url
 								                    representation_id = representation_attrib.get('id')
 								                    lang = representation_attrib.get('lang')
 								                    url_el = representation.find(_add_ns('BaseURL'))
 								                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
 								                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
 								                    if representation_id is not None:
 								                        format_id = representation_id
 								                    else:
 								                        format_id = content_type
 								                    if mpd_id:
 								                        format_id = mpd_id + '-' + format_id
 								                    if content_type in ('video', 'audio'):
 								                        f = {
 								                            'format_id': format_id,
 								                            'manifest_url': mpd_url,
 								                            'ext': mimetype2ext(mime_type),
 								                            'width': int_or_none(representation_attrib.get('width')),
 								                            'height': int_or_none(representation_attrib.get('height')),
 								                            'tbr': float_or_none(bandwidth, 1000),
 								                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
 								                            'fps': int_or_none(representation_attrib.get('frameRate')),
 								                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                            'format_note': f'DASH {content_type}',
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                            'filesize': filesize,
 								                            'container': mimetype2ext(mime_type) + '_dash',
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                            **codecs,
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                        }
 								                    elif content_type == 'text':
 								                        f = {
 								                            'ext': mimetype2ext(mime_type),
 								                            'manifest_url': mpd_url,
 								                            'filesize': filesize,
 								                        }
 								                    elif content_type == 'image/jpeg':
 								                        # See test case in VikiIE
 								                        # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
 								                        f = {
 								                            'format_id': format_id,
 								                            'ext': 'mhtml',
 								                            'manifest_url': mpd_url,
 								                            'format_note': 'DASH storyboards (jpeg)',
 								                            'acodec': 'none',
 								                            'vcodec': 'none',
 								                        }
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											2021-08-22 22:08:38 +02:00
+								                    if is_drm_protected(adaptation_set) or is_drm_protected(representation):
 								                        f['has_drm'] = True
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
 								                    def prepare_template(template_name, identifiers):
 								                        tmpl = representation_ms_info[template_name]
-												[extractor/common] Escape `%` in `representation_id` of m3u8

Closes #4877

											
										
										
											2022-09-09 06:28:41 +02:00
+								                        if representation_id is not None:
 								                            tmpl = tmpl.replace('$RepresentationID$', representation_id)
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                        # First of, % characters outside $...$ templates
 								                        # must be escaped by doubling for proper processing
 								                        # by % operator string formatting used further (see
 								                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
 								                        t = ''
 								                        in_template = False
 								                        for c in tmpl:
 								                            t += c
 								                            if c == '$':
 								                                in_template = not in_template
 								                            elif c == '%' and not in_template:
-												[extractor/common] Properly escape % in MPD templates (closes #16867)

											
										
										
											2018-06-30 21:00:16 +02:00
+								                                t += c
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                        # Next, $...$ templates are translated to their
 								                        # %(...) counterparts to be used with % operator
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                        t = re.sub(r'\$({})\$'.format('|'.join(identifiers)), r'%(\1)d', t)
 								                        t = re.sub(r'\$({})%([^$]+)\$'.format('|'.join(identifiers)), r'%(\1)\2', t)
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                        t.replace('$$', '$')
 								                        return t
 								                    # @initialization is a regular template like @media one
 								                    # so it should be handled just the same way (see
 								                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
 								                    if 'initialization' in representation_ms_info:
 								                        initialization_template = prepare_template(
 								                            'initialization',
 								                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
 								                            # $Time$ shall not be included for @initialization thus
 								                            # only $Bandwidth$ remains
 								                            ('Bandwidth', ))
 								                        representation_ms_info['initialization_url'] = initialization_template % {
 								                            'Bandwidth': bandwidth,
 								                        }
 								                    def location_key(location):
 								                        return 'url' if re.match(r'^https?://', location) else 'path'
 								                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
 								                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
 								                        media_location_key = location_key(media_template)
 								                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
 								                        # can't be used at the same time
 								                        if '%(Number' in media_template and 's' not in representation_ms_info:
 								                            segment_duration = None
 								                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
 								                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
-												[extractor] Fix for manifests without period duration

Closes #2705
Authored by: dirkf, pukkandan

											
										
										
											2022-02-17 14:36:22 +01:00
+								                                representation_ms_info['total_number'] = int(math.ceil(
 								                                    float_or_none(period_duration, segment_duration, default=0)))
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                            representation_ms_info['fragments'] = [{
 								                                media_location_key: media_template % {
 								                                    'Number': segment_number,
 								                                    'Bandwidth': bandwidth,
 								                                },
 								                                'duration': segment_duration,
 								                            } for segment_number in range(
 								                                representation_ms_info['start_number'],
 								                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
 								                        else:
 								                            # $Number*$ or $Time$ in media template with S list available
 								                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
 								                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
 								                            representation_ms_info['fragments'] = []
 								                            segment_time = 0
 								                            segment_d = None
 								                            segment_number = representation_ms_info['start_number']
 								                            def add_segment_url():
 								                                segment_url = media_template % {
 								                                    'Time': segment_time,
 								                                    'Bandwidth': bandwidth,
 								                                    'Number': segment_number,
 								                                }
 								                                representation_ms_info['fragments'].append({
 								                                    media_location_key: segment_url,
 								                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
 								                                })
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                            for s in representation_ms_info['s']:
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                                segment_time = s.get('t') or segment_time
 								                                segment_d = s['d']
 								                                add_segment_url()
 								                                segment_number += 1
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                                for _ in range(s.get('r', 0)):
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                                    segment_time += segment_d
-												[extractor/common] Add support for $ in SegmentTemplate in MPD manifests

											
										
										
											2016-07-24 05:27:16 +02:00
+								                                    add_segment_url()
-												[extractor/common] Expose fragments interface for dashsegments formats

											
										
										
											2016-09-05 20:21:57 +02:00
+								                                    segment_number += 1
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                                segment_time += segment_d
 								                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
-												[docs] Consistent use of `e.g.` (#4643)

Authored by: Lesmiscore
											
										
										
											2022-08-14 14:04:13 +02:00
+								                        # No media template,
 								                        # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                        # or any YouTube dashsegments video
 								                        fragments = []
 								                        segment_index = 0
 								                        timescale = representation_ms_info['timescale']
 								                        for s in representation_ms_info['s']:
 								                            duration = float_or_none(s['d'], timescale)
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                            for _ in range(s.get('r', 0) + 1):
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                                segment_uri = representation_ms_info['segment_urls'][segment_index]
 								                                fragments.append({
 								                                    location_key(segment_uri): segment_uri,
 								                                    'duration': duration,
 								                                })
 								                                segment_index += 1
 								                        representation_ms_info['fragments'] = fragments
 								                    elif 'segment_urls' in representation_ms_info:
 								                        # Segment URLs with no SegmentTimeline
-												[docs] Consistent use of `e.g.` (#4643)

Authored by: Lesmiscore
											
										
										
											2022-08-14 14:04:13 +02:00
+								                        # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                        # https://github.com/ytdl-org/youtube-dl/pull/14844
 								                        fragments = []
 								                        segment_duration = float_or_none(
 								                            representation_ms_info['segment_duration'],
 								                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
 								                        for segment_url in representation_ms_info['segment_urls']:
 								                            fragment = {
 								                                location_key(segment_url): segment_url,
 								                            }
 								                            if segment_duration:
 								                                fragment['duration'] = segment_duration
 								                            fragments.append(fragment)
 								                        representation_ms_info['fragments'] = fragments
 								                    # If there is a fragments key available then we correctly recognized fragmented media.
 								                    # Otherwise we will assume unfragmented media with direct access. Technically, such
 								                    # assumption is not necessarily correct since we may simply have no support for
 								                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
 								                    if 'fragments' in representation_ms_info:
 								                        f.update({
 								                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
 								                            'url': mpd_url or base_url,
 								                            'fragment_base_url': base_url,
 								                            'fragments': [],
 								                            'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
 								                        })
 								                        if 'initialization_url' in representation_ms_info:
 								                            initialization_url = representation_ms_info['initialization_url']
 								                            if not f.get('url'):
 								                                f['url'] = initialization_url
 								                            f['fragments'].append({location_key(initialization_url): initialization_url})
 								                        f['fragments'].extend(representation_ms_info['fragments'])
-												[extractor] Fix for manifests without period duration

Closes #2705
Authored by: dirkf, pukkandan

											
										
										
											2022-02-17 14:36:22 +01:00
+								                        if not period_duration:
 								                            period_duration = try_get(
 								                                representation_ms_info,
 								                                lambda r: sum(frag['duration'] for frag in r['fragments']), float)
-												[common] _parse_dash_manifest() from youtube.py

											
										
										
											2016-01-30 14:05:55 +01:00
+								                    else:
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                        # Assuming direct URL to unfragmented media.
 								                        f['url'] = base_url
-												[downloader/ffmpeg] Fix for direct videos inside mpd manifests
Closes #1751

											
										
										
											2021-11-22 19:02:14 +01:00
+								                    if content_type in ('video', 'audio', 'image/jpeg'):
 								                        f['manifest_stream_number'] = stream_numbers[f['url']]
 								                        stream_numbers[f['url']] += 1
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								                        period_entry['formats'].append(f)
-												[extractor] Detect `sttp` as subtitles in MPD
Closes #656
Solution by: fstirlitz

											
										
										
											2021-08-10 01:12:03 +02:00
+								                    elif content_type == 'text':
-												[ie] Support multi-period MPD streams (#6654)

											
										
										
											2023-03-27 19:04:23 +02:00
+								                        period_entry['subtitles'][lang or 'und'].append(f)
 								            yield period_entry
-												[common] _parse_dash_manifest() from youtube.py

											
										
										
											2016-01-30 14:05:55 +01:00
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								    def _extract_ism_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
 								        if subs:
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								            self._report_ignoring_subs('ISM')
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								        return fmts
 								    def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
-												[extractor] Let `_extract_format` functions obey `--ignore-no-formats`

											
										
										
											2022-12-15 15:28:57 +01:00
+								        if self.get_param('ignore_no_formats_error'):
 								            fatal = False
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
+								        res = self._download_xml_handle(
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								            ism_url, video_id,
-												[extractor] Allow `note=False` when extracting manifests

											
										
										
											2021-05-29 10:52:44 +02:00
+								            note='Downloading ISM manifest' if note is None else note,
 								            errnote='Failed to download ISM manifest' if errnote is None else errnote,
-												[extractor/common] Add data, headers and query to all major extract methods preserving standard order for potential future use

											
										
										
											2019-11-15 23:44:14 +01:00
+								            fatal=fatal, data=data, headers=headers, query=query)
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								        if res is False:
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								            return [], {}
-												Generalize XML manifest processing code and improve XSPF parsing (closes #15794)

											
										
										
											2018-03-17 20:46:50 +01:00
+								        ism_doc, urlh = res
-												[extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats (#24667)

											
										
										
											2020-04-07 17:54:34 +02:00
+								        if ism_doc is None:
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								            return [], {}
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
-												[compat, networking] Deprecate old functions (#2861)

Authored by: coletdjnz, pukkandan

											
										
										
											2023-07-09 09:53:02 +02:00
+								        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
-												[extractor/common] Respect Width and Height attributes in ISM manifests

											
										
										
											2017-05-14 01:11:45 +02:00
+								        """
 								        Parse formats from ISM manifest.
 								        References:
 . [MS-SSTR]: Smooth Streaming Protocol,
 								            https://msdn.microsoft.com/en-us/library/ff469518.aspx
 								        """
-												Cleanup some code and fix typos

:ci skip dl

											
										
										
											2021-02-12 05:34:04 +01:00
+								        if ism_doc.get('IsLive') == 'TRUE':
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								            return [], {}
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
 								        duration = int(ism_doc.attrib['Duration'])
 								        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
 								        formats = []
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								        subtitles = {}
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								        for stream in ism_doc.findall('StreamIndex'):
 								            stream_type = stream.get('Type')
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								            if stream_type not in ('video', 'audio', 'text'):
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								                continue
 								            url_pattern = stream.attrib['Url']
 								            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
 								            stream_name = stream.get('Name')
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								            stream_language = stream.get('Language', 'und')
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								            for track in stream.findall('QualityLevel'):
-												[downloader/ism] Support ec-3 codec (#5004)

Closes #296
Authored by: nixxo
											
										
										
											2022-09-30 19:33:29 +02:00
+								                KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
 								                fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								                # TODO: add support for WVC1 and WMAP
-												[downloader/ism] Support ec-3 codec (#5004)

Closes #296
Authored by: nixxo
											
										
										
											2022-09-30 19:33:29 +02:00
+								                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    self.report_warning(f'{fourcc} is not a supported codec')
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								                    continue
 								                tbr = int(track.attrib['Bitrate']) // 1000
-												[extractor/common] Respect Width and Height attributes in ISM manifests

											
										
										
											2017-05-14 01:11:45 +02:00
+								                # [1] does not mention Width and Height attributes. However,
 								                # they're often present while MaxWidth and MaxHeight are
 								                # missing, so should be used as fallbacks
 								                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
 								                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								                sampling_rate = int_or_none(track.get('SamplingRate'))
 								                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								                track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
 								                fragments = []
 								                fragment_ctx = {
 								                    'time': 0,
 								                }
 								                stream_fragments = stream.findall('c')
 								                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
 								                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
 								                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
 								                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
 								                    if not fragment_ctx['duration']:
 								                        try:
 								                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
 								                        except IndexError:
 								                            next_fragment_time = duration
-												[extractor/common] Fix typo

											
										
										
											2016-11-01 20:21:43 +01:00
+								                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								                    for _ in range(fragment_repeat):
 								                        fragments.append({
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								                            'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
+								                            'duration': fragment_ctx['duration'] / stream_timescale,
 								                        })
 								                        fragment_ctx['time'] += fragment_ctx['duration']
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								                if stream_type == 'text':
 								                    subtitles.setdefault(stream_language, []).append({
 								                        'ext': 'ismt',
 								                        'protocol': 'ism',
 								                        'url': ism_url,
 								                        'manifest_url': ism_url,
 								                        'fragments': fragments,
 								                        '_download_params': {
 								                            'stream_type': stream_type,
 								                            'duration': duration,
 								                            'timescale': stream_timescale,
 								                            'fourcc': fourcc,
 								                            'language': stream_language,
 								                            'codec_private_data': track.get('CodecPrivateData'),
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                        },
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								                    })
 								                elif stream_type in ('video', 'audio'):
 								                    formats.append({
-												[utils] Add `join_nonempty`

											
										
										
											2021-11-06 02:05:24 +01:00
+								                        'format_id': join_nonempty(ism_id, stream_name, tbr),
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								                        'url': ism_url,
 								                        'manifest_url': ism_url,
 								                        'ext': 'ismv' if stream_type == 'video' else 'isma',
 								                        'width': width,
 								                        'height': height,
 								                        'tbr': tbr,
 								                        'asr': sampling_rate,
 								                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
 								                        'acodec': 'none' if stream_type == 'video' else fourcc,
 								                        'protocol': 'ism',
 								                        'fragments': fragments,
-												[extractor] Better error message for DRM (#729)

Closes #636
											
										
										
											2021-08-22 22:08:38 +02:00
+								                        'has_drm': ism_doc.find('Protection') is not None,
-												[extractor] Extract more metadata from ISM

Fixes https://github.com/yt-dlp/yt-dlp/commit/81b6102d2099eec78a2db9ae3d101a8503dd4f25#r105892531

											
										
										
											2023-03-24 17:23:06 +01:00
+								                        'language': stream_language,
 								                        'audio_channels': int_or_none(track.get('Channels')),
-												[extractor/common, downloader/ism] Extract SSTR subtitle tracks

_parse_ism_formats was extended into _parse_ism_formats_and_subtitles;
all direct users were updated, though _extract_ism_formats was left
as a compatibility wrapper.

The SSTR downloader was also modified in order to prepare for muxing
subtitle streams, although no support for any subtitle codecs was
added in this commit.

											
										
										
											2021-04-18 12:47:42 +02:00
+								                        '_download_params': {
 								                            'stream_type': stream_type,
 								                            'duration': duration,
 								                            'timescale': stream_timescale,
 								                            'width': width or 0,
 								                            'height': height or 0,
 								                            'fourcc': fourcc,
 								                            'language': stream_language,
 								                            'codec_private_data': track.get('CodecPrivateData'),
 								                            'sampling_rate': sampling_rate,
 								                            'channels': int_or_none(track.get('Channels', 2)),
 								                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
 								                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
 								                        },
 								                    })
 								        return formats, subtitles
-												add Basic support for Smooth Streaming protocol(#8118)

											
										
										
											2016-10-19 17:22:40 +02:00
-												[downloader] Do not use aria2c for non-native `m3u8`

Closes #2718

											
										
										
											2022-02-11 07:39:03 +01:00
+								    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
-												[extractor/common] Improve thumbnail extraction for HTML5 entries

											
										
										
											2018-03-19 17:43:53 +01:00
+								        def absolute_url(item_url):
 								            return urljoin(base_url, item_url)
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
 								        def parse_content_type(content_type):
 								            if not content_type:
 								                return {}
 								            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
 								            if ctr:
 								                mimetype, codecs = ctr.groups()
 								                f = parse_codecs(codecs)
 								                f['ext'] = mimetype2ext(mimetype)
 								                return f
 								            return {}
-												[extractor/common] Recognize `src` attribute from HTML5 media elements (#3899)

Authored by: Lesmiscore
											
										
										
											2022-05-29 15:48:04 +02:00
+								        def _media_formats(src, cur_media_type, type_info=None):
 								            type_info = type_info or {}
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											2016-08-19 17:53:47 +02:00
+								            full_url = absolute_url(src)
-												[extractor/common] Respect source's type attribute for HTML5 media (closes #13892)

											
										
										
											2017-08-12 11:48:11 +02:00
+								            ext = type_info.get('ext') or determine_ext(full_url)
-												[extractor/common] Recognize DASH formats in html5 media entries

											
										
										
											2016-12-17 17:03:13 +01:00
+								            if ext == 'm3u8':
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											2016-08-19 17:53:47 +02:00
+								                is_plain_url = False
 								                formats = self._extract_m3u8_formats(
-												[extractor/common] Change the default m3u8 protocol in HTML5

Helper functions should have consistent default values

											
										
										
											2016-08-21 20:18:46 +02:00
+								                    full_url, video_id, ext='mp4',
-												[extractor/common] Add 'preference' to _parse_html5_media_entries

Some websites, like NJPWorld, put different qualities on different
player pages.

											
										
										
											2017-02-25 11:40:05 +01:00
+								                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better that ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities doesn't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 23:03:16 +01:00
+								                    preference=preference, quality=quality, fatal=False)
-												[extractor/common] Recognize DASH formats in html5 media entries

											
										
										
											2016-12-17 17:03:13 +01:00
+								            elif ext == 'mpd':
 								                is_plain_url = False
 								                formats = self._extract_mpd_formats(
-												[extractor/common] Make HLS and DASH extraction non fatal in _parse_html5_media_entries (closes #13970)

											
										
										
											2017-08-20 09:16:58 +02:00
+								                    full_url, video_id, mpd_id=mpd_id, fatal=False)
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											2016-08-19 17:53:47 +02:00
+								            else:
 								                is_plain_url = True
 								                formats = [{
 								                    'url': full_url,
 								                    'vcodec': 'none' if cur_media_type == 'audio' else None,
-												[extractor/common] Recognize `src` attribute from HTML5 media elements (#3899)

Authored by: Lesmiscore
											
										
										
											2022-05-29 15:48:04 +02:00
+								                    'ext': ext,
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											2016-08-19 17:53:47 +02:00
+								                }]
 								            return is_plain_url, formats
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								        entries = []
-												[extractor/common] Add support for AMP tags in _parse_html5_media_entries

											
										
										
											2017-07-09 11:29:52 +02:00
+								        # amp-video and amp-audio are very similar to their HTML5 counterparts
-												[cleanup] Fix some typos (#4194)

Authored by: crazymoose77756
											
										
										
											2022-06-27 02:50:06 +02:00
+								        # so we will include them right here (see
-												[extractor/common] Add support for AMP tags in _parse_html5_media_entries

											
										
										
											2017-07-09 11:29:52 +02:00
+								        # https://www.ampproject.org/docs/reference/components/amp-video)
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
 								        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
 								        media_tags = [(media_tag, media_tag_name, media_type, '')
 								                      for media_tag, media_tag_name, media_type
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                      in re.findall(rf'(?s)(<({_MEDIA_TAG_NAME_RE})[^>]*/>)', webpage)]
-												[extractor/common] Speed-up media tags regex (closes #11979)

											
										
										
											2017-02-05 18:20:30 +01:00
+								        media_tags.extend(re.findall(
 								            # We only allow video|audio followed by a whitespace or '>'.
 								            # Allowing more characters may end up in significant slow down (see
-												[docs] Consistent use of `e.g.` (#4643)

Authored by: Lesmiscore
											
										
										
											2022-08-14 14:04:13 +02:00
+								            # https://github.com/ytdl-org/youtube-dl/issues/11979,
 								            # e.g. http://www.porntrex.com/maps/videositemap.xml).
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								            rf'(?s)(<(?P<tag>{_MEDIA_TAG_NAME_RE})(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								        for media_tag, _, media_type, media_content in media_tags:
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								            media_info = {
 								                'formats': [],
 								                'subtitles': {},
 								            }
 								            media_attributes = extract_attributes(media_tag)
-												[extractor/newspicks] Add extractor (#4725)

Authored by: Lesmiscore
											
										
										
											2022-08-30 19:07:55 +02:00
+								            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								            if src:
-												[extractor/common] Recognize `src` attribute from HTML5 media elements (#3899)

Authored by: Lesmiscore
											
										
										
											2022-05-29 15:48:04 +02:00
+								                f = parse_content_type(media_attributes.get('type'))
 								                _, formats = _media_formats(src, media_type, f)
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											2016-08-19 17:53:47 +02:00
+								                media_info['formats'].extend(formats)
-												[extractor/common] Improve thumbnail extraction for HTML5 entries

											
										
										
											2018-03-19 17:43:53 +01:00
+								            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								            if media_content:
 								                for source_tag in re.findall(r'<source[^>]+>', media_content):
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								                    s_attr = extract_attributes(source_tag)
 								                    # data-video-src and data-src are non standard but seen
 								                    # several times in the wild
-												[extractor/newspicks] Add extractor (#4725)

Authored by: Lesmiscore
											
										
										
											2022-08-30 19:07:55 +02:00
+								                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								                    if not src:
 								                        continue
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								                    f = parse_content_type(s_attr.get('type'))
-												[extractor/common] Fix _media_formats

											
										
										
											2017-08-12 14:24:26 +02:00
+								                    is_plain_url, formats = _media_formats(src, media_type, f)
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											2016-08-19 17:53:47 +02:00
+								                    if is_plain_url:
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								                        # width, height, res, label and title attributes are
 								                        # all not standard but seen several times in the wild
 								                        labels = [
 								                            s_attr.get(lbl)
 								                            for lbl in ('label', 'title')
 								                            if str_or_none(s_attr.get(lbl))
 								                        ]
 								                        width = int_or_none(s_attr.get('width'))
-												Fix W504 and disable W503 (closes #20863)

											
										
										
											2019-05-10 22:56:22 +02:00
+								                        height = (int_or_none(s_attr.get('height'))
 								                                  or int_or_none(s_attr.get('res')))
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								                        if not width or not height:
 								                            for lbl in labels:
 								                                resolution = parse_resolution(lbl)
 								                                if not resolution:
 								                                    continue
 								                                width = width or resolution.get('width')
 								                                height = height or resolution.get('height')
 								                        for lbl in labels:
 								                            tbr = parse_bitrate(lbl)
 								                            if tbr:
 								                                break
 								                        else:
 								                            tbr = None
-												[extractor/common] Extract format id from label attribute of source tag for HTML5 videos (#14034)

											
										
										
											2017-08-26 22:27:05 +02:00
+								                        f.update({
-												[extractor/common] Improve HTML5 entries extraction and add some realworld tests

											
										
										
											2019-03-17 03:09:32 +01:00
+								                            'width': width,
 								                            'height': height,
 								                            'tbr': tbr,
 								                            'format_id': s_attr.get('label') or s_attr.get('title'),
-												[extractor/common] Extract format id from label attribute of source tag for HTML5 videos (#14034)

											
										
										
											2017-08-26 22:27:05 +02:00
+								                        })
-												[extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags

											
										
										
											2016-08-19 17:53:47 +02:00
+								                        f.update(formats[0])
 								                        media_info['formats'].append(f)
 								                    else:
 								                        media_info['formats'].extend(formats)
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								                for track_tag in re.findall(r'<track[^>]+>', media_content):
 								                    track_attributes = extract_attributes(track_tag)
 								                    kind = track_attributes.get('kind')
-												[extractor/common] Improved support for HTML5 subtitles

Ref: #10625

In a strict sense, <track>s with kind=captions are not subtitles. [1]
openload misuses this attribute, and I guess there will be more
examples, so I add it to common.py.

Also allow extracting information for subtitles-only <video> or <audio>
tags, which is the case of openload.

[1] https://www.w3.org/TR/html5/embedded-content-0.html#attr-track-kind

											
										
										
											2016-09-24 08:20:42 +02:00
+								                    if not kind or kind in ('subtitles', 'captions'):
-												[extractor/common] Strip src attribute for HTML5 entries code (closes #18485, closes #21169)

											
										
										
											2019-05-23 18:52:11 +02:00
+								                        src = strip_or_none(track_attributes.get('src'))
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								                        if not src:
 								                            continue
 								                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
 								                        media_info['subtitles'].setdefault(lang, []).append({
 								                            'url': absolute_url(src),
 								                        })
-												[extractor/common] Use source URL as Referer for HTML5 entries (closes #16849)

											
										
										
											2018-06-28 20:25:05 +02:00
+								            for f in media_info['formats']:
 								                f.setdefault('http_headers', {})['Referer'] = base_url
-												[extractor/common] Improved support for HTML5 subtitles

Ref: #10625

In a strict sense, <track>s with kind=captions are not subtitles. [1]
openload misuses this attribute, and I guess there will be more
examples, so I add it to common.py.

Also allow extracting information for subtitles-only <video> or <audio>
tags, which is the case of openload.

[1] https://www.w3.org/TR/html5/embedded-content-0.html#attr-track-kind

											
										
										
											2016-09-24 08:20:42 +02:00
+								            if media_info['formats'] or media_info['subtitles']:
-												[extractor/common] add helper method to extract html5 media entries

											
										
										
											2016-03-16 18:50:45 +01:00
+								                entries.append(media_info)
 								        return entries
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											2021-04-22 17:19:26 +02:00
+								    def _extract_akamai_formats(self, *args, **kwargs):
 								        fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
 								        if subs:
-												[cleanup] Misc cleanup

											
										
										
											2021-10-09 02:23:15 +02:00
+								            self._report_ignoring_subs('akamai')
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											2021-04-22 17:19:26 +02:00
+								        return fmts
 								    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								        signed = 'hdnea=' in manifest_url
 								        if not signed:
 								            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
 								            manifest_url = re.sub(
 								                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
 								                '', manifest_url).strip('?')
-												[common] add helper method to extract akamai m3u8 and f4m formats

											
										
										
											2016-08-22 08:47:25 +02:00
+								        formats = []
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											2021-04-22 17:19:26 +02:00
+								        subtitles = {}
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
-												[common] add hdcore sign to akamai f4m formats

											
										
										
											2016-09-24 22:55:53 +02:00
+								        hdcore_sign = 'hdcore=3.7.0'
-												[extractor/common] fix typo in _extract_akamai_formats

											
										
										
											2017-05-04 17:04:25 +02:00
+								        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
-												[common] add possibility to customize akamai manifest host

											
										
										
											2017-01-13 10:08:51 +01:00
+								        hds_host = hosts.get('hds')
 								        if hds_host:
 								            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
-												[common] add hdcore sign to akamai f4m formats

											
										
										
											2016-09-24 22:55:53 +02:00
+								        if 'hdcore=' not in f4m_url:
 								            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
 								        f4m_formats = self._extract_f4m_formats(
 								            f4m_url, video_id, f4m_id='hds', fatal=False)
 								        for entry in f4m_formats:
 								            entry.update({'extra_param_to_segment_url': hdcore_sign})
 								        formats.extend(f4m_formats)
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
-												[common] add possibility to customize akamai manifest host

											
										
										
											2017-01-13 10:08:51 +01:00
+								        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
 								        hls_host = hosts.get('hls')
 								        if hls_host:
 								            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											2021-04-22 17:19:26 +02:00
+								        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
-												[common] add helper method to extract akamai m3u8 and f4m formats

											
										
										
											2016-08-22 08:47:25 +02:00
+								            m3u8_url, video_id, 'mp4', 'm3u8_native',
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								            m3u8_id='hls', fatal=False)
 								        formats.extend(m3u8_formats)
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											2021-04-22 17:19:26 +02:00
+								        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
 								        http_host = hosts.get('http')
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								        if http_host and m3u8_formats and not signed:
 								            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
+								            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
 								            qualities_length = len(qualities)
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
+								                i = 0
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								                for f in m3u8_formats:
 								                    if f['vcodec'] != 'none':
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
+								                        for protocol in ('http', 'https'):
 								                            http_f = f.copy()
 								                            del http_f['manifest_url']
 								                            http_url = re.sub(
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								                                REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
+								                            http_f.update({
 								                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
 								                                'url': http_url,
 								                                'protocol': protocol,
 								                            })
-												Update to ytdl-2021.01.03

											
										
										
											2021-01-01 13:26:37 +01:00
+								                            formats.append(http_f)
-												Update to release 2020.11.24 except youtube and skyit extractors

											
										
										
											2020-11-23 22:03:08 +01:00
+								                        i += 1
-												[extractor/common] Extend _extract_akamai_formats to also extract subtitle tracks

											
										
										
											2021-04-22 17:19:26 +02:00
+								        return formats, subtitles
-												[common] add helper method to extract akamai m3u8 and f4m formats

											
										
										
											2016-08-22 08:47:25 +02:00
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								        query = urllib.parse.urlparse(url).query
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
-												[extractor/common] Respect secure schemes in _extract_wowza_formats

											
										
										
											2018-02-05 17:41:55 +01:00
+								        mobj = re.search(
 								            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
 								        url_base = mobj.group('url')
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        http_base_url = '{}{}:{}'.format('http', mobj.group('s') or '', url_base)
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								        formats = []
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											2017-11-01 17:39:26 +01:00
 								        def manifest_url(manifest):
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								            m_url = f'{http_base_url}/{manifest}'
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											2017-11-01 17:39:26 +01:00
+								            if query:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                m_url += f'?{query}'
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											2017-11-01 17:39:26 +01:00
+								            return m_url
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								        if 'm3u8' not in skip_protocols:
 								            formats.extend(self._extract_m3u8_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											2017-11-01 17:39:26 +01:00
+								                manifest_url('playlist.m3u8'), video_id, 'mp4',
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
 								        if 'f4m' not in skip_protocols:
 								            formats.extend(self._extract_f4m_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											2017-11-01 17:39:26 +01:00
+								                manifest_url('manifest.f4m'),
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								                video_id, f4m_id='hds', fatal=False))
-												[extractor/common] try to extract non smil wowza mpd manifests

											
										
										
											2016-10-19 15:57:12 +02:00
+								        if 'dash' not in skip_protocols:
 								            formats.extend(self._extract_mpd_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											2017-11-01 17:39:26 +01:00
+								                manifest_url('manifest.mpd'),
-												[extractor/common] try to extract non smil wowza mpd manifests

											
										
										
											2016-10-19 15:57:12 +02:00
+								                video_id, mpd_id='dash', fatal=False))
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								        if re.search(r'(?:/smil:|\.smil)', url_base):
 								            if 'smil' not in skip_protocols:
 								                rtmp_formats = self._extract_smil_formats(
-												[extractor/common] Respect URL query in _extract_wowza_formats (closes #14645)

											
										
										
											2017-11-01 17:39:26 +01:00
+								                    manifest_url('jwplayer.smil'),
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								                    video_id, fatal=False)
 								                for rtmp_format in rtmp_formats:
 								                    rtsp_format = rtmp_format.copy()
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    rtsp_format['url'] = '{}/{}'.format(rtmp_format['url'], rtmp_format['play_path'])
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								                    del rtsp_format['play_path']
 								                    del rtsp_format['ext']
 								                    rtsp_format.update({
 								                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
 								                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
 								                        'protocol': 'rtsp',
 								                    })
 								                    formats.extend([rtmp_format, rtsp_format])
 								        else:
 								            for protocol in ('rtmp', 'rtsp'):
 								                if protocol not in skip_protocols:
 								                    formats.append({
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								                        'url': f'{protocol}:{url_base}',
-												[common] add helper method for Wowza Streaming Engine format extraction

											
										
										
											2016-09-16 20:30:38 +02:00
+								                        'format_id': protocol,
 								                        'protocol': protocol,
 								                    })
 								        return formats
-												_find_jwplayer_data() returns dict or None

This simplifies code for callers of `_find_jwplayer_data()` which no longer have
to run `_parse_json()` on the return value.

It also makes sure that `_find_jwplayer_data()` returns either a `dict` or
`None` and nothing else.

											
										
										
											2017-03-25 19:38:30 +01:00
+								    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
-												Update to ytdl-commit-a08f2b7 (#10012)

[ie] Rework JWPlayer extraction
- https://github.com/ytdl-org/youtube-dl/commit/f66372403fd9e1661199fea100ba2600fa9697b2
[ie/gbnews] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/70f230f9cf28e948662599b6257cb7d1262870e3
[ie/caffeinetv] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/40bd5c18153afe765caa6726302ee1dd8a9a2ce6
[ie/youporn] Improve extraction
- https://github.com/ytdl-org/youtube-dl/commit/0b2ce3685e02ea1a3ccee1026572e081b8f6ac83
[ie/youporn] Add playlist extractors
- https://github.com/ytdl-org/youtube-dl/commit/668332b9733023ca2e927eeb2208725022248af8

Closes #9188, Closes #9523
Authored by: Grub4K, bashonly
											
										
										
											2024-05-26 23:09:53 +02:00
+								        return self._search_json(
 								            r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
 								            webpage, 'JWPlayer data', video_id,
 								            # must be a {...} or sequence, ending
 								            contains_pattern=r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)', end_pattern=r'(?(load)\]|\))',
 								            transform_source=transform_source, default=None)
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
-												Update to ytdl-commit-a08f2b7 (#10012)

[ie] Rework JWPlayer extraction
- https://github.com/ytdl-org/youtube-dl/commit/f66372403fd9e1661199fea100ba2600fa9697b2
[ie/gbnews] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/70f230f9cf28e948662599b6257cb7d1262870e3
[ie/caffeinetv] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/40bd5c18153afe765caa6726302ee1dd8a9a2ce6
[ie/youporn] Improve extraction
- https://github.com/ytdl-org/youtube-dl/commit/0b2ce3685e02ea1a3ccee1026572e081b8f6ac83
[ie/youporn] Add playlist extractors
- https://github.com/ytdl-org/youtube-dl/commit/668332b9733023ca2e927eeb2208725022248af8

Closes #9188, Closes #9523
Authored by: Grub4K, bashonly
											
										
										
											2024-05-26 23:09:53 +02:00
+								    def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
-												_find_jwplayer_data() returns dict or None

This simplifies code for callers of `_find_jwplayer_data()` which no longer have
to run `_parse_json()` on the return value.

It also makes sure that `_find_jwplayer_data()` returns either a `dict` or
`None` and nothing else.

											
										
										
											2017-03-25 19:38:30 +01:00
+								        jwplayer_data = self._find_jwplayer_data(
-												Update to ytdl-commit-a08f2b7 (#10012)

[ie] Rework JWPlayer extraction
- https://github.com/ytdl-org/youtube-dl/commit/f66372403fd9e1661199fea100ba2600fa9697b2
[ie/gbnews] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/70f230f9cf28e948662599b6257cb7d1262870e3
[ie/caffeinetv] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/40bd5c18153afe765caa6726302ee1dd8a9a2ce6
[ie/youporn] Improve extraction
- https://github.com/ytdl-org/youtube-dl/commit/0b2ce3685e02ea1a3ccee1026572e081b8f6ac83
[ie/youporn] Add playlist extractors
- https://github.com/ytdl-org/youtube-dl/commit/668332b9733023ca2e927eeb2208725022248af8

Closes #9188, Closes #9523
Authored by: Grub4K, bashonly
											
										
										
											2024-05-26 23:09:53 +02:00
+								            webpage, video_id, transform_source=transform_source)
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
+								        return self._parse_jwplayer_data(
 								            jwplayer_data, video_id, *args, **kwargs)
 								    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
 								                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
 								        entries = []
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								        if not isinstance(jwplayer_data, dict):
 								            return entries
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								        playlist_items = jwplayer_data.get('playlist')
 								        # JWPlayer backward compatibility: single playlist item/flattened playlists
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
+								        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
 								        if not isinstance(playlist_items, list):
 								            playlist_items = (playlist_items or jwplayer_data, )
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								        for video_data in playlist_items:
 								            if not isinstance(video_data, dict):
 								                continue
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
+								            # JWPlayer backward compatibility: flattened sources
 								            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
 								            if 'sources' not in video_data:
 								                video_data['sources'] = [video_data]
 								            this_video_id = video_id or video_data['mediaid']
-												[extractor/common] Pass arguments to _parse_jwplayer_formats and PEP8

											
										
										
											2017-03-05 17:28:32 +01:00
+								            formats = self._parse_jwplayer_formats(
 								                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
 								                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
 								            subtitles = {}
-												Update to ytdl-commit-a08f2b7 (#10012)

[ie] Rework JWPlayer extraction
- https://github.com/ytdl-org/youtube-dl/commit/f66372403fd9e1661199fea100ba2600fa9697b2
[ie/gbnews] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/70f230f9cf28e948662599b6257cb7d1262870e3
[ie/caffeinetv] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/40bd5c18153afe765caa6726302ee1dd8a9a2ce6
[ie/youporn] Improve extraction
- https://github.com/ytdl-org/youtube-dl/commit/0b2ce3685e02ea1a3ccee1026572e081b8f6ac83
[ie/youporn] Add playlist extractors
- https://github.com/ytdl-org/youtube-dl/commit/668332b9733023ca2e927eeb2208725022248af8

Closes #9188, Closes #9523
Authored by: Grub4K, bashonly
											
										
										
											2024-05-26 23:09:53 +02:00
+								            for track in traverse_obj(video_data, (
 								                    'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
 								                track_url = urljoin(base_url, track.get('file'))
 								                if not track_url:
 								                    continue
 								                subtitles.setdefault(track.get('label') or 'en', []).append({
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    'url': self._proto_relative_url(track_url),
-												Update to ytdl-commit-a08f2b7 (#10012)

[ie] Rework JWPlayer extraction
- https://github.com/ytdl-org/youtube-dl/commit/f66372403fd9e1661199fea100ba2600fa9697b2
[ie/gbnews] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/70f230f9cf28e948662599b6257cb7d1262870e3
[ie/caffeinetv] Add extractor
- https://github.com/ytdl-org/youtube-dl/commit/40bd5c18153afe765caa6726302ee1dd8a9a2ce6
[ie/youporn] Improve extraction
- https://github.com/ytdl-org/youtube-dl/commit/0b2ce3685e02ea1a3ccee1026572e081b8f6ac83
[ie/youporn] Add playlist extractors
- https://github.com/ytdl-org/youtube-dl/commit/668332b9733023ca2e927eeb2208725022248af8

Closes #9188, Closes #9523
Authored by: Grub4K, bashonly
											
										
										
											2024-05-26 23:09:53 +02:00
+								                })
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
-												[common] add support for jwplayer youtube embeds

											
										
										
											2017-10-12 18:12:47 +02:00
+								            entry = {
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
+								                'id': this_video_id,
-												[common] add support for jwplayer youtube embeds

											
										
										
											2017-10-12 18:12:47 +02:00
+								                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
-												[extractor/common] clean jwplayer description HTML tags

											
										
										
											2019-11-09 13:11:59 +01:00
+								                'description': clean_html(video_data.get('description')),
-												[extractor/common] improve jwplayer relative url handling(closes #18892)

											
										
										
											2019-01-20 13:31:41 +01:00
+								                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
+								                'timestamp': int_or_none(video_data.get('pubdate')),
 								                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
 								                'subtitles': subtitles,
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
 								                'genre': clean_html(video_data.get('genre')),
 								                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
 								                'season_number': int_or_none(video_data.get('season')),
 								                'episode_number': int_or_none(video_data.get('episode')),
 								                'release_year': int_or_none(video_data.get('releasedate')),
 								                'age_limit': int_or_none(video_data.get('age_restriction')),
-												[common] add support for jwplayer youtube embeds

											
										
										
											2017-10-12 18:12:47 +02:00
+								            }
 								            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
 								            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
 								                entry.update({
 								                    '_type': 'url_transparent',
 								                    'url': formats[0]['url'],
 								                })
 								            else:
 								                entry['formats'] = formats
 								            entries.append(entry)
-												[generic] Try parsing JWPlayer embedded videos (closes #12030)

											
										
										
											2017-02-16 16:42:36 +01:00
+								        if len(entries) == 1:
 								            return entries[0]
 								        else:
 								            return self.playlist_result(entries)
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
 								                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								        urls = set()
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								        formats = []
-												[extractor/common] Pass arguments to _parse_jwplayer_formats and PEP8

											
										
										
											2017-03-05 17:28:32 +01:00
+								        for source in jwplayer_sources_data:
-												[extractor/common] Improve jwplayer formats extraction (closes #13379)

											
										
										
											2017-06-14 17:02:15 +02:00
+								            if not isinstance(source, dict):
 								                continue
-												[extractor/common] improve jwplayer relative url handling(closes #18892)

											
										
										
											2019-01-20 13:31:41 +01:00
+								            source_url = urljoin(
 								                base_url, self._proto_relative_url(source.get('file')))
 								            if not source_url or source_url in urls:
-												[common] Relax JWPlayer regex and remove duplicate urls(#12768)

											
										
										
											2017-04-17 09:48:24 +02:00
+								                continue
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								            urls.add(source_url)
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								            source_type = source.get('type') or ''
 								            ext = mimetype2ext(source_type) or determine_ext(source_url)
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                formats.extend(self._extract_m3u8_formats(
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											2017-03-05 17:25:03 +01:00
+								                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
 								                    m3u8_id=m3u8_id, fatal=False))
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								            elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                formats.extend(self._extract_mpd_formats(
 								                    source_url, video_id, mpd_id=mpd_id, fatal=False))
-												[extractor/common] Extract SMIL formats from jwplayer

											
										
										
											2017-03-15 21:30:53 +01:00
+								            elif ext == 'smil':
 								                formats.extend(self._extract_smil_formats(
 								                    source_url, video_id, fatal=False))
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											2017-03-05 17:25:03 +01:00
+								            elif source_type.startswith('audio') or ext in (
 								                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                formats.append({
 								                    'url': source_url,
 								                    'vcodec': 'none',
 								                    'ext': ext,
 								                })
 								            else:
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								                format_id = str_or_none(source.get('label'))
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                height = int_or_none(source.get('height'))
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								                if height is None and format_id:
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                    # Often no height is provided but there is a label in
-												[extractor/common] Improve height extraction and extract bitrate

											
										
										
											2017-03-05 17:25:03 +01:00
+								                    # format like "1080p", "720p SD", or 1080.
-												Update to ytdl-commit-195f22f6

[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan
											
										
										
											2023-01-02 14:45:36 +01:00
+								                    height = parse_resolution(format_id).get('height')
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                a_format = {
 								                    'url': source_url,
 								                    'width': int_or_none(source.get('width')),
 								                    'height': height,
-												[extractor/JWPlatform] Fix extractor (#5112)

Fix bitrate and filesize extraction and support embeds with unquoted urls.

Related: #5106 

Authored by: coletdjnz
											
										
										
											2022-10-03 21:37:48 +02:00
+								                    'tbr': int_or_none(source.get('bitrate'), scale=1000),
 								                    'filesize': int_or_none(source.get('filesize')),
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                    'ext': ext,
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                    'format_id': format_id,
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                }
 								                if source_url.startswith('rtmp'):
 								                    a_format['ext'] = 'flv'
 								                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
 								                    # of jwplayer.flash.swf
 								                    rtmp_url_parts = re.split(
-												[misc] Cleanup (#9765)

Closes #9763
Authored by: bashonly, seproDev, Grub4K

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
											
										
										
											2024-05-26 23:37:49 +02:00
+								                        r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
-												[extractor/common] Move jwplayer formats extraction in separate method

											
										
										
											2017-03-05 17:22:27 +01:00
+								                    if len(rtmp_url_parts) == 3:
 								                        rtmp_url, prefix, play_path = rtmp_url_parts
 								                        a_format.update({
 								                            'url': rtmp_url,
 								                            'play_path': prefix + play_path,
 								                        })
 								                    if rtmp_params:
 								                        a_format.update(rtmp_params)
 								                formats.append(a_format)
 								        return formats
-												[muenchentv] Move live title generation to common

											
										
										
											2014-09-28 08:53:52 +02:00
+								    def _live_title(self, name):
-												[extractor] Standardize `_live_title`

											
										
										
											2021-12-15 17:00:46 +01:00
+								        self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
 								        return name
-												[muenchentv] Move live title generation to common

											
										
										
											2014-09-28 08:53:52 +02:00
-												[golem] Simplify (#3828)

											
										
										
											2014-09-28 10:34:55 +02:00
+								    def _int(self, v, name, fatal=False, **kwargs):
 								        res = int_or_none(v, **kwargs)
 								        if res is None:
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								            msg = f'Failed to extract {name}: Could not parse value {v!r}'
-												[golem] Simplify (#3828)

											
										
										
											2014-09-28 10:34:55 +02:00
+								            if fatal:
 								                raise ExtractorError(msg)
 								            else:
-												Fix inconsistent use of `report_warning`

											
										
										
											2021-04-16 12:01:10 +02:00
+								                self.report_warning(msg)
-												[golem] Simplify (#3828)

											
										
										
											2014-09-28 10:34:55 +02:00
+								        return res
 								    def _float(self, v, name, fatal=False, **kwargs):
 								        res = float_or_none(v, **kwargs)
 								        if res is None:
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								            msg = f'Failed to extract {name}: Could not parse value {v!r}'
-												[golem] Simplify (#3828)

											
										
										
											2014-09-28 10:34:55 +02:00
+								            if fatal:
 								                raise ExtractorError(msg)
 								            else:
-												Fix inconsistent use of `report_warning`

											
										
										
											2021-04-16 12:01:10 +02:00
+								                self.report_warning(msg)
-												[golem] Simplify (#3828)

											
										
										
											2014-09-28 10:34:55 +02:00
+								        return res
-												[phantomjs] add cookie support

											
										
										
											2017-04-25 15:12:54 +02:00
+								    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
 								                    path='/', secure=False, discard=False, rest={}, **kwargs):
-												[compat] Remove more functions

Removing any more will require changes to a large number of extractors

											
										
										
											2022-06-24 10:10:17 +02:00
+								        cookie = http.cookiejar.Cookie(
-												Fix flake8 issues after #14225

											
										
										
											2017-09-17 07:53:04 +02:00
+, name, value, port, port is not None, domain, True,
-												[phantomjs] add cookie support

											
										
										
											2017-04-25 15:12:54 +02:00
+								            domain.startswith('.'), path, True, secure, expire_time,
 								            discard, None, None, rest)
-												[extractor, cleanup] Reduce direct use of `_downloader`

											
										
										
											2022-06-23 06:14:22 +02:00
+								        self.cookiejar.set_cookie(cookie)
-												[youtube] Use a cookie for seeting the language

This way, we don't have to do an aditional request

											
										
										
											2014-11-30 00:03:59 +01:00
-												[viewster] extract the api auth token

Closes #6406.

											
										
										
											2015-07-30 00:20:37 +02:00
+								    def _get_cookies(self, url):
-												[compat] Remove more functions

Removing any more will require changes to a large number of extractors

											
										
										
											2022-06-24 10:10:17 +02:00
+								        """ Return a http.cookies.SimpleCookie with the cookies for the url """
-												[cookies] Move `YoutubeDLCookieJar` to cookies module (#7091)

Authored by: coletdjnz
											
										
										
											2023-05-27 09:08:19 +02:00
+								        return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
-												[viewster] extract the api auth token

Closes #6406.

											
										
										
											2015-07-30 00:20:37 +02:00
-												[extractor/common] Move workaround for applying first Set-Cookie header into a separate method

											
										
										
											2019-05-17 22:17:15 +02:00
+								    def _apply_first_set_cookie_header(self, url_handle, cookie):
-												[extractor/common] Add doc string for _apply_first_set_cookie_header

											
										
										
											2019-05-20 18:23:18 +02:00
+								        """
 								        Apply first Set-Cookie header instead of the last. Experimental.
 								        Some sites (e.g. [1-3]) may serve two cookies under the same name
 								        in Set-Cookie header and expect the first (old) one to be set rather
 								        than second (new). However, as of RFC6265 the newer one cookie
 								        should be set into cookie store what actually happens.
 								        We will workaround this issue by resetting the cookie to
 								        the first one manually.
 . https://new.vk.com/
 . https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
 . https://learning.oreilly.com/
 								        """
-												[extractor/common] Move workaround for applying first Set-Cookie header into a separate method

											
										
										
											2019-05-17 22:17:15 +02:00
+								        for header, cookies in url_handle.headers.items():
 								            if header.lower() != 'set-cookie':
 								                continue
-												[cleanup] Remove unused code paths (#2173)

Notes:

* `_windows_write_string`: Fixed in 3.6
  * https://bugs.python.org/issue1602
  * PEP: https://www.python.org/dev/peps/pep-0528

* Windows UTF-8 fix: Fixed in 3.3
  * https://bugs.python.org/issue13216

* `__loader__`: is always present in 3.3+
  * https://bugs.python.org/issue14646

* `workaround_optparse_bug9161`: Fixed in 2.7
  * https://bugs.python.org/issue9161

Authored by: fstirlitz

											
										
										
											2021-12-30 13:23:36 +01:00
+								            cookies = cookies.encode('iso-8859-1').decode('utf-8')
-												[extractor/common] Move workaround for applying first Set-Cookie header into a separate method

											
										
										
											2019-05-17 22:17:15 +02:00
+								            cookie_value = re.search(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                rf'{cookie}=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', cookies)
-												[extractor/common] Move workaround for applying first Set-Cookie header into a separate method

											
										
										
											2019-05-17 22:17:15 +02:00
+								            if cookie_value:
 								                value, domain = cookie_value.groups()
 								                self._set_cookie(domain, cookie, value)
 								                break
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								    @classmethod
 								    def get_testcases(cls, include_onlymatching=False):
-												[cleanup] Misc

Closes #5541

											
										
										
											2022-11-16 01:57:43 +01:00
+								        # Do not look in super classes
 								        t = vars(cls).get('_TEST')
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											2015-01-07 07:20:20 +01:00
+								        if t:
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								            assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											2015-01-07 07:20:20 +01:00
+								            tests = [t]
 								        else:
-												[cleanup] Misc

Closes #5541

											
										
										
											2022-11-16 01:57:43 +01:00
+								            tests = vars(cls).get('_TESTS', [])
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											2015-01-07 07:20:20 +01:00
+								        for t in tests:
 								            if not include_onlymatching and t.get('only_matching', False):
 								                continue
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								            t['name'] = cls.ie_key()
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											2015-01-07 07:20:20 +01:00
+								            yield t
-												Improve handling for overriding extractors with plugins (#5916)

* Extractors replaced with plugin extractors now show in debug output
* Better testcase handling
* Added documentation
Authored by: coletdjnz, pukkandan
											
										
										
											2023-01-02 05:55:11 +01:00
+								        if getattr(cls, '__wrapped__', None):
 								            yield from cls.__wrapped__.get_testcases(include_onlymatching)
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											2015-01-07 07:20:20 +01:00
-												[extractor, test] Basic framework for embed tests (#4307)

and split download tests so they can be more easily run in CI

Authored by: coletdjnz

											
										
										
											2022-07-08 13:23:05 +02:00
+								    @classmethod
 								    def get_webpage_testcases(cls):
-												[cleanup] Misc

Closes #5541

											
										
										
											2022-11-16 01:57:43 +01:00
+								        tests = vars(cls).get('_WEBPAGE_TESTS', [])
-												[extractor, test] Basic framework for embed tests (#4307)

and split download tests so they can be more easily run in CI

Authored by: coletdjnz

											
										
										
											2022-07-08 13:23:05 +02:00
+								        for t in tests:
 								            t['name'] = cls.ie_key()
-												Improve handling for overriding extractors with plugins (#5916)

* Extractors replaced with plugin extractors now show in debug output
* Better testcase handling
* Added documentation
Authored by: coletdjnz, pukkandan
											
										
										
											2023-01-02 05:55:11 +01:00
+								            yield t
 								        if getattr(cls, '__wrapped__', None):
 								            yield from cls.__wrapped__.get_webpage_testcases()
-												[extractor, test] Basic framework for embed tests (#4307)

and split download tests so they can be more easily run in CI

Authored by: coletdjnz

											
										
										
											2022-07-08 13:23:05 +02:00
-												[cleanup] Misc

Closes #5541

											
										
										
											2022-11-16 01:57:43 +01:00
+								    @classproperty(cache=True)
-												[cleanup] Misc cleanup

											
										
										
											2022-05-16 16:06:36 +02:00
+								    def age_limit(cls):
 								        """Get age limit from the testcases"""
 								        return max(traverse_obj(
-												[extractor, test] Basic framework for embed tests (#4307)

and split download tests so they can be more easily run in CI

Authored by: coletdjnz

											
										
										
											2022-07-08 13:23:05 +02:00
+								            (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
-												[cleanup] Misc cleanup

											
										
										
											2022-05-16 16:06:36 +02:00
+								            (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
-												[extractor] Add a way to distinguish IEs that returns only videos

											
										
										
											2022-11-13 06:26:04 +01:00
+								    @classproperty(cache=True)
 								    def _RETURN_TYPE(cls):
 								        """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
 								        tests = tuple(cls.get_testcases(include_onlymatching=False))
 								        if not tests:
 								            return None
 								        elif not any(k.startswith('playlist') for test in tests for k in test):
 								            return 'video'
 								        elif all(any(k.startswith('playlist') for k in test) for test in tests):
 								            return 'playlist'
 								        return 'any'
 								    @classmethod
 								    def is_single_video(cls, url):
 								        """Returns whether the URL is of a single video, None if unknown"""
-												[extractor] Do not exit early for unsuitable `url_result`

											
										
										
											2023-03-24 17:23:45 +01:00
+								        if cls.suitable(url):
 								            return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
-												[extractor] Add a way to distinguish IEs that returns only videos

											
										
										
											2022-11-13 06:26:04 +01:00
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								    @classmethod
 								    def is_suitable(cls, age_limit):
-												[cleanup] Misc cleanup

											
										
										
											2022-05-16 16:06:36 +02:00
+								        """Test whether the extractor is generally suitable for the given age limit"""
 								        return not age_restricted(cls.age_limit, age_limit)
-												Respect age_limit when listing extractors (Fixes #4653)

											
										
										
											2015-01-07 07:20:20 +01:00
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								    @classmethod
 								    def description(cls, *, markdown=True, search_examples=None):
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								        """Description of the extractor"""
 								        desc = ''
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								        if cls._NETRC_MACHINE:
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								            if markdown:
-												[cleanup] Misc

											
										
										
											2023-02-28 19:01:02 +01:00
+								                desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								            else:
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								                desc += f' [{cls._NETRC_MACHINE}]'
 								        if cls.IE_DESC is False:
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								            desc += ' [HIDDEN]'
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								        elif cls.IE_DESC:
 								            desc += f' {cls.IE_DESC}'
 								        if cls.SEARCH_KEY:
-												[cleanup] Misc

Closes #5576, closes #5887

											
										
										
											2023-01-02 15:09:03 +01:00
+								            desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								            if search_examples:
 								                _COUNTS = ('', '5', '10', 'all')
-												[docs] Consistent use of `e.g.` (#4643)

Authored by: Lesmiscore
											
										
										
											2022-08-14 14:04:13 +02:00
+								                desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
+								        if not cls.working():
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								            desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
-												[cleanup] Lint and misc cleanup

											
										
										
											2022-11-06 21:59:58 +01:00
+								        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        name = (' - **{}**'.format(re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME))) if markdown else cls.IE_NAME
-												[extractor] Document netrc machines

Closes #3169

											
										
										
											2022-05-09 06:32:17 +02:00
+								        return f'{name}:{desc}' if desc else name
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											2015-02-15 18:03:41 +01:00
+								    def extract_subtitles(self, *args, **kwargs):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        if (self.get_param('writesubtitles', False)
 								                or self.get_param('listsubtitles')):
-												[extractor/common] Simplify subtitles handling methods

Initially I was going to use a single method for handling both subtitles and automatic captions, that's why I used the 'list_subtitles' and the 'subtitles' variables.

											
										
										
											2015-02-17 22:16:29 +01:00
+								            return self._get_subtitles(*args, **kwargs)
 								        return {}
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											2015-02-15 18:03:41 +01:00
 								    def _get_subtitles(self, *args, **kwargs):
-												[refactor] Single quotes consistency

											
										
										
											2016-02-14 10:37:17 +01:00
+								        raise NotImplementedError('This method must be implemented by subclasses')
-												Improve subtitles support

For each language the extractor builds a list with the available formats sorted (like for video formats), then YoutubeDL selects one of them using the '--sub-format' option which now allows giving the format preferences (for example 'ass/srt/best').
For each format the 'url' field can be set so that we only download the contents if needed, or if the contents needs to be processed (like in crunchyroll) the 'data' field can be used.

The reasons for this change are:
* We weren't checking that the format given with '--sub-format' was available, checking it in each extractor would be repetitive.
* It allows to easily support giving a format preference.
* The subtitles were automatically downloaded in the extractor, but I think that if you use for example the '--dump-json' option you want to finish as fast as possible.

Currently only the ted extractor has been updated, but the old system still works.

											
										
										
											2015-02-15 18:03:41 +01:00
-												[extractor/youtube] Differentiate between no and disabled comments (#5491)

`comments` and `comment_count` will be set to None, as opposed to 
an empty list and 0, respectively.

Fixes https://github.com/yt-dlp/yt-dlp/issues/5068

Authored by: coletdjnz, pukkandan
											
										
										
											2022-11-10 04:33:03 +01:00
+								    class CommentsDisabled(Exception):
 								        """Raise in _get_comments if comments are disabled for the video"""
-												[extractor] Generalize `getcomments` implementation

											
										
										
											2021-10-12 11:50:50 +02:00
+								    def extract_comments(self, *args, **kwargs):
 								        if not self.get_param('getcomments'):
 								            return None
 								        generator = self._get_comments(*args, **kwargs)
 								        def extractor():
 								            comments = []
-												[extractor] Ignore errors in comment extraction when `-i` is given
Closes #1787

											
										
										
											2021-12-02 23:00:37 +01:00
+								            interrupted = True
-												[extractor] Generalize `getcomments` implementation

											
										
										
											2021-10-12 11:50:50 +02:00
+								            try:
 								                while True:
 								                    comments.append(next(generator))
 								            except StopIteration:
 								                interrupted = False
-												[extractor] Ignore errors in comment extraction when `-i` is given
Closes #1787

											
										
										
											2021-12-02 23:00:37 +01:00
+								            except KeyboardInterrupt:
 								                self.to_screen('Interrupted by user')
-												[extractor/youtube] Differentiate between no and disabled comments (#5491)

`comments` and `comment_count` will be set to None, as opposed to 
an empty list and 0, respectively.

Fixes https://github.com/yt-dlp/yt-dlp/issues/5068

Authored by: coletdjnz, pukkandan
											
										
										
											2022-11-10 04:33:03 +01:00
+								            except self.CommentsDisabled:
 								                return {'comments': None, 'comment_count': None}
-												[extractor] Ignore errors in comment extraction when `-i` is given
Closes #1787

											
										
										
											2021-12-02 23:00:37 +01:00
+								            except Exception as e:
 								                if self.get_param('ignoreerrors') is not True:
 								                    raise
 								                self._downloader.report_error(e)
-												[extractor] Generalize `getcomments` implementation

											
										
										
											2021-10-12 11:50:50 +02:00
+								            comment_count = len(comments)
 								            self.to_screen(f'Extracted {comment_count} comments')
 								            return {
 								                'comments': comments,
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								                'comment_count': None if interrupted else comment_count,
-												[extractor] Generalize `getcomments` implementation

											
										
										
											2021-10-12 11:50:50 +02:00
+								            }
 								        return extractor
 								    def _get_comments(self, *args, **kwargs):
 								        raise NotImplementedError('This method must be implemented by subclasses')
-												[common] Add _merge_subtitles()

											
										
										
											2015-08-20 19:37:07 +02:00
+								    @staticmethod
 								    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
-												[extractor] Support merging subtitles with data

Authored-by: coletdjnz

											
										
										
											2022-03-11 23:20:58 +01:00
+								        """ Merge subtitle items for one language. Items with duplicated URLs/data
-												[common] Add _merge_subtitles()

											
										
										
											2015-08-20 19:37:07 +02:00
+								        will be dropped. """
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								        list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
-												[common] Add _merge_subtitles()

											
										
										
											2015-08-20 19:37:07 +02:00
+								        ret = list(subtitle_list1)
-												[cleanup] Misc fixes

Closes https://github.com/yt-dlp/yt-dlp/pull/3213, Closes https://github.com/yt-dlp/yt-dlp/pull/3117

Related: https://github.com/yt-dlp/yt-dlp/issues/3146#issuecomment-1077323114, https://github.com/yt-dlp/yt-dlp/pull/3277#discussion_r841019671, https://github.com/yt-dlp/yt-dlp/commit/a825ffbffa0bea322e3ccb44c6f8e01d8d9572fb#commitcomment-68538986, https://github.com/yt-dlp/yt-dlp/issues/2360, https://github.com/yt-dlp/yt-dlp/commit/5fa3c9a88f597625296981a4a26be723e65d4842#r70393519, https://github.com/yt-dlp/yt-dlp/commit/5fa3c9a88f597625296981a4a26be723e65d4842#r70393254

											
										
										
											2022-03-27 04:20:43 +02:00
+								        ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
-												[common] Add _merge_subtitles()

											
										
										
											2015-08-20 19:37:07 +02:00
+								        return ret
 								    @classmethod
-												[extractor] Minor improvements (See desc)

1. Allow removal of login hint - extractors can set their own login hint as part of `msg`
2. Cleanup `_merge_subtitles` signature

											
										
										
											2021-07-06 22:57:53 +02:00
+								    def _merge_subtitles(cls, *dicts, target=None):
-												[extractor/common] Generalise _merge_subtitles

This allows modifying a subtitles dictionary in-place.

											
										
										
											2021-04-19 19:25:54 +02:00
+								        """ Merge subtitle dictionaries, language by language. """
 								        if target is None:
 								            target = {}
 								        for d in dicts:
 								            for lang, subs in d.items():
 								                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
 								        return target
-												[common] Add _merge_subtitles()

											
										
										
											2015-08-20 19:37:07 +02:00
-												[youtube] Convert to new subtitles system

The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.

											
										
										
											2015-02-16 21:44:17 +01:00
+								    def extract_automatic_captions(self, *args, **kwargs):
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        if (self.get_param('writeautomaticsub', False)
 								                or self.get_param('listsubtitles')):
-												[extractor/common] Simplify subtitles handling methods

Initially I was going to use a single method for handling both subtitles and automatic captions, that's why I used the 'list_subtitles' and the 'subtitles' variables.

											
										
										
											2015-02-17 22:16:29 +01:00
+								            return self._get_automatic_captions(*args, **kwargs)
 								        return {}
-												[youtube] Convert to new subtitles system

The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.

											
										
										
											2015-02-16 21:44:17 +01:00
 								    def _get_automatic_captions(self, *args, **kwargs):
-												[refactor] Single quotes consistency

											
										
										
											2016-02-14 10:37:17 +01:00
+								        raise NotImplementedError('This method must be implemented by subclasses')
-												[youtube] Convert to new subtitles system

The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.

											
										
										
											2015-02-16 21:44:17 +01:00
-												[compat] Add `functools.cached_property`

											
										
										
											2022-05-20 17:25:21 +02:00
+								    @functools.cached_property
-												[cleanup] Misc cleanup

											
										
										
											2022-05-16 16:06:36 +02:00
+								    def _cookies_passed(self):
 								        """Whether cookies have been passed to YoutubeDL"""
 								        return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
-												Add --mark-watched feature (Closes #5054)

											
										
										
											2016-02-29 20:01:33 +01:00
+								    def mark_watched(self, *args, **kwargs):
-												[youtube] Fix `--mark-watched` with `--cookies-from-browser`
Closes #1019

											
										
										
											2021-09-23 22:46:03 +02:00
+								        if not self.get_param('mark_watched', False):
 								            return
-												[cleanup] Misc cleanup

											
										
										
											2022-05-16 16:06:36 +02:00
+								        if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
-												Add --mark-watched feature (Closes #5054)

											
										
										
											2016-02-29 20:01:33 +01:00
+								            self._mark_watched(*args, **kwargs)
 								    def _mark_watched(self, *args, **kwargs):
 								        raise NotImplementedError('This method must be implemented by subclasses')
-												Rename --cn-verification-proxy to --geo-verification-proxy

And deprecate the former one

Since commit f1388739002a7fd1e8e9c41b642734786fc6c391, this option is
not limited to China websites, so rename it.

											
										
										
											2016-07-03 17:23:48 +02:00
+								    def geo_verification_headers(self):
 								        headers = {}
-												[extractor] Add `write_debug` and `get_param`

											
										
										
											2021-05-17 14:23:08 +02:00
+								        geo_verification_proxy = self.get_param('geo_verification_proxy')
-												Rename --cn-verification-proxy to --geo-verification-proxy

And deprecate the former one

Since commit f1388739002a7fd1e8e9c41b642734786fc6c391, this option is
not limited to China websites, so rename it.

											
										
										
											2016-07-03 17:23:48 +02:00
+								        if geo_verification_proxy:
 								            headers['Ytdl-request-proxy'] = geo_verification_proxy
 								        return headers
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    @staticmethod
 								    def _generic_id(url):
-												[compat] Remove deprecated functions from core code

											
										
										
											2022-06-24 12:54:43 +02:00
+								        return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
-												[extractor/common] Add id and title helpers for generic IEs

											
										
										
											2016-10-07 13:20:53 +02:00
-												[extractor] Improve `_generic_title`

											
										
										
											2022-10-31 13:05:20 +01:00
+								    def _generic_title(self, url='', webpage='', *, default=None):
 								        return (self._og_search_title(webpage, default=None)
 								                or self._html_extract_title(webpage, default=None)
 								                or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
 								                or default)
-												[extractor/common] Add id and title helpers for generic IEs

											
										
										
											2016-10-07 13:20:53 +02:00
-												[extractor/rutube] Extract chapters from description (#6345)

Authored by: mushbite
											
										
										
											2023-03-04 14:33:17 +01:00
+								    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
 								        if not duration:
 								            return
 								        chapter_list = [{
 								            'start_time': start_function(chapter),
 								            'title': title_function(chapter),
 								        } for chapter in chapter_list or []]
-												[extractor] Do not warn for invalid chapter data in description

Fixes https://github.com/yt-dlp/yt-dlp/issues/6811#issuecomment-1509876209

											
										
										
											2023-04-15 23:46:23 +02:00
+								        if strict:
 								            warn = self.report_warning
 								        else:
 								            warn = self.write_debug
-												[extractor/rutube] Extract chapters from description (#6345)

Authored by: mushbite
											
										
										
											2023-03-04 14:33:17 +01:00
+								            chapter_list.sort(key=lambda c: c['start_time'] or 0)
 								        chapters = [{'start_time': 0}]
 								        for idx, chapter in enumerate(chapter_list):
 								            if chapter['start_time'] is None:
-												[extractor] Do not warn for invalid chapter data in description

Fixes https://github.com/yt-dlp/yt-dlp/issues/6811#issuecomment-1509876209

											
										
										
											2023-04-15 23:46:23 +02:00
+								                warn(f'Incomplete chapter {idx}')
-												[extractor/rutube] Extract chapters from description (#6345)

Authored by: mushbite
											
										
										
											2023-03-04 14:33:17 +01:00
+								            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
 								                chapters.append(chapter)
 								            elif chapter not in chapters:
-												[extractor] Do not warn for invalid chapter data in description

Fixes https://github.com/yt-dlp/yt-dlp/issues/6811#issuecomment-1509876209

											
										
										
											2023-04-15 23:46:23 +02:00
+								                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
 								                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
 								                warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
-												[extractor/rutube] Extract chapters from description (#6345)

Authored by: mushbite
											
										
										
											2023-03-04 14:33:17 +01:00
+								        return chapters[1:]
 								    def _extract_chapters_from_description(self, description, duration):
 								        """Parse chapters out of description lines shaped like "<timestamp> <title>" or "<title> <timestamp>"
 								        The timestamp-first layout is tried first; the title-first layout is only attempted
 								        when the first attempt gives a falsy result. Both attempts run with strict=False.
 								        """
 								        text = description or ''
 								        timestamp_re = r'(?:\d+:)?\d{1,2}:\d{2}'
 								        line_template = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
 								        # Attempt 1: each line starts with the timestamp, the title follows
 								        timestamp_first = self._extract_chapters_helper(
 								            re.findall(line_template % (timestamp_re, r'.+?'), text),
 								            start_function=lambda match: parse_duration(match[0]),
 								            title_function=lambda match: match[1],
 								            duration=duration, strict=False)
 								        if timestamp_first:
 								            return timestamp_first
 								        # Attempt 2: each line starts with the title, the timestamp follows
 								        return self._extract_chapters_helper(
 								            re.findall(line_template % (r'.+?', timestamp_re), text),
 								            start_function=lambda match: parse_duration(match[1]),
 								            title_function=lambda match: match[0],
 								            duration=duration, strict=False)
-												[youtube] Show if video is `private`, `unlisted` etc in new field `availability` (#188)
Closes: #185, https://github.com/ytdl-org/youtube-dl/issues/25631

Authored by: colethedj, pukkandan

											
										
										
											2021-03-21 22:23:34 +01:00
+								    @staticmethod
-												[fancode] Add extractor (#316,#354)
Closes #269, #363

Authored by: rmsmachine

											
										
										
											2021-05-30 15:59:00 +02:00
+								    def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        all_known = all(
 								            x is not None for x in
 								            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
-												[youtube] Show if video is `private`, `unlisted` etc in new field `availability` (#188)
Closes: #185, https://github.com/ytdl-org/youtube-dl/issues/25631

Authored by: colethedj, pukkandan

											
										
										
											2021-03-21 22:23:34 +01:00
+								        return (
 								            'private' if is_private
 								            else 'premium_only' if needs_premium
 								            else 'subscriber_only' if needs_subscription
 								            else 'needs_auth' if needs_auth
 								            else 'unlisted' if is_unlisted
 								            else 'public' if all_known
 								            else None)
-												[GameJolt] Add extractors (#2036)

Authored by: MinePlayersPE
											
										
										
											2021-12-24 23:28:57 +01:00
+								    def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
-												Improve `extractor_args` parsing

											
										
										
											2021-07-08 17:33:13 +02:00
+								        '''
 								        @returns            A list of values for the extractor argument given by "key"
 								                            or "default" if no such key is present
 								        @param default      The default value to return when the key is not present (default: [])
 								        @param casesense    When false, the values are converted to lower case
 								        '''
-												[extractor/youtube:tab] Let `approximate_date` return timestamp

											
										
										
											2022-10-13 00:53:39 +02:00
+								        ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
 								        val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
-												Improve `extractor_args` parsing

											
										
										
											2021-07-08 17:33:13 +02:00
+								        if val is None:
 								            return [] if default is NO_DEFAULT else default
 								        return list(val) if casesense else [x.lower() for x in val]
-												Add `--extractor-args` to pass extractor-specific arguments

											
										
										
											2021-06-25 16:05:41 +02:00
-												[extractor] Add convenience function `_yes_playlist`

											
										
										
											2022-01-23 18:54:24 +01:00
+								    def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
 								        """Decide whether the playlist (True) or just the video (False) should be downloaded
 								        @param playlist_id      Playlist ID; pass True to omit it from the screen message
 								        @param video_id         Video ID; pass True to omit it from the screen message
 								        @param smuggled_data    Optional dict; a non-None 'force_noplaylist' value decides the result
 								        @param playlist_label   Word used for the playlist in the screen message
 								        @param video_label      Word used for the video in the screen message
 								        """
 								        # With either ID missing there is no choice to make: the result only depends on video_id
 								        if not playlist_id or not video_id:
 								            return not video_id
 								        # A forced value from the smuggled data takes precedence over the 'noplaylist' param
 								        no_playlist = (smuggled_data or {}).get('force_noplaylist')
 								        if no_playlist is not None:
 								            return not no_playlist
 								        # Turn the IDs into message suffixes; True stands for "present, but do not print it"
 								        video_id = '' if video_id is True else f' {video_id}'
 								        playlist_id = '' if playlist_id is True else f' {playlist_id}'
 								        if self.get_param('noplaylist'):
 								            self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
 								            return False
 								        self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
 								        return True
-												Standardize retry mechanism (#1649)

* [utils] Create `RetryManager`
* Migrate all retries to use the manager
* [extractor] Add wrapper methods for convenience
* Standardize console messages for retries
* Add `--retry-sleep` for extractors
											
										
										
											2022-08-01 22:13:18 +02:00
+								    def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
-												[extractor] Fix `fatal=False` in `RetryManager`

											
										
										
											2022-09-21 22:23:37 +02:00
 								        # Forward err to RetryManager.report_retry using this extractor's output methods.
 								        # `RetryManager` here is the module-level name: class attributes (including the
 								        # method of the same name) are not in scope inside a method body.
 								        # A falsy _count is replaced by int(fatal), i.e. 1 when fatal and 0 otherwise.
 								        # error= is None when fatal and self.report_warning otherwise; the sleep function
 								        # is the 'extractor' entry of the 'retry_sleep_functions' param, if any.
+								        RetryManager.report_retry(
 								            err, _count or int(fatal), _retries,
 								            info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
 								            sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
-												Standardize retry mechanism (#1649)

* [utils] Create `RetryManager`
* Migrate all retries to use the manager
* [extractor] Add wrapper methods for convenience
* Standardize console messages for retries
* Add `--retry-sleep` for extractors
											
										
										
											2022-08-01 22:13:18 +02:00
 								    def RetryManager(self, **kwargs):
 								        """Build a retry manager that reports through this extractor's `_error_or_warning`.
 								        The retry count comes from the 'extractor_retries' param (3 if unset);
 								        any keyword arguments are passed through unchanged.
 								        """
 								        # The name below is the imported module-level RetryManager, not this method:
 								        # class attributes are not part of a method's name scope
 								        retries = self.get_param('extractor_retries', 3)
 								        return RetryManager(retries, self._error_or_warning, **kwargs)
-												[extractor/generic] Separate embed extraction into own function (#5176)


											
										
										
											2022-10-09 12:39:36 +02:00
+								    def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
 								        display_id = traverse_obj(info_dict, 'display_id', 'id')
 								        self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
 								        return self._downloader.get_info_extractor('Generic')._extract_embeds(
 								            smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
+								    @classmethod
 								    def extract_from_webpage(cls, ydl, url, webpage):
 								        # Yields the info dicts produced by _extract_from_webpage for this webpage,
 								        # after ydl has added its default extra info to each of them.
 								        # If _extract_from_webpage is a bound method when looked up on the class
 								        # (i.e. a classmethod), call it on the class itself; otherwise it needs an
 								        # instance, which is obtained from ydl
 								        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
 								              else ydl.get_info_extractor(cls.ie_key()))
-												[extractor, test] Basic framework for embed tests (#4307)

and split download tests so they can be more easily run in CI

Authored by: coletdjnz

											
										
										
											2022-07-08 13:23:05 +02:00
 								        # A None result from _extract_from_webpage is treated as "no results"
+								        for info in ie._extract_from_webpage(url, webpage) or []:
 								            # url = None since we do not want to set (webpage/original)_url
 								            ydl.add_default_extra_info(info, ie, None)
 								            yield info
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
 								    @classmethod
 								    def _extract_from_webpage(cls, url, webpage):
 								        # Yields one url_result per embed URL reported by _extract_embed_urls
 								        # (a None return from it is treated as empty).
 								        # NOTE(review): orderedSet(..., lazy=True) presumably drops duplicate URLs
 								        # while keeping order and laziness — confirm against utils
 								        for embed_url in orderedSet(
 								                cls._extract_embed_urls(url, webpage) or [], lazy=True):
-												[cleanup] Misc

Closes #4710, Closes #4754, Closes #4723
Authored by: pukkandan, MrRawes, DavidH-2022

											
										
										
											2022-09-01 13:19:03 +02:00
 								            # The result names this class as its extractor, unless _VALID_URL is False,
 								            # in which case no extractor is specified
+								            yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
 								    @classmethod
 								    def _extract_embed_urls(cls, url, webpage):
 								        """@returns all the embed urls on the webpage"""
 								        # Compile _EMBED_REGEX once per class; checking cls.__dict__ (not getattr)
 								        # keeps a subclass from reusing the patterns compiled for a parent class
 								        if '_EMBED_URL_RE' not in cls.__dict__:
 								            assert isinstance(cls._EMBED_REGEX, (list, tuple))
 								            for idx, regex in enumerate(cls._EMBED_REGEX):
 								                assert regex.count('(?P<url>') == 1, \
 								                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
 								            cls._EMBED_URL_RE = tuple(re.compile(source) for source in cls._EMBED_REGEX)
 								        for pattern in cls._EMBED_URL_RE:
 								            for match in pattern.finditer(webpage):
 								                # Unescape HTML entities in the match and resolve it against the page URL
 								                candidate = urllib.parse.urljoin(url, unescapeHTML(match.group('url')))
 								                # When _VALID_URL is False every match is accepted; otherwise only
 								                # URLs this extractor considers suitable are yielded
 								                if cls._VALID_URL is not False and not cls.suitable(candidate):
 								                    continue
 								                yield candidate
 								    class StopExtraction(Exception):
 								        # Plain Exception subclass nested in the extractor class; it adds no behaviour.
 								        # NOTE(review): presumably raised to abort an extraction early — confirm against callers
 								        pass
-												[extractors] Use new framework for existing embeds (#4307)

`Brightcove` is difficult to migrate because its subclasses may depend
on the signature of the current functions. So it is left as-is for now

Note: Tests have not been migrated

											
										
										
											2022-08-01 03:23:25 +02:00
+								    @classmethod
 								    def _extract_url(cls, webpage):  # TODO: Remove
 								        """Only for compatibility with some older extractors"""
 								        return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
-												Allow plugin extractors to replace the built-in ones

This allows easier plugin chaining; e.g.
- https://gist.github.com/pukkandan/24f13ff1ed385c5a390c1d7bd130d8f7
- https://gist.github.com/pukkandan/fcf5ca1785c80f64e471f0ee14f990fb

											
										
										
											2022-09-16 13:07:38 +02:00
 								    # NOTE: Python treats __init_subclass__ as a classmethod implicitly, so the
 								    # explicit decorator below is redundant
+								    @classmethod
 								    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
 								        # Subclass hook: when a subclass is declared with plugin_name=..., it is
 								        # registered as a plugin override of the class it derives from
 								        if plugin_name:
 								            mro = inspect.getmro(cls)
 								            # The class right after cls in the MRO is the one being overridden;
 								            # it is also remembered on the plugin class as __wrapped__
 								            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
-												Improve handling for overriding extractors with plugins (#5916)

* Extractors replaced with plugin extractors now show in debug output
* Better testcase handling
* Added documentation
Authored by: coletdjnz, pukkandan
											
										
										
											2023-01-02 05:55:11 +01:00
 								            # The plugin reuses the overridden class's ie_key and derives its IE_NAME from it
+								            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
 								            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
-												Allow plugin extractors to replace the built-in ones

This allows easier plugin chaining; e.g.
- https://gist.github.com/pukkandan/24f13ff1ed385c5a390c1d7bd130d8f7
- https://gist.github.com/pukkandan/fcf5ca1785c80f64e471f0ee14f990fb

											
										
										
											2022-09-16 13:07:38 +02:00
 								            # Follow the __wrapped__ chain down to the innermost (non-plugin) class
+								            while getattr(super_class, '__wrapped__', None):
 								                super_class = super_class.__wrapped__
 								            # Rebind that class's name in its defining module to this plugin class
 								            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
-												Improve handling for overriding extractors with plugins (#5916)

* Extractors replaced with plugin extractors now show in debug output
* Better testcase handling
* Added documentation
Authored by: coletdjnz, pukkandan
											
										
										
											2023-01-02 05:55:11 +01:00
 								            # Record the override, keyed by the innermost overridden class
+								            _PLUGIN_OVERRIDES[super_class].append(cls)
-												Allow plugin extractors to replace the built-in ones

This allows easier plugin chaining; e.g.
- https://gist.github.com/pukkandan/24f13ff1ed385c5a390c1d7bd130d8f7
- https://gist.github.com/pukkandan/fcf5ca1785c80f64e471f0ee14f990fb

											
										
										
											2022-09-16 13:07:38 +02:00
 								        # plugin_name is consumed here; the remaining keyword arguments go up the chain
 								        return super().__init_subclass__(**kwargs)
-												Allow users to specify an age limit (fixes #1545)

With these changes, users can now restrict what videos are downloaded by the intended audience, by specifying their age with --age-limit YEARS.
Add rudimentary support in youtube, pornotube, and youporn.

											
										
										
											2013-10-06 06:06:30 +02:00
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								class SearchInfoExtractor(InfoExtractor):
 								    """
 								    Base class for paged search queries extractors.
-												[extractor/common] Consistent URL spelling

											
										
										
											2015-07-23 19:37:45 +02:00
+								    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
-												[cleanup] Add keyword automatically to SearchIE descriptions
and some minor cleanup of docs

											
										
										
											2021-10-23 16:29:52 +02:00
+								    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								    """
-												[cleanup] Add keyword automatically to SearchIE descriptions
and some minor cleanup of docs

											
										
										
											2021-10-23 16:29:52 +02:00
 								    # Upper bound on the number of results; infinite unless a subclass lowers it
+								    _MAX_RESULTS = float('inf')
-												[extractor] Add a way to distinguish IEs that returns only videos

											
										
										
											2022-11-13 06:26:04 +01:00
+								    _RETURN_TYPE = 'playlist'
-												[cleanup] Add keyword automatically to SearchIE descriptions
and some minor cleanup of docs

											
										
										
											2021-10-23 16:29:52 +02:00
-												[extractor] Framework for embed detection (#4307)

											
										
										
											2022-08-01 03:22:03 +02:00
 								    # Pattern built from the subclass's _SEARCH_KEY: <_SEARCH_KEY><prefix>:<query>,
 								    # where prefix is empty, "all", or a positive integer without leading zeros
+								    @classproperty
 								    def _VALID_URL(cls):
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-12 01:09:58 +02:00
+								        return rf'{cls._SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								    # The incoming `query` is the full search URL; it is split into the result-count
 								    # prefix and the actual query text, and `query` is rebound to the latter
 								    def _real_extract(self, query):
-												[lazy_extractors] Fix for search IEs
Closes #1851

											
										
										
											2021-12-01 18:51:19 +01:00
+								        prefix, query = self._match_valid_url(query).group('prefix', 'query')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								        # An empty prefix requests a single result; "all" requests up to _MAX_RESULTS
+								        if prefix == '':
 								            return self._get_n_results(query, 1)
 								        elif prefix == 'all':
 								            return self._get_n_results(query, self._MAX_RESULTS)
 								        else:
 								            n = int(prefix)
 								            # NOTE(review): the _VALID_URL defined in this class only matches numeric
 								            # prefixes of the form [1-9][0-9]*, so n <= 0 cannot occur unless a
 								            # subclass overrides _VALID_URL
 								            if n <= 0:
-												[cleanup] Upgrade syntax

Using https://github.com/asottile/pyupgrade

1. `__future__` imports and `coding: utf-8` were removed
2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format`
3. f-strings were cherry-picked from `pyupgrade --py36-plus`

Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts

											
										
										
											2022-04-11 17:10:28 +02:00
+								                raise ExtractorError(f'invalid download number {n} for query "{query}"')
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
 								            # Requests above _MAX_RESULTS are clamped to it, with a warning
+								            elif n > self._MAX_RESULTS:
-												Fix inconsistent use of `report_warning`

											
										
										
											2021-04-16 12:01:10 +02:00
+								                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
-												Fix generic class move (add all files)

											
										
										
											2013-06-23 19:57:38 +02:00
+								                n = self._MAX_RESULTS
 								            return self._get_n_results(query, n)
 								    def _get_n_results(self, query, n):
-												[extractor] Simplify search extractors

											
										
										
											2021-10-08 22:39:55 +02:00
+								        """Get a specified number of results for a query.
 								        Either this function or _search_results must be overridden by subclasses """
 								        # Take at most n entries from _search_results; an infinite n means no upper
 								        # bound (islice stop=None). The query serves as both arguments after the entries
 								        return self.playlist_result(
 								            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
 								            query, query)
 								    def _search_results(self, query):
 								        """Returns an iterator of search results"""
-												[refactor] Single quotes consistency

											
										
										
											2016-02-14 10:37:17 +01:00
+								        raise NotImplementedError('This method must be implemented by subclasses')
-												Add --list-extractor-descriptions (human-readable list of IEs)

											
										
										
											2013-07-01 18:52:19 +02:00
-												[extractor] Use classmethod/property where possible

and refactor lazy extractors accordingly.

This reduces the need to create extractor instances

											
										
										
											2022-05-11 17:54:44 +02:00
 								    # Public read-only accessor for the subclass-defined _SEARCH_KEY
+								    @classproperty
 								    def SEARCH_KEY(cls):
 								        return cls._SEARCH_KEY
-												Add option `--use-extractors`

Deprecates `--force-generic-extractor`

Closes #3234, Closes #2044

Related: #4307, #1791

											
										
										
											2022-08-24 02:12:16 +02:00
 								class UnsupportedURLIE(InfoExtractor):
 								    # Extractor whose pattern matches any URL and whose extraction always fails
 								    # with UnsupportedError
 								    _VALID_URL = '.*'
 								    # NOTE(review): _ENABLED = False and IE_DESC = False presumably keep this
 								    # extractor out of the default selection and out of listings — confirm
 								    _ENABLED = False
 								    IE_DESC = False
 								    def _real_extract(self, url):
 								        # Never extracts anything: every URL is reported as unsupported
 								        raise UnsupportedError(url)
-												Improve handling for overriding extractors with plugins (#5916)

* Extractors replaced with plugin extractors now show in debug output
* Better testcase handling
* Added documentation
Authored by: coletdjnz, pukkandan
											
										
										
											2023-01-02 05:55:11 +01:00
 								# Filled by the __init_subclass__ hook defined earlier in this file: maps an
 								# overridden extractor class to the list of plugin classes replacing it.
 								# Being a defaultdict(list), an unseen key starts out as an empty list.
 								_PLUGIN_OVERRIDES = collections.defaultdict(list)