2021-06-03 11:43:42 +02:00
#!/usr/bin/env python3
2016-10-02 13:39:18 +02:00
# coding: utf-8
2012-03-25 03:07:37 +02:00
2014-11-02 11:37:49 +01:00
from __future__ import unicode_literals
2015-07-22 14:03:05 +02:00
import base64
2016-02-16 23:01:44 +01:00
import binascii
2014-03-24 01:40:09 +01:00
import calendar
2014-04-04 23:00:51 +02:00
import codecs
2020-05-04 23:19:33 +02:00
import collections
2014-02-25 01:43:17 +01:00
import contextlib
2013-12-16 05:04:12 +01:00
import ctypes
2013-08-28 12:57:10 +02:00
import datetime
import email . utils
2017-05-01 17:09:18 +02:00
import email . header
2013-05-13 09:20:08 +02:00
import errno
2015-01-10 19:55:36 +01:00
import functools
2012-03-25 03:07:37 +02:00
import gzip
2021-09-23 19:40:51 +02:00
import hashlib
import hmac
2021-10-18 03:46:49 +02:00
import importlib . util
2012-11-28 00:09:17 +01:00
import io
2016-05-02 05:21:39 +02:00
import itertools
2012-12-20 13:13:24 +01:00
import json
2012-03-25 03:07:37 +02:00
import locale
2013-11-25 03:12:26 +01:00
import math
2015-02-10 03:32:21 +01:00
import operator
2012-03-25 03:07:37 +02:00
import os
2013-08-28 12:57:10 +02:00
import platform
2017-02-04 12:49:58 +01:00
import random
2012-03-25 03:07:37 +02:00
import re
2013-08-28 12:57:10 +02:00
import socket
2016-05-02 05:21:39 +02:00
import ssl
2013-12-09 18:29:07 +01:00
import subprocess
2012-03-25 03:07:37 +02:00
import sys
2014-08-21 13:01:13 +02:00
import tempfile
2020-05-04 23:19:33 +02:00
import time
2013-01-03 15:39:55 +01:00
import traceback
2014-03-10 17:31:32 +01:00
import xml . etree . ElementTree
2012-03-25 03:07:37 +02:00
import zlib
2021-12-09 12:40:52 +01:00
import mimetypes
2012-03-25 03:07:37 +02:00
2014-11-02 11:23:40 +01:00
from . compat import (
2017-06-11 20:52:24 +02:00
compat_HTMLParseError ,
2016-01-02 20:49:59 +01:00
compat_HTMLParser ,
2021-04-17 05:02:33 +02:00
compat_HTTPError ,
2015-02-01 11:30:56 +01:00
compat_basestring ,
2014-11-02 11:23:40 +01:00
compat_chr ,
2018-12-09 00:00:32 +01:00
compat_cookiejar ,
2014-12-12 04:01:08 +01:00
compat_ctypes_WINFUNCTYPE ,
2015-10-25 20:04:55 +01:00
compat_etree_fromstring ,
2017-03-25 20:30:10 +01:00
compat_expanduser ,
2014-11-02 11:23:40 +01:00
compat_html_entities ,
2016-06-10 09:11:55 +02:00
compat_html_entities_html5 ,
2015-01-10 19:55:36 +01:00
compat_http_client ,
2019-12-15 17:15:24 +01:00
compat_integer_types ,
2021-02-02 22:15:00 +01:00
compat_numeric_types ,
2015-04-27 16:00:18 +02:00
compat_kwargs ,
2016-09-29 18:28:32 +02:00
compat_os_name ,
2014-11-02 11:23:40 +01:00
compat_parse_qs ,
2016-05-10 09:58:25 +02:00
compat_shlex_quote ,
2014-11-02 11:23:40 +01:00
compat_str ,
2016-05-03 10:50:16 +02:00
compat_struct_pack ,
2016-08-06 20:42:58 +02:00
compat_struct_unpack ,
2014-11-02 11:23:40 +01:00
compat_urllib_error ,
compat_urllib_parse ,
2016-03-25 20:46:57 +01:00
compat_urllib_parse_urlencode ,
2014-11-02 11:23:40 +01:00
compat_urllib_parse_urlparse ,
2020-10-27 11:37:21 +01:00
compat_urllib_parse_urlunparse ,
compat_urllib_parse_quote ,
compat_urllib_parse_quote_plus ,
2016-05-12 12:57:53 +02:00
compat_urllib_parse_unquote_plus ,
2014-11-02 11:23:40 +01:00
compat_urllib_request ,
compat_urlparse ,
2016-03-17 21:52:23 +01:00
compat_xpath ,
2014-11-02 11:23:40 +01:00
)
2014-09-30 17:27:53 +02:00
2016-04-23 15:30:06 +02:00
from . socks import (
ProxyType ,
sockssocket ,
)
2014-09-30 17:27:53 +02:00
2016-05-03 09:15:32 +02:00
def register_socks_protocols():
    """Register the SOCKS proxy schemes with urlparse.

    Appends 'socks', 'socks4', 'socks4a' and 'socks5' to
    compat_urlparse.uses_netloc unless they are already listed, so calling
    this more than once does not create duplicates.
    """
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    netloc_schemes = compat_urlparse.uses_netloc
    missing = [
        scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
        if scheme not in netloc_schemes]
    netloc_schemes.extend(missing)
2013-06-06 14:35:08 +02:00
# The class of compiled pattern objects is not exposed under a stable public
# name on every supported Python, so take it from a throwaway pattern.
compiled_regex_type = re.compile('').__class__
2019-06-28 19:32:43 +02:00
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent string with a random version.

    The version number is drawn with random.choice() from a fixed list of
    Chrome builds (90.x to 97.x) and substituted into a constant template.
    """
    ua_template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    chrome_builds = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    picked_build = random.choice(chrome_builds)
    return ua_template % picked_build
2012-11-28 00:02:55 +01:00
# Header-name -> value mapping (User-Agent, Accept, Accept-Encoding,
# Accept-Language).
# The User-Agent value is computed once, when this module-level statement
# runs, by calling random_user_agent(); it is not re-drawn on later lookups.
# NOTE(review): presumably the default headers for outgoing HTTP requests -
# the code that consumes this dict is not visible here; confirm.
std_headers = {
2019-06-28 19:32:43 +02:00
' User-Agent ' : random_user_agent ( ) ,
2012-11-28 02:04:46 +01:00
' Accept ' : ' text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 ' ,
' Accept-Encoding ' : ' gzip, deflate ' ,
' Accept-Language ' : ' en-us,en;q=0.5 ' ,
2012-11-28 00:02:55 +01:00
}
2012-12-30 18:22:36 +01:00
2014-11-23 20:41:03 +01:00
2016-12-11 18:49:07 +01:00
# Fixed User-Agent strings keyed by browser name (only 'Safari' is defined).
USER_AGENTS = {
' Safari ' : ' Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27 ' ,
}
2015-06-28 18:56:07 +02:00
# Unique sentinel object. The xpath_* helpers test `default is not NO_DEFAULT`
# to tell "no default supplied" apart from an explicit default such as None.
NO_DEFAULT = object ( )
2015-02-13 08:14:23 +01:00
# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
' January ' , ' February ' , ' March ' , ' April ' , ' May ' , ' June ' ,
' July ' , ' August ' , ' September ' , ' October ' , ' November ' , ' December ' ]
2016-09-14 18:13:55 +02:00
# Month names per language code ('en', 'fr'), each list in calendar order.
# The 'en' entry is the ENGLISH_MONTH_NAMES list itself, not a copy.
MONTH_NAMES = {
' en ' : ENGLISH_MONTH_NAMES ,
' fr ' : [
2016-09-14 18:57:01 +02:00
' janvier ' , ' février ' , ' mars ' , ' avril ' , ' mai ' , ' juin ' ,
' juillet ' , ' août ' , ' septembre ' , ' octobre ' , ' novembre ' , ' décembre ' ] ,
2016-09-14 18:13:55 +02:00
}
2016-09-02 18:31:52 +02:00
2016-01-03 20:08:34 +01:00
# File extensions, written without the leading dot, grouped one container
# family per line (MP4-like, Flash, WebM/Ogg, Matroska, ...); the last line
# holds fragment/manifest formats (f4f, f4m, m3u8, smil).
# NOTE(review): how this tuple is consumed is not visible in this part of
# the file - confirm before relying on the ordering.
KNOWN_EXTENSIONS = (
' mp4 ' , ' m4a ' , ' m4p ' , ' m4b ' , ' m4r ' , ' m4v ' , ' aac ' ,
' flv ' , ' f4v ' , ' f4a ' , ' f4b ' ,
' webm ' , ' ogg ' , ' ogv ' , ' oga ' , ' ogx ' , ' spx ' , ' opus ' ,
' mkv ' , ' mka ' , ' mk3d ' ,
' avi ' , ' divx ' ,
' mov ' ,
' asf ' , ' wmv ' , ' wma ' ,
' 3gp ' , ' 3g2 ' ,
' mp3 ' ,
' flac ' ,
' ape ' ,
' wav ' ,
' f4f ' , ' f4m ' , ' m3u8 ' , ' smil ' )
2016-05-03 02:40:30 +02:00
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. A replacement may be
# longer than one character ('AE', 'OE', 'TH', 'ss', ...), which is why the
# values come from itertools.chain over plain strings (one replacement per
# character) and one-element lists (one multi-character replacement).
# zip() pairs the two sequences positionally, so the character string and the
# replacement chain must stay in the same order and have the same length.
2016-06-02 11:51:48 +02:00
ACCENT_CHARS = dict ( zip ( ' ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ ' ,
2019-05-10 20:42:32 +02:00
itertools . chain ( ' AAAAAA ' , [ ' AE ' ] , ' CEEEEIIIIDNOOOOOOO ' , [ ' OE ' ] , ' UUUUUY ' , [ ' TH ' , ' ss ' ] ,
' aaaaaa ' , [ ' ae ' ] , ' ceeeeiiiionooooooo ' , [ ' oe ' ] , ' uuuuuy ' , [ ' th ' ] , ' y ' ) ) )
2016-05-03 02:40:30 +02:00
2016-06-25 17:30:35 +02:00
# Date/time layouts written with strptime-style directives (%d, %b, %B, %Y,
# %H, %M, %S, %f, ...).
# This tuple is the shared base: DATE_FORMATS_DAY_FIRST and
# DATE_FORMATS_MONTH_FIRST below are list copies of it with extra layouts
# appended.
# NOTE(review): presumably these are tried one after another when parsing a
# date string - the parsing code is not visible here; confirm.
DATE_FORMATS = (
' %d % B % Y ' ,
' %d % b % Y ' ,
' % B %d % Y ' ,
2017-01-12 16:39:45 +01:00
' % B %d st % Y ' ,
' % B %d nd % Y ' ,
2019-11-26 18:08:37 +01:00
' % B %d rd % Y ' ,
2017-01-12 16:39:45 +01:00
' % B %d th % Y ' ,
2016-06-25 17:30:35 +02:00
' % b %d % Y ' ,
2017-01-12 16:39:45 +01:00
' % b %d st % Y ' ,
' % b %d nd % Y ' ,
2019-11-26 18:08:37 +01:00
' % b %d rd % Y ' ,
2017-01-12 16:39:45 +01:00
' % b %d th % Y ' ,
2016-06-25 17:30:35 +02:00
' % b %d st % Y % I: % M ' ,
' % b %d nd % Y % I: % M ' ,
2019-11-26 18:08:37 +01:00
' % b %d rd % Y % I: % M ' ,
2016-06-25 17:30:35 +02:00
' % b %d th % Y % I: % M ' ,
' % Y % m %d ' ,
' % Y- % m- %d ' ,
2021-09-06 08:52:38 +02:00
' % Y. % m. %d . ' ,
2016-06-25 17:30:35 +02:00
' % Y/ % m/ %d ' ,
2016-08-10 05:36:49 +02:00
' % Y/ % m/ %d % H: % M ' ,
2016-06-25 17:30:35 +02:00
' % Y/ % m/ %d % H: % M: % S ' ,
2021-08-25 06:48:27 +02:00
' % Y % m %d % H % M ' ,
' % Y % m %d % H % M % S ' ,
2017-01-23 16:31:43 +01:00
' % Y- % m- %d % H: % M ' ,
2016-06-25 17:30:35 +02:00
' % Y- % m- %d % H: % M: % S ' ,
' % Y- % m- %d % H: % M: % S. %f ' ,
2021-05-20 15:05:37 +02:00
' % Y- % m- %d % H: % M: % S: %f ' ,
2016-06-25 17:30:35 +02:00
' %d . % m. % Y % H: % M ' ,
' %d . % m. % Y % H. % M ' ,
' % Y- % m- %d T % H: % M: % SZ ' ,
' % Y- % m- %d T % H: % M: % S. %f Z ' ,
' % Y- % m- %d T % H: % M: % S. %f 0Z ' ,
' % Y- % m- %d T % H: % M: % S ' ,
' % Y- % m- %d T % H: % M: % S. %f ' ,
' % Y- % m- %d T % H: % M ' ,
2016-09-29 18:47:25 +02:00
' % b %d % Y at % H: % M ' ,
' % b %d % Y at % H: % M: % S ' ,
2017-12-16 15:56:16 +01:00
' % B %d % Y at % H: % M ' ,
' % B %d % Y at % H: % M: % S ' ,
2021-09-19 14:18:22 +02:00
' % H: % M %d - % b- % Y ' ,
2016-06-25 17:30:35 +02:00
)
# List copy of DATE_FORMATS followed by numeric layouts that put the day
# before the month (%d then %m), with 4-digit (%Y) and 2-digit (%y) years.
DATE_FORMATS_DAY_FIRST = list ( DATE_FORMATS )
DATE_FORMATS_DAY_FIRST . extend ( [
' %d - % m- % Y ' ,
' %d . % m. % Y ' ,
' %d . % m. % y ' ,
' %d / % m/ % Y ' ,
' %d / % m/ % y ' ,
' %d / % m/ % Y % H: % M: % S ' ,
] )
# List copy of DATE_FORMATS followed by numeric layouts that put the month
# before the day (%m then %d).
DATE_FORMATS_MONTH_FIRST = list ( DATE_FORMATS )
DATE_FORMATS_MONTH_FIRST . extend ( [
' % m- %d - % Y ' ,
' % m. %d . % Y ' ,
' % m/ %d / % Y ' ,
' % m/ %d / % y ' ,
' % m/ %d / % Y % H: % M: % S ' ,
] )
2016-10-19 18:28:49 +02:00
# Pattern with four capture groups: a quoted payload, two integers, and a
# quoted string that is immediately followed by .split('|').
# NOTE(review): looks like the argument list of a packed ("p,a,c,k,e,d")
# JavaScript blob - confirm against the code that uses this pattern.
PACKED_CODES_RE = r " } \ ( ' (.+) ' ,( \ d+),( \ d+), ' ([^ ' ]+) ' \ .split \ ( ' \ | ' \ ) "
2018-10-10 23:47:21 +02:00
# Case-insensitive, dot-matches-newline pattern for <script> elements whose
# type attribute is application/ld+json (optionally quoted); the script body
# is captured non-greedily in the named group `json_ld`.
JSON_LD_RE = r ' (?is)<script[^>]+type=([ " \' ]?)application/ld \ +json \ 1[^>]*>(?P<json_ld>.+?)</script> '
2016-10-19 18:28:49 +02:00
2015-02-13 08:14:23 +01:00
2012-03-25 03:07:37 +02:00
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding(), or
    'UTF-8' when that call fails or reports an encoding that cannot
    actually encode text.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here.
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
2012-03-25 03:07:37 +02:00
2012-12-20 13:13:24 +01:00
2014-08-21 13:01:13 +02:00
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn
    (prefix "<basename>.", suffix ".tmp") and then renamed over fn. If
    anything fails, the temporary file is removed and the exception is
    re-raised.
    """
    fn = encodeFilename(fn)
    is_py2 = sys.version_info < (3, 0)
    on_windows = sys.platform == 'win32'

    if is_py2 and not on_windows:
        encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename, path_dirname = os.path.basename, os.path.dirname

    tmp_options = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if is_py2:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf)
        if on_windows:
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            # os.umask() can only be read by setting it, so set it to 0 and
            # restore it straight away; then give the temp file the
            # permissions a normally created file would get.
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element under node that matches xpath and carries
        the attribute key (with value val when val is given), or None.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Fallback without attribute predicates: filter the matches by hand.
        for candidate in node.findall(compat_xpath(xpath)):
            if key in candidate.attrib and (val is None or candidate.attrib.get(key) == val):
                return candidate
        return None
2013-10-12 21:34:04 +02:00
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
2014-11-23 20:41:03 +01:00
2013-10-12 21:34:04 +02:00
def xpath_with_ns(path, ns_map):
    """Expand "prefix:tag" steps of path into "{namespace-uri}tag" form.

    Steps without a colon are kept as they are; prefixes are looked up in
    ns_map (KeyError if a prefix is missing).
    """
    def _qualify(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([_qualify(step) for step in path.split('/')])
2012-03-25 03:07:37 +02:00
2015-09-04 19:56:45 +02:00
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node that matches xpath.

    xpath is either a single XPath string or an iterable of XPath strings,
    which are tried in order until one matches.

    When nothing matches:
      * default is returned if one was supplied (anything but NO_DEFAULT);
      * otherwise ExtractorError is raised if fatal is true;
      * otherwise None is returned.
    name replaces the xpath in the error message.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Start from "not found" so that an empty iterable falls through to
        # the default/fatal handling below instead of leaving n unbound.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    Whatever xpath_element() yields for "not found" (None or the default)
    is passed straight through. If the element exists but has no text,
    default is returned when supplied, ExtractorError is raised when fatal
    is true, and None is returned otherwise.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % label)
    return None
2015-09-04 19:56:45 +02:00
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists, default is returned if supplied,
    ExtractorError is raised if fatal is true, and None is returned
    otherwise. name replaces "xpath[@key]" in the error message.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % label)
    return None
2014-09-13 09:09:55 +02:00
2012-04-11 00:22:51 +02:00
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
2012-12-19 15:21:14 +01:00
2014-11-04 23:20:39 +01:00
2016-07-06 14:02:52 +02:00
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute matches value, or None"""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may list several names, so match class_name as a
    # whole word anywhere inside the attribute value. The pattern is already
    # a regex, hence escape_value=False.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
2017-02-11 10:16:54 +01:00
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of every tag with the specified attribute in the passed HTML document, as a list

    value is matched literally unless escape_value is false, in which case
    it is inserted into the pattern as a regular expression. Each content
    string is passed through unescapeHTML().
    """
    value_re = re.escape(value) if escape_value else value

    tag_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value_re)

    contents = []
    for match in re.finditer(tag_re, html):
        content = match.group('content')

        # Content that begins with a quote loses its first and last character
        if content.startswith(('"', "'")):
            content = content[1:-1]

        contents.append(unescapeHTML(content))

    return contents
2013-09-13 22:05:29 +02:00
2016-03-16 16:50:04 +01:00
2016-01-02 20:49:59 +01:00
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding, attrs holds the attributes of the last start tag seen
    (an empty dict if there was none).
    """

    def __init__(self):
        self.attrs = {}
        super(HTMLAttributeParser, self).__init__()

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; a repeated name keeps its last value
        self.attrs = {attr_name: attr_value for attr_name, attr_value in attrs}
2016-03-16 16:50:04 +01:00
2021-11-05 17:54:56 +01:00
class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list

    items collects one attribute dict per <li> start tag seen at nesting
    level 0; _level counts start tags minus end tags.
    """

    def __init__(self):
        super(HTMLListAttrsParser, self).__init__()
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        at_top_level = self._level == 0
        if at_top_level and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
2016-01-02 20:49:59 +01:00
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    try:
        attr_parser.feed(html_element)
        attr_parser.close()
    except compat_HTMLParseError:
        # Older Python may throw HTMLParseError in case of malformed HTML;
        # whatever was gathered before the error is still returned.
        pass
    return attr_parser.attrs
2012-04-11 00:22:51 +02:00
2016-03-16 16:50:04 +01:00
2021-11-05 17:54:56 +01:00
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one attribute dictionary per <li>"""
    li_parser = HTMLListAttrsParser()
    li_parser.feed(webpage)
    li_parser.close()
    return li_parser.items
2012-04-11 00:22:51 +02:00
def clean_html(html):
    """Clean an HTML snippet into a readable string

    None is passed through unchanged.
    """
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />: source newlines become spaces, while <br> tags and
    # </p><p> boundaries become real newlines
    text = html.replace('\n', ' ')
    line_break_patterns = (
        r'(?u)\s*<\s*br\s*/?\s*>\s*',
        r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>',
    )
    for pattern in line_break_patterns:
        text = re.sub(pattern, '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    text = unescapeHTML(text)
    return text.strip()
2012-04-11 00:22:51 +02:00
2012-03-25 03:07:37 +02:00
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the name returned by sanitize_path(); if that name is no different,
    or the error was a permission error, the original exception is
    re-raised, like the standard open() function would.

    The filename '-' selects standard output (its binary buffer when there
    is one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            stdout_stream = getattr(sys.stdout, 'buffer', sys.stdout)
            return (stdout_stream, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise

        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
2012-03-25 03:07:37 +02:00
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
2012-11-26 23:58:46 +01:00
2014-11-23 20:41:03 +01:00
2012-12-03 15:36:24 +01:00
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible: the clean-up of underscores, leading dashes and
    leading dots is then skipped.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '\n' and not restricted:
            return ' '
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    if s == '':
        return ''

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
2012-03-25 03:07:37 +02:00
2014-11-23 20:41:03 +01:00
2021-02-17 20:09:38 +01:00
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On Windows the drive/UNC prefix is kept as is and every other path
    component has the characters / < > : " | \\ ? * replaced by '#', as
    well as a trailing whitespace character or dot. Components that are
    exactly '.' or '..' are left alone.

    On other platforms s is returned unchanged unless force is true, in
    which case the same sanitizing is applied and a leading path separator
    is preserved.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # With the prefix removed the rest starts with a separator, so the
        # first split component is dropped
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s.startswith(os.path.sep):
        # startswith() rather than s[0]: an empty path must not raise IndexError
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
2016-03-26 14:33:57 +01:00
def sanitize_url(url):
    """Repair a few common URL defects; any other URL is returned unchanged.

    * Scheme-relative URLs ("//host/...") get "http:" prepended.
    * Known scheme typos ("httpss://", "rmtp://", "rmtpe://", "rmtps://")
      are corrected.
    """
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url

    # Fix some common typos seen so far
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for typo_re, replacement in typo_fixes:
        if re.match(typo_re, url) is not None:
            return re.sub(typo_re, replacement, url)
    return url
2016-03-26 14:33:57 +01:00
2021-04-19 14:07:45 +02:00
def extract_basic_auth(url):
    """Split the user:password part out of a URL.

    Returns (url, None) when the URL carries no username. Otherwise returns
    the URL rebuilt with only host (and port) in its netloc, together with
    the matching "Basic <base64>" Authorization header value; a missing
    password is treated as empty.
    """
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None

    if parts.port is None:
        bare_netloc = parts.hostname
    else:
        bare_netloc = '%s:%d' % (parts.hostname, parts.port)
    stripped_url = compat_urlparse.urlunsplit(parts._replace(netloc=bare_netloc))

    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode('utf-8'))
    return stripped_url, 'Basic ' + token.decode('utf-8')
2015-11-20 15:33:49 +01:00
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request from a cleaned-up URL.

    The URL goes through sanitize_url(), escape_url() and
    extract_basic_auth(). If it carried credentials, the resulting
    Authorization value is stored in the headers mapping: the second extra
    positional argument when there is one, otherwise kwargs['headers']
    (created if missing).
    """
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
2015-11-20 15:33:49 +01:00
2017-03-25 20:30:10 +01:00
def expand_path(s):
    """Expand shell variables and ~"""
    # ~ is expanded first, then environment variables in the result
    home_expanded = compat_expanduser(s)
    return os.path.expandvars(home_expanded)
2012-03-25 03:07:37 +02:00
def orderedSet(iterable):
    """Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element in its
    original order. Elements are compared with ==, so unhashable elements
    are fine.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
2012-03-25 03:07:37 +02:00
2014-03-24 01:40:09 +01:00
2016-06-10 09:11:55 +02:00
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    entity_with_semicolon is the entity without its leading '&' but with
    the trailing ';' (e.g. 'amp;' or '#x41;'). Entities that cannot be
    resolved are returned in their literal '&...;' form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    named_codepoints = compat_html_entities.name2codepoint
    if entity in named_codepoints:
        return compat_chr(named_codepoints[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacute;ric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        digits = numeric.group(1)
        if digits.startswith('x'):
            # 'x41' -> '0x41', which int() accepts with base 16
            base, digits = 16, '0%s' % digits
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(digits, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
2014-08-27 19:11:45 +02:00
2012-03-25 03:07:37 +02:00
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for.

    None is passed through; any other argument must be exactly compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(match):
        # Group 1 is the entity without '&' but including the ';'
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', _decode_entity, s)
2012-03-25 03:07:37 +02:00
2014-01-05 03:07:55 +01:00
2021-05-23 18:34:49 +02:00
def escapeHTML(text):
    """Escape the characters & < > " ' in text as HTML entities.

    '&' is replaced first so that the ampersands introduced by the other
    replacements are not escaped a second time.
    """
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
2021-01-09 13:26:12 +01:00
def process_communicate_or_kill(p, *args, **kwargs):
    """Run p.communicate(*args, **kwargs) and return its result.

    If communicate() is interrupted by any exception (KeyboardInterrupt
    included), the process is killed and waited for before the exception
    continues to propagate.
    """
    completed = False
    try:
        result = p.communicate(*args, **kwargs)
        completed = True
        return result
    finally:
        if not completed:
            p.kill()
            p.wait()
2021-10-20 18:19:40 +02:00
class Popen(subprocess.Popen):
    """subprocess.Popen that always passes a class-level startupinfo.

    On Windows the shared STARTUPINFO has STARTF_USESHOWWINDOW set; on every
    other platform startupinfo is None.
    NOTE(review): presumably this keeps child processes from opening a
    console window on Windows - confirm.
    """
    _startupinfo = None
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

    def __init__(self, *args, **kwargs):
        super().__init__(*args, startupinfo=self._startupinfo, **kwargs)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill the process if that call is interrupted"""
        return process_communicate_or_kill(self, *args, **kwargs)
2015-04-26 00:29:41 +02:00
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments and filenames.

    On Windows (major version 5 and up) this is preferredencoding();
    everywhere else it is the filesystem encoding, with 'utf-8' as fallback
    when none is reported.
    """
    use_locale_encoding = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if use_locale_encoding:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
2014-01-05 03:07:55 +01:00
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged wherever the platform takes Unicode filenames
    (Python 3, Windows 2000 and up unless for_subprocess is set, Jython);
    otherwise returns s encoded with get_subprocess_encoding(), dropping
    characters that cannot be encoded.
    """
    assert type(s) == compat_str

    unicode_api_available = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0)
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        or (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        or sys.platform.startswith('java')
    )
    if unicode_api_available:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename().

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged; otherwise it is decoded with get_subprocess_encoding(),
    dropping undecodable bytes. for_subprocess is accepted but not used.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
2014-01-05 03:07:55 +01:00
2014-05-16 15:47:54 +02:00
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., True).

    Values that are not compat_str are first decoded as ASCII.
    """
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
2015-04-26 00:29:41 +02:00
def decodeArgument(b):
    """Decode a command-line argument; shorthand for decodeFilename(b, True)."""
    decoded = decodeFilename(b, True)
    return decoded
2013-02-21 17:09:39 +01:00
def decodeOption(optval):
    """Return *optval* as text.

    None is passed through untouched; bytes are decoded with
    preferredencoding(). Any other non-text value trips the assertion.
    """
    if optval is None:
        return optval

    value = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(value, compat_str)
    return value
2013-01-01 20:27:53 +01:00
2014-11-23 20:41:03 +01:00
2021-10-19 19:28:14 +02:00
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into an (hours, minutes, seconds, milliseconds) tuple."""
    whole_seconds, millis = divmod(msec, 1000)
    whole_minutes, seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return _timetuple(hours=hours, minutes=minutes, seconds=seconds, milliseconds=millis)
2021-05-23 18:34:49 +02:00
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration given in seconds as ``H<delim>MM<delim>SS``.

    Leading zero-valued units are dropped (``M<delim>SS``, or just ``S``).
    When *msec* is true, ``.mmm`` (zero-padded milliseconds) is appended.
    """
    parts = timetuple_from_msec(secs * 1000)
    if parts.hours:
        text = '%d%s%02d%s%02d' % (parts.hours, delim, parts.minutes, delim, parts.seconds)
    elif parts.minutes:
        text = '%d%s%02d' % (parts.minutes, delim, parts.seconds)
    else:
        text = '%d' % parts.seconds
    if not msec:
        return text
    return '%s.%03d' % (text, parts.milliseconds)
2013-05-04 12:02:18 +02:00
2013-12-29 15:28:32 +01:00
2021-09-28 23:37:23 +02:00
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load the usable certificates of one Windows certificate store into *ssl_context*.

    Only 'x509_asn' entries that are trusted for everything or for server
    authentication are considered. Certificates that fail to load are
    skipped individually; a PermissionError while enumerating the store
    aborts silently.
    """
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    def is_usable(encoding, trust):
        return encoding == 'x509_asn' and (
            trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)

    try:
        usable_certs = [
            cert for cert, encoding, trust in ssl.enum_certificates(storename)
            if is_usable(encoding, trust)]
    except PermissionError:
        return

    for cert in usable_certs:
        # A single bad certificate must not prevent loading the remaining ones
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
2021-09-28 23:37:23 +02:00
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSL context configured from *params*.

    Certificate and hostname verification are enabled unless
    params['nocheckcertificate'] is truthy. Extra keyword arguments are
    passed through to the handler.
    """
    verify = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = verify
    context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE
    if not verify:
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    try:
        context.load_default_certs()
    # Work around the issue in load_default_certs when there are bad certificates. See:
    # https://github.com/yt-dlp/yt-dlp/issues/1060,
    # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
    except ssl.SSLError:
        # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
        can_enumerate_stores = sys.platform == 'win32' and hasattr(ssl, 'enum_certificates')
        if can_enumerate_stores:
            # Create a new context to discard any certificates that were already loaded
            context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
            context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
            for storename in ('CA', 'ROOT'):
                _ssl_load_windows_store_certs(context, storename)
        context.set_default_verify_paths()

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
2013-05-04 12:19:02 +02:00
2014-11-20 12:14:28 +01:00
2021-04-22 21:16:29 +02:00
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text.

    *before* is the text that precedes the message. It is right-stripped
    and joined to the message with a single space. When it is empty or
    ends a sentence ('.', '!' or '?'), the message starts with a capital.
    """
    update_cmd = (
        'type yt-dlp -U to update' if ytdl_is_updateable()
        else 'see https://github.com/yt-dlp/yt-dlp on how to update')

    msg = ''.join((
        'please report this issue on https://github.com/yt-dlp/yt-dlp .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call yt-dlp with the --verbose flag and include its complete output.',
    ))

    before = before.rstrip()
    starts_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    prefix = before + ' ' if before else ''
    return prefix + msg
2015-04-17 14:55:24 +02:00
2016-10-17 13:38:37 +02:00
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it at class level
    msg = None

    def __init__(self, msg=None):
        """Use *msg* if given, else the class-level ``msg``, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
2016-10-17 13:38:37 +02:00
2021-05-04 19:06:18 +02:00
# Exception classes that ExtractorError treats as expected (network-related) failures.
# ssl.CertificateError is included only when the ssl module provides it.
network_exceptions = (
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
) + ((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ())
2016-10-17 13:38:37 +02:00
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        The final message is assembled from the optional extractor name (ie),
        the optional video_id, msg, the optional cause, and - for unexpected
        errors only - the bug report instructions.
        """
        active_exc = sys.exc_info()
        # An error raised while handling a network exception is never a bug
        if active_exc[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = active_exc  # preserve original exception

        pieces = (
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message(),
        )
        super().__init__(''.join(pieces))

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
2013-01-03 15:39:55 +01:00
2013-01-01 20:27:53 +01:00
2014-12-30 19:35:35 +01:00
class UnsupportedError(ExtractorError):
    """Expected ExtractorError for a URL that is not supported; the URL is kept in ``url``."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
2013-10-23 14:38:03 +02:00
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
2017-02-04 12:49:58 +01:00
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        """Always an expected error; *countries* is stored as given on ``self.countries``."""
        forwarded = dict(kwargs, expected=True)
        super().__init__(msg, **forwarded)
        self.countries = countries
2016-10-17 13:38:37 +02:00
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
2012-03-25 03:07:37 +02:00
2021-03-23 20:45:53 +01:00
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
2021-03-23 20:45:53 +01:00
2016-10-17 13:38:37 +02:00
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ``: <filename>`` to the default message when a filename is given."""
        if filename is not None:
            # Assigns an instance attribute; the class-level default stays untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
2012-03-25 03:07:37 +02:00
2016-10-17 13:38:37 +02:00
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    May be raised by a PostProcessor's .run() method to signal that the
    postprocessing task failed.
    """
2014-11-23 20:41:03 +01:00
2021-10-26 16:45:12 +02:00
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
2021-01-13 02:01:01 +01:00
2021-10-26 16:45:12 +02:00
class ExistingVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-existing."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
2021-01-13 02:01:01 +01:00
2021-10-26 16:45:12 +02:00
class RejectedVideoReached(DownloadCancelled):
    """Cancellation triggered by --break-on-reject."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
2021-06-23 01:11:09 +02:00
2021-10-26 16:45:12 +02:00
class MaxDownloadsReached(DownloadCancelled):
    """Cancellation triggered when the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
2021-11-28 19:57:44 +01:00
class ReExtractInfo(YoutubeDLError):
    """Video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        """Store *expected* on the instance alongside the message."""
        super().__init__(msg)
        self.expected = expected
class ThrottledDownload(ReExtractInfo):
    """Download speed below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the class-level message and is never marked as expected
        super().__init__(self.msg, expected=False)
2021-11-28 19:57:44 +01:00
2012-03-25 03:07:37 +02:00
2016-10-17 13:38:37 +02:00
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ``: <err>`` to the default message when *err* is given."""
        if err is not None:
            # Assigns an instance attribute; the class-level default stays untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
2012-03-25 03:07:37 +02:00
2016-10-17 13:38:37 +02:00
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Record the downloaded and the expected size (both in bytes)."""
        summary = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(summary)
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
2012-03-25 03:07:37 +02:00
2014-11-23 20:41:03 +01:00
2016-10-17 13:38:37 +02:00
class XAttrMetadataError(YoutubeDLError):
    """Extended-attribute error carrying an errno *code*, a message and a derived ``reason``.

    ``reason`` is 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED', chosen
    from the error code and from well-known fragments of the message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg
            or 'Disk quota exceeded' in msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
2016-10-17 13:38:37 +02:00
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass that adds no state or behaviour of its own.

    NOTE(review): presumably raised when xattr support is missing - confirm against callers.
    """
2015-01-10 20:05:28 +01:00
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class* and, if configured, bind it to a source address.

    The source address is read from ydl_handler._params['source_address'].
    When it is set, the connection only uses remote addresses of the same
    IP family as the source address.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))

    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        wanted_family = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        usable = [info for info in candidates if info[0] == wanted_family]
        if candidates and not usable:
            ip_version = 'v4' if wanted_family == socket.AF_INET else 'v6'
            raise socket.error(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))

        last_error = None
        for family, socktype, proto, _canonname, remote in usable:
            sock = None
            try:
                sock = socket.socket(family, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(remote)
                last_error = None  # Explicitly break reference cycle
                return sock
            except socket.error as exc:
                last_error = exc
                if sock is not None:
                    sock.close()

        if last_error is None:
            raise socket.error('getaddrinfo returns an empty list')
        raise last_error

    if hasattr(hc, '_create_connection'):
        hc._create_connection = _create_connection

    bind_to = (source_address, 0)
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = bind_to
        return hc

    # Python 2.6
    def _hc_connect(self, *args, **kwargs):
        sock = _create_connection((self.host, self.port), self.timeout, bind_to)
        if not is_https:
            self.sock = sock
            return
        self.sock = ssl.wrap_socket(
            sock, self.key_file, self.cert_file,
            ssl_version=ssl.PROTOCOL_TLSv1)

    hc.connect = functools.partial(_hc_connect, hc)
    return hc
2015-11-29 05:42:50 +01:00
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, *headers* is returned as is (same object). With it,
    a new dict is returned that lacks both the marker and every
    Accept-Encoding header (matched case-insensitively); the input mapping
    is left unmodified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    kept = {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding'}
    del kept[marker]
    return kept
2015-11-29 05:42:50 +01:00
2013-08-27 23:15:01 +02:00
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open *req*, going through a SOCKS connection class if 'Ytdl-socks-proxy' is set."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def _gunzip(content):
        """Return a BytesIO with the gunzipped *content*, tolerating trailing junk."""
        try:
            return io.BytesIO(gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb').read())
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                try:
                    trimmed = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                    return io.BytesIO(trimmed.read())
                except IOError:
                    continue
            raise original_ioerror

    def http_request(self, req):
        """Escape the request URL, add the standard headers and apply the internal marker headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for header_name, header_value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header_name.capitalize() not in req.headers:
                req.add_header(header_name, header_value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Transparently decompress the response body and percent-encode redirect locations."""
        old_resp = resp
        content_encoding = resp.headers.get('Content-encoding', '')

        body = None
        if content_encoding == 'gzip':
            body = self._gunzip(resp.read())
        elif content_encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))

        if body is not None:
            resp = compat_urllib_request.addinfourl(body, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            # Remove Content-encoding from the headers after decompression: this
            # method can run twice on the same response (e.g. once from
            # PerRequestProxyHandler.proxy_open() and once from the normal
            # YoutubeDL.urlopen() when cn_verification_proxy is used), and a second
            # pass must not try to decompress already-decompressed data.
            del resp.headers['Content-encoding']

        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if not 300 <= resp.code < 400:
            return resp
        location = resp.headers.get('Location')
        if not location:
            return resp

        # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
        if sys.version_info >= (3, 0):
            location = location.encode('iso-8859-1').decode('utf-8')
        else:
            location = location.decode('utf-8')

        location_escaped = escape_url(location)
        if location != location_escaped:
            del resp.headers['Location']
            if sys.version_info < (3, 0):
                location_escaped = location_escaped.encode('utf-8')
            resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
2013-04-27 15:14:20 +02:00
2014-02-06 11:29:46 +01:00
2016-04-23 15:30:06 +02:00
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of *base_class* that connects through a SOCKS proxy.

    @param base_class   HTTPConnection or HTTPSConnection (or a subclass)
    @param socks_proxy  Proxy URL; the scheme selects the protocol
                        (socks5, socks/socks4 or socks4a), the port defaults
                        to 1080, and username/password are URL-unquoted
    @returns            The new connection class
    @raises ValueError  If the URL scheme is not a supported SOCKS scheme
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    socks_types = {
        'socks5': ProxyType.SOCKS5,
        'socks': ProxyType.SOCKS4,
        'socks4': ProxyType.SOCKS4,
        'socks4a': ProxyType.SOCKS4A,
    }
    if scheme not in socks_types:
        # Previously an unknown scheme left the proxy type unbound and failed
        # with an opaque UnboundLocalError a few lines further down
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_types[scheme],
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only numeric timeouts are applied to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
2015-01-10 19:55:36 +01:00
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens connections through _create_http_connection().

    *https_conn_class* overrides the connection class; the default is
    compat_http_client.HTTPSConnection. *params* is stored on ``_params``.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open *req*, going through a SOCKS connection class if 'Ytdl-socks-proxy' is set."""
        conn_class = self._https_conn_class

        # Forward whichever of these attributes the parent handler has set
        # ('_context': python > 2.6, '_check_hostname': python 3.x)
        optional = (('context', '_context'), ('check_hostname', '_check_hostname'))
        kwargs = {
            key: getattr(self, attr)
            for key, attr in optional if hasattr(self, attr)}

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **kwargs)
2015-01-10 19:55:36 +01:00
2018-12-09 00:00:32 +01:00
class YoutubeDLCookieJar ( compat_cookiejar . MozillaCookieJar ) :
2020-03-09 22:59:02 +01:00
"""
See [ 1 ] for cookie file format .
1. https : / / curl . haxx . se / docs / http - cookies . html
"""
2019-03-03 13:23:59 +01:00
_HTTPONLY_PREFIX = ' #HttpOnly_ '
2020-05-04 23:19:33 +02:00
_ENTRY_LEN = 7
_HEADER = ''' # Netscape HTTP Cookie File
2021-02-24 19:45:56 +01:00
# This file is generated by yt-dlp. Do not edit.
2020-05-04 23:19:33 +02:00
'''
_CookieFileEntry = collections . namedtuple (
' CookieFileEntry ' ,
( ' domain_name ' , ' include_subdomains ' , ' path ' , ' https_only ' , ' expires_at ' , ' name ' , ' value ' ) )
2019-03-03 13:23:59 +01:00
2018-12-09 00:00:32 +01:00
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
    """
    Save cookies to a file.

    Most of the code is taken from CPython 3.8 and slightly adapted
    to support cookie files with UTF-8 in both python 2 and 3.

    Falls back to self.filename when no filename is given and raises
    ValueError if neither is available.
    """
    if filename is None:
        if self.filename is None:
            raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
        filename = self.filename

    # Store session cookies with `expires` set to 0 instead of an empty
    # string
    for cookie in self:
        if cookie.expires is None:
            cookie.expires = 0

    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(self._HEADER)
        now = time.time()
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue

            secure = 'TRUE' if cookie.secure else 'FALSE'
            initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
            expires = compat_str(cookie.expires) if cookie.expires is not None else ''
            if cookie.value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', cookie.name
            else:
                name, value = cookie.name, cookie.value

            fields = [cookie.domain, initial_dot, cookie.path,
                      secure, expires, name, value]
            f.write('\t'.join(fields) + '\n')
2018-12-09 00:00:32 +01:00
def load(self, filename=None, ignore_discard=False, ignore_expires=False):
    """
    Load cookies from a Netscape-format cookie file.

    filename falls back to self.filename; ValueError is raised when
    neither is set. Malformed entry lines are reported on stderr and
    skipped instead of aborting the whole load.
    """
    if filename is None:
        filename = self.filename
        if filename is None:
            raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

    def validated(line):
        # Drop the HttpOnly marker so the entry is parsed like any other.
        if line.startswith(self._HTTPONLY_PREFIX):
            line = line[len(self._HTTPONLY_PREFIX):]
        # comments and empty lines are fine
        if line.startswith('#') or not line.strip():
            return line
        fields = line.split('\t')
        if len(fields) != self._ENTRY_LEN:
            raise compat_cookiejar.LoadError('invalid length %d' % len(fields))
        entry = self._CookieFileEntry(*fields)
        if entry.expires_at and not entry.expires_at.isdigit():
            raise compat_cookiejar.LoadError('invalid expires at %s' % entry.expires_at)
        return line

    sanitized = io.StringIO()
    with io.open(filename, encoding='utf-8') as f:
        for raw_line in f:
            try:
                checked = validated(raw_line)
            except compat_cookiejar.LoadError as e:
                write_string(
                    'WARNING: skipping cookie file entry due to %s: %r\n'
                    % (e, raw_line), sys.stderr)
            else:
                sanitized.write(checked)
    sanitized.seek(0)
    self._really_load(sanitized, filename, ignore_discard, ignore_expires)

    # Session cookies are denoted by either `expires` field set to
    # an empty string or 0. MozillaCookieJar only recognizes the former
    # (see [1]), so the latter has to be turned into session cookies here.
    # Session cookies may matter for cookie-based authentication: when the
    # user does not tick 'Remember me' while logging in, important cookies
    # are often stored as session cookies, and not recognizing them
    # results in a failed login.
    # 1. https://bugs.python.org/issue17164
    for cookie in self:
        if cookie.expires == 0:
            cookie.expires = None
            cookie.discard = True
2015-09-06 02:20:48 +02:00
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to HTTP and HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response unchanged."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE: the workaround below is currently disabled (commented out).
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS traffic goes through the same request/response hooks as HTTP.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
2020-02-29 13:08:44 +01:00
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Every redirect status is handled by the stock 302 implementation.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = (
        compat_urllib_request.HTTPRedirectHandler.http_error_302)

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        Called by the http_error_30x methods when a redirection response
        is received. Returns the new Request to follow, or raises
        HTTPError when this code/method combination must not be followed.
        """
        method = req.get_method()
        followable = (
            code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
            or code in (301, 302, 303) and method == "POST")
        if not followable:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)

        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround forces it
        # to always be unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        kept_headers = dict(
            (key, value) for key, value in req.headers.items()
            if key.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=kept_headers,
            origin_req_host=req.origin_req_host, unverifiable=True)
2020-02-29 13:08:44 +01:00
2016-06-25 17:30:35 +02:00
def extract_timezone(date_str):
    """
    Split a trailing UTC designator off a date string.

    Returns a (timezone, date_str) tuple: timezone is the offset as a
    datetime.timedelta (zero for "Z" or when nothing is recognised) and
    date_str is the input with the recognised suffix removed.
    """
    tz_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(tz_match.group('tz'))]
    sign_char = tz_match.group('sign')
    if not sign_char:
        # Plain "Z": UTC, no offset.
        return datetime.timedelta(), remainder

    direction = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(tz_match.group('hours')),
        minutes=direction * int(tz_match.group('minutes')))
    return offset, remainder
2015-02-12 08:55:06 +01:00
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """
    Return a UNIX timestamp from the given ISO 8601 style date.

    delimiter separates the date and time parts. timezone, when given,
    is the datetime.timedelta offset to subtract; otherwise it is taken
    from date_str via extract_timezone(). Returns None for None input or
    when the string does not match the expected layout.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped before parsing.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
2014-03-24 01:40:09 +01:00
2016-06-25 17:30:35 +02:00
def date_formats(day_first=True):
    """Return the day-first or the month-first strptime format collection."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
2014-12-12 02:57:36 +01:00
def unified_strdate(date_str, day_first=True):
    """
    Return a string with the date in the format YYYYMMDD.

    Returns None for None input or when no known format matches.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; the last one that parses determines the result.
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, expression).strftime('%Y%m%d')
        except ValueError:
            pass

    if upload_date is None:
        # Fall back to RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if upload_date is None:
        return None
    return compat_str(upload_date)
2013-04-27 15:14:20 +02:00
2014-11-23 20:41:03 +01:00
2016-06-25 17:30:35 +02:00
def unified_timestamp(date_str, day_first=True):
    """
    Return a UNIX timestamp parsed from a free-form date string.

    Returns None for None input or when the string cannot be parsed.
    """
    if date_str is None:
        return None

    text = re.sub(r'[,|]', '', date_str)

    # A "PM" marker anywhere in the string shifts the result by 12 hours.
    pm_delta = 12 if re.search(r'(?i)PM', text) else 0
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_tail = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', text)
    if tz_tail:
        text = text[:-len(tz_tail.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nano = re.search(
        r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$',
        text)
    if nano:
        text = nano.group(1)

    pm_shift = datetime.timedelta(hours=pm_delta)
    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(text, expression) - timezone + pm_shift
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    # Last resort: RFC 2822 style dates.
    timetuple = email.utils.parsedate_tz(text)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
2016-06-25 17:30:35 +02:00
2014-11-17 07:16:12 +01:00
def determine_ext(url, default_ext='unknown_video'):
    """
    Guess the file extension from a URL.

    Returns default_ext when url is None, contains no dot, or the text
    after the last dot (query string excluded) is not a plausible
    extension.
    """
    if url is None or '.' not in url:
        return default_ext

    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
2013-07-08 01:13:55 +02:00
2014-11-23 20:41:03 +01:00
2019-10-17 23:03:53 +02:00
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
2013-07-20 12:48:57 +02:00
2014-11-23 20:41:03 +01:00
2021-04-06 08:45:15 +02:00
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse a plain date string
    precision: round the time portion of the result.
            auto|microsecond|second|minute|hour|day.
            auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.now(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is None:
        # Plain date, no relative offset.
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar units have no fixed length, so they are applied month by month.
        months = amount * 12 if unit == 'year' else amount
        new_date = datetime_add_months(start_time, months)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    if auto_precision:
        return datetime_round(new_date, unit)
    return new_date
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: strptime format used to parse a plain date string
    """
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the resulting month.
    """
    # Work with a zero-based month so divmod yields the year carry directly.
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year = dt.year + year_shift
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year=year, month=month, day=min(dt.day, last_day))
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision.

    precision: microsecond|second|minute|hour|day. 'microsecond' returns
    dt untouched; any other value not listed raises KeyError.
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    # Adding half a step before flooring rounds to the nearest multiple.
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
2014-11-23 20:41:03 +01:00
2014-01-02 13:47:28 +01:00
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
2014-11-23 20:41:03 +01:00
2013-04-27 14:01:55 +02:00
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest / latest representable date.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
2013-08-28 12:57:10 +02:00
def platform_name():
    """Returns the platform name as a compat_str"""
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
2013-08-28 18:22:28 +02:00
2021-09-23 19:40:51 +02:00
def get_windows_version():
    '''Get Windows version. None if it's not running on Windows'''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
2014-04-07 22:48:13 +02:00
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream. The special path is
    only taken when out maps to file descriptor 1 or 2 and the matching
    standard handle is an actual console.
    Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (Win32 STD_OUTPUT_HANDLE is -11, STD_ERROR_HANDLE is -12).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the amount written by each WriteConsoleW call.
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is unusable, is not a character device
        # (ignoring the REMOTE bit), or GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before a non-BMP one.
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # with a length of 2 (presumably its two UTF-16 code units).
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part was actually written.
            s = s[written.value:]
    return True
2014-04-07 19:57:42 +02:00
def write_string(s, out=None, encoding=None):
    """
    Write the text s to out (sys.stderr by default) and flush it.

    encoding overrides the encoding used when the text has to be written
    as bytes; with no override the stream's own or the preferred encoding
    is used. Characters that cannot be encoded are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    binary_stream = (
        'b' in getattr(out, 'mode', '')
        or sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if binary_stream:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a byte stream: encode ourselves and bypass it.
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
2013-08-28 14:28:55 +02:00
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints ([] for empty input)."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Elements are one-character strings (Python 2 style).
        return [ord(char) for char in bs]
    return list(bs)  # Python 3
2013-08-28 18:22:28 +02:00
2013-08-28 15:59:07 +02:00
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a bytes object (b'' for empty input)."""
    if not xs:
        return b''
    pack_format = '%dB' % len(xs)
    return compat_struct_pack(pack_format, *xs)
2013-10-02 08:41:03 +02:00
2013-10-06 04:27:09 +02:00
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs that raise IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock file object f; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file() can pass the same structure.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 requests an exclusive lock (Win32 LOCKFILE_EXCLUSIVE_LOCK).
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file(); raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)
2013-10-06 04:27:09 +02:00
class locked_file(object):
    """
    Context manager around a file opened with io.open that holds a file
    lock while inside the `with` block: shared for mode 'r', exclusive
    for 'a' and 'w'.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the handle when the lock cannot be taken.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking fails.
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2013-10-12 13:49:27 +02:00
2014-09-30 17:27:53 +02:00
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it reports None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
2013-10-12 13:49:27 +02:00
def shell_quote(args):
    """Return args joined by spaces into one string, each argument shell-quoted."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
2013-10-15 12:05:13 +02:00
def smuggle_url(url, data):
    """
    Pass additional data in a URL for internal use.

    data is JSON-encoded into a '#__youtubedl_smuggle=...' fragment
    appended to url. If url already carries smuggled data, both sets are
    merged and the values already present in the URL win on key clashes.
    The caller's data mapping is left unmodified.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a copy: updating `data` in place would leak the URL's
    # previous payload into a dict the caller may reuse.
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
2013-10-15 12:05:13 +02:00
2014-01-07 05:34:14 +01:00
def unsmuggle_url(smug_url, default=None):
    """
    Split a URL produced by smuggle_url into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(payload)
2013-11-25 03:12:26 +01:00
def format_bytes(bytes):
    """
    Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric string, or None (rendered as 'N/A').
    The value is scaled by the largest power of 1024 not exceeding it and
    printed with two decimals, e.g. 1536 -> '1.50KiB'. Values below one
    byte are shown in 'B' and values beyond the largest known suffix in
    'YiB'. Negative values raise ValueError (from math.log).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: without this, tiny fractions index it
        # with a negative number and huge values run off its end.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
2013-12-06 13:36:36 +01:00
2013-12-09 18:29:07 +01:00
2016-03-13 11:27:20 +01:00
def lookup_unit_table(unit_table, s):
    """
    Parse '<number><unit>' at the start of s and scale it by the unit.

    unit_table maps unit strings to multipliers. The number may use ','
    or '.' as decimal separator. Returns the product as an int, or None
    when s does not start with a number followed by a known unit.
    """
    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re
    match = re.match(pattern, s)
    if not match:
        return None
    number = float(match.group('num').replace(',', '.'))
    return int(number * unit_table[match.group('unit')])
2014-11-25 09:54:54 +01:00
def parse_filesize(s):
    """
    Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns None for None input or when no number + known unit is found.
    """
    if s is None:
        return None

    # (symbol letter, decimal unit name, binary unit name), smallest first
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        # The lower-case forms are of course incorrect and unofficial,
        # but we support those too
        unit_table[letter + 'iB'] = binary
        unit_table[letter + 'B'] = decimal
        unit_table[lower + 'B'] = binary
        unit_table[letter + 'b'] = decimal
        unit_table[lower + 'b'] = decimal
        unit_table[decimal_name + 'bytes'] = decimal
        unit_table[binary_name + 'bytes'] = binary

    return lookup_unit_table(unit_table, s)
def parse_count(s):
    """
    Parse a count such as '1,234' or '1.5M' into an int.

    Returns None for None input or when the text is not understood.
    """
    if s is None:
        return None

    text = s.strip()

    # Plain digits with separators need no unit handling.
    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    thousand, million = 1000, 1000 ** 2
    multipliers = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }
    return lookup_unit_table(multipliers, text)
2014-11-25 09:54:54 +01:00
2016-03-13 12:23:08 +01:00
2018-03-02 17:39:04 +01:00
def parse_resolution(s):
    """
    Extract video dimensions from a string.

    Recognises 'WIDTHxHEIGHT', '<height>p' / '<height>i' and '4k' / '8k'.
    Returns a dict with 'width' and/or 'height', or {} when nothing
    matches or s is None.
    """
    if s is None:
        return {}

    dimensions = re.search(
        r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(k_label.group(1)) * 540}

    return {}
2019-03-17 03:07:47 +01:00
def parse_bitrate(s):
    """Return the integer found in front of 'kbps' in s, or None if absent or s is not text."""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    if not found:
        return None
    return int(found.group(1))
2016-09-02 18:31:52 +02:00
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in the given language

    Unknown languages fall back to the English names; an unknown name
    yields None. """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in month_names:
        return None
    return month_names.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is unknown """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        # An abbreviation is the first three letters of the month name.
        if full_name[:3] == abbrev:
            return number
    return None
2013-12-10 21:03:53 +01:00
2014-01-20 22:11:34 +01:00
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML text by '&amp;'.

    An ampersand is left untouched when it already starts one of the five
    predefined entities (amp, lt, gt, apos, quot) or a numeric character
    reference. Numeric references need at least one digit and may be as long
    as the largest code point requires (U+10FFFF: 6 hex / 7 decimal digits),
    so references outside the BMP such as '&#x1F600;' are not double-escaped.

    @param xml_str  The XML text to repair
    @returns        The text with stray ampersands escaped
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{1,6};|#[0-9]{1,7};)',
        '&amp;',
        xml_str)
2013-12-16 05:04:12 +01:00
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl().

    Silently does nothing when running on Jython, when 'libc.so.6' cannot be
    loaded, or when the loaded libc has no prctl symbol.

    @param title  The new process name (text string)
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return

    # Initialising the buffer from the bytes (instead of sizing it to exactly
    # len(bytes) and assigning .value) makes ctypes allocate one extra byte,
    # so the string handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    PR_SET_NAME = 15
    try:
        libc.prctl(PR_SET_NAME, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2013-12-16 13:56:13 +01:00
def remove_start(s, start):
    """Return ``s`` without the prefix ``start``.

    ``s`` is returned unchanged when it is None or does not begin with
    ``start``.
    """
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2013-12-17 04:13:36 +01:00
2014-08-22 18:40:26 +02:00
def remove_end(s, end):
    """Return ``s`` without the suffix ``end``.

    ``s`` is returned unchanged when it is None, does not finish with
    ``end``, or when ``end`` is empty.
    """
    if s is None or not s.endswith(end):
        return s
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. an empty
    # string, which would throw away the whole input
    return s[:-len(end)] if end else s
2014-08-22 18:40:26 +02:00
2015-12-14 16:30:58 +01:00
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing ``s``.

    ``s`` is returned unchanged when it is None, shorter than two characters,
    or not wrapped in a matching pair of quotes.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2020-10-09 07:06:49 +02:00
def get_domain(url):
    """Return the host part of ``url`` without scheme and leading 'www.'.

    Returns None when the text before the first '/' (after an optional
    http(s) scheme) contains no dot.
    """
    matched = re.match(
        r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not matched:
        return None
    return matched.group('domain')
2013-12-17 04:13:36 +01:00
def url_basename(url):
    """Return the last '/'-separated component of the URL's path
    (leading and trailing slashes of the path are ignored)"""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
2013-12-20 17:05:28 +01:00
2016-11-01 20:14:01 +01:00
def base_url(url):
    """Return ``url`` up to and including the last '/' that precedes any
    '?', '#' or '&'.

    Raises AttributeError if ``url`` is not an http(s) URL of that shape.
    """
    matched = re.match(r'https?://[^?#&]+/', url)
    return matched.group()
2016-12-12 20:23:49 +01:00
def urljoin(base, path):
    """Join ``path`` onto ``base``, tolerating bad input.

    Both arguments may be text or UTF-8 encoded bytes.

    @returns  ``path`` itself if it already is absolute or protocol-relative
              (``scheme://...`` or ``//...``); otherwise the joined URL.
              None if ``path`` is empty or not a string, or if ``base`` is
              not an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    # Scheme characters are letters, digits, '+', '-' and '.'. The '-' has to
    # be last in the class: written as '+-.' it forms a range that also
    # admits ','
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
2013-12-20 17:05:28 +01:00
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD"""

    def get_method(self):
        return 'HEAD'
2013-12-25 15:18:40 +01:00
2016-07-02 21:21:32 +02:00
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT"""

    def get_method(self):
        return 'PUT'
2014-07-21 12:02:44 +02:00
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert ``v`` to an int, returning ``default`` on failure.

    @param v         The value to convert
    @param scale     The result is floor-divided by this
    @param default   Returned when ``v`` is None, '' or not convertible
    @param get_attr  If given, the attribute of ``v`` with this name is
                     converted instead of ``v`` itself
    @param invscale  The result is multiplied by this (before dividing)
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2014-07-21 12:02:44 +02:00
2014-08-10 13:04:45 +02:00
2014-08-10 11:00:14 +02:00
def str_or_none(v, default=None):
    """Return ``v`` converted to a text string, or ``default`` if it is None"""
    if v is None:
        return default
    return compat_str(v)
2014-07-21 12:02:44 +02:00
def str_to_int(int_str):
    """A more relaxed version of int_or_none

    Integers are returned as is. In text strings the characters ',', '.' and
    '+' are dropped before conversion. Any other type yields None.
    """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        digits_only = re.sub(r'[,\.\+]', '', int_str)
        return int_or_none(digits_only)
    return None
2013-12-26 13:49:44 +01:00
2014-07-21 12:02:44 +02:00
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert ``v`` to a float, returning ``default`` on failure.

    The converted value is multiplied by ``invscale`` and divided by
    ``scale``. ``default`` is returned when ``v`` is None or the computation
    raises ValueError or TypeError.
    """
    if v is not None:
        try:
            return float(v) * invscale / scale
        except (ValueError, TypeError):
            pass
    return default
2014-03-28 23:06:34 +01:00
2017-09-10 14:08:39 +02:00
def bool_or_none(v, default=None):
    """Return ``v`` if it is a real bool, ``default`` for anything else"""
    if isinstance(v, bool):
        return v
    return default
2019-05-23 18:58:35 +02:00
def strip_or_none(v, default=None):
    """Return ``v`` with surrounding whitespace removed if it is a text
    string, ``default`` otherwise"""
    if isinstance(v, compat_str):
        return v.strip()
    return default
2016-06-25 17:32:02 +02:00
2018-07-21 13:01:06 +02:00
def url_or_none(url):
    """Return the whitespace-stripped ``url`` if it looks like a URL.

    Accepted are protocol-relative URLs ('//...') and URLs whose scheme
    matches the http(s), rtmp*/rtmfp, rtsp*, mms or ftp(s) alternatives of
    the pattern below. Everything else, including non-strings and empty
    values, yields None.
    """
    if not url or not isinstance(url, compat_str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
2018-07-21 13:01:06 +02:00
2021-02-02 22:15:00 +01:00
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with ``date_format``, returning ``default`` on failure.

    @param timestamp    Either a number (unix timestamp, interpreted as UTC)
                        or a text string in YYYYMMDD form
    @param date_format  A strftime() format string
    @param default      Returned when the timestamp has another type or
                        cannot be converted/formatted
    """
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For any other type datetime_object is still None here and the
        # resulting AttributeError is turned into the default below
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # OverflowError/OSError: utcfromtimestamp() raises these for
        # timestamps outside the range supported by the platform
        return default
2013-12-26 13:49:44 +01:00
def parse_duration(s):
    """Parse a textual duration into a number of seconds.

    Three notations are tried in turn:
      * clock style: ``[[[DD:]HH:]MM:]SS[.fraction]``
      * unit style, ISO 8601-like or spelled out: ``P1DT2H3M4.5S``,
        ``2 hours 3 min 4 s`` (year, month and week components are accepted
        but do not contribute to the result)
      * a single decimal amount of hours or minutes: ``1.5 hours``, ``3 min.``
    A trailing ``Z`` is tolerated by all of them.

    @returns  The duration in seconds, or None if ``s`` is not a string, is
              blank, or cannot be parsed
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    try:
        # ms carries its own leading '.', so it is added as a plain fraction
        for part, seconds_per_unit in (
                (secs, 1), (mins, 60), (hours, 60 * 60), (days, 24 * 60 * 60), (ms, 1)):
            if part:
                duration += float(part) * seconds_per_unit
    except ValueError:
        # The loose hours/minutes form accepts any run of digits and dots, so
        # something like '1.2.3 hours' matches but is not a valid number
        return None
    return duration
2014-01-03 12:52:27 +01:00
2015-05-02 19:06:01 +02:00
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ``ext`` in front of the real extension of ``filename``.

    If ``expected_real_ext`` is given and differs from the file's actual
    extension, ``ext`` is appended to the complete filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
2014-01-07 06:23:41 +01:00
2015-05-02 19:23:06 +02:00
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of ``filename`` by ``ext``.

    If ``expected_real_ext`` is given and differs from the file's actual
    extension, nothing is removed and ``ext`` is appended to the complete
    filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
2014-01-07 06:23:41 +01:00
def check_executable(exe, args=[]):
    """Check whether the binary ``exe`` can be launched.

    The binary is run once with ``args`` (ideally something producing short
    output, like -version) and its output is discarded.

    @returns  ``exe`` if the process could be started, False if starting it
              raised OSError
    """
    command = [exe] + args
    try:
        Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
    except OSError:
        return False
    return exe
2014-01-20 11:36:47 +01:00
2021-11-03 19:53:48 +01:00
def _get_exe_version_output(exe, args):
    """Run ``exe`` with ``args`` and return its combined stdout/stderr as text.

    Returns False if the process could not be started (OSError).
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        process = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate_or_kill()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return out
2014-12-14 21:59:59 +01:00
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    @param output        Text printed by the executable
    @param version_re    Regex whose first group is the version; defaults to
                         matching 'version <something>'
    @param unrecognized  Returned when the regex does not match
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2021-11-03 19:53:48 +01:00
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Return the version of the specified executable.

    Returns False if the executable could not be run or printed nothing, and
    ``unrecognized`` if it ran but no version could be found in its output.
    """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2021-07-23 17:02:48 +02:00
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable

    Items are pulled from the underlying iterable only when they are needed
    and are then kept in a cache, so the iterable is consumed at most once.
    Note that slices of a LazyList are lists and not LazyList
    """

    class IndexError(IndexError):
        """Raised on out-of-range access; subclass of the builtin IndexError"""

    def __init__(self, iterable, *, reverse=False, _cache=None):
        """
        @param iterable  Source of the items
        @param reverse   Present the items in reverse order
        @param _cache    Internal: list of already evaluated items to share
        """
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Walk the shared cache by position and fetch one more item whenever
        # the end of the cache is reached. This keeps every iterator correct
        # even if another iterator, an index access or __exhaust() advances
        # the underlying iterable in the meantime
        cache = self.__cache
        position = 0
        while True:
            if position >= len(cache):
                cache.extend(itertools.islice(self.__iterable, 1))
                if position >= len(cache):
                    return
            yield cache[position]
            position += 1

    def __exhaust(self):
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        """Evaluate the entire iterable and return the items as a list"""
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Position x counted from the front <-> the same position counted
        # from the back
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
        else:
            # Only evaluate as many additional items as the request needs
            n = max(start or 0, stop or 0) - len(self.__cache) + 1
            if n > 0:
                self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing the first item avoids evaluating the whole iterable
        # (unless the list is reversed)
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2021-05-28 18:49:13 +02:00
2021-08-10 00:10:40 +02:00
class PagedList:
    """Base class for lists whose entries are fetched one page at a time.

    ``pagefunc(pagenum)`` must return an iterable with the entries of that
    page. Subclasses implement _getslice().
    """

    class IndexError(IndexError):
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the entries of page ``pagenum`` as a list, fetching the
        page unless it is already in the cache"""
        cached = self._cache.get(pagenum)
        page_results = list(self._pagefunc(pagenum)) if cached is None else cached
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries from index ``start`` up to (not including)
        ``end`` as a list; ``end=None`` means up to the last entry"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
2021-05-17 15:44:20 +02:00
2014-09-29 00:36:06 +02:00
class OnDemandPagedList(PagedList):
    """PagedList that keeps fetching pages until one comes back with fewer
    entries than a full page (or the requested end is reached)"""

    def _getslice(self, start, end):
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_end = page_first + size
            if start >= page_end:
                continue

            # Offsets of the requested range within this page
            lower = start % size if page_first <= start < page_end else 0
            if end is not None and page_first <= end <= page_end:
                upper = ((end - 1) % size) + 1
            else:
                upper = None

            page_results = self.getpage(pagenum)
            if lower != 0 or upper is not None:
                page_results = page_results[lower:upper]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + lower < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_end:
                break
2014-02-09 17:56:10 +01:00
2014-09-29 00:36:06 +02:00
class InAdvancePagedList(PagedList):
    """PagedList for sources whose total number of pages is known up front"""

    def __init__(self, pagefunc, pagecount, pagesize):
        """
        @param pagefunc   Callable returning the entries of a given page number
        @param pagecount  Total number of pages available
        @param pagesize   Number of entries per page
        """
        self._pagecount = pagecount
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Never go past the last existing page, even if the requested end
        # lies beyond the available entries
        end_page = self._pagecount if end is None else min(
            self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the first page / entries still to deliver
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2014-09-29 00:36:06 +02:00
2014-02-09 17:56:10 +01:00
def uppercase_escape(s):
    r"""Replace every literal ``\UXXXXXXXX`` escape sequence in ``s`` by the
    character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2015-05-04 15:53:05 +02:00
def lowercase_escape(s):
    r"""Replace every literal ``\uXXXX`` escape sequence in ``s`` by the
    character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2014-02-15 16:24:43 +01:00
2014-09-13 15:59:16 +02:00
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    The characters listed in the safe set below are left unquoted.
    """
    # Python 2's quote() cannot handle non-ASCII unicode strings
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2014-09-13 15:59:16 +02:00
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The host is IDNA-encoded; path, params, query and fragment are
    percent-escaped with escape_rfc3986().
    """
    parts = compat_urllib_parse_urlparse(url)
    ascii_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=ascii_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
2014-02-25 01:43:17 +01:00
2021-08-22 21:02:00 +02:00
def parse_qs(url):
    """Return the parsed query string of ``url``"""
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2014-02-25 01:43:17 +01:00
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file, one per line.

    Blank lines and lines starting with '#', ';' or ']' are skipped, a
    leading byte order mark is removed, and everything from a '#' preceded
    by whitespace onwards is dropped. ``batch_fd`` is closed afterwards.

    @param batch_fd  Iterable of lines (text or UTF-8 encoded bytes) that
                     has a close() method
    @returns         The list of cleaned-up URLs
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Byte order mark, either as the three characters U+00EF U+00BB U+00BF
        # (the UTF-8 BOM bytes as individual characters) or as U+FEFF
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for line in fd:
            cleaned = fixup(line)
            if cleaned:
                urls.append(cleaned)
    return urls
2014-03-07 15:25:33 +01:00
def urlencode_postdata(*args, **kargs):
    """URL-encode the given form data (same arguments as urlencode) and
    return it as ASCII bytes, ready to be used as a request body"""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2014-03-10 17:31:32 +01:00
2016-03-03 18:34:52 +01:00
def update_url_query(url, query):
    """Return ``url`` with the parameters of ``query`` merged into its query string.

    Parameters already present in the URL are replaced by those in ``query``.
    An empty ``query`` returns ``url`` untouched.
    """
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parsed_url.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
2015-09-06 03:22:20 +02:00
2015-12-20 01:26:26 +01:00
2016-03-31 18:55:01 +02:00
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Create a new request based on ``req`` with selected parts replaced.

    @param req      The request to copy
    @param url      New URL; defaults to the URL of ``req``
    @param data     New body; defaults to the body of ``req``
    @param headers  Headers to add to / override those of ``req``
    @param query    Query parameters to merge into the URL
    @returns        A new request with the same HTTP method (HEAD and PUT are
                    preserved through HEADRequest/PUTRequest), origin host,
                    unverifiable flag and, if set, timeout
    """
    new_headers = req.headers.copy()
    new_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)

    request_types = {'HEAD': HEADRequest, 'PUT': PUTRequest}
    req_type = request_types.get(req.get_method(), compat_urllib_request.Request)

    new_req = req_type(
        new_url, data=new_data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2017-05-06 13:06:18 +02:00
def _multipart_encode_impl(data, boundary):
    """Encode ``data`` as multipart/form-data using the given boundary.

    @returns  (body as bytes, value for the Content-Type header)
    @raises ValueError  If the boundary occurs inside one of the parts
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter)
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
def multipart_encode(data, boundary=None):
    """Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified, a Unicode object used as the boundary. Otherwise a
        random boundary is generated, and regenerated for as long as it
        collides with the data.

    Returns a tuple (body as bytes, Content-Type header value).
    Raises ValueError if a specified boundary occurs in the data.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    if boundary is not None:
        # A caller-supplied boundary is used as is; a collision with the
        # data propagates as ValueError
        out, content_type = _multipart_encode_impl(data, boundary)
        return out, content_type

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            out, content_type = _multipart_encode_impl(data, candidate)
        except ValueError:
            continue
        return out, content_type
2016-02-07 03:13:04 +01:00
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable one of several keys, in ``d``.

    With a list/tuple of keys, the value of the first key that is present
    and not None (and, if ``skip_false_values`` is set, not falsy) is
    returned; ``default`` if there is none. With a single key this is a
    plain ``d.get(key, default)``.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2016-06-12 01:05:34 +02:00
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to ``src`` and return the first
    usable result.

    A getter raising AttributeError, KeyError, TypeError or IndexError is
    skipped, as is a result that is not an instance of ``expected_type``
    (if given). Returns None when no getter yields a usable result.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
2016-06-12 01:05:34 +02:00
2018-04-27 21:47:17 +02:00
def merge_dicts(*dicts):
    """Merge the given dicts, giving precedence to the earlier ones.

    None values are ignored. A later value is only used if the key is still
    missing, or if it is a non-empty string replacing an empty string.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key in merged:
                fills_empty_string = (
                    isinstance(value, compat_str) and value
                    and isinstance(merged[key], compat_str)
                    and not merged[key])
                if not fills_empty_string:
                    continue
            merged[key] = value
    return merged
2015-12-20 01:26:26 +01:00
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return ``string`` as a text string, decoding it with ``encoding`` and
    ``errors`` if it is not one already"""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2015-09-06 03:22:20 +02:00
2014-03-21 00:59:51 +01:00
US_RATINGS = {
' G ' : 0 ,
' PG ' : 10 ,
' PG-13 ' : 13 ,
' R ' : 16 ,
' NC ' : 18 ,
}
2014-03-24 23:21:20 +01:00
2016-08-07 15:45:18 +02:00
# 'TV-*' rating labels mapped to the age limit that parse_age_limit() returns
# for them. The part after 'TV-' is also used there to build the regex
# that recognises these labels
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2014-10-03 14:37:25 +02:00
def parse_age_limit(s):
    """Convert an age rating to a numeric age limit.

    Accepted inputs:
      * an int between 0 and 21 (returned as is)
      * a string of one or two digits, optionally followed by '+'
      * a key of US_RATINGS (case-insensitive)
      * a TV_PARENTAL_GUIDELINES label (case-insensitive; the separator
        after 'TV' may be '-', '_' or missing)
    Anything else yields None.
    """
    # Exact type check on purpose: bool is a subclass of int
    if type(s) is int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    label = s.upper()
    if label in US_RATINGS:
        return US_RATINGS[label]

    tv_rating = re.match(
        r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), label)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2014-10-03 14:37:25 +02:00
2014-03-24 23:21:20 +01:00
def strip_jsonp(code):
    """Return the payload of a JSONP response.

    Strips a wrapper of the form ``[window.]callback(...)[;]``, optionally
    preceded by a ``callback &&`` guard and followed by ``//`` comments.
    Input that does not have this shape is returned unchanged.
    """
    jsonp_wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_wrapper.sub(r'\g<callback_data>', code)
2014-04-21 07:12:02 +02:00
2021-01-19 20:05:50 +01:00
def js_to_json(code, vars={}):
    """Convert a JavaScript object/array literal to JSON text.

    @param code  The JavaScript source to convert
    @param vars  A dict of var, val pairs; an identifier found in it is
                 replaced by the corresponding value (inserted verbatim)
    @returns     The converted text
    """
    # vars is a dict of var, val pairs to substitute
    # A /* ... */ block comment, or a // comment up to and including the newline
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace with at most one comment in between
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (regex, base) for the integer notations JSON does not know; the
    # optional trailing ':' is present when the number is an object key
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Translate a single token matched by the regex at the bottom
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of '!' and commas in front of a closing bracket
            # are dropped
            return ""

        if v[0] in ("'", '"'):
            # Quoted string: rewrite its content so that it is valid inside
            # double quotes. '"' gets escaped, "\'" loses its backslash, a
            # backslash-newline is removed and '\x' becomes '\u00'
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Keys have to be strings in JSON
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        # Strings, remaining identifiers and decimal keys end up double-quoted
        return '"%s"' % v

    # Alternatives, in order: double-quoted string, single-quoted string,
    # comment, comma before a closing bracket, 'void 0', identifier,
    # hex/octal integer (optionally as key), decimal integer key, run of '!'
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2014-08-22 02:33:29 +02:00
2014-04-21 07:12:02 +02:00
def qualities(quality_ids):
    """ Build a function ranking a quality id by its position in quality_ids (-1 if absent) """
    def rank(qid):
        return quality_ids.index(qid) if qid in quality_ids else -1
    return rank
2014-04-30 10:02:03 +02:00
2021-02-03 14:36:09 +01:00
# Default %-style output filename templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Output types other than 'default'.
# NOTE(review): the values look like the fixed filename extension used for that
# type, with None where no fixed extension applies; confirm against the users
# of this table.
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
2014-05-16 12:03:59 +02:00
2021-03-24 23:02:15 +01:00
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template, not a ready-made regex: {0} must be filled with the
# pattern for the mapping key and {1} with the pattern for the conversion type
# (presumably via str.format; the call sites are outside this section).
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Every conversion type character accepted by %-formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
2014-09-15 15:10:24 +02:00
2021-07-29 04:56:17 +02:00
2014-09-15 15:10:24 +02:00
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s unchanged when it fits into `length` characters (or is None).
    Otherwise the result is cut so that, including the trailing "...", it is
    never longer than `length`.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # Not enough room for any text: a negative slice bound here would
        # make the result longer than the limit, so emit only as many dots
        # as fit
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2014-10-26 16:46:34 +01:00
def version_tuple(v):
    """ Split a version string on "." and "-" into a tuple of ints (ValueError on non-numeric parts) """
    return tuple(map(int, re.split(r'[-.]', v)))
2014-10-26 16:46:34 +01:00
def is_outdated_version(version, limit, assume_new=True):
    """ Whether `version` is older than `limit`

    When the version is missing or cannot be parsed, it is considered
    outdated only if assume_new is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        outdated = fallback
    return outdated
2014-11-20 12:14:28 +01:00
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported here rather than at module level
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
2014-11-23 10:49:19 +01:00
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    return ' '.join(map(compat_shlex_quote, args))
2015-01-04 02:20:45 +01:00
2015-12-20 02:00:39 +01:00
def error_to_compat_str(err):
    """ Return the message of an exception as a text string """
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2015-02-19 00:31:01 +01:00
def mimetype2ext(mt):
    """ Guess a file extension from a MIME type

    @param mt   MIME type, optionally followed by ";parameters"; may be None
    @returns    The extension without a leading dot, or None if mt is None

    Lookup order: the full type, then the subtype, then the structured syntax
    suffix after "+". If nothing matches, the subtype itself is returned with
    "+" replaced by ".".
    """
    if mt is None:
        return None

    # Media type and subtype names are case-insensitive, so normalise once
    # instead of lower-casing for only one of the lookups
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    subtype = mt.rpartition('/')[2]
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    suffix = subtype.partition('+')[2]
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
2015-02-19 00:31:01 +01:00
2021-12-09 12:40:52 +01:00
def ext2mimetype(ext_or_url):
    """ Guess the MIME type for a bare extension or for a URL/filename (None if unknown or empty) """
    if not ext_or_url:
        return None
    # A bare extension has no dot; turn it into a filename guess_type() understands
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mimetype, _encoding = mimetypes.guess_type(filename)
    return mimetype
2016-03-16 18:48:06 +01:00
def parse_codecs(codecs_str):
    """ Split an RFC 6381 codecs string into video and audio codec

    Returns a dict with 'vcodec', 'acodec' and 'dynamic_range' when at least
    one known codec is found, only 'vcodec' and 'acodec' when exactly two
    unknown codecs are given, and {} otherwise. A warning is written to
    stderr for every unknown codec.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = (
        'avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = (
        'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3',
        'dtsc', 'dtse', 'dtsh', 'dtsl')

    raw_items = codecs_str.strip().strip(',').split(',')
    split_codecs = [item.strip() for item in raw_items if item.strip()]

    vcodec = acodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Dropping zeros makes e.g. "vp09" and "av01" compare as "vp9" and "av1"
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                continue  # only the first video codec counts
            if codec in ('vp9', 'av1'):
                vcodec = '.'.join(parts[:4])
            else:
                vcodec = full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
    if len(split_codecs) == 2:
        # Nothing recognised: assume the conventional "video, audio" order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2015-01-04 02:20:45 +01:00
def urlhandle_detect_ext(url_handle):
    """ Guess the file extension from the response headers of url_handle

    The filename of a Content-Disposition attachment is preferred; otherwise
    the Content-Type header decides.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = attachment and determine_ext(attachment.group('filename'), default_ext=None)
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
2015-01-07 07:20:20 +01:00
2015-07-22 14:03:05 +02:00
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI of the given MIME type from a bytes-like object """
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
2015-01-07 07:20:20 +01:00
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2015-01-23 01:21:30 +01:00
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) when the decoded text starts with
    optional whitespace followed by "<", else None.
    """
    # The 4-byte UTF-32 marks come before the UTF-16 ones they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a BOM the whole input is decoded as UTF-8
    bom, encoding = next(
        (pair for pair in BOMS if first_bytes.startswith(pair[0])),
        (b'', 'utf-8'))
    text = first_bytes[len(bom):].decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
2015-01-23 23:50:31 +01:00
def determine_protocol(info_dict):
    """ Return the protocol of a format: its explicit 'protocol' field, else a guess from its 'url' """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are recognised by extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2015-01-25 02:38:47 +01:00
2021-11-20 04:03:51 +01:00
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned

    @param header_row   Sequence of column titles
    @param data         Sequence of rows, each a sequence of cell values
    @param delim        String repeated to draw a line under the header;
                        falsy for no line
    @param extra_gap    Extra spaces between columns, on top of the single
                        separating space
    @param hide_empty   Drop the columns whose data cells are all empty
    @returns            The table as one string, rows separated by newlines

    The rows passed in are never modified, and may be tuples.
    """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filter_array):
        return [col for (take, col) in zip(filter_array, row) if take]

    def pad(text, max_len):
        if '\t' in text:
            # The tab expands into the padding, pushing the rest to the right
            return text.replace('\t', ' ' * (max_len - width(text))) + ' ' * extra_gap
        return text + ' ' * (max_len - width(text) + extra_gap)

    if hide_empty:
        max_lens = get_max_lens(data)
        header_row = filter_using_list(header_row, max_lens)
        data = [filter_using_list(row, max_lens) for row in data]

    # Work on stringified copies: padding the cells in place would overwrite
    # the caller's rows and fail on tuples
    table = [[str(v) for v in row] for row in [header_row, *data]]
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim and max_lens:  # without columns there is nothing to underline
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        delim_row[-1] = delim_row[-1][:-extra_gap]  # Remove extra_gap from end of delimiter
        table.insert(1, delim_row)

    return '\n'.join(
        ''.join(pad(text, max_lens[pos]) for pos, text in enumerate(row)).rstrip()
        for row in table)
2015-02-10 03:32:21 +01:00
2021-08-15 10:12:23 +02:00
def _match_one(filter_part, dct, incomplete):
    """ Evaluate a single filter condition against dct

    @param filter_part  Either "<key> [!]<op>[?] <value>" or "[!]<key>"
    @param dct          The dictionary the keys are looked up in
    @param incomplete   If true, conditions on missing fields pass
    @returns            A truthy value if the condition passes
    @raises ValueError  If the condition cannot be parsed or a string-only
                        operator is applied to a numeric field
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Exactly one of the two alternatives matched, and both require at
        # least one character, so one of these is always a non-empty string
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape quotes that were escaped inside the quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, compat_numeric_types):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return incomplete or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    # Not a comparison: try "<key>" (field is set / True) and "!<key>"
    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if incomplete and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2021-08-15 10:12:23 +02:00
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    The conditions in filter_str are separated by "&" (written as "\\&" to
    use a literal ampersand) and must all pass.
    When incomplete, all conditions pass on missing fields
    """
    filter_parts = re.split(r'(?<!\\)&', filter_str)
    for filter_part in filter_parts:
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
2015-02-10 03:32:21 +01:00
def match_filter_func(filter_str):
    """ Build a match filter function from a filter string

    The returned function takes an info dict (further arguments are passed on
    to match_str) and returns None if it passes the filter, else a message
    explaining that it is skipped.
    """
    def _match_func(info_dict, *args, **kwargs):
        passed = match_str(filter_str, info_dict, *args, **kwargs)
        if passed:
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2015-03-03 00:03:06 +01:00
2015-04-25 17:15:05 +02:00
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to seconds

    Understands plain second offsets ("12", "12.5", "12.5s") and clock values
    ("HH:MM:SS", with the fraction after either "." or ":").
    Returns a float, or None if the expression is empty or not understood.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2015-04-25 17:15:05 +02:00
2015-05-12 07:04:54 +02:00
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as an SRT timecode (HH:MM:SS,mmm) """
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
def ass_subtitles_timecode(seconds):
    """ Format a time in seconds as an ASS timecode (H:MM:SS.cc) """
    timetuple = timetuple_from_msec(seconds * 1000)
    # ASS uses centiseconds; %d drops the fractional part
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
2015-04-25 17:15:05 +02:00
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Supported TTML styling (color, font family/size, bold, italic, underline)
    is rendered with <font>, <b>, <i> and <u> tags. Paragraphs without a
    begin time, or with neither end time nor duration, are skipped.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraphs
    '''
    # Namespaces of older drafts, rewritten to the current ones before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> resolved properties
    default_style = {}  # properties inherited from <body> / <div>

    class TTMLPElementParser(object):
        """ XMLParser target rendering one paragraph as SRT text """

        def __init__(self):
            # Per-instance state: as class attributes these lists would be
            # shared by all parsers and leak styles between paragraphs
            self._out = ''
            # One entry per open element: the tags it emitted, and the
            # styling in effect inside it
            self._unclosed_elements = []
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
                return

            unclosed_elements = []
            style = {}
            element_style_id = attrib.get('style')
            if default_style:
                style.update(default_style)
            if element_style_id:
                style.update(styles.get(element_style_id, {}))
            for prop in SUPPORTED_STYLING:
                prop_val = attrib.get(_x('tts:' + prop))
                if prop_val:
                    style[prop] = prop_val

            inherited = self._applied_styles[-1] if self._applied_styles else {}
            font = ''
            for k, v in sorted(style.items()):
                if inherited.get(k) == v:
                    continue  # already in effect through an ancestor
                if k == 'color':
                    font += ' color="%s"' % v
                elif k == 'fontSize':
                    font += ' size="%s"' % v
                elif k == 'fontFamily':
                    font += ' face="%s"' % v
                elif k == 'fontWeight' and v == 'bold':
                    self._out += '<b>'
                    unclosed_elements.append('b')
                elif k == 'fontStyle' and v == 'italic':
                    self._out += '<i>'
                    unclosed_elements.append('i')
                elif k == 'textDecoration' and v == 'underline':
                    self._out += '<u>'
                    unclosed_elements.append('u')
            if font:
                self._out += '<font' + font + '>'
                unclosed_elements.append('font')

            # Always push, so that end() can always pop: the two stacks stay
            # aligned even for elements that emit no tags
            applied_style = dict(inherited)
            applied_style.update(style)
            self._applied_styles.append(applied_style)
            self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # A style can only be resolved after its parent, which may be defined
    # later in the document, so repeat the pass while it keeps resolving new
    # styles. Stopping when a pass resolves nothing prevents an endless loop
    # on styles whose parent is missing, has no supported property, or is
    # part of a cycle; such styles are left undefined.
    while True:
        repeat = False
        resolved_before = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not repeat or len(styles) == resolved_before:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2015-09-04 23:05:11 +02:00
def cli_option(params, command_option, param):
    """ Build the command line arguments for an option that takes a value

    @param params          Dictionary of parameters
    @param command_option  The option to emit, e.g. "--proxy"
    @param param           Key of the value in params
    @returns               [command_option, value as string], or [] if the
                           parameter is unset (None or missing)
    """
    value = params.get(param)
    if value is None:
        return []
    # Stringify every value that is set, including falsy ones such as 0,
    # since command line arguments have to be strings
    return [command_option, str(value)]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for a boolean option

    Returns [] if the parameter is unset, [option + separator + value] if a
    separator is given, and [option, value] otherwise. The parameter must be
    a bool when set.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if not separator:
        return [command_option, value]
    return [command_option + separator + value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if the parameter equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2021-03-09 03:17:21 +01:00
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Select extra command line arguments from argdict

    @param argdict     Dictionary mapping lower-case keys to argument lists.
                       A plain list/tuple is returned unchanged if
                       use_compat is true, and ignored otherwise
    @param keys        List/tuple of keys in order of preference. An entry
                       may itself group several keys, whose argument lists
                       are concatenated
    @param default     Returned if argdict is unset or none of the keys is set
    @returns           The arguments of the first entry of keys that is set
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_group in keys:
        candidates = (argdict.get(key.lower()) for key in variadic(key_group))
        found = [args for args in candidates if args is not None]
        if found:
            return [arg for args in found for arg in args]
    return default
2015-09-04 23:05:11 +02:00
2021-08-24 02:12:45 +02:00
2021-08-23 23:45:44 +02:00
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Look up the extra arguments for executable `exe` used by `main_key`

    Builds the list of lookup keys ("exe", or "main_key+exe" when the two
    differ, each followed by the given suffixes) and delegates to
    cli_configuration_args(). Only when the bare root key is among them are
    the combined (main_key, exe) group and "default" also tried, and the
    list/tuple backward compatibility of argdict kept.
    """
    main_key, exe = main_key.lower(), exe.lower()
    same_key = main_key == exe
    root_key = exe if same_key else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        use_compat = False
    else:
        if not same_key:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
2015-09-04 23:05:11 +02:00
2015-06-21 12:53:17 +02:00
class ISO639Utils(object):
    """ Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up; None if unknown
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # The first entry in table order wins, so the current code is
        # preferred over a withdrawn one listed after it; None if unknown
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2015-06-27 07:13:57 +02:00
class ISO3166Utils ( object ) :
# From http://data.okfn.org/data/core/country-list
_country_map = {
' AF ' : ' Afghanistan ' ,
' AX ' : ' Åland Islands ' ,
' AL ' : ' Albania ' ,
' DZ ' : ' Algeria ' ,
' AS ' : ' American Samoa ' ,
' AD ' : ' Andorra ' ,
' AO ' : ' Angola ' ,
' AI ' : ' Anguilla ' ,
' AQ ' : ' Antarctica ' ,
' AG ' : ' Antigua and Barbuda ' ,
' AR ' : ' Argentina ' ,
' AM ' : ' Armenia ' ,
' AW ' : ' Aruba ' ,
' AU ' : ' Australia ' ,
' AT ' : ' Austria ' ,
' AZ ' : ' Azerbaijan ' ,
' BS ' : ' Bahamas ' ,
' BH ' : ' Bahrain ' ,
' BD ' : ' Bangladesh ' ,
' BB ' : ' Barbados ' ,
' BY ' : ' Belarus ' ,
' BE ' : ' Belgium ' ,
' BZ ' : ' Belize ' ,
' BJ ' : ' Benin ' ,
' BM ' : ' Bermuda ' ,
' BT ' : ' Bhutan ' ,
' BO ' : ' Bolivia, Plurinational State of ' ,
' BQ ' : ' Bonaire, Sint Eustatius and Saba ' ,
' BA ' : ' Bosnia and Herzegovina ' ,
' BW ' : ' Botswana ' ,
' BV ' : ' Bouvet Island ' ,
' BR ' : ' Brazil ' ,
' IO ' : ' British Indian Ocean Territory ' ,
' BN ' : ' Brunei Darussalam ' ,
' BG ' : ' Bulgaria ' ,
' BF ' : ' Burkina Faso ' ,
' BI ' : ' Burundi ' ,
' KH ' : ' Cambodia ' ,
' CM ' : ' Cameroon ' ,
' CA ' : ' Canada ' ,
' CV ' : ' Cape Verde ' ,
' KY ' : ' Cayman Islands ' ,
' CF ' : ' Central African Republic ' ,
' TD ' : ' Chad ' ,
' CL ' : ' Chile ' ,
' CN ' : ' China ' ,
' CX ' : ' Christmas Island ' ,
' CC ' : ' Cocos (Keeling) Islands ' ,
' CO ' : ' Colombia ' ,
' KM ' : ' Comoros ' ,
' CG ' : ' Congo ' ,
' CD ' : ' Congo, the Democratic Republic of the ' ,
' CK ' : ' Cook Islands ' ,
' CR ' : ' Costa Rica ' ,
' CI ' : ' Côte d \' Ivoire ' ,
' HR ' : ' Croatia ' ,
' CU ' : ' Cuba ' ,
' CW ' : ' Curaçao ' ,
' CY ' : ' Cyprus ' ,
' CZ ' : ' Czech Republic ' ,
' DK ' : ' Denmark ' ,
' DJ ' : ' Djibouti ' ,
' DM ' : ' Dominica ' ,
' DO ' : ' Dominican Republic ' ,
' EC ' : ' Ecuador ' ,
' EG ' : ' Egypt ' ,
' SV ' : ' El Salvador ' ,
' GQ ' : ' Equatorial Guinea ' ,
' ER ' : ' Eritrea ' ,
' EE ' : ' Estonia ' ,
' ET ' : ' Ethiopia ' ,
' FK ' : ' Falkland Islands (Malvinas) ' ,
' FO ' : ' Faroe Islands ' ,
' FJ ' : ' Fiji ' ,
' FI ' : ' Finland ' ,
' FR ' : ' France ' ,
' GF ' : ' French Guiana ' ,
' PF ' : ' French Polynesia ' ,
' TF ' : ' French Southern Territories ' ,
' GA ' : ' Gabon ' ,
' GM ' : ' Gambia ' ,
' GE ' : ' Georgia ' ,
' DE ' : ' Germany ' ,
' GH ' : ' Ghana ' ,
' GI ' : ' Gibraltar ' ,
' GR ' : ' Greece ' ,
' GL ' : ' Greenland ' ,
' GD ' : ' Grenada ' ,
' GP ' : ' Guadeloupe ' ,
' GU ' : ' Guam ' ,
' GT ' : ' Guatemala ' ,
' GG ' : ' Guernsey ' ,
' GN ' : ' Guinea ' ,
' GW ' : ' Guinea-Bissau ' ,
' GY ' : ' Guyana ' ,
' HT ' : ' Haiti ' ,
' HM ' : ' Heard Island and McDonald Islands ' ,
' VA ' : ' Holy See (Vatican City State) ' ,
' HN ' : ' Honduras ' ,
' HK ' : ' Hong Kong ' ,
' HU ' : ' Hungary ' ,
' IS ' : ' Iceland ' ,
' IN ' : ' India ' ,
' ID ' : ' Indonesia ' ,
' IR ' : ' Iran, Islamic Republic of ' ,
' IQ ' : ' Iraq ' ,
' IE ' : ' Ireland ' ,
' IM ' : ' Isle of Man ' ,
' IL ' : ' Israel ' ,
' IT ' : ' Italy ' ,
' JM ' : ' Jamaica ' ,
' JP ' : ' Japan ' ,
' JE ' : ' Jersey ' ,
' JO ' : ' Jordan ' ,
' KZ ' : ' Kazakhstan ' ,
' KE ' : ' Kenya ' ,
' KI ' : ' Kiribati ' ,
' KP ' : ' Korea, Democratic People \' s Republic of ' ,
' KR ' : ' Korea, Republic of ' ,
' KW ' : ' Kuwait ' ,
' KG ' : ' Kyrgyzstan ' ,
' LA ' : ' Lao People \' s Democratic Republic ' ,
' LV ' : ' Latvia ' ,
' LB ' : ' Lebanon ' ,
' LS ' : ' Lesotho ' ,
' LR ' : ' Liberia ' ,
' LY ' : ' Libya ' ,
' LI ' : ' Liechtenstein ' ,
' LT ' : ' Lithuania ' ,
' LU ' : ' Luxembourg ' ,
' MO ' : ' Macao ' ,
' MK ' : ' Macedonia, the Former Yugoslav Republic of ' ,
' MG ' : ' Madagascar ' ,
' MW ' : ' Malawi ' ,
' MY ' : ' Malaysia ' ,
' MV ' : ' Maldives ' ,
' ML ' : ' Mali ' ,
' MT ' : ' Malta ' ,
' MH ' : ' Marshall Islands ' ,
' MQ ' : ' Martinique ' ,
' MR ' : ' Mauritania ' ,
' MU ' : ' Mauritius ' ,
' YT ' : ' Mayotte ' ,
' MX ' : ' Mexico ' ,
' FM ' : ' Micronesia, Federated States of ' ,
' MD ' : ' Moldova, Republic of ' ,
' MC ' : ' Monaco ' ,
' MN ' : ' Mongolia ' ,
' ME ' : ' Montenegro ' ,
' MS ' : ' Montserrat ' ,
' MA ' : ' Morocco ' ,
' MZ ' : ' Mozambique ' ,
' MM ' : ' Myanmar ' ,
' NA ' : ' Namibia ' ,
' NR ' : ' Nauru ' ,
' NP ' : ' Nepal ' ,
' NL ' : ' Netherlands ' ,
' NC ' : ' New Caledonia ' ,
' NZ ' : ' New Zealand ' ,
' NI ' : ' Nicaragua ' ,
' NE ' : ' Niger ' ,
' NG ' : ' Nigeria ' ,
' NU ' : ' Niue ' ,
' NF ' : ' Norfolk Island ' ,
' MP ' : ' Northern Mariana Islands ' ,
' NO ' : ' Norway ' ,
' OM ' : ' Oman ' ,
' PK ' : ' Pakistan ' ,
' PW ' : ' Palau ' ,
' PS ' : ' Palestine, State of ' ,
' PA ' : ' Panama ' ,
' PG ' : ' Papua New Guinea ' ,
' PY ' : ' Paraguay ' ,
' PE ' : ' Peru ' ,
' PH ' : ' Philippines ' ,
' PN ' : ' Pitcairn ' ,
' PL ' : ' Poland ' ,
' PT ' : ' Portugal ' ,
' PR ' : ' Puerto Rico ' ,
' QA ' : ' Qatar ' ,
' RE ' : ' Réunion ' ,
' RO ' : ' Romania ' ,
' RU ' : ' Russian Federation ' ,
' RW ' : ' Rwanda ' ,
' BL ' : ' Saint Barthélemy ' ,
' SH ' : ' Saint Helena, Ascension and Tristan da Cunha ' ,
' KN ' : ' Saint Kitts and Nevis ' ,
' LC ' : ' Saint Lucia ' ,
' MF ' : ' Saint Martin (French part) ' ,
' PM ' : ' Saint Pierre and Miquelon ' ,
' VC ' : ' Saint Vincent and the Grenadines ' ,
' WS ' : ' Samoa ' ,
' SM ' : ' San Marino ' ,
' ST ' : ' Sao Tome and Principe ' ,
' SA ' : ' Saudi Arabia ' ,
' SN ' : ' Senegal ' ,
' RS ' : ' Serbia ' ,
' SC ' : ' Seychelles ' ,
' SL ' : ' Sierra Leone ' ,
' SG ' : ' Singapore ' ,
' SX ' : ' Sint Maarten (Dutch part) ' ,
' SK ' : ' Slovakia ' ,
' SI ' : ' Slovenia ' ,
' SB ' : ' Solomon Islands ' ,
' SO ' : ' Somalia ' ,
' ZA ' : ' South Africa ' ,
' GS ' : ' South Georgia and the South Sandwich Islands ' ,
' SS ' : ' South Sudan ' ,
' ES ' : ' Spain ' ,
' LK ' : ' Sri Lanka ' ,
' SD ' : ' Sudan ' ,
' SR ' : ' Suriname ' ,
' SJ ' : ' Svalbard and Jan Mayen ' ,
' SZ ' : ' Swaziland ' ,
' SE ' : ' Sweden ' ,
' CH ' : ' Switzerland ' ,
' SY ' : ' Syrian Arab Republic ' ,
' TW ' : ' Taiwan, Province of China ' ,
' TJ ' : ' Tajikistan ' ,
' TZ ' : ' Tanzania, United Republic of ' ,
' TH ' : ' Thailand ' ,
' TL ' : ' Timor-Leste ' ,
' TG ' : ' Togo ' ,
' TK ' : ' Tokelau ' ,
' TO ' : ' Tonga ' ,
' TT ' : ' Trinidad and Tobago ' ,
' TN ' : ' Tunisia ' ,
' TR ' : ' Turkey ' ,
' TM ' : ' Turkmenistan ' ,
' TC ' : ' Turks and Caicos Islands ' ,
' TV ' : ' Tuvalu ' ,
' UG ' : ' Uganda ' ,
' UA ' : ' Ukraine ' ,
' AE ' : ' United Arab Emirates ' ,
' GB ' : ' United Kingdom ' ,
' US ' : ' United States ' ,
' UM ' : ' United States Minor Outlying Islands ' ,
' UY ' : ' Uruguay ' ,
' UZ ' : ' Uzbekistan ' ,
' VU ' : ' Vanuatu ' ,
' VE ' : ' Venezuela, Bolivarian Republic of ' ,
' VN ' : ' Viet Nam ' ,
' VG ' : ' Virgin Islands, British ' ,
' VI ' : ' Virgin Islands, U.S. ' ,
' WF ' : ' Wallis and Futuna ' ,
' EH ' : ' Western Sahara ' ,
' YE ' : ' Yemen ' ,
' ZM ' : ' Zambia ' ,
' ZW ' : ' Zimbabwe ' ,
}
@classmethod
def short2full(cls, code):
    """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

    The lookup is case-insensitive (``code`` is upper-cased first).
    Returns None when the code is not present in the country map.
    """
    normalized_code = code.upper()
    return cls._country_map.get(normalized_code)
2017-02-04 12:49:58 +01:00
class GeoUtils ( object ) :
# Major IPv4 address blocks per country
_country_ip_map = {
2019-10-29 00:10:20 +01:00
' AD ' : ' 46.172.224.0/19 ' ,
2017-02-04 12:49:58 +01:00
' AE ' : ' 94.200.0.0/13 ' ,
' AF ' : ' 149.54.0.0/17 ' ,
' AG ' : ' 209.59.64.0/18 ' ,
' AI ' : ' 204.14.248.0/21 ' ,
' AL ' : ' 46.99.0.0/16 ' ,
' AM ' : ' 46.70.0.0/15 ' ,
' AO ' : ' 105.168.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' AP ' : ' 182.50.184.0/21 ' ,
' AQ ' : ' 23.154.160.0/24 ' ,
2017-02-04 12:49:58 +01:00
' AR ' : ' 181.0.0.0/12 ' ,
' AS ' : ' 202.70.112.0/20 ' ,
2019-10-29 00:10:20 +01:00
' AT ' : ' 77.116.0.0/14 ' ,
2017-02-04 12:49:58 +01:00
' AU ' : ' 1.128.0.0/11 ' ,
' AW ' : ' 181.41.0.0/18 ' ,
2019-10-29 00:10:20 +01:00
' AX ' : ' 185.217.4.0/22 ' ,
' AZ ' : ' 5.197.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' BA ' : ' 31.176.128.0/17 ' ,
' BB ' : ' 65.48.128.0/17 ' ,
' BD ' : ' 114.130.0.0/16 ' ,
' BE ' : ' 57.0.0.0/8 ' ,
2019-10-29 00:10:20 +01:00
' BF ' : ' 102.178.0.0/15 ' ,
2017-02-04 12:49:58 +01:00
' BG ' : ' 95.42.0.0/15 ' ,
' BH ' : ' 37.131.0.0/17 ' ,
' BI ' : ' 154.117.192.0/18 ' ,
' BJ ' : ' 137.255.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' BL ' : ' 185.212.72.0/23 ' ,
2017-02-04 12:49:58 +01:00
' BM ' : ' 196.12.64.0/18 ' ,
' BN ' : ' 156.31.0.0/16 ' ,
' BO ' : ' 161.56.0.0/16 ' ,
' BQ ' : ' 161.0.80.0/20 ' ,
2019-10-29 00:10:20 +01:00
' BR ' : ' 191.128.0.0/12 ' ,
2017-02-04 12:49:58 +01:00
' BS ' : ' 24.51.64.0/18 ' ,
' BT ' : ' 119.2.96.0/19 ' ,
' BW ' : ' 168.167.0.0/16 ' ,
' BY ' : ' 178.120.0.0/13 ' ,
' BZ ' : ' 179.42.192.0/18 ' ,
' CA ' : ' 99.224.0.0/11 ' ,
' CD ' : ' 41.243.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' CF ' : ' 197.242.176.0/21 ' ,
' CG ' : ' 160.113.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' CH ' : ' 85.0.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' CI ' : ' 102.136.0.0/14 ' ,
2017-02-04 12:49:58 +01:00
' CK ' : ' 202.65.32.0/19 ' ,
' CL ' : ' 152.172.0.0/14 ' ,
2019-10-29 00:10:20 +01:00
' CM ' : ' 102.244.0.0/14 ' ,
2017-02-04 12:49:58 +01:00
' CN ' : ' 36.128.0.0/10 ' ,
' CO ' : ' 181.240.0.0/12 ' ,
' CR ' : ' 201.192.0.0/12 ' ,
' CU ' : ' 152.206.0.0/15 ' ,
' CV ' : ' 165.90.96.0/19 ' ,
' CW ' : ' 190.88.128.0/17 ' ,
2019-10-29 00:10:20 +01:00
' CY ' : ' 31.153.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' CZ ' : ' 88.100.0.0/14 ' ,
' DE ' : ' 53.0.0.0/8 ' ,
' DJ ' : ' 197.241.0.0/17 ' ,
' DK ' : ' 87.48.0.0/12 ' ,
' DM ' : ' 192.243.48.0/20 ' ,
' DO ' : ' 152.166.0.0/15 ' ,
' DZ ' : ' 41.96.0.0/12 ' ,
' EC ' : ' 186.68.0.0/15 ' ,
' EE ' : ' 90.190.0.0/15 ' ,
' EG ' : ' 156.160.0.0/11 ' ,
' ER ' : ' 196.200.96.0/20 ' ,
' ES ' : ' 88.0.0.0/11 ' ,
' ET ' : ' 196.188.0.0/14 ' ,
' EU ' : ' 2.16.0.0/13 ' ,
' FI ' : ' 91.152.0.0/13 ' ,
' FJ ' : ' 144.120.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' FK ' : ' 80.73.208.0/21 ' ,
2017-02-04 12:49:58 +01:00
' FM ' : ' 119.252.112.0/20 ' ,
' FO ' : ' 88.85.32.0/19 ' ,
' FR ' : ' 90.0.0.0/9 ' ,
' GA ' : ' 41.158.0.0/15 ' ,
' GB ' : ' 25.0.0.0/8 ' ,
' GD ' : ' 74.122.88.0/21 ' ,
' GE ' : ' 31.146.0.0/16 ' ,
' GF ' : ' 161.22.64.0/18 ' ,
' GG ' : ' 62.68.160.0/19 ' ,
2019-10-29 00:10:20 +01:00
' GH ' : ' 154.160.0.0/12 ' ,
' GI ' : ' 95.164.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' GL ' : ' 88.83.0.0/19 ' ,
' GM ' : ' 160.182.0.0/15 ' ,
' GN ' : ' 197.149.192.0/18 ' ,
' GP ' : ' 104.250.0.0/19 ' ,
' GQ ' : ' 105.235.224.0/20 ' ,
' GR ' : ' 94.64.0.0/13 ' ,
' GT ' : ' 168.234.0.0/16 ' ,
' GU ' : ' 168.123.0.0/16 ' ,
' GW ' : ' 197.214.80.0/20 ' ,
' GY ' : ' 181.41.64.0/18 ' ,
' HK ' : ' 113.252.0.0/14 ' ,
' HN ' : ' 181.210.0.0/16 ' ,
' HR ' : ' 93.136.0.0/13 ' ,
' HT ' : ' 148.102.128.0/17 ' ,
' HU ' : ' 84.0.0.0/14 ' ,
' ID ' : ' 39.192.0.0/10 ' ,
' IE ' : ' 87.32.0.0/12 ' ,
' IL ' : ' 79.176.0.0/13 ' ,
' IM ' : ' 5.62.80.0/20 ' ,
' IN ' : ' 117.192.0.0/10 ' ,
' IO ' : ' 203.83.48.0/21 ' ,
' IQ ' : ' 37.236.0.0/14 ' ,
' IR ' : ' 2.176.0.0/12 ' ,
' IS ' : ' 82.221.0.0/16 ' ,
' IT ' : ' 79.0.0.0/10 ' ,
' JE ' : ' 87.244.64.0/18 ' ,
' JM ' : ' 72.27.0.0/17 ' ,
' JO ' : ' 176.29.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' JP ' : ' 133.0.0.0/8 ' ,
2017-02-04 12:49:58 +01:00
' KE ' : ' 105.48.0.0/12 ' ,
' KG ' : ' 158.181.128.0/17 ' ,
' KH ' : ' 36.37.128.0/17 ' ,
' KI ' : ' 103.25.140.0/22 ' ,
' KM ' : ' 197.255.224.0/20 ' ,
2019-10-29 00:10:20 +01:00
' KN ' : ' 198.167.192.0/19 ' ,
2017-02-04 12:49:58 +01:00
' KP ' : ' 175.45.176.0/22 ' ,
' KR ' : ' 175.192.0.0/10 ' ,
' KW ' : ' 37.36.0.0/14 ' ,
' KY ' : ' 64.96.0.0/15 ' ,
' KZ ' : ' 2.72.0.0/13 ' ,
' LA ' : ' 115.84.64.0/18 ' ,
' LB ' : ' 178.135.0.0/16 ' ,
2019-10-29 00:10:20 +01:00
' LC ' : ' 24.92.144.0/20 ' ,
2017-02-04 12:49:58 +01:00
' LI ' : ' 82.117.0.0/19 ' ,
' LK ' : ' 112.134.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' LR ' : ' 102.183.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' LS ' : ' 129.232.0.0/17 ' ,
' LT ' : ' 78.56.0.0/13 ' ,
' LU ' : ' 188.42.0.0/16 ' ,
' LV ' : ' 46.109.0.0/16 ' ,
' LY ' : ' 41.252.0.0/14 ' ,
' MA ' : ' 105.128.0.0/11 ' ,
' MC ' : ' 88.209.64.0/18 ' ,
' MD ' : ' 37.246.0.0/16 ' ,
' ME ' : ' 178.175.0.0/17 ' ,
' MF ' : ' 74.112.232.0/21 ' ,
' MG ' : ' 154.126.0.0/17 ' ,
' MH ' : ' 117.103.88.0/21 ' ,
' MK ' : ' 77.28.0.0/15 ' ,
' ML ' : ' 154.118.128.0/18 ' ,
' MM ' : ' 37.111.0.0/17 ' ,
' MN ' : ' 49.0.128.0/17 ' ,
' MO ' : ' 60.246.0.0/16 ' ,
' MP ' : ' 202.88.64.0/20 ' ,
' MQ ' : ' 109.203.224.0/19 ' ,
' MR ' : ' 41.188.64.0/18 ' ,
' MS ' : ' 208.90.112.0/22 ' ,
' MT ' : ' 46.11.0.0/16 ' ,
' MU ' : ' 105.16.0.0/12 ' ,
' MV ' : ' 27.114.128.0/18 ' ,
2019-10-29 00:10:20 +01:00
' MW ' : ' 102.70.0.0/15 ' ,
2017-02-04 12:49:58 +01:00
' MX ' : ' 187.192.0.0/11 ' ,
' MY ' : ' 175.136.0.0/13 ' ,
' MZ ' : ' 197.218.0.0/15 ' ,
' NA ' : ' 41.182.0.0/16 ' ,
' NC ' : ' 101.101.0.0/18 ' ,
' NE ' : ' 197.214.0.0/18 ' ,
' NF ' : ' 203.17.240.0/22 ' ,
' NG ' : ' 105.112.0.0/12 ' ,
' NI ' : ' 186.76.0.0/15 ' ,
' NL ' : ' 145.96.0.0/11 ' ,
' NO ' : ' 84.208.0.0/13 ' ,
' NP ' : ' 36.252.0.0/15 ' ,
' NR ' : ' 203.98.224.0/19 ' ,
' NU ' : ' 49.156.48.0/22 ' ,
' NZ ' : ' 49.224.0.0/14 ' ,
' OM ' : ' 5.36.0.0/15 ' ,
' PA ' : ' 186.72.0.0/15 ' ,
' PE ' : ' 186.160.0.0/14 ' ,
' PF ' : ' 123.50.64.0/18 ' ,
' PG ' : ' 124.240.192.0/19 ' ,
' PH ' : ' 49.144.0.0/13 ' ,
' PK ' : ' 39.32.0.0/11 ' ,
' PL ' : ' 83.0.0.0/11 ' ,
' PM ' : ' 70.36.0.0/20 ' ,
' PR ' : ' 66.50.0.0/16 ' ,
' PS ' : ' 188.161.0.0/16 ' ,
' PT ' : ' 85.240.0.0/13 ' ,
' PW ' : ' 202.124.224.0/20 ' ,
' PY ' : ' 181.120.0.0/14 ' ,
' QA ' : ' 37.210.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' RE ' : ' 102.35.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' RO ' : ' 79.112.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' RS ' : ' 93.86.0.0/15 ' ,
2017-02-04 12:49:58 +01:00
' RU ' : ' 5.136.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' RW ' : ' 41.186.0.0/16 ' ,
2017-02-04 12:49:58 +01:00
' SA ' : ' 188.48.0.0/13 ' ,
' SB ' : ' 202.1.160.0/19 ' ,
' SC ' : ' 154.192.0.0/11 ' ,
2019-10-29 00:10:20 +01:00
' SD ' : ' 102.120.0.0/13 ' ,
2017-02-04 12:49:58 +01:00
' SE ' : ' 78.64.0.0/12 ' ,
2019-10-29 00:10:20 +01:00
' SG ' : ' 8.128.0.0/10 ' ,
2017-02-04 12:49:58 +01:00
' SI ' : ' 188.196.0.0/14 ' ,
' SK ' : ' 78.98.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' SL ' : ' 102.143.0.0/17 ' ,
2017-02-04 12:49:58 +01:00
' SM ' : ' 89.186.32.0/19 ' ,
' SN ' : ' 41.82.0.0/15 ' ,
2019-10-29 00:10:20 +01:00
' SO ' : ' 154.115.192.0/18 ' ,
2017-02-04 12:49:58 +01:00
' SR ' : ' 186.179.128.0/17 ' ,
' SS ' : ' 105.235.208.0/21 ' ,
' ST ' : ' 197.159.160.0/19 ' ,
' SV ' : ' 168.243.0.0/16 ' ,
' SX ' : ' 190.102.0.0/20 ' ,
' SY ' : ' 5.0.0.0/16 ' ,
' SZ ' : ' 41.84.224.0/19 ' ,
' TC ' : ' 65.255.48.0/20 ' ,
' TD ' : ' 154.68.128.0/19 ' ,
' TG ' : ' 196.168.0.0/14 ' ,
' TH ' : ' 171.96.0.0/13 ' ,
' TJ ' : ' 85.9.128.0/18 ' ,
' TK ' : ' 27.96.24.0/21 ' ,
' TL ' : ' 180.189.160.0/20 ' ,
' TM ' : ' 95.85.96.0/19 ' ,
' TN ' : ' 197.0.0.0/11 ' ,
' TO ' : ' 175.176.144.0/21 ' ,
' TR ' : ' 78.160.0.0/11 ' ,
' TT ' : ' 186.44.0.0/15 ' ,
' TV ' : ' 202.2.96.0/19 ' ,
' TW ' : ' 120.96.0.0/11 ' ,
' TZ ' : ' 156.156.0.0/14 ' ,
2019-10-29 00:10:20 +01:00
' UA ' : ' 37.52.0.0/14 ' ,
' UG ' : ' 102.80.0.0/13 ' ,
' US ' : ' 6.0.0.0/8 ' ,
2017-02-04 12:49:58 +01:00
' UY ' : ' 167.56.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' UZ ' : ' 84.54.64.0/18 ' ,
2017-02-04 12:49:58 +01:00
' VA ' : ' 212.77.0.0/19 ' ,
2019-10-29 00:10:20 +01:00
' VC ' : ' 207.191.240.0/21 ' ,
2017-02-04 12:49:58 +01:00
' VE ' : ' 186.88.0.0/13 ' ,
2019-10-29 00:10:20 +01:00
' VG ' : ' 66.81.192.0/20 ' ,
2017-02-04 12:49:58 +01:00
' VI ' : ' 146.226.0.0/16 ' ,
' VN ' : ' 14.160.0.0/11 ' ,
' VU ' : ' 202.80.32.0/20 ' ,
' WF ' : ' 117.20.32.0/21 ' ,
' WS ' : ' 202.4.32.0/19 ' ,
' YE ' : ' 134.35.0.0/16 ' ,
' YT ' : ' 41.242.116.0/22 ' ,
' ZA ' : ' 41.0.0.0/11 ' ,
2019-10-29 00:10:20 +01:00
' ZM ' : ' 102.144.0.0/13 ' ,
' ZW ' : ' 102.177.192.0/18 ' ,
2017-02-04 12:49:58 +01:00
}
@classmethod
def random_ipv4(cls, code_or_block):
    """Return a random IPv4 address (as text) from a country or CIDR block

    @param code_or_block  either a two-character country code, which is
                          looked up (case-insensitively) in the country IP
                          map, or a block in 'address/prefix-length' form
    @returns              a dotted-quad address string, or None when a
                          two-character code has no entry in the map
    """
    is_country_code = len(code_or_block) == 2
    if is_country_code:
        block = cls._country_ip_map.get(code_or_block.upper())
        if not block:
            return None
    else:
        block = code_or_block

    network, prefix_length = block.split('/')
    # Lowest address of the block as an unsigned 32-bit integer
    lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
    # Setting all host bits yields the highest address of the block
    host_mask = 0xffffffff >> int(prefix_length)
    highest = lowest | host_mask

    chosen = random.randint(lowest, highest)
    return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
2017-02-04 12:49:58 +01:00
2015-03-03 00:03:06 +01:00
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request pick its own proxy.

    A request may carry the internal 'Ytdl-request-proxy' header; its value
    replaces the handler-wide proxy for that request and the header is
    removed before the request is processed further.
    """

    def __init__(self, proxies=None):
        # Set default handlers: without further configuration http/https
        # requests go through proxy_open() with the '__noproxy__' marker.
        for scheme in ('http', 'https'):
            default_opener = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_opener)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Route ``req`` through ``proxy`` unless the request overrides it.

        Returns None (i.e. no proxying at this level) for the
        '__noproxy__' marker and for SOCKS proxies; otherwise defers to the
        base class implementation.
        """
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers take care of wrapping the socket
            # with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2016-02-16 23:01:44 +01:00
2017-02-28 12:16:55 +01:00
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    The result is the minimal big-endian representation of ``n``. Zero (and,
    as in the historical implementation, any negative value) is encoded as a
    single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # int.to_bytes gives the same bytes as packing 32-bit words and
        # stripping the leading zeros, without the repeated concatenation
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # add back some pad bytes so that the length is a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    The bytes are interpreted as an unsigned big-endian number; an empty
    byte string yields 0.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes replaces the manual zero-padding to 4-byte words and
    # the word-by-word accumulation; the numeric result is the same
    return int.from_bytes(s, 'big')
2016-02-16 23:01:44 +01:00
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is the byte-reversed data read as one hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    ciphertext = pow(payload, exponent, modulus)
    return '%x' % ciphertext
2016-02-24 15:08:40 +01:00
2017-02-27 11:50:19 +01:00
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result has the layout ``[0, 2] + PS + [0] + data`` where PS is a run
    of non-zero pseudo-random octets. PS must not contain a zero octet,
    because the first zero after the ``[0, 2]`` prefix marks the end of the
    padding when the block is decoded.

    NOTE: the octets come from the ``random`` module, which is not a
    cryptographically secure source.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves less than 8 octets for PS
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    padding_size = length - len(data) - 3
    # 1..255: a zero here would be mistaken for the padding terminator
    pseudo_random = [random.randint(1, 255) for _ in range(padding_size)]
    return [0, 2] + pseudo_random + [0] + data
2016-02-26 20:19:50 +01:00
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer ``num`` in base ``n``

    @param num    integer to encode, must not be negative
    @param n      the base
    @param table  sequence of digit symbols; defaults to the first ``n``
                  characters of 0-9, a-z, A-Z
    @returns      the encoded string, most significant digit first
    @raises ValueError  if ``n`` exceeds the table length, if ``num`` is
                        negative, or if a non-zero ``num`` is to be encoded
                        in a base below 2 (these cases used to loop forever)
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    if n < 2:
        # floor division by 1 never reaches zero
        raise ValueError('base %d is too small to encode %d' % (n, num))

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # digits were produced least significant first
    return ''.join(reversed(digits))
2016-02-26 07:58:29 +01:00
def decode_packed_codes(code):
    """Expand code that was packed with a symbol table

    The packed payload, the base, the symbol count and the '|'-separated
    symbol list are extracted with PACKED_CODES_RE. Every word of the
    payload is then replaced by its symbol-table entry; a word whose symbol
    is empty stands for itself.
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    base = int(base)
    remaining = int(count)
    words = symbols.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, base)
        replacements[token] = words[remaining] or token

    def _expand(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, obfuscated_code)
2016-01-10 20:09:53 +01:00
2019-11-26 20:26:42 +01:00
def caesar(s, alphabet, shift):
    """Shift every character of ``s`` by ``shift`` positions within ``alphabet``

    The shift wraps around the end of the alphabet. Characters that are not
    part of the alphabet are kept unchanged. With a shift of 0 the input is
    returned as is.
    """
    if shift == 0:
        return s
    alphabet_size = len(alphabet)

    def _rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % alphabet_size]

    return ''.join(map(_rotate, s))
def rot47(s):
    """Apply a Caesar shift of 47 over the printable ASCII characters '!' to '~'"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
2016-01-10 20:09:53 +01:00
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict

    Values may be bare or enclosed in double quotes; the surrounding quotes
    are removed from quoted values. If a key occurs more than once, the last
    occurrence wins.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
2016-06-26 09:16:49 +02:00
def urshift(val, n):
    """Shift ``val`` right by ``n`` bits, treating a negative ``val`` as an
    unsigned 32-bit number (2**32 is added before shifting)"""
    if val < 0:
        val += 0x100000000
    return val >> n
2016-08-06 20:42:58 +02:00
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into rows of raw colour component values

    @param png_data  the complete PNG file content as bytes
    @returns         a tuple (width, height, pixels); ``pixels`` is a list of
                     ``height`` rows, each a flat list of ``width * 3``
                     integers in 0..255
    @raises IOError  if the signature or the leading IHDR chunk is missing,
                     or if the file contains no IDAT data

    NOTE(review): the stride is hard-coded to ``width * 3`` and the bit
    depth / colour type fields of IHDR are never read, so this presumably
    only handles 8-bit RGB, non-interlaced images - confirm before using it
    on other PNG flavours.
    """
    # Reference: https://www.w3.org/TR/PNG/
    # Everything after the 8-byte signature is a sequence of chunks
    header = png_data[8:]

    # The first chunk (4-byte length followed by 4-byte type) must be IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned formats keyed by the byte length of the field
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Chunk layout: length (4 bytes), type (4 bytes), data, CRC (4 bytes)
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    # Only the first two IHDR fields are used
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # The compressed image stream is the concatenation of all IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Number of byte values per row, not counting the filter-type byte
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # idx is a flat index into the already reconstructed values
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by one byte naming its filter type
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so that _get_pixel() can already
        # read the values to the left within the current row
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours are the same component of the pixel to the left
            # (3 values back) and of the pixel above; missing ones count as 0
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Filter type 0 (and any unknown type) leaves the value as is
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the upper-left neighbour
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use the neighbour closest to p; ties prefer a, then b
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
2016-09-29 18:28:32 +02:00
def write_xattr(path, key, value):
    """Store ``value`` under the extended attribute ``key`` of file ``path``

    Tries, in order: an importable ``xattr`` module (pyxattr or xattr); if
    that import fails, NTFS Alternate Data Streams when compat_os_name is
    'nt', otherwise the ``setfattr`` or ``xattr`` command line tools.

    @param path   path of the file to annotate
    @param key    attribute name
    @param value  attribute value as bytes (it is written in binary mode or
                  decoded as UTF-8 for the command line tools)
    @raises XAttrMetadataError     if the attribute could not be written
    @raises XAttrUnavailableError  if no usable xattr implementation exists
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Two different modules are importable as "xattr"; only pyxattr
        # exposes set()
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                # NOTE: despite the wording of the message, no fallback
                # happens here; the error is raised to the caller
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        # No xattr module available: use OS specific means instead
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # A stream is addressed as "<file>:<stream name>"
            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The value is passed as a command line argument, so it has
                # to be text
                value = value.decode('utf-8')
                # setfattr is preferred when both tools are present
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                # A non-zero exit status is reported together with the
                # tool's stderr output
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
2017-05-01 17:09:18 +02:00
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive)

    Returns a dict that maps the three given field names to the year, month
    and day of that date, each converted to a string.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_in_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_in_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
2020-10-27 11:37:21 +01:00
2021-01-07 07:41:05 +01:00
2020-10-27 11:37:21 +01:00
# Templates for internet shortcut files, which are plain text files.
# Each template is filled in with the %-operator and a mapping. The literals
# start with a line break for readability; lstrip() removes it again.

# '.url' shortcut; expects the key 'url'
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# '.webloc' shortcut (an XML property list); expects the key 'url'
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# '.desktop' link entry; expects the keys 'filename' and 'url'
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Maps the shortcut type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
2020-10-27 11:37:21 +01:00
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for port 80 on `http`, where it is the scheme's default and therefore redundant. (Port 80 used to be dropped for every scheme, which silently changed the target of e.g. `https://example.com:80/`.)

    Raises ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    port = iri_parts.port
    # Port 80 is only implied by the `http` scheme; for any other scheme it
    # is significant and has to be preserved.
    is_default_http_port = port == 80 and iri_parts.scheme == 'http'
    if port is not None and not is_default_http_port:
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
def to_high_limit_path(path):
    """Return ``path`` in a form that is not subject to MAX_PATH on Windows

    On 'win32' and 'cygwin' the absolute path is prefixed with ``\\\\?\\``;
    on every other platform the path is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    # (a raw string cannot end in a backslash, hence the space and rstrip())
    return r'\\?\ '.rstrip() + os.path.abspath(path)
2020-12-13 15:29:09 +01:00
2021-01-07 07:41:05 +01:00
2021-08-07 13:20:46 +02:00
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    '''Format a value with a %-style template
    @param obj       The value itself, or (when `field` is given) an object
                     whose `get` method is used to look the value up
    @param field     Key to look up in `obj`; None to use `obj` directly
    @param template  %-format string applied to the final value
    @param ignore    Values for which `default` is returned instead
    @param default   Fallback for a missing/ignored value
    @param func      Optional callable applied to the value before formatting
    '''
    if field is not None:
        value = obj.get(field, default)
    elif obj is None:
        value = default
    else:
        value = obj

    # Ignored values are never passed through func
    if func and value not in ignore:
        value = func(value)

    if value in ignore:
        return default
    return template % value
2021-01-08 17:14:50 +01:00
def clean_podcast_url(url):
    '''Remove every occurrence of the listed podcast redirect prefixes from `url`'''
    redirect_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(redirect_prefix, '', url)
2021-01-22 14:43:30 +01:00
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    '''Generate a random version 4 UUID string (lowercase, hyphenated)

    In the template, "x" is replaced by any hex digit, while "y" holds the
    RFC 4122 variant bits (binary 10xx) and so is limited to 8, 9, a or b.
    '''
    def _random_digit(mobj):
        if mobj.group(0) == 'y':
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
2021-01-23 13:18:12 +01:00
def make_dir(path, to_screen=None):
    '''Create the parent directory of `path` if it does not exist yet
    @param path       File path whose directory component should exist
    @param to_screen  Optional callable that receives an error message
                      when the directory cannot be created
    @returns          True if nothing had to be done or the directory was
                      created, False if creating it failed
    '''
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok: the directory may get created by someone else
            # between the check above and this call
            os.makedirs(dn, exist_ok=True)
        return True
    except (OSError, IOError) as err:
        # Only report when a usable callback was given; to_screen=None
        # must not turn the failure into a TypeError
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
2021-01-24 14:40:02 +01:00
def get_executable_path():
    '''Return the absolute base directory, derived from how the code is being run'''
    from zipimport import zipimporter

    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        return os.path.abspath(os.path.dirname(sys.executable))

    module_dir = os.path.dirname(__file__)
    # Running from ZIP: go up two levels; otherwise go up one level
    from_zip = isinstance(globals().get('__loader__'), zipimporter)
    relative_root = '../..' if from_zip else '..'
    return os.path.abspath(os.path.join(module_dir, relative_root))
2021-05-08 17:15:14 +02:00
def load_plugins(name, suffix, namespace):
    '''Load plugin classes from `ytdlp_plugins/<name>/__init__.py`
    @param name       Sub-directory of `ytdlp_plugins` (below the path given by
                      get_executable_path) and name the module is registered
                      under in sys.modules
    @param suffix     Only module attributes whose name ends with this are loaded
    @param namespace  Mapping that receives the loaded attributes; names that
                      are already present in it are skipped
    @returns          Dict of the attributes that were added to `namespace`;
                      empty if the plugin file does not exist
    '''
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        try:
            plugins_spec.loader.exec_module(plugins)
        except FileNotFoundError:
            # Do not leave a module that was never executed in sys.modules
            sys.modules.pop(plugins_spec.name, None)
            raise
        # A separate loop variable keeps the `name` parameter intact
        for attr_name in dir(plugins):
            if attr_name in namespace:
                continue
            if not attr_name.endswith(suffix):
                continue
            klass = getattr(plugins, attr_name)
            classes[attr_name] = namespace[attr_name] = klass
    except FileNotFoundError:
        # Missing plugins are not an error
        pass
    return classes
2021-01-27 16:02:51 +01:00
2021-07-11 00:14:39 +02:00
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The value found by the first path that yields a result
                            accepted by `expected_type`, otherwise `default`
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records the deepest level of branching ("...", tuple or
        # function keys) so that the caller knows how often to flatten
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if obj is None:
                return None
            if isinstance(key, (list, tuple)):
                # Collect the result of every alternative, then branch over them
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # Characters are matched by index, like the items of a list.
                    # Iterating the bare string cannot be unpacked into (k, v)
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ":" selects everything, same as "..."
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching paths produce nested lists; flatten them by one
                # level per additional branch
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
2021-06-08 10:53:56 +02:00
2021-11-29 18:46:06 +01:00
# Deprecated
def traverse_dict(dictn, keys, casesense=True):
    '''Deprecated wrapper around traverse_obj; writes a deprecation notice on every call'''
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
2021-07-10 23:59:44 +02:00
2021-07-19 22:51:55 +02:00
def variadic(x, allowed_types=(str, bytes)):
    '''Return `x` as is if it is an iterable other than `allowed_types`, else wrap it in a 1-tuple'''
    if not isinstance(x, collections.abc.Iterable):
        return (x, )
    if isinstance(x, allowed_types):
        # Iterable, but to be treated as a single value
        return (x, )
    return x
2021-09-22 16:12:04 +02:00
2021-09-23 19:40:51 +02:00
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    '''Build a "<header>.<payload>.<signature>" token signed with HMAC-SHA256
    @param payload_data  JSON-serializable payload
    @param key           Signing key (str; encoded as UTF-8)
    @param headers       Optional dict merged over the default
                         {'alg': 'HS256', 'typ': 'JWT'} header
    @returns             The token as bytes
    '''
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    # NOTE: the segments use standard base64 with padding, not the unpadded
    # base64url encoding of RFC 7515; kept as is since callers may rely on it
    header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return signing_input + b'.' + signature_b64
2021-10-08 21:11:59 +02:00
2021-10-27 22:37:15 +02:00
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    '''Return the decoded JSON payload of a "<header>.<payload>.<signature>" token

    The signature is NOT verified.
    @param jwt  The token as str or bytes
    @raises     ValueError if the token does not consist of exactly 3 segments
    '''
    if isinstance(jwt, bytes):
        jwt = jwt.decode('ascii')
    _, payload_b64, _ = jwt.split('.')
    # JWS segments are base64url without padding (RFC 7515), but the decoder
    # requires it; restore whatever was stripped
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
2021-10-08 21:11:59 +02:00
def supports_terminal_sequences(stream):
    '''Return whether `stream` is a tty in an environment that passes the platform checks below'''
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        vt_usable = WINDOWS_VT_MODE and not (get_windows_version() < (10, 0, 10586))
        if not vt_usable:
            return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
2021-10-20 18:37:32 +02:00
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    '''Return `string` with every "ESC [ ... m" sequence removed'''
    return re.sub(_terminal_sequences_re, '', string)
def number_of_digits(number):
    '''Return the length of `number` rendered with "%d" (a minus sign counts as a character)'''
    rendered = '%d' % number
    return len(rendered)
2021-11-06 02:05:24 +01:00
def join_nonempty(*values, delim='-', from_dict=None):
    '''Join the string form of all truthy values with `delim`
    @param values     The values, or keys into `from_dict` when that is given
    @param delim      Separator placed between the joined values
    @param from_dict  Optional mapping; each value is replaced by
                      `from_dict.get(value)` before falsy ones are dropped
    '''
    if from_dict is not None:
        values = (from_dict.get(key) for key in values)
    return delim.join(str(value) for value in values if value)