[cda] Implement birthday verification (closes #12789)

This commit is contained in:
Yen Chi Hsuan 2017-05-01 23:09:18 +08:00
parent 5401bea27f
commit 0c26548601
No known key found for this signature in database
GPG Key ID: 7F902A182457CA23
5 changed files with 125 additions and 9 deletions

View File

@ -1,6 +1,7 @@
version <unreleased> version <unreleased>
Extractors Extractors
+ [cda] Support birthday verification (#12789)
* [leeco] Fix extraction (#12974) * [leeco] Fix extraction (#12974)

View File

@ -44,6 +44,7 @@
limit_length, limit_length,
mimetype2ext, mimetype2ext,
month_by_name, month_by_name,
multipart_encode,
ohdave_rsa_encrypt, ohdave_rsa_encrypt,
OnDemandPagedList, OnDemandPagedList,
orderedSet, orderedSet,
@ -620,6 +621,16 @@ def query_dict(url):
'http://example.com/path', {'test': '第二行тест'})), 'http://example.com/path', {'test': '第二行тест'})),
query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
def test_multipart_encode(self):
    # Plain ASCII field name and value with an explicit boundary
    self.assertEqual(
        multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0],
        b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n')
    # Non-ASCII field name and value must appear as raw UTF-8 bytes in the
    # body; the value has to be non-empty to match the expected
    # b'\xe5\x80\xbc' payload below
    self.assertEqual(
        multipart_encode({'欄位'.encode('utf-8'): '值'.encode('utf-8')}, boundary='AAAAAA')[0],
        b'--AAAAAA\r\nContent-Disposition: form-data; name="\xe6\xac\x84\xe4\xbd\x8d"\r\n\r\n\xe5\x80\xbc\r\n--AAAAAA--\r\n')
    # A caller-supplied boundary that also occurs in the data is rejected
    self.assertRaises(
        ValueError, multipart_encode, {b'field': b'value'}, boundary='value')
def test_dict_get(self): def test_dict_get(self):
FALSE_VALUES = { FALSE_VALUES = {
'none': None, 'none': None,

View File

@ -9,7 +9,10 @@
ExtractorError, ExtractorError,
float_or_none, float_or_none,
int_or_none, int_or_none,
multipart_encode,
parse_duration, parse_duration,
random_birthday,
urljoin,
) )
@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):
'description': 'md5:269ccd135d550da90d1662651fcb9772', 'description': 'md5:269ccd135d550da90d1662651fcb9772',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'average_rating': float, 'average_rating': float,
'duration': 39 'duration': 39,
'age_limit': 0,
} }
}, { }, {
'url': 'http://www.cda.pl/video/57413289', 'url': 'http://www.cda.pl/video/57413289',
@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):
'uploader': 'crash404', 'uploader': 'crash404',
'view_count': int, 'view_count': int,
'average_rating': float, 'average_rating': float,
'duration': 137 'duration': 137,
'age_limit': 0,
} }
}, {
# Age-restricted
'url': 'http://www.cda.pl/video/1273454c4',
'info_dict': {
'id': '1273454c4',
'ext': 'mp4',
'title': 'Bronson (2008) napisy HD 1080p',
'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
'height': 1080,
'uploader': 'boniek61',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 5554,
'age_limit': 18,
'view_count': int,
'average_rating': float,
},
}, { }, {
'url': 'http://ebd.cda.pl/0x0/5749950c', 'url': 'http://ebd.cda.pl/0x0/5749950c',
'only_matching': True, 'only_matching': True,
}] }]
def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
    # Submit the site's birthday validation form with a random birthday and
    # return whatever page the server answers with. Extra positional and
    # keyword arguments are forwarded to _download_webpage unchanged.
    birthday_form = random_birthday('rok', 'miesiac', 'dzien')
    birthday_form['return'] = url
    birthday_form['module'] = 'video'
    birthday_form['module_id'] = video_id

    payload, content_type = multipart_encode(birthday_form)
    validate_url = urljoin(url, '/a/validatebirth')
    request_headers = {
        'Referer': url,
        'Content-Type': content_type,
    }
    return self._download_webpage(
        validate_url, video_id, *args,
        data=payload, headers=request_headers, **kwargs)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
self._set_cookie('cda.pl', 'cda.player', 'html5') self._set_cookie('cda.pl', 'cda.player', 'html5')
@ -57,6 +89,13 @@ def _real_extract(self, url):
if 'Ten film jest dostępny dla użytkowników premium' in webpage: if 'Ten film jest dostępny dla użytkowników premium' in webpage:
raise ExtractorError('This video is only available for premium users.', expected=True) raise ExtractorError('This video is only available for premium users.', expected=True)
need_confirm_age = False
if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
webpage, 'birthday validate form', default=None):
webpage = self._download_age_confirm_page(
url, video_id, note='Confirming age')
need_confirm_age = True
formats = [] formats = []
uploader = self._search_regex(r'''(?x) uploader = self._search_regex(r'''(?x)
@ -81,6 +120,7 @@ def _real_extract(self, url):
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats, 'formats': formats,
'duration': None, 'duration': None,
'age_limit': 18 if need_confirm_age else 0,
} }
def extract_format(page, version): def extract_format(page, version):
@ -121,7 +161,12 @@ def extract_format(page, version):
for href, resolution in re.findall( for href, resolution in re.findall(
r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
webpage): webpage):
webpage = self._download_webpage( if need_confirm_age:
handler = self._download_age_confirm_page
else:
handler = self._download_webpage
webpage = handler(
self._BASE_URL + href, video_id, self._BASE_URL + href, video_id,
'Downloading %s version information' % resolution, fatal=False) 'Downloading %s version information' % resolution, fatal=False)
if not webpage: if not webpage:
@ -129,6 +174,7 @@ def extract_format(page, version):
# invalid version is requested. # invalid version is requested.
self.report_warning('Unable to download %s version information' % resolution) self.report_warning('Unable to download %s version information' % resolution)
continue continue
extract_format(webpage, resolution) extract_format(webpage, resolution)
self._sort_formats(formats) self._sort_formats(formats)

View File

@ -1,7 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import random
import re import re
from .common import InfoExtractor from .common import InfoExtractor
@ -11,6 +10,7 @@
float_or_none, float_or_none,
parse_age_limit, parse_age_limit,
qualities, qualities,
random_birthday,
try_get, try_get,
unified_timestamp, unified_timestamp,
urljoin, urljoin,
@ -47,13 +47,10 @@ def _extract_urls(webpage):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
query = random_birthday('birth_year', 'birth_month', 'birth_day')
video = self._download_json( video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
video_id, query={ video_id, query=query)
'birth_month': random.randint(1, 12),
'birth_day': random.randint(1, 31),
'birth_year': random.randint(1950, 1995),
})
title = video['title'] title = video['title']

View File

@ -11,6 +11,7 @@
import ctypes import ctypes
import datetime import datetime
import email.utils import email.utils
import email.header
import errno import errno
import functools import functools
import gzip import gzip
@ -2097,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
return new_req return new_req
def try_multipart_encode(data, boundary):
    '''
    Encode a dict as multipart/form-data using the given boundary

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects. Unicode keys and values are encoded as UTF-8.
    boundary:
        A Unicode object containing only ASCII characters, used as the
        delimiter between parts.

    Returns a (body, content_type) tuple where body is a bytes object.
    Raises ValueError if the boundary occurs inside any encoded part.
    '''
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for k, v in data.items():
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        # Plain concatenation is used on purpose: %-formatting of bytes is
        # not available on Python 3.3 and 3.4
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter)
        chunks.append(content)

    # Closing delimiter terminates the whole body
    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(chunks), content_type
def multipart_encode(data, boundary=None):
    '''
    Serialize a dict into an RFC 7578 multipart/form-data body

    data:
        A dict whose keys and values are Unicode or bytes-like objects.
    boundary:
        A Unicode object to use as the boundary. When omitted, random
        boundaries are tried until one does not collide with the data.

    Returns a (body, content_type) tuple. With an explicit boundary that
    collides with the data, the ValueError is propagated to the caller.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # The caller picked the boundary, so a collision is not retried
        out, content_type = try_multipart_encode(data, boundary)
        return out, content_type

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            out, content_type = try_multipart_encode(data, candidate)
        except ValueError:
            # Random boundary collided with the payload; draw another one
            continue
        return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True): def dict_get(d, key_or_keys, default=None, skip_false_values=True):
if isinstance(key_or_keys, (list, tuple)): if isinstance(key_or_keys, (list, tuple)):
for key in key_or_keys: for key in key_or_keys:
@ -3760,3 +3813,11 @@ def write_xattr(path, key, value):
"Couldn't find a tool to set the xattrs. " "Couldn't find a tool to set the xattrs. "
"Install either the python 'xattr' module, " "Install either the python 'xattr' module, "
"or the 'xattr' binary.") "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    '''
    Build form/query fields holding a random, valid adult birthday

    year_field, month_field, day_field:
        Names of the keys under which the year, month and day are stored.

    Returns a dict mapping the three given field names to decimal strings.
    The year lies in 1950..1995 and the day never exceeds the length of
    the chosen month, so impossible dates such as 31 February are never
    produced.
    '''
    import calendar

    year = random.randint(1950, 1995)
    month = random.randint(1, 12)
    # monthrange() returns (weekday of first day, number of days in month)
    day = random.randint(1, calendar.monthrange(year, month)[1])
    return {
        year_field: str(year),
        month_field: str(month),
        day_field: str(day),
    }