[pornhub] Bypass scrape detection (closes #5930)

2024-11-27 21:46:54 +01:00 · 2019-01-23 03:51:29 +07:00 · 2019-01-23 03:51:29 +07:00 · 278d061a0c
commit 278d061a0c
parent 503b604a31
1 changed files with 22 additions and 0 deletions
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@ -10,7 +10,9 @@ from .common import InfoExtractor
 from ..compat import (
    compat_HTTPError,
    compat_str,
    compat_urllib_request,
 )
 from .openload import PhantomJSwrapper
 from ..utils import (
    ExtractorError,
    int_or_none,
@ -126,6 +128,26 @@ class PornHubIE(InfoExtractor):
        'only_matching': True,
    }]
    def _download_webpage_handle(self, *args, **kwargs):
        """Download a page, working around the JS scrape-detection challenge.

        Accepts the same arguments as the parent implementation and forwards
        them unchanged. If the downloaded page matches one of the known
        challenge markers, the page is handed to PhantomJS and the download
        is retried once with the same arguments.

        Returns whatever the parent implementation returns: normally a
        ``(webpage, urlh)`` tuple. A falsy result (e.g. from a failed
        non-fatal download) is passed through untouched instead of being
        unpacked.
        """
        def dl(*args, **kwargs):
            return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs)

        ret = dl(*args, **kwargs)
        # A falsy result is not a (webpage, urlh) pair; unpacking it would
        # raise TypeError and mask the caller's own failure handling.
        if not ret:
            return ret

        webpage, urlh = ret
        # Markers of the interstitial challenge page served instead of the
        # real content.
        challenge_markers = (
            r'<body\b[^>]+\bonload=["\']go\(\)',
            r'document\.cookie\s*=\s*["\']RNKEY=',
            r'document\.location\.reload\(true\)')
        if not any(re.search(p, webpage) for p in challenge_markers):
            return webpage, urlh

        # The URL is normally the first positional argument; the keyword
        # fallback assumes the parent names that parameter `url_or_request`
        # (TODO confirm against InfoExtractor).
        url_or_request = args[0] if args else kwargs['url_or_request']
        url = (url_or_request.get_full_url()
               if isinstance(url_or_request, compat_urllib_request.Request)
               else url_or_request)
        # Let PhantomJS process the challenge page; presumably this makes the
        # cookie set by its script available to the retry below -- confirm
        # against PhantomJSwrapper.get.
        phantom = PhantomJSwrapper(self, required_version='2.0')
        phantom.get(url, html=webpage)
        # Return the retry result as-is so a falsy outcome here is propagated
        # rather than unpacked.
        return dl(*args, **kwargs)
    @staticmethod
    def _extract_urls(webpage):
        return re.findall(