import asyncio import json import logging import os.path import shutil import time from pathlib import Path from typing import ( Dict, List, Optional, Tuple, ) import aiofiles from izihawa_utils.random import random_string from nexus.pylon.consts import DEFAULT_USER_AGENT from nexus.pylon.drivers.base import BaseDriver from nexus.pylon.exceptions import NotFoundError from nexus.pylon.prepared_request import PreparedRequest from nexus.pylon.proto.file_pb2 import Chunk as ChunkPb from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb from nexus.pylon.proxy_manager import ProxyManager from selenium import webdriver from selenium.common.exceptions import WebDriverException from selenium.webdriver.common.by import By from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait class BrowserDriver(BaseDriver): def __init__( self, validator=None, proxy_list: Optional[List] = None, proxy_manager: Optional[ProxyManager] = None, actions: Optional[List] = None, downloads_directory='/downloads', window_size: Tuple[int, int] = (1279, 833), erase_webdrive_property: bool = True, webdrive_hub_endpoint: str = "http://127.0.0.1:4444/wd/hub", ): super().__init__(validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager) self.actions = actions self.downloads_directory = Path(downloads_directory) self.window_size = window_size self.erase_webdrive_property = erase_webdrive_property self.webdrive_hub_endpoint = webdrive_hub_endpoint async def get_chrome_sessions(self): proxies = list( self.proxy_manager.get_proxies(self.proxy_list) if self.proxy_manager and self.proxy_list else [None] ) for proxy in proxies: downloads_folder = self.downloads_directory / random_string(16) os.mkdir(downloads_folder) os.chmod(downloads_folder, 0o777) chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_folder)) try: yield chrome, downloads_folder finally: shutil.rmtree(downloads_folder) chrome.quit() def setup_chrome(self, proxy, downloads_folder): options = webdriver.ChromeOptions() options.add_experimental_option("prefs", { 'download.default_directory': str(downloads_folder), 'download.prompt_for_download': False, 'safebrowsing.enabled': True, 'plugins.always_open_pdf_externally': True, 'profile.default_content_setting_values.automatic_downloads': True, }) options.add_argument('user-agent=' + DEFAULT_USER_AGENT) if proxy: options.add_argument('--proxy-server=%s' % proxy.get_address()) options.add_argument('--headless') options.add_argument('--enable-javascript') options.add_argument('--disable-dev-shm-usage') options.add_argument("--disable-popup-blocking") chrome = webdriver.Remote( self.webdrive_hub_endpoint, DesiredCapabilities.CHROME, options=options, ) chrome.set_window_size(self.window_size[0], self.window_size[1]) if self.erase_webdrive_property: resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id url = chrome.command_executor._url + resource body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': { "source": """ Object.defineProperty(navigator, 'webdriver', { get: () => undefined }) """ }}) chrome.command_executor._request('POST', url, body) logging.getLogger('debug').debug({ 'action': 'start_chrome', 'mode': 'pylon', 'proxy': str(proxy) if proxy is not None else None, 'downloads_folder': str(downloads_folder), }) return chrome async def wait_for_file(self, path, timeout): start_time = time.time() while time.time() - timeout < start_time: file = self.get_first_file(path) if file: return file await asyncio.sleep(0.1) def get_first_file(self, path): files = os.listdir(path) if files: return str(Path(path) / files[0]) async def produce_downloaded_file(self, downloads_folder, timeout=10.0, download_timeout=1200.0): filename = await self.wait_for_file(downloads_folder, timeout) if not filename: raise NotFoundError() current_offset = 0 try: file = await aiofiles.open(filename, 'rb') except FileNotFoundError: file = await aiofiles.open(self.get_first_file(downloads_folder), 'rb') try: start_time = time.time() while time.time() - download_timeout < start_time: current_file = self.get_first_file(downloads_folder) await file.seek(0, os.SEEK_END) downloaded_offset = await file.tell() if ( not current_file.endswith('.crdownload') and downloaded_offset == current_offset and current_offset > 0 ): logging.getLogger('debug').debug({ 'action': 'sent', 'mode': 'pylon', 'filename': filename, }) return logging.getLogger('debug').debug({ 'action': 'send_part', 'mode': 'pylon', 'current_offset': current_offset, 'downloaded_offset': downloaded_offset, 'filename': filename, }) await file.seek(current_offset) yield await file.read(downloaded_offset - current_offset) current_offset = downloaded_offset await asyncio.sleep(0.5) raise NotFoundError() finally: await file.close() def get(self, chrome, url, params): logging.getLogger('debug').debug({ 'action': 'get', 'mode': 'pylon', 'url': url, }) try: chrome.get(url) if not self.actions: return True last_element = None previous_window = None current_window = chrome.window_handles[0] for action in self.actions: match action['type']: case 'click': if not last_element: raise RuntimeError('Nothing to click') chrome.execute_script("arguments[0].click();", last_element) logging.getLogger('debug').debug({ 'action': 'clicked', 'mode': 'pylon', 'element': str(last_element), }) case 'close_window': current_window = previous_window previous_window = None chrome.close() chrome.switch_to.window(current_window) case 'native_click': if not last_element: raise RuntimeError('Nothing to click') last_element.click() logging.getLogger('debug').debug({ 'action': 'native_clicked', 'mode': 'pylon', 'element': str(last_element), }) case 'switch_to_new_window': previous_window = current_window current_window = chrome.window_handles[-1] chrome.switch_to.window(current_window) case 'type': if not last_element: raise RuntimeError('Nothing to type') last_element.clear() last_element.send_keys(action['text'].format(**params)) case 'wait': time.sleep(action['timeout']) case 'wait_css_selector': last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until( EC.presence_of_element_located(( By.CSS_SELECTOR, action['selector'], )) ) logging.getLogger('debug').debug({ 'action': 'waited_css_selector', 'mode': 'pylon', 'element': str(last_element), 'step': action }) case 'wait_link_text': last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until( EC.presence_of_element_located(( By.LINK_TEXT, action['selector'], )) ) logging.getLogger('debug').debug({ 'action': 'waited_link_text', 'mode': 'pylon', 'element': str(last_element), 'step': action }) case 'wait_xpath': last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until( EC.presence_of_element_located(( By.XPATH, action['selector'], )) ) logging.getLogger('debug').debug({ 'action': 'waited_xpath', 'mode': 'pylon', 'element': str(last_element), 'step': action }) case _: raise NotImplementedError('Not implemented action type') except WebDriverException as e: logging.getLogger('debug').debug({ 'action': 'error', 'mode': 'pylon', 'error': str(e), }) return False return True async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict): async for chrome, downloads_folder in self.get_chrome_sessions(): try: result = await asyncio.get_running_loop().run_in_executor( None, lambda: self.get(chrome, prepared_file_request.url, params), ) if not result: continue file_validator = self.validator(params) yield FileResponsePb(status=FileResponsePb.Status.BEGIN_TRANSMISSION, source=chrome.current_url) async for content in self.produce_downloaded_file( downloads_folder, timeout=prepared_file_request.timeout, download_timeout=1200.0, ): file_validator.update(content) yield FileResponsePb( chunk=ChunkPb(content=content), source=chrome.current_url, ) file_validator.validate() logging.getLogger('debug').debug({ 'action': 'validated', 'mode': 'pylon', 'url': prepared_file_request.url, }) return except NotFoundError: logging.getLogger('debug').debug({ 'action': 'no_response', 'mode': 'pylon', }) raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self))