Mirror of https://github.com/nexus-stc/hyperboria, synced 2025-02-21 22:31:10 +01:00.
- [nexus] Remove outdated protos - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Refactor views - [nexus] Update aiosumma - [nexus] Add tags - [nexus] Development - [nexus] Update repository - [nexus] Update repository - [nexus] Update dependencies - [nexus] Update dependencies - [nexus] Fixes for MetaAPI - [nexus] Support for new queries - [nexus] Adopt new versions of search - [nexus] Improving Nexus - [nexus] Various fixes - [nexus] Add profile - [nexus] Fixes for ingestion - [nexus] Refactorings and bugfixes - [idm] Add profile methods - [nexus] Fix stalled nexus-meta bugs - [nexus] Various bugfixes - [nexus] Restore IDM API functionality GitOrigin-RevId: a0842345a6dde5b321279ab5510a50c0def0e71a
51 lines · 1.8 KiB · Python
import re
|
|
from typing import (
|
|
AsyncIterable,
|
|
List,
|
|
Optional,
|
|
)
|
|
|
|
from nexus.pylon.prepared_request import PreparedRequest
|
|
from nexus.pylon.proxy_manager import ProxyManager
|
|
from nexus.pylon.resolvers.base import BaseResolver
|
|
|
|
|
|
class RequestResolver(BaseResolver):
    """Resolves download links by fetching a templated page and applying
    regex extractors to produce follow-up download requests.
    """

    # Default request timeout in seconds, used for the page fetch and for
    # any extractor that does not declare its own 'timeout'.
    DEFAULT_TIMEOUT = 10.0

    def __init__(
        self,
        url: str,
        extractors: List,
        proxy_list: Optional[List] = None,
        proxy_manager: Optional[ProxyManager] = None,
    ):
        """
        :param url: URL template, expanded via ``str.format(**params)`` in ``resolve()``
        :param extractors: list of dicts, each with key ``'re'`` (regex pattern searched
            case-insensitively in the downloaded page) and ``'producer'`` (a dict with
            ``'group'`` — regex group to extract, ``'format_string'`` — URL template
            taking ``matched_group``, and optional ``'timeout'``)
        :param proxy_list: forwarded to :class:`BaseResolver`
        :param proxy_manager: forwarded to :class:`BaseResolver`
        """
        super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
        self.url = url
        self.extractors = extractors

    def __str__(self):
        return f'{self.__class__.__name__}({self.url})'

    async def resolve(self, params) -> AsyncIterable[PreparedRequest]:
        """Yield :class:`PreparedRequest` objects for downloadable files.

        Formats ``self.url`` with ``params``, fetches the resulting page and
        yields one request per matching extractor. If the server responds with
        a PDF directly, a request for the page URL itself is yielded first.

        :param params: mapping used to fill placeholders in the URL template
        """
        async with self.get_session() as session:
            url = self.url.format(**params)
            async with PreparedRequest(
                method='get',
                url=url,
                timeout=self.DEFAULT_TIMEOUT,
            ).execute_with(session=session) as resp:
                # Sometimes sci-hub returns the file itself instead of a page.
                # startswith() (on a defaulted header) also accepts parameterized
                # values such as 'application/pdf;charset=...' which a strict
                # equality check would miss.
                if resp.headers.get('Content-Type', '').startswith('application/pdf'):
                    yield PreparedRequest(method='get', url=url, timeout=self.DEFAULT_TIMEOUT)
                downloaded_page_bytes = await resp.read()
                # 'backslashreplace' keeps decoding on broken byte sequences
                # instead of raising UnicodeDecodeError.
                downloaded_page = downloaded_page_bytes.decode('utf-8', 'backslashreplace')

            for extractor in self.extractors:
                match = re.search(extractor['re'], downloaded_page, re.IGNORECASE)
                if match:
                    matched_group = match.group(extractor['producer']['group'])
                    yield PreparedRequest(
                        method='get',
                        url=extractor['producer']['format_string'].format(matched_group=matched_group),
                        timeout=extractor['producer'].get('timeout', self.DEFAULT_TIMEOUT),
                    )