hyperboria/nexus/pylon/resolvers/doi_org_request.py
the-superpirate 7c1bb06b1b - [nexus] Development
GitOrigin-RevId: ccfd55db266862ed70f1299aaf62500765b03cc4
2022-09-06 19:35:58 +03:00

83 lines
2.6 KiB
Python

import json
import logging
from typing import (
AsyncIterable,
Dict,
List,
Optional,
)
import jq
from nexus.pylon.prepared_request import PreparedRequest
from nexus.pylon.proxy_manager import ProxyManager
from nexus.pylon.resolvers.base import BaseResolver
from tenacity import (
retry,
stop_after_attempt,
wait_random,
)
class DoiOrgRequestResolver(BaseResolver):
def __init__(
self,
selector='.link[0].URL',
format_string='{selected}',
timeout: float = 10.0,
resolve_timeout: float = 10.0,
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.text_selector = selector
self.selector = jq.compile(selector)
self.format_string = format_string
self.timeout = timeout
self.resolve_timeout = resolve_timeout
def __str__(self):
return f'{self.__class__.__name__}(selector = {self.selector}, format_string = {self.format_string})'
@retry(
reraise=True,
wait=wait_random(min=1, max=3),
stop=stop_after_attempt(4),
)
async def resolve_through_doi_org(self, params):
async with self.get_session() as session:
doi_url = f'https://doi.org/{params["doi"]}'
async with PreparedRequest(
method='get',
url=doi_url,
timeout=self.resolve_timeout,
headers={'Accept': 'application/json'}
).execute_with(session=session) as resp:
return await resp.json()
async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]:
body = await self.resolve_through_doi_org(params)
try:
selected = json.loads(self.selector.input(body).text())
except ValueError as e:
logging.getLogger('error').error({
'action': 'error',
'mode': 'pylon',
'params': params,
'error': str(e)
})
return
if selected:
yield PreparedRequest(
method='get',
url=self.format_string.format(selected=selected),
timeout=self.timeout,
)
else:
logging.getLogger('debug').error({
'action': 'missed_selector',
'mode': 'pylon',
'params': params,
'selector': self.text_selector,
'format_string': self.format_string,
})