2021-04-12 19:47:21 +02:00
|
|
|
import asyncio
|
2021-01-08 21:08:47 +01:00
|
|
|
import hashlib
|
2021-04-12 19:47:21 +02:00
|
|
|
import socket
|
|
|
|
from contextlib import asynccontextmanager
|
2021-01-08 21:08:47 +01:00
|
|
|
from typing import (
|
|
|
|
AsyncIterable,
|
2021-04-12 19:47:21 +02:00
|
|
|
Callable,
|
2021-01-08 21:08:47 +01:00
|
|
|
Optional,
|
|
|
|
)
|
|
|
|
|
|
|
|
import aiohttp
|
|
|
|
import aiohttp.client_exceptions
|
2021-04-12 19:47:21 +02:00
|
|
|
from aiohttp.client_reqrep import ClientRequest
|
2021-01-08 21:08:47 +01:00
|
|
|
from aiohttp_socks import (
|
2021-04-12 19:47:21 +02:00
|
|
|
ProxyConnectionError,
|
2021-01-08 21:08:47 +01:00
|
|
|
ProxyConnector,
|
|
|
|
ProxyError,
|
|
|
|
)
|
|
|
|
from aiokit import AioThing
|
2021-04-12 19:47:21 +02:00
|
|
|
from izihawa_utils.importlib import class_fullname
|
|
|
|
from library.logging import error_log
|
2021-01-08 21:08:47 +01:00
|
|
|
from nexus.pylon.exceptions import (
|
|
|
|
BadResponseError,
|
2021-04-12 19:47:21 +02:00
|
|
|
DownloadError,
|
2021-01-08 21:08:47 +01:00
|
|
|
IncorrectMD5Error,
|
|
|
|
NotFoundError,
|
|
|
|
)
|
|
|
|
from nexus.pylon.pdftools import is_pdf
|
|
|
|
from nexus.pylon.proto.file_pb2 import Chunk as ChunkPb
|
|
|
|
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
|
|
|
|
from python_socks import ProxyTimeoutError
|
|
|
|
from tenacity import (
|
|
|
|
retry,
|
|
|
|
retry_if_exception_type,
|
|
|
|
stop_after_attempt,
|
|
|
|
)
|
|
|
|
|
|
|
|
# User-Agent header value that PreparedRequest installs by default; a
# caller-supplied 'User-Agent' entry in `headers` overrides it.
DEFAULT_USER_AGENT = 'PylonBot/1.0 (Linux x86_64) PylonBot/1.0.0'
|
|
|
|
|
|
|
|
|
2021-04-12 19:47:21 +02:00
|
|
|
class KeepAliveClientRequest(ClientRequest):
    """Client request that turns on TCP keep-alive before being sent.

    The keep-alive probes are configured on the socket backing the
    connection: start probing after 60 idle seconds, probe every 2 seconds,
    give up after 5 unanswered probes.
    """

    # (socket-module constant name, value) pairs applied at IPPROTO_TCP level.
    # The constants are looked up by name because not every platform's
    # `socket` module defines all of them.
    _TCP_KEEPALIVE_OPTIONS = (
        ("TCP_KEEPIDLE", 60),
        ("TCP_KEEPINTVL", 2),
        ("TCP_KEEPCNT", 5),
    )

    async def send(self, conn):
        """Configure keep-alive on the connection's socket, then send.

        Args:
            conn: the connection object handed over by aiohttp; its
                transport is queried for the underlying socket.

        Returns:
            Whatever the parent ``ClientRequest.send`` returns.
        """
        sock = conn.protocol.transport.get_extra_info("socket")
        # `get_extra_info` returns None when the transport exposes no socket;
        # in that case the request is sent without keep-alive tuning.
        if sock is not None:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
            for option_name, value in self._TCP_KEEPALIVE_OPTIONS:
                option = getattr(socket, option_name, None)
                # Skip tuning knobs the current platform does not provide
                # instead of failing the whole request with AttributeError.
                if option is not None:
                    sock.setsockopt(socket.IPPROTO_TCP, option, value)

        return await super().send(conn)
|
|
|
|
|
|
|
|
|
2021-01-08 21:08:47 +01:00
|
|
|
class PreparedRequest:
    """Description of an HTTP request that can be executed on a session.

    Holds the method, URL, headers, query params, cookies, SSL flag and
    timeout, and knows how to run itself against a session object while
    translating transport-level failures into ``DownloadError``.
    """

    def __init__(
        self,
        method: str,
        url: str,
        headers: Optional[dict] = None,
        params: Optional[dict] = None,
        cookies: Optional[dict] = None,
        ssl: bool = True,
        timeout: Optional[float] = None
    ):
        """Store request parameters.

        Args:
            method: HTTP method name.
            url: target URL.
            headers: extra headers; merged over the defaults
                (``Connection: keep-alive`` and the default User-Agent),
                so caller-supplied keys win.
            params: query parameters forwarded to the session as-is.
            cookies: cookies forwarded to the session as-is.
            ssl: value forwarded as the ``ssl`` argument of the request.
            timeout: value forwarded as the ``timeout`` argument of the
                request.
        """
        self.method = method
        self.url = url
        self.headers = {
            'Connection': 'keep-alive',
            'User-Agent': DEFAULT_USER_AGENT,
        }
        if headers:
            self.headers.update(headers)
        self.params = params
        self.cookies = cookies
        self.ssl = ssl
        self.timeout = timeout

    def __repr__(self):
        """Return ``'<method> <url> <headers> <params>'``."""
        return f'{self.method} {self.url} {self.headers} {self.params}'

    def __str__(self):
        """Return the same text as ``repr``."""
        return repr(self)

    @asynccontextmanager
    async def execute_with(self, session):
        """Execute the request on ``session`` and yield the response.

        Used as ``async with prepared.execute_with(session) as resp``.
        Exceptions raised inside the ``async with`` body are thrown back
        into this generator, so they pass through the handlers below too.

        Args:
            session: object exposing an aiohttp-style ``request(...)``
                async context manager.

        Raises:
            BadResponseError: re-raised after the request URL has been
                recorded on it via ``add('url', ...)``.
            DownloadError: raised in place of connection, payload,
                response, redirect, timeout and proxy errors; the original
                exception is attached as ``__cause__``.
        """
        try:
            async with session.request(
                method=self.method,
                url=self.url,
                timeout=self.timeout,
                headers=self.headers,
                cookies=self.cookies,
                params=self.params,
                ssl=self.ssl,
            ) as resp:
                yield resp
        except BadResponseError as e:
            e.add('url', self.url)
            # Bare `raise` re-raises the active exception unchanged.
            raise
        except (
            aiohttp.client_exceptions.ClientConnectionError,
            aiohttp.client_exceptions.ClientPayloadError,
            aiohttp.client_exceptions.ClientResponseError,
            aiohttp.client_exceptions.TooManyRedirects,
            asyncio.TimeoutError,
            ProxyConnectionError,
            ProxyTimeoutError,
            ProxyError,
        ) as e:
            # Chain explicitly so the low-level failure stays reachable as
            # `__cause__` of the domain-level error.
            raise DownloadError(nested_error=repr(e), nested_error_cls=class_fullname(e)) from e
|
2021-04-12 19:47:21 +02:00
|
|
|
|
2021-01-08 21:08:47 +01:00
|
|
|
|
|
|
|
class BaseValidator:
    """Validator that accepts everything.

    Defines the two-step protocol used for downloaded content: feed chunks
    through ``update`` and call ``validate`` at the end. Both steps are
    no-ops here.
    """

    def update(self, chunk: bytes):
        """Receive one chunk of content; the base implementation ignores it."""
        return None

    def validate(self):
        """Check the content seen so far; the base implementation never fails."""
        return None
|
|
|
|
|
|
|
|
|
|
|
|
class Md5Validator(BaseValidator):
    """Validator that compares the MD5 of the received content to an expected one."""

    def __init__(self, md5: str):
        """Remember the expected hex digest and start a fresh MD5 state.

        Args:
            md5: expected MD5 hex digest; compared case-insensitively.
        """
        self.md5 = md5
        self.v = hashlib.md5()

    def update(self, chunk: bytes):
        """Feed one chunk of content into the running MD5."""
        self.v.update(chunk)

    def validate(self):
        """Compare the accumulated digest with the expected one.

        Raises:
            IncorrectMD5Error: when the digests differ (ignoring case); it
                carries both the requested and the downloaded digest.
        """
        actual = self.v.hexdigest()
        if self.md5.lower() == actual.lower():
            return
        raise IncorrectMD5Error(requested_md5=self.md5, downloaded_md5=actual)
|
|
|
|
|
|
|
|
|
|
|
|
class DoiValidator(BaseValidator):
    """Validator for content fetched by DOI.

    Content is accepted when its MD5 matches the optional expected digest,
    or otherwise when ``is_pdf`` recognises the accumulated bytes.
    """

    def __init__(self, doi: str, md5: Optional[str] = None):
        """Set up an empty buffer and a fresh MD5 state.

        Args:
            doi: DOI the content was requested for; only used in the error.
            md5: optional expected MD5 hex digest (case-insensitive).
        """
        self.doi = doi
        self.md5 = md5
        # A bytearray grows in place; appending to immutable `bytes` would
        # copy everything received so far on every chunk.
        self._buffer = bytearray()
        self.v = hashlib.md5()

    @property
    def file(self) -> bytes:
        """Everything received so far, as immutable ``bytes``."""
        return bytes(self._buffer)

    @file.setter
    def file(self, value):
        # Kept so that code assigning to `file` directly keeps working.
        self._buffer = bytearray(value)

    def update(self, chunk):
        """Append ``chunk`` to the buffer and feed it into the running MD5."""
        self._buffer += chunk
        self.v.update(chunk)

    def validate(self):
        """Accept on MD5 match, otherwise require the content to be a PDF.

        Raises:
            BadResponseError: when there is no MD5 match and ``is_pdf``
                rejects the content; carries the DOI and the string form of
                the first 100 bytes.
        """
        if self.md5 and self.md5.lower() == self.v.hexdigest().lower():
            return
        # Materialise once so the check and the error excerpt see the same
        # `bytes` object.
        payload = self.file
        if not is_pdf(f=payload):
            raise BadResponseError(doi=self.doi, file=str(payload[:100]))
|
2021-01-08 21:08:47 +01:00
|
|
|
|
|
|
|
|
|
|
|
class BaseSource(AioThing):
    """Base class for sources that resolve and download files over HTTP.

    Provides connector/session construction (optionally through proxies),
    a validator hook, and the streaming download routine. Subclasses are
    expected to override ``resolve`` and, where needed, ``get_validator``
    and the class-level settings below.
    """

    # Container of accepted Content-Type values (compared lower-cased).
    # A falsy value disables the Content-Type check.
    allowed_content_type = None
    # The next three are not referenced inside this class — presumably
    # consumed by subclasses or callers; verify before changing.
    base_url = None
    is_enabled = True
    resolve_timeout = None
    # Forwarded as `verify_ssl` to every connector built by this class.
    ssl = True
    # Not referenced inside this class — presumably consumed elsewhere.
    timeout = None
    # Proxies are bypassed only when this is explicitly False; None (the
    # default) leaves a configured proxy in use.
    use_proxy = None

    def __init__(self, proxy: Optional[str] = None, resolve_proxy: Optional[str] = None):
        """Store proxy settings.

        Args:
            proxy: proxy URL for download sessions, or None for direct
                connections.
            resolve_proxy: proxy URL for resolve sessions, or None for
                direct connections.
        """
        super().__init__()
        self.proxy = proxy
        self.resolve_proxy = resolve_proxy

    def _build_connector(self, proxy_url: Optional[str]):
        """Return a proxy connector for ``proxy_url`` or a plain TCP connector."""
        if proxy_url and self.use_proxy is not False:
            return ProxyConnector.from_url(proxy_url, verify_ssl=self.ssl)
        return aiohttp.TCPConnector(verify_ssl=self.ssl)

    def _is_allowed_content_type(self, content_type_header: str) -> bool:
        """Tell whether a Content-Type header value passes the allow-list.

        The header matches when either its full lower-cased value or its
        media type alone (the part before the first ``;``) is contained in
        ``allowed_content_type``, so values carrying parameters such as
        ``application/pdf; charset=utf-8`` are not rejected.
        """
        if not self.allowed_content_type:
            return True
        content_type = content_type_header.lower()
        media_type = content_type.split(';', 1)[0].strip()
        return (
            content_type in self.allowed_content_type
            or media_type in self.allowed_content_type
        )

    def get_proxy(self):
        """Build the connector used for download sessions."""
        return self._build_connector(self.proxy)

    def get_resolve_proxy(self):
        """Build the connector used for resolve sessions."""
        return self._build_connector(self.resolve_proxy)

    def get_session(self):
        """Create a new client session for downloading files."""
        return aiohttp.ClientSession(request_class=KeepAliveClientRequest, connector=self.get_proxy())

    def get_resolve_session(self):
        """Create a new client session for resolving requests."""
        return aiohttp.ClientSession(request_class=KeepAliveClientRequest, connector=self.get_resolve_proxy())

    async def resolve(self, error_log_func: Callable = error_log) -> AsyncIterable[PreparedRequest]:
        """Produce the prepared requests for this source.

        Must be overridden; the base implementation always raises.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("`resolve` for BaseSource is not implemented")

    def get_validator(self):
        """Return a fresh validator for one download (accept-all by default)."""
        return BaseValidator()

    # NOTE(review): this method is an async generator, so calling it only
    # creates the generator object; the decorator most likely never sees the
    # exceptions raised while iterating — confirm against tenacity's async
    # support. In addition, `PreparedRequest.execute_with` converts all three
    # listed exception types into DownloadError before they could get here.
    @retry(
        reraise=True,
        stop=stop_after_attempt(3),
        retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)),
    )
    async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest):
        """Download the file behind ``prepared_file_request`` as a stream.

        Yields a ``FileResponsePb`` with status ``BEGIN_TRANSMISSION``
        first, then one ``FileResponsePb`` per received chunk. Every chunk
        is also fed to the validator from ``get_validator``, which is asked
        to validate after the last chunk has been yielded.

        Args:
            prepared_file_request: the request to execute.

        Raises:
            NotFoundError: the response status is 404.
            BadResponseError: the status is not 200, or the Content-Type
                is not in ``allowed_content_type`` (when that is set).
        """
        async with self.get_session() as session:
            async with prepared_file_request.execute_with(session=session) as resp:
                if resp.status == 404:
                    raise NotFoundError(url=prepared_file_request.url)
                if (
                    resp.status != 200
                    or not self._is_allowed_content_type(resp.headers.get('Content-Type', ''))
                ):
                    raise BadResponseError(
                        request_headers=prepared_file_request.headers,
                        url=prepared_file_request.url,
                        status=resp.status,
                        headers=str(resp.headers),
                    )
                file_validator = self.get_validator()
                yield FileResponsePb(status=FileResponsePb.Status.BEGIN_TRANSMISSION, source=prepared_file_request.url)
                async for content, _ in resp.content.iter_chunks():
                    file_validator.update(content)
                    yield FileResponsePb(chunk=ChunkPb(content=content), source=prepared_file_request.url)
                # Runs only after the consumer has drained every chunk.
                file_validator.validate()
|
2021-01-08 21:08:47 +01:00
|
|
|
|
|
|
|
|
|
|
|
class Md5Source(BaseSource):
    """Source whose downloads are checked against a known MD5 digest."""

    def __init__(
        self,
        md5: str,
        proxy: Optional[str] = None,
        resolve_proxy: Optional[str] = None,
    ):
        """Initialise the base source, then remember the expected digest.

        Args:
            md5: expected MD5 hex digest of the file.
            proxy: forwarded to ``BaseSource``.
            resolve_proxy: forwarded to ``BaseSource``.
        """
        super().__init__(proxy=proxy, resolve_proxy=resolve_proxy)
        self.md5 = md5

    def get_validator(self):
        """Return a new ``Md5Validator`` bound to this source's digest."""
        expected_md5 = self.md5
        return Md5Validator(expected_md5)
|
|
|
|
|
|
|
|
|
|
|
|
class DoiSource(BaseSource):
    """Source addressed by DOI, with an optional MD5 digest for validation."""

    def __init__(
        self,
        doi: str,
        md5: Optional[str] = None,
        proxy: Optional[str] = None,
        resolve_proxy: Optional[str] = None,
    ):
        """Initialise the base source, then remember the DOI and digest.

        Args:
            doi: DOI of the requested document.
            md5: optional expected MD5 hex digest of the file.
            proxy: forwarded to ``BaseSource``.
            resolve_proxy: forwarded to ``BaseSource``.
        """
        super().__init__(proxy=proxy, resolve_proxy=resolve_proxy)
        self.doi = doi
        self.md5 = md5

    def get_validator(self):
        """Return a new ``DoiValidator`` for this source's DOI and digest."""
        expected_md5 = self.md5
        return DoiValidator(self.doi, md5=expected_md5)
|