- [nexus] Development

GitOrigin-RevId: 5d5feedff7b70be4c788abeb22f89c6758431d33
the-superpirate 2022-09-13 17:15:16 +03:00
parent 73f5fd4a06
commit a683e0ce18
58 changed files with 655 additions and 634 deletions

View File

@ -43,7 +43,7 @@ py3_image(
"//library/aiogrpctools",
requirement("aiokit"),
"//library/aiopostgres",
"//library/configurator",
requirement("izihawa_configurator"),
"//library/telegram",
requirement("izihawa_utils"),
],

View File

@ -1,11 +1,13 @@
from library.configurator import Configurator
from izihawa_configurator import Configurator
from izihawa_utils import env
def get_config():
return Configurator([
'idm/api/configs/base.yaml',
'idm/api/configs/%s.yaml?' % env.type,
'idm/api/configs/logging.yaml',
], env_prefix='NEXUS_IDM_API')
], env_prefix='IDM_API')
config = get_config()

View File

@ -7,9 +7,9 @@ from idm.api.configs import get_config
from idm.api.services.chat_manager import ChatManagerService
from idm.api.services.profile import ProfileService
from idm.api.services.subscription_manager import SubscriptionManagerService
from izihawa_configurator import Configurator
from library.aiogrpctools import AioGrpcServer
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
from library.configurator import Configurator
from library.logging import configure_logging

View File

@ -105,20 +105,24 @@ class ProfileService(profile_service_pb2_grpc.ProfileServicer, BaseService):
for tag in download_document.tags:
tags_counter[tag] += 1
most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:7]
most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:14]
most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7]
most_popular_series = []
async for row in self.application.pool_holder['nexus'].iterate(
f"select name, issns from series where issns && array[{most_popular_issns}]::text[]".format(
most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
),
row_factory=dict_row,
):
most_popular_series.append(profile_service_pb2.Series(
name=row['name'],
issns=row['issns'],
))
if most_popular_issns:
async for row in self.application.pool_holder['nexus'].iterate(
"select name, array_agg(issn) as issns from series "
"where issn in ({most_popular_issns}) "
"group by name order by name "
"limit 7".format(
most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
),
row_factory=dict_row,
):
most_popular_series.append(profile_service_pb2.Series(
name=row['name'],
issns=row['issns'],
))
return most_popular_series, most_popular_tags
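For clarity, a minimal sketch (not part of the commit) of the SQL string produced by the formatting in the hunk above, using two made-up ISSNs:

```python
# Hypothetical illustration of the query built above; the ISSN values are invented.
most_popular_issns = ['1476-4687', '0036-8075']
query = (
    "select name, array_agg(issn) as issns from series "
    "where issn in ({most_popular_issns}) "
    "group by name order by name "
    "limit 7".format(
        most_popular_issns=','.join(map(lambda issn: "'" + issn + "'", most_popular_issns)),
    )
)
# -> select name, array_agg(issn) as issns from series where issn in ('1476-4687','0036-8075')
#    group by name order by name limit 7
print(query)
```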

View File

@ -13,7 +13,7 @@ py_library(
requirement("grpcio"),
requirement("pyyaml"),
requirement("aiokit"),
"//library/configurator",
requirement("izihawa_configurator"),
"//library/logging",
requirement("izihawa_utils"),
],

View File

@ -92,6 +92,7 @@ class AioPostgresPoolHolder(AioThing):
row_factory=tuple_row,
cursor_name: Optional[str] = None,
itersize: Optional[int] = None,
statement_timeout: Optional[int] = None,
):
if not self.pool:
raise RuntimeError('AioPostgresPoolHolder has not been started')
@ -99,7 +100,9 @@ class AioPostgresPoolHolder(AioThing):
async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur:
if itersize is not None:
cur.itersize = itersize
await cur.execute(stmt, values)
if statement_timeout:
await cur.execute(f'SET statement_timeout = {statement_timeout};')
await cur.execute(stmt, values)
async for row in cur:
yield row
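A hedged usage sketch of the extended `iterate` signature, assuming an already-started `AioPostgresPoolHolder` and a hypothetical `documents` table; only keyword arguments visible in this hunk are used:

```python
# Sketch only: pool_holder is assumed to be a started AioPostgresPoolHolder,
# and the 'documents' table is hypothetical. Intended to be awaited inside an
# existing event loop.
async def dump_documents(pool_holder):
    async for row in pool_holder.iterate(
        'select id, title from documents',
        cursor_name='example_cursor',  # server-side cursor, as in the ingest job changed later in this commit
        itersize=50_000,
        statement_timeout=3600 * 2,    # parameter added in this commit
    ):
        print(row)
```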

View File

@ -1,17 +0,0 @@
load("@pip_modules//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
name = "configurator",
srcs = glob(
["**/*.py"],
exclude = ["tests/**"],
),
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
requirement("jinja2"),
requirement("pyyaml"),
requirement("izihawa_utils"),
],
)

View File

@ -1,170 +0,0 @@
import json
import os
import os.path
from types import ModuleType
import yaml
from izihawa_utils.common import (
smart_merge_dicts,
unflatten,
)
from jinja2 import Template
from library.configurator.exceptions import UnknownConfigFormatError
class ConfigObject(dict):
def __getattr__(self, name):
try:
return self[name]
except KeyError as e:
raise AttributeError(e)
class AnyOf:
def __init__(self, *args):
self.args = args
class RichDict(dict):
def has(self, *args):
current = self
for c in args:
if c not in current:
return False
current = current[c]
return True
def copy_if_exists(self, source_keys, target_key):
current = self
for c in source_keys:
if c not in current:
return False
current = current[c]
self[target_key] = current
return True
class Configurator(RichDict):
def __init__(self, configs: list, env_prefix: str = None, env_key_separator: str = '.'):
"""
Create Configurator object
:param configs: list of paths to config files, dicts or modules.
End filepath with `?` to mark it as optional config.
"""
super().__init__()
self._by_basenames = {}
self._omitted_files = []
env_dict = {}
if env_prefix:
env_prefix = env_prefix.lower()
for name, value in os.environ.items():
if name.lower().startswith(env_prefix):
stripped_name = name[len(env_prefix):].lstrip('_')
if stripped_name[-2:] == '[]':
if stripped_name not in env_dict:
env_dict[stripped_name[:-2]] = []
env_dict[stripped_name[:-2]].append(value)
else:
env_dict[stripped_name] = value
env_dict = unflatten(env_dict, sep=env_key_separator)
for config in ([os.environ] + configs + [env_dict]):
file_found = self.update(config)
if not file_found:
self._omitted_files.append(config)
def _config_filename(self, filename):
return os.path.join(os.getcwd(), filename)
def walk_and_render(self, c):
if isinstance(c, str):
return Template(c).render(**self)
elif isinstance(c, list):
return [self.walk_and_render(e) for e in c]
elif isinstance(c, dict):
for key in list(c.keys()):
c[key] = self.walk_and_render(c[key])
if key.endswith('_filepath'):
with open(c[key]) as f:
if c[key].endswith('.json'):
c[key.replace('_filepath', '')] = json.loads(f.read())
elif c[key].endswith('.yaml'):
c[key.replace('_filepath', '')] = yaml.safe_load(f.read())
return c
def update(self, new_config, basename=None, **kwargs):
if isinstance(new_config, AnyOf):
for config in new_config.args:
try:
return self.update(config.rstrip('?'))
except IOError:
pass
raise IOError('None of %s was found' % ', '.join(new_config.args))
elif isinstance(new_config, str):
optional = new_config.endswith('?')
filename = new_config.rstrip('?')
basename = basename or os.path.basename(filename)
config_filename = self._config_filename(filename)
data = None
if os.path.exists(config_filename) and os.access(config_filename, os.R_OK):
with open(config_filename) as f:
data = f.read()
if data is None:
if optional:
return False
else:
raise IOError(f'File {config_filename} not found')
if filename.endswith('.json'):
new_config = json.loads(data)
elif filename.endswith('.yaml'):
new_config = yaml.safe_load(data)
else:
raise UnknownConfigFormatError(filename)
new_config = self.walk_and_render(new_config)
elif isinstance(new_config, ModuleType):
new_config = new_config.__dict__
elif callable(new_config):
new_config = new_config(self)
if not new_config:
new_config = {}
for k in new_config:
if callable(new_config[k]):
new_config[k] = new_config[k](context=self)
if 'log_path' in new_config:
new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/')
smart_merge_dicts(self, new_config, list_policy='override', copy=False)
if basename:
self._by_basenames[basename] = new_config
return True
def get_config_by_basename(self, basename):
return self._by_basenames[basename]
def get_object_by_basename(self, basename):
return ConfigObject(self._by_basenames[basename])
def has_missed_configs(self):
return bool(self._omitted_files)
def has_file(self, basename):
return basename in self._by_basenames
def get_files(self):
return self._by_basenames
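The removed class above is replaced throughout this commit by the external `izihawa_configurator` package, which exposes the same interface. A minimal, hedged sketch of the construction pattern used by the `get_config()` helpers in this diff (the service name and file paths are hypothetical):

```python
from izihawa_configurator import Configurator  # external package this commit migrates to


# Hypothetical service; mirrors the get_config() helpers changed in this commit.
# A trailing '?' marks an optional config file (see Configurator.update above),
# and environment variables prefixed with MYSERVICE override values from the files.
def get_config():
    return Configurator([
        'myservice/configs/base.yaml',
        'myservice/configs/logging.yaml',
        'myservice/configs/production.yaml?',
    ], env_prefix='MYSERVICE')


config = get_config()
```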

View File

@ -1,2 +0,0 @@
class UnknownConfigFormatError(Exception):
pass

View File

@ -159,7 +159,6 @@ class ToSummaAction(BaseAction):
'journal',
'journal-issue',
'journal-volume',
'other',
'peer-review',
'proceedings',
'report-series',

View File

@ -35,7 +35,7 @@ py3_image(
requirement("aiobaseclient"),
requirement("aiocrossref"),
requirement("aiokit"),
"//library/configurator",
requirement("izihawa_configurator"),
"//library/logging",
"//library/telegram",
"//nexus/hub/aioclient",

View File

@ -1,5 +1,5 @@
from izihawa_configurator import Configurator
from izihawa_utils import env
from library.configurator import Configurator
def get_config():

View File

@ -104,7 +104,7 @@ class ViewHandler(BaseHandler):
),
event.delete(),
]
if not has_found_old_widget:
if not has_found_old_widget and is_earlier_than_2_days(old_message):
async with safe_execution(error_log=request_context.error_log):
await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id])
return await asyncio.gather(*actions)

View File

@ -189,6 +189,13 @@ schema:
record: basic
tokenizer: raw
stored: true
- name: series_page_rank
type: f64
options:
fast: single
fieldnorms: false
indexed: true
stored: true
multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"]
primary_key: "id"
stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or',

View File

@ -43,7 +43,7 @@ py3_image(
requirement("aioipfs-2"),
requirement("aiokit"),
"//library/aiopostgres",
"//library/configurator",
requirement("izihawa_configurator"),
"//library/telegram",
"//nexus/hub/proto:grpc_py",
"//nexus/hub/proto:proto_py",

View File

@ -1,5 +1,5 @@
from izihawa_configurator import Configurator
from izihawa_utils import env
from library.configurator import Configurator
def get_config():

View File

@ -1,26 +1,8 @@
---
pylon:
default_driver_proxy_list:
- [cambridge]
- [edinburg]
- [southampton]
default_resolver_proxy_list: ~
downloads_directory: /downloads
proxies:
- address: clash.default.svc.cluster.example.com:7890
name: cambridge
tags: ['cambridge']
- address: clash.default.svc.cluster.example.com:7990
name: edinburg
tags: ['edinburg']
- address: clash.default.svc.cluster.example.com:8090
name: southampton
tags: ['southampton']
- address: socks5://clash.default.svc.cluster.example.com:7991
name: socks5
tags: ['socks5']
sources:
# LibGen.rocks
# IPFS
- driver:
args:
proxy_list: ~
@ -29,37 +11,13 @@ pylon:
class:
nexus.pylon.drivers.DirectDriver
matcher:
md5: ^.*$
ipfs_multihashes: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{matched_group}'
group: 0
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
timeout: 25.0
type: regex
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{matched_group}'
group: 0
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
timeout: 25.0
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
format_string: 'http://nexus-ipfs-headless.default.svc.cluster.example.com:5001/api/v0/cat?arg={ipfs_multihashes[0]}'
headers_override: true
method: 'POST'
class: nexus.pylon.resolvers.TemplateResolver
# Library.lol
- driver:
args:
@ -74,20 +32,17 @@ pylon:
args:
extractors:
- producer:
format_string: '{matched_group}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0
format_string: '{href}'
timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex
- producer:
format_string: '{matched_group}'
group: 0
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
format_string: '{url}'
re: '(?P<url>https://ipfs.io/ipfs/[A-Za-z\d]+)'
type: regex
- producer:
format_string: '{matched_group}'
group: 0
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
format_string: '{url}'
re: '(?P<url>https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
type: regex
url: http://library.lol/main/{md5}
class: nexus.pylon.resolvers.RequestResolver
@ -103,13 +58,51 @@ pylon:
args:
extractors:
- producer:
format_string: '{matched_group}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0
format_string: '{href}'
timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex
url: 'http://library.lol/scimag/{doi}'
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
validator:
class: nexus.pylon.validators.Md5Validator
class:
nexus.pylon.drivers.DirectDriver
matcher:
md5: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=.*&key=[A-Za-z\d]+)'
type: regex
resolve_timeout: 25.0
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
# jamanetwork.com
- driver:
args:
@ -142,6 +135,7 @@ pylon:
resolver:
args:
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
resolve_timeout: 25.0
selector: '(.resource.primary.URL | split("/"))[-1]'
timeout: 40.0
class: nexus.pylon.resolvers.DoiOrgRequestResolver
@ -209,6 +203,13 @@ pylon:
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org
- matcher:
doi: ^10.1088/.*$
resolver:
args:
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org
- matcher:
doi: ^10.1088/.*$
resolver:
@ -249,6 +250,13 @@ pylon:
args:
timeout: 30.0
class: nexus.pylon.resolvers.TemplateResolver
# biorxiv.org
- matcher:
doi: ^10.1101/.*$
resolver:
args:
format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
class: nexus.pylon.resolvers.TemplateResolver
# journals.aps.org
- matcher:
doi: ^10.1103/.*$
@ -374,6 +382,13 @@ pylon:
args:
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
class: nexus.pylon.resolvers.TemplateResolver
# guilfordjournals.com
- matcher:
doi: ^10.1521/.*$
resolver:
args:
format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# bioone.org
- driver:
args:
@ -549,6 +564,24 @@ pylon:
args:
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# www.medwave.cl
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.5867/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://www.medwave.cl/{path}'
timeout: 25.0
re: 'href=\"/(?P<path>[\w/.\-_]+\.pdf)\">PDF</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# journal.permsc.ru
- driver:
args:
@ -609,6 +642,24 @@ pylon:
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.32920/.*$
# PKP Project
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.(5399|24905|31004|32729|37934)/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
timeout: 25.0
re: 'href=\"(?:https?://[\w.]+)/(?P<prefix>[\w./]+)/(?P<journal>[\w.]+)/article/view/(?P<key>\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# papers.cumincad.org
- driver:
args:
@ -621,9 +672,19 @@ pylon:
matcher:
doi: ^10.52842/.*$
# ^.*$
- matcher:
doi: ^.*$
resolver:
args:
selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
class: nexus.pylon.resolvers.DoiOrgRequestResolver
- matcher:
doi: ^.*$
resolver:
args:
selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL'
class: nexus.pylon.resolvers.DoiOrgRequestResolver
webdriver_hub:
downloads_directory: /downloads
endpoint: http://127.0.0.1:4444/wd/hub
host_downloads_directory: /downloads

View File

@ -5,9 +5,9 @@ import uvloop
from aiogrobid import GrobidClient
from aioipfs import AsyncIPFS as AsyncIPFS
from idm.api.aioclient import IdmApiGrpcClient
from izihawa_configurator import Configurator
from library.aiogrpctools import AioGrpcServer
from library.aiopostgres import AioPostgresPoolHolder
from library.configurator import Configurator
from library.logging import configure_logging
from library.telegram.base import BaseTelegramClient
from nexus.hub.configs import get_config

View File

@ -65,6 +65,7 @@ class BaseHubService(BaseService):
await asyncio.gather(
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True),
self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True),
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake3', only_hash=True),
)
))

View File

@ -73,13 +73,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
self.downloadings = set()
self.is_sharience_enabled = is_sharience_enabled
self.maintenance_picture_url = maintenance_picture_url
self.pylon_client = PylonClient(
proxies=pylon_config['proxies'],
source_configs=pylon_config['sources'],
default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
downloads_directory=pylon_config['downloads_directory'],
)
self.pylon_client = PylonClient(config=pylon_config)
self.should_parse_with_grobid = should_parse_with_grobid
self.should_store_hashes = should_store_hashes
self.telegram_bot_configs = telegram_bot_configs
@ -170,6 +164,15 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK)
async def delayed_task(create_task, t):
try:
await asyncio.sleep(t)
task = create_task()
await task
except asyncio.CancelledError:
pass
class DownloadTask:
def __init__(
self,
@ -204,7 +207,7 @@ class DownloadTask:
)
async def download_task(self, request_context: RequestContext, document_holder):
throttle_secs = 2.0
throttle_secs = 3.0
async def _on_fail():
await self.application.telegram_clients[request_context.bot_name].send_message(
@ -218,6 +221,7 @@ class DownloadTask:
error_log=request_context.error_log,
on_fail=_on_fail,
):
start_time = time.time()
filename = document_holder.get_filename()
progress_bar_download = ProgressBar(
telegram_client=self.application.telegram_clients[request_context.bot_name],
@ -226,9 +230,9 @@ class DownloadTask:
header=f'⬇️ {filename}',
tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
throttle_secs=throttle_secs,
last_call=start_time,
)
downloads_gauge.inc()
start_time = time.time()
try:
file = await self.download(
document_holder=document_holder,
@ -242,11 +246,21 @@ class DownloadTask:
)
if not document_holder.md5 and document_holder.get_extension() == 'pdf':
try:
await progress_bar_download.send_message(
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
ignore_last_call=True
processing_message_task = asyncio.create_task(delayed_task(
create_task=lambda: progress_bar_download.send_message(
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
ignore_last_call=True
),
t=5.0
))
file = await asyncio.get_running_loop().run_in_executor(
None,
lambda: clean_metadata(file, doi=document_holder.doi)
)
file = clean_metadata(file, doi=document_holder.doi)
processing_message_task.cancel()
await processing_message_task
request_context.statbox(
action='cleaned',
len=len(file),
@ -260,7 +274,8 @@ class DownloadTask:
banner=t("LOOKING_AT", request_context.chat.language),
header=f'⬇️ {filename}',
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
throttle_secs=throttle_secs
throttle_secs=throttle_secs,
last_call=progress_bar_download.last_call,
)
uploaded_message = await self.delivery_service.send_file(
document_holder=self.document_holder,
@ -393,11 +408,15 @@ class DownloadTask:
async def download(self, document_holder, progress_bar):
collected = bytearray()
if document_holder.doi:
try:
params = {'doi': document_holder.doi}
if document_holder.md5:
params['md5'] = document_holder.md5
params = {}
try:
if document_holder.doi:
params['doi'] = document_holder.doi
if document_holder.md5:
params['md5'] = document_holder.md5
if document_holder.ipfs_multihashes:
params['ipfs_multihashes'] = [ipfs_multihash for ipfs_multihash in document_holder.ipfs_multihashes]
if params:
async for resp in self.delivery_service.pylon_client.download(params):
await self.process_resp(
resp=resp,
@ -406,20 +425,8 @@ class DownloadTask:
filesize=document_holder.filesize,
)
return bytes(collected)
except DownloadError:
pass
if document_holder.md5:
try:
async for resp in self.delivery_service.pylon_client.download({'md5': document_holder.md5}):
await self.process_resp(
resp=resp,
progress_bar=progress_bar,
collected=collected,
filesize=document_holder.filesize,
)
return bytes(collected)
except DownloadError:
pass
except DownloadError:
pass
async def external_cancel(self):
self.request_context.statbox(action='externally_canceled')

View File

@ -27,7 +27,7 @@ py3_image(
requirement("aiokit"),
requirement("aiolibgen"),
"//library/aiopostgres",
"//library/configurator",
requirement("izihawa_configurator"),
"//library/jobber",
"//nexus/actions",
],

View File

@ -39,8 +39,8 @@ class CrossrefApiJob(BaseJob):
})
count = 0
async for chunk in self.crossref_client.works_cursor(
filter=f'from-index-date:{self.from_date}',
rows=500,
filter=f'from-index-date:{self.from_date}',
rows=500,
):
for item in chunk['items']:
yield item
@ -50,4 +50,4 @@ class CrossrefApiJob(BaseJob):
'mode': 'ingest',
'items': count,
'target_date': self.from_date,
})
})

View File

@ -34,6 +34,7 @@ class PostgresJob(BaseJob):
f'user={database["username"]} '
f'password={database["password"]} '
f'host={database["host"]}',
timeout=3600 * 2,
)
self.summa_client = SummaClient(endpoint=summa['endpoint'])
self.summa_config = summa
@ -84,6 +85,7 @@ class PostgresJob(BaseJob):
# Mandatory for server side cursor
cursor_name='nexus_ingest_cursor',
itersize=50_000,
statement_timeout=3600 * 2,
):
loaded = True
yield row
@ -95,8 +97,12 @@ class PostgresJob(BaseJob):
# Mandatory for server side cursor
cursor_name='nexus_ingest_cursor',
itersize=50_000,
statement_timeout=3600 * 2,
):
yield row
await self.summa_client.commit_index(self.summa_config['name'], session_id=session_id)
await self.summa_client.commit_index(
self.summa_config['name'],
session_id=session_id,
)
await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id)

View File

@ -25,7 +25,7 @@ DEPS = [
"//library/aiogrpctools",
requirement("aiokit"),
"//library/aiopostgres",
"//library/configurator",
requirement("izihawa_configurator"),
"//library/logging",
"//nexus/meta_api/proto:grpc_py",
"//nexus/models/proto:proto_py",

View File

@ -1,5 +1,5 @@
from izihawa_configurator import Configurator
from izihawa_utils import env
from library.configurator import Configurator
def get_config():

View File

@ -315,7 +315,7 @@ class SearchService(SearchServicer, BaseService):
with suppress(RetryError):
async for attempt in AsyncRetrying(
retry=retry_if_exception_type(NeedRetryError),
wait=wait_fixed(5),
wait=wait_fixed(10),
stop=stop_after_attempt(6)
):
with attempt:

View File

@ -33,4 +33,5 @@ message Scimag {
string volume = 21;
int32 year = 30;
float page_rank = 34;
float series_page_rank = 35;
}

View File

@ -26,7 +26,7 @@ py3_image(
requirement("aiocrossref"),
requirement("aiokit"),
"//library/aiopostgres",
"//library/configurator",
requirement("izihawa_configurator"),
"//library/logging",
"//nexus/actions",
"//nexus/models/proto:proto_py",

View File

@ -10,6 +10,6 @@ py_library(
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
"//library/configurator",
requirement("izihawa_configurator"),
],
)

View File

@ -1,4 +1,4 @@
from library.configurator import Configurator
from izihawa_configurator import Configurator
def get_promotions():

View File

@ -19,6 +19,9 @@ promotions:
- texts:
en: 💬 Research is the only and ultimate goal
weight: 1
- texts:
en: 💬 Intellectual property is not a valid form of property
weight: 1
- texts:
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group})
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group})

View File

@ -1,12 +1,16 @@
load("@rules_python//python:defs.bzl", "py_binary", "py_library")
load("@rules_python//python:defs.bzl", "py_library")
load("@rules_python//python:packaging.bzl", "py_wheel")
load("@pip_modules//:requirements.bzl", "requirement")
filegroup(
name = "data",
srcs = ["configs/pylon.yaml"],
)
py_library(
name = "pylon",
srcs = glob(["**/*.py"]),
data = [
"configs/pylon.yaml",
],
data = [":data"],
visibility = ["//visibility:public"],
deps = [
requirement("aiodns"),
@ -16,6 +20,7 @@ py_library(
requirement("brotli"),
requirement("cchardet"),
requirement("certifi"),
requirement("fire"),
requirement("jq"),
requirement("orjson"),
requirement("pypdf2"),
@ -23,20 +28,38 @@ py_library(
requirement("selenium"),
requirement("tenacity"),
requirement("aiokit"),
"//library/configurator",
requirement("izihawa_configurator"),
"//library/logging",
"//nexus/pylon/proto:pylon_proto_py",
],
)
py_binary(
name = "cli",
srcs = ["cli.py"],
main = "cli.py",
srcs_version = "PY3",
visibility = ["//visibility:public"],
py_wheel(
name = "nexus-pylon-wheel",
author = "The Superpirate",
author_email = "fist.of.the.first.pirates@gmail.com",
classifiers = [
"Programming Language :: Python :: 3.10",
],
description_file = ":README.md",
distribution = "nexus-pylon-wheel",
entry_points = {"console_scripts": ["pylon = nexus.pylon.cli:main"]},
homepage = "https://github.com/nexus-stc/hyperboria/tree/master/nexus/pylon",
license = "MIT License",
python_requires = ">=3.10",
python_tag = "py3",
requires = [
"aiokit >= 1.0.0",
"izihawa_configurator >= 1.0.0",
"selenium >= 4.3.0",
],
strip_path_prefixes = [
"nexus/pylon/proto/pylon_proto_py_pb",
],
version = "1.0.0",
deps = [
requirement("fire"),
":data",
":pylon",
"//nexus/pylon/proto:pylon_proto_py",
],
)

View File

@ -6,16 +6,51 @@
- Streams data by chunks
- GRPC-ready
## Build
```bash
bazel build -c opt nexus-pylon-wheel
```
## Install
### PIP
```bash
pip install nexus-pylon
```
## Nexus Pylon CLI
Casual download
```bash
bazel run -c opt cli -- doi 10.1056/NEJMoa2033700 --output article.pdf
```
Download scientific publication:
```bash
pylon download --doi 10.1182/blood-2011-03-325258 --output article.pdf
```
Download with proxies
```bash
bazel run -c opt cli -- md5 278C3A72B7B04717361501B8642857DF \
--output file.pdf \
--proxies socks5://127.0.0.1:9050
```
Download file by its MD5:
```bash
pylon download --md5 f07707ee92fa675fd4ee53e3fee977d1 --output article.pdf
```
Download file by its multihash:
```bash
pylon download --ipfs-multihashes '["bafykbzacea3vduqii3u52xkzdqan5oc54vsvedmed25dfybrqxyafahjl3rzu"]' --output article.pdf
```
### Using with Selenium
Create a directory for exchanging files between the host and Selenium launched in Docker
```bash
mkdir downloads
```
Launch Selenium in Docker
```bash
docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest
```
Launch Pylon
```bash
pylon download --doi 10.1101/2022.09.09.507349 --output article.pdf \
--wd-endpoint 'http://127.0.0.1:4444/wd/hub' \
--wd-directory /downloads --wd-host-directory $(pwd)/downloads --debug
```
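The same flow is available from Python. Below is a minimal, hedged sketch based on `cli.py` and `client.py` from this commit; the config path and DOI are placeholders, and AioThing lifecycle handling plus retransmission resets are omitted.

```python
import asyncio

from izihawa_configurator import Configurator
from nexus.pylon.client import DownloadError, PylonClient


async def download_pdf(doi: str, output: str) -> None:
    # Same construction as in cli.py: load the bundled pylon.yaml and take the 'pylon' section.
    config = Configurator(['nexus/pylon/configs/pylon.yaml'], env_prefix='NEXUS_PYLON')['pylon']
    client = PylonClient(config=config)
    collected = bytearray()
    try:
        async for resp in client.download(params={'doi': doi}):
            if resp.HasField('chunk'):
                collected += resp.chunk.content
        with open(output, 'wb') as f:
            f.write(bytes(collected))
    except DownloadError:
        print('File not found')


asyncio.run(download_pdf('10.1182/blood-2011-03-325258', 'article.pdf'))
```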

View File

@ -1,15 +1,17 @@
import logging
import os
import sys
from typing import Optional
import fire
from aiokit.utils import sync_fu
from nexus.pylon.client import (
from izihawa_configurator import Configurator
from .client import (
DownloadError,
PylonClient,
)
from nexus.pylon.configs import get_config
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
from .proto.file_pb2 import FileResponse as FileResponsePb
def resolve_path(filepath):
@ -27,22 +29,20 @@ async def fetch(
collected = bytes()
try:
last_len = 0
last_source = ''
async for resp in iter:
if resp.HasField('status'):
if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION:
print(f'Started transmission from {resp.source}...', end='\r', file=sys.stderr)
print(f'Started transmission...', file=sys.stderr)
last_len = 0
last_source = resp.source
collected = bytes()
elif resp.HasField('chunk'):
if len(collected) - last_len > 1024 * 100:
print(f'Loaded {len(collected)} bytes from {resp.source}', end='\r', file=sys.stderr)
print(f'Loaded {len(collected)} bytes', end='\r', file=sys.stderr)
last_len = len(collected)
last_source = resp.source
collected += resp.chunk.content
with open(resolve_path(output), 'wb') as f:
print(f'Completed! Loaded {len(collected)} bytes from {last_source}', file=sys.stderr)
print()
print(f'Completed! Loaded {len(collected)} bytes', file=sys.stderr)
f.write(collected)
except DownloadError:
print('File not found')
@ -50,25 +50,53 @@ async def fetch(
async def download(
output: str,
config: Optional[str] = None,
debug: bool = False,
wd_endpoint: Optional[str] = None,
wd_directory: Optional[str] = None,
wd_host_directory: Optional[str] = None,
**params,
):
"""
Download scientific publications from various sources
A large portion of fresh articles can be retrieved only through publisher libraries via `BrowserDriver`, which
requires a Selenium webdriver:
`docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest`
Args:
output: name of the output file
config: pylon config
debug: enable debug logging
wd_endpoint: web-driver hub endpoint
wd_directory: mounted directory inside Docker image
wd_host_directory: directory for downloads on the host that should be mounted as `wd_directory` inside the Docker image
"""
if debug:
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
c = get_config()['pylon']
p = PylonClient(
proxies=c['proxies'],
source_configs=c['sources'],
default_driver_proxy_list=c['default_driver_proxy_list'],
downloads_directory=c['downloads_directory'],
)
return await fetch(iter=p.download(params=params), output=output)
default_config_path = os.path.join(os.path.dirname(__file__), 'configs/pylon.yaml')
config = Configurator([config if config else default_config_path], env_prefix='NEXUS_PYLON')
config = config['pylon']
if wd_endpoint:
config.setdefault('webdriver_hub', {})
config['webdriver_hub']['endpoint'] = wd_endpoint
if not wd_directory:
raise ValueError('Should pass --wd-directory with --wd-endpoint')
config['webdriver_hub']['downloads_directory'] = wd_directory
if not wd_host_directory:
raise ValueError('Should pass --wd-host-directory with --wd-endpoint')
config['webdriver_hub']['host_downloads_directory'] = wd_host_directory
pylon_client = PylonClient(config=config)
return await fetch(iter=pylon_client.download(params=params), output=output)
def main():
fire.Fire({
'download': sync_fu(download),
})
try:
fire.Fire({
'download': sync_fu(download),
})
except KeyboardInterrupt:
sys.exit(1)
if __name__ == '__main__':

View File

@ -1,12 +1,10 @@
import logging
from typing import (
AsyncIterable,
Dict,
List,
Optional,
)
from aiokit import AioThing
from library.logging import error_log
from nexus.pylon.exceptions import (
DownloadError,
NotFoundError,
@ -17,30 +15,25 @@ from nexus.pylon.source import Source
class PylonClient(AioThing):
def __init__(
self,
source_configs: Optional[List],
proxies: Optional[List[str]] = None,
downloads_directory: Optional[str] = None,
default_driver_proxy_list: [Optional[List]] = None,
default_resolver_proxy_list: [Optional[List]] = None,
):
def __init__(self, config):
super().__init__()
self.proxy_manager = ProxyManager(proxies)
self.downloads_directory = downloads_directory
self.default_driver_proxy_list = default_driver_proxy_list
self.default_resolver_proxy_list = default_resolver_proxy_list
self.config = config
self.proxy_manager = ProxyManager(config.get('proxies'))
self.sources = []
for source_config in source_configs:
if config.get('webdriver_hub') is None:
logging.getLogger('nexus_pylon').warning({
'action': 'missed_webdriver',
'mode': 'pylon',
})
for source_config in config['sources']:
source = Source.from_config(
proxy_manager=self.proxy_manager,
config=self.config,
source_config=source_config,
downloads_directory=downloads_directory,
default_driver_proxy_list=default_driver_proxy_list,
default_resolver_proxy_list=default_resolver_proxy_list,
)
self.sources.append(source)
self.starts.append(source)
if source:
self.sources.append(source)
self.starts.append(source)
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
for source in self.sources:
@ -50,9 +43,10 @@ class PylonClient(AioThing):
async for resp in source.download(params):
yield resp
return
except NotFoundError:
except NotFoundError as e:
logging.getLogger('nexus_pylon').debug(e)
continue
except DownloadError as e:
error_log(e)
logging.getLogger('nexus_pylon').warning(e)
continue
raise NotFoundError()
raise NotFoundError(params=params)

View File

@ -1,11 +0,0 @@
from izihawa_utils import env
from library.configurator import Configurator
def get_config():
return Configurator([
'nexus/pylon/configs/pylon.yaml',
], env_prefix='NEXUS_PYLON')
config = get_config()

View File

@ -1,65 +1,25 @@
---
pylon:
default_driver_proxy_list:
- [proxy1]
- [proxy2]
- [proxy3]
downloads_directory: /downloads
proxies:
- address: proxy1.net:7890
name: proxy1
tags: [proxy1]
- address: proxy2.net:7990
name: proxy2
tags: [proxy2]
- address: proxy3.net:8090
name: proxy3
tags: [proxy3]
default_driver_proxy_list: ~
default_resolver_proxy_list: ~
proxies: ~
sources:
# LibGen.rocks
# IPFS
- driver:
args:
proxy_list: ~
validator:
class: nexus.pylon.validators.Md5Validator
class: nexus.pylon.validators.BaseValidator
class:
nexus.pylon.drivers.DirectDriver
matcher:
md5: ^.*$
ipfs_multihashes: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{matched_group}'
group: 0
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
timeout: 25.0
type: regex
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{matched_group}'
group: 0
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
timeout: 25.0
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
format_string: 'https://ipfs.io/ipfs/{ipfs_multihashes[0]}'
class: nexus.pylon.resolvers.TemplateResolver
# Library.lol
- driver:
args:
proxy_list: ~
validator:
class: nexus.pylon.validators.Md5Validator
class:
@ -70,27 +30,22 @@ pylon:
args:
extractors:
- producer:
format_string: '{matched_group}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0
format_string: '{href}'
timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex
- producer:
format_string: '{matched_group}'
group: 0
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
format_string: '{url}'
re: '(?P<url>https://ipfs.io/ipfs/[A-Za-z\d]+)'
type: regex
- producer:
format_string: '{matched_group}'
group: 0
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
format_string: '{url}'
re: '(?P<url>https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
type: regex
url: http://library.lol/main/{md5}
class: nexus.pylon.resolvers.RequestResolver
# library.lol
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
@ -99,13 +54,48 @@ pylon:
args:
extractors:
- producer:
format_string: '{matched_group}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0
format_string: '{href}'
timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex
url: 'http://library.lol/scimag/{doi}'
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
validator:
class: nexus.pylon.validators.Md5Validator
class:
nexus.pylon.drivers.DirectDriver
matcher:
md5: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=.*&key=[A-Za-z\d]+)'
type: regex
resolve_timeout: 25.0
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
# jamanetwork.com
- driver:
args:
@ -206,6 +196,13 @@ pylon:
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org
- matcher:
doi: ^10.1088/.*$
resolver:
args:
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org
- matcher:
doi: ^10.1088/.*$
resolver:
@ -220,10 +217,6 @@ pylon:
timeout: 30
type: wait_css_selector
- type: click
proxy_list:
- [proxy2]
- [proxy1]
- [proxy3]
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.1093/.*$
@ -246,6 +239,13 @@ pylon:
args:
timeout: 30.0
class: nexus.pylon.resolvers.TemplateResolver
# biorxiv.org
- matcher:
doi: ^10.1101/.*$
resolver:
args:
format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
class: nexus.pylon.resolvers.TemplateResolver
# journals.aps.org
- matcher:
doi: ^10.1103/.*$
@ -332,6 +332,13 @@ pylon:
args:
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# www.ahajournals.org
- matcher:
doi: ^10.1161/.*$
resolver:
args:
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# ajp.psychiatryonline.org
- matcher:
doi: ^10.1176/.*$
@ -355,8 +362,6 @@ pylon:
class: nexus.pylon.resolvers.TemplateResolver
# journals.plos.org
- driver:
args:
proxy_list: ~
class: nexus.pylon.drivers.direct.DirectDriver
matcher:
doi: ^10.1371/.*$
@ -364,6 +369,13 @@ pylon:
args:
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
class: nexus.pylon.resolvers.TemplateResolver
# guilfordjournals.com
- matcher:
doi: ^10.1521/.*$
resolver:
args:
format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# bioone.org
- driver:
args:
@ -396,8 +408,6 @@ pylon:
doi: ^10.2139/.*$
# www.afghandata.org
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
@ -503,8 +513,6 @@ pylon:
doi: ^10.5334/.*$
# hess.copernicus.org
- driver:
args:
proxy_list: ~
class: nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.5194/.*$
@ -524,7 +532,6 @@ pylon:
- selector: '.uxf-download'
type: wait_css_selector
- type: click
proxy_list: ~
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.5585/.*
@ -539,6 +546,22 @@ pylon:
args:
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# www.medwave.cl
- driver:
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.5867/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://www.medwave.cl/{path}'
timeout: 25.0
re: 'href=\"/(?P<path>[\w/.\-_]+\.pdf)\">PDF</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# journal.permsc.ru
- driver:
args:
@ -584,8 +607,6 @@ pylon:
doi: ^10.21203/.*$
# www.ukm.my/
- driver:
args:
proxy_list: ~
class: nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.24035/.*$
@ -599,6 +620,22 @@ pylon:
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.32920/.*$
# PKP Project
- driver:
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.(5399|24905|31004|32729|37934)/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
timeout: 25.0
re: 'href=\"(?:https?://[\w.]+)/(?P<prefix>[\w./]+)/(?P<journal>[\w.]+)/article/view/(?P<key>\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# papers.cumincad.org
- driver:
args:
@ -606,11 +643,16 @@ pylon:
- selector: 'file.pdf'
type: wait_link_text
- type: click
proxy_list: ~
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.52842/.*$
# ^.*$
- matcher:
doi: ^.*$
resolver:
args:
selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
class: nexus.pylon.resolvers.DoiOrgRequestResolver
- matcher:
doi: ^.*$
resolver:

View File

@ -4,22 +4,22 @@ from typing import (
Optional,
)
from izihawa_utils.importlib import import_object
from nexus.pylon.network_agent import NetworkAgent
from nexus.pylon.prepared_request import PreparedRequest
from nexus.pylon.proxy_manager import ProxyManager
from nexus.pylon.validators.base import BaseValidator
from utils.izihawa_utils.importlib import import_object
class BaseDriver(NetworkAgent):
def __init__(
self,
config,
validator=None,
downloads_directory: str = '/downloads',
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.config = config
validator_cls = 'nexus.pylon.validators.PdfValidator'
if validator and 'class' in validator:
@ -27,7 +27,6 @@ class BaseDriver(NetworkAgent):
validator_cls = import_object(validator_cls)
self.validator = validator_cls
self.downloads_directory = downloads_directory
def __str__(self):
return self.__class__.__name__

View File

@ -32,21 +32,20 @@ from selenium.webdriver.support.ui import WebDriverWait
class BrowserDriver(BaseDriver):
def __init__(
self,
config,
validator=None,
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
actions: Optional[List] = None,
downloads_directory='/downloads',
window_size: Tuple[int, int] = (1279, 833),
erase_webdrive_property: bool = True,
webdrive_hub_endpoint: str = "http://127.0.0.1:4444/wd/hub",
):
super().__init__(validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
super().__init__(config=config, validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
self.actions = actions
self.downloads_directory = Path(downloads_directory)
self.window_size = window_size
self.erase_webdrive_property = erase_webdrive_property
self.webdrive_hub_endpoint = webdrive_hub_endpoint
self.downloads_directory = Path(config['webdriver_hub']['downloads_directory'])
self.host_downloads_directory = Path(config['webdriver_hub']['host_downloads_directory'])
self.window_size = tuple(config['webdriver_hub'].get('window_size', [1279, 833]))
self.erase_webdriver_property = config['webdriver_hub'].get('erase_webdriver_property', True)
self.webdriver_hub_endpoint = config['webdriver_hub']['endpoint']
self.file_poll_timeout = 2.0
async def get_chrome_sessions(self):
proxies = list(
@ -55,15 +54,14 @@ class BrowserDriver(BaseDriver):
else [None]
)
for proxy in proxies:
downloads_folder = self.downloads_directory / random_string(16)
os.mkdir(downloads_folder)
os.chmod(downloads_folder, 0o777)
chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_folder))
try:
yield chrome, downloads_folder
finally:
shutil.rmtree(downloads_folder)
chrome.quit()
subdirectory = random_string(16)
downloads_directory = self.downloads_directory / subdirectory
host_downloads_directory = self.host_downloads_directory / subdirectory
os.mkdir(host_downloads_directory)
os.chmod(host_downloads_directory, 0o777)
chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_directory))
yield chrome, host_downloads_directory
def setup_chrome(self, proxy, downloads_folder):
options = webdriver.ChromeOptions()
@ -85,13 +83,13 @@ class BrowserDriver(BaseDriver):
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-popup-blocking")
chrome = webdriver.Remote(
self.webdrive_hub_endpoint,
self.webdriver_hub_endpoint,
DesiredCapabilities.CHROME,
options=options,
)
chrome.set_window_size(self.window_size[0], self.window_size[1])
if self.erase_webdrive_property:
if self.erase_webdriver_property:
resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id
url = chrome.command_executor._url + resource
body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': {
@ -103,7 +101,7 @@ class BrowserDriver(BaseDriver):
}})
chrome.command_executor._request('POST', url, body)
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'start_chrome',
'mode': 'pylon',
'proxy': str(proxy) if proxy is not None else None,
@ -148,32 +146,19 @@ class BrowserDriver(BaseDriver):
and downloaded_offset == current_offset
and current_offset > 0
):
logging.getLogger('debug').debug({
'action': 'sent',
'mode': 'pylon',
'filename': filename,
})
return
logging.getLogger('debug').debug({
'action': 'send_part',
'mode': 'pylon',
'current_offset': current_offset,
'downloaded_offset': downloaded_offset,
'filename': filename,
})
await file.seek(current_offset)
yield await file.read(downloaded_offset - current_offset)
current_offset = downloaded_offset
await asyncio.sleep(0.5)
await asyncio.sleep(self.file_poll_timeout)
raise NotFoundError()
finally:
await file.close()
def get(self, chrome, url, params):
logging.getLogger('debug').debug({
'action': 'get',
logging.getLogger('nexus_pylon').debug({
'action': 'download',
'mode': 'pylon',
'url': url,
})
@ -190,11 +175,6 @@ class BrowserDriver(BaseDriver):
if not last_element:
raise RuntimeError('Nothing to click')
chrome.execute_script("arguments[0].click();", last_element)
logging.getLogger('debug').debug({
'action': 'clicked',
'mode': 'pylon',
'element': str(last_element),
})
case 'close_window':
current_window = previous_window
previous_window = None
@ -204,11 +184,6 @@ class BrowserDriver(BaseDriver):
if not last_element:
raise RuntimeError('Nothing to click')
last_element.click()
logging.getLogger('debug').debug({
'action': 'native_clicked',
'mode': 'pylon',
'element': str(last_element),
})
case 'switch_to_new_window':
previous_window = current_window
current_window = chrome.window_handles[-1]
@ -227,12 +202,6 @@ class BrowserDriver(BaseDriver):
action['selector'],
))
)
logging.getLogger('debug').debug({
'action': 'waited_css_selector',
'mode': 'pylon',
'element': str(last_element),
'step': action
})
case 'wait_link_text':
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
EC.presence_of_element_located((
@ -240,12 +209,6 @@ class BrowserDriver(BaseDriver):
action['selector'],
))
)
logging.getLogger('debug').debug({
'action': 'waited_link_text',
'mode': 'pylon',
'element': str(last_element),
'step': action
})
case 'wait_xpath':
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
EC.presence_of_element_located((
@ -253,16 +216,10 @@ class BrowserDriver(BaseDriver):
action['selector'],
))
)
logging.getLogger('debug').debug({
'action': 'waited_xpath',
'mode': 'pylon',
'element': str(last_element),
'step': action
})
case _:
raise NotImplementedError('Not implemented action type')
except WebDriverException as e:
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'error',
'mode': 'pylon',
'error': str(e),
@ -294,15 +251,17 @@ class BrowserDriver(BaseDriver):
source=chrome.current_url,
)
file_validator.validate()
logging.getLogger('debug').debug({
'action': 'validated',
'mode': 'pylon',
'url': prepared_file_request.url,
})
return
except NotFoundError:
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'no_response',
'mode': 'pylon',
})
finally:
logging.getLogger('nexus_pylon').debug({
'action': 'quit_chrome',
'mode': 'pylon',
})
chrome.quit()
shutil.rmtree(downloads_folder)
raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self))

View File

@ -1,3 +1,4 @@
import logging
from typing import Dict
import aiohttp.client_exceptions
@ -25,12 +26,27 @@ class DirectDriver(BaseDriver):
@retry(
reraise=True,
wait=wait_random(min=1, max=2),
stop=stop_after_attempt(7),
stop=stop_after_attempt(3),
retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)),
)
async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict):
logging.debug({
'action': 'download',
'mode': 'pylon',
'params': params,
'source': str(self),
'url': prepared_file_request.url,
})
async with self.get_session() as session:
async with prepared_file_request.execute_with(session=session) as resp:
logging.debug({
'action': 'response',
'mode': 'pylon',
'params': params,
'source': str(self),
'url': prepared_file_request.url,
'status': resp.status,
})
if resp.status == 404:
raise NotFoundError(url=prepared_file_request.url)
elif (

View File

@ -1,5 +1,8 @@
import re
import sys
from typing import (
List,
Tuple,
)
class Matcher:
@ -10,8 +13,11 @@ class Matcher:
def is_match(self, params) -> bool:
for param in params:
if params[param]:
if param_regex := self.param_regexes.get(param):
if re.match(param_regex, params[param]):
return True
return False
param_value = params[param]
param_regex = self.param_regexes.get(param)
if param_value and param_regex:
if not isinstance(param_value, (List, Tuple)):
param_value = [param_value]
for el in param_value:
if re.match(param_regex, el):
return el
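A small illustration (not from the commit) of how the reworked matcher handles scalar and list-valued params; the regexes mirror entries from `pylon.yaml` and the values are made up:

```python
from nexus.pylon.matcher import Matcher

# Hypothetical values; Matcher is constructed from a source's 'matcher' section,
# as in Source.from_config later in this diff.
matcher = Matcher({'doi': '^10.1088/.*$', 'ipfs_multihashes': '^.*$'})

# Scalar param: the matching value itself is returned instead of a bare True.
print(matcher.is_match({'doi': '10.1088/0004-637X/749/1/1'}))

# List-valued param: each element is tried against the regex in turn.
print(matcher.is_match({'ipfs_multihashes': ['bafy...', 'Qm...']}))
```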

View File

@ -10,7 +10,7 @@ import aiohttp
from aiohttp import ClientSession
from aiohttp.client_reqrep import ClientRequest
from aiohttp_socks import ProxyConnector
from library.aiokit.aiokit import AioThing
from aiokit import AioThing
from nexus.pylon.proxy_manager import (
AllOf,
AnyOf,

View File

@ -228,7 +228,7 @@ class BasePdfProcessor:
try:
page = self.process_page(page, pdf_reader)
except (PdfStreamError, binascii.Error) as e:
logging.getLogger('warning').warning({
logging.getLogger('nexus_pylon').warning({
'action': 'pdf_stream_error',
'mode': 'pylon',
'error': str(e),
@ -259,7 +259,7 @@ class WatermarkEraser1(BaseWatermarkEraser):
if self.is_watermark_predicate(text.encode()):
xobj_death_note.append(operands[0])
operations_death_note.append(op_i)
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'text': text,
@ -289,7 +289,7 @@ class WatermarkEraser2(BaseWatermarkEraser):
if operation == b"Tj":
if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]):
operations_death_note.append(op_i)
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'text': operands[0].decode(),
@ -319,7 +319,7 @@ class WatermarkEraser3(BaseWatermarkEraser):
text += operand
if self.is_watermark_predicate(text):
operations_death_note.append(op_i)
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'text': text.decode(),
@ -402,7 +402,7 @@ class WatermarkEraser4(BaseWatermarkEraser):
text, matched = tc.match(self.regexp)
if matched:
operations_death_note.extend(matched)
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'matched': text,

View File

@ -1,4 +1,5 @@
import asyncio
import logging
from contextlib import asynccontextmanager
from typing import Optional
@ -23,6 +24,7 @@ class PreparedRequest:
cookies: Optional[dict] = None,
ssl: bool = True,
timeout: Optional[float] = None,
headers_override: bool = False
):
self.method = method
self.url = url
@ -32,6 +34,8 @@ class PreparedRequest:
}
if headers:
self.headers.update(headers)
if headers_override:
self.headers = headers or {}
self.params = params
self.cookies = cookies
self.ssl = ssl
@ -49,6 +53,13 @@ class PreparedRequest:
@asynccontextmanager
async def execute_with(self, session):
try:
logging.getLogger('nexus_pylon').debug({
'action': 'request',
'mode': 'pylon',
'url': self.url,
'method': self.method,
'headers': self.headers,
})
async with session.request(
method=self.method,
url=self.url,

View File

@ -54,6 +54,8 @@ class Proxy:
class ProxyManager:
def __init__(self, proxies=None):
if proxies is None:
proxies = []
self.proxies = [Proxy(proxy) for proxy in proxies]
def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy:

View File

@ -1,5 +1,6 @@
import json
import logging
import sys
from typing import (
AsyncIterable,
Dict,
@ -50,20 +51,25 @@ class DoiOrgRequestResolver(BaseResolver):
method='get',
url=doi_url,
timeout=self.resolve_timeout,
headers={'Accept': 'application/json'}
headers={
'Accept': 'application/json',
}
).execute_with(session=session) as resp:
return await resp.json()
async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]:
body = await self.resolve_through_doi_org(params)
selected = None
try:
selected = json.loads(self.selector.input(body).text())
if text := self.selector.input(body).text():
selected = json.loads(text)
except ValueError as e:
logging.getLogger('error').error({
logging.getLogger('nexus_pylon').error({
'action': 'error',
'mode': 'pylon',
'params': params,
'error': str(e)
'error': str(e),
'selector': str(self.selector),
})
return
if selected:
@ -73,7 +79,7 @@ class DoiOrgRequestResolver(BaseResolver):
timeout=self.timeout,
)
else:
logging.getLogger('debug').error({
logging.getLogger('nexus_pylon').debug({
'action': 'missed_selector',
'mode': 'pylon',
'params': params,

View File

@ -15,10 +15,12 @@ class RequestResolver(BaseResolver):
self,
url: str,
extractors: List,
resolve_timeout: float = 10.0,
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.resolve_timeout = resolve_timeout
self.url = url
self.extractors = extractors
@ -31,9 +33,9 @@ class RequestResolver(BaseResolver):
async with PreparedRequest(
method='get',
url=url,
timeout=10.0,
timeout=self.resolve_timeout,
).execute_with(session=session) as resp:
# Sometimes sci-hub returns file
# Sometimes hosts return file URL
if resp.headers.get('Content-Type') == 'application/pdf':
yield PreparedRequest(method='get', url=url, timeout=10.0)
downloaded_page_bytes = await resp.read()
@ -42,9 +44,11 @@ class RequestResolver(BaseResolver):
for extractor in self.extractors:
match = re.search(extractor['re'], downloaded_page, re.IGNORECASE)
if match:
matched_group = match.group(extractor['producer']['group'])
yield PreparedRequest(
method='get',
url=extractor['producer']['format_string'].format(matched_group=matched_group),
url=extractor['producer']['format_string'].format(
host=resp.real_url.host,
**match.groupdict()
),
timeout=extractor['producer'].get('timeout', 10.0),
)

View File

@ -14,19 +14,27 @@ class TemplateResolver(BaseResolver):
self,
format_string: str = 'https://doi.org/{doi}',
timeout: float = 10.0,
method: str = 'GET',
headers: Optional[dict] = None,
headers_override: bool = False,
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.format_string = format_string
self.timeout = timeout
self.method = method
self.headers = headers
self.headers_override = headers_override
def __str__(self):
return f'{self.__class__.__name__}({self.format_string})'
async def resolve(self, params) -> AsyncIterable[PreparedRequest]:
yield PreparedRequest(
method='GET',
method=self.method,
url=self.format_string.format(**params),
timeout=self.timeout,
headers=self.headers,
headers_override=self.headers_override,
)
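A hedged sketch (not part of the commit) of what this resolver yields for one of the `pylon.yaml` entries added above; the DOI is a placeholder:

```python
import asyncio

from nexus.pylon.resolvers import TemplateResolver


async def show_request():
    # Mirrors the biorxiv.org entry added to pylon.yaml in this commit.
    resolver = TemplateResolver(format_string='https://www.biorxiv.org/content/{doi}.full.pdf')
    async for prepared in resolver.resolve({'doi': '10.1101/2022.09.09.507349'}):
        print(prepared.method, prepared.url)


asyncio.run(show_request())
```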

View File

@ -2,12 +2,12 @@ import logging
from typing import (
AsyncIterable,
Dict,
List,
Optional,
)
from aiohttp.client_exceptions import ClientPayloadError
from library.aiokit.aiokit import AioThing
from library.logging import error_log
from aiokit import AioThing
from izihawa_utils.importlib import import_object
from nexus.pylon.drivers.base import BaseDriver
from nexus.pylon.exceptions import (
DownloadError,
@ -16,7 +16,6 @@ from nexus.pylon.exceptions import (
from nexus.pylon.matcher import Matcher
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
from nexus.pylon.resolvers.base import BaseResolver
from utils.izihawa_utils.importlib import import_object
class Source(AioThing):
@ -29,12 +28,15 @@ class Source(AioThing):
@classmethod
def from_config(
cls,
proxy_manager,
config,
source_config,
downloads_directory: str,
default_driver_proxy_list: List,
default_resolver_proxy_list: List,
) -> 'Source':
proxy_manager,
) -> Optional['Source']:
driver_cls_name = source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver')
if driver_cls_name.endswith('BrowserDriver') and config.get('webdriver_hub') is None:
return None
matcher = Matcher(source_config['matcher'])
resolver_cls = import_object(
@ -42,16 +44,16 @@ class Source(AioThing):
)
resolver_args = dict(
proxy_manager=proxy_manager,
proxy_list=default_resolver_proxy_list,
proxy_list=config['default_resolver_proxy_list'],
)
resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
resolver = resolver_cls(**resolver_args)
driver_cls = import_object(source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver'))
driver_cls = import_object(driver_cls_name)
driver_args = dict(
proxy_manager=proxy_manager,
downloads_directory=downloads_directory,
proxy_list=default_driver_proxy_list,
proxy_list=config['default_driver_proxy_list'],
config=config,
)
driver_args.update(**source_config.get('driver', {}).get('args', {}))
driver = driver_cls(**driver_args)
@ -67,13 +69,6 @@ class Source(AioThing):
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
yield FileResponsePb(status=FileResponsePb.Status.RESOLVING)
async for prepared_file_request in self.resolver.resolve(params):
logging.debug({
'action': 'download',
'mode': 'pylon',
'params': params,
'source': str(self),
'url': prepared_file_request.url,
})
try:
async for resp in self.driver.execute_prepared_file_request(
prepared_file_request=prepared_file_request,
@ -82,11 +77,11 @@ class Source(AioThing):
yield resp
return
except ClientPayloadError as e:
error_log(e, level=logging.WARNING)
logging.getLogger('nexus_pylon').warning(e)
continue
except NotFoundError:
continue
except DownloadError as e:
error_log(e)
logging.getLogger('nexus_pylon').warning(e)
continue
raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver))
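
Source.from_config now takes the whole pylon config together with the per-source config and may return None when the (default) BrowserDriver is selected but no webdriver_hub is configured, so callers have to skip such sources. A rough sketch of that calling pattern; the 'sources' key and the surrounding setup are assumptions, not taken from this diff:

sources = []
for source_config in config['sources']:               # 'sources' key is an assumption
    source = Source.from_config(proxy_manager, config, source_config)
    if source is None:
        # the driver would be a BrowserDriver but config['webdriver_hub'] is not set
        continue
    sources.append(source)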

View File

@ -1,4 +1,5 @@
from .base import BaseValidator
from .md5 import Md5Validator
from .pdf import PdfValidator
__all__ = ['Md5Validator', 'PdfValidator']
__all__ = ['BaseValidator', 'Md5Validator', 'PdfValidator']

View File

@ -1,4 +1,10 @@
from typing import Dict
class BaseValidator:
def __init__(self, params: Dict):
self.params = params
def update(self, chunk):
pass

View File

@ -7,6 +7,7 @@ from nexus.pylon.validators.base import BaseValidator
class Md5Validator(BaseValidator):
def __init__(self, params: Dict):
super().__init__(params)
self.md5 = params['md5']
self.v = hashlib.md5()
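
Since BaseValidator now keeps the request params and provides a no-op update(), concrete validators only add their own hashing or parsing state, as Md5Validator does above. A hypothetical SHA-256 variant in the same style (not part of this commit); the validate() contract, the import path of BadResponseError and its keyword arguments are assumed from PdfValidator below:

import hashlib
from typing import Dict

from nexus.pylon.exceptions import BadResponseError  # import path assumed
from nexus.pylon.validators.base import BaseValidator


class Sha256Validator(BaseValidator):
    def __init__(self, params: Dict):
        super().__init__(params)
        self.sha256 = params['sha256']
        self.v = hashlib.sha256()

    def update(self, chunk: bytes):
        self.v.update(chunk)

    def validate(self):
        if self.sha256.lower() != self.v.hexdigest().lower():
            # keyword arguments to BadResponseError are an assumption
            raise BadResponseError(expected=self.sha256, received=self.v.hexdigest())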

View File

@ -12,7 +12,7 @@ from PyPDF2.errors import PdfReadError
class PdfValidator(BaseValidator):
def __init__(self, params: Dict):
self.params = params
super().__init__(params)
self.md5 = params.get('md5')
self.file = bytes()
self.v = hashlib.md5()
@ -24,7 +24,7 @@ class PdfValidator(BaseValidator):
def validate(self):
if self.md5 and self.md5.lower() == self.v.hexdigest().lower():
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'md5_ok',
@ -32,7 +32,7 @@ class PdfValidator(BaseValidator):
})
return
elif not is_pdf(f=self.file):
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'not_pdf',
@ -41,28 +41,18 @@ class PdfValidator(BaseValidator):
raise BadResponseError(file=str(self.file[:100]))
try:
logging.getLogger('debug').debug({
'action': 'open_pdf',
'mode': 'pylon',
'file_len': len(self.file),
'params': self.params,
})
PyPDF2.PdfReader(BytesIO(self.file))
logging.getLogger('debug').debug({
'action': 'opened_pdf',
'mode': 'pylon',
'file_len': len(self.file),
'params': self.params,
})
except PdfReadError:
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'not_opened_as_pdf',
'params': self.params,
})
raise BadResponseError(file=str(self.file[:100]))
logging.getLogger('debug').debug({
logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'ok',
'params': self.params,
})
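
On the consuming side, a driver is expected to feed each downloaded chunk into the validator and call validate() once the body is complete; that wiring is not part of this diff, so the following is only an assumed sketch (the md5 value is made up, and PdfValidator.update() accumulating self.file is inferred rather than shown):

validator = PdfValidator(params={'md5': 'd41d8cd98f00b204e9800998ecf8427e'})
for chunk in downloaded_chunks:        # e.g. chunks read from the HTTP response
    validator.update(chunk)
try:
    validator.validate()
except BadResponseError:
    # neither the md5 matched nor did the payload open as a PDF; try another source
    ...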

View File

@ -10,6 +10,6 @@ py_library(
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
"//library/configurator",
requirement("izihawa_configurator"),
],
)

View File

@ -1,4 +1,4 @@
from library.configurator import Configurator
from izihawa_configurator import Configurator
def get_translations():

View File

@ -33,7 +33,10 @@ class ProgressBar:
tail_text,
message=None,
source=None,
throttle_secs: float = 0,
throttle_secs: float = 0.0,
hard_throttle_secs: float = 10.0,
last_call: float = 0.0,
done_threshold_size: int = 10 * 1024 * 1024,
):
self.telegram_client = telegram_client
self.request_context = request_context
@ -45,9 +48,12 @@ class ProgressBar:
self.done = 0
self.total = 1
self.throttle_secs = throttle_secs
self.hard_throttle_secs = hard_throttle_secs
self.done_threshold_size = done_threshold_size
self.previous_done = 0
self.last_text = None
self.last_call = 0
self.last_call = last_call
def share(self):
if self.total > 0:
@ -56,6 +62,7 @@ class ProgressBar:
return f'{float(self.done / (1024 * 1024)):.1f}Mb'
def _set_progress(self, done, total):
self.previous_done = self.done
self.done = done
self.total = total
@ -74,11 +81,20 @@ class ProgressBar:
progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| '
tail_text = self.tail_text.format(source=self.source)
return f'`{self.header}\n{progress_bar}{self.share()} {tail_text}`'
return f'`{self.header}\n{progress_bar}{self.share().ljust(8)} {tail_text}`'
def should_send(self, now, ignore_last_call):
if ignore_last_call:
return True
if abs(now - self.last_call) > self.hard_throttle_secs:
return True
if abs(now - self.last_call) > self.throttle_secs and (self.done - self.previous_done) < self.done_threshold_size:
return True
return False
async def send_message(self, text, ignore_last_call=False):
now = time.time()
if not ignore_last_call and abs(now - self.last_call) < self.throttle_secs:
if not self.should_send(now, ignore_last_call):
return
try:
if not self.message:
@ -103,17 +119,3 @@ class ProgressBar:
async def callback(self, done, total, ignore_last_call=False):
self._set_progress(done, total)
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
class ThrottlerWrapper:
def __init__(self, callback: Callable, throttle_secs: Union[int, float]):
self.callback = callback
self.last_call = 0
self.throttle_secs = throttle_secs
async def __call__(self, *args, **kwargs):
now = time.time()
if abs(now - self.last_call) < self.throttle_secs:
return
self.last_call = now
return await self.callback(*args, **kwargs)
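
The new should_send() replaces the removed ThrottlerWrapper: it combines a soft throttle, a hard throttle and a progress-size threshold instead of looking at time alone. A standalone rendition of the same rule with made-up numbers, to show when a progress message actually goes out:

def should_send(now, last_call, done, previous_done,
                throttle_secs=5.0, hard_throttle_secs=10.0,
                done_threshold_size=10 * 1024 * 1024,
                ignore_last_call=False):
    # same decision rule as ProgressBar.should_send above
    if ignore_last_call:
        return True
    if abs(now - last_call) > hard_throttle_secs:
        return True
    if abs(now - last_call) > throttle_secs and (done - previous_done) < done_threshold_size:
        return True
    return False

# 6s since the last message, 2 MB of new progress -> soft throttle passed, send
print(should_send(now=106.0, last_call=100.0, done=12 * 1024 * 1024, previous_done=10 * 1024 * 1024))  # True
# 6s since the last message, 20 MB of new progress -> wait for the hard throttle
print(should_send(now=106.0, last_call=100.0, done=30 * 1024 * 1024, previous_done=10 * 1024 * 1024))  # False
# 11s since the last message -> hard throttle passed, always send
print(should_send(now=111.0, last_call=100.0, done=30 * 1024 * 1024, previous_done=10 * 1024 * 1024))  # True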

View File

@ -63,7 +63,6 @@ class ScimagViewBuilder(BaseViewBuilder):
'chapter': '🔖',
'book-chapter': '🔖',
}
multihash_ix = 0
def is_preprint(self):
return self.document_holder.doi.split('/')[0] in preprints

View File

@ -13,7 +13,7 @@ aiohttp-socks==0.7.1
aiokafka==0.7.2
aiokit==1.1.2
aiosignal==1.2.0
aiosumma==2.8.13
aiosumma==2.10.4
asn1crypto==1.5.1
async-generator==1.10
async-timeout==4.0.2
@ -55,7 +55,7 @@ h11==0.13.0
idna==3.3
iniconfig==1.1.1
isort==5.10.1
izihawa-nlptools==1.1.7
izihawa-nlptools==1.1.9
izihawa-types==0.1.3
izihawa-utils==1.0.7
Jinja2==3.1.2