- [nexus] Development

- [nexus] Development

GitOrigin-RevId: 5d5feedff7b70be4c788abeb22f89c6758431d33
This commit is contained in:
the-superpirate 2022-09-13 17:15:16 +03:00
parent 73f5fd4a06
commit a683e0ce18
58 changed files with 655 additions and 634 deletions

View File

@ -43,7 +43,7 @@ py3_image(
"//library/aiogrpctools", "//library/aiogrpctools",
requirement("aiokit"), requirement("aiokit"),
"//library/aiopostgres", "//library/aiopostgres",
"//library/configurator", requirement("izihawa_configurator"),
"//library/telegram", "//library/telegram",
requirement("izihawa_utils"), requirement("izihawa_utils"),
], ],

View File

@ -1,11 +1,13 @@
from library.configurator import Configurator from izihawa_configurator import Configurator
from izihawa_utils import env
def get_config(): def get_config():
return Configurator([ return Configurator([
'idm/api/configs/base.yaml', 'idm/api/configs/base.yaml',
'idm/api/configs/%s.yaml?' % env.type,
'idm/api/configs/logging.yaml', 'idm/api/configs/logging.yaml',
], env_prefix='NEXUS_IDM_API') ], env_prefix='IDM_API')
config = get_config() config = get_config()

View File

@ -7,9 +7,9 @@ from idm.api.configs import get_config
from idm.api.services.chat_manager import ChatManagerService from idm.api.services.chat_manager import ChatManagerService
from idm.api.services.profile import ProfileService from idm.api.services.profile import ProfileService
from idm.api.services.subscription_manager import SubscriptionManagerService from idm.api.services.subscription_manager import SubscriptionManagerService
from izihawa_configurator import Configurator
from library.aiogrpctools import AioGrpcServer from library.aiogrpctools import AioGrpcServer
from library.aiopostgres.pool_holder import AioPostgresPoolHolder from library.aiopostgres.pool_holder import AioPostgresPoolHolder
from library.configurator import Configurator
from library.logging import configure_logging from library.logging import configure_logging

View File

@ -105,12 +105,16 @@ class ProfileService(profile_service_pb2_grpc.ProfileServicer, BaseService):
for tag in download_document.tags: for tag in download_document.tags:
tags_counter[tag] += 1 tags_counter[tag] += 1
most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:7] most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:14]
most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7] most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7]
most_popular_series = [] most_popular_series = []
if most_popular_issns:
async for row in self.application.pool_holder['nexus'].iterate( async for row in self.application.pool_holder['nexus'].iterate(
f"select name, issns from series where issns && array[{most_popular_issns}]::text[]".format( "select name, array_agg(issn) as issns from series "
"where issn in ({most_popular_issns}) "
"group by name order by name "
"limit 7".format(
most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)), most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
), ),
row_factory=dict_row, row_factory=dict_row,

View File

@ -13,7 +13,7 @@ py_library(
requirement("grpcio"), requirement("grpcio"),
requirement("pyyaml"), requirement("pyyaml"),
requirement("aiokit"), requirement("aiokit"),
"//library/configurator", requirement("izihawa_configurator"),
"//library/logging", "//library/logging",
requirement("izihawa_utils"), requirement("izihawa_utils"),
], ],

View File

@ -92,6 +92,7 @@ class AioPostgresPoolHolder(AioThing):
row_factory=tuple_row, row_factory=tuple_row,
cursor_name: Optional[str] = None, cursor_name: Optional[str] = None,
itersize: Optional[int] = None, itersize: Optional[int] = None,
statement_timeout: Optional[int] = None,
): ):
if not self.pool: if not self.pool:
raise RuntimeError('AioPostgresPoolHolder has not been started') raise RuntimeError('AioPostgresPoolHolder has not been started')
@ -99,7 +100,9 @@ class AioPostgresPoolHolder(AioThing):
async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur: async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur:
if itersize is not None: if itersize is not None:
cur.itersize = itersize cur.itersize = itersize
await cur.execute(stmt, values) await cur.execute(stmt + ';' if statement_timeout else '', values)
if statement_timeout:
await cur.execute(f'SET statement_timeout = {statement_timeout};')
async for row in cur: async for row in cur:
yield row yield row

View File

@ -1,17 +0,0 @@
# Bazel build definition for the `configurator` Python library.
load("@pip_modules//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
    name = "configurator",
    # Every Python source under this package, except anything below tests/.
    srcs = glob(
        ["**/*.py"],
        exclude = ["tests/**"],
    ),
    srcs_version = "PY3",
    visibility = ["//visibility:public"],
    # pip packages resolved through the `requirement` macro loaded above.
    deps = [
        requirement("jinja2"),
        requirement("pyyaml"),
        requirement("izihawa_utils"),
    ],
)

View File

@ -1,170 +0,0 @@
import json
import os
import os.path
from types import ModuleType
import yaml
from izihawa_utils.common import (
smart_merge_dicts,
unflatten,
)
from jinja2 import Template
from library.configurator.exceptions import UnknownConfigFormatError
class ConfigObject(dict):
    """Dictionary whose keys can also be read as attributes.

    ``obj.name`` returns ``obj['name']``. A missing key is reported as
    :class:`AttributeError` rather than :class:`KeyError`, so ``getattr``
    with a default and ``hasattr`` behave as usual.
    """

    def __getattr__(self, name):
        """Return ``self[name]``; raise ``AttributeError`` if the key is absent."""
        # __getattr__ only runs after regular attribute lookup has failed,
        # so genuine dict attributes/methods always take precedence over keys.
        try:
            return self[name]
        except KeyError as e:
            # Chain explicitly so the traceback shows the KeyError as the
            # direct cause instead of an unrelated "during handling" error.
            raise AttributeError(e) from e
class AnyOf:
    """Container for a group of alternative config sources.

    The positional arguments are stored untouched, as a tuple, in ``args``.
    """

    def __init__(self, *args):
        self.args = args
class RichDict(dict):
    """``dict`` extended with helpers for addressing nested keys."""

    def _follow(self, keys):
        """Descend from ``self`` along ``keys``.

        Returns ``(True, value)`` when every key is present at its level and
        ``(False, None)`` as soon as one of them is missing.
        """
        node = self
        for key in keys:
            if key not in node:
                return False, None
            node = node[key]
        return True, node

    def has(self, *args):
        """Return True if the nested key path ``args`` exists, else False."""
        found, _ = self._follow(args)
        return found

    def copy_if_exists(self, source_keys, target_key):
        """Copy the value found at path ``source_keys`` to ``self[target_key]``.

        Returns True when the path exists and the value was copied, False
        (leaving the dict untouched) otherwise.
        """
        found, value = self._follow(source_keys)
        if found:
            self[target_key] = value
        return found
class Configurator(RichDict):
    """Configuration dictionary assembled from several layered sources.

    Sources are merged into ``self`` one after another with
    ``smart_merge_dicts`` (see :meth:`update`). File-based sources are also
    remembered under their basename so they can be retrieved individually.
    """

    def __init__(self, configs: list, env_prefix: str = None, env_key_separator: str = '.'):
        """
        Create Configurator object

        :param configs: list of paths to config files, dicts or modules.
            End filepath with `?` to mark it as optional config.
        :param env_prefix: when given, environment variables whose name
            starts with this prefix (compared case-insensitively) are
            collected, stripped of the prefix and of leading underscores,
            and merged last. A name ending in ``[]`` contributes its value
            to a list stored under the name without the ``[]`` suffix.
        :param env_key_separator: separator handed to ``unflatten`` to turn
            the collected flat names into a nested dict.
        """
        super().__init__()

        self._by_basenames = {}
        self._omitted_files = []

        env_dict = {}

        if env_prefix:
            env_prefix = env_prefix.lower()
            for name, value in os.environ.items():
                if not name.lower().startswith(env_prefix):
                    continue
                stripped_name = name[len(env_prefix):].lstrip('_')
                if stripped_name[-2:] == '[]':
                    list_key = stripped_name[:-2]
                    # The guard must look at the key actually used for
                    # storage (without the `[]` suffix); otherwise the list
                    # is recreated on every hit and earlier values are lost.
                    # A non-list value already stored there is replaced.
                    if not isinstance(env_dict.get(list_key), list):
                        env_dict[list_key] = []
                    env_dict[list_key].append(value)
                else:
                    env_dict[stripped_name] = value
            env_dict = unflatten(env_dict, sep=env_key_separator)

        # Merge order: the raw process environment first, then the
        # caller-supplied configs in order, then the prefixed env overrides.
        for config in ([os.environ] + configs + [env_dict]):
            file_found = self.update(config)
            if not file_found:
                self._omitted_files.append(config)

    def _config_filename(self, filename):
        """Return ``filename`` joined onto the current working directory."""
        return os.path.join(os.getcwd(), filename)

    def walk_and_render(self, c):
        """Recursively render Jinja2 templates found in ``c``.

        Strings are rendered with the current contents of ``self`` as the
        template context; lists are rendered element-wise into a new list;
        dicts are rendered in place. For a dict key ending in ``_filepath``
        the rendered value is opened as a file and, if it ends in ``.json``
        or ``.yaml``, its parsed content is stored under the key with
        ``_filepath`` removed. Any other value is returned unchanged.
        """
        if isinstance(c, str):
            return Template(c).render(**self)
        elif isinstance(c, list):
            return [self.walk_and_render(e) for e in c]
        elif isinstance(c, dict):
            # Iterate over a snapshot of the keys: new keys may be added below.
            for key in list(c.keys()):
                c[key] = self.walk_and_render(c[key])
                if key.endswith('_filepath'):
                    with open(c[key]) as f:
                        if c[key].endswith('.json'):
                            c[key.replace('_filepath', '')] = json.loads(f.read())
                        elif c[key].endswith('.yaml'):
                            c[key.replace('_filepath', '')] = yaml.safe_load(f.read())
        return c

    def update(self, new_config, basename=None, **kwargs):
        """Merge one config source into this configurator.

        :param new_config: an ``AnyOf`` (first alternative that loads wins),
            a path to a ``.json``/``.yaml`` file (a trailing ``?`` marks it
            optional), a module (its ``__dict__`` is used), a callable
            (called with ``self``) or a mapping.
        :param basename: name under which the merged source is remembered;
            for file paths it defaults to the file's basename.
        :return: ``True`` when the source was merged, ``False`` when an
            optional file is missing or unreadable.
        :raises IOError: a required file is missing or unreadable, or none
            of the ``AnyOf`` alternatives could be loaded.
        :raises UnknownConfigFormatError: the file is neither ``.json`` nor
            ``.yaml``.
        """
        if isinstance(new_config, AnyOf):
            for config in new_config.args:
                try:
                    # `?` is stripped so a missing alternative raises and the
                    # next one is tried.
                    return self.update(config.rstrip('?'))
                except IOError:
                    pass
            raise IOError('None of %s was found' % ', '.join(new_config.args))
        elif isinstance(new_config, str):
            optional = new_config.endswith('?')
            filename = new_config.rstrip('?')
            basename = basename or os.path.basename(filename)

            config_filename = self._config_filename(filename)

            data = None

            if os.path.exists(config_filename) and os.access(config_filename, os.R_OK):
                with open(config_filename) as f:
                    data = f.read()

            if data is None:
                if optional:
                    return False
                else:
                    raise IOError(f'File {config_filename} not found')

            if filename.endswith('.json'):
                new_config = json.loads(data)
            elif filename.endswith('.yaml'):
                new_config = yaml.safe_load(data)
            else:
                raise UnknownConfigFormatError(filename)

            new_config = self.walk_and_render(new_config)

        elif isinstance(new_config, ModuleType):
            new_config = new_config.__dict__

        elif callable(new_config):
            new_config = new_config(self)

        if not new_config:
            new_config = {}

        # Top-level callables are resolved against the config built so far.
        for k in new_config:
            if callable(new_config[k]):
                new_config[k] = new_config[k](context=self)

        if 'log_path' in new_config:
            new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/')

        smart_merge_dicts(self, new_config, list_policy='override', copy=False)
        if basename:
            self._by_basenames[basename] = new_config

        return True

    def get_config_by_basename(self, basename):
        """Return the source remembered under ``basename`` (KeyError if unknown)."""
        return self._by_basenames[basename]

    def get_object_by_basename(self, basename):
        """Return the source remembered under ``basename`` wrapped in ``ConfigObject``."""
        return ConfigObject(self._by_basenames[basename])

    def has_missed_configs(self):
        """Return True if any source passed to the constructor was not merged."""
        return bool(self._omitted_files)

    def has_file(self, basename):
        """Return True if a source is remembered under ``basename``."""
        return basename in self._by_basenames

    def get_files(self):
        """Return the mapping of basename to remembered source."""
        return self._by_basenames

View File

@ -1,2 +0,0 @@
class UnknownConfigFormatError(Exception):
    """Raised for a config file whose name ends in neither ``.json`` nor ``.yaml``."""

View File

@ -159,7 +159,6 @@ class ToSummaAction(BaseAction):
'journal', 'journal',
'journal-issue', 'journal-issue',
'journal-volume', 'journal-volume',
'other',
'peer-review', 'peer-review',
'proceedings', 'proceedings',
'report-series', 'report-series',

View File

@ -35,7 +35,7 @@ py3_image(
requirement("aiobaseclient"), requirement("aiobaseclient"),
requirement("aiocrossref"), requirement("aiocrossref"),
requirement("aiokit"), requirement("aiokit"),
"//library/configurator", requirement("izihawa_configurator"),
"//library/logging", "//library/logging",
"//library/telegram", "//library/telegram",
"//nexus/hub/aioclient", "//nexus/hub/aioclient",

View File

@ -1,5 +1,5 @@
from izihawa_configurator import Configurator
from izihawa_utils import env from izihawa_utils import env
from library.configurator import Configurator
def get_config(): def get_config():

View File

@ -104,7 +104,7 @@ class ViewHandler(BaseHandler):
), ),
event.delete(), event.delete(),
] ]
if not has_found_old_widget: if not has_found_old_widget and is_earlier_than_2_days(old_message):
async with safe_execution(error_log=request_context.error_log): async with safe_execution(error_log=request_context.error_log):
await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id]) await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id])
return await asyncio.gather(*actions) return await asyncio.gather(*actions)

View File

@ -189,6 +189,13 @@ schema:
record: basic record: basic
tokenizer: raw tokenizer: raw
stored: true stored: true
- name: series_page_rank
type: f64
options:
fast: single
fieldnorms: false
indexed: true
stored: true
multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"] multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"]
primary_key: "id" primary_key: "id"
stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or', stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or',

View File

@ -43,7 +43,7 @@ py3_image(
requirement("aioipfs-2"), requirement("aioipfs-2"),
requirement("aiokit"), requirement("aiokit"),
"//library/aiopostgres", "//library/aiopostgres",
"//library/configurator", requirement("izihawa_configurator"),
"//library/telegram", "//library/telegram",
"//nexus/hub/proto:grpc_py", "//nexus/hub/proto:grpc_py",
"//nexus/hub/proto:proto_py", "//nexus/hub/proto:proto_py",

View File

@ -1,5 +1,5 @@
from izihawa_configurator import Configurator
from izihawa_utils import env from izihawa_utils import env
from library.configurator import Configurator
def get_config(): def get_config():

View File

@ -1,26 +1,8 @@
--- ---
pylon: pylon:
default_driver_proxy_list:
- [cambridge]
- [edinburg]
- [southampton]
default_resolver_proxy_list: ~ default_resolver_proxy_list: ~
downloads_directory: /downloads
proxies:
- address: clash.default.svc.cluster.example.com:7890
name: cambridge
tags: ['cambridge']
- address: clash.default.svc.cluster.example.com:7990
name: edinburg
tags: ['edinburg']
- address: clash.default.svc.cluster.example.com:8090
name: southampton
tags: ['southampton']
- address: socks5://clash.default.svc.cluster.example.com:7991
name: socks5
tags: ['socks5']
sources: sources:
# LibGen.rocks # IPFS
- driver: - driver:
args: args:
proxy_list: ~ proxy_list: ~
@ -29,37 +11,13 @@ pylon:
class: class:
nexus.pylon.drivers.DirectDriver nexus.pylon.drivers.DirectDriver
matcher: matcher:
md5: ^.*$ ipfs_multihashes: ^.*$
resolver: resolver:
args: args:
extractors: format_string: 'http://nexus-ipfs-headless.default.svc.cluster.example.com:5001/api/v0/cat?arg={ipfs_multihashes[0]}'
- producer: headers_override: true
format_string: 'http://libgen.rocks/{matched_group}' method: 'POST'
group: 0 class: nexus.pylon.resolvers.TemplateResolver
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
timeout: 25.0
type: regex
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{matched_group}'
group: 0
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
timeout: 25.0
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
# Library.lol # Library.lol
- driver: - driver:
args: args:
@ -74,20 +32,17 @@ pylon:
args: args:
extractors: extractors:
- producer: - producer:
format_string: '{matched_group}' format_string: '{href}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0 timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex type: regex
- producer: - producer:
format_string: '{matched_group}' format_string: '{url}'
group: 0 re: '(?P<url>https://ipfs.io/ipfs/[A-Za-z\d]+)'
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
type: regex type: regex
- producer: - producer:
format_string: '{matched_group}' format_string: '{url}'
group: 0 re: '(?P<url>https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
type: regex type: regex
url: http://library.lol/main/{md5} url: http://library.lol/main/{md5}
class: nexus.pylon.resolvers.RequestResolver class: nexus.pylon.resolvers.RequestResolver
@ -103,13 +58,51 @@ pylon:
args: args:
extractors: extractors:
- producer: - producer:
format_string: '{matched_group}' format_string: '{href}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0 timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex type: regex
url: 'http://library.lol/scimag/{doi}' url: 'http://library.lol/scimag/{doi}'
class: nexus.pylon.resolvers.RequestResolver class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
validator:
class: nexus.pylon.validators.Md5Validator
class:
nexus.pylon.drivers.DirectDriver
matcher:
md5: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=.*&key=[A-Za-z\d]+)'
type: regex
resolve_timeout: 25.0
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
# jamanetwork.com # jamanetwork.com
- driver: - driver:
args: args:
@ -142,6 +135,7 @@ pylon:
resolver: resolver:
args: args:
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true' format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
resolve_timeout: 25.0
selector: '(.resource.primary.URL | split("/"))[-1]' selector: '(.resource.primary.URL | split("/"))[-1]'
timeout: 40.0 timeout: 40.0
class: nexus.pylon.resolvers.DoiOrgRequestResolver class: nexus.pylon.resolvers.DoiOrgRequestResolver
@ -209,6 +203,13 @@ pylon:
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true' format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org # iopscience.iop.org
- matcher:
doi: ^10.1088/.*$
resolver:
args:
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org
- matcher: - matcher:
doi: ^10.1088/.*$ doi: ^10.1088/.*$
resolver: resolver:
@ -249,6 +250,13 @@ pylon:
args: args:
timeout: 30.0 timeout: 30.0
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# biorxiv.org
- matcher:
doi: ^10.1101/.*$
resolver:
args:
format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
class: nexus.pylon.resolvers.TemplateResolver
# journals.aps.org # journals.aps.org
- matcher: - matcher:
doi: ^10.1103/.*$ doi: ^10.1103/.*$
@ -374,6 +382,13 @@ pylon:
args: args:
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable' format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# guilfordjournals.com
- matcher:
doi: ^10.1521/.*$
resolver:
args:
format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# bioone.org # bioone.org
- driver: - driver:
args: args:
@ -549,6 +564,24 @@ pylon:
args: args:
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true' format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# www.medwave.cl
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.5867/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://www.medwave.cl/{path}'
timeout: 25.0
re: 'href=\"/(?P<path>[\w/.\-_]+\.pdf)\">PDF</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# journal.permsc.ru # journal.permsc.ru
- driver: - driver:
args: args:
@ -609,6 +642,24 @@ pylon:
class: nexus.pylon.drivers.BrowserDriver class: nexus.pylon.drivers.BrowserDriver
matcher: matcher:
doi: ^10.32920/.*$ doi: ^10.32920/.*$
# PKP Project
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.(5399|24905|31004|32729|37934)/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
timeout: 25.0
re: 'href=\"(?:https?://[\w.]+)/(?P<prefix>[\w./]+)/(?P<journal>[\w.]+)/article/view/(?P<key>\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# papers.cumincad.org # papers.cumincad.org
- driver: - driver:
args: args:
@ -621,9 +672,19 @@ pylon:
matcher: matcher:
doi: ^10.52842/.*$ doi: ^10.52842/.*$
# ^.*$ # ^.*$
- matcher:
doi: ^.*$
resolver:
args:
selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
class: nexus.pylon.resolvers.DoiOrgRequestResolver
- matcher: - matcher:
doi: ^.*$ doi: ^.*$
resolver: resolver:
args: args:
selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL' selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL'
class: nexus.pylon.resolvers.DoiOrgRequestResolver class: nexus.pylon.resolvers.DoiOrgRequestResolver
webdriver_hub:
downloads_directory: /downloads
endpoint: http://127.0.0.1:4444/wd/hub
host_downloads_directory: /downloads

View File

@ -5,9 +5,9 @@ import uvloop
from aiogrobid import GrobidClient from aiogrobid import GrobidClient
from aioipfs import AsyncIPFS as AsyncIPFS from aioipfs import AsyncIPFS as AsyncIPFS
from idm.api.aioclient import IdmApiGrpcClient from idm.api.aioclient import IdmApiGrpcClient
from izihawa_configurator import Configurator
from library.aiogrpctools import AioGrpcServer from library.aiogrpctools import AioGrpcServer
from library.aiopostgres import AioPostgresPoolHolder from library.aiopostgres import AioPostgresPoolHolder
from library.configurator import Configurator
from library.logging import configure_logging from library.logging import configure_logging
from library.telegram.base import BaseTelegramClient from library.telegram.base import BaseTelegramClient
from nexus.hub.configs import get_config from nexus.hub.configs import get_config

View File

@ -65,6 +65,7 @@ class BaseHubService(BaseService):
await asyncio.gather( await asyncio.gather(
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True), self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True),
self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True), self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True),
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake3', only_hash=True),
) )
)) ))

View File

@ -73,13 +73,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
self.downloadings = set() self.downloadings = set()
self.is_sharience_enabled = is_sharience_enabled self.is_sharience_enabled = is_sharience_enabled
self.maintenance_picture_url = maintenance_picture_url self.maintenance_picture_url = maintenance_picture_url
self.pylon_client = PylonClient( self.pylon_client = PylonClient(config=pylon_config)
proxies=pylon_config['proxies'],
source_configs=pylon_config['sources'],
default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
downloads_directory=pylon_config['downloads_directory'],
)
self.should_parse_with_grobid = should_parse_with_grobid self.should_parse_with_grobid = should_parse_with_grobid
self.should_store_hashes = should_store_hashes self.should_store_hashes = should_store_hashes
self.telegram_bot_configs = telegram_bot_configs self.telegram_bot_configs = telegram_bot_configs
@ -170,6 +164,15 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK) return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK)
async def delayed_task(create_task, t):
    """Wait ``t`` seconds, then await whatever ``create_task()`` returns.

    ``create_task`` is called with no arguments, and only once the delay has
    elapsed. If this coroutine is cancelled, during the delay or while the
    created awaitable is running, the cancellation is absorbed and the
    coroutine finishes normally with ``None``.
    """
    try:
        await asyncio.sleep(t)
        await create_task()
    except asyncio.CancelledError:
        # Absorbed so a caller can cancel this coroutine and then await it
        # without having to handle CancelledError itself.
        return None
class DownloadTask: class DownloadTask:
def __init__( def __init__(
self, self,
@ -204,7 +207,7 @@ class DownloadTask:
) )
async def download_task(self, request_context: RequestContext, document_holder): async def download_task(self, request_context: RequestContext, document_holder):
throttle_secs = 2.0 throttle_secs = 3.0
async def _on_fail(): async def _on_fail():
await self.application.telegram_clients[request_context.bot_name].send_message( await self.application.telegram_clients[request_context.bot_name].send_message(
@ -218,6 +221,7 @@ class DownloadTask:
error_log=request_context.error_log, error_log=request_context.error_log,
on_fail=_on_fail, on_fail=_on_fail,
): ):
start_time = time.time()
filename = document_holder.get_filename() filename = document_holder.get_filename()
progress_bar_download = ProgressBar( progress_bar_download = ProgressBar(
telegram_client=self.application.telegram_clients[request_context.bot_name], telegram_client=self.application.telegram_clients[request_context.bot_name],
@ -226,9 +230,9 @@ class DownloadTask:
header=f'⬇️ {filename}', header=f'⬇️ {filename}',
tail_text=t('TRANSMITTED_FROM', request_context.chat.language), tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
throttle_secs=throttle_secs, throttle_secs=throttle_secs,
last_call=start_time,
) )
downloads_gauge.inc() downloads_gauge.inc()
start_time = time.time()
try: try:
file = await self.download( file = await self.download(
document_holder=document_holder, document_holder=document_holder,
@ -242,11 +246,21 @@ class DownloadTask:
) )
if not document_holder.md5 and document_holder.get_extension() == 'pdf': if not document_holder.md5 and document_holder.get_extension() == 'pdf':
try: try:
await progress_bar_download.send_message( processing_message_task = asyncio.create_task(delayed_task(
create_task=lambda: progress_bar_download.send_message(
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename), t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
ignore_last_call=True ignore_last_call=True
),
t=5.0
))
file = await asyncio.get_running_loop().run_in_executor(
None,
lambda: clean_metadata(file, doi=document_holder.doi)
) )
file = clean_metadata(file, doi=document_holder.doi)
processing_message_task.cancel()
await processing_message_task
request_context.statbox( request_context.statbox(
action='cleaned', action='cleaned',
len=len(file), len=len(file),
@ -260,7 +274,8 @@ class DownloadTask:
banner=t("LOOKING_AT", request_context.chat.language), banner=t("LOOKING_AT", request_context.chat.language),
header=f'⬇️ {filename}', header=f'⬇️ {filename}',
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language), tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
throttle_secs=throttle_secs throttle_secs=throttle_secs,
last_call=progress_bar_download.last_call,
) )
uploaded_message = await self.delivery_service.send_file( uploaded_message = await self.delivery_service.send_file(
document_holder=self.document_holder, document_holder=self.document_holder,
@ -393,11 +408,15 @@ class DownloadTask:
async def download(self, document_holder, progress_bar): async def download(self, document_holder, progress_bar):
collected = bytearray() collected = bytearray()
if document_holder.doi: params = {}
try: try:
params = {'doi': document_holder.doi} if document_holder.doi:
params['doi'] = document_holder.doi
if document_holder.md5: if document_holder.md5:
params['md5'] = document_holder.md5 params['md5'] = document_holder.md5
if document_holder.ipfs_multihashes:
params['ipfs_multihashes'] = [ipfs_multihash for ipfs_multihash in document_holder.ipfs_multihashes]
if params:
async for resp in self.delivery_service.pylon_client.download(params): async for resp in self.delivery_service.pylon_client.download(params):
await self.process_resp( await self.process_resp(
resp=resp, resp=resp,
@ -408,18 +427,6 @@ class DownloadTask:
return bytes(collected) return bytes(collected)
except DownloadError: except DownloadError:
pass pass
if document_holder.md5:
try:
async for resp in self.delivery_service.pylon_client.download({'md5': document_holder.md5}):
await self.process_resp(
resp=resp,
progress_bar=progress_bar,
collected=collected,
filesize=document_holder.filesize,
)
return bytes(collected)
except DownloadError:
pass
async def external_cancel(self): async def external_cancel(self):
self.request_context.statbox(action='externally_canceled') self.request_context.statbox(action='externally_canceled')

View File

@ -27,7 +27,7 @@ py3_image(
requirement("aiokit"), requirement("aiokit"),
requirement("aiolibgen"), requirement("aiolibgen"),
"//library/aiopostgres", "//library/aiopostgres",
"//library/configurator", requirement("izihawa_configurator"),
"//library/jobber", "//library/jobber",
"//nexus/actions", "//nexus/actions",
], ],

View File

@ -34,6 +34,7 @@ class PostgresJob(BaseJob):
f'user={database["username"]} ' f'user={database["username"]} '
f'password={database["password"]} ' f'password={database["password"]} '
f'host={database["host"]}', f'host={database["host"]}',
timeout=3600 * 2,
) )
self.summa_client = SummaClient(endpoint=summa['endpoint']) self.summa_client = SummaClient(endpoint=summa['endpoint'])
self.summa_config = summa self.summa_config = summa
@ -84,6 +85,7 @@ class PostgresJob(BaseJob):
# Mandatory for server side cursor # Mandatory for server side cursor
cursor_name='nexus_ingest_cursor', cursor_name='nexus_ingest_cursor',
itersize=50_000, itersize=50_000,
statement_timeout=3600 * 2,
): ):
loaded = True loaded = True
yield row yield row
@ -95,8 +97,12 @@ class PostgresJob(BaseJob):
# Mandatory for server side cursor # Mandatory for server side cursor
cursor_name='nexus_ingest_cursor', cursor_name='nexus_ingest_cursor',
itersize=50_000, itersize=50_000,
statement_timeout=3600 * 2,
): ):
yield row yield row
await self.summa_client.commit_index(self.summa_config['name'], session_id=session_id) await self.summa_client.commit_index(
self.summa_config['name'],
session_id=session_id,
)
await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id) await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id)

View File

@ -25,7 +25,7 @@ DEPS = [
"//library/aiogrpctools", "//library/aiogrpctools",
requirement("aiokit"), requirement("aiokit"),
"//library/aiopostgres", "//library/aiopostgres",
"//library/configurator", requirement("izihawa_configurator"),
"//library/logging", "//library/logging",
"//nexus/meta_api/proto:grpc_py", "//nexus/meta_api/proto:grpc_py",
"//nexus/models/proto:proto_py", "//nexus/models/proto:proto_py",

View File

@ -1,5 +1,5 @@
from izihawa_configurator import Configurator
from izihawa_utils import env from izihawa_utils import env
from library.configurator import Configurator
def get_config(): def get_config():

View File

@ -315,7 +315,7 @@ class SearchService(SearchServicer, BaseService):
with suppress(RetryError): with suppress(RetryError):
async for attempt in AsyncRetrying( async for attempt in AsyncRetrying(
retry=retry_if_exception_type(NeedRetryError), retry=retry_if_exception_type(NeedRetryError),
wait=wait_fixed(5), wait=wait_fixed(10),
stop=stop_after_attempt(6) stop=stop_after_attempt(6)
): ):
with attempt: with attempt:

View File

@ -33,4 +33,5 @@ message Scimag {
string volume = 21; string volume = 21;
int32 year = 30; int32 year = 30;
float page_rank = 34; float page_rank = 34;
float series_page_rank = 35;
} }

View File

@ -26,7 +26,7 @@ py3_image(
requirement("aiocrossref"), requirement("aiocrossref"),
requirement("aiokit"), requirement("aiokit"),
"//library/aiopostgres", "//library/aiopostgres",
"//library/configurator", requirement("izihawa_configurator"),
"//library/logging", "//library/logging",
"//nexus/actions", "//nexus/actions",
"//nexus/models/proto:proto_py", "//nexus/models/proto:proto_py",

View File

@ -10,6 +10,6 @@ py_library(
srcs_version = "PY3", srcs_version = "PY3",
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = [ deps = [
"//library/configurator", requirement("izihawa_configurator"),
], ],
) )

View File

@ -1,4 +1,4 @@
from library.configurator import Configurator from izihawa_configurator import Configurator
def get_promotions(): def get_promotions():

View File

@ -19,6 +19,9 @@ promotions:
- texts: - texts:
en: 💬 Research is the only and ultimate goal en: 💬 Research is the only and ultimate goal
weight: 1 weight: 1
- texts:
en: 💬 Intellectual property is not a valid form of property
weight: 1
- texts: - texts:
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group}) en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group})
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group}) ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group})

View File

@ -1,12 +1,16 @@
load("@rules_python//python:defs.bzl", "py_binary", "py_library") load("@rules_python//python:defs.bzl", "py_library")
load("@rules_python//python:packaging.bzl", "py_wheel")
load("@pip_modules//:requirements.bzl", "requirement") load("@pip_modules//:requirements.bzl", "requirement")
filegroup(
name = "data",
srcs = ["configs/pylon.yaml"],
)
py_library( py_library(
name = "pylon", name = "pylon",
srcs = glob(["**/*.py"]), srcs = glob(["**/*.py"]),
data = [ data = [":data"],
"configs/pylon.yaml",
],
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = [ deps = [
requirement("aiodns"), requirement("aiodns"),
@ -16,6 +20,7 @@ py_library(
requirement("brotli"), requirement("brotli"),
requirement("cchardet"), requirement("cchardet"),
requirement("certifi"), requirement("certifi"),
requirement("fire"),
requirement("jq"), requirement("jq"),
requirement("orjson"), requirement("orjson"),
requirement("pypdf2"), requirement("pypdf2"),
@ -23,20 +28,38 @@ py_library(
requirement("selenium"), requirement("selenium"),
requirement("tenacity"), requirement("tenacity"),
requirement("aiokit"), requirement("aiokit"),
"//library/configurator", requirement("izihawa_configurator"),
"//library/logging", "//library/logging",
"//nexus/pylon/proto:pylon_proto_py", "//nexus/pylon/proto:pylon_proto_py",
], ],
) )
py_binary( py_wheel(
name = "cli", name = "nexus-pylon-wheel",
srcs = ["cli.py"], author = "The Superpirate",
main = "cli.py", author_email = "fist.of.the.first.pirates@gmail.com",
srcs_version = "PY3", classifiers = [
visibility = ["//visibility:public"], "Programming Language :: Python :: 3.10",
],
description_file = ":README.md",
distribution = "nexus-pylon-wheel",
entry_points = {"console_scripts": ["pylon = nexus.pylon.cli:main"]},
homepage = "https://github.com/nexus-stc/hyperboria/tree/master/nexus/pylon",
license = "MIT License",
python_requires = ">=3.10",
python_tag = "py3",
requires = [
"aiokit >= 1.0.0",
"izihawa_configurator >= 1.0.0",
"selenium >= 4.3.0",
],
strip_path_prefixes = [
"nexus/pylon/proto/pylon_proto_py_pb",
],
version = "1.0.0",
deps = [ deps = [
requirement("fire"), ":data",
":pylon", ":pylon",
"//nexus/pylon/proto:pylon_proto_py",
], ],
) )

View File

@ -6,16 +6,51 @@
- Streams data by chunks - Streams data by chunks
- GRPC-ready - GRPC-ready
## Build
```bash
bazel build -c opt nexus-pylon-wheel
```
## Install
### PIP
```bash
pip install nexus-pylon
```
## Nexus Pylon CLI ## Nexus Pylon CLI
Casual download Download scientific publication:
```bash ```bash
bazel run -c opt cli -- doi 10.1056/NEJMoa2033700 --output article.pdf pylon download --doi 10.1182/blood-2011-03-325258 --output article.pdf
``` ```
Download with proxies Download file by its MD5:
```bash ```bash
bazel run -c opt cli -- md5 278C3A72B7B04717361501B8642857DF \ pylon download --md5 f07707ee92fa675fd4ee53e3fee977d1 --output article.pdf
--output file.pdf \ ```
--proxies socks5://127.0.0.1:9050
Download file by its multihash:
```bash
pylon download --ipfs-multihashes '["bafykbzacea3vduqii3u52xkzdqan5oc54vsvedmed25dfybrqxyafahjl3rzu"]' --output article.pdf
```
### Using with Selenium
Create a directory for exchanging files between the host and the Selenium instance launched in Docker
```bash
mkdir downloads
```
Launch Selenium in Docker
```bash
docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest
```
Launch Pylon
```bash
pylon download --doi 10.1101/2022.09.09.507349 --output article.pdf \
--wd-endpoint 'http://127.0.0.1:4444/wd/hub' \
--wd-directory /downloads --wd-host-directory $(pwd)/downloads --debug
``` ```

View File

@ -1,15 +1,17 @@
import logging import logging
import os import os
import sys import sys
from typing import Optional
import fire import fire
from aiokit.utils import sync_fu from aiokit.utils import sync_fu
from nexus.pylon.client import ( from izihawa_configurator import Configurator
from .client import (
DownloadError, DownloadError,
PylonClient, PylonClient,
) )
from nexus.pylon.configs import get_config from .proto.file_pb2 import FileResponse as FileResponsePb
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
def resolve_path(filepath): def resolve_path(filepath):
@ -27,22 +29,20 @@ async def fetch(
collected = bytes() collected = bytes()
try: try:
last_len = 0 last_len = 0
last_source = ''
async for resp in iter: async for resp in iter:
if resp.HasField('status'): if resp.HasField('status'):
if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION: if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION:
print(f'Started transmission from {resp.source}...', end='\r', file=sys.stderr) print(f'Started transmission...', file=sys.stderr)
last_len = 0 last_len = 0
last_source = resp.source
collected = bytes() collected = bytes()
elif resp.HasField('chunk'): elif resp.HasField('chunk'):
if len(collected) - last_len > 1024 * 100: if len(collected) - last_len > 1024 * 100:
print(f'Loaded {len(collected)} bytes from {resp.source}', end='\r', file=sys.stderr) print(f'Loaded {len(collected)} bytes', end='\r', file=sys.stderr)
last_len = len(collected) last_len = len(collected)
last_source = resp.source
collected += resp.chunk.content collected += resp.chunk.content
with open(resolve_path(output), 'wb') as f: with open(resolve_path(output), 'wb') as f:
print(f'Completed! Loaded {len(collected)} bytes from {last_source}', file=sys.stderr) print()
print(f'Completed! Loaded {len(collected)} bytes', file=sys.stderr)
f.write(collected) f.write(collected)
except DownloadError: except DownloadError:
print('File not found') print('File not found')
@ -50,25 +50,53 @@ async def fetch(
async def download( async def download(
output: str, output: str,
config: Optional[str] = None,
debug: bool = False, debug: bool = False,
wd_endpoint: Optional[str] = None,
wd_directory: Optional[str] = None,
wd_host_directory: Optional[str] = None,
**params, **params,
): ):
"""
Download scientific publications from various sources
A large portion of fresh articles can be retrieved only from publisher libraries through `BrowserDriver`, which
requires Selenium webdriver:
`docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest`
Args:
output: name of the output file
config: pylon config
debug: enable debug logging
wd_endpoint: web-driver
wd_directory: mounted directory inside Docker image
wd_host_directory: directory for downloads on host that should be mounted as `wd_directory` inside Docker image
"""
if debug: if debug:
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
c = get_config()['pylon']
p = PylonClient( default_config_path = os.path.join(os.path.dirname(__file__), 'configs/pylon.yaml')
proxies=c['proxies'], config = Configurator([config if config else default_config_path], env_prefix='NEXUS_PYLON')
source_configs=c['sources'], config = config['pylon']
default_driver_proxy_list=c['default_driver_proxy_list'], if wd_endpoint:
downloads_directory=c['downloads_directory'], config.setdefault('webdriver_hub', {})
) config['webdriver_hub']['endpoint'] = wd_endpoint
return await fetch(iter=p.download(params=params), output=output) if not wd_directory:
raise ValueError('Should pass --wd-directory with --wd-endpoint')
config['webdriver_hub']['downloads_directory'] = wd_directory
if not wd_host_directory:
raise ValueError('Should pass --wd-host-directory with --wd-endpoint')
config['webdriver_hub']['host_downloads_directory'] = wd_host_directory
pylon_client = PylonClient(config=config)
return await fetch(iter=pylon_client.download(params=params), output=output)
def main(): def main():
try:
fire.Fire({ fire.Fire({
'download': sync_fu(download), 'download': sync_fu(download),
}) })
except KeyboardInterrupt:
sys.exit(1)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,12 +1,10 @@
import logging
from typing import ( from typing import (
AsyncIterable, AsyncIterable,
Dict, Dict,
List,
Optional,
) )
from aiokit import AioThing from aiokit import AioThing
from library.logging import error_log
from nexus.pylon.exceptions import ( from nexus.pylon.exceptions import (
DownloadError, DownloadError,
NotFoundError, NotFoundError,
@ -17,28 +15,23 @@ from nexus.pylon.source import Source
class PylonClient(AioThing): class PylonClient(AioThing):
def __init__( def __init__(self, config):
self,
source_configs: Optional[List],
proxies: Optional[List[str]] = None,
downloads_directory: Optional[str] = None,
default_driver_proxy_list: [Optional[List]] = None,
default_resolver_proxy_list: [Optional[List]] = None,
):
super().__init__() super().__init__()
self.proxy_manager = ProxyManager(proxies) self.config = config
self.downloads_directory = downloads_directory self.proxy_manager = ProxyManager(config.get('proxies'))
self.default_driver_proxy_list = default_driver_proxy_list
self.default_resolver_proxy_list = default_resolver_proxy_list
self.sources = [] self.sources = []
for source_config in source_configs: if config.get('webdriver_hub') is None:
logging.getLogger('nexus_pylon').warning({
'action': 'missed_webdriver',
'mode': 'pylon',
})
for source_config in config['sources']:
source = Source.from_config( source = Source.from_config(
proxy_manager=self.proxy_manager, proxy_manager=self.proxy_manager,
config=self.config,
source_config=source_config, source_config=source_config,
downloads_directory=downloads_directory,
default_driver_proxy_list=default_driver_proxy_list,
default_resolver_proxy_list=default_resolver_proxy_list,
) )
if source:
self.sources.append(source) self.sources.append(source)
self.starts.append(source) self.starts.append(source)
@ -50,9 +43,10 @@ class PylonClient(AioThing):
async for resp in source.download(params): async for resp in source.download(params):
yield resp yield resp
return return
except NotFoundError: except NotFoundError as e:
logging.getLogger('nexus_pylon').debug(e)
continue continue
except DownloadError as e: except DownloadError as e:
error_log(e) logging.getLogger('nexus_pylon').warning(e)
continue continue
raise NotFoundError() raise NotFoundError(params=params)

View File

@ -1,11 +0,0 @@
from izihawa_utils import env
from library.configurator import Configurator
def get_config():
return Configurator([
'nexus/pylon/configs/pylon.yaml',
], env_prefix='NEXUS_PYLON')
config = get_config()

View File

@ -1,65 +1,25 @@
--- ---
pylon: pylon:
default_driver_proxy_list: default_driver_proxy_list: ~
- [proxy1] default_resolver_proxy_list: ~
- [proxy2] proxies: ~
- [proxy3]
downloads_directory: /downloads
proxies:
- address: proxy1.net:7890
name: proxy1
tags: [proxy1]
- address: proxy2.net:7990
name: proxy2
tags: [proxy2]
- address: proxy3.net:8090
name: proxy3
tags: [proxy3]
sources: sources:
# LibGen.rocks # IPFS
- driver: - driver:
args: args:
proxy_list: ~
validator: validator:
class: nexus.pylon.validators.Md5Validator class: nexus.pylon.validators.BaseValidator
class: class:
nexus.pylon.drivers.DirectDriver nexus.pylon.drivers.DirectDriver
matcher: matcher:
md5: ^.*$ ipfs_multihashes: ^.*$
resolver: resolver:
args: args:
extractors: format_string: 'https://ipfs.io/ipfs/{ipfs_multihashes[0]}'
- producer: class: nexus.pylon.resolvers.TemplateResolver
format_string: 'http://libgen.rocks/{matched_group}'
group: 0
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
timeout: 25.0
type: regex
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{matched_group}'
group: 0
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
timeout: 25.0
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
# Library.lol # Library.lol
- driver: - driver:
args: args:
proxy_list: ~
validator: validator:
class: nexus.pylon.validators.Md5Validator class: nexus.pylon.validators.Md5Validator
class: class:
@ -70,27 +30,22 @@ pylon:
args: args:
extractors: extractors:
- producer: - producer:
format_string: '{matched_group}' format_string: '{href}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0 timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex type: regex
- producer: - producer:
format_string: '{matched_group}' format_string: '{url}'
group: 0 re: '(?P<url>https://ipfs.io/ipfs/[A-Za-z\d]+)'
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
type: regex type: regex
- producer: - producer:
format_string: '{matched_group}' format_string: '{url}'
group: 0 re: '(?P<url>https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
type: regex type: regex
url: http://library.lol/main/{md5} url: http://library.lol/main/{md5}
class: nexus.pylon.resolvers.RequestResolver class: nexus.pylon.resolvers.RequestResolver
# library.lol # library.lol
- driver: - driver:
args:
proxy_list: ~
class: class:
nexus.pylon.drivers.DirectDriver nexus.pylon.drivers.DirectDriver
matcher: matcher:
@ -99,13 +54,48 @@ pylon:
args: args:
extractors: extractors:
- producer: - producer:
format_string: '{matched_group}' format_string: '{href}'
group: 1
re: '<a href="([^\"]+)">GET</a>'
timeout: 45.0 timeout: 45.0
re: '<a href="(?P<href>[^\"]+)">GET</a>'
type: regex type: regex
url: 'http://library.lol/scimag/{doi}' url: 'http://library.lol/scimag/{doi}'
class: nexus.pylon.resolvers.RequestResolver class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
args:
validator:
class: nexus.pylon.validators.Md5Validator
class:
nexus.pylon.drivers.DirectDriver
matcher:
md5: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=.*&key=[A-Za-z\d]+)'
type: regex
resolve_timeout: 25.0
url: https://libgen.rocks/ads.php?md5={md5}
class: nexus.pylon.resolvers.RequestResolver
# LibGen.rocks
- driver:
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^.*$
resolver:
args:
extractors:
- producer:
format_string: 'http://libgen.rocks/{key}'
timeout: 25.0
re: '(?P<key>get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
type: regex
url: 'https://libgen.rocks/ads.php?doi={doi}'
class: nexus.pylon.resolvers.RequestResolver
# jamanetwork.com # jamanetwork.com
- driver: - driver:
args: args:
@ -206,6 +196,13 @@ pylon:
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true' format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org # iopscience.iop.org
- matcher:
doi: ^10.1088/.*$
resolver:
args:
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org
- matcher: - matcher:
doi: ^10.1088/.*$ doi: ^10.1088/.*$
resolver: resolver:
@ -220,10 +217,6 @@ pylon:
timeout: 30 timeout: 30
type: wait_css_selector type: wait_css_selector
- type: click - type: click
proxy_list:
- [proxy2]
- [proxy1]
- [proxy3]
class: nexus.pylon.drivers.BrowserDriver class: nexus.pylon.drivers.BrowserDriver
matcher: matcher:
doi: ^10.1093/.*$ doi: ^10.1093/.*$
@ -246,6 +239,13 @@ pylon:
args: args:
timeout: 30.0 timeout: 30.0
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# biorxiv.org
- matcher:
doi: ^10.1101/.*$
resolver:
args:
format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
class: nexus.pylon.resolvers.TemplateResolver
# journals.aps.org # journals.aps.org
- matcher: - matcher:
doi: ^10.1103/.*$ doi: ^10.1103/.*$
@ -332,6 +332,13 @@ pylon:
args: args:
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true' format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# www.ahajournals.org
- matcher:
doi: ^10.1161/.*$
resolver:
args:
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# ajp.psychiatryonline.org # ajp.psychiatryonline.org
- matcher: - matcher:
doi: ^10.1176/.*$ doi: ^10.1176/.*$
@ -355,8 +362,6 @@ pylon:
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# journals.plos.org # journals.plos.org
- driver: - driver:
args:
proxy_list: ~
class: nexus.pylon.drivers.direct.DirectDriver class: nexus.pylon.drivers.direct.DirectDriver
matcher: matcher:
doi: ^10.1371/.*$ doi: ^10.1371/.*$
@ -364,6 +369,13 @@ pylon:
args: args:
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable' format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# guilfordjournals.com
- matcher:
doi: ^10.1521/.*$
resolver:
args:
format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# bioone.org # bioone.org
- driver: - driver:
args: args:
@ -396,8 +408,6 @@ pylon:
doi: ^10.2139/.*$ doi: ^10.2139/.*$
# www.afghandata.org # www.afghandata.org
- driver: - driver:
args:
proxy_list: ~
class: class:
nexus.pylon.drivers.DirectDriver nexus.pylon.drivers.DirectDriver
matcher: matcher:
@ -503,8 +513,6 @@ pylon:
doi: ^10.5334/.*$ doi: ^10.5334/.*$
# hess.copernicus.org # hess.copernicus.org
- driver: - driver:
args:
proxy_list: ~
class: nexus.pylon.drivers.DirectDriver class: nexus.pylon.drivers.DirectDriver
matcher: matcher:
doi: ^10.5194/.*$ doi: ^10.5194/.*$
@ -524,7 +532,6 @@ pylon:
- selector: '.uxf-download' - selector: '.uxf-download'
type: wait_css_selector type: wait_css_selector
- type: click - type: click
proxy_list: ~
class: nexus.pylon.drivers.BrowserDriver class: nexus.pylon.drivers.BrowserDriver
matcher: matcher:
doi: ^10.5585/.* doi: ^10.5585/.*
@ -539,6 +546,22 @@ pylon:
args: args:
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true' format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# www.medwave.cl
- driver:
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.5867/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://www.medwave.cl/{path}'
timeout: 25.0
re: 'href=\"/(?P<path>[\w/.\-_]+\.pdf)\">PDF</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# journal.permsc.ru # journal.permsc.ru
- driver: - driver:
args: args:
@ -584,8 +607,6 @@ pylon:
doi: ^10.21203/.*$ doi: ^10.21203/.*$
# www.ukm.my/ # www.ukm.my/
- driver: - driver:
args:
proxy_list: ~
class: nexus.pylon.drivers.DirectDriver class: nexus.pylon.drivers.DirectDriver
matcher: matcher:
doi: ^10.24035/.*$ doi: ^10.24035/.*$
@ -599,6 +620,22 @@ pylon:
class: nexus.pylon.drivers.BrowserDriver class: nexus.pylon.drivers.BrowserDriver
matcher: matcher:
doi: ^10.32920/.*$ doi: ^10.32920/.*$
# PKP Project
- driver:
class:
nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.(5399|24905|31004|32729|37934)/.*$
resolver:
args:
extractors:
- producer:
format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
timeout: 25.0
re: 'href=\"(?:https?://[\w.]+)/(?P<prefix>[\w./]+)/(?P<journal>[\w.]+)/article/view/(?P<key>\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*</a>'
type: regex
url: https://doi.org/{doi}
class: nexus.pylon.resolvers.RequestResolver
# papers.cumincad.org # papers.cumincad.org
- driver: - driver:
args: args:
@ -606,11 +643,16 @@ pylon:
- selector: 'file.pdf' - selector: 'file.pdf'
type: wait_link_text type: wait_link_text
- type: click - type: click
proxy_list: ~
class: nexus.pylon.drivers.BrowserDriver class: nexus.pylon.drivers.BrowserDriver
matcher: matcher:
doi: ^10.52842/.*$ doi: ^10.52842/.*$
# ^.*$ # ^.*$
- matcher:
doi: ^.*$
resolver:
args:
selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
class: nexus.pylon.resolvers.DoiOrgRequestResolver
- matcher: - matcher:
doi: ^.*$ doi: ^.*$
resolver: resolver:

View File

@ -4,22 +4,22 @@ from typing import (
Optional, Optional,
) )
from izihawa_utils.importlib import import_object
from nexus.pylon.network_agent import NetworkAgent from nexus.pylon.network_agent import NetworkAgent
from nexus.pylon.prepared_request import PreparedRequest from nexus.pylon.prepared_request import PreparedRequest
from nexus.pylon.proxy_manager import ProxyManager from nexus.pylon.proxy_manager import ProxyManager
from nexus.pylon.validators.base import BaseValidator
from utils.izihawa_utils.importlib import import_object
class BaseDriver(NetworkAgent): class BaseDriver(NetworkAgent):
def __init__( def __init__(
self, self,
config,
validator=None, validator=None,
downloads_directory: str = '/downloads',
proxy_list: Optional[List] = None, proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None, proxy_manager: Optional[ProxyManager] = None,
): ):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.config = config
validator_cls = 'nexus.pylon.validators.PdfValidator' validator_cls = 'nexus.pylon.validators.PdfValidator'
if validator and 'class' in validator: if validator and 'class' in validator:
@ -27,7 +27,6 @@ class BaseDriver(NetworkAgent):
validator_cls = import_object(validator_cls) validator_cls = import_object(validator_cls)
self.validator = validator_cls self.validator = validator_cls
self.downloads_directory = downloads_directory
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__

View File

@ -32,21 +32,20 @@ from selenium.webdriver.support.ui import WebDriverWait
class BrowserDriver(BaseDriver): class BrowserDriver(BaseDriver):
def __init__( def __init__(
self, self,
config,
validator=None, validator=None,
proxy_list: Optional[List] = None, proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None, proxy_manager: Optional[ProxyManager] = None,
actions: Optional[List] = None, actions: Optional[List] = None,
downloads_directory='/downloads',
window_size: Tuple[int, int] = (1279, 833),
erase_webdrive_property: bool = True,
webdrive_hub_endpoint: str = "http://127.0.0.1:4444/wd/hub",
): ):
super().__init__(validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager) super().__init__(config=config, validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
self.actions = actions self.actions = actions
self.downloads_directory = Path(downloads_directory) self.downloads_directory = Path(config['webdriver_hub']['downloads_directory'])
self.window_size = window_size self.host_downloads_directory = Path(config['webdriver_hub']['host_downloads_directory'])
self.erase_webdrive_property = erase_webdrive_property self.window_size = tuple(config['webdriver_hub'].get('window_size', [1279, 833]))
self.webdrive_hub_endpoint = webdrive_hub_endpoint self.erase_webdriver_property = config['webdriver_hub'].get('erase_webdriver_property', True)
self.webdriver_hub_endpoint = config['webdriver_hub']['endpoint']
self.file_poll_timeout = 2.0
async def get_chrome_sessions(self): async def get_chrome_sessions(self):
proxies = list( proxies = list(
@ -55,15 +54,14 @@ class BrowserDriver(BaseDriver):
else [None] else [None]
) )
for proxy in proxies: for proxy in proxies:
downloads_folder = self.downloads_directory / random_string(16) subdirectory = random_string(16)
os.mkdir(downloads_folder) downloads_directory = self.downloads_directory / subdirectory
os.chmod(downloads_folder, 0o777) host_downloads_directory = self.host_downloads_directory / subdirectory
chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_folder)) os.mkdir(host_downloads_directory)
try: os.chmod(host_downloads_directory, 0o777)
yield chrome, downloads_folder chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_directory))
finally: yield chrome, host_downloads_directory
shutil.rmtree(downloads_folder)
chrome.quit()
def setup_chrome(self, proxy, downloads_folder): def setup_chrome(self, proxy, downloads_folder):
options = webdriver.ChromeOptions() options = webdriver.ChromeOptions()
@ -85,13 +83,13 @@ class BrowserDriver(BaseDriver):
options.add_argument('--disable-dev-shm-usage') options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-popup-blocking") options.add_argument("--disable-popup-blocking")
chrome = webdriver.Remote( chrome = webdriver.Remote(
self.webdrive_hub_endpoint, self.webdriver_hub_endpoint,
DesiredCapabilities.CHROME, DesiredCapabilities.CHROME,
options=options, options=options,
) )
chrome.set_window_size(self.window_size[0], self.window_size[1]) chrome.set_window_size(self.window_size[0], self.window_size[1])
if self.erase_webdrive_property: if self.erase_webdriver_property:
resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id
url = chrome.command_executor._url + resource url = chrome.command_executor._url + resource
body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': { body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': {
@ -103,7 +101,7 @@ class BrowserDriver(BaseDriver):
}}) }})
chrome.command_executor._request('POST', url, body) chrome.command_executor._request('POST', url, body)
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'start_chrome', 'action': 'start_chrome',
'mode': 'pylon', 'mode': 'pylon',
'proxy': str(proxy) if proxy is not None else None, 'proxy': str(proxy) if proxy is not None else None,
@ -148,32 +146,19 @@ class BrowserDriver(BaseDriver):
and downloaded_offset == current_offset and downloaded_offset == current_offset
and current_offset > 0 and current_offset > 0
): ):
logging.getLogger('debug').debug({
'action': 'sent',
'mode': 'pylon',
'filename': filename,
})
return return
logging.getLogger('debug').debug({
'action': 'send_part',
'mode': 'pylon',
'current_offset': current_offset,
'downloaded_offset': downloaded_offset,
'filename': filename,
})
await file.seek(current_offset) await file.seek(current_offset)
yield await file.read(downloaded_offset - current_offset) yield await file.read(downloaded_offset - current_offset)
current_offset = downloaded_offset current_offset = downloaded_offset
await asyncio.sleep(0.5) await asyncio.sleep(self.file_poll_timeout)
raise NotFoundError() raise NotFoundError()
finally: finally:
await file.close() await file.close()
def get(self, chrome, url, params): def get(self, chrome, url, params):
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'get', 'action': 'download',
'mode': 'pylon', 'mode': 'pylon',
'url': url, 'url': url,
}) })
@ -190,11 +175,6 @@ class BrowserDriver(BaseDriver):
if not last_element: if not last_element:
raise RuntimeError('Nothing to click') raise RuntimeError('Nothing to click')
chrome.execute_script("arguments[0].click();", last_element) chrome.execute_script("arguments[0].click();", last_element)
logging.getLogger('debug').debug({
'action': 'clicked',
'mode': 'pylon',
'element': str(last_element),
})
case 'close_window': case 'close_window':
current_window = previous_window current_window = previous_window
previous_window = None previous_window = None
@ -204,11 +184,6 @@ class BrowserDriver(BaseDriver):
if not last_element: if not last_element:
raise RuntimeError('Nothing to click') raise RuntimeError('Nothing to click')
last_element.click() last_element.click()
logging.getLogger('debug').debug({
'action': 'native_clicked',
'mode': 'pylon',
'element': str(last_element),
})
case 'switch_to_new_window': case 'switch_to_new_window':
previous_window = current_window previous_window = current_window
current_window = chrome.window_handles[-1] current_window = chrome.window_handles[-1]
@ -227,12 +202,6 @@ class BrowserDriver(BaseDriver):
action['selector'], action['selector'],
)) ))
) )
logging.getLogger('debug').debug({
'action': 'waited_css_selector',
'mode': 'pylon',
'element': str(last_element),
'step': action
})
case 'wait_link_text': case 'wait_link_text':
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until( last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
EC.presence_of_element_located(( EC.presence_of_element_located((
@ -240,12 +209,6 @@ class BrowserDriver(BaseDriver):
action['selector'], action['selector'],
)) ))
) )
logging.getLogger('debug').debug({
'action': 'waited_link_text',
'mode': 'pylon',
'element': str(last_element),
'step': action
})
case 'wait_xpath': case 'wait_xpath':
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until( last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
EC.presence_of_element_located(( EC.presence_of_element_located((
@ -253,16 +216,10 @@ class BrowserDriver(BaseDriver):
action['selector'], action['selector'],
)) ))
) )
logging.getLogger('debug').debug({
'action': 'waited_xpath',
'mode': 'pylon',
'element': str(last_element),
'step': action
})
case _: case _:
raise NotImplementedError('Not implemented action type') raise NotImplementedError('Not implemented action type')
except WebDriverException as e: except WebDriverException as e:
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'error', 'action': 'error',
'mode': 'pylon', 'mode': 'pylon',
'error': str(e), 'error': str(e),
@ -294,15 +251,17 @@ class BrowserDriver(BaseDriver):
source=chrome.current_url, source=chrome.current_url,
) )
file_validator.validate() file_validator.validate()
logging.getLogger('debug').debug({
'action': 'validated',
'mode': 'pylon',
'url': prepared_file_request.url,
})
return return
except NotFoundError: except NotFoundError:
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'no_response', 'action': 'no_response',
'mode': 'pylon', 'mode': 'pylon',
}) })
finally:
logging.getLogger('nexus_pylon').debug({
'action': 'quit_chrome',
'mode': 'pylon',
})
chrome.quit()
shutil.rmtree(downloads_folder)
raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self)) raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self))

View File

@ -1,3 +1,4 @@
import logging
from typing import Dict from typing import Dict
import aiohttp.client_exceptions import aiohttp.client_exceptions
@ -25,12 +26,27 @@ class DirectDriver(BaseDriver):
@retry( @retry(
reraise=True, reraise=True,
wait=wait_random(min=1, max=2), wait=wait_random(min=1, max=2),
stop=stop_after_attempt(7), stop=stop_after_attempt(3),
retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)), retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)),
) )
async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict): async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict):
logging.debug({
'action': 'download',
'mode': 'pylon',
'params': params,
'source': str(self),
'url': prepared_file_request.url,
})
async with self.get_session() as session: async with self.get_session() as session:
async with prepared_file_request.execute_with(session=session) as resp: async with prepared_file_request.execute_with(session=session) as resp:
logging.debug({
'action': 'response',
'mode': 'pylon',
'params': params,
'source': str(self),
'url': prepared_file_request.url,
'status': resp.status,
})
if resp.status == 404: if resp.status == 404:
raise NotFoundError(url=prepared_file_request.url) raise NotFoundError(url=prepared_file_request.url)
elif ( elif (

View File

@ -1,5 +1,8 @@
import re import re
import sys from typing import (
List,
Tuple,
)
class Matcher: class Matcher:
@ -10,8 +13,11 @@ class Matcher:
def is_match(self, params) -> bool: def is_match(self, params) -> bool:
for param in params: for param in params:
if params[param]: param_value = params[param]
if param_regex := self.param_regexes.get(param): param_regex = self.param_regexes.get(param)
if re.match(param_regex, params[param]): if param_value and param_regex:
return True if not isinstance(param_value, (List, Tuple)):
return False param_value = [param_value]
for el in param_value:
if re.match(param_regex, el):
return el

View File

@ -10,7 +10,7 @@ import aiohttp
from aiohttp import ClientSession from aiohttp import ClientSession
from aiohttp.client_reqrep import ClientRequest from aiohttp.client_reqrep import ClientRequest
from aiohttp_socks import ProxyConnector from aiohttp_socks import ProxyConnector
from library.aiokit.aiokit import AioThing from aiokit import AioThing
from nexus.pylon.proxy_manager import ( from nexus.pylon.proxy_manager import (
AllOf, AllOf,
AnyOf, AnyOf,

View File

@ -228,7 +228,7 @@ class BasePdfProcessor:
try: try:
page = self.process_page(page, pdf_reader) page = self.process_page(page, pdf_reader)
except (PdfStreamError, binascii.Error) as e: except (PdfStreamError, binascii.Error) as e:
logging.getLogger('warning').warning({ logging.getLogger('nexus_pylon').warning({
'action': 'pdf_stream_error', 'action': 'pdf_stream_error',
'mode': 'pylon', 'mode': 'pylon',
'error': str(e), 'error': str(e),
@ -259,7 +259,7 @@ class WatermarkEraser1(BaseWatermarkEraser):
if self.is_watermark_predicate(text.encode()): if self.is_watermark_predicate(text.encode()):
xobj_death_note.append(operands[0]) xobj_death_note.append(operands[0])
operations_death_note.append(op_i) operations_death_note.append(op_i)
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal', 'action': 'watermark_removal',
'mode': 'pylon', 'mode': 'pylon',
'text': text, 'text': text,
@ -289,7 +289,7 @@ class WatermarkEraser2(BaseWatermarkEraser):
if operation == b"Tj": if operation == b"Tj":
if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]): if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]):
operations_death_note.append(op_i) operations_death_note.append(op_i)
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal', 'action': 'watermark_removal',
'mode': 'pylon', 'mode': 'pylon',
'text': operands[0].decode(), 'text': operands[0].decode(),
@ -319,7 +319,7 @@ class WatermarkEraser3(BaseWatermarkEraser):
text += operand text += operand
if self.is_watermark_predicate(text): if self.is_watermark_predicate(text):
operations_death_note.append(op_i) operations_death_note.append(op_i)
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal', 'action': 'watermark_removal',
'mode': 'pylon', 'mode': 'pylon',
'text': text.decode(), 'text': text.decode(),
@ -402,7 +402,7 @@ class WatermarkEraser4(BaseWatermarkEraser):
text, matched = tc.match(self.regexp) text, matched = tc.match(self.regexp)
if matched: if matched:
operations_death_note.extend(matched) operations_death_note.extend(matched)
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal', 'action': 'watermark_removal',
'mode': 'pylon', 'mode': 'pylon',
'matched': text, 'matched': text,

View File

@ -1,4 +1,5 @@
import asyncio import asyncio
import logging
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import Optional from typing import Optional
@ -23,6 +24,7 @@ class PreparedRequest:
cookies: Optional[dict] = None, cookies: Optional[dict] = None,
ssl: bool = True, ssl: bool = True,
timeout: Optional[float] = None, timeout: Optional[float] = None,
headers_override: bool = False
): ):
self.method = method self.method = method
self.url = url self.url = url
@ -32,6 +34,8 @@ class PreparedRequest:
} }
if headers: if headers:
self.headers.update(headers) self.headers.update(headers)
if headers_override:
self.headers = headers or {}
self.params = params self.params = params
self.cookies = cookies self.cookies = cookies
self.ssl = ssl self.ssl = ssl
@ -49,6 +53,13 @@ class PreparedRequest:
@asynccontextmanager @asynccontextmanager
async def execute_with(self, session): async def execute_with(self, session):
try: try:
logging.getLogger('nexus_pylon').debug({
'action': 'request',
'mode': 'pylon',
'url': self.url,
'method': self.method,
'headers': self.headers,
})
async with session.request( async with session.request(
method=self.method, method=self.method,
url=self.url, url=self.url,

View File

@ -54,6 +54,8 @@ class Proxy:
class ProxyManager: class ProxyManager:
def __init__(self, proxies=None): def __init__(self, proxies=None):
if proxies is None:
proxies = []
self.proxies = [Proxy(proxy) for proxy in proxies] self.proxies = [Proxy(proxy) for proxy in proxies]
def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy: def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy:

View File

@ -1,5 +1,6 @@
import json import json
import logging import logging
import sys
from typing import ( from typing import (
AsyncIterable, AsyncIterable,
Dict, Dict,
@ -50,20 +51,25 @@ class DoiOrgRequestResolver(BaseResolver):
method='get', method='get',
url=doi_url, url=doi_url,
timeout=self.resolve_timeout, timeout=self.resolve_timeout,
headers={'Accept': 'application/json'} headers={
'Accept': 'application/json',
}
).execute_with(session=session) as resp: ).execute_with(session=session) as resp:
return await resp.json() return await resp.json()
async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]: async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]:
body = await self.resolve_through_doi_org(params) body = await self.resolve_through_doi_org(params)
selected = None
try: try:
selected = json.loads(self.selector.input(body).text()) if text := self.selector.input(body).text():
selected = json.loads(text)
except ValueError as e: except ValueError as e:
logging.getLogger('error').error({ logging.getLogger('nexus_pylon').error({
'action': 'error', 'action': 'error',
'mode': 'pylon', 'mode': 'pylon',
'params': params, 'params': params,
'error': str(e) 'error': str(e),
'selector': str(self.selector),
}) })
return return
if selected: if selected:
@ -73,7 +79,7 @@ class DoiOrgRequestResolver(BaseResolver):
timeout=self.timeout, timeout=self.timeout,
) )
else: else:
logging.getLogger('debug').error({ logging.getLogger('nexus_pylon').debug({
'action': 'missed_selector', 'action': 'missed_selector',
'mode': 'pylon', 'mode': 'pylon',
'params': params, 'params': params,

View File

@ -15,10 +15,12 @@ class RequestResolver(BaseResolver):
self, self,
url: str, url: str,
extractors: List, extractors: List,
resolve_timeout: float = 10.0,
proxy_list: Optional[List] = None, proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None, proxy_manager: Optional[ProxyManager] = None,
): ):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.resolve_timeout = resolve_timeout
self.url = url self.url = url
self.extractors = extractors self.extractors = extractors
@ -31,9 +33,9 @@ class RequestResolver(BaseResolver):
async with PreparedRequest( async with PreparedRequest(
method='get', method='get',
url=url, url=url,
timeout=10.0, timeout=self.resolve_timeout,
).execute_with(session=session) as resp: ).execute_with(session=session) as resp:
# Sometimes sci-hub returns file # Sometimes hosts return file URL
if resp.headers.get('Content-Type') == 'application/pdf': if resp.headers.get('Content-Type') == 'application/pdf':
yield PreparedRequest(method='get', url=url, timeout=10.0) yield PreparedRequest(method='get', url=url, timeout=10.0)
downloaded_page_bytes = await resp.read() downloaded_page_bytes = await resp.read()
@ -42,9 +44,11 @@ class RequestResolver(BaseResolver):
for extractor in self.extractors: for extractor in self.extractors:
match = re.search(extractor['re'], downloaded_page, re.IGNORECASE) match = re.search(extractor['re'], downloaded_page, re.IGNORECASE)
if match: if match:
matched_group = match.group(extractor['producer']['group'])
yield PreparedRequest( yield PreparedRequest(
method='get', method='get',
url=extractor['producer']['format_string'].format(matched_group=matched_group), url=extractor['producer']['format_string'].format(
host=resp.real_url.host,
**match.groupdict()
),
timeout=extractor['producer'].get('timeout', 10.0), timeout=extractor['producer'].get('timeout', 10.0),
) )

View File

@ -14,19 +14,27 @@ class TemplateResolver(BaseResolver):
self, self,
format_string: str = 'https://doi.org/{doi}', format_string: str = 'https://doi.org/{doi}',
timeout: float = 10.0, timeout: float = 10.0,
method: str = 'GET',
headers: Optional[dict] = None,
headers_override: bool = False,
proxy_list: Optional[List] = None, proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None, proxy_manager: Optional[ProxyManager] = None,
): ):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.format_string = format_string self.format_string = format_string
self.timeout = timeout self.timeout = timeout
self.method = method
self.headers = headers
self.headers_override = headers_override
def __str__(self): def __str__(self):
return f'{self.__class__.__name__}({self.format_string})' return f'{self.__class__.__name__}({self.format_string})'
async def resolve(self, params) -> AsyncIterable[PreparedRequest]: async def resolve(self, params) -> AsyncIterable[PreparedRequest]:
yield PreparedRequest( yield PreparedRequest(
method='GET', method=self.method,
url=self.format_string.format(**params), url=self.format_string.format(**params),
timeout=self.timeout, timeout=self.timeout,
headers=self.headers,
headers_override=self.headers_override,
) )

View File

@ -2,12 +2,12 @@ import logging
from typing import ( from typing import (
AsyncIterable, AsyncIterable,
Dict, Dict,
List, Optional,
) )
from aiohttp.client_exceptions import ClientPayloadError from aiohttp.client_exceptions import ClientPayloadError
from library.aiokit.aiokit import AioThing from aiokit import AioThing
from library.logging import error_log from izihawa_utils.importlib import import_object
from nexus.pylon.drivers.base import BaseDriver from nexus.pylon.drivers.base import BaseDriver
from nexus.pylon.exceptions import ( from nexus.pylon.exceptions import (
DownloadError, DownloadError,
@ -16,7 +16,6 @@ from nexus.pylon.exceptions import (
from nexus.pylon.matcher import Matcher from nexus.pylon.matcher import Matcher
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
from nexus.pylon.resolvers.base import BaseResolver from nexus.pylon.resolvers.base import BaseResolver
from utils.izihawa_utils.importlib import import_object
class Source(AioThing): class Source(AioThing):
@ -29,12 +28,15 @@ class Source(AioThing):
@classmethod @classmethod
def from_config( def from_config(
cls, cls,
proxy_manager, config,
source_config, source_config,
downloads_directory: str, proxy_manager,
default_driver_proxy_list: List, ) -> Optional['Source']:
default_resolver_proxy_list: List, driver_cls_name = source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver')
) -> 'Source':
if driver_cls_name.endswith('BrowserDriver') and config.get('webdriver_hub') is None:
return None
matcher = Matcher(source_config['matcher']) matcher = Matcher(source_config['matcher'])
resolver_cls = import_object( resolver_cls = import_object(
@ -42,16 +44,16 @@ class Source(AioThing):
) )
resolver_args = dict( resolver_args = dict(
proxy_manager=proxy_manager, proxy_manager=proxy_manager,
proxy_list=default_resolver_proxy_list, proxy_list=config['default_resolver_proxy_list'],
) )
resolver_args.update(**source_config.get('resolver', {}).get('args', {})) resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
resolver = resolver_cls(**resolver_args) resolver = resolver_cls(**resolver_args)
driver_cls = import_object(source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver')) driver_cls = import_object(driver_cls_name)
driver_args = dict( driver_args = dict(
proxy_manager=proxy_manager, proxy_manager=proxy_manager,
downloads_directory=downloads_directory, proxy_list=config['default_driver_proxy_list'],
proxy_list=default_driver_proxy_list, config=config,
) )
driver_args.update(**source_config.get('driver', {}).get('args', {})) driver_args.update(**source_config.get('driver', {}).get('args', {}))
driver = driver_cls(**driver_args) driver = driver_cls(**driver_args)
@ -67,13 +69,6 @@ class Source(AioThing):
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]: async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
yield FileResponsePb(status=FileResponsePb.Status.RESOLVING) yield FileResponsePb(status=FileResponsePb.Status.RESOLVING)
async for prepared_file_request in self.resolver.resolve(params): async for prepared_file_request in self.resolver.resolve(params):
logging.debug({
'action': 'download',
'mode': 'pylon',
'params': params,
'source': str(self),
'url': prepared_file_request.url,
})
try: try:
async for resp in self.driver.execute_prepared_file_request( async for resp in self.driver.execute_prepared_file_request(
prepared_file_request=prepared_file_request, prepared_file_request=prepared_file_request,
@ -82,11 +77,11 @@ class Source(AioThing):
yield resp yield resp
return return
except ClientPayloadError as e: except ClientPayloadError as e:
error_log(e, level=logging.WARNING) logging.getLogger('nexus_pylon').warning(e)
continue continue
except NotFoundError: except NotFoundError:
continue continue
except DownloadError as e: except DownloadError as e:
error_log(e) logging.getLogger('nexus_pylon').warning(e)
continue continue
raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver)) raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver))

View File

@ -1,4 +1,5 @@
from .base import BaseValidator
from .md5 import Md5Validator from .md5 import Md5Validator
from .pdf import PdfValidator from .pdf import PdfValidator
__all__ = ['Md5Validator', 'PdfValidator'] __all__ = ['BaseValidator', 'Md5Validator', 'PdfValidator']

View File

@ -1,4 +1,10 @@
from typing import Dict
class BaseValidator: class BaseValidator:
def __init__(self, params: Dict):
self.params = params
def update(self, chunk): def update(self, chunk):
pass pass

View File

@ -7,6 +7,7 @@ from nexus.pylon.validators.base import BaseValidator
class Md5Validator(BaseValidator): class Md5Validator(BaseValidator):
def __init__(self, params: Dict): def __init__(self, params: Dict):
super().__init__(params)
self.md5 = params['md5'] self.md5 = params['md5']
self.v = hashlib.md5() self.v = hashlib.md5()

View File

@ -12,7 +12,7 @@ from PyPDF2.errors import PdfReadError
class PdfValidator(BaseValidator): class PdfValidator(BaseValidator):
def __init__(self, params: Dict): def __init__(self, params: Dict):
self.params = params super().__init__(params)
self.md5 = params.get('md5') self.md5 = params.get('md5')
self.file = bytes() self.file = bytes()
self.v = hashlib.md5() self.v = hashlib.md5()
@ -24,7 +24,7 @@ class PdfValidator(BaseValidator):
def validate(self): def validate(self):
if self.md5 and self.md5.lower() == self.v.hexdigest().lower(): if self.md5 and self.md5.lower() == self.v.hexdigest().lower():
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'validation', 'action': 'validation',
'mode': 'pylon', 'mode': 'pylon',
'result': 'md5_ok', 'result': 'md5_ok',
@ -32,7 +32,7 @@ class PdfValidator(BaseValidator):
}) })
return return
elif not is_pdf(f=self.file): elif not is_pdf(f=self.file):
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'validation', 'action': 'validation',
'mode': 'pylon', 'mode': 'pylon',
'result': 'not_pdf', 'result': 'not_pdf',
@ -41,28 +41,18 @@ class PdfValidator(BaseValidator):
raise BadResponseError(file=str(self.file[:100])) raise BadResponseError(file=str(self.file[:100]))
try: try:
logging.getLogger('debug').debug({
'action': 'open_pdf',
'mode': 'pylon',
'file_len': len(self.file),
'params': self.params,
})
PyPDF2.PdfReader(BytesIO(self.file)) PyPDF2.PdfReader(BytesIO(self.file))
logging.getLogger('debug').debug({
'action': 'opened_pdf',
'mode': 'pylon',
'file_len': len(self.file),
'params': self.params,
})
except PdfReadError: except PdfReadError:
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'validation', 'action': 'validation',
'mode': 'pylon', 'mode': 'pylon',
'result': 'not_opened_as_pdf', 'result': 'not_opened_as_pdf',
'params': self.params,
}) })
raise BadResponseError(file=str(self.file[:100])) raise BadResponseError(file=str(self.file[:100]))
logging.getLogger('debug').debug({ logging.getLogger('nexus_pylon').debug({
'action': 'validation', 'action': 'validation',
'mode': 'pylon', 'mode': 'pylon',
'result': 'ok', 'result': 'ok',
'params': self.params,
}) })

View File

@ -10,6 +10,6 @@ py_library(
srcs_version = "PY3", srcs_version = "PY3",
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = [ deps = [
"//library/configurator", requirement("izihawa_configurator"),
], ],
) )

View File

@ -1,4 +1,4 @@
from library.configurator import Configurator from izihawa_configurator import Configurator
def get_translations(): def get_translations():

View File

@ -33,7 +33,10 @@ class ProgressBar:
tail_text, tail_text,
message=None, message=None,
source=None, source=None,
throttle_secs: float = 0, throttle_secs: float = 0.0,
hard_throttle_secs: float = 10.0,
last_call: float = 0.0,
done_threshold_size: int = 10 * 1024 * 1024,
): ):
self.telegram_client = telegram_client self.telegram_client = telegram_client
self.request_context = request_context self.request_context = request_context
@ -45,9 +48,12 @@ class ProgressBar:
self.done = 0 self.done = 0
self.total = 1 self.total = 1
self.throttle_secs = throttle_secs self.throttle_secs = throttle_secs
self.hard_throttle_secs = hard_throttle_secs
self.done_threshold_size = done_threshold_size
self.previous_done = 0
self.last_text = None self.last_text = None
self.last_call = 0 self.last_call = last_call
def share(self): def share(self):
if self.total > 0: if self.total > 0:
@ -56,6 +62,7 @@ class ProgressBar:
return f'{float(self.done / (1024 * 1024)):.1f}Mb' return f'{float(self.done / (1024 * 1024)):.1f}Mb'
def _set_progress(self, done, total): def _set_progress(self, done, total):
self.previous_done = self.done
self.done = done self.done = done
self.total = total self.total = total
@ -74,11 +81,20 @@ class ProgressBar:
progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| ' progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| '
tail_text = self.tail_text.format(source=self.source) tail_text = self.tail_text.format(source=self.source)
return f'`{self.header}\n{progress_bar}{self.share()} {tail_text}`' return f'`{self.header}\n{progress_bar}{self.share().ljust(8)} {tail_text}`'
def should_send(self, now, ignore_last_call):
if ignore_last_call:
return True
if abs(now - self.last_call) > self.hard_throttle_secs:
return True
if abs(now - self.last_call) > self.throttle_secs and (self.done - self.previous_done) < self.done_threshold_size:
return True
return False
async def send_message(self, text, ignore_last_call=False): async def send_message(self, text, ignore_last_call=False):
now = time.time() now = time.time()
if not ignore_last_call and abs(now - self.last_call) < self.throttle_secs: if not self.should_send(now, ignore_last_call):
return return
try: try:
if not self.message: if not self.message:
@ -103,17 +119,3 @@ class ProgressBar:
async def callback(self, done, total, ignore_last_call=False): async def callback(self, done, total, ignore_last_call=False):
self._set_progress(done, total) self._set_progress(done, total)
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call) return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
class ThrottlerWrapper:
def __init__(self, callback: Callable, throttle_secs: Union[int, float]):
self.callback = callback
self.last_call = 0
self.throttle_secs = throttle_secs
async def __call__(self, *args, **kwargs):
now = time.time()
if abs(now - self.last_call) < self.throttle_secs:
return
self.last_call = now
return await self.callback(*args, **kwargs)

View File

@ -63,7 +63,6 @@ class ScimagViewBuilder(BaseViewBuilder):
'chapter': '🔖', 'chapter': '🔖',
'book-chapter': '🔖', 'book-chapter': '🔖',
} }
multihash_ix = 0
def is_preprint(self): def is_preprint(self):
return self.document_holder.doi.split('/')[0] in preprints return self.document_holder.doi.split('/')[0] in preprints

View File

@ -13,7 +13,7 @@ aiohttp-socks==0.7.1
aiokafka==0.7.2 aiokafka==0.7.2
aiokit==1.1.2 aiokit==1.1.2
aiosignal==1.2.0 aiosignal==1.2.0
aiosumma==2.8.13 aiosumma==2.10.4
asn1crypto==1.5.1 asn1crypto==1.5.1
async-generator==1.10 async-generator==1.10
async-timeout==4.0.2 async-timeout==4.0.2
@ -55,7 +55,7 @@ h11==0.13.0
idna==3.3 idna==3.3
iniconfig==1.1.1 iniconfig==1.1.1
isort==5.10.1 isort==5.10.1
izihawa-nlptools==1.1.7 izihawa-nlptools==1.1.9
izihawa-types==0.1.3 izihawa-types==0.1.3
izihawa-utils==1.0.7 izihawa-utils==1.0.7
Jinja2==3.1.2 Jinja2==3.1.2