mirror of
https://github.com/nexus-stc/hyperboria
synced 2024-12-23 18:17:45 +01:00
commit
065c84779f
@ -43,7 +43,7 @@ py3_image(
|
||||
"//library/aiogrpctools",
|
||||
requirement("aiokit"),
|
||||
"//library/aiopostgres",
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/telegram",
|
||||
requirement("izihawa_utils"),
|
||||
],
|
||||
|
@ -1,11 +1,13 @@
|
||||
from library.configurator import Configurator
|
||||
from izihawa_configurator import Configurator
|
||||
from izihawa_utils import env
|
||||
|
||||
|
||||
def get_config():
|
||||
return Configurator([
|
||||
'idm/api/configs/base.yaml',
|
||||
'idm/api/configs/%s.yaml?' % env.type,
|
||||
'idm/api/configs/logging.yaml',
|
||||
], env_prefix='NEXUS_IDM_API')
|
||||
], env_prefix='IDM_API')
|
||||
|
||||
|
||||
config = get_config()
|
||||
|
@ -2,25 +2,25 @@
|
||||
application:
|
||||
debug: true
|
||||
service_name: idm-api
|
||||
database:
|
||||
idm:
|
||||
database: nexus
|
||||
host:
|
||||
password:
|
||||
username:
|
||||
drivername: postgresql
|
||||
port: 5432
|
||||
nexus:
|
||||
database: nexus
|
||||
host:
|
||||
password:
|
||||
username:
|
||||
drivername: postgresql
|
||||
port: 5432
|
||||
clickhouse:
|
||||
host:
|
||||
password:
|
||||
username:
|
||||
database:
|
||||
idm:
|
||||
database: nexus
|
||||
drivername: postgresql
|
||||
host:
|
||||
password:
|
||||
port: 5432
|
||||
username:
|
||||
nexus:
|
||||
database: nexus
|
||||
drivername: postgresql
|
||||
host:
|
||||
password:
|
||||
port: 5432
|
||||
username:
|
||||
grpc:
|
||||
address: 0.0.0.0
|
||||
port: 82
|
||||
|
@ -7,9 +7,9 @@ from idm.api.configs import get_config
|
||||
from idm.api.services.chat_manager import ChatManagerService
|
||||
from idm.api.services.profile import ProfileService
|
||||
from idm.api.services.subscription_manager import SubscriptionManagerService
|
||||
from izihawa_configurator import Configurator
|
||||
from library.aiogrpctools import AioGrpcServer
|
||||
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
||||
from library.configurator import Configurator
|
||||
from library.logging import configure_logging
|
||||
|
||||
|
||||
|
@ -105,20 +105,24 @@ class ProfileService(profile_service_pb2_grpc.ProfileServicer, BaseService):
|
||||
for tag in download_document.tags:
|
||||
tags_counter[tag] += 1
|
||||
|
||||
most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:7]
|
||||
most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:14]
|
||||
most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7]
|
||||
|
||||
most_popular_series = []
|
||||
async for row in self.application.pool_holder['nexus'].iterate(
|
||||
f"select name, issns from series where issns && array[{most_popular_issns}]::text[]".format(
|
||||
most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
|
||||
),
|
||||
row_factory=dict_row,
|
||||
):
|
||||
most_popular_series.append(profile_service_pb2.Series(
|
||||
name=row['name'],
|
||||
issns=row['issns'],
|
||||
))
|
||||
if most_popular_issns:
|
||||
async for row in self.application.pool_holder['nexus'].iterate(
|
||||
"select name, array_agg(issn) as issns from series "
|
||||
"where issn in ({most_popular_issns}) "
|
||||
"group by name order by name "
|
||||
"limit 7".format(
|
||||
most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
|
||||
),
|
||||
row_factory=dict_row,
|
||||
):
|
||||
most_popular_series.append(profile_service_pb2.Series(
|
||||
name=row['name'],
|
||||
issns=row['issns'],
|
||||
))
|
||||
|
||||
return most_popular_series, most_popular_tags
|
||||
|
||||
|
@ -13,7 +13,7 @@ py_library(
|
||||
requirement("grpcio"),
|
||||
requirement("pyyaml"),
|
||||
requirement("aiokit"),
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/logging",
|
||||
requirement("izihawa_utils"),
|
||||
],
|
||||
|
@ -92,6 +92,7 @@ class AioPostgresPoolHolder(AioThing):
|
||||
row_factory=tuple_row,
|
||||
cursor_name: Optional[str] = None,
|
||||
itersize: Optional[int] = None,
|
||||
statement_timeout: Optional[int] = None,
|
||||
):
|
||||
if not self.pool:
|
||||
raise RuntimeError('AioPostgresPoolHolder has not been started')
|
||||
@ -99,7 +100,9 @@ class AioPostgresPoolHolder(AioThing):
|
||||
async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur:
|
||||
if itersize is not None:
|
||||
cur.itersize = itersize
|
||||
await cur.execute(stmt, values)
|
||||
await cur.execute(stmt + ';' if statement_timeout else '', values)
|
||||
if statement_timeout:
|
||||
await cur.execute(f'SET statement_timeout = {statement_timeout};')
|
||||
async for row in cur:
|
||||
yield row
|
||||
|
||||
|
@ -1,17 +0,0 @@
|
||||
load("@pip_modules//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
py_library(
|
||||
name = "configurator",
|
||||
srcs = glob(
|
||||
["**/*.py"],
|
||||
exclude = ["tests/**"],
|
||||
),
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("jinja2"),
|
||||
requirement("pyyaml"),
|
||||
requirement("izihawa_utils"),
|
||||
],
|
||||
)
|
@ -1,170 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import os.path
|
||||
from types import ModuleType
|
||||
|
||||
import yaml
|
||||
from izihawa_utils.common import (
|
||||
smart_merge_dicts,
|
||||
unflatten,
|
||||
)
|
||||
from jinja2 import Template
|
||||
from library.configurator.exceptions import UnknownConfigFormatError
|
||||
|
||||
|
||||
class ConfigObject(dict):
|
||||
def __getattr__(self, name):
|
||||
try:
|
||||
return self[name]
|
||||
except KeyError as e:
|
||||
raise AttributeError(e)
|
||||
|
||||
|
||||
class AnyOf:
|
||||
def __init__(self, *args):
|
||||
self.args = args
|
||||
|
||||
|
||||
class RichDict(dict):
|
||||
def has(self, *args):
|
||||
current = self
|
||||
for c in args:
|
||||
if c not in current:
|
||||
return False
|
||||
current = current[c]
|
||||
return True
|
||||
|
||||
def copy_if_exists(self, source_keys, target_key):
|
||||
current = self
|
||||
for c in source_keys:
|
||||
if c not in current:
|
||||
return False
|
||||
current = current[c]
|
||||
self[target_key] = current
|
||||
return True
|
||||
|
||||
|
||||
class Configurator(RichDict):
|
||||
def __init__(self, configs: list, env_prefix: str = None, env_key_separator: str = '.'):
|
||||
"""
|
||||
Create Configurator object
|
||||
|
||||
:param configs: list of paths to config files, dicts or modules.
|
||||
End filepath with `?` to mark it as optional config.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self._by_basenames = {}
|
||||
self._omitted_files = []
|
||||
|
||||
env_dict = {}
|
||||
|
||||
if env_prefix:
|
||||
env_prefix = env_prefix.lower()
|
||||
for name, value in os.environ.items():
|
||||
if name.lower().startswith(env_prefix):
|
||||
stripped_name = name[len(env_prefix):].lstrip('_')
|
||||
if stripped_name[-2:] == '[]':
|
||||
if stripped_name not in env_dict:
|
||||
env_dict[stripped_name[:-2]] = []
|
||||
env_dict[stripped_name[:-2]].append(value)
|
||||
else:
|
||||
env_dict[stripped_name] = value
|
||||
env_dict = unflatten(env_dict, sep=env_key_separator)
|
||||
|
||||
for config in ([os.environ] + configs + [env_dict]):
|
||||
file_found = self.update(config)
|
||||
if not file_found:
|
||||
self._omitted_files.append(config)
|
||||
|
||||
def _config_filename(self, filename):
|
||||
return os.path.join(os.getcwd(), filename)
|
||||
|
||||
def walk_and_render(self, c):
|
||||
if isinstance(c, str):
|
||||
return Template(c).render(**self)
|
||||
elif isinstance(c, list):
|
||||
return [self.walk_and_render(e) for e in c]
|
||||
elif isinstance(c, dict):
|
||||
for key in list(c.keys()):
|
||||
c[key] = self.walk_and_render(c[key])
|
||||
if key.endswith('_filepath'):
|
||||
with open(c[key]) as f:
|
||||
if c[key].endswith('.json'):
|
||||
c[key.replace('_filepath', '')] = json.loads(f.read())
|
||||
elif c[key].endswith('.yaml'):
|
||||
c[key.replace('_filepath', '')] = yaml.safe_load(f.read())
|
||||
return c
|
||||
|
||||
def update(self, new_config, basename=None, **kwargs):
|
||||
if isinstance(new_config, AnyOf):
|
||||
for config in new_config.args:
|
||||
try:
|
||||
return self.update(config.rstrip('?'))
|
||||
except IOError:
|
||||
pass
|
||||
raise IOError('None of %s was found' % ', '.join(new_config.args))
|
||||
elif isinstance(new_config, str):
|
||||
optional = new_config.endswith('?')
|
||||
filename = new_config.rstrip('?')
|
||||
basename = basename or os.path.basename(filename)
|
||||
|
||||
config_filename = self._config_filename(filename)
|
||||
|
||||
data = None
|
||||
|
||||
if os.path.exists(config_filename) and os.access(config_filename, os.R_OK):
|
||||
with open(config_filename) as f:
|
||||
data = f.read()
|
||||
|
||||
if data is None:
|
||||
if optional:
|
||||
return False
|
||||
else:
|
||||
raise IOError(f'File {config_filename} not found')
|
||||
|
||||
if filename.endswith('.json'):
|
||||
new_config = json.loads(data)
|
||||
elif filename.endswith('.yaml'):
|
||||
new_config = yaml.safe_load(data)
|
||||
else:
|
||||
raise UnknownConfigFormatError(filename)
|
||||
|
||||
new_config = self.walk_and_render(new_config)
|
||||
|
||||
elif isinstance(new_config, ModuleType):
|
||||
new_config = new_config.__dict__
|
||||
|
||||
elif callable(new_config):
|
||||
new_config = new_config(self)
|
||||
|
||||
if not new_config:
|
||||
new_config = {}
|
||||
|
||||
for k in new_config:
|
||||
if callable(new_config[k]):
|
||||
new_config[k] = new_config[k](context=self)
|
||||
|
||||
if 'log_path' in new_config:
|
||||
new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/')
|
||||
|
||||
smart_merge_dicts(self, new_config, list_policy='override', copy=False)
|
||||
if basename:
|
||||
self._by_basenames[basename] = new_config
|
||||
|
||||
return True
|
||||
|
||||
def get_config_by_basename(self, basename):
|
||||
return self._by_basenames[basename]
|
||||
|
||||
def get_object_by_basename(self, basename):
|
||||
return ConfigObject(self._by_basenames[basename])
|
||||
|
||||
def has_missed_configs(self):
|
||||
return bool(self._omitted_files)
|
||||
|
||||
def has_file(self, basename):
|
||||
return basename in self._by_basenames
|
||||
|
||||
def get_files(self):
|
||||
return self._by_basenames
|
@ -1,2 +0,0 @@
|
||||
class UnknownConfigFormatError(Exception):
|
||||
pass
|
@ -159,7 +159,6 @@ class ToSummaAction(BaseAction):
|
||||
'journal',
|
||||
'journal-issue',
|
||||
'journal-volume',
|
||||
'other',
|
||||
'peer-review',
|
||||
'proceedings',
|
||||
'report-series',
|
||||
|
@ -35,7 +35,7 @@ py3_image(
|
||||
requirement("aiobaseclient"),
|
||||
requirement("aiocrossref"),
|
||||
requirement("aiokit"),
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/logging",
|
||||
"//library/telegram",
|
||||
"//nexus/hub/aioclient",
|
||||
|
@ -1,5 +1,5 @@
|
||||
from izihawa_configurator import Configurator
|
||||
from izihawa_utils import env
|
||||
from library.configurator import Configurator
|
||||
|
||||
|
||||
def get_config():
|
||||
|
@ -104,7 +104,7 @@ class ViewHandler(BaseHandler):
|
||||
),
|
||||
event.delete(),
|
||||
]
|
||||
if not has_found_old_widget:
|
||||
if not has_found_old_widget and is_earlier_than_2_days(old_message):
|
||||
async with safe_execution(error_log=request_context.error_log):
|
||||
await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id])
|
||||
return await asyncio.gather(*actions)
|
||||
|
@ -189,6 +189,13 @@ schema:
|
||||
record: basic
|
||||
tokenizer: raw
|
||||
stored: true
|
||||
- name: series_page_rank
|
||||
type: f64
|
||||
options:
|
||||
fast: single
|
||||
fieldnorms: false
|
||||
indexed: true
|
||||
stored: true
|
||||
multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"]
|
||||
primary_key: "id"
|
||||
stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or',
|
||||
|
@ -43,7 +43,7 @@ py3_image(
|
||||
requirement("aioipfs-2"),
|
||||
requirement("aiokit"),
|
||||
"//library/aiopostgres",
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/telegram",
|
||||
"//nexus/hub/proto:grpc_py",
|
||||
"//nexus/hub/proto:proto_py",
|
||||
|
@ -1,5 +1,5 @@
|
||||
from izihawa_configurator import Configurator
|
||||
from izihawa_utils import env
|
||||
from library.configurator import Configurator
|
||||
|
||||
|
||||
def get_config():
|
||||
|
@ -1,629 +0,0 @@
|
||||
---
|
||||
pylon:
|
||||
default_driver_proxy_list:
|
||||
- [cambridge]
|
||||
- [edinburg]
|
||||
- [southampton]
|
||||
default_resolver_proxy_list: ~
|
||||
downloads_directory: /downloads
|
||||
proxies:
|
||||
- address: clash.default.svc.cluster.example.com:7890
|
||||
name: cambridge
|
||||
tags: ['cambridge']
|
||||
- address: clash.default.svc.cluster.example.com:7990
|
||||
name: edinburg
|
||||
tags: ['edinburg']
|
||||
- address: clash.default.svc.cluster.example.com:8090
|
||||
name: southampton
|
||||
tags: ['southampton']
|
||||
- address: socks5://clash.default.svc.cluster.example.com:7991
|
||||
name: socks5
|
||||
tags: ['socks5']
|
||||
sources:
|
||||
# LibGen.rocks
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
validator:
|
||||
class: nexus.pylon.validators.Md5Validator
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
md5: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'http://libgen.rocks/{matched_group}'
|
||||
group: 0
|
||||
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
|
||||
timeout: 25.0
|
||||
type: regex
|
||||
url: https://libgen.rocks/ads.php?md5={md5}
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# LibGen.rocks
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'http://libgen.rocks/{matched_group}'
|
||||
group: 0
|
||||
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
|
||||
timeout: 25.0
|
||||
type: regex
|
||||
url: 'https://libgen.rocks/ads.php?doi={doi}'
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# Library.lol
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
validator:
|
||||
class: nexus.pylon.validators.Md5Validator
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
md5: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 1
|
||||
re: '<a href="([^\"]+)">GET</a>'
|
||||
timeout: 45.0
|
||||
type: regex
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 0
|
||||
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
|
||||
type: regex
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 0
|
||||
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
|
||||
type: regex
|
||||
url: http://library.lol/main/{md5}
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# library.lol
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 1
|
||||
re: '<a href="([^\"]+)">GET</a>'
|
||||
timeout: 45.0
|
||||
type: regex
|
||||
url: 'http://library.lol/scimag/{doi}'
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# jamanetwork.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '#pdf-link .toolbar-link-text-extra, .contents-tab-contents > #pdf-link .toolbar-link-text-extra'
|
||||
timeout: 20
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class:
|
||||
nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1001/.*$
|
||||
# wires.onlinelibrary.wiley.com
|
||||
- matcher:
|
||||
doi: ^10.1002/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://onlinelibrary.wiley.com/doi/pdf/{doi}'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# link.springer.com
|
||||
- matcher:
|
||||
doi: ^10.(1007|14283)/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://link.springer.com/content/pdf/{doi}.pdf'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.sciencedirect.com
|
||||
- matcher:
|
||||
doi: ^10.(1016|1053|4103)/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
|
||||
selector: '(.resource.primary.URL | split("/"))[-1]'
|
||||
timeout: 40.0
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# www.clinicalkey.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.x-pdf'
|
||||
timeout: 30.0
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class:
|
||||
nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1016/.*$
|
||||
# www.cambridge.org
|
||||
- matcher:
|
||||
doi: ^10.1017/.*$
|
||||
resolver:
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# pubs.acs.org
|
||||
- matcher:
|
||||
doi: ^10.1021/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://pubs.acs.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.nature.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '#entitlement-box-right-column span'
|
||||
timeout: 30
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1038/.*$
|
||||
# www.nejm.org
|
||||
- matcher:
|
||||
doi: ^10.1056/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.nejm.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# ascelibrary.org
|
||||
- matcher:
|
||||
doi: ^10.1061/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://ascelibrary.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.pnas.org
|
||||
- matcher:
|
||||
doi: ^10.1073/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.pnas.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.tandfonline.com
|
||||
- matcher:
|
||||
doi: ^10.1080/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# iopscience.iop.org
|
||||
- matcher:
|
||||
doi: ^10.1088/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# academic.oup.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.pdf-link-text'
|
||||
timeout: 30
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
proxy_list:
|
||||
- [edinburg]
|
||||
- [cambridge]
|
||||
- [southampton]
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1093/.*$
|
||||
# journals.lww.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.ejp-article-tools__list-icon-holder > .icon-pdf'
|
||||
timeout: 30.0
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
- selector: '.ejp-article-tools__dropdown-list-button > .icon-pdf'
|
||||
timeout: 5.0
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.(1097|1213|5435|14309)/.*$
|
||||
resolver:
|
||||
args:
|
||||
timeout: 30.0
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# journals.aps.org
|
||||
- matcher:
|
||||
doi: ^10.1103/.*$
|
||||
resolver:
|
||||
args:
|
||||
selector: '[.link[] | select(.URL | ascii_downcase | endswith("fulltext"))][0].URL'
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# emerald.com
|
||||
- matcher:
|
||||
doi: ^10.1108/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.emerald.com/insight/content/doi/{doi}/full/pdf'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# ieeexplore.ieee.org
|
||||
- matcher:
|
||||
doi: ^10.1109/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={selected}'
|
||||
selector: '(.resource.primary.URL | split("/"))[-2]'
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# onlinelibrary.wiley.com
|
||||
- matcher:
|
||||
doi: ^10.1111/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://onlinelibrary.wiley.com/doi/pdfdirect/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# asa.scitation.org
|
||||
- matcher:
|
||||
doi: ^10.1121/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://asa.scitation.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.science.org
|
||||
- matcher:
|
||||
doi: ^10.1126/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.science.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# ^10.1136/.*$
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.article-pdf-download > img'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1136/.*$
|
||||
# ^10.1136/.*$
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.icon'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1136/.*$
|
||||
resolver:
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# dl.acm.org
|
||||
- matcher:
|
||||
doi: ^10.1145/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://dl.acm.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.annualreviews.org
|
||||
- matcher:
|
||||
doi: ^10.1146/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.annualreviews.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# journals.physiology.org
|
||||
- matcher:
|
||||
doi: ^10.1152/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.ahajournals.org
|
||||
- matcher:
|
||||
doi: ^10.1161/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# ajp.psychiatryonline.org
|
||||
- matcher:
|
||||
doi: ^10.1176/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://ajp.psychiatryonline.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# journals.sagepub.com
|
||||
- matcher:
|
||||
doi: ^10.1177/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://journals.sagepub.com/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# bmcpsychiatry.biomedcentral.com
|
||||
- matcher:
|
||||
doi: ^10.1186/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://bmcpsychiatry.biomedcentral.com/track/pdf/{doi}.pdf'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# journals.plos.org
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.direct.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.1371/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# bioone.org
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'DOWNLOAD PAPER'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.(1638|1654|1667|2108)/.*$
|
||||
# jasn.asnjournals.org
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'View PDF'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1681/.*$
|
||||
# papers.ssrn.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.abstract-buttons:nth-child(1) .primary > span'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.2139/.*$
|
||||
# www.afghandata.org
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.(2458|29171)/.*$
|
||||
# www.euppublishing.com
|
||||
- matcher:
|
||||
doi: ^10.3366/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.euppublishing.com/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.frontiersin.org
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.paper > .dropdown-toggle'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
- selector: '.download-files-pdf'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class:
|
||||
nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.3389/.*$
|
||||
resolver:
|
||||
args:
|
||||
selector: '[.link[] | select(.URL | ascii_downcase | endswith("full"))][0].URL'
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# bjgp.org
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'PDF'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.3399/.*$
|
||||
# www.wjgnet.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'Full Article (PDF)'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.3748/.*$
|
||||
resolver:
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# journals.vilniustech.lt
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.label'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.3846/.*$
|
||||
resolver:
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# isegoria.revistas.csic.es
|
||||
- matcher:
|
||||
doi: ^10.3989/.*$
|
||||
resolver:
|
||||
args:
|
||||
selector: '[.link[] | select(."intended-application" == "similarity-checking")][0].URL'
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
# www.psychiatrist.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.article-dwndpdf > img'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.4088/.*$
|
||||
# www.scirp.org
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'PDF'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.4236/.*$
|
||||
# jsbr.be
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'h4 > .fa-download'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
- selector: 'PDF (EN)'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.5334/.*$
|
||||
# hess.copernicus.org
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.5194/.*$
|
||||
# ProQuest
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '#searchTerm'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
- selector: '#searchTerm'
|
||||
text: '{doi}'
|
||||
type: type
|
||||
- selector: '.uxf-search'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
- selector: '.uxf-download'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.5585/.*
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.proquest.com/'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# jcsm.aasm.org
|
||||
- matcher:
|
||||
doi: ^10.5664/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# journal.permsc.ru
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.obj_galley_link'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
- selector: '.label'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.7242/.*$
|
||||
# amjcaserep.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: "//input[@value='Download PDF version']"
|
||||
type: wait_xpath
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.12659/.*$
|
||||
# medcraveonline.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'Download PDF'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.15406/.*$
|
||||
# www.researchsquare.com
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '.hidden > .text-blue-600 > .antialiased'
|
||||
type: wait_css_selector
|
||||
- type: native_click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.21203/.*$
|
||||
# www.ukm.my/
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.24035/.*$
|
||||
# journals.library.ryerson.ca
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: '#a11y-1-tab-tab-download'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.32920/.*$
|
||||
# papers.cumincad.org
|
||||
- driver:
|
||||
args:
|
||||
actions:
|
||||
- selector: 'file.pdf'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.52842/.*$
|
||||
# ^.*$
|
||||
- matcher:
|
||||
doi: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL'
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
@ -5,9 +5,9 @@ import uvloop
|
||||
from aiogrobid import GrobidClient
|
||||
from aioipfs import AsyncIPFS as AsyncIPFS
|
||||
from idm.api.aioclient import IdmApiGrpcClient
|
||||
from izihawa_configurator import Configurator
|
||||
from library.aiogrpctools import AioGrpcServer
|
||||
from library.aiopostgres import AioPostgresPoolHolder
|
||||
from library.configurator import Configurator
|
||||
from library.logging import configure_logging
|
||||
from library.telegram.base import BaseTelegramClient
|
||||
from nexus.hub.configs import get_config
|
||||
@ -16,6 +16,7 @@ from nexus.hub.services.mutual_aid_service import MutualAidService
|
||||
from nexus.hub.services.submitter import SubmitterService
|
||||
from nexus.hub.user_manager import UserManager
|
||||
from nexus.meta_api.aioclient import MetaApiGrpcClient
|
||||
from nexus.pylon.configs import get_config as get_default_pylon_config
|
||||
|
||||
|
||||
class GrpcServer(AioGrpcServer):
|
||||
@ -79,7 +80,7 @@ class GrpcServer(AioGrpcServer):
|
||||
should_parse_with_grobid=config['application']['should_parse_with_grobid'],
|
||||
should_store_hashes=config['application']['should_store_hashes'],
|
||||
telegram_bot_configs=config['telegram']['bots'],
|
||||
pylon_config=config['pylon'],
|
||||
pylon_config=config.get('pylon') or get_default_pylon_config()['pylon'],
|
||||
)
|
||||
self.submitter_service = SubmitterService(
|
||||
application=self,
|
||||
|
@ -65,6 +65,7 @@ class BaseHubService(BaseService):
|
||||
await asyncio.gather(
|
||||
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True),
|
||||
self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True),
|
||||
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake3', only_hash=True),
|
||||
)
|
||||
))
|
||||
|
||||
|
@ -73,13 +73,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
|
||||
self.downloadings = set()
|
||||
self.is_sharience_enabled = is_sharience_enabled
|
||||
self.maintenance_picture_url = maintenance_picture_url
|
||||
self.pylon_client = PylonClient(
|
||||
proxies=pylon_config['proxies'],
|
||||
source_configs=pylon_config['sources'],
|
||||
default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
|
||||
default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
|
||||
downloads_directory=pylon_config['downloads_directory'],
|
||||
)
|
||||
self.pylon_client = PylonClient(config=pylon_config)
|
||||
self.should_parse_with_grobid = should_parse_with_grobid
|
||||
self.should_store_hashes = should_store_hashes
|
||||
self.telegram_bot_configs = telegram_bot_configs
|
||||
@ -170,6 +164,15 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
|
||||
return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK)
|
||||
|
||||
|
||||
async def delayed_task(create_task, t):
|
||||
try:
|
||||
await asyncio.sleep(t)
|
||||
task = create_task()
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
|
||||
class DownloadTask:
|
||||
def __init__(
|
||||
self,
|
||||
@ -204,7 +207,7 @@ class DownloadTask:
|
||||
)
|
||||
|
||||
async def download_task(self, request_context: RequestContext, document_holder):
|
||||
throttle_secs = 2.0
|
||||
throttle_secs = 3.0
|
||||
|
||||
async def _on_fail():
|
||||
await self.application.telegram_clients[request_context.bot_name].send_message(
|
||||
@ -218,6 +221,7 @@ class DownloadTask:
|
||||
error_log=request_context.error_log,
|
||||
on_fail=_on_fail,
|
||||
):
|
||||
start_time = time.time()
|
||||
filename = document_holder.get_filename()
|
||||
progress_bar_download = ProgressBar(
|
||||
telegram_client=self.application.telegram_clients[request_context.bot_name],
|
||||
@ -226,9 +230,9 @@ class DownloadTask:
|
||||
header=f'⬇️ {filename}',
|
||||
tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
|
||||
throttle_secs=throttle_secs,
|
||||
last_call=start_time,
|
||||
)
|
||||
downloads_gauge.inc()
|
||||
start_time = time.time()
|
||||
try:
|
||||
file = await self.download(
|
||||
document_holder=document_holder,
|
||||
@ -242,11 +246,21 @@ class DownloadTask:
|
||||
)
|
||||
if not document_holder.md5 and document_holder.get_extension() == 'pdf':
|
||||
try:
|
||||
await progress_bar_download.send_message(
|
||||
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
|
||||
ignore_last_call=True
|
||||
processing_message_task = asyncio.create_task(delayed_task(
|
||||
create_task=lambda: progress_bar_download.send_message(
|
||||
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
|
||||
ignore_last_call=True
|
||||
),
|
||||
t=5.0
|
||||
))
|
||||
file = await asyncio.get_running_loop().run_in_executor(
|
||||
None,
|
||||
lambda: clean_metadata(file, doi=document_holder.doi)
|
||||
)
|
||||
file = clean_metadata(file, doi=document_holder.doi)
|
||||
|
||||
processing_message_task.cancel()
|
||||
await processing_message_task
|
||||
|
||||
request_context.statbox(
|
||||
action='cleaned',
|
||||
len=len(file),
|
||||
@ -260,7 +274,8 @@ class DownloadTask:
|
||||
banner=t("LOOKING_AT", request_context.chat.language),
|
||||
header=f'⬇️ {filename}',
|
||||
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
|
||||
throttle_secs=throttle_secs
|
||||
throttle_secs=throttle_secs,
|
||||
last_call=progress_bar_download.last_call,
|
||||
)
|
||||
uploaded_message = await self.delivery_service.send_file(
|
||||
document_holder=self.document_holder,
|
||||
@ -393,11 +408,15 @@ class DownloadTask:
|
||||
|
||||
async def download(self, document_holder, progress_bar):
|
||||
collected = bytearray()
|
||||
if document_holder.doi:
|
||||
try:
|
||||
params = {'doi': document_holder.doi}
|
||||
if document_holder.md5:
|
||||
params['md5'] = document_holder.md5
|
||||
params = {}
|
||||
try:
|
||||
if document_holder.doi:
|
||||
params['doi'] = document_holder.doi
|
||||
if document_holder.md5:
|
||||
params['md5'] = document_holder.md5
|
||||
if document_holder.ipfs_multihashes:
|
||||
params['ipfs_multihashes'] = [ipfs_multihash for ipfs_multihash in document_holder.ipfs_multihashes]
|
||||
if params:
|
||||
async for resp in self.delivery_service.pylon_client.download(params):
|
||||
await self.process_resp(
|
||||
resp=resp,
|
||||
@ -406,20 +425,8 @@ class DownloadTask:
|
||||
filesize=document_holder.filesize,
|
||||
)
|
||||
return bytes(collected)
|
||||
except DownloadError:
|
||||
pass
|
||||
if document_holder.md5:
|
||||
try:
|
||||
async for resp in self.delivery_service.pylon_client.download({'md5': document_holder.md5}):
|
||||
await self.process_resp(
|
||||
resp=resp,
|
||||
progress_bar=progress_bar,
|
||||
collected=collected,
|
||||
filesize=document_holder.filesize,
|
||||
)
|
||||
return bytes(collected)
|
||||
except DownloadError:
|
||||
pass
|
||||
except DownloadError:
|
||||
pass
|
||||
|
||||
async def external_cancel(self):
|
||||
self.request_context.statbox(action='externally_canceled')
|
||||
|
@ -27,7 +27,7 @@ py3_image(
|
||||
requirement("aiokit"),
|
||||
requirement("aiolibgen"),
|
||||
"//library/aiopostgres",
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/jobber",
|
||||
"//nexus/actions",
|
||||
],
|
||||
|
@ -39,8 +39,8 @@ class CrossrefApiJob(BaseJob):
|
||||
})
|
||||
count = 0
|
||||
async for chunk in self.crossref_client.works_cursor(
|
||||
filter=f'from-index-date:{self.from_date}',
|
||||
rows=500,
|
||||
filter=f'from-index-date:{self.from_date}',
|
||||
rows=500,
|
||||
):
|
||||
for item in chunk['items']:
|
||||
yield item
|
||||
@ -50,4 +50,4 @@ class CrossrefApiJob(BaseJob):
|
||||
'mode': 'ingest',
|
||||
'items': count,
|
||||
'target_date': self.from_date,
|
||||
})
|
||||
})
|
||||
|
@ -34,6 +34,7 @@ class PostgresJob(BaseJob):
|
||||
f'user={database["username"]} '
|
||||
f'password={database["password"]} '
|
||||
f'host={database["host"]}',
|
||||
timeout=3600 * 2,
|
||||
)
|
||||
self.summa_client = SummaClient(endpoint=summa['endpoint'])
|
||||
self.summa_config = summa
|
||||
@ -84,6 +85,7 @@ class PostgresJob(BaseJob):
|
||||
# Mandatory for server side cursor
|
||||
cursor_name='nexus_ingest_cursor',
|
||||
itersize=50_000,
|
||||
statement_timeout=3600 * 2,
|
||||
):
|
||||
loaded = True
|
||||
yield row
|
||||
@ -95,8 +97,12 @@ class PostgresJob(BaseJob):
|
||||
# Mandatory for server side cursor
|
||||
cursor_name='nexus_ingest_cursor',
|
||||
itersize=50_000,
|
||||
statement_timeout=3600 * 2,
|
||||
):
|
||||
yield row
|
||||
|
||||
await self.summa_client.commit_index(self.summa_config['name'], session_id=session_id)
|
||||
await self.summa_client.commit_index(
|
||||
self.summa_config['name'],
|
||||
session_id=session_id,
|
||||
)
|
||||
await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id)
|
||||
|
@ -25,7 +25,7 @@ DEPS = [
|
||||
"//library/aiogrpctools",
|
||||
requirement("aiokit"),
|
||||
"//library/aiopostgres",
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/logging",
|
||||
"//nexus/meta_api/proto:grpc_py",
|
||||
"//nexus/models/proto:proto_py",
|
||||
|
@ -1,5 +1,5 @@
|
||||
from izihawa_configurator import Configurator
|
||||
from izihawa_utils import env
|
||||
from library.configurator import Configurator
|
||||
|
||||
|
||||
def get_config():
|
||||
|
@ -315,7 +315,7 @@ class SearchService(SearchServicer, BaseService):
|
||||
with suppress(RetryError):
|
||||
async for attempt in AsyncRetrying(
|
||||
retry=retry_if_exception_type(NeedRetryError),
|
||||
wait=wait_fixed(5),
|
||||
wait=wait_fixed(10),
|
||||
stop=stop_after_attempt(6)
|
||||
):
|
||||
with attempt:
|
||||
|
@ -33,4 +33,5 @@ message Scimag {
|
||||
string volume = 21;
|
||||
int32 year = 30;
|
||||
float page_rank = 34;
|
||||
float series_page_rank = 35;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ py3_image(
|
||||
requirement("aiocrossref"),
|
||||
requirement("aiokit"),
|
||||
"//library/aiopostgres",
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/logging",
|
||||
"//nexus/actions",
|
||||
"//nexus/models/proto:proto_py",
|
||||
|
@ -10,6 +10,6 @@ py_library(
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
],
|
||||
)
|
||||
|
@ -1,4 +1,4 @@
|
||||
from library.configurator import Configurator
|
||||
from izihawa_configurator import Configurator
|
||||
|
||||
|
||||
def get_promotions():
|
||||
|
@ -19,6 +19,9 @@ promotions:
|
||||
- texts:
|
||||
en: 💬 Research is the only and ultimate goal
|
||||
weight: 1
|
||||
- texts:
|
||||
en: 💬 Intellectual property is not a valid form of property
|
||||
weight: 1
|
||||
- texts:
|
||||
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group})
|
||||
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group})
|
||||
|
@ -1,12 +1,16 @@
|
||||
load("@rules_python//python:defs.bzl", "py_binary", "py_library")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
load("@rules_python//python:packaging.bzl", "py_wheel")
|
||||
load("@pip_modules//:requirements.bzl", "requirement")
|
||||
|
||||
filegroup(
|
||||
name = "data",
|
||||
srcs = ["configs/pylon.yaml"],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "pylon",
|
||||
srcs = glob(["**/*.py"]),
|
||||
data = [
|
||||
"configs/pylon.yaml",
|
||||
],
|
||||
data = [":data"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("aiodns"),
|
||||
@ -16,6 +20,7 @@ py_library(
|
||||
requirement("brotli"),
|
||||
requirement("cchardet"),
|
||||
requirement("certifi"),
|
||||
requirement("fire"),
|
||||
requirement("jq"),
|
||||
requirement("orjson"),
|
||||
requirement("pypdf2"),
|
||||
@ -23,20 +28,38 @@ py_library(
|
||||
requirement("selenium"),
|
||||
requirement("tenacity"),
|
||||
requirement("aiokit"),
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
"//library/logging",
|
||||
"//nexus/pylon/proto:pylon_proto_py",
|
||||
],
|
||||
)
|
||||
|
||||
py_binary(
|
||||
name = "cli",
|
||||
srcs = ["cli.py"],
|
||||
main = "cli.py",
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
py_wheel(
|
||||
name = "nexus-pylon-wheel",
|
||||
author = "The Superpirate",
|
||||
author_email = "fist.of.the.first.pirates@gmail.com",
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3.10",
|
||||
],
|
||||
description_file = ":README.md",
|
||||
distribution = "nexus-pylon",
|
||||
entry_points = {"console_scripts": ["pylon = nexus.pylon.cli:main"]},
|
||||
homepage = "https://github.com/nexus-stc/hyperboria/tree/master/nexus/pylon",
|
||||
license = "MIT License",
|
||||
python_requires = ">=3.10",
|
||||
python_tag = "py3",
|
||||
requires = [
|
||||
"aiokit >= 1.0.0",
|
||||
"izihawa_configurator >= 1.0.0",
|
||||
"selenium >= 4.3.0",
|
||||
],
|
||||
strip_path_prefixes = [
|
||||
"nexus/pylon/proto/pylon_proto_py_pb",
|
||||
],
|
||||
version = "1.0.0",
|
||||
deps = [
|
||||
requirement("fire"),
|
||||
":data",
|
||||
":pylon",
|
||||
"//nexus/pylon/proto:pylon_proto_py",
|
||||
],
|
||||
)
|
||||
|
@ -6,16 +6,51 @@
|
||||
- Streams data by chunks
|
||||
- GRPC-ready
|
||||
|
||||
## Build
|
||||
|
||||
```bash
|
||||
bazel build -c opt nexus-pylon-wheel
|
||||
```
|
||||
|
||||
## Install
|
||||
|
||||
### PIP
|
||||
```bash
|
||||
pip install nexus-pylon
|
||||
```
|
||||
|
||||
## Nexus Pylon CLI
|
||||
|
||||
Casual download
|
||||
```bash
|
||||
bazel run -c opt cli -- doi 10.1056/NEJMoa2033700 --output article.pdf
|
||||
Download scientific publication:
|
||||
```bash
|
||||
pylon download --doi 10.1182/blood-2011-03-325258 --output article.pdf
|
||||
```
|
||||
|
||||
Download with proxies
|
||||
```bash
|
||||
bazel run -c opt cli -- md5 278C3A72B7B04717361501B8642857DF \
|
||||
--output file.pdf \
|
||||
--proxies socks5://127.0.0.1:9050
|
||||
Download file by its MD5:
|
||||
```bash
|
||||
pylon download --md5 f07707ee92fa675fd4ee53e3fee977d1 --output article.pdf
|
||||
```
|
||||
|
||||
Download file by its multihash:
|
||||
```bash
|
||||
pylon download --ipfs-multihashes '["bafykbzacea3vduqii3u52xkzdqan5oc54vsvedmed25dfybrqxyafahjl3rzu"]' --output article.pdf
|
||||
```
|
||||
|
||||
### Using with Selenium
|
||||
|
||||
Create directory for exchaning files between host and launched Selenium in Docker
|
||||
```bash
|
||||
mkdir downloads
|
||||
```
|
||||
|
||||
Launch Selenium in Docker
|
||||
```bash
|
||||
docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest
|
||||
```
|
||||
|
||||
Launch Pylon
|
||||
```bash
|
||||
pylon download --doi 10.1101/2022.09.09.507349 --output article.pdf \
|
||||
--wd-endpoint 'http://127.0.0.1:4444/wd/hub' \
|
||||
--wd-directory /downloads --wd-host-directory $(pwd)/downloads --debug
|
||||
```
|
@ -1,15 +1,17 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import fire
|
||||
from aiokit.utils import sync_fu
|
||||
from nexus.pylon.client import (
|
||||
from izihawa_configurator import Configurator
|
||||
|
||||
from .client import (
|
||||
DownloadError,
|
||||
PylonClient,
|
||||
)
|
||||
from nexus.pylon.configs import get_config
|
||||
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
|
||||
from .proto.file_pb2 import FileResponse as FileResponsePb
|
||||
|
||||
|
||||
def resolve_path(filepath):
|
||||
@ -27,22 +29,20 @@ async def fetch(
|
||||
collected = bytes()
|
||||
try:
|
||||
last_len = 0
|
||||
last_source = ''
|
||||
async for resp in iter:
|
||||
if resp.HasField('status'):
|
||||
if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION:
|
||||
print(f'Started transmission from {resp.source}...', end='\r', file=sys.stderr)
|
||||
print(f'Started transmission...', file=sys.stderr)
|
||||
last_len = 0
|
||||
last_source = resp.source
|
||||
collected = bytes()
|
||||
elif resp.HasField('chunk'):
|
||||
if len(collected) - last_len > 1024 * 100:
|
||||
print(f'Loaded {len(collected)} bytes from {resp.source}', end='\r', file=sys.stderr)
|
||||
print(f'Loaded {len(collected)} bytes', end='\r', file=sys.stderr)
|
||||
last_len = len(collected)
|
||||
last_source = resp.source
|
||||
collected += resp.chunk.content
|
||||
with open(resolve_path(output), 'wb') as f:
|
||||
print(f'Completed! Loaded {len(collected)} bytes from {last_source}', file=sys.stderr)
|
||||
print()
|
||||
print(f'Completed! Loaded {len(collected)} bytes', file=sys.stderr)
|
||||
f.write(collected)
|
||||
except DownloadError:
|
||||
print('File not found')
|
||||
@ -50,25 +50,53 @@ async def fetch(
|
||||
|
||||
async def download(
|
||||
output: str,
|
||||
config: Optional[str] = None,
|
||||
debug: bool = False,
|
||||
wd_endpoint: Optional[str] = None,
|
||||
wd_directory: Optional[str] = None,
|
||||
wd_host_directory: Optional[str] = None,
|
||||
**params,
|
||||
):
|
||||
"""
|
||||
Download scientific publications from various sources
|
||||
Large portion of fresh articles could be retrieved only though publisher libraries through `BrowserDriver`, it
|
||||
requires Selenium webdriver:
|
||||
`docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest`
|
||||
Args:
|
||||
output: name of the output file
|
||||
config: pylon config
|
||||
debug: enable debug logging
|
||||
wd_endpoint: web-driver
|
||||
wd_directory: mounted directory inside Docker image
|
||||
wd_host_directory: directory for downloads on host that should be mounter as `wd_directory` inside Docker image
|
||||
"""
|
||||
if debug:
|
||||
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
|
||||
c = get_config()['pylon']
|
||||
p = PylonClient(
|
||||
proxies=c['proxies'],
|
||||
source_configs=c['sources'],
|
||||
default_driver_proxy_list=c['default_driver_proxy_list'],
|
||||
downloads_directory=c['downloads_directory'],
|
||||
)
|
||||
return await fetch(iter=p.download(params=params), output=output)
|
||||
|
||||
default_config_path = os.path.join(os.path.dirname(__file__), 'configs/pylon.yaml')
|
||||
config = Configurator([config if config else default_config_path], env_prefix='NEXUS_PYLON')
|
||||
config = config['pylon']
|
||||
if wd_endpoint:
|
||||
config.setdefault('webdriver_hub', {})
|
||||
config['webdriver_hub']['endpoint'] = wd_endpoint
|
||||
if not wd_directory:
|
||||
raise ValueError('Should pass --wd-directory with --wd-endpoint')
|
||||
config['webdriver_hub']['downloads_directory'] = wd_directory
|
||||
if not wd_host_directory:
|
||||
raise ValueError('Should pass --wd-host-directory with --wd-endpoint')
|
||||
config['webdriver_hub']['host_downloads_directory'] = wd_host_directory
|
||||
|
||||
pylon_client = PylonClient(config=config)
|
||||
return await fetch(iter=pylon_client.download(params=params), output=output)
|
||||
|
||||
|
||||
def main():
|
||||
fire.Fire({
|
||||
'download': sync_fu(download),
|
||||
})
|
||||
try:
|
||||
fire.Fire({
|
||||
'download': sync_fu(download),
|
||||
})
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -1,12 +1,10 @@
|
||||
import logging
|
||||
from typing import (
|
||||
AsyncIterable,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
)
|
||||
|
||||
from aiokit import AioThing
|
||||
from library.logging import error_log
|
||||
from nexus.pylon.exceptions import (
|
||||
DownloadError,
|
||||
NotFoundError,
|
||||
@ -17,30 +15,25 @@ from nexus.pylon.source import Source
|
||||
|
||||
|
||||
class PylonClient(AioThing):
|
||||
def __init__(
|
||||
self,
|
||||
source_configs: Optional[List],
|
||||
proxies: Optional[List[str]] = None,
|
||||
downloads_directory: Optional[str] = None,
|
||||
default_driver_proxy_list: [Optional[List]] = None,
|
||||
default_resolver_proxy_list: [Optional[List]] = None,
|
||||
):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.proxy_manager = ProxyManager(proxies)
|
||||
self.downloads_directory = downloads_directory
|
||||
self.default_driver_proxy_list = default_driver_proxy_list
|
||||
self.default_resolver_proxy_list = default_resolver_proxy_list
|
||||
self.config = config
|
||||
self.proxy_manager = ProxyManager(config.get('proxies'))
|
||||
self.sources = []
|
||||
for source_config in source_configs:
|
||||
if config.get('webdriver_hub') is None:
|
||||
logging.getLogger('nexus_pylon').warning({
|
||||
'action': 'missed_webdriver',
|
||||
'mode': 'pylon',
|
||||
})
|
||||
for source_config in config['sources']:
|
||||
source = Source.from_config(
|
||||
proxy_manager=self.proxy_manager,
|
||||
config=self.config,
|
||||
source_config=source_config,
|
||||
downloads_directory=downloads_directory,
|
||||
default_driver_proxy_list=default_driver_proxy_list,
|
||||
default_resolver_proxy_list=default_resolver_proxy_list,
|
||||
)
|
||||
self.sources.append(source)
|
||||
self.starts.append(source)
|
||||
if source:
|
||||
self.sources.append(source)
|
||||
self.starts.append(source)
|
||||
|
||||
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
|
||||
for source in self.sources:
|
||||
@ -50,9 +43,10 @@ class PylonClient(AioThing):
|
||||
async for resp in source.download(params):
|
||||
yield resp
|
||||
return
|
||||
except NotFoundError:
|
||||
except NotFoundError as e:
|
||||
logging.getLogger('nexus_pylon').debug(e)
|
||||
continue
|
||||
except DownloadError as e:
|
||||
error_log(e)
|
||||
logging.getLogger('nexus_pylon').warning(e)
|
||||
continue
|
||||
raise NotFoundError()
|
||||
raise NotFoundError(params=params)
|
||||
|
@ -1,5 +1,4 @@
|
||||
from izihawa_utils import env
|
||||
from library.configurator import Configurator
|
||||
from izihawa_configurator import Configurator
|
||||
|
||||
|
||||
def get_config():
|
||||
|
@ -1,65 +1,25 @@
|
||||
---
|
||||
pylon:
|
||||
default_driver_proxy_list:
|
||||
- [proxy1]
|
||||
- [proxy2]
|
||||
- [proxy3]
|
||||
downloads_directory: /downloads
|
||||
proxies:
|
||||
- address: proxy1.net:7890
|
||||
name: proxy1
|
||||
tags: [proxy1]
|
||||
- address: proxy2.net:7990
|
||||
name: proxy2
|
||||
tags: [proxy2]
|
||||
- address: proxy3.net:8090
|
||||
name: proxy3
|
||||
tags: [proxy3]
|
||||
default_driver_proxy_list: ~
|
||||
default_resolver_proxy_list: ~
|
||||
proxies: ~
|
||||
sources:
|
||||
# LibGen.rocks
|
||||
# IPFS
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
validator:
|
||||
class: nexus.pylon.validators.Md5Validator
|
||||
class: nexus.pylon.validators.BaseValidator
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
md5: ^.*$
|
||||
ipfs_multihashes: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'http://libgen.rocks/{matched_group}'
|
||||
group: 0
|
||||
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
|
||||
timeout: 25.0
|
||||
type: regex
|
||||
url: https://libgen.rocks/ads.php?md5={md5}
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# LibGen.rocks
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'http://libgen.rocks/{matched_group}'
|
||||
group: 0
|
||||
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
|
||||
timeout: 25.0
|
||||
type: regex
|
||||
url: 'https://libgen.rocks/ads.php?doi={doi}'
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
format_string: 'https://ipfs.io/ipfs/{ipfs_multihashes[0]}'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# Library.lol
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
validator:
|
||||
class: nexus.pylon.validators.Md5Validator
|
||||
class:
|
||||
@ -70,27 +30,22 @@ pylon:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 1
|
||||
re: '<a href="([^\"]+)">GET</a>'
|
||||
timeout: 45.0
|
||||
format_string: '{href}'
|
||||
timeout: 45.0
|
||||
re: '<a href="(?P<href>[^\"]+)">GET</a>'
|
||||
type: regex
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 0
|
||||
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
|
||||
format_string: '{url}'
|
||||
re: '(?P<url>https://ipfs.io/ipfs/[A-Za-z\d]+)'
|
||||
type: regex
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 0
|
||||
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
|
||||
format_string: '{url}'
|
||||
re: '(?P<url>https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
|
||||
type: regex
|
||||
url: http://library.lol/main/{md5}
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# library.lol
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
@ -99,13 +54,48 @@ pylon:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: '{matched_group}'
|
||||
group: 1
|
||||
re: '<a href="([^\"]+)">GET</a>'
|
||||
timeout: 45.0
|
||||
format_string: '{href}'
|
||||
timeout: 45.0
|
||||
re: '<a href="(?P<href>[^\"]+)">GET</a>'
|
||||
type: regex
|
||||
url: 'http://library.lol/scimag/{doi}'
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# LibGen.rocks
|
||||
- driver:
|
||||
args:
|
||||
validator:
|
||||
class: nexus.pylon.validators.Md5Validator
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
md5: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'http://libgen.rocks/{key}'
|
||||
timeout: 25.0
|
||||
re: '(?P<key>get\.php\?md5=.*&key=[A-Za-z\d]+)'
|
||||
type: regex
|
||||
resolve_timeout: 25.0
|
||||
url: https://libgen.rocks/ads.php?md5={md5}
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# LibGen.rocks
|
||||
- driver:
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'http://libgen.rocks/{key}'
|
||||
timeout: 25.0
|
||||
re: '(?P<key>get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
|
||||
type: regex
|
||||
url: 'https://libgen.rocks/ads.php?doi={doi}'
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# jamanetwork.com
|
||||
- driver:
|
||||
args:
|
||||
@ -206,6 +196,13 @@ pylon:
|
||||
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# iopscience.iop.org
|
||||
- matcher:
|
||||
doi: ^10.1088/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# iopscience.iop.org
|
||||
- matcher:
|
||||
doi: ^10.1088/.*$
|
||||
resolver:
|
||||
@ -220,10 +217,6 @@ pylon:
|
||||
timeout: 30
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
proxy_list:
|
||||
- [proxy2]
|
||||
- [proxy1]
|
||||
- [proxy3]
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.1093/.*$
|
||||
@ -246,6 +239,13 @@ pylon:
|
||||
args:
|
||||
timeout: 30.0
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# biorxiv.org
|
||||
- matcher:
|
||||
doi: ^10.1101/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# journals.aps.org
|
||||
- matcher:
|
||||
doi: ^10.1103/.*$
|
||||
@ -332,6 +332,13 @@ pylon:
|
||||
args:
|
||||
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.ahajournals.org
|
||||
- matcher:
|
||||
doi: ^10.1161/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# ajp.psychiatryonline.org
|
||||
- matcher:
|
||||
doi: ^10.1176/.*$
|
||||
@ -355,8 +362,6 @@ pylon:
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# journals.plos.org
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.direct.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.1371/.*$
|
||||
@ -364,6 +369,13 @@ pylon:
|
||||
args:
|
||||
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# guilfordjournals.com
|
||||
- matcher:
|
||||
doi: ^10.1521/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# bioone.org
|
||||
- driver:
|
||||
args:
|
||||
@ -396,8 +408,6 @@ pylon:
|
||||
doi: ^10.2139/.*$
|
||||
# www.afghandata.org
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
@ -503,8 +513,6 @@ pylon:
|
||||
doi: ^10.5334/.*$
|
||||
# hess.copernicus.org
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.5194/.*$
|
||||
@ -524,7 +532,6 @@ pylon:
|
||||
- selector: '.uxf-download'
|
||||
type: wait_css_selector
|
||||
- type: click
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.5585/.*
|
||||
@ -539,6 +546,22 @@ pylon:
|
||||
args:
|
||||
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.medwave.cl
|
||||
- driver:
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.5867/.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'https://www.medwave.cl/{path}'
|
||||
timeout: 25.0
|
||||
re: 'href=\"/(?P<path>[\w/.\-_]+\.pdf)\">PDF</a>'
|
||||
type: regex
|
||||
url: https://doi.org/{doi}
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# journal.permsc.ru
|
||||
- driver:
|
||||
args:
|
||||
@ -584,8 +607,6 @@ pylon:
|
||||
doi: ^10.21203/.*$
|
||||
# www.ukm.my/
|
||||
- driver:
|
||||
args:
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.24035/.*$
|
||||
@ -599,6 +620,22 @@ pylon:
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.32920/.*$
|
||||
# PKP Project
|
||||
- driver:
|
||||
class:
|
||||
nexus.pylon.drivers.DirectDriver
|
||||
matcher:
|
||||
doi: ^10.(5399|24905|31004|32729|37934)/.*$
|
||||
resolver:
|
||||
args:
|
||||
extractors:
|
||||
- producer:
|
||||
format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
|
||||
timeout: 25.0
|
||||
re: 'href=\"(?:https?://[\w.]+)/(?P<prefix>[\w./]+)/(?P<journal>[\w.]+)/article/view/(?P<key>\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*</a>'
|
||||
type: regex
|
||||
url: https://doi.org/{doi}
|
||||
class: nexus.pylon.resolvers.RequestResolver
|
||||
# papers.cumincad.org
|
||||
- driver:
|
||||
args:
|
||||
@ -606,11 +643,16 @@ pylon:
|
||||
- selector: 'file.pdf'
|
||||
type: wait_link_text
|
||||
- type: click
|
||||
proxy_list: ~
|
||||
class: nexus.pylon.drivers.BrowserDriver
|
||||
matcher:
|
||||
doi: ^10.52842/.*$
|
||||
# ^.*$
|
||||
- matcher:
|
||||
doi: ^.*$
|
||||
resolver:
|
||||
args:
|
||||
selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
- matcher:
|
||||
doi: ^.*$
|
||||
resolver:
|
||||
|
@ -4,22 +4,22 @@ from typing import (
|
||||
Optional,
|
||||
)
|
||||
|
||||
from izihawa_utils.importlib import import_object
|
||||
from nexus.pylon.network_agent import NetworkAgent
|
||||
from nexus.pylon.prepared_request import PreparedRequest
|
||||
from nexus.pylon.proxy_manager import ProxyManager
|
||||
from nexus.pylon.validators.base import BaseValidator
|
||||
from utils.izihawa_utils.importlib import import_object
|
||||
|
||||
|
||||
class BaseDriver(NetworkAgent):
|
||||
def __init__(
|
||||
self,
|
||||
config,
|
||||
validator=None,
|
||||
downloads_directory: str = '/downloads',
|
||||
proxy_list: Optional[List] = None,
|
||||
proxy_manager: Optional[ProxyManager] = None,
|
||||
):
|
||||
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||
self.config = config
|
||||
|
||||
validator_cls = 'nexus.pylon.validators.PdfValidator'
|
||||
if validator and 'class' in validator:
|
||||
@ -27,7 +27,6 @@ class BaseDriver(NetworkAgent):
|
||||
validator_cls = import_object(validator_cls)
|
||||
|
||||
self.validator = validator_cls
|
||||
self.downloads_directory = downloads_directory
|
||||
|
||||
def __str__(self):
|
||||
return self.__class__.__name__
|
||||
|
@ -32,21 +32,20 @@ from selenium.webdriver.support.ui import WebDriverWait
|
||||
class BrowserDriver(BaseDriver):
|
||||
def __init__(
|
||||
self,
|
||||
config,
|
||||
validator=None,
|
||||
proxy_list: Optional[List] = None,
|
||||
proxy_manager: Optional[ProxyManager] = None,
|
||||
actions: Optional[List] = None,
|
||||
downloads_directory='/downloads',
|
||||
window_size: Tuple[int, int] = (1279, 833),
|
||||
erase_webdrive_property: bool = True,
|
||||
webdrive_hub_endpoint: str = "http://127.0.0.1:4444/wd/hub",
|
||||
):
|
||||
super().__init__(validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||
super().__init__(config=config, validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||
self.actions = actions
|
||||
self.downloads_directory = Path(downloads_directory)
|
||||
self.window_size = window_size
|
||||
self.erase_webdrive_property = erase_webdrive_property
|
||||
self.webdrive_hub_endpoint = webdrive_hub_endpoint
|
||||
self.downloads_directory = Path(config['webdriver_hub']['downloads_directory'])
|
||||
self.host_downloads_directory = Path(config['webdriver_hub']['host_downloads_directory'])
|
||||
self.window_size = tuple(config['webdriver_hub'].get('window_size', [1279, 833]))
|
||||
self.erase_webdriver_property = config['webdriver_hub'].get('erase_webdriver_property', True)
|
||||
self.webdriver_hub_endpoint = config['webdriver_hub']['endpoint']
|
||||
self.file_poll_timeout = 2.0
|
||||
|
||||
async def get_chrome_sessions(self):
|
||||
proxies = list(
|
||||
@ -55,15 +54,14 @@ class BrowserDriver(BaseDriver):
|
||||
else [None]
|
||||
)
|
||||
for proxy in proxies:
|
||||
downloads_folder = self.downloads_directory / random_string(16)
|
||||
os.mkdir(downloads_folder)
|
||||
os.chmod(downloads_folder, 0o777)
|
||||
chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_folder))
|
||||
try:
|
||||
yield chrome, downloads_folder
|
||||
finally:
|
||||
shutil.rmtree(downloads_folder)
|
||||
chrome.quit()
|
||||
subdirectory = random_string(16)
|
||||
downloads_directory = self.downloads_directory / subdirectory
|
||||
host_downloads_directory = self.host_downloads_directory / subdirectory
|
||||
os.mkdir(host_downloads_directory)
|
||||
os.chmod(host_downloads_directory, 0o777)
|
||||
chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_directory))
|
||||
yield chrome, host_downloads_directory
|
||||
|
||||
|
||||
def setup_chrome(self, proxy, downloads_folder):
|
||||
options = webdriver.ChromeOptions()
|
||||
@ -85,13 +83,13 @@ class BrowserDriver(BaseDriver):
|
||||
options.add_argument('--disable-dev-shm-usage')
|
||||
options.add_argument("--disable-popup-blocking")
|
||||
chrome = webdriver.Remote(
|
||||
self.webdrive_hub_endpoint,
|
||||
self.webdriver_hub_endpoint,
|
||||
DesiredCapabilities.CHROME,
|
||||
options=options,
|
||||
)
|
||||
chrome.set_window_size(self.window_size[0], self.window_size[1])
|
||||
|
||||
if self.erase_webdrive_property:
|
||||
if self.erase_webdriver_property:
|
||||
resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id
|
||||
url = chrome.command_executor._url + resource
|
||||
body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': {
|
||||
@ -103,7 +101,7 @@ class BrowserDriver(BaseDriver):
|
||||
}})
|
||||
chrome.command_executor._request('POST', url, body)
|
||||
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'start_chrome',
|
||||
'mode': 'pylon',
|
||||
'proxy': str(proxy) if proxy is not None else None,
|
||||
@ -148,32 +146,19 @@ class BrowserDriver(BaseDriver):
|
||||
and downloaded_offset == current_offset
|
||||
and current_offset > 0
|
||||
):
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'sent',
|
||||
'mode': 'pylon',
|
||||
'filename': filename,
|
||||
})
|
||||
return
|
||||
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'send_part',
|
||||
'mode': 'pylon',
|
||||
'current_offset': current_offset,
|
||||
'downloaded_offset': downloaded_offset,
|
||||
'filename': filename,
|
||||
})
|
||||
await file.seek(current_offset)
|
||||
yield await file.read(downloaded_offset - current_offset)
|
||||
current_offset = downloaded_offset
|
||||
|
||||
await asyncio.sleep(0.5)
|
||||
await asyncio.sleep(self.file_poll_timeout)
|
||||
raise NotFoundError()
|
||||
finally:
|
||||
await file.close()
|
||||
|
||||
def get(self, chrome, url, params):
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'get',
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'download',
|
||||
'mode': 'pylon',
|
||||
'url': url,
|
||||
})
|
||||
@ -190,11 +175,6 @@ class BrowserDriver(BaseDriver):
|
||||
if not last_element:
|
||||
raise RuntimeError('Nothing to click')
|
||||
chrome.execute_script("arguments[0].click();", last_element)
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'clicked',
|
||||
'mode': 'pylon',
|
||||
'element': str(last_element),
|
||||
})
|
||||
case 'close_window':
|
||||
current_window = previous_window
|
||||
previous_window = None
|
||||
@ -204,11 +184,6 @@ class BrowserDriver(BaseDriver):
|
||||
if not last_element:
|
||||
raise RuntimeError('Nothing to click')
|
||||
last_element.click()
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'native_clicked',
|
||||
'mode': 'pylon',
|
||||
'element': str(last_element),
|
||||
})
|
||||
case 'switch_to_new_window':
|
||||
previous_window = current_window
|
||||
current_window = chrome.window_handles[-1]
|
||||
@ -227,12 +202,6 @@ class BrowserDriver(BaseDriver):
|
||||
action['selector'],
|
||||
))
|
||||
)
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'waited_css_selector',
|
||||
'mode': 'pylon',
|
||||
'element': str(last_element),
|
||||
'step': action
|
||||
})
|
||||
case 'wait_link_text':
|
||||
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
|
||||
EC.presence_of_element_located((
|
||||
@ -240,12 +209,6 @@ class BrowserDriver(BaseDriver):
|
||||
action['selector'],
|
||||
))
|
||||
)
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'waited_link_text',
|
||||
'mode': 'pylon',
|
||||
'element': str(last_element),
|
||||
'step': action
|
||||
})
|
||||
case 'wait_xpath':
|
||||
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
|
||||
EC.presence_of_element_located((
|
||||
@ -253,16 +216,10 @@ class BrowserDriver(BaseDriver):
|
||||
action['selector'],
|
||||
))
|
||||
)
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'waited_xpath',
|
||||
'mode': 'pylon',
|
||||
'element': str(last_element),
|
||||
'step': action
|
||||
})
|
||||
case _:
|
||||
raise NotImplementedError('Not implemented action type')
|
||||
except WebDriverException as e:
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'error',
|
||||
'mode': 'pylon',
|
||||
'error': str(e),
|
||||
@ -294,15 +251,17 @@ class BrowserDriver(BaseDriver):
|
||||
source=chrome.current_url,
|
||||
)
|
||||
file_validator.validate()
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'validated',
|
||||
'mode': 'pylon',
|
||||
'url': prepared_file_request.url,
|
||||
})
|
||||
return
|
||||
except NotFoundError:
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'no_response',
|
||||
'mode': 'pylon',
|
||||
})
|
||||
finally:
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'quit_chrome',
|
||||
'mode': 'pylon',
|
||||
})
|
||||
chrome.quit()
|
||||
shutil.rmtree(downloads_folder)
|
||||
raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self))
|
||||
|
@ -1,3 +1,4 @@
|
||||
import logging
|
||||
from typing import Dict
|
||||
|
||||
import aiohttp.client_exceptions
|
||||
@ -25,12 +26,27 @@ class DirectDriver(BaseDriver):
|
||||
@retry(
|
||||
reraise=True,
|
||||
wait=wait_random(min=1, max=2),
|
||||
stop=stop_after_attempt(7),
|
||||
stop=stop_after_attempt(3),
|
||||
retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)),
|
||||
)
|
||||
async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict):
|
||||
logging.debug({
|
||||
'action': 'download',
|
||||
'mode': 'pylon',
|
||||
'params': params,
|
||||
'source': str(self),
|
||||
'url': prepared_file_request.url,
|
||||
})
|
||||
async with self.get_session() as session:
|
||||
async with prepared_file_request.execute_with(session=session) as resp:
|
||||
logging.debug({
|
||||
'action': 'response',
|
||||
'mode': 'pylon',
|
||||
'params': params,
|
||||
'source': str(self),
|
||||
'url': prepared_file_request.url,
|
||||
'status': resp.status,
|
||||
})
|
||||
if resp.status == 404:
|
||||
raise NotFoundError(url=prepared_file_request.url)
|
||||
elif (
|
||||
|
@ -1,5 +1,8 @@
|
||||
import re
|
||||
import sys
|
||||
from typing import (
|
||||
List,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
|
||||
class Matcher:
|
||||
@ -10,8 +13,11 @@ class Matcher:
|
||||
|
||||
def is_match(self, params) -> bool:
|
||||
for param in params:
|
||||
if params[param]:
|
||||
if param_regex := self.param_regexes.get(param):
|
||||
if re.match(param_regex, params[param]):
|
||||
return True
|
||||
return False
|
||||
param_value = params[param]
|
||||
param_regex = self.param_regexes.get(param)
|
||||
if param_value and param_regex:
|
||||
if not isinstance(param_value, (List, Tuple)):
|
||||
param_value = [param_value]
|
||||
for el in param_value:
|
||||
if re.match(param_regex, el):
|
||||
return el
|
||||
|
@ -10,7 +10,7 @@ import aiohttp
|
||||
from aiohttp import ClientSession
|
||||
from aiohttp.client_reqrep import ClientRequest
|
||||
from aiohttp_socks import ProxyConnector
|
||||
from library.aiokit.aiokit import AioThing
|
||||
from aiokit import AioThing
|
||||
from nexus.pylon.proxy_manager import (
|
||||
AllOf,
|
||||
AnyOf,
|
||||
|
@ -228,7 +228,7 @@ class BasePdfProcessor:
|
||||
try:
|
||||
page = self.process_page(page, pdf_reader)
|
||||
except (PdfStreamError, binascii.Error) as e:
|
||||
logging.getLogger('warning').warning({
|
||||
logging.getLogger('nexus_pylon').warning({
|
||||
'action': 'pdf_stream_error',
|
||||
'mode': 'pylon',
|
||||
'error': str(e),
|
||||
@ -259,7 +259,7 @@ class WatermarkEraser1(BaseWatermarkEraser):
|
||||
if self.is_watermark_predicate(text.encode()):
|
||||
xobj_death_note.append(operands[0])
|
||||
operations_death_note.append(op_i)
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'watermark_removal',
|
||||
'mode': 'pylon',
|
||||
'text': text,
|
||||
@ -289,7 +289,7 @@ class WatermarkEraser2(BaseWatermarkEraser):
|
||||
if operation == b"Tj":
|
||||
if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]):
|
||||
operations_death_note.append(op_i)
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'watermark_removal',
|
||||
'mode': 'pylon',
|
||||
'text': operands[0].decode(),
|
||||
@ -319,7 +319,7 @@ class WatermarkEraser3(BaseWatermarkEraser):
|
||||
text += operand
|
||||
if self.is_watermark_predicate(text):
|
||||
operations_death_note.append(op_i)
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'watermark_removal',
|
||||
'mode': 'pylon',
|
||||
'text': text.decode(),
|
||||
@ -402,7 +402,7 @@ class WatermarkEraser4(BaseWatermarkEraser):
|
||||
text, matched = tc.match(self.regexp)
|
||||
if matched:
|
||||
operations_death_note.extend(matched)
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'watermark_removal',
|
||||
'mode': 'pylon',
|
||||
'matched': text,
|
||||
|
@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Optional
|
||||
|
||||
@ -23,6 +24,7 @@ class PreparedRequest:
|
||||
cookies: Optional[dict] = None,
|
||||
ssl: bool = True,
|
||||
timeout: Optional[float] = None,
|
||||
headers_override: bool = False
|
||||
):
|
||||
self.method = method
|
||||
self.url = url
|
||||
@ -32,6 +34,8 @@ class PreparedRequest:
|
||||
}
|
||||
if headers:
|
||||
self.headers.update(headers)
|
||||
if headers_override:
|
||||
self.headers = headers or {}
|
||||
self.params = params
|
||||
self.cookies = cookies
|
||||
self.ssl = ssl
|
||||
@ -49,6 +53,13 @@ class PreparedRequest:
|
||||
@asynccontextmanager
|
||||
async def execute_with(self, session):
|
||||
try:
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'request',
|
||||
'mode': 'pylon',
|
||||
'url': self.url,
|
||||
'method': self.method,
|
||||
'headers': self.headers,
|
||||
})
|
||||
async with session.request(
|
||||
method=self.method,
|
||||
url=self.url,
|
||||
|
@ -54,6 +54,8 @@ class Proxy:
|
||||
|
||||
class ProxyManager:
|
||||
def __init__(self, proxies=None):
|
||||
if proxies is None:
|
||||
proxies = []
|
||||
self.proxies = [Proxy(proxy) for proxy in proxies]
|
||||
|
||||
def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy:
|
||||
|
@ -1,5 +1,6 @@
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from typing import (
|
||||
AsyncIterable,
|
||||
Dict,
|
||||
@ -50,20 +51,25 @@ class DoiOrgRequestResolver(BaseResolver):
|
||||
method='get',
|
||||
url=doi_url,
|
||||
timeout=self.resolve_timeout,
|
||||
headers={'Accept': 'application/json'}
|
||||
headers={
|
||||
'Accept': 'application/json',
|
||||
}
|
||||
).execute_with(session=session) as resp:
|
||||
return await resp.json()
|
||||
|
||||
async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]:
|
||||
body = await self.resolve_through_doi_org(params)
|
||||
selected = None
|
||||
try:
|
||||
selected = json.loads(self.selector.input(body).text())
|
||||
if text := self.selector.input(body).text():
|
||||
selected = json.loads(text)
|
||||
except ValueError as e:
|
||||
logging.getLogger('error').error({
|
||||
logging.getLogger('nexus_pylon').error({
|
||||
'action': 'error',
|
||||
'mode': 'pylon',
|
||||
'params': params,
|
||||
'error': str(e)
|
||||
'error': str(e),
|
||||
'selector': str(self.selector),
|
||||
})
|
||||
return
|
||||
if selected:
|
||||
@ -73,7 +79,7 @@ class DoiOrgRequestResolver(BaseResolver):
|
||||
timeout=self.timeout,
|
||||
)
|
||||
else:
|
||||
logging.getLogger('debug').error({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'missed_selector',
|
||||
'mode': 'pylon',
|
||||
'params': params,
|
||||
|
@ -15,10 +15,12 @@ class RequestResolver(BaseResolver):
|
||||
self,
|
||||
url: str,
|
||||
extractors: List,
|
||||
resolve_timeout: float = 10.0,
|
||||
proxy_list: Optional[List] = None,
|
||||
proxy_manager: Optional[ProxyManager] = None,
|
||||
):
|
||||
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||
self.resolve_timeout = resolve_timeout
|
||||
self.url = url
|
||||
self.extractors = extractors
|
||||
|
||||
@ -31,9 +33,9 @@ class RequestResolver(BaseResolver):
|
||||
async with PreparedRequest(
|
||||
method='get',
|
||||
url=url,
|
||||
timeout=10.0,
|
||||
timeout=self.resolve_timeout,
|
||||
).execute_with(session=session) as resp:
|
||||
# Sometimes sci-hub returns file
|
||||
# Sometimes hosts return file URL
|
||||
if resp.headers.get('Content-Type') == 'application/pdf':
|
||||
yield PreparedRequest(method='get', url=url, timeout=10.0)
|
||||
downloaded_page_bytes = await resp.read()
|
||||
@ -42,9 +44,11 @@ class RequestResolver(BaseResolver):
|
||||
for extractor in self.extractors:
|
||||
match = re.search(extractor['re'], downloaded_page, re.IGNORECASE)
|
||||
if match:
|
||||
matched_group = match.group(extractor['producer']['group'])
|
||||
yield PreparedRequest(
|
||||
method='get',
|
||||
url=extractor['producer']['format_string'].format(matched_group=matched_group),
|
||||
url=extractor['producer']['format_string'].format(
|
||||
host=resp.real_url.host,
|
||||
**match.groupdict()
|
||||
),
|
||||
timeout=extractor['producer'].get('timeout', 10.0),
|
||||
)
|
||||
|
@ -14,19 +14,27 @@ class TemplateResolver(BaseResolver):
|
||||
self,
|
||||
format_string: str = 'https://doi.org/{doi}',
|
||||
timeout: float = 10.0,
|
||||
method: str = 'GET',
|
||||
headers: Optional[dict] = None,
|
||||
headers_override: bool = False,
|
||||
proxy_list: Optional[List] = None,
|
||||
proxy_manager: Optional[ProxyManager] = None,
|
||||
):
|
||||
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||
self.format_string = format_string
|
||||
self.timeout = timeout
|
||||
self.method = method
|
||||
self.headers = headers
|
||||
self.headers_override = headers_override
|
||||
|
||||
def __str__(self):
|
||||
return f'{self.__class__.__name__}({self.format_string})'
|
||||
|
||||
async def resolve(self, params) -> AsyncIterable[PreparedRequest]:
|
||||
yield PreparedRequest(
|
||||
method='GET',
|
||||
method=self.method,
|
||||
url=self.format_string.format(**params),
|
||||
timeout=self.timeout,
|
||||
headers=self.headers,
|
||||
headers_override=self.headers_override,
|
||||
)
|
||||
|
@ -2,12 +2,12 @@ import logging
|
||||
from typing import (
|
||||
AsyncIterable,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
)
|
||||
|
||||
from aiohttp.client_exceptions import ClientPayloadError
|
||||
from library.aiokit.aiokit import AioThing
|
||||
from library.logging import error_log
|
||||
from aiokit import AioThing
|
||||
from izihawa_utils.importlib import import_object
|
||||
from nexus.pylon.drivers.base import BaseDriver
|
||||
from nexus.pylon.exceptions import (
|
||||
DownloadError,
|
||||
@ -16,7 +16,6 @@ from nexus.pylon.exceptions import (
|
||||
from nexus.pylon.matcher import Matcher
|
||||
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
|
||||
from nexus.pylon.resolvers.base import BaseResolver
|
||||
from utils.izihawa_utils.importlib import import_object
|
||||
|
||||
|
||||
class Source(AioThing):
|
||||
@ -29,12 +28,15 @@ class Source(AioThing):
|
||||
@classmethod
|
||||
def from_config(
|
||||
cls,
|
||||
proxy_manager,
|
||||
config,
|
||||
source_config,
|
||||
downloads_directory: str,
|
||||
default_driver_proxy_list: List,
|
||||
default_resolver_proxy_list: List,
|
||||
) -> 'Source':
|
||||
proxy_manager,
|
||||
) -> Optional['Source']:
|
||||
driver_cls_name = source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver')
|
||||
|
||||
if driver_cls_name.endswith('BrowserDriver') and config.get('webdriver_hub') is None:
|
||||
return None
|
||||
|
||||
matcher = Matcher(source_config['matcher'])
|
||||
|
||||
resolver_cls = import_object(
|
||||
@ -42,16 +44,16 @@ class Source(AioThing):
|
||||
)
|
||||
resolver_args = dict(
|
||||
proxy_manager=proxy_manager,
|
||||
proxy_list=default_resolver_proxy_list,
|
||||
proxy_list=config['default_resolver_proxy_list'],
|
||||
)
|
||||
resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
|
||||
resolver = resolver_cls(**resolver_args)
|
||||
|
||||
driver_cls = import_object(source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver'))
|
||||
driver_cls = import_object(driver_cls_name)
|
||||
driver_args = dict(
|
||||
proxy_manager=proxy_manager,
|
||||
downloads_directory=downloads_directory,
|
||||
proxy_list=default_driver_proxy_list,
|
||||
proxy_list=config['default_driver_proxy_list'],
|
||||
config=config,
|
||||
)
|
||||
driver_args.update(**source_config.get('driver', {}).get('args', {}))
|
||||
driver = driver_cls(**driver_args)
|
||||
@ -67,13 +69,6 @@ class Source(AioThing):
|
||||
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
|
||||
yield FileResponsePb(status=FileResponsePb.Status.RESOLVING)
|
||||
async for prepared_file_request in self.resolver.resolve(params):
|
||||
logging.debug({
|
||||
'action': 'download',
|
||||
'mode': 'pylon',
|
||||
'params': params,
|
||||
'source': str(self),
|
||||
'url': prepared_file_request.url,
|
||||
})
|
||||
try:
|
||||
async for resp in self.driver.execute_prepared_file_request(
|
||||
prepared_file_request=prepared_file_request,
|
||||
@ -82,11 +77,11 @@ class Source(AioThing):
|
||||
yield resp
|
||||
return
|
||||
except ClientPayloadError as e:
|
||||
error_log(e, level=logging.WARNING)
|
||||
logging.getLogger('nexus_pylon').warning(e)
|
||||
continue
|
||||
except NotFoundError:
|
||||
continue
|
||||
except DownloadError as e:
|
||||
error_log(e)
|
||||
logging.getLogger('nexus_pylon').warning(e)
|
||||
continue
|
||||
raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver))
|
||||
|
@ -1,4 +1,5 @@
|
||||
from .base import BaseValidator
|
||||
from .md5 import Md5Validator
|
||||
from .pdf import PdfValidator
|
||||
|
||||
__all__ = ['Md5Validator', 'PdfValidator']
|
||||
__all__ = ['BaseValidator', 'Md5Validator', 'PdfValidator']
|
||||
|
@ -1,4 +1,10 @@
|
||||
from typing import Dict
|
||||
|
||||
|
||||
class BaseValidator:
|
||||
def __init__(self, params: Dict):
|
||||
self.params = params
|
||||
|
||||
def update(self, chunk):
|
||||
pass
|
||||
|
||||
|
@ -7,6 +7,7 @@ from nexus.pylon.validators.base import BaseValidator
|
||||
|
||||
class Md5Validator(BaseValidator):
|
||||
def __init__(self, params: Dict):
|
||||
super().__init__(params)
|
||||
self.md5 = params['md5']
|
||||
self.v = hashlib.md5()
|
||||
|
||||
|
@ -12,7 +12,7 @@ from PyPDF2.errors import PdfReadError
|
||||
|
||||
class PdfValidator(BaseValidator):
|
||||
def __init__(self, params: Dict):
|
||||
self.params = params
|
||||
super().__init__(params)
|
||||
self.md5 = params.get('md5')
|
||||
self.file = bytes()
|
||||
self.v = hashlib.md5()
|
||||
@ -24,7 +24,7 @@ class PdfValidator(BaseValidator):
|
||||
|
||||
def validate(self):
|
||||
if self.md5 and self.md5.lower() == self.v.hexdigest().lower():
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'validation',
|
||||
'mode': 'pylon',
|
||||
'result': 'md5_ok',
|
||||
@ -32,7 +32,7 @@ class PdfValidator(BaseValidator):
|
||||
})
|
||||
return
|
||||
elif not is_pdf(f=self.file):
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'validation',
|
||||
'mode': 'pylon',
|
||||
'result': 'not_pdf',
|
||||
@ -41,28 +41,18 @@ class PdfValidator(BaseValidator):
|
||||
raise BadResponseError(file=str(self.file[:100]))
|
||||
|
||||
try:
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'open_pdf',
|
||||
'mode': 'pylon',
|
||||
'file_len': len(self.file),
|
||||
'params': self.params,
|
||||
})
|
||||
PyPDF2.PdfReader(BytesIO(self.file))
|
||||
logging.getLogger('debug').debug({
|
||||
'action': 'opened_pdf',
|
||||
'mode': 'pylon',
|
||||
'file_len': len(self.file),
|
||||
'params': self.params,
|
||||
})
|
||||
except PdfReadError:
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'validation',
|
||||
'mode': 'pylon',
|
||||
'result': 'not_opened_as_pdf',
|
||||
'params': self.params,
|
||||
})
|
||||
raise BadResponseError(file=str(self.file[:100]))
|
||||
logging.getLogger('debug').debug({
|
||||
logging.getLogger('nexus_pylon').debug({
|
||||
'action': 'validation',
|
||||
'mode': 'pylon',
|
||||
'result': 'ok',
|
||||
'params': self.params,
|
||||
})
|
||||
|
@ -10,6 +10,6 @@ py_library(
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
"//library/configurator",
|
||||
requirement("izihawa_configurator"),
|
||||
],
|
||||
)
|
||||
|
@ -1,4 +1,4 @@
|
||||
from library.configurator import Configurator
|
||||
from izihawa_configurator import Configurator
|
||||
|
||||
|
||||
def get_translations():
|
||||
|
@ -33,7 +33,10 @@ class ProgressBar:
|
||||
tail_text,
|
||||
message=None,
|
||||
source=None,
|
||||
throttle_secs: float = 0,
|
||||
throttle_secs: float = 0.0,
|
||||
hard_throttle_secs: float = 10.0,
|
||||
last_call: float = 0.0,
|
||||
done_threshold_size: int = 10 * 1024 * 1024,
|
||||
):
|
||||
self.telegram_client = telegram_client
|
||||
self.request_context = request_context
|
||||
@ -45,9 +48,12 @@ class ProgressBar:
|
||||
self.done = 0
|
||||
self.total = 1
|
||||
self.throttle_secs = throttle_secs
|
||||
self.hard_throttle_secs = hard_throttle_secs
|
||||
self.done_threshold_size = done_threshold_size
|
||||
|
||||
self.previous_done = 0
|
||||
self.last_text = None
|
||||
self.last_call = 0
|
||||
self.last_call = last_call
|
||||
|
||||
def share(self):
|
||||
if self.total > 0:
|
||||
@ -56,6 +62,7 @@ class ProgressBar:
|
||||
return f'{float(self.done / (1024 * 1024)):.1f}Mb'
|
||||
|
||||
def _set_progress(self, done, total):
|
||||
self.previous_done = self.done
|
||||
self.done = done
|
||||
self.total = total
|
||||
|
||||
@ -74,11 +81,20 @@ class ProgressBar:
|
||||
progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| '
|
||||
|
||||
tail_text = self.tail_text.format(source=self.source)
|
||||
return f'`{self.header}\n{progress_bar}{self.share()} {tail_text}`'
|
||||
return f'`{self.header}\n{progress_bar}{self.share().ljust(8)} {tail_text}`'
|
||||
|
||||
def should_send(self, now, ignore_last_call):
|
||||
if ignore_last_call:
|
||||
return True
|
||||
if abs(now - self.last_call) > self.hard_throttle_secs:
|
||||
return True
|
||||
if abs(now - self.last_call) > self.throttle_secs and (self.done - self.previous_done) < self.done_threshold_size:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def send_message(self, text, ignore_last_call=False):
|
||||
now = time.time()
|
||||
if not ignore_last_call and abs(now - self.last_call) < self.throttle_secs:
|
||||
if not self.should_send(now, ignore_last_call):
|
||||
return
|
||||
try:
|
||||
if not self.message:
|
||||
@ -103,17 +119,3 @@ class ProgressBar:
|
||||
async def callback(self, done, total, ignore_last_call=False):
|
||||
self._set_progress(done, total)
|
||||
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
|
||||
|
||||
|
||||
class ThrottlerWrapper:
|
||||
def __init__(self, callback: Callable, throttle_secs: Union[int, float]):
|
||||
self.callback = callback
|
||||
self.last_call = 0
|
||||
self.throttle_secs = throttle_secs
|
||||
|
||||
async def __call__(self, *args, **kwargs):
|
||||
now = time.time()
|
||||
if abs(now - self.last_call) < self.throttle_secs:
|
||||
return
|
||||
self.last_call = now
|
||||
return await self.callback(*args, **kwargs)
|
||||
|
@ -63,7 +63,6 @@ class ScimagViewBuilder(BaseViewBuilder):
|
||||
'chapter': '🔖',
|
||||
'book-chapter': '🔖',
|
||||
}
|
||||
multihash_ix = 0
|
||||
|
||||
def is_preprint(self):
|
||||
return self.document_holder.doi.split('/')[0] in preprints
|
||||
|
@ -13,7 +13,7 @@ aiohttp-socks==0.7.1
|
||||
aiokafka==0.7.2
|
||||
aiokit==1.1.2
|
||||
aiosignal==1.2.0
|
||||
aiosumma==2.8.13
|
||||
aiosumma==2.10.4
|
||||
asn1crypto==1.5.1
|
||||
async-generator==1.10
|
||||
async-timeout==4.0.2
|
||||
@ -55,7 +55,7 @@ h11==0.13.0
|
||||
idna==3.3
|
||||
iniconfig==1.1.1
|
||||
isort==5.10.1
|
||||
izihawa-nlptools==1.1.7
|
||||
izihawa-nlptools==1.1.9
|
||||
izihawa-types==0.1.3
|
||||
izihawa-utils==1.0.7
|
||||
Jinja2==3.1.2
|
||||
|
Loading…
Reference in New Issue
Block a user