mirror of
https://github.com/nexus-stc/hyperboria
synced 2025-01-11 11:16:10 +01:00
- [nexus] Development
- [nexus] Development GitOrigin-RevId: 5d5feedff7b70be4c788abeb22f89c6758431d33
This commit is contained in:
parent
73f5fd4a06
commit
a683e0ce18
@ -43,7 +43,7 @@ py3_image(
|
|||||||
"//library/aiogrpctools",
|
"//library/aiogrpctools",
|
||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
"//library/aiopostgres",
|
"//library/aiopostgres",
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/telegram",
|
"//library/telegram",
|
||||||
requirement("izihawa_utils"),
|
requirement("izihawa_utils"),
|
||||||
],
|
],
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
from library.configurator import Configurator
|
from izihawa_configurator import Configurator
|
||||||
|
from izihawa_utils import env
|
||||||
|
|
||||||
|
|
||||||
def get_config():
|
def get_config():
|
||||||
return Configurator([
|
return Configurator([
|
||||||
'idm/api/configs/base.yaml',
|
'idm/api/configs/base.yaml',
|
||||||
|
'idm/api/configs/%s.yaml?' % env.type,
|
||||||
'idm/api/configs/logging.yaml',
|
'idm/api/configs/logging.yaml',
|
||||||
], env_prefix='NEXUS_IDM_API')
|
], env_prefix='IDM_API')
|
||||||
|
|
||||||
|
|
||||||
config = get_config()
|
config = get_config()
|
||||||
|
@ -7,9 +7,9 @@ from idm.api.configs import get_config
|
|||||||
from idm.api.services.chat_manager import ChatManagerService
|
from idm.api.services.chat_manager import ChatManagerService
|
||||||
from idm.api.services.profile import ProfileService
|
from idm.api.services.profile import ProfileService
|
||||||
from idm.api.services.subscription_manager import SubscriptionManagerService
|
from idm.api.services.subscription_manager import SubscriptionManagerService
|
||||||
|
from izihawa_configurator import Configurator
|
||||||
from library.aiogrpctools import AioGrpcServer
|
from library.aiogrpctools import AioGrpcServer
|
||||||
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
||||||
from library.configurator import Configurator
|
|
||||||
from library.logging import configure_logging
|
from library.logging import configure_logging
|
||||||
|
|
||||||
|
|
||||||
|
@ -105,12 +105,16 @@ class ProfileService(profile_service_pb2_grpc.ProfileServicer, BaseService):
|
|||||||
for tag in download_document.tags:
|
for tag in download_document.tags:
|
||||||
tags_counter[tag] += 1
|
tags_counter[tag] += 1
|
||||||
|
|
||||||
most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:7]
|
most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:14]
|
||||||
most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7]
|
most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7]
|
||||||
|
|
||||||
most_popular_series = []
|
most_popular_series = []
|
||||||
|
if most_popular_issns:
|
||||||
async for row in self.application.pool_holder['nexus'].iterate(
|
async for row in self.application.pool_holder['nexus'].iterate(
|
||||||
f"select name, issns from series where issns && array[{most_popular_issns}]::text[]".format(
|
"select name, array_agg(issn) as issns from series "
|
||||||
|
"where issn in ({most_popular_issns}) "
|
||||||
|
"group by name order by name "
|
||||||
|
"limit 7".format(
|
||||||
most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
|
most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
|
||||||
),
|
),
|
||||||
row_factory=dict_row,
|
row_factory=dict_row,
|
||||||
|
@ -13,7 +13,7 @@ py_library(
|
|||||||
requirement("grpcio"),
|
requirement("grpcio"),
|
||||||
requirement("pyyaml"),
|
requirement("pyyaml"),
|
||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/logging",
|
"//library/logging",
|
||||||
requirement("izihawa_utils"),
|
requirement("izihawa_utils"),
|
||||||
],
|
],
|
||||||
|
@ -92,6 +92,7 @@ class AioPostgresPoolHolder(AioThing):
|
|||||||
row_factory=tuple_row,
|
row_factory=tuple_row,
|
||||||
cursor_name: Optional[str] = None,
|
cursor_name: Optional[str] = None,
|
||||||
itersize: Optional[int] = None,
|
itersize: Optional[int] = None,
|
||||||
|
statement_timeout: Optional[int] = None,
|
||||||
):
|
):
|
||||||
if not self.pool:
|
if not self.pool:
|
||||||
raise RuntimeError('AioPostgresPoolHolder has not been started')
|
raise RuntimeError('AioPostgresPoolHolder has not been started')
|
||||||
@ -99,7 +100,9 @@ class AioPostgresPoolHolder(AioThing):
|
|||||||
async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur:
|
async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur:
|
||||||
if itersize is not None:
|
if itersize is not None:
|
||||||
cur.itersize = itersize
|
cur.itersize = itersize
|
||||||
await cur.execute(stmt, values)
|
await cur.execute(stmt + ';' if statement_timeout else '', values)
|
||||||
|
if statement_timeout:
|
||||||
|
await cur.execute(f'SET statement_timeout = {statement_timeout};')
|
||||||
async for row in cur:
|
async for row in cur:
|
||||||
yield row
|
yield row
|
||||||
|
|
||||||
|
@ -1,17 +0,0 @@
|
|||||||
load("@pip_modules//:requirements.bzl", "requirement")
|
|
||||||
load("@rules_python//python:defs.bzl", "py_library")
|
|
||||||
|
|
||||||
py_library(
|
|
||||||
name = "configurator",
|
|
||||||
srcs = glob(
|
|
||||||
["**/*.py"],
|
|
||||||
exclude = ["tests/**"],
|
|
||||||
),
|
|
||||||
srcs_version = "PY3",
|
|
||||||
visibility = ["//visibility:public"],
|
|
||||||
deps = [
|
|
||||||
requirement("jinja2"),
|
|
||||||
requirement("pyyaml"),
|
|
||||||
requirement("izihawa_utils"),
|
|
||||||
],
|
|
||||||
)
|
|
@ -1,170 +0,0 @@
|
|||||||
import json
|
|
||||||
import os
|
|
||||||
import os.path
|
|
||||||
from types import ModuleType
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
from izihawa_utils.common import (
|
|
||||||
smart_merge_dicts,
|
|
||||||
unflatten,
|
|
||||||
)
|
|
||||||
from jinja2 import Template
|
|
||||||
from library.configurator.exceptions import UnknownConfigFormatError
|
|
||||||
|
|
||||||
|
|
||||||
class ConfigObject(dict):
    """Dictionary whose keys can also be read as attributes (``obj.key``)."""

    def __getattr__(self, name):
        """Return ``self[name]``.

        Python calls this only after normal attribute lookup has failed.

        :param name: attribute name, used as the dictionary key
        :raises AttributeError: if ``name`` is not a key, so that ``getattr``
            with a default and ``hasattr`` behave as usual
        """
        try:
            return self[name]
        except KeyError as e:
            # Chain explicitly so the traceback shows the failed key lookup as the cause.
            raise AttributeError(name) from e
|
|
||||||
|
|
||||||
|
|
||||||
class AnyOf:
    """Group of alternative config sources.

    ``Configurator.update`` tries the members in the order given and uses the
    first one that loads.
    """

    def __init__(self, *args):
        # Kept as the tuple of positional arguments, in call order.
        self.args = args
|
|
||||||
|
|
||||||
|
|
||||||
class RichDict(dict):
    """``dict`` with helpers for probing and copying values under nested keys."""

    def _descend(self, keys):
        """Follow ``keys`` one level at a time, starting from ``self``.

        :param keys: iterable of keys, outermost first
        :return: ``(True, value)`` when every key was found,
            otherwise ``(False, None)``
        """
        current = self
        for key in keys:
            try:
                if key not in current:
                    return False, None
                current = current[key]
            except TypeError:
                # The path runs through a value that cannot be searched or
                # indexed with ``key`` (e.g. an int, or a str indexed by a
                # str): treat it as "path does not exist" instead of crashing.
                return False, None
        return True, current

    def has(self, *args):
        """Tell whether the nested path ``self[args[0]][args[1]]...`` exists.

        :param args: keys of the path, outermost first
        :return: ``True`` if every key is present (also when no keys are given),
            ``False`` otherwise
        """
        found, _ = self._descend(args)
        return found

    def copy_if_exists(self, source_keys, target_key):
        """Copy the value at the nested path ``source_keys`` to ``self[target_key]``.

        The value is assigned as-is (not deep-copied).

        :param source_keys: iterable of keys, outermost first
        :param target_key: top-level key that receives the value
        :return: ``True`` if the path existed and the value was stored,
            ``False`` (and nothing is changed) otherwise
        """
        found, value = self._descend(source_keys)
        if found:
            self[target_key] = value
        return found
|
|
||||||
|
|
||||||
|
|
||||||
class Configurator(RichDict):
    """Dictionary of settings merged from files, dicts, modules, callables and environment variables."""

    def __init__(self, configs: list, env_prefix: str = None, env_key_separator: str = '.'):
        """
        Create Configurator object

        Sources are merged in this order: ``os.environ``, then every item of
        ``configs``, then the variables selected by ``env_prefix``.

        :param configs: list of paths to config files, dicts or modules.
            End filepath with `?` to mark it as optional config.
        :param env_prefix: case-insensitive prefix selecting environment
            variables; the prefix and any following underscores are removed
            from the name. A name ending in `[]` collects its values into a list.
        :param env_key_separator: separator handed to `unflatten` for the
            selected variable names (presumably splits a name into nested
            keys - confirm against `izihawa_utils.common.unflatten`).
        """
        super().__init__()

        self._by_basenames = {}
        self._omitted_files = []

        env_dict = {}

        if env_prefix:
            env_prefix = env_prefix.lower()
            for name, value in os.environ.items():
                if name.lower().startswith(env_prefix):
                    stripped_name = name[len(env_prefix):].lstrip('_')
                    if stripped_name[-2:] == '[]':
                        # The list lives under the name without the `[]` suffix.
                        # setdefault keeps values already collected; the former
                        # guard tested the suffixed name, which is never a key,
                        # and so reset the list on every matching variable.
                        env_dict.setdefault(stripped_name[:-2], []).append(value)
                    else:
                        env_dict[stripped_name] = value
            env_dict = unflatten(env_dict, sep=env_key_separator)

        # Unpacking accepts any iterable of configs, not only a list.
        for config in [os.environ, *configs, env_dict]:
            file_found = self.update(config)
            if not file_found:
                self._omitted_files.append(config)

    def _config_filename(self, filename):
        """Return ``filename`` joined onto the current working directory."""
        return os.path.join(os.getcwd(), filename)

    def walk_and_render(self, c):
        """Render every string inside ``c`` as a Jinja2 template.

        The current contents of this configurator are the template variables.
        Lists are rebuilt; dicts are modified in place. For a dict key ending
        in `_filepath`, the rendered value is opened as a file and, if it ends
        in `.json` or `.yaml`, the parsed content is stored under the key with
        `_filepath` removed.

        :param c: string, list, dict or any other value (returned unchanged)
        :return: the rendered value
        """
        if isinstance(c, str):
            return Template(c).render(**self)
        elif isinstance(c, list):
            return [self.walk_and_render(e) for e in c]
        elif isinstance(c, dict):
            # Iterate over a snapshot: new keys may be inserted below.
            for key in list(c.keys()):
                c[key] = self.walk_and_render(c[key])
                if key.endswith('_filepath'):
                    with open(c[key]) as f:
                        if c[key].endswith('.json'):
                            c[key.replace('_filepath', '')] = json.loads(f.read())
                        elif c[key].endswith('.yaml'):
                            c[key.replace('_filepath', '')] = yaml.safe_load(f.read())
        return c

    def update(self, new_config, basename=None, **kwargs):
        """Merge one config source into this configurator.

        NOTE: this replaces ``dict.update`` with a different contract.

        :param new_config: an `AnyOf` (members are tried in order, the first
            that loads wins), a path to a `.json`/`.yaml` file (a trailing `?`
            marks it optional), a module (its ``__dict__`` is used), a callable
            (called with this configurator) or a mapping
        :param basename: key under which the loaded config is remembered;
            defaults to the file's basename for path sources
        :param kwargs: accepted but not used
        :return: ``False`` if an optional file is missing or unreadable,
            ``True`` otherwise
        :raises IOError: if a required file is missing or unreadable, or no
            member of an `AnyOf` could be loaded
        :raises UnknownConfigFormatError: if a file is neither `.json` nor `.yaml`
        """
        if isinstance(new_config, AnyOf):
            for config in new_config.args:
                try:
                    # `?` is stripped so a missing member raises and the next one is tried.
                    return self.update(config.rstrip('?'))
                except IOError:
                    pass
            raise IOError('None of %s was found' % ', '.join(new_config.args))
        elif isinstance(new_config, str):
            optional = new_config.endswith('?')
            filename = new_config.rstrip('?')
            basename = basename or os.path.basename(filename)

            config_filename = self._config_filename(filename)

            data = None

            if os.path.exists(config_filename) and os.access(config_filename, os.R_OK):
                with open(config_filename) as f:
                    data = f.read()

            if data is None:
                if optional:
                    return False
                else:
                    raise IOError(f'File {config_filename} not found')

            if filename.endswith('.json'):
                new_config = json.loads(data)
            elif filename.endswith('.yaml'):
                new_config = yaml.safe_load(data)
            else:
                raise UnknownConfigFormatError(filename)

            new_config = self.walk_and_render(new_config)

        elif isinstance(new_config, ModuleType):
            new_config = new_config.__dict__

        elif callable(new_config):
            new_config = new_config(self)

        if not new_config:
            new_config = {}

        # Callable top-level values are replaced by their result for this configurator.
        for k in new_config:
            if callable(new_config[k]):
                new_config[k] = new_config[k](context=self)

        if 'log_path' in new_config:
            new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/')

        smart_merge_dicts(self, new_config, list_policy='override', copy=False)
        if basename:
            self._by_basenames[basename] = new_config

        return True

    def get_config_by_basename(self, basename):
        """Return the config remembered under ``basename`` (``KeyError`` if absent)."""
        return self._by_basenames[basename]

    def get_object_by_basename(self, basename):
        """Return the config remembered under ``basename`` wrapped in a `ConfigObject`."""
        return ConfigObject(self._by_basenames[basename])

    def has_missed_configs(self):
        """Tell whether any source passed to the constructor was omitted (``update`` returned a falsy value)."""
        return bool(self._omitted_files)

    def has_file(self, basename):
        """Tell whether a config is remembered under ``basename``."""
        return basename in self._by_basenames

    def get_files(self):
        """Return the internal mapping of basename to loaded config (not a copy)."""
        return self._by_basenames
|
|
@ -1,2 +0,0 @@
|
|||||||
class UnknownConfigFormatError(Exception):
    """Raised by ``Configurator.update`` for a config file that is neither `.json` nor `.yaml`."""
|
|
@ -159,7 +159,6 @@ class ToSummaAction(BaseAction):
|
|||||||
'journal',
|
'journal',
|
||||||
'journal-issue',
|
'journal-issue',
|
||||||
'journal-volume',
|
'journal-volume',
|
||||||
'other',
|
|
||||||
'peer-review',
|
'peer-review',
|
||||||
'proceedings',
|
'proceedings',
|
||||||
'report-series',
|
'report-series',
|
||||||
|
@ -35,7 +35,7 @@ py3_image(
|
|||||||
requirement("aiobaseclient"),
|
requirement("aiobaseclient"),
|
||||||
requirement("aiocrossref"),
|
requirement("aiocrossref"),
|
||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/logging",
|
"//library/logging",
|
||||||
"//library/telegram",
|
"//library/telegram",
|
||||||
"//nexus/hub/aioclient",
|
"//nexus/hub/aioclient",
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
|
from izihawa_configurator import Configurator
|
||||||
from izihawa_utils import env
|
from izihawa_utils import env
|
||||||
from library.configurator import Configurator
|
|
||||||
|
|
||||||
|
|
||||||
def get_config():
|
def get_config():
|
||||||
|
@ -104,7 +104,7 @@ class ViewHandler(BaseHandler):
|
|||||||
),
|
),
|
||||||
event.delete(),
|
event.delete(),
|
||||||
]
|
]
|
||||||
if not has_found_old_widget:
|
if not has_found_old_widget and is_earlier_than_2_days(old_message):
|
||||||
async with safe_execution(error_log=request_context.error_log):
|
async with safe_execution(error_log=request_context.error_log):
|
||||||
await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id])
|
await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id])
|
||||||
return await asyncio.gather(*actions)
|
return await asyncio.gather(*actions)
|
||||||
|
@ -189,6 +189,13 @@ schema:
|
|||||||
record: basic
|
record: basic
|
||||||
tokenizer: raw
|
tokenizer: raw
|
||||||
stored: true
|
stored: true
|
||||||
|
- name: series_page_rank
|
||||||
|
type: f64
|
||||||
|
options:
|
||||||
|
fast: single
|
||||||
|
fieldnorms: false
|
||||||
|
indexed: true
|
||||||
|
stored: true
|
||||||
multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"]
|
multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"]
|
||||||
primary_key: "id"
|
primary_key: "id"
|
||||||
stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or',
|
stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or',
|
||||||
|
@ -43,7 +43,7 @@ py3_image(
|
|||||||
requirement("aioipfs-2"),
|
requirement("aioipfs-2"),
|
||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
"//library/aiopostgres",
|
"//library/aiopostgres",
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/telegram",
|
"//library/telegram",
|
||||||
"//nexus/hub/proto:grpc_py",
|
"//nexus/hub/proto:grpc_py",
|
||||||
"//nexus/hub/proto:proto_py",
|
"//nexus/hub/proto:proto_py",
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
|
from izihawa_configurator import Configurator
|
||||||
from izihawa_utils import env
|
from izihawa_utils import env
|
||||||
from library.configurator import Configurator
|
|
||||||
|
|
||||||
|
|
||||||
def get_config():
|
def get_config():
|
||||||
|
@ -1,26 +1,8 @@
|
|||||||
---
|
---
|
||||||
pylon:
|
pylon:
|
||||||
default_driver_proxy_list:
|
|
||||||
- [cambridge]
|
|
||||||
- [edinburg]
|
|
||||||
- [southampton]
|
|
||||||
default_resolver_proxy_list: ~
|
default_resolver_proxy_list: ~
|
||||||
downloads_directory: /downloads
|
|
||||||
proxies:
|
|
||||||
- address: clash.default.svc.cluster.example.com:7890
|
|
||||||
name: cambridge
|
|
||||||
tags: ['cambridge']
|
|
||||||
- address: clash.default.svc.cluster.example.com:7990
|
|
||||||
name: edinburg
|
|
||||||
tags: ['edinburg']
|
|
||||||
- address: clash.default.svc.cluster.example.com:8090
|
|
||||||
name: southampton
|
|
||||||
tags: ['southampton']
|
|
||||||
- address: socks5://clash.default.svc.cluster.example.com:7991
|
|
||||||
name: socks5
|
|
||||||
tags: ['socks5']
|
|
||||||
sources:
|
sources:
|
||||||
# LibGen.rocks
|
# IPFS
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
proxy_list: ~
|
proxy_list: ~
|
||||||
@ -29,37 +11,13 @@ pylon:
|
|||||||
class:
|
class:
|
||||||
nexus.pylon.drivers.DirectDriver
|
nexus.pylon.drivers.DirectDriver
|
||||||
matcher:
|
matcher:
|
||||||
md5: ^.*$
|
ipfs_multihashes: ^.*$
|
||||||
resolver:
|
resolver:
|
||||||
args:
|
args:
|
||||||
extractors:
|
format_string: 'http://nexus-ipfs-headless.default.svc.cluster.example.com:5001/api/v0/cat?arg={ipfs_multihashes[0]}'
|
||||||
- producer:
|
headers_override: true
|
||||||
format_string: 'http://libgen.rocks/{matched_group}'
|
method: 'POST'
|
||||||
group: 0
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
|
|
||||||
timeout: 25.0
|
|
||||||
type: regex
|
|
||||||
url: https://libgen.rocks/ads.php?md5={md5}
|
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
|
||||||
# LibGen.rocks
|
|
||||||
- driver:
|
|
||||||
args:
|
|
||||||
proxy_list: ~
|
|
||||||
class:
|
|
||||||
nexus.pylon.drivers.DirectDriver
|
|
||||||
matcher:
|
|
||||||
doi: ^.*$
|
|
||||||
resolver:
|
|
||||||
args:
|
|
||||||
extractors:
|
|
||||||
- producer:
|
|
||||||
format_string: 'http://libgen.rocks/{matched_group}'
|
|
||||||
group: 0
|
|
||||||
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
|
|
||||||
timeout: 25.0
|
|
||||||
type: regex
|
|
||||||
url: 'https://libgen.rocks/ads.php?doi={doi}'
|
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
|
||||||
# Library.lol
|
# Library.lol
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -74,20 +32,17 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
extractors:
|
extractors:
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{href}'
|
||||||
group: 1
|
|
||||||
re: '<a href="([^\"]+)">GET</a>'
|
|
||||||
timeout: 45.0
|
timeout: 45.0
|
||||||
|
re: '<a href="(?P<href>[^\"]+)">GET</a>'
|
||||||
type: regex
|
type: regex
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{url}'
|
||||||
group: 0
|
re: '(?P<url>https://ipfs.io/ipfs/[A-Za-z\d]+)'
|
||||||
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
|
|
||||||
type: regex
|
type: regex
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{url}'
|
||||||
group: 0
|
re: '(?P<url>https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
|
||||||
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
|
|
||||||
type: regex
|
type: regex
|
||||||
url: http://library.lol/main/{md5}
|
url: http://library.lol/main/{md5}
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
@ -103,13 +58,51 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
extractors:
|
extractors:
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{href}'
|
||||||
group: 1
|
|
||||||
re: '<a href="([^\"]+)">GET</a>'
|
|
||||||
timeout: 45.0
|
timeout: 45.0
|
||||||
|
re: '<a href="(?P<href>[^\"]+)">GET</a>'
|
||||||
type: regex
|
type: regex
|
||||||
url: 'http://library.lol/scimag/{doi}'
|
url: 'http://library.lol/scimag/{doi}'
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
|
# LibGen.rocks (lookup by MD5)
|
||||||
|
- driver:
|
||||||
|
args:
|
||||||
|
proxy_list: ~
|
||||||
|
validator:
|
||||||
|
class: nexus.pylon.validators.Md5Validator
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
md5: ^.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'http://libgen.rocks/{key}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: '(?P<key>get\.php\?md5=.*&key=[A-Za-z\d]+)'
|
||||||
|
type: regex
|
||||||
|
resolve_timeout: 25.0
|
||||||
|
url: https://libgen.rocks/ads.php?md5={md5}
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
|
# LibGen.rocks (lookup by DOI)
|
||||||
|
- driver:
|
||||||
|
args:
|
||||||
|
proxy_list: ~
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
doi: ^.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'http://libgen.rocks/{key}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: '(?P<key>get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
|
||||||
|
type: regex
|
||||||
|
url: 'https://libgen.rocks/ads.php?doi={doi}'
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
# jamanetwork.com
|
# jamanetwork.com
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -142,6 +135,7 @@ pylon:
|
|||||||
resolver:
|
resolver:
|
||||||
args:
|
args:
|
||||||
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
|
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
|
||||||
|
resolve_timeout: 25.0
|
||||||
selector: '(.resource.primary.URL | split("/"))[-1]'
|
selector: '(.resource.primary.URL | split("/"))[-1]'
|
||||||
timeout: 40.0
|
timeout: 40.0
|
||||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||||
@ -209,6 +203,13 @@ pylon:
|
|||||||
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
|
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# iopscience.iop.org
|
# iopscience.iop.org
|
||||||
|
- matcher:
|
||||||
|
doi: ^10.1088/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
|
||||||
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# iopscience.iop.org
|
||||||
- matcher:
|
- matcher:
|
||||||
doi: ^10.1088/.*$
|
doi: ^10.1088/.*$
|
||||||
resolver:
|
resolver:
|
||||||
@ -249,6 +250,13 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
timeout: 30.0
|
timeout: 30.0
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# biorxiv.org
|
||||||
|
- matcher:
|
||||||
|
doi: ^10.1101/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
|
||||||
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# journals.aps.org
|
# journals.aps.org
|
||||||
- matcher:
|
- matcher:
|
||||||
doi: ^10.1103/.*$
|
doi: ^10.1103/.*$
|
||||||
@ -374,6 +382,13 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
|
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# guilfordjournals.com
|
||||||
|
- matcher:
|
||||||
|
doi: ^10.1521/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
|
||||||
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# bioone.org
|
# bioone.org
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -549,6 +564,24 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
|
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# www.medwave.cl
|
||||||
|
- driver:
|
||||||
|
args:
|
||||||
|
proxy_list: ~
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
doi: ^10.5867/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'https://www.medwave.cl/{path}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: 'href=\"/(?P<path>[\w/.\-_]+\.pdf)\">PDF</a>'
|
||||||
|
type: regex
|
||||||
|
url: https://doi.org/{doi}
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
# journal.permsc.ru
|
# journal.permsc.ru
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -609,6 +642,24 @@ pylon:
|
|||||||
class: nexus.pylon.drivers.BrowserDriver
|
class: nexus.pylon.drivers.BrowserDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.32920/.*$
|
doi: ^10.32920/.*$
|
||||||
|
# PKP Project
|
||||||
|
- driver:
|
||||||
|
args:
|
||||||
|
proxy_list: ~
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
doi: ^10.(5399|24905|31004|32729|37934)/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: 'href=\"(?:https?://[\w.]+)/(?P<prefix>[\w./]+)/(?P<journal>[\w.]+)/article/view/(?P<key>\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*</a>'
|
||||||
|
type: regex
|
||||||
|
url: https://doi.org/{doi}
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
# papers.cumincad.org
|
# papers.cumincad.org
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -621,9 +672,19 @@ pylon:
|
|||||||
matcher:
|
matcher:
|
||||||
doi: ^10.52842/.*$
|
doi: ^10.52842/.*$
|
||||||
# ^.*$
|
# ^.*$
|
||||||
|
- matcher:
|
||||||
|
doi: ^.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
|
||||||
|
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||||
- matcher:
|
- matcher:
|
||||||
doi: ^.*$
|
doi: ^.*$
|
||||||
resolver:
|
resolver:
|
||||||
args:
|
args:
|
||||||
selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL'
|
selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL'
|
||||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||||
|
webdriver_hub:
|
||||||
|
downloads_directory: /downloads
|
||||||
|
endpoint: http://127.0.0.1:4444/wd/hub
|
||||||
|
host_downloads_directory: /downloads
|
||||||
|
@ -5,9 +5,9 @@ import uvloop
|
|||||||
from aiogrobid import GrobidClient
|
from aiogrobid import GrobidClient
|
||||||
from aioipfs import AsyncIPFS as AsyncIPFS
|
from aioipfs import AsyncIPFS as AsyncIPFS
|
||||||
from idm.api.aioclient import IdmApiGrpcClient
|
from idm.api.aioclient import IdmApiGrpcClient
|
||||||
|
from izihawa_configurator import Configurator
|
||||||
from library.aiogrpctools import AioGrpcServer
|
from library.aiogrpctools import AioGrpcServer
|
||||||
from library.aiopostgres import AioPostgresPoolHolder
|
from library.aiopostgres import AioPostgresPoolHolder
|
||||||
from library.configurator import Configurator
|
|
||||||
from library.logging import configure_logging
|
from library.logging import configure_logging
|
||||||
from library.telegram.base import BaseTelegramClient
|
from library.telegram.base import BaseTelegramClient
|
||||||
from nexus.hub.configs import get_config
|
from nexus.hub.configs import get_config
|
||||||
|
@ -65,6 +65,7 @@ class BaseHubService(BaseService):
|
|||||||
await asyncio.gather(
|
await asyncio.gather(
|
||||||
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True),
|
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True),
|
||||||
self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True),
|
self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True),
|
||||||
|
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake3', only_hash=True),
|
||||||
)
|
)
|
||||||
))
|
))
|
||||||
|
|
||||||
|
@ -73,13 +73,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
|
|||||||
self.downloadings = set()
|
self.downloadings = set()
|
||||||
self.is_sharience_enabled = is_sharience_enabled
|
self.is_sharience_enabled = is_sharience_enabled
|
||||||
self.maintenance_picture_url = maintenance_picture_url
|
self.maintenance_picture_url = maintenance_picture_url
|
||||||
self.pylon_client = PylonClient(
|
self.pylon_client = PylonClient(config=pylon_config)
|
||||||
proxies=pylon_config['proxies'],
|
|
||||||
source_configs=pylon_config['sources'],
|
|
||||||
default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
|
|
||||||
default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
|
|
||||||
downloads_directory=pylon_config['downloads_directory'],
|
|
||||||
)
|
|
||||||
self.should_parse_with_grobid = should_parse_with_grobid
|
self.should_parse_with_grobid = should_parse_with_grobid
|
||||||
self.should_store_hashes = should_store_hashes
|
self.should_store_hashes = should_store_hashes
|
||||||
self.telegram_bot_configs = telegram_bot_configs
|
self.telegram_bot_configs = telegram_bot_configs
|
||||||
@ -170,6 +164,15 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
|
|||||||
return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK)
|
return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK)
|
||||||
|
|
||||||
|
|
||||||
|
async def delayed_task(create_task, t):
    """Wait ``t`` seconds, then await whatever ``create_task()`` returns.

    Cancellation, whether during the wait or during the awaited work, is
    absorbed: the coroutine simply finishes, so the canceller can await it
    without handling ``asyncio.CancelledError``.

    :param create_task: zero-argument callable returning an awaitable; it is
        not called if cancellation arrives during the wait
    :param t: delay in seconds
    """
    try:
        await asyncio.sleep(t)
        await create_task()
    except asyncio.CancelledError:
        return
|
||||||
|
|
||||||
|
|
||||||
class DownloadTask:
|
class DownloadTask:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -204,7 +207,7 @@ class DownloadTask:
|
|||||||
)
|
)
|
||||||
|
|
||||||
async def download_task(self, request_context: RequestContext, document_holder):
|
async def download_task(self, request_context: RequestContext, document_holder):
|
||||||
throttle_secs = 2.0
|
throttle_secs = 3.0
|
||||||
|
|
||||||
async def _on_fail():
|
async def _on_fail():
|
||||||
await self.application.telegram_clients[request_context.bot_name].send_message(
|
await self.application.telegram_clients[request_context.bot_name].send_message(
|
||||||
@ -218,6 +221,7 @@ class DownloadTask:
|
|||||||
error_log=request_context.error_log,
|
error_log=request_context.error_log,
|
||||||
on_fail=_on_fail,
|
on_fail=_on_fail,
|
||||||
):
|
):
|
||||||
|
start_time = time.time()
|
||||||
filename = document_holder.get_filename()
|
filename = document_holder.get_filename()
|
||||||
progress_bar_download = ProgressBar(
|
progress_bar_download = ProgressBar(
|
||||||
telegram_client=self.application.telegram_clients[request_context.bot_name],
|
telegram_client=self.application.telegram_clients[request_context.bot_name],
|
||||||
@ -226,9 +230,9 @@ class DownloadTask:
|
|||||||
header=f'⬇️ {filename}',
|
header=f'⬇️ {filename}',
|
||||||
tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
|
tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
|
||||||
throttle_secs=throttle_secs,
|
throttle_secs=throttle_secs,
|
||||||
|
last_call=start_time,
|
||||||
)
|
)
|
||||||
downloads_gauge.inc()
|
downloads_gauge.inc()
|
||||||
start_time = time.time()
|
|
||||||
try:
|
try:
|
||||||
file = await self.download(
|
file = await self.download(
|
||||||
document_holder=document_holder,
|
document_holder=document_holder,
|
||||||
@ -242,11 +246,21 @@ class DownloadTask:
|
|||||||
)
|
)
|
||||||
if not document_holder.md5 and document_holder.get_extension() == 'pdf':
|
if not document_holder.md5 and document_holder.get_extension() == 'pdf':
|
||||||
try:
|
try:
|
||||||
await progress_bar_download.send_message(
|
processing_message_task = asyncio.create_task(delayed_task(
|
||||||
|
create_task=lambda: progress_bar_download.send_message(
|
||||||
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
|
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
|
||||||
ignore_last_call=True
|
ignore_last_call=True
|
||||||
|
),
|
||||||
|
t=5.0
|
||||||
|
))
|
||||||
|
file = await asyncio.get_running_loop().run_in_executor(
|
||||||
|
None,
|
||||||
|
lambda: clean_metadata(file, doi=document_holder.doi)
|
||||||
)
|
)
|
||||||
file = clean_metadata(file, doi=document_holder.doi)
|
|
||||||
|
processing_message_task.cancel()
|
||||||
|
await processing_message_task
|
||||||
|
|
||||||
request_context.statbox(
|
request_context.statbox(
|
||||||
action='cleaned',
|
action='cleaned',
|
||||||
len=len(file),
|
len=len(file),
|
||||||
@ -260,7 +274,8 @@ class DownloadTask:
|
|||||||
banner=t("LOOKING_AT", request_context.chat.language),
|
banner=t("LOOKING_AT", request_context.chat.language),
|
||||||
header=f'⬇️ {filename}',
|
header=f'⬇️ {filename}',
|
||||||
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
|
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
|
||||||
throttle_secs=throttle_secs
|
throttle_secs=throttle_secs,
|
||||||
|
last_call=progress_bar_download.last_call,
|
||||||
)
|
)
|
||||||
uploaded_message = await self.delivery_service.send_file(
|
uploaded_message = await self.delivery_service.send_file(
|
||||||
document_holder=self.document_holder,
|
document_holder=self.document_holder,
|
||||||
@ -393,11 +408,15 @@ class DownloadTask:
|
|||||||
|
|
||||||
async def download(self, document_holder, progress_bar):
|
async def download(self, document_holder, progress_bar):
|
||||||
collected = bytearray()
|
collected = bytearray()
|
||||||
if document_holder.doi:
|
params = {}
|
||||||
try:
|
try:
|
||||||
params = {'doi': document_holder.doi}
|
if document_holder.doi:
|
||||||
|
params['doi'] = document_holder.doi
|
||||||
if document_holder.md5:
|
if document_holder.md5:
|
||||||
params['md5'] = document_holder.md5
|
params['md5'] = document_holder.md5
|
||||||
|
if document_holder.ipfs_multihashes:
|
||||||
|
params['ipfs_multihashes'] = [ipfs_multihash for ipfs_multihash in document_holder.ipfs_multihashes]
|
||||||
|
if params:
|
||||||
async for resp in self.delivery_service.pylon_client.download(params):
|
async for resp in self.delivery_service.pylon_client.download(params):
|
||||||
await self.process_resp(
|
await self.process_resp(
|
||||||
resp=resp,
|
resp=resp,
|
||||||
@ -408,18 +427,6 @@ class DownloadTask:
|
|||||||
return bytes(collected)
|
return bytes(collected)
|
||||||
except DownloadError:
|
except DownloadError:
|
||||||
pass
|
pass
|
||||||
if document_holder.md5:
|
|
||||||
try:
|
|
||||||
async for resp in self.delivery_service.pylon_client.download({'md5': document_holder.md5}):
|
|
||||||
await self.process_resp(
|
|
||||||
resp=resp,
|
|
||||||
progress_bar=progress_bar,
|
|
||||||
collected=collected,
|
|
||||||
filesize=document_holder.filesize,
|
|
||||||
)
|
|
||||||
return bytes(collected)
|
|
||||||
except DownloadError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
async def external_cancel(self):
|
async def external_cancel(self):
|
||||||
self.request_context.statbox(action='externally_canceled')
|
self.request_context.statbox(action='externally_canceled')
|
||||||
|
@ -27,7 +27,7 @@ py3_image(
|
|||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
requirement("aiolibgen"),
|
requirement("aiolibgen"),
|
||||||
"//library/aiopostgres",
|
"//library/aiopostgres",
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/jobber",
|
"//library/jobber",
|
||||||
"//nexus/actions",
|
"//nexus/actions",
|
||||||
],
|
],
|
||||||
|
@ -34,6 +34,7 @@ class PostgresJob(BaseJob):
|
|||||||
f'user={database["username"]} '
|
f'user={database["username"]} '
|
||||||
f'password={database["password"]} '
|
f'password={database["password"]} '
|
||||||
f'host={database["host"]}',
|
f'host={database["host"]}',
|
||||||
|
timeout=3600 * 2,
|
||||||
)
|
)
|
||||||
self.summa_client = SummaClient(endpoint=summa['endpoint'])
|
self.summa_client = SummaClient(endpoint=summa['endpoint'])
|
||||||
self.summa_config = summa
|
self.summa_config = summa
|
||||||
@ -84,6 +85,7 @@ class PostgresJob(BaseJob):
|
|||||||
# Mandatory for server side cursor
|
# Mandatory for server side cursor
|
||||||
cursor_name='nexus_ingest_cursor',
|
cursor_name='nexus_ingest_cursor',
|
||||||
itersize=50_000,
|
itersize=50_000,
|
||||||
|
statement_timeout=3600 * 2,
|
||||||
):
|
):
|
||||||
loaded = True
|
loaded = True
|
||||||
yield row
|
yield row
|
||||||
@ -95,8 +97,12 @@ class PostgresJob(BaseJob):
|
|||||||
# Mandatory for server side cursor
|
# Mandatory for server side cursor
|
||||||
cursor_name='nexus_ingest_cursor',
|
cursor_name='nexus_ingest_cursor',
|
||||||
itersize=50_000,
|
itersize=50_000,
|
||||||
|
statement_timeout=3600 * 2,
|
||||||
):
|
):
|
||||||
yield row
|
yield row
|
||||||
|
|
||||||
await self.summa_client.commit_index(self.summa_config['name'], session_id=session_id)
|
await self.summa_client.commit_index(
|
||||||
|
self.summa_config['name'],
|
||||||
|
session_id=session_id,
|
||||||
|
)
|
||||||
await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id)
|
await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id)
|
||||||
|
@ -25,7 +25,7 @@ DEPS = [
|
|||||||
"//library/aiogrpctools",
|
"//library/aiogrpctools",
|
||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
"//library/aiopostgres",
|
"//library/aiopostgres",
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/logging",
|
"//library/logging",
|
||||||
"//nexus/meta_api/proto:grpc_py",
|
"//nexus/meta_api/proto:grpc_py",
|
||||||
"//nexus/models/proto:proto_py",
|
"//nexus/models/proto:proto_py",
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
|
from izihawa_configurator import Configurator
|
||||||
from izihawa_utils import env
|
from izihawa_utils import env
|
||||||
from library.configurator import Configurator
|
|
||||||
|
|
||||||
|
|
||||||
def get_config():
|
def get_config():
|
||||||
|
@ -315,7 +315,7 @@ class SearchService(SearchServicer, BaseService):
|
|||||||
with suppress(RetryError):
|
with suppress(RetryError):
|
||||||
async for attempt in AsyncRetrying(
|
async for attempt in AsyncRetrying(
|
||||||
retry=retry_if_exception_type(NeedRetryError),
|
retry=retry_if_exception_type(NeedRetryError),
|
||||||
wait=wait_fixed(5),
|
wait=wait_fixed(10),
|
||||||
stop=stop_after_attempt(6)
|
stop=stop_after_attempt(6)
|
||||||
):
|
):
|
||||||
with attempt:
|
with attempt:
|
||||||
|
@ -33,4 +33,5 @@ message Scimag {
|
|||||||
string volume = 21;
|
string volume = 21;
|
||||||
int32 year = 30;
|
int32 year = 30;
|
||||||
float page_rank = 34;
|
float page_rank = 34;
|
||||||
|
float series_page_rank = 35;
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,7 @@ py3_image(
|
|||||||
requirement("aiocrossref"),
|
requirement("aiocrossref"),
|
||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
"//library/aiopostgres",
|
"//library/aiopostgres",
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/logging",
|
"//library/logging",
|
||||||
"//nexus/actions",
|
"//nexus/actions",
|
||||||
"//nexus/models/proto:proto_py",
|
"//nexus/models/proto:proto_py",
|
||||||
|
@ -10,6 +10,6 @@ py_library(
|
|||||||
srcs_version = "PY3",
|
srcs_version = "PY3",
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from library.configurator import Configurator
|
from izihawa_configurator import Configurator
|
||||||
|
|
||||||
|
|
||||||
def get_promotions():
|
def get_promotions():
|
||||||
|
@ -19,6 +19,9 @@ promotions:
|
|||||||
- texts:
|
- texts:
|
||||||
en: 💬 Research is the only and ultimate goal
|
en: 💬 Research is the only and ultimate goal
|
||||||
weight: 1
|
weight: 1
|
||||||
|
- texts:
|
||||||
|
en: 💬 Intellectual property is not a valid form of property
|
||||||
|
weight: 1
|
||||||
- texts:
|
- texts:
|
||||||
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group})
|
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group})
|
||||||
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group})
|
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group})
|
||||||
|
@ -1,12 +1,16 @@
|
|||||||
load("@rules_python//python:defs.bzl", "py_binary", "py_library")
|
load("@rules_python//python:defs.bzl", "py_library")
|
||||||
|
load("@rules_python//python:packaging.bzl", "py_wheel")
|
||||||
load("@pip_modules//:requirements.bzl", "requirement")
|
load("@pip_modules//:requirements.bzl", "requirement")
|
||||||
|
|
||||||
|
filegroup(
|
||||||
|
name = "data",
|
||||||
|
srcs = ["configs/pylon.yaml"],
|
||||||
|
)
|
||||||
|
|
||||||
py_library(
|
py_library(
|
||||||
name = "pylon",
|
name = "pylon",
|
||||||
srcs = glob(["**/*.py"]),
|
srcs = glob(["**/*.py"]),
|
||||||
data = [
|
data = [":data"],
|
||||||
"configs/pylon.yaml",
|
|
||||||
],
|
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
requirement("aiodns"),
|
requirement("aiodns"),
|
||||||
@ -16,6 +20,7 @@ py_library(
|
|||||||
requirement("brotli"),
|
requirement("brotli"),
|
||||||
requirement("cchardet"),
|
requirement("cchardet"),
|
||||||
requirement("certifi"),
|
requirement("certifi"),
|
||||||
|
requirement("fire"),
|
||||||
requirement("jq"),
|
requirement("jq"),
|
||||||
requirement("orjson"),
|
requirement("orjson"),
|
||||||
requirement("pypdf2"),
|
requirement("pypdf2"),
|
||||||
@ -23,20 +28,38 @@ py_library(
|
|||||||
requirement("selenium"),
|
requirement("selenium"),
|
||||||
requirement("tenacity"),
|
requirement("tenacity"),
|
||||||
requirement("aiokit"),
|
requirement("aiokit"),
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
"//library/logging",
|
"//library/logging",
|
||||||
"//nexus/pylon/proto:pylon_proto_py",
|
"//nexus/pylon/proto:pylon_proto_py",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
py_binary(
|
py_wheel(
|
||||||
name = "cli",
|
name = "nexus-pylon-wheel",
|
||||||
srcs = ["cli.py"],
|
author = "The Superpirate",
|
||||||
main = "cli.py",
|
author_email = "fist.of.the.first.pirates@gmail.com",
|
||||||
srcs_version = "PY3",
|
classifiers = [
|
||||||
visibility = ["//visibility:public"],
|
"Programming Language :: Python :: 3.10",
|
||||||
|
],
|
||||||
|
description_file = ":README.md",
|
||||||
|
distribution = "nexus-pylon-wheel",
|
||||||
|
entry_points = {"console_scripts": ["pylon = nexus.pylon.cli:main"]},
|
||||||
|
homepage = "https://github.com/nexus-stc/hyperboria/tree/master/nexus/pylon",
|
||||||
|
license = "MIT License",
|
||||||
|
python_requires = ">=3.10",
|
||||||
|
python_tag = "py3",
|
||||||
|
requires = [
|
||||||
|
"aiokit >= 1.0.0",
|
||||||
|
"izihawa_configurator >= 1.0.0",
|
||||||
|
"selenium >= 4.3.0",
|
||||||
|
],
|
||||||
|
strip_path_prefixes = [
|
||||||
|
"nexus/pylon/proto/pylon_proto_py_pb",
|
||||||
|
],
|
||||||
|
version = "1.0.0",
|
||||||
deps = [
|
deps = [
|
||||||
requirement("fire"),
|
":data",
|
||||||
":pylon",
|
":pylon",
|
||||||
|
"//nexus/pylon/proto:pylon_proto_py",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -6,16 +6,51 @@
|
|||||||
- Streams data by chunks
|
- Streams data by chunks
|
||||||
- GRPC-ready
|
- GRPC-ready
|
||||||
|
|
||||||
|
## Build
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bazel build -c opt nexus-pylon-wheel
|
||||||
|
```
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
### PIP
|
||||||
|
```bash
|
||||||
|
pip install nexus-pylon
|
||||||
|
```
|
||||||
|
|
||||||
## Nexus Pylon CLI
|
## Nexus Pylon CLI
|
||||||
|
|
||||||
Casual download
|
Download scientific publication:
|
||||||
```bash
|
```bash
|
||||||
bazel run -c opt cli -- doi 10.1056/NEJMoa2033700 --output article.pdf
|
pylon download --doi 10.1182/blood-2011-03-325258 --output article.pdf
|
||||||
```
|
```
|
||||||
|
|
||||||
Download with proxies
|
Download file by its MD5:
|
||||||
```bash
|
```bash
|
||||||
bazel run -c opt cli -- md5 278C3A72B7B04717361501B8642857DF \
|
pylon download --md5 f07707ee92fa675fd4ee53e3fee977d1 --output article.pdf
|
||||||
--output file.pdf \
|
```
|
||||||
--proxies socks5://127.0.0.1:9050
|
|
||||||
|
Download file by its multihash:
|
||||||
|
```bash
|
||||||
|
pylon download --ipfs-multihashes '["bafykbzacea3vduqii3u52xkzdqan5oc54vsvedmed25dfybrqxyafahjl3rzu"]' --output article.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using with Selenium
|
||||||
|
|
||||||
|
Create a directory for exchanging files between the host and Selenium launched in Docker
|
||||||
|
```bash
|
||||||
|
mkdir downloads
|
||||||
|
```
|
||||||
|
|
||||||
|
Launch Selenium in Docker
|
||||||
|
```bash
|
||||||
|
docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
Launch Pylon
|
||||||
|
```bash
|
||||||
|
pylon download --doi 10.1101/2022.09.09.507349 --output article.pdf \
|
||||||
|
--wd-endpoint 'http://127.0.0.1:4444/wd/hub' \
|
||||||
|
--wd-directory /downloads --wd-host-directory $(pwd)/downloads --debug
|
||||||
```
|
```
|
@ -1,15 +1,17 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import fire
|
import fire
|
||||||
from aiokit.utils import sync_fu
|
from aiokit.utils import sync_fu
|
||||||
from nexus.pylon.client import (
|
from izihawa_configurator import Configurator
|
||||||
|
|
||||||
|
from .client import (
|
||||||
DownloadError,
|
DownloadError,
|
||||||
PylonClient,
|
PylonClient,
|
||||||
)
|
)
|
||||||
from nexus.pylon.configs import get_config
|
from .proto.file_pb2 import FileResponse as FileResponsePb
|
||||||
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
|
|
||||||
|
|
||||||
|
|
||||||
def resolve_path(filepath):
|
def resolve_path(filepath):
|
||||||
@ -27,22 +29,20 @@ async def fetch(
|
|||||||
collected = bytes()
|
collected = bytes()
|
||||||
try:
|
try:
|
||||||
last_len = 0
|
last_len = 0
|
||||||
last_source = ''
|
|
||||||
async for resp in iter:
|
async for resp in iter:
|
||||||
if resp.HasField('status'):
|
if resp.HasField('status'):
|
||||||
if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION:
|
if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION:
|
||||||
print(f'Started transmission from {resp.source}...', end='\r', file=sys.stderr)
|
print(f'Started transmission...', file=sys.stderr)
|
||||||
last_len = 0
|
last_len = 0
|
||||||
last_source = resp.source
|
|
||||||
collected = bytes()
|
collected = bytes()
|
||||||
elif resp.HasField('chunk'):
|
elif resp.HasField('chunk'):
|
||||||
if len(collected) - last_len > 1024 * 100:
|
if len(collected) - last_len > 1024 * 100:
|
||||||
print(f'Loaded {len(collected)} bytes from {resp.source}', end='\r', file=sys.stderr)
|
print(f'Loaded {len(collected)} bytes', end='\r', file=sys.stderr)
|
||||||
last_len = len(collected)
|
last_len = len(collected)
|
||||||
last_source = resp.source
|
|
||||||
collected += resp.chunk.content
|
collected += resp.chunk.content
|
||||||
with open(resolve_path(output), 'wb') as f:
|
with open(resolve_path(output), 'wb') as f:
|
||||||
print(f'Completed! Loaded {len(collected)} bytes from {last_source}', file=sys.stderr)
|
print()
|
||||||
|
print(f'Completed! Loaded {len(collected)} bytes', file=sys.stderr)
|
||||||
f.write(collected)
|
f.write(collected)
|
||||||
except DownloadError:
|
except DownloadError:
|
||||||
print('File not found')
|
print('File not found')
|
||||||
@ -50,25 +50,53 @@ async def fetch(
|
|||||||
|
|
||||||
async def download(
|
async def download(
|
||||||
output: str,
|
output: str,
|
||||||
|
config: Optional[str] = None,
|
||||||
debug: bool = False,
|
debug: bool = False,
|
||||||
|
wd_endpoint: Optional[str] = None,
|
||||||
|
wd_directory: Optional[str] = None,
|
||||||
|
wd_host_directory: Optional[str] = None,
|
||||||
**params,
|
**params,
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Download scientific publications from various sources
|
||||||
|
A large portion of fresh articles can be retrieved only through publisher libraries via `BrowserDriver`, which
|
||||||
|
requires Selenium webdriver:
|
||||||
|
`docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest`
|
||||||
|
Args:
|
||||||
|
output: name of the output file
|
||||||
|
config: pylon config
|
||||||
|
debug: enable debug logging
|
||||||
|
wd_endpoint: web-driver
|
||||||
|
wd_directory: mounted directory inside Docker image
|
||||||
|
wd_host_directory: directory for downloads on host that should be mounted as `wd_directory` inside Docker image
|
||||||
|
"""
|
||||||
if debug:
|
if debug:
|
||||||
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
|
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
|
||||||
c = get_config()['pylon']
|
|
||||||
p = PylonClient(
|
default_config_path = os.path.join(os.path.dirname(__file__), 'configs/pylon.yaml')
|
||||||
proxies=c['proxies'],
|
config = Configurator([config if config else default_config_path], env_prefix='NEXUS_PYLON')
|
||||||
source_configs=c['sources'],
|
config = config['pylon']
|
||||||
default_driver_proxy_list=c['default_driver_proxy_list'],
|
if wd_endpoint:
|
||||||
downloads_directory=c['downloads_directory'],
|
config.setdefault('webdriver_hub', {})
|
||||||
)
|
config['webdriver_hub']['endpoint'] = wd_endpoint
|
||||||
return await fetch(iter=p.download(params=params), output=output)
|
if not wd_directory:
|
||||||
|
raise ValueError('Should pass --wd-directory with --wd-endpoint')
|
||||||
|
config['webdriver_hub']['downloads_directory'] = wd_directory
|
||||||
|
if not wd_host_directory:
|
||||||
|
raise ValueError('Should pass --wd-host-directory with --wd-endpoint')
|
||||||
|
config['webdriver_hub']['host_downloads_directory'] = wd_host_directory
|
||||||
|
|
||||||
|
pylon_client = PylonClient(config=config)
|
||||||
|
return await fetch(iter=pylon_client.download(params=params), output=output)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
try:
|
||||||
fire.Fire({
|
fire.Fire({
|
||||||
'download': sync_fu(download),
|
'download': sync_fu(download),
|
||||||
})
|
})
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -1,12 +1,10 @@
|
|||||||
|
import logging
|
||||||
from typing import (
|
from typing import (
|
||||||
AsyncIterable,
|
AsyncIterable,
|
||||||
Dict,
|
Dict,
|
||||||
List,
|
|
||||||
Optional,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from aiokit import AioThing
|
from aiokit import AioThing
|
||||||
from library.logging import error_log
|
|
||||||
from nexus.pylon.exceptions import (
|
from nexus.pylon.exceptions import (
|
||||||
DownloadError,
|
DownloadError,
|
||||||
NotFoundError,
|
NotFoundError,
|
||||||
@ -17,28 +15,23 @@ from nexus.pylon.source import Source
|
|||||||
|
|
||||||
|
|
||||||
class PylonClient(AioThing):
|
class PylonClient(AioThing):
|
||||||
def __init__(
|
def __init__(self, config):
|
||||||
self,
|
|
||||||
source_configs: Optional[List],
|
|
||||||
proxies: Optional[List[str]] = None,
|
|
||||||
downloads_directory: Optional[str] = None,
|
|
||||||
default_driver_proxy_list: [Optional[List]] = None,
|
|
||||||
default_resolver_proxy_list: [Optional[List]] = None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.proxy_manager = ProxyManager(proxies)
|
self.config = config
|
||||||
self.downloads_directory = downloads_directory
|
self.proxy_manager = ProxyManager(config.get('proxies'))
|
||||||
self.default_driver_proxy_list = default_driver_proxy_list
|
|
||||||
self.default_resolver_proxy_list = default_resolver_proxy_list
|
|
||||||
self.sources = []
|
self.sources = []
|
||||||
for source_config in source_configs:
|
if config.get('webdriver_hub') is None:
|
||||||
|
logging.getLogger('nexus_pylon').warning({
|
||||||
|
'action': 'missed_webdriver',
|
||||||
|
'mode': 'pylon',
|
||||||
|
})
|
||||||
|
for source_config in config['sources']:
|
||||||
source = Source.from_config(
|
source = Source.from_config(
|
||||||
proxy_manager=self.proxy_manager,
|
proxy_manager=self.proxy_manager,
|
||||||
|
config=self.config,
|
||||||
source_config=source_config,
|
source_config=source_config,
|
||||||
downloads_directory=downloads_directory,
|
|
||||||
default_driver_proxy_list=default_driver_proxy_list,
|
|
||||||
default_resolver_proxy_list=default_resolver_proxy_list,
|
|
||||||
)
|
)
|
||||||
|
if source:
|
||||||
self.sources.append(source)
|
self.sources.append(source)
|
||||||
self.starts.append(source)
|
self.starts.append(source)
|
||||||
|
|
||||||
@ -50,9 +43,10 @@ class PylonClient(AioThing):
|
|||||||
async for resp in source.download(params):
|
async for resp in source.download(params):
|
||||||
yield resp
|
yield resp
|
||||||
return
|
return
|
||||||
except NotFoundError:
|
except NotFoundError as e:
|
||||||
|
logging.getLogger('nexus_pylon').debug(e)
|
||||||
continue
|
continue
|
||||||
except DownloadError as e:
|
except DownloadError as e:
|
||||||
error_log(e)
|
logging.getLogger('nexus_pylon').warning(e)
|
||||||
continue
|
continue
|
||||||
raise NotFoundError()
|
raise NotFoundError(params=params)
|
||||||
|
@ -1,11 +0,0 @@
|
|||||||
from izihawa_utils import env
|
|
||||||
from library.configurator import Configurator
|
|
||||||
|
|
||||||
|
|
||||||
def get_config():
|
|
||||||
return Configurator([
|
|
||||||
'nexus/pylon/configs/pylon.yaml',
|
|
||||||
], env_prefix='NEXUS_PYLON')
|
|
||||||
|
|
||||||
|
|
||||||
config = get_config()
|
|
@ -1,65 +1,25 @@
|
|||||||
---
|
---
|
||||||
pylon:
|
pylon:
|
||||||
default_driver_proxy_list:
|
default_driver_proxy_list: ~
|
||||||
- [proxy1]
|
default_resolver_proxy_list: ~
|
||||||
- [proxy2]
|
proxies: ~
|
||||||
- [proxy3]
|
|
||||||
downloads_directory: /downloads
|
|
||||||
proxies:
|
|
||||||
- address: proxy1.net:7890
|
|
||||||
name: proxy1
|
|
||||||
tags: [proxy1]
|
|
||||||
- address: proxy2.net:7990
|
|
||||||
name: proxy2
|
|
||||||
tags: [proxy2]
|
|
||||||
- address: proxy3.net:8090
|
|
||||||
name: proxy3
|
|
||||||
tags: [proxy3]
|
|
||||||
sources:
|
sources:
|
||||||
# LibGen.rocks
|
# IPFS
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
proxy_list: ~
|
|
||||||
validator:
|
validator:
|
||||||
class: nexus.pylon.validators.Md5Validator
|
class: nexus.pylon.validators.BaseValidator
|
||||||
class:
|
class:
|
||||||
nexus.pylon.drivers.DirectDriver
|
nexus.pylon.drivers.DirectDriver
|
||||||
matcher:
|
matcher:
|
||||||
md5: ^.*$
|
ipfs_multihashes: ^.*$
|
||||||
resolver:
|
resolver:
|
||||||
args:
|
args:
|
||||||
extractors:
|
format_string: 'https://ipfs.io/ipfs/{ipfs_multihashes[0]}'
|
||||||
- producer:
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
format_string: 'http://libgen.rocks/{matched_group}'
|
|
||||||
group: 0
|
|
||||||
re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
|
|
||||||
timeout: 25.0
|
|
||||||
type: regex
|
|
||||||
url: https://libgen.rocks/ads.php?md5={md5}
|
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
|
||||||
# LibGen.rocks
|
|
||||||
- driver:
|
|
||||||
args:
|
|
||||||
proxy_list: ~
|
|
||||||
class:
|
|
||||||
nexus.pylon.drivers.DirectDriver
|
|
||||||
matcher:
|
|
||||||
doi: ^.*$
|
|
||||||
resolver:
|
|
||||||
args:
|
|
||||||
extractors:
|
|
||||||
- producer:
|
|
||||||
format_string: 'http://libgen.rocks/{matched_group}'
|
|
||||||
group: 0
|
|
||||||
re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
|
|
||||||
timeout: 25.0
|
|
||||||
type: regex
|
|
||||||
url: 'https://libgen.rocks/ads.php?doi={doi}'
|
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
|
||||||
# Library.lol
|
# Library.lol
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
proxy_list: ~
|
|
||||||
validator:
|
validator:
|
||||||
class: nexus.pylon.validators.Md5Validator
|
class: nexus.pylon.validators.Md5Validator
|
||||||
class:
|
class:
|
||||||
@ -70,27 +30,22 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
extractors:
|
extractors:
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{href}'
|
||||||
group: 1
|
|
||||||
re: '<a href="([^\"]+)">GET</a>'
|
|
||||||
timeout: 45.0
|
timeout: 45.0
|
||||||
|
re: '<a href="(?P<href>[^\"]+)">GET</a>'
|
||||||
type: regex
|
type: regex
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{url}'
|
||||||
group: 0
|
re: '(?P<url>https://ipfs.io/ipfs/[A-Za-z\d]+)'
|
||||||
re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
|
|
||||||
type: regex
|
type: regex
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{url}'
|
||||||
group: 0
|
re: '(?P<url>https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
|
||||||
re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
|
|
||||||
type: regex
|
type: regex
|
||||||
url: http://library.lol/main/{md5}
|
url: http://library.lol/main/{md5}
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
# library.lol
|
# library.lol
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
|
||||||
proxy_list: ~
|
|
||||||
class:
|
class:
|
||||||
nexus.pylon.drivers.DirectDriver
|
nexus.pylon.drivers.DirectDriver
|
||||||
matcher:
|
matcher:
|
||||||
@ -99,13 +54,48 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
extractors:
|
extractors:
|
||||||
- producer:
|
- producer:
|
||||||
format_string: '{matched_group}'
|
format_string: '{href}'
|
||||||
group: 1
|
|
||||||
re: '<a href="([^\"]+)">GET</a>'
|
|
||||||
timeout: 45.0
|
timeout: 45.0
|
||||||
|
re: '<a href="(?P<href>[^\"]+)">GET</a>'
|
||||||
type: regex
|
type: regex
|
||||||
url: 'http://library.lol/scimag/{doi}'
|
url: 'http://library.lol/scimag/{doi}'
|
||||||
class: nexus.pylon.resolvers.RequestResolver
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
|
# LibGen.rocks
|
||||||
|
- driver:
|
||||||
|
args:
|
||||||
|
validator:
|
||||||
|
class: nexus.pylon.validators.Md5Validator
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
md5: ^.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'http://libgen.rocks/{key}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: '(?P<key>get\.php\?md5=.*&key=[A-Za-z\d]+)'
|
||||||
|
type: regex
|
||||||
|
resolve_timeout: 25.0
|
||||||
|
url: https://libgen.rocks/ads.php?md5={md5}
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
|
# LibGen.rocks
|
||||||
|
- driver:
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
doi: ^.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'http://libgen.rocks/{key}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: '(?P<key>get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
|
||||||
|
type: regex
|
||||||
|
url: 'https://libgen.rocks/ads.php?doi={doi}'
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
# jamanetwork.com
|
# jamanetwork.com
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -206,6 +196,13 @@ pylon:
|
|||||||
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
|
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# iopscience.iop.org
|
# iopscience.iop.org
|
||||||
|
- matcher:
|
||||||
|
doi: ^10.1088/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
|
||||||
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# iopscience.iop.org
|
||||||
- matcher:
|
- matcher:
|
||||||
doi: ^10.1088/.*$
|
doi: ^10.1088/.*$
|
||||||
resolver:
|
resolver:
|
||||||
@ -220,10 +217,6 @@ pylon:
|
|||||||
timeout: 30
|
timeout: 30
|
||||||
type: wait_css_selector
|
type: wait_css_selector
|
||||||
- type: click
|
- type: click
|
||||||
proxy_list:
|
|
||||||
- [proxy2]
|
|
||||||
- [proxy1]
|
|
||||||
- [proxy3]
|
|
||||||
class: nexus.pylon.drivers.BrowserDriver
|
class: nexus.pylon.drivers.BrowserDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.1093/.*$
|
doi: ^10.1093/.*$
|
||||||
@ -246,6 +239,13 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
timeout: 30.0
|
timeout: 30.0
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# biorxiv.org
|
||||||
|
- matcher:
|
||||||
|
doi: ^10.1101/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
|
||||||
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# journals.aps.org
|
# journals.aps.org
|
||||||
- matcher:
|
- matcher:
|
||||||
doi: ^10.1103/.*$
|
doi: ^10.1103/.*$
|
||||||
@ -332,6 +332,13 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
|
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# www.ahajournals.org
|
||||||
|
- matcher:
|
||||||
|
doi: ^10.1161/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
|
||||||
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# ajp.psychiatryonline.org
|
# ajp.psychiatryonline.org
|
||||||
- matcher:
|
- matcher:
|
||||||
doi: ^10.1176/.*$
|
doi: ^10.1176/.*$
|
||||||
@ -355,8 +362,6 @@ pylon:
|
|||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# journals.plos.org
|
# journals.plos.org
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
|
||||||
proxy_list: ~
|
|
||||||
class: nexus.pylon.drivers.direct.DirectDriver
|
class: nexus.pylon.drivers.direct.DirectDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.1371/.*$
|
doi: ^10.1371/.*$
|
||||||
@ -364,6 +369,13 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
|
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# guilfordjournals.com
|
||||||
|
- matcher:
|
||||||
|
doi: ^10.1521/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
|
||||||
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
# bioone.org
|
# bioone.org
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -396,8 +408,6 @@ pylon:
|
|||||||
doi: ^10.2139/.*$
|
doi: ^10.2139/.*$
|
||||||
# www.afghandata.org
|
# www.afghandata.org
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
|
||||||
proxy_list: ~
|
|
||||||
class:
|
class:
|
||||||
nexus.pylon.drivers.DirectDriver
|
nexus.pylon.drivers.DirectDriver
|
||||||
matcher:
|
matcher:
|
||||||
@ -503,8 +513,6 @@ pylon:
|
|||||||
doi: ^10.5334/.*$
|
doi: ^10.5334/.*$
|
||||||
# hess.copernicus.org
|
# hess.copernicus.org
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
|
||||||
proxy_list: ~
|
|
||||||
class: nexus.pylon.drivers.DirectDriver
|
class: nexus.pylon.drivers.DirectDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.5194/.*$
|
doi: ^10.5194/.*$
|
||||||
@ -524,7 +532,6 @@ pylon:
|
|||||||
- selector: '.uxf-download'
|
- selector: '.uxf-download'
|
||||||
type: wait_css_selector
|
type: wait_css_selector
|
||||||
- type: click
|
- type: click
|
||||||
proxy_list: ~
|
|
||||||
class: nexus.pylon.drivers.BrowserDriver
|
class: nexus.pylon.drivers.BrowserDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.5585/.*
|
doi: ^10.5585/.*
|
||||||
@ -539,6 +546,22 @@ pylon:
|
|||||||
args:
|
args:
|
||||||
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
|
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
|
||||||
class: nexus.pylon.resolvers.TemplateResolver
|
class: nexus.pylon.resolvers.TemplateResolver
|
||||||
|
# www.medwave.cl
|
||||||
|
- driver:
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
doi: ^10.5867/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'https://www.medwave.cl/{path}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: 'href=\"/(?P<path>[\w/.\-_]+\.pdf)\">PDF</a>'
|
||||||
|
type: regex
|
||||||
|
url: https://doi.org/{doi}
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
# journal.permsc.ru
|
# journal.permsc.ru
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -584,8 +607,6 @@ pylon:
|
|||||||
doi: ^10.21203/.*$
|
doi: ^10.21203/.*$
|
||||||
# www.ukm.my/
|
# www.ukm.my/
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
|
||||||
proxy_list: ~
|
|
||||||
class: nexus.pylon.drivers.DirectDriver
|
class: nexus.pylon.drivers.DirectDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.24035/.*$
|
doi: ^10.24035/.*$
|
||||||
@ -599,6 +620,22 @@ pylon:
|
|||||||
class: nexus.pylon.drivers.BrowserDriver
|
class: nexus.pylon.drivers.BrowserDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.32920/.*$
|
doi: ^10.32920/.*$
|
||||||
|
# PKP Project
|
||||||
|
- driver:
|
||||||
|
class:
|
||||||
|
nexus.pylon.drivers.DirectDriver
|
||||||
|
matcher:
|
||||||
|
doi: ^10.(5399|24905|31004|32729|37934)/.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
extractors:
|
||||||
|
- producer:
|
||||||
|
format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
|
||||||
|
timeout: 25.0
|
||||||
|
re: 'href=\"(?:https?://[\w.]+)/(?P<prefix>[\w./]+)/(?P<journal>[\w.]+)/article/view/(?P<key>\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*</a>'
|
||||||
|
type: regex
|
||||||
|
url: https://doi.org/{doi}
|
||||||
|
class: nexus.pylon.resolvers.RequestResolver
|
||||||
# papers.cumincad.org
|
# papers.cumincad.org
|
||||||
- driver:
|
- driver:
|
||||||
args:
|
args:
|
||||||
@ -606,11 +643,16 @@ pylon:
|
|||||||
- selector: 'file.pdf'
|
- selector: 'file.pdf'
|
||||||
type: wait_link_text
|
type: wait_link_text
|
||||||
- type: click
|
- type: click
|
||||||
proxy_list: ~
|
|
||||||
class: nexus.pylon.drivers.BrowserDriver
|
class: nexus.pylon.drivers.BrowserDriver
|
||||||
matcher:
|
matcher:
|
||||||
doi: ^10.52842/.*$
|
doi: ^10.52842/.*$
|
||||||
# ^.*$
|
# ^.*$
|
||||||
|
- matcher:
|
||||||
|
doi: ^.*$
|
||||||
|
resolver:
|
||||||
|
args:
|
||||||
|
selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
|
||||||
|
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||||
- matcher:
|
- matcher:
|
||||||
doi: ^.*$
|
doi: ^.*$
|
||||||
resolver:
|
resolver:
|
||||||
|
@ -4,22 +4,22 @@ from typing import (
|
|||||||
Optional,
|
Optional,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from izihawa_utils.importlib import import_object
|
||||||
from nexus.pylon.network_agent import NetworkAgent
|
from nexus.pylon.network_agent import NetworkAgent
|
||||||
from nexus.pylon.prepared_request import PreparedRequest
|
from nexus.pylon.prepared_request import PreparedRequest
|
||||||
from nexus.pylon.proxy_manager import ProxyManager
|
from nexus.pylon.proxy_manager import ProxyManager
|
||||||
from nexus.pylon.validators.base import BaseValidator
|
|
||||||
from utils.izihawa_utils.importlib import import_object
|
|
||||||
|
|
||||||
|
|
||||||
class BaseDriver(NetworkAgent):
|
class BaseDriver(NetworkAgent):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
config,
|
||||||
validator=None,
|
validator=None,
|
||||||
downloads_directory: str = '/downloads',
|
|
||||||
proxy_list: Optional[List] = None,
|
proxy_list: Optional[List] = None,
|
||||||
proxy_manager: Optional[ProxyManager] = None,
|
proxy_manager: Optional[ProxyManager] = None,
|
||||||
):
|
):
|
||||||
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||||
|
self.config = config
|
||||||
|
|
||||||
validator_cls = 'nexus.pylon.validators.PdfValidator'
|
validator_cls = 'nexus.pylon.validators.PdfValidator'
|
||||||
if validator and 'class' in validator:
|
if validator and 'class' in validator:
|
||||||
@ -27,7 +27,6 @@ class BaseDriver(NetworkAgent):
|
|||||||
validator_cls = import_object(validator_cls)
|
validator_cls = import_object(validator_cls)
|
||||||
|
|
||||||
self.validator = validator_cls
|
self.validator = validator_cls
|
||||||
self.downloads_directory = downloads_directory
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__class__.__name__
|
return self.__class__.__name__
|
||||||
|
@ -32,21 +32,20 @@ from selenium.webdriver.support.ui import WebDriverWait
|
|||||||
class BrowserDriver(BaseDriver):
|
class BrowserDriver(BaseDriver):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
config,
|
||||||
validator=None,
|
validator=None,
|
||||||
proxy_list: Optional[List] = None,
|
proxy_list: Optional[List] = None,
|
||||||
proxy_manager: Optional[ProxyManager] = None,
|
proxy_manager: Optional[ProxyManager] = None,
|
||||||
actions: Optional[List] = None,
|
actions: Optional[List] = None,
|
||||||
downloads_directory='/downloads',
|
|
||||||
window_size: Tuple[int, int] = (1279, 833),
|
|
||||||
erase_webdrive_property: bool = True,
|
|
||||||
webdrive_hub_endpoint: str = "http://127.0.0.1:4444/wd/hub",
|
|
||||||
):
|
):
|
||||||
super().__init__(validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
|
super().__init__(config=config, validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||||
self.actions = actions
|
self.actions = actions
|
||||||
self.downloads_directory = Path(downloads_directory)
|
self.downloads_directory = Path(config['webdriver_hub']['downloads_directory'])
|
||||||
self.window_size = window_size
|
self.host_downloads_directory = Path(config['webdriver_hub']['host_downloads_directory'])
|
||||||
self.erase_webdrive_property = erase_webdrive_property
|
self.window_size = tuple(config['webdriver_hub'].get('window_size', [1279, 833]))
|
||||||
self.webdrive_hub_endpoint = webdrive_hub_endpoint
|
self.erase_webdriver_property = config['webdriver_hub'].get('erase_webdriver_property', True)
|
||||||
|
self.webdriver_hub_endpoint = config['webdriver_hub']['endpoint']
|
||||||
|
self.file_poll_timeout = 2.0
|
||||||
|
|
||||||
async def get_chrome_sessions(self):
|
async def get_chrome_sessions(self):
|
||||||
proxies = list(
|
proxies = list(
|
||||||
@ -55,15 +54,14 @@ class BrowserDriver(BaseDriver):
|
|||||||
else [None]
|
else [None]
|
||||||
)
|
)
|
||||||
for proxy in proxies:
|
for proxy in proxies:
|
||||||
downloads_folder = self.downloads_directory / random_string(16)
|
subdirectory = random_string(16)
|
||||||
os.mkdir(downloads_folder)
|
downloads_directory = self.downloads_directory / subdirectory
|
||||||
os.chmod(downloads_folder, 0o777)
|
host_downloads_directory = self.host_downloads_directory / subdirectory
|
||||||
chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_folder))
|
os.mkdir(host_downloads_directory)
|
||||||
try:
|
os.chmod(host_downloads_directory, 0o777)
|
||||||
yield chrome, downloads_folder
|
chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_directory))
|
||||||
finally:
|
yield chrome, host_downloads_directory
|
||||||
shutil.rmtree(downloads_folder)
|
|
||||||
chrome.quit()
|
|
||||||
|
|
||||||
def setup_chrome(self, proxy, downloads_folder):
|
def setup_chrome(self, proxy, downloads_folder):
|
||||||
options = webdriver.ChromeOptions()
|
options = webdriver.ChromeOptions()
|
||||||
@ -85,13 +83,13 @@ class BrowserDriver(BaseDriver):
|
|||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
options.add_argument("--disable-popup-blocking")
|
options.add_argument("--disable-popup-blocking")
|
||||||
chrome = webdriver.Remote(
|
chrome = webdriver.Remote(
|
||||||
self.webdrive_hub_endpoint,
|
self.webdriver_hub_endpoint,
|
||||||
DesiredCapabilities.CHROME,
|
DesiredCapabilities.CHROME,
|
||||||
options=options,
|
options=options,
|
||||||
)
|
)
|
||||||
chrome.set_window_size(self.window_size[0], self.window_size[1])
|
chrome.set_window_size(self.window_size[0], self.window_size[1])
|
||||||
|
|
||||||
if self.erase_webdrive_property:
|
if self.erase_webdriver_property:
|
||||||
resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id
|
resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id
|
||||||
url = chrome.command_executor._url + resource
|
url = chrome.command_executor._url + resource
|
||||||
body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': {
|
body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': {
|
||||||
@ -103,7 +101,7 @@ class BrowserDriver(BaseDriver):
|
|||||||
}})
|
}})
|
||||||
chrome.command_executor._request('POST', url, body)
|
chrome.command_executor._request('POST', url, body)
|
||||||
|
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'start_chrome',
|
'action': 'start_chrome',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'proxy': str(proxy) if proxy is not None else None,
|
'proxy': str(proxy) if proxy is not None else None,
|
||||||
@ -148,32 +146,19 @@ class BrowserDriver(BaseDriver):
|
|||||||
and downloaded_offset == current_offset
|
and downloaded_offset == current_offset
|
||||||
and current_offset > 0
|
and current_offset > 0
|
||||||
):
|
):
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'sent',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'filename': filename,
|
|
||||||
})
|
|
||||||
return
|
return
|
||||||
|
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'send_part',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'current_offset': current_offset,
|
|
||||||
'downloaded_offset': downloaded_offset,
|
|
||||||
'filename': filename,
|
|
||||||
})
|
|
||||||
await file.seek(current_offset)
|
await file.seek(current_offset)
|
||||||
yield await file.read(downloaded_offset - current_offset)
|
yield await file.read(downloaded_offset - current_offset)
|
||||||
current_offset = downloaded_offset
|
current_offset = downloaded_offset
|
||||||
|
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(self.file_poll_timeout)
|
||||||
raise NotFoundError()
|
raise NotFoundError()
|
||||||
finally:
|
finally:
|
||||||
await file.close()
|
await file.close()
|
||||||
|
|
||||||
def get(self, chrome, url, params):
|
def get(self, chrome, url, params):
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'get',
|
'action': 'download',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'url': url,
|
'url': url,
|
||||||
})
|
})
|
||||||
@ -190,11 +175,6 @@ class BrowserDriver(BaseDriver):
|
|||||||
if not last_element:
|
if not last_element:
|
||||||
raise RuntimeError('Nothing to click')
|
raise RuntimeError('Nothing to click')
|
||||||
chrome.execute_script("arguments[0].click();", last_element)
|
chrome.execute_script("arguments[0].click();", last_element)
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'clicked',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'element': str(last_element),
|
|
||||||
})
|
|
||||||
case 'close_window':
|
case 'close_window':
|
||||||
current_window = previous_window
|
current_window = previous_window
|
||||||
previous_window = None
|
previous_window = None
|
||||||
@ -204,11 +184,6 @@ class BrowserDriver(BaseDriver):
|
|||||||
if not last_element:
|
if not last_element:
|
||||||
raise RuntimeError('Nothing to click')
|
raise RuntimeError('Nothing to click')
|
||||||
last_element.click()
|
last_element.click()
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'native_clicked',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'element': str(last_element),
|
|
||||||
})
|
|
||||||
case 'switch_to_new_window':
|
case 'switch_to_new_window':
|
||||||
previous_window = current_window
|
previous_window = current_window
|
||||||
current_window = chrome.window_handles[-1]
|
current_window = chrome.window_handles[-1]
|
||||||
@ -227,12 +202,6 @@ class BrowserDriver(BaseDriver):
|
|||||||
action['selector'],
|
action['selector'],
|
||||||
))
|
))
|
||||||
)
|
)
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'waited_css_selector',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'element': str(last_element),
|
|
||||||
'step': action
|
|
||||||
})
|
|
||||||
case 'wait_link_text':
|
case 'wait_link_text':
|
||||||
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
|
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
|
||||||
EC.presence_of_element_located((
|
EC.presence_of_element_located((
|
||||||
@ -240,12 +209,6 @@ class BrowserDriver(BaseDriver):
|
|||||||
action['selector'],
|
action['selector'],
|
||||||
))
|
))
|
||||||
)
|
)
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'waited_link_text',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'element': str(last_element),
|
|
||||||
'step': action
|
|
||||||
})
|
|
||||||
case 'wait_xpath':
|
case 'wait_xpath':
|
||||||
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
|
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
|
||||||
EC.presence_of_element_located((
|
EC.presence_of_element_located((
|
||||||
@ -253,16 +216,10 @@ class BrowserDriver(BaseDriver):
|
|||||||
action['selector'],
|
action['selector'],
|
||||||
))
|
))
|
||||||
)
|
)
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'waited_xpath',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'element': str(last_element),
|
|
||||||
'step': action
|
|
||||||
})
|
|
||||||
case _:
|
case _:
|
||||||
raise NotImplementedError('Not implemented action type')
|
raise NotImplementedError('Not implemented action type')
|
||||||
except WebDriverException as e:
|
except WebDriverException as e:
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'error',
|
'action': 'error',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'error': str(e),
|
'error': str(e),
|
||||||
@ -294,15 +251,17 @@ class BrowserDriver(BaseDriver):
|
|||||||
source=chrome.current_url,
|
source=chrome.current_url,
|
||||||
)
|
)
|
||||||
file_validator.validate()
|
file_validator.validate()
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'validated',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'url': prepared_file_request.url,
|
|
||||||
})
|
|
||||||
return
|
return
|
||||||
except NotFoundError:
|
except NotFoundError:
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'no_response',
|
'action': 'no_response',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
})
|
})
|
||||||
|
finally:
|
||||||
|
logging.getLogger('nexus_pylon').debug({
|
||||||
|
'action': 'quit_chrome',
|
||||||
|
'mode': 'pylon',
|
||||||
|
})
|
||||||
|
chrome.quit()
|
||||||
|
shutil.rmtree(downloads_folder)
|
||||||
raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self))
|
raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self))
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import logging
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
import aiohttp.client_exceptions
|
import aiohttp.client_exceptions
|
||||||
@ -25,12 +26,27 @@ class DirectDriver(BaseDriver):
|
|||||||
@retry(
|
@retry(
|
||||||
reraise=True,
|
reraise=True,
|
||||||
wait=wait_random(min=1, max=2),
|
wait=wait_random(min=1, max=2),
|
||||||
stop=stop_after_attempt(7),
|
stop=stop_after_attempt(3),
|
||||||
retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)),
|
retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)),
|
||||||
)
|
)
|
||||||
async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict):
|
async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict):
|
||||||
|
logging.debug({
|
||||||
|
'action': 'download',
|
||||||
|
'mode': 'pylon',
|
||||||
|
'params': params,
|
||||||
|
'source': str(self),
|
||||||
|
'url': prepared_file_request.url,
|
||||||
|
})
|
||||||
async with self.get_session() as session:
|
async with self.get_session() as session:
|
||||||
async with prepared_file_request.execute_with(session=session) as resp:
|
async with prepared_file_request.execute_with(session=session) as resp:
|
||||||
|
logging.debug({
|
||||||
|
'action': 'response',
|
||||||
|
'mode': 'pylon',
|
||||||
|
'params': params,
|
||||||
|
'source': str(self),
|
||||||
|
'url': prepared_file_request.url,
|
||||||
|
'status': resp.status,
|
||||||
|
})
|
||||||
if resp.status == 404:
|
if resp.status == 404:
|
||||||
raise NotFoundError(url=prepared_file_request.url)
|
raise NotFoundError(url=prepared_file_request.url)
|
||||||
elif (
|
elif (
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
import re
|
import re
|
||||||
import sys
|
from typing import (
|
||||||
|
List,
|
||||||
|
Tuple,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Matcher:
|
class Matcher:
|
||||||
@ -10,8 +13,11 @@ class Matcher:
|
|||||||
|
|
||||||
def is_match(self, params) -> bool:
|
def is_match(self, params) -> bool:
|
||||||
for param in params:
|
for param in params:
|
||||||
if params[param]:
|
param_value = params[param]
|
||||||
if param_regex := self.param_regexes.get(param):
|
param_regex = self.param_regexes.get(param)
|
||||||
if re.match(param_regex, params[param]):
|
if param_value and param_regex:
|
||||||
return True
|
if not isinstance(param_value, (List, Tuple)):
|
||||||
return False
|
param_value = [param_value]
|
||||||
|
for el in param_value:
|
||||||
|
if re.match(param_regex, el):
|
||||||
|
return el
|
||||||
|
@ -10,7 +10,7 @@ import aiohttp
|
|||||||
from aiohttp import ClientSession
|
from aiohttp import ClientSession
|
||||||
from aiohttp.client_reqrep import ClientRequest
|
from aiohttp.client_reqrep import ClientRequest
|
||||||
from aiohttp_socks import ProxyConnector
|
from aiohttp_socks import ProxyConnector
|
||||||
from library.aiokit.aiokit import AioThing
|
from aiokit import AioThing
|
||||||
from nexus.pylon.proxy_manager import (
|
from nexus.pylon.proxy_manager import (
|
||||||
AllOf,
|
AllOf,
|
||||||
AnyOf,
|
AnyOf,
|
||||||
|
@ -228,7 +228,7 @@ class BasePdfProcessor:
|
|||||||
try:
|
try:
|
||||||
page = self.process_page(page, pdf_reader)
|
page = self.process_page(page, pdf_reader)
|
||||||
except (PdfStreamError, binascii.Error) as e:
|
except (PdfStreamError, binascii.Error) as e:
|
||||||
logging.getLogger('warning').warning({
|
logging.getLogger('nexus_pylon').warning({
|
||||||
'action': 'pdf_stream_error',
|
'action': 'pdf_stream_error',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'error': str(e),
|
'error': str(e),
|
||||||
@ -259,7 +259,7 @@ class WatermarkEraser1(BaseWatermarkEraser):
|
|||||||
if self.is_watermark_predicate(text.encode()):
|
if self.is_watermark_predicate(text.encode()):
|
||||||
xobj_death_note.append(operands[0])
|
xobj_death_note.append(operands[0])
|
||||||
operations_death_note.append(op_i)
|
operations_death_note.append(op_i)
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'watermark_removal',
|
'action': 'watermark_removal',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'text': text,
|
'text': text,
|
||||||
@ -289,7 +289,7 @@ class WatermarkEraser2(BaseWatermarkEraser):
|
|||||||
if operation == b"Tj":
|
if operation == b"Tj":
|
||||||
if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]):
|
if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]):
|
||||||
operations_death_note.append(op_i)
|
operations_death_note.append(op_i)
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'watermark_removal',
|
'action': 'watermark_removal',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'text': operands[0].decode(),
|
'text': operands[0].decode(),
|
||||||
@ -319,7 +319,7 @@ class WatermarkEraser3(BaseWatermarkEraser):
|
|||||||
text += operand
|
text += operand
|
||||||
if self.is_watermark_predicate(text):
|
if self.is_watermark_predicate(text):
|
||||||
operations_death_note.append(op_i)
|
operations_death_note.append(op_i)
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'watermark_removal',
|
'action': 'watermark_removal',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'text': text.decode(),
|
'text': text.decode(),
|
||||||
@ -402,7 +402,7 @@ class WatermarkEraser4(BaseWatermarkEraser):
|
|||||||
text, matched = tc.match(self.regexp)
|
text, matched = tc.match(self.regexp)
|
||||||
if matched:
|
if matched:
|
||||||
operations_death_note.extend(matched)
|
operations_death_note.extend(matched)
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'watermark_removal',
|
'action': 'watermark_removal',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'matched': text,
|
'matched': text,
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@ -23,6 +24,7 @@ class PreparedRequest:
|
|||||||
cookies: Optional[dict] = None,
|
cookies: Optional[dict] = None,
|
||||||
ssl: bool = True,
|
ssl: bool = True,
|
||||||
timeout: Optional[float] = None,
|
timeout: Optional[float] = None,
|
||||||
|
headers_override: bool = False
|
||||||
):
|
):
|
||||||
self.method = method
|
self.method = method
|
||||||
self.url = url
|
self.url = url
|
||||||
@ -32,6 +34,8 @@ class PreparedRequest:
|
|||||||
}
|
}
|
||||||
if headers:
|
if headers:
|
||||||
self.headers.update(headers)
|
self.headers.update(headers)
|
||||||
|
if headers_override:
|
||||||
|
self.headers = headers or {}
|
||||||
self.params = params
|
self.params = params
|
||||||
self.cookies = cookies
|
self.cookies = cookies
|
||||||
self.ssl = ssl
|
self.ssl = ssl
|
||||||
@ -49,6 +53,13 @@ class PreparedRequest:
|
|||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def execute_with(self, session):
|
async def execute_with(self, session):
|
||||||
try:
|
try:
|
||||||
|
logging.getLogger('nexus_pylon').debug({
|
||||||
|
'action': 'request',
|
||||||
|
'mode': 'pylon',
|
||||||
|
'url': self.url,
|
||||||
|
'method': self.method,
|
||||||
|
'headers': self.headers,
|
||||||
|
})
|
||||||
async with session.request(
|
async with session.request(
|
||||||
method=self.method,
|
method=self.method,
|
||||||
url=self.url,
|
url=self.url,
|
||||||
|
@ -54,6 +54,8 @@ class Proxy:
|
|||||||
|
|
||||||
class ProxyManager:
|
class ProxyManager:
|
||||||
def __init__(self, proxies=None):
|
def __init__(self, proxies=None):
|
||||||
|
if proxies is None:
|
||||||
|
proxies = []
|
||||||
self.proxies = [Proxy(proxy) for proxy in proxies]
|
self.proxies = [Proxy(proxy) for proxy in proxies]
|
||||||
|
|
||||||
def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy:
|
def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy:
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
from typing import (
|
from typing import (
|
||||||
AsyncIterable,
|
AsyncIterable,
|
||||||
Dict,
|
Dict,
|
||||||
@ -50,20 +51,25 @@ class DoiOrgRequestResolver(BaseResolver):
|
|||||||
method='get',
|
method='get',
|
||||||
url=doi_url,
|
url=doi_url,
|
||||||
timeout=self.resolve_timeout,
|
timeout=self.resolve_timeout,
|
||||||
headers={'Accept': 'application/json'}
|
headers={
|
||||||
|
'Accept': 'application/json',
|
||||||
|
}
|
||||||
).execute_with(session=session) as resp:
|
).execute_with(session=session) as resp:
|
||||||
return await resp.json()
|
return await resp.json()
|
||||||
|
|
||||||
async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]:
|
async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]:
|
||||||
body = await self.resolve_through_doi_org(params)
|
body = await self.resolve_through_doi_org(params)
|
||||||
|
selected = None
|
||||||
try:
|
try:
|
||||||
selected = json.loads(self.selector.input(body).text())
|
if text := self.selector.input(body).text():
|
||||||
|
selected = json.loads(text)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logging.getLogger('error').error({
|
logging.getLogger('nexus_pylon').error({
|
||||||
'action': 'error',
|
'action': 'error',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'params': params,
|
'params': params,
|
||||||
'error': str(e)
|
'error': str(e),
|
||||||
|
'selector': str(self.selector),
|
||||||
})
|
})
|
||||||
return
|
return
|
||||||
if selected:
|
if selected:
|
||||||
@ -73,7 +79,7 @@ class DoiOrgRequestResolver(BaseResolver):
|
|||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logging.getLogger('debug').error({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'missed_selector',
|
'action': 'missed_selector',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'params': params,
|
'params': params,
|
||||||
|
@ -15,10 +15,12 @@ class RequestResolver(BaseResolver):
|
|||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
extractors: List,
|
extractors: List,
|
||||||
|
resolve_timeout: float = 10.0,
|
||||||
proxy_list: Optional[List] = None,
|
proxy_list: Optional[List] = None,
|
||||||
proxy_manager: Optional[ProxyManager] = None,
|
proxy_manager: Optional[ProxyManager] = None,
|
||||||
):
|
):
|
||||||
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||||
|
self.resolve_timeout = resolve_timeout
|
||||||
self.url = url
|
self.url = url
|
||||||
self.extractors = extractors
|
self.extractors = extractors
|
||||||
|
|
||||||
@ -31,9 +33,9 @@ class RequestResolver(BaseResolver):
|
|||||||
async with PreparedRequest(
|
async with PreparedRequest(
|
||||||
method='get',
|
method='get',
|
||||||
url=url,
|
url=url,
|
||||||
timeout=10.0,
|
timeout=self.resolve_timeout,
|
||||||
).execute_with(session=session) as resp:
|
).execute_with(session=session) as resp:
|
||||||
# Sometimes sci-hub returns file
|
# Sometimes hosts return file URL
|
||||||
if resp.headers.get('Content-Type') == 'application/pdf':
|
if resp.headers.get('Content-Type') == 'application/pdf':
|
||||||
yield PreparedRequest(method='get', url=url, timeout=10.0)
|
yield PreparedRequest(method='get', url=url, timeout=10.0)
|
||||||
downloaded_page_bytes = await resp.read()
|
downloaded_page_bytes = await resp.read()
|
||||||
@ -42,9 +44,11 @@ class RequestResolver(BaseResolver):
|
|||||||
for extractor in self.extractors:
|
for extractor in self.extractors:
|
||||||
match = re.search(extractor['re'], downloaded_page, re.IGNORECASE)
|
match = re.search(extractor['re'], downloaded_page, re.IGNORECASE)
|
||||||
if match:
|
if match:
|
||||||
matched_group = match.group(extractor['producer']['group'])
|
|
||||||
yield PreparedRequest(
|
yield PreparedRequest(
|
||||||
method='get',
|
method='get',
|
||||||
url=extractor['producer']['format_string'].format(matched_group=matched_group),
|
url=extractor['producer']['format_string'].format(
|
||||||
|
host=resp.real_url.host,
|
||||||
|
**match.groupdict()
|
||||||
|
),
|
||||||
timeout=extractor['producer'].get('timeout', 10.0),
|
timeout=extractor['producer'].get('timeout', 10.0),
|
||||||
)
|
)
|
||||||
|
@ -14,19 +14,27 @@ class TemplateResolver(BaseResolver):
|
|||||||
self,
|
self,
|
||||||
format_string: str = 'https://doi.org/{doi}',
|
format_string: str = 'https://doi.org/{doi}',
|
||||||
timeout: float = 10.0,
|
timeout: float = 10.0,
|
||||||
|
method: str = 'GET',
|
||||||
|
headers: Optional[dict] = None,
|
||||||
|
headers_override: bool = False,
|
||||||
proxy_list: Optional[List] = None,
|
proxy_list: Optional[List] = None,
|
||||||
proxy_manager: Optional[ProxyManager] = None,
|
proxy_manager: Optional[ProxyManager] = None,
|
||||||
):
|
):
|
||||||
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||||
self.format_string = format_string
|
self.format_string = format_string
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
|
self.method = method
|
||||||
|
self.headers = headers
|
||||||
|
self.headers_override = headers_override
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f'{self.__class__.__name__}({self.format_string})'
|
return f'{self.__class__.__name__}({self.format_string})'
|
||||||
|
|
||||||
async def resolve(self, params) -> AsyncIterable[PreparedRequest]:
|
async def resolve(self, params) -> AsyncIterable[PreparedRequest]:
|
||||||
yield PreparedRequest(
|
yield PreparedRequest(
|
||||||
method='GET',
|
method=self.method,
|
||||||
url=self.format_string.format(**params),
|
url=self.format_string.format(**params),
|
||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
|
headers=self.headers,
|
||||||
|
headers_override=self.headers_override,
|
||||||
)
|
)
|
||||||
|
@ -2,12 +2,12 @@ import logging
|
|||||||
from typing import (
|
from typing import (
|
||||||
AsyncIterable,
|
AsyncIterable,
|
||||||
Dict,
|
Dict,
|
||||||
List,
|
Optional,
|
||||||
)
|
)
|
||||||
|
|
||||||
from aiohttp.client_exceptions import ClientPayloadError
|
from aiohttp.client_exceptions import ClientPayloadError
|
||||||
from library.aiokit.aiokit import AioThing
|
from aiokit import AioThing
|
||||||
from library.logging import error_log
|
from izihawa_utils.importlib import import_object
|
||||||
from nexus.pylon.drivers.base import BaseDriver
|
from nexus.pylon.drivers.base import BaseDriver
|
||||||
from nexus.pylon.exceptions import (
|
from nexus.pylon.exceptions import (
|
||||||
DownloadError,
|
DownloadError,
|
||||||
@ -16,7 +16,6 @@ from nexus.pylon.exceptions import (
|
|||||||
from nexus.pylon.matcher import Matcher
|
from nexus.pylon.matcher import Matcher
|
||||||
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
|
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
|
||||||
from nexus.pylon.resolvers.base import BaseResolver
|
from nexus.pylon.resolvers.base import BaseResolver
|
||||||
from utils.izihawa_utils.importlib import import_object
|
|
||||||
|
|
||||||
|
|
||||||
class Source(AioThing):
|
class Source(AioThing):
|
||||||
@ -29,12 +28,15 @@ class Source(AioThing):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def from_config(
|
def from_config(
|
||||||
cls,
|
cls,
|
||||||
proxy_manager,
|
config,
|
||||||
source_config,
|
source_config,
|
||||||
downloads_directory: str,
|
proxy_manager,
|
||||||
default_driver_proxy_list: List,
|
) -> Optional['Source']:
|
||||||
default_resolver_proxy_list: List,
|
driver_cls_name = source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver')
|
||||||
) -> 'Source':
|
|
||||||
|
if driver_cls_name.endswith('BrowserDriver') and config.get('webdriver_hub') is None:
|
||||||
|
return None
|
||||||
|
|
||||||
matcher = Matcher(source_config['matcher'])
|
matcher = Matcher(source_config['matcher'])
|
||||||
|
|
||||||
resolver_cls = import_object(
|
resolver_cls = import_object(
|
||||||
@ -42,16 +44,16 @@ class Source(AioThing):
|
|||||||
)
|
)
|
||||||
resolver_args = dict(
|
resolver_args = dict(
|
||||||
proxy_manager=proxy_manager,
|
proxy_manager=proxy_manager,
|
||||||
proxy_list=default_resolver_proxy_list,
|
proxy_list=config['default_resolver_proxy_list'],
|
||||||
)
|
)
|
||||||
resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
|
resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
|
||||||
resolver = resolver_cls(**resolver_args)
|
resolver = resolver_cls(**resolver_args)
|
||||||
|
|
||||||
driver_cls = import_object(source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver'))
|
driver_cls = import_object(driver_cls_name)
|
||||||
driver_args = dict(
|
driver_args = dict(
|
||||||
proxy_manager=proxy_manager,
|
proxy_manager=proxy_manager,
|
||||||
downloads_directory=downloads_directory,
|
proxy_list=config['default_driver_proxy_list'],
|
||||||
proxy_list=default_driver_proxy_list,
|
config=config,
|
||||||
)
|
)
|
||||||
driver_args.update(**source_config.get('driver', {}).get('args', {}))
|
driver_args.update(**source_config.get('driver', {}).get('args', {}))
|
||||||
driver = driver_cls(**driver_args)
|
driver = driver_cls(**driver_args)
|
||||||
@ -67,13 +69,6 @@ class Source(AioThing):
|
|||||||
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
|
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
|
||||||
yield FileResponsePb(status=FileResponsePb.Status.RESOLVING)
|
yield FileResponsePb(status=FileResponsePb.Status.RESOLVING)
|
||||||
async for prepared_file_request in self.resolver.resolve(params):
|
async for prepared_file_request in self.resolver.resolve(params):
|
||||||
logging.debug({
|
|
||||||
'action': 'download',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'params': params,
|
|
||||||
'source': str(self),
|
|
||||||
'url': prepared_file_request.url,
|
|
||||||
})
|
|
||||||
try:
|
try:
|
||||||
async for resp in self.driver.execute_prepared_file_request(
|
async for resp in self.driver.execute_prepared_file_request(
|
||||||
prepared_file_request=prepared_file_request,
|
prepared_file_request=prepared_file_request,
|
||||||
@ -82,11 +77,11 @@ class Source(AioThing):
|
|||||||
yield resp
|
yield resp
|
||||||
return
|
return
|
||||||
except ClientPayloadError as e:
|
except ClientPayloadError as e:
|
||||||
error_log(e, level=logging.WARNING)
|
logging.getLogger('nexus_pylon').warning(e)
|
||||||
continue
|
continue
|
||||||
except NotFoundError:
|
except NotFoundError:
|
||||||
continue
|
continue
|
||||||
except DownloadError as e:
|
except DownloadError as e:
|
||||||
error_log(e)
|
logging.getLogger('nexus_pylon').warning(e)
|
||||||
continue
|
continue
|
||||||
raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver))
|
raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver))
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
|
from .base import BaseValidator
|
||||||
from .md5 import Md5Validator
|
from .md5 import Md5Validator
|
||||||
from .pdf import PdfValidator
|
from .pdf import PdfValidator
|
||||||
|
|
||||||
__all__ = ['Md5Validator', 'PdfValidator']
|
__all__ = ['BaseValidator', 'Md5Validator', 'PdfValidator']
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
|
||||||
class BaseValidator:
|
class BaseValidator:
|
||||||
|
def __init__(self, params: Dict):
|
||||||
|
self.params = params
|
||||||
|
|
||||||
def update(self, chunk):
|
def update(self, chunk):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -7,6 +7,7 @@ from nexus.pylon.validators.base import BaseValidator
|
|||||||
|
|
||||||
class Md5Validator(BaseValidator):
|
class Md5Validator(BaseValidator):
|
||||||
def __init__(self, params: Dict):
|
def __init__(self, params: Dict):
|
||||||
|
super().__init__(params)
|
||||||
self.md5 = params['md5']
|
self.md5 = params['md5']
|
||||||
self.v = hashlib.md5()
|
self.v = hashlib.md5()
|
||||||
|
|
||||||
|
@ -12,7 +12,7 @@ from PyPDF2.errors import PdfReadError
|
|||||||
|
|
||||||
class PdfValidator(BaseValidator):
|
class PdfValidator(BaseValidator):
|
||||||
def __init__(self, params: Dict):
|
def __init__(self, params: Dict):
|
||||||
self.params = params
|
super().__init__(params)
|
||||||
self.md5 = params.get('md5')
|
self.md5 = params.get('md5')
|
||||||
self.file = bytes()
|
self.file = bytes()
|
||||||
self.v = hashlib.md5()
|
self.v = hashlib.md5()
|
||||||
@ -24,7 +24,7 @@ class PdfValidator(BaseValidator):
|
|||||||
|
|
||||||
def validate(self):
|
def validate(self):
|
||||||
if self.md5 and self.md5.lower() == self.v.hexdigest().lower():
|
if self.md5 and self.md5.lower() == self.v.hexdigest().lower():
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'validation',
|
'action': 'validation',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'result': 'md5_ok',
|
'result': 'md5_ok',
|
||||||
@ -32,7 +32,7 @@ class PdfValidator(BaseValidator):
|
|||||||
})
|
})
|
||||||
return
|
return
|
||||||
elif not is_pdf(f=self.file):
|
elif not is_pdf(f=self.file):
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'validation',
|
'action': 'validation',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'result': 'not_pdf',
|
'result': 'not_pdf',
|
||||||
@ -41,28 +41,18 @@ class PdfValidator(BaseValidator):
|
|||||||
raise BadResponseError(file=str(self.file[:100]))
|
raise BadResponseError(file=str(self.file[:100]))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'open_pdf',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'file_len': len(self.file),
|
|
||||||
'params': self.params,
|
|
||||||
})
|
|
||||||
PyPDF2.PdfReader(BytesIO(self.file))
|
PyPDF2.PdfReader(BytesIO(self.file))
|
||||||
logging.getLogger('debug').debug({
|
|
||||||
'action': 'opened_pdf',
|
|
||||||
'mode': 'pylon',
|
|
||||||
'file_len': len(self.file),
|
|
||||||
'params': self.params,
|
|
||||||
})
|
|
||||||
except PdfReadError:
|
except PdfReadError:
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'validation',
|
'action': 'validation',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'result': 'not_opened_as_pdf',
|
'result': 'not_opened_as_pdf',
|
||||||
|
'params': self.params,
|
||||||
})
|
})
|
||||||
raise BadResponseError(file=str(self.file[:100]))
|
raise BadResponseError(file=str(self.file[:100]))
|
||||||
logging.getLogger('debug').debug({
|
logging.getLogger('nexus_pylon').debug({
|
||||||
'action': 'validation',
|
'action': 'validation',
|
||||||
'mode': 'pylon',
|
'mode': 'pylon',
|
||||||
'result': 'ok',
|
'result': 'ok',
|
||||||
|
'params': self.params,
|
||||||
})
|
})
|
||||||
|
@ -10,6 +10,6 @@ py_library(
|
|||||||
srcs_version = "PY3",
|
srcs_version = "PY3",
|
||||||
visibility = ["//visibility:public"],
|
visibility = ["//visibility:public"],
|
||||||
deps = [
|
deps = [
|
||||||
"//library/configurator",
|
requirement("izihawa_configurator"),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from library.configurator import Configurator
|
from izihawa_configurator import Configurator
|
||||||
|
|
||||||
|
|
||||||
def get_translations():
|
def get_translations():
|
||||||
|
@ -33,7 +33,10 @@ class ProgressBar:
|
|||||||
tail_text,
|
tail_text,
|
||||||
message=None,
|
message=None,
|
||||||
source=None,
|
source=None,
|
||||||
throttle_secs: float = 0,
|
throttle_secs: float = 0.0,
|
||||||
|
hard_throttle_secs: float = 10.0,
|
||||||
|
last_call: float = 0.0,
|
||||||
|
done_threshold_size: int = 10 * 1024 * 1024,
|
||||||
):
|
):
|
||||||
self.telegram_client = telegram_client
|
self.telegram_client = telegram_client
|
||||||
self.request_context = request_context
|
self.request_context = request_context
|
||||||
@ -45,9 +48,12 @@ class ProgressBar:
|
|||||||
self.done = 0
|
self.done = 0
|
||||||
self.total = 1
|
self.total = 1
|
||||||
self.throttle_secs = throttle_secs
|
self.throttle_secs = throttle_secs
|
||||||
|
self.hard_throttle_secs = hard_throttle_secs
|
||||||
|
self.done_threshold_size = done_threshold_size
|
||||||
|
|
||||||
|
self.previous_done = 0
|
||||||
self.last_text = None
|
self.last_text = None
|
||||||
self.last_call = 0
|
self.last_call = last_call
|
||||||
|
|
||||||
def share(self):
|
def share(self):
|
||||||
if self.total > 0:
|
if self.total > 0:
|
||||||
@ -56,6 +62,7 @@ class ProgressBar:
|
|||||||
return f'{float(self.done / (1024 * 1024)):.1f}Mb'
|
return f'{float(self.done / (1024 * 1024)):.1f}Mb'
|
||||||
|
|
||||||
def _set_progress(self, done, total):
|
def _set_progress(self, done, total):
|
||||||
|
self.previous_done = self.done
|
||||||
self.done = done
|
self.done = done
|
||||||
self.total = total
|
self.total = total
|
||||||
|
|
||||||
@ -74,11 +81,20 @@ class ProgressBar:
|
|||||||
progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| '
|
progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| '
|
||||||
|
|
||||||
tail_text = self.tail_text.format(source=self.source)
|
tail_text = self.tail_text.format(source=self.source)
|
||||||
return f'`{self.header}\n{progress_bar}{self.share()} {tail_text}`'
|
return f'`{self.header}\n{progress_bar}{self.share().ljust(8)} {tail_text}`'
|
||||||
|
|
||||||
|
def should_send(self, now, ignore_last_call):
|
||||||
|
if ignore_last_call:
|
||||||
|
return True
|
||||||
|
if abs(now - self.last_call) > self.hard_throttle_secs:
|
||||||
|
return True
|
||||||
|
if abs(now - self.last_call) > self.throttle_secs and (self.done - self.previous_done) < self.done_threshold_size:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
async def send_message(self, text, ignore_last_call=False):
|
async def send_message(self, text, ignore_last_call=False):
|
||||||
now = time.time()
|
now = time.time()
|
||||||
if not ignore_last_call and abs(now - self.last_call) < self.throttle_secs:
|
if not self.should_send(now, ignore_last_call):
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
if not self.message:
|
if not self.message:
|
||||||
@ -103,17 +119,3 @@ class ProgressBar:
|
|||||||
async def callback(self, done, total, ignore_last_call=False):
|
async def callback(self, done, total, ignore_last_call=False):
|
||||||
self._set_progress(done, total)
|
self._set_progress(done, total)
|
||||||
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
|
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
|
||||||
|
|
||||||
|
|
||||||
class ThrottlerWrapper:
|
|
||||||
def __init__(self, callback: Callable, throttle_secs: Union[int, float]):
|
|
||||||
self.callback = callback
|
|
||||||
self.last_call = 0
|
|
||||||
self.throttle_secs = throttle_secs
|
|
||||||
|
|
||||||
async def __call__(self, *args, **kwargs):
|
|
||||||
now = time.time()
|
|
||||||
if abs(now - self.last_call) < self.throttle_secs:
|
|
||||||
return
|
|
||||||
self.last_call = now
|
|
||||||
return await self.callback(*args, **kwargs)
|
|
||||||
|
@ -63,7 +63,6 @@ class ScimagViewBuilder(BaseViewBuilder):
|
|||||||
'chapter': '🔖',
|
'chapter': '🔖',
|
||||||
'book-chapter': '🔖',
|
'book-chapter': '🔖',
|
||||||
}
|
}
|
||||||
multihash_ix = 0
|
|
||||||
|
|
||||||
def is_preprint(self):
|
def is_preprint(self):
|
||||||
return self.document_holder.doi.split('/')[0] in preprints
|
return self.document_holder.doi.split('/')[0] in preprints
|
||||||
|
@ -13,7 +13,7 @@ aiohttp-socks==0.7.1
|
|||||||
aiokafka==0.7.2
|
aiokafka==0.7.2
|
||||||
aiokit==1.1.2
|
aiokit==1.1.2
|
||||||
aiosignal==1.2.0
|
aiosignal==1.2.0
|
||||||
aiosumma==2.8.13
|
aiosumma==2.10.4
|
||||||
asn1crypto==1.5.1
|
asn1crypto==1.5.1
|
||||||
async-generator==1.10
|
async-generator==1.10
|
||||||
async-timeout==4.0.2
|
async-timeout==4.0.2
|
||||||
@ -55,7 +55,7 @@ h11==0.13.0
|
|||||||
idna==3.3
|
idna==3.3
|
||||||
iniconfig==1.1.1
|
iniconfig==1.1.1
|
||||||
isort==5.10.1
|
isort==5.10.1
|
||||||
izihawa-nlptools==1.1.7
|
izihawa-nlptools==1.1.9
|
||||||
izihawa-types==0.1.3
|
izihawa-types==0.1.3
|
||||||
izihawa-utils==1.0.7
|
izihawa-utils==1.0.7
|
||||||
Jinja2==3.1.2
|
Jinja2==3.1.2
|
||||||
|
Loading…
Reference in New Issue
Block a user