diff --git a/idm/api/BUILD.bazel b/idm/api/BUILD.bazel
index c5a805a..207bc87 100644
--- a/idm/api/BUILD.bazel
+++ b/idm/api/BUILD.bazel
@@ -43,7 +43,7 @@ py3_image(
"//library/aiogrpctools",
requirement("aiokit"),
"//library/aiopostgres",
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/telegram",
requirement("izihawa_utils"),
],
diff --git a/idm/api/configs/__init__.py b/idm/api/configs/__init__.py
index 339e2f7..5d552f4 100644
--- a/idm/api/configs/__init__.py
+++ b/idm/api/configs/__init__.py
@@ -1,11 +1,13 @@
-from library.configurator import Configurator
+from izihawa_configurator import Configurator
+from izihawa_utils import env
def get_config():
return Configurator([
'idm/api/configs/base.yaml',
+ 'idm/api/configs/%s.yaml?' % env.type,
'idm/api/configs/logging.yaml',
- ], env_prefix='NEXUS_IDM_API')
+ ], env_prefix='IDM_API')
config = get_config()
diff --git a/idm/api/configs/base.yaml b/idm/api/configs/base.yaml
index 91819d3..95f4228 100644
--- a/idm/api/configs/base.yaml
+++ b/idm/api/configs/base.yaml
@@ -2,25 +2,25 @@
application:
debug: true
service_name: idm-api
-database:
- idm:
- database: nexus
- host:
- password:
- username:
- drivername: postgresql
- port: 5432
- nexus:
- database: nexus
- host:
- password:
- username:
- drivername: postgresql
- port: 5432
clickhouse:
host:
password:
username:
+database:
+ idm:
+ database: nexus
+ drivername: postgresql
+ host:
+ password:
+ port: 5432
+ username:
+ nexus:
+ database: nexus
+ drivername: postgresql
+ host:
+ password:
+ port: 5432
+ username:
grpc:
address: 0.0.0.0
port: 82
diff --git a/idm/api/main.py b/idm/api/main.py
index 9496936..c86d2a5 100644
--- a/idm/api/main.py
+++ b/idm/api/main.py
@@ -7,9 +7,9 @@ from idm.api.configs import get_config
from idm.api.services.chat_manager import ChatManagerService
from idm.api.services.profile import ProfileService
from idm.api.services.subscription_manager import SubscriptionManagerService
+from izihawa_configurator import Configurator
from library.aiogrpctools import AioGrpcServer
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
-from library.configurator import Configurator
from library.logging import configure_logging
diff --git a/idm/api/services/profile.py b/idm/api/services/profile.py
index 857deed..dfc78d4 100644
--- a/idm/api/services/profile.py
+++ b/idm/api/services/profile.py
@@ -105,20 +105,24 @@ class ProfileService(profile_service_pb2_grpc.ProfileServicer, BaseService):
for tag in download_document.tags:
tags_counter[tag] += 1
- most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:7]
+ most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:14]
most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7]
most_popular_series = []
- async for row in self.application.pool_holder['nexus'].iterate(
- f"select name, issns from series where issns && array[{most_popular_issns}]::text[]".format(
- most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
- ),
- row_factory=dict_row,
- ):
- most_popular_series.append(profile_service_pb2.Series(
- name=row['name'],
- issns=row['issns'],
- ))
+ if most_popular_issns:
+ async for row in self.application.pool_holder['nexus'].iterate(
+ "select name, array_agg(issn) as issns from series "
+ "where issn in ({most_popular_issns}) "
+ "group by name order by name "
+ "limit 7".format(
+ most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)),
+ ),
+ row_factory=dict_row,
+ ):
+ most_popular_series.append(profile_service_pb2.Series(
+ name=row['name'],
+ issns=row['issns'],
+ ))
return most_popular_series, most_popular_tags
diff --git a/library/aiogrpctools/BUILD.bazel b/library/aiogrpctools/BUILD.bazel
index d0dd791..1b637dd 100644
--- a/library/aiogrpctools/BUILD.bazel
+++ b/library/aiogrpctools/BUILD.bazel
@@ -13,7 +13,7 @@ py_library(
requirement("grpcio"),
requirement("pyyaml"),
requirement("aiokit"),
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/logging",
requirement("izihawa_utils"),
],
diff --git a/library/aiopostgres/pool_holder.py b/library/aiopostgres/pool_holder.py
index 31830b2..31385d3 100644
--- a/library/aiopostgres/pool_holder.py
+++ b/library/aiopostgres/pool_holder.py
@@ -92,6 +92,7 @@ class AioPostgresPoolHolder(AioThing):
row_factory=tuple_row,
cursor_name: Optional[str] = None,
itersize: Optional[int] = None,
+ statement_timeout: Optional[int] = None,
):
if not self.pool:
raise RuntimeError('AioPostgresPoolHolder has not been started')
@@ -99,7 +100,9 @@ class AioPostgresPoolHolder(AioThing):
async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur:
if itersize is not None:
cur.itersize = itersize
- await cur.execute(stmt, values)
+ await cur.execute(stmt + ';' if statement_timeout else '', values)
+ if statement_timeout:
+ await cur.execute(f'SET statement_timeout = {statement_timeout};')
async for row in cur:
yield row
diff --git a/library/configurator/BUILD.bazel b/library/configurator/BUILD.bazel
deleted file mode 100644
index 09efa31..0000000
--- a/library/configurator/BUILD.bazel
+++ /dev/null
@@ -1,17 +0,0 @@
-load("@pip_modules//:requirements.bzl", "requirement")
-load("@rules_python//python:defs.bzl", "py_library")
-
-py_library(
- name = "configurator",
- srcs = glob(
- ["**/*.py"],
- exclude = ["tests/**"],
- ),
- srcs_version = "PY3",
- visibility = ["//visibility:public"],
- deps = [
- requirement("jinja2"),
- requirement("pyyaml"),
- requirement("izihawa_utils"),
- ],
-)
diff --git a/library/configurator/__init__.py b/library/configurator/__init__.py
deleted file mode 100644
index 3c77143..0000000
--- a/library/configurator/__init__.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import json
-import os
-import os.path
-from types import ModuleType
-
-import yaml
-from izihawa_utils.common import (
- smart_merge_dicts,
- unflatten,
-)
-from jinja2 import Template
-from library.configurator.exceptions import UnknownConfigFormatError
-
-
-class ConfigObject(dict):
- def __getattr__(self, name):
- try:
- return self[name]
- except KeyError as e:
- raise AttributeError(e)
-
-
-class AnyOf:
- def __init__(self, *args):
- self.args = args
-
-
-class RichDict(dict):
- def has(self, *args):
- current = self
- for c in args:
- if c not in current:
- return False
- current = current[c]
- return True
-
- def copy_if_exists(self, source_keys, target_key):
- current = self
- for c in source_keys:
- if c not in current:
- return False
- current = current[c]
- self[target_key] = current
- return True
-
-
-class Configurator(RichDict):
- def __init__(self, configs: list, env_prefix: str = None, env_key_separator: str = '.'):
- """
- Create Configurator object
-
- :param configs: list of paths to config files, dicts or modules.
- End filepath with `?` to mark it as optional config.
- """
- super().__init__()
-
- self._by_basenames = {}
- self._omitted_files = []
-
- env_dict = {}
-
- if env_prefix:
- env_prefix = env_prefix.lower()
- for name, value in os.environ.items():
- if name.lower().startswith(env_prefix):
- stripped_name = name[len(env_prefix):].lstrip('_')
- if stripped_name[-2:] == '[]':
- if stripped_name not in env_dict:
- env_dict[stripped_name[:-2]] = []
- env_dict[stripped_name[:-2]].append(value)
- else:
- env_dict[stripped_name] = value
- env_dict = unflatten(env_dict, sep=env_key_separator)
-
- for config in ([os.environ] + configs + [env_dict]):
- file_found = self.update(config)
- if not file_found:
- self._omitted_files.append(config)
-
- def _config_filename(self, filename):
- return os.path.join(os.getcwd(), filename)
-
- def walk_and_render(self, c):
- if isinstance(c, str):
- return Template(c).render(**self)
- elif isinstance(c, list):
- return [self.walk_and_render(e) for e in c]
- elif isinstance(c, dict):
- for key in list(c.keys()):
- c[key] = self.walk_and_render(c[key])
- if key.endswith('_filepath'):
- with open(c[key]) as f:
- if c[key].endswith('.json'):
- c[key.replace('_filepath', '')] = json.loads(f.read())
- elif c[key].endswith('.yaml'):
- c[key.replace('_filepath', '')] = yaml.safe_load(f.read())
- return c
-
- def update(self, new_config, basename=None, **kwargs):
- if isinstance(new_config, AnyOf):
- for config in new_config.args:
- try:
- return self.update(config.rstrip('?'))
- except IOError:
- pass
- raise IOError('None of %s was found' % ', '.join(new_config.args))
- elif isinstance(new_config, str):
- optional = new_config.endswith('?')
- filename = new_config.rstrip('?')
- basename = basename or os.path.basename(filename)
-
- config_filename = self._config_filename(filename)
-
- data = None
-
- if os.path.exists(config_filename) and os.access(config_filename, os.R_OK):
- with open(config_filename) as f:
- data = f.read()
-
- if data is None:
- if optional:
- return False
- else:
- raise IOError(f'File {config_filename} not found')
-
- if filename.endswith('.json'):
- new_config = json.loads(data)
- elif filename.endswith('.yaml'):
- new_config = yaml.safe_load(data)
- else:
- raise UnknownConfigFormatError(filename)
-
- new_config = self.walk_and_render(new_config)
-
- elif isinstance(new_config, ModuleType):
- new_config = new_config.__dict__
-
- elif callable(new_config):
- new_config = new_config(self)
-
- if not new_config:
- new_config = {}
-
- for k in new_config:
- if callable(new_config[k]):
- new_config[k] = new_config[k](context=self)
-
- if 'log_path' in new_config:
- new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/')
-
- smart_merge_dicts(self, new_config, list_policy='override', copy=False)
- if basename:
- self._by_basenames[basename] = new_config
-
- return True
-
- def get_config_by_basename(self, basename):
- return self._by_basenames[basename]
-
- def get_object_by_basename(self, basename):
- return ConfigObject(self._by_basenames[basename])
-
- def has_missed_configs(self):
- return bool(self._omitted_files)
-
- def has_file(self, basename):
- return basename in self._by_basenames
-
- def get_files(self):
- return self._by_basenames
diff --git a/library/configurator/exceptions.py b/library/configurator/exceptions.py
deleted file mode 100644
index cbc1f99..0000000
--- a/library/configurator/exceptions.py
+++ /dev/null
@@ -1,2 +0,0 @@
-class UnknownConfigFormatError(Exception):
- pass
diff --git a/nexus/actions/document_operations_pb/update_document_scimag_pb.py b/nexus/actions/document_operations_pb/update_document_scimag_pb.py
index 1bec67b..7b35df4 100644
--- a/nexus/actions/document_operations_pb/update_document_scimag_pb.py
+++ b/nexus/actions/document_operations_pb/update_document_scimag_pb.py
@@ -159,7 +159,6 @@ class ToSummaAction(BaseAction):
'journal',
'journal-issue',
'journal-volume',
- 'other',
'peer-review',
'proceedings',
'report-series',
diff --git a/nexus/bot/BUILD.bazel b/nexus/bot/BUILD.bazel
index b8666ac..951f102 100644
--- a/nexus/bot/BUILD.bazel
+++ b/nexus/bot/BUILD.bazel
@@ -35,7 +35,7 @@ py3_image(
requirement("aiobaseclient"),
requirement("aiocrossref"),
requirement("aiokit"),
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/logging",
"//library/telegram",
"//nexus/hub/aioclient",
diff --git a/nexus/bot/configs/__init__.py b/nexus/bot/configs/__init__.py
index a954719..91097a5 100644
--- a/nexus/bot/configs/__init__.py
+++ b/nexus/bot/configs/__init__.py
@@ -1,5 +1,5 @@
+from izihawa_configurator import Configurator
from izihawa_utils import env
-from library.configurator import Configurator
def get_config():
diff --git a/nexus/bot/handlers/view.py b/nexus/bot/handlers/view.py
index d562091..d10d00d 100644
--- a/nexus/bot/handlers/view.py
+++ b/nexus/bot/handlers/view.py
@@ -104,7 +104,7 @@ class ViewHandler(BaseHandler):
),
event.delete(),
]
- if not has_found_old_widget:
+ if not has_found_old_widget and is_earlier_than_2_days(old_message):
async with safe_execution(error_log=request_context.error_log):
await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id])
return await asyncio.gather(*actions)
diff --git a/nexus/cognitron/configs/scimag.yaml b/nexus/cognitron/configs/scimag.yaml
index b345b4c..594552a 100644
--- a/nexus/cognitron/configs/scimag.yaml
+++ b/nexus/cognitron/configs/scimag.yaml
@@ -189,6 +189,13 @@ schema:
record: basic
tokenizer: raw
stored: true
+ - name: series_page_rank
+ type: f64
+ options:
+ fast: single
+ fieldnorms: false
+ indexed: true
+ stored: true
multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"]
primary_key: "id"
stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or',
diff --git a/nexus/hub/BUILD.bazel b/nexus/hub/BUILD.bazel
index 9510659..b6b8996 100644
--- a/nexus/hub/BUILD.bazel
+++ b/nexus/hub/BUILD.bazel
@@ -43,7 +43,7 @@ py3_image(
requirement("aioipfs-2"),
requirement("aiokit"),
"//library/aiopostgres",
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/telegram",
"//nexus/hub/proto:grpc_py",
"//nexus/hub/proto:proto_py",
diff --git a/nexus/hub/configs/__init__.py b/nexus/hub/configs/__init__.py
index 02c7ff7..e282a6d 100644
--- a/nexus/hub/configs/__init__.py
+++ b/nexus/hub/configs/__init__.py
@@ -1,5 +1,5 @@
+from izihawa_configurator import Configurator
from izihawa_utils import env
-from library.configurator import Configurator
def get_config():
diff --git a/nexus/hub/configs/pylon.yaml b/nexus/hub/configs/pylon.yaml
deleted file mode 100644
index 5091ed1..0000000
--- a/nexus/hub/configs/pylon.yaml
+++ /dev/null
@@ -1,629 +0,0 @@
----
-pylon:
- default_driver_proxy_list:
- - [cambridge]
- - [edinburg]
- - [southampton]
- default_resolver_proxy_list: ~
- downloads_directory: /downloads
- proxies:
- - address: clash.default.svc.cluster.example.com:7890
- name: cambridge
- tags: ['cambridge']
- - address: clash.default.svc.cluster.example.com:7990
- name: edinburg
- tags: ['edinburg']
- - address: clash.default.svc.cluster.example.com:8090
- name: southampton
- tags: ['southampton']
- - address: socks5://clash.default.svc.cluster.example.com:7991
- name: socks5
- tags: ['socks5']
- sources:
- # LibGen.rocks
- - driver:
- args:
- proxy_list: ~
- validator:
- class: nexus.pylon.validators.Md5Validator
- class:
- nexus.pylon.drivers.DirectDriver
- matcher:
- md5: ^.*$
- resolver:
- args:
- extractors:
- - producer:
- format_string: 'http://libgen.rocks/{matched_group}'
- group: 0
- re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
- timeout: 25.0
- type: regex
- url: https://libgen.rocks/ads.php?md5={md5}
- class: nexus.pylon.resolvers.RequestResolver
- # LibGen.rocks
- - driver:
- args:
- proxy_list: ~
- class:
- nexus.pylon.drivers.DirectDriver
- matcher:
- doi: ^.*$
- resolver:
- args:
- extractors:
- - producer:
- format_string: 'http://libgen.rocks/{matched_group}'
- group: 0
- re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
- timeout: 25.0
- type: regex
- url: 'https://libgen.rocks/ads.php?doi={doi}'
- class: nexus.pylon.resolvers.RequestResolver
- # Library.lol
- - driver:
- args:
- proxy_list: ~
- validator:
- class: nexus.pylon.validators.Md5Validator
- class:
- nexus.pylon.drivers.DirectDriver
- matcher:
- md5: ^.*$
- resolver:
- args:
- extractors:
- - producer:
- format_string: '{matched_group}'
- group: 1
- re: 'GET'
- timeout: 45.0
- type: regex
- - producer:
- format_string: '{matched_group}'
- group: 0
- re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
- type: regex
- - producer:
- format_string: '{matched_group}'
- group: 0
- re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
- type: regex
- url: http://library.lol/main/{md5}
- class: nexus.pylon.resolvers.RequestResolver
- # library.lol
- - driver:
- args:
- proxy_list: ~
- class:
- nexus.pylon.drivers.DirectDriver
- matcher:
- doi: ^.*$
- resolver:
- args:
- extractors:
- - producer:
- format_string: '{matched_group}'
- group: 1
- re: 'GET'
- timeout: 45.0
- type: regex
- url: 'http://library.lol/scimag/{doi}'
- class: nexus.pylon.resolvers.RequestResolver
- # jamanetwork.com
- - driver:
- args:
- actions:
- - selector: '#pdf-link .toolbar-link-text-extra, .contents-tab-contents > #pdf-link .toolbar-link-text-extra'
- timeout: 20
- type: wait_css_selector
- - type: click
- class:
- nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.1001/.*$
- # wires.onlinelibrary.wiley.com
- - matcher:
- doi: ^10.1002/.*$
- resolver:
- args:
- format_string: 'https://onlinelibrary.wiley.com/doi/pdf/{doi}'
- class: nexus.pylon.resolvers.TemplateResolver
- # link.springer.com
- - matcher:
- doi: ^10.(1007|14283)/.*$
- resolver:
- args:
- format_string: 'https://link.springer.com/content/pdf/{doi}.pdf'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.sciencedirect.com
- - matcher:
- doi: ^10.(1016|1053|4103)/.*$
- resolver:
- args:
- format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
- selector: '(.resource.primary.URL | split("/"))[-1]'
- timeout: 40.0
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # www.clinicalkey.com
- - driver:
- args:
- actions:
- - selector: '.x-pdf'
- timeout: 30.0
- type: wait_css_selector
- - type: click
- class:
- nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.1016/.*$
- # www.cambridge.org
- - matcher:
- doi: ^10.1017/.*$
- resolver:
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # pubs.acs.org
- - matcher:
- doi: ^10.1021/.*$
- resolver:
- args:
- format_string: 'https://pubs.acs.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.nature.com
- - driver:
- args:
- actions:
- - selector: '#entitlement-box-right-column span'
- timeout: 30
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.1038/.*$
- # www.nejm.org
- - matcher:
- doi: ^10.1056/.*$
- resolver:
- args:
- format_string: 'https://www.nejm.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # ascelibrary.org
- - matcher:
- doi: ^10.1061/.*$
- resolver:
- args:
- format_string: 'https://ascelibrary.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.pnas.org
- - matcher:
- doi: ^10.1073/.*$
- resolver:
- args:
- format_string: 'https://www.pnas.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.tandfonline.com
- - matcher:
- doi: ^10.1080/.*$
- resolver:
- args:
- format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # iopscience.iop.org
- - matcher:
- doi: ^10.1088/.*$
- resolver:
- args:
- format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
- class: nexus.pylon.resolvers.TemplateResolver
- # academic.oup.com
- - driver:
- args:
- actions:
- - selector: '.pdf-link-text'
- timeout: 30
- type: wait_css_selector
- - type: click
- proxy_list:
- - [edinburg]
- - [cambridge]
- - [southampton]
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.1093/.*$
- # journals.lww.com
- - driver:
- args:
- actions:
- - selector: '.ejp-article-tools__list-icon-holder > .icon-pdf'
- timeout: 30.0
- type: wait_css_selector
- - type: click
- - selector: '.ejp-article-tools__dropdown-list-button > .icon-pdf'
- timeout: 5.0
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.(1097|1213|5435|14309)/.*$
- resolver:
- args:
- timeout: 30.0
- class: nexus.pylon.resolvers.TemplateResolver
- # journals.aps.org
- - matcher:
- doi: ^10.1103/.*$
- resolver:
- args:
- selector: '[.link[] | select(.URL | ascii_downcase | endswith("fulltext"))][0].URL'
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # emerald.com
- - matcher:
- doi: ^10.1108/.*$
- resolver:
- args:
- format_string: 'https://www.emerald.com/insight/content/doi/{doi}/full/pdf'
- class: nexus.pylon.resolvers.TemplateResolver
- # ieeexplore.ieee.org
- - matcher:
- doi: ^10.1109/.*$
- resolver:
- args:
- format_string: 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={selected}'
- selector: '(.resource.primary.URL | split("/"))[-2]'
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # onlinelibrary.wiley.com
- - matcher:
- doi: ^10.1111/.*$
- resolver:
- args:
- format_string: 'https://onlinelibrary.wiley.com/doi/pdfdirect/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # asa.scitation.org
- - matcher:
- doi: ^10.1121/.*$
- resolver:
- args:
- format_string: 'https://asa.scitation.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.science.org
- - matcher:
- doi: ^10.1126/.*$
- resolver:
- args:
- format_string: 'https://www.science.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # ^10.1136/.*$
- - driver:
- args:
- actions:
- - selector: '.article-pdf-download > img'
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.1136/.*$
- # ^10.1136/.*$
- - driver:
- args:
- actions:
- - selector: '.icon'
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.1136/.*$
- resolver:
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # dl.acm.org
- - matcher:
- doi: ^10.1145/.*$
- resolver:
- args:
- format_string: 'https://dl.acm.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.annualreviews.org
- - matcher:
- doi: ^10.1146/.*$
- resolver:
- args:
- format_string: 'https://www.annualreviews.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # journals.physiology.org
- - matcher:
- doi: ^10.1152/.*$
- resolver:
- args:
- format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.ahajournals.org
- - matcher:
- doi: ^10.1161/.*$
- resolver:
- args:
- format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # ajp.psychiatryonline.org
- - matcher:
- doi: ^10.1176/.*$
- resolver:
- args:
- format_string: 'https://ajp.psychiatryonline.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # journals.sagepub.com
- - matcher:
- doi: ^10.1177/.*$
- resolver:
- args:
- format_string: 'https://journals.sagepub.com/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # bmcpsychiatry.biomedcentral.com
- - matcher:
- doi: ^10.1186/.*$
- resolver:
- args:
- format_string: 'https://bmcpsychiatry.biomedcentral.com/track/pdf/{doi}.pdf'
- class: nexus.pylon.resolvers.TemplateResolver
- # journals.plos.org
- - driver:
- args:
- proxy_list: ~
- class: nexus.pylon.drivers.direct.DirectDriver
- matcher:
- doi: ^10.1371/.*$
- resolver:
- args:
- format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
- class: nexus.pylon.resolvers.TemplateResolver
- # bioone.org
- - driver:
- args:
- actions:
- - selector: 'DOWNLOAD PAPER'
- type: wait_link_text
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.(1638|1654|1667|2108)/.*$
- # jasn.asnjournals.org
- - driver:
- args:
- actions:
- - selector: 'View PDF'
- type: wait_link_text
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.1681/.*$
- # papers.ssrn.com
- - driver:
- args:
- actions:
- - selector: '.abstract-buttons:nth-child(1) .primary > span'
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.2139/.*$
- # www.afghandata.org
- - driver:
- args:
- proxy_list: ~
- class:
- nexus.pylon.drivers.DirectDriver
- matcher:
- doi: ^10.(2458|29171)/.*$
- # www.euppublishing.com
- - matcher:
- doi: ^10.3366/.*$
- resolver:
- args:
- format_string: 'https://www.euppublishing.com/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # www.frontiersin.org
- - driver:
- args:
- actions:
- - selector: '.paper > .dropdown-toggle'
- type: wait_css_selector
- - type: click
- - selector: '.download-files-pdf'
- type: wait_css_selector
- - type: click
- class:
- nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.3389/.*$
- resolver:
- args:
- selector: '[.link[] | select(.URL | ascii_downcase | endswith("full"))][0].URL'
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # bjgp.org
- - driver:
- args:
- actions:
- - selector: 'PDF'
- type: wait_link_text
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.3399/.*$
- # www.wjgnet.com
- - driver:
- args:
- actions:
- - selector: 'Full Article (PDF)'
- type: wait_link_text
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.3748/.*$
- resolver:
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # journals.vilniustech.lt
- - driver:
- args:
- actions:
- - selector: '.label'
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.3846/.*$
- resolver:
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # isegoria.revistas.csic.es
- - matcher:
- doi: ^10.3989/.*$
- resolver:
- args:
- selector: '[.link[] | select(."intended-application" == "similarity-checking")][0].URL'
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
- # www.psychiatrist.com
- - driver:
- args:
- actions:
- - selector: '.article-dwndpdf > img'
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.4088/.*$
- # www.scirp.org
- - driver:
- args:
- actions:
- - selector: 'PDF'
- type: wait_link_text
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.4236/.*$
- # jsbr.be
- - driver:
- args:
- actions:
- - selector: 'h4 > .fa-download'
- type: wait_css_selector
- - type: click
- - selector: 'PDF (EN)'
- type: wait_link_text
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.5334/.*$
- # hess.copernicus.org
- - driver:
- args:
- proxy_list: ~
- class: nexus.pylon.drivers.DirectDriver
- matcher:
- doi: ^10.5194/.*$
- # ProQuest
- - driver:
- args:
- actions:
- - selector: '#searchTerm'
- type: wait_css_selector
- - type: click
- - selector: '#searchTerm'
- text: '{doi}'
- type: type
- - selector: '.uxf-search'
- type: wait_css_selector
- - type: click
- - selector: '.uxf-download'
- type: wait_css_selector
- - type: click
- proxy_list: ~
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.5585/.*
- resolver:
- args:
- format_string: 'https://www.proquest.com/'
- class: nexus.pylon.resolvers.TemplateResolver
- # jcsm.aasm.org
- - matcher:
- doi: ^10.5664/.*$
- resolver:
- args:
- format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
- class: nexus.pylon.resolvers.TemplateResolver
- # journal.permsc.ru
- - driver:
- args:
- actions:
- - selector: '.obj_galley_link'
- type: wait_css_selector
- - type: click
- - selector: '.label'
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.7242/.*$
- # amjcaserep.com
- - driver:
- args:
- actions:
- - selector: "//input[@value='Download PDF version']"
- type: wait_xpath
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.12659/.*$
- # medcraveonline.com
- - driver:
- args:
- actions:
- - selector: 'Download PDF'
- type: wait_link_text
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.15406/.*$
- # www.researchsquare.com
- - driver:
- args:
- actions:
- - selector: '.hidden > .text-blue-600 > .antialiased'
- type: wait_css_selector
- - type: native_click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.21203/.*$
- # www.ukm.my/
- - driver:
- args:
- proxy_list: ~
- class: nexus.pylon.drivers.DirectDriver
- matcher:
- doi: ^10.24035/.*$
- # journals.library.ryerson.ca
- - driver:
- args:
- actions:
- - selector: '#a11y-1-tab-tab-download'
- type: wait_css_selector
- - type: click
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.32920/.*$
- # papers.cumincad.org
- - driver:
- args:
- actions:
- - selector: 'file.pdf'
- type: wait_link_text
- - type: click
- proxy_list: ~
- class: nexus.pylon.drivers.BrowserDriver
- matcher:
- doi: ^10.52842/.*$
- # ^.*$
- - matcher:
- doi: ^.*$
- resolver:
- args:
- selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL'
- class: nexus.pylon.resolvers.DoiOrgRequestResolver
diff --git a/nexus/hub/main.py b/nexus/hub/main.py
index 3799b17..03a3c06 100644
--- a/nexus/hub/main.py
+++ b/nexus/hub/main.py
@@ -5,9 +5,9 @@ import uvloop
from aiogrobid import GrobidClient
from aioipfs import AsyncIPFS as AsyncIPFS
from idm.api.aioclient import IdmApiGrpcClient
+from izihawa_configurator import Configurator
from library.aiogrpctools import AioGrpcServer
from library.aiopostgres import AioPostgresPoolHolder
-from library.configurator import Configurator
from library.logging import configure_logging
from library.telegram.base import BaseTelegramClient
from nexus.hub.configs import get_config
@@ -16,6 +16,7 @@ from nexus.hub.services.mutual_aid_service import MutualAidService
from nexus.hub.services.submitter import SubmitterService
from nexus.hub.user_manager import UserManager
from nexus.meta_api.aioclient import MetaApiGrpcClient
+from nexus.pylon.configs import get_config as get_default_pylon_config
class GrpcServer(AioGrpcServer):
@@ -79,7 +80,7 @@ class GrpcServer(AioGrpcServer):
should_parse_with_grobid=config['application']['should_parse_with_grobid'],
should_store_hashes=config['application']['should_store_hashes'],
telegram_bot_configs=config['telegram']['bots'],
- pylon_config=config['pylon'],
+ pylon_config=config.get('pylon') or get_default_pylon_config()['pylon'],
)
self.submitter_service = SubmitterService(
application=self,
diff --git a/nexus/hub/services/base.py b/nexus/hub/services/base.py
index 0ff6939..8706dde 100644
--- a/nexus/hub/services/base.py
+++ b/nexus/hub/services/base.py
@@ -65,6 +65,7 @@ class BaseHubService(BaseService):
await asyncio.gather(
self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True),
self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True),
+ self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake3', only_hash=True),
)
))
diff --git a/nexus/hub/services/delivery.py b/nexus/hub/services/delivery.py
index 91f0519..fc0833c 100644
--- a/nexus/hub/services/delivery.py
+++ b/nexus/hub/services/delivery.py
@@ -73,13 +73,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
self.downloadings = set()
self.is_sharience_enabled = is_sharience_enabled
self.maintenance_picture_url = maintenance_picture_url
- self.pylon_client = PylonClient(
- proxies=pylon_config['proxies'],
- source_configs=pylon_config['sources'],
- default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
- default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
- downloads_directory=pylon_config['downloads_directory'],
- )
+ self.pylon_client = PylonClient(config=pylon_config)
self.should_parse_with_grobid = should_parse_with_grobid
self.should_store_hashes = should_store_hashes
self.telegram_bot_configs = telegram_bot_configs
@@ -170,6 +164,15 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK)
+async def delayed_task(create_task, t):
+ try:
+ await asyncio.sleep(t)
+ task = create_task()
+ await task
+ except asyncio.CancelledError:
+ pass
+
+
class DownloadTask:
def __init__(
self,
@@ -204,7 +207,7 @@ class DownloadTask:
)
async def download_task(self, request_context: RequestContext, document_holder):
- throttle_secs = 2.0
+ throttle_secs = 3.0
async def _on_fail():
await self.application.telegram_clients[request_context.bot_name].send_message(
@@ -218,6 +221,7 @@ class DownloadTask:
error_log=request_context.error_log,
on_fail=_on_fail,
):
+ start_time = time.time()
filename = document_holder.get_filename()
progress_bar_download = ProgressBar(
telegram_client=self.application.telegram_clients[request_context.bot_name],
@@ -226,9 +230,9 @@ class DownloadTask:
header=f'⬇️ {filename}',
tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
throttle_secs=throttle_secs,
+ last_call=start_time,
)
downloads_gauge.inc()
- start_time = time.time()
try:
file = await self.download(
document_holder=document_holder,
@@ -242,11 +246,21 @@ class DownloadTask:
)
if not document_holder.md5 and document_holder.get_extension() == 'pdf':
try:
- await progress_bar_download.send_message(
- t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
- ignore_last_call=True
+ processing_message_task = asyncio.create_task(delayed_task(
+ create_task=lambda: progress_bar_download.send_message(
+ t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
+ ignore_last_call=True
+ ),
+ t=5.0
+ ))
+ file = await asyncio.get_running_loop().run_in_executor(
+ None,
+ lambda: clean_metadata(file, doi=document_holder.doi)
)
- file = clean_metadata(file, doi=document_holder.doi)
+
+ processing_message_task.cancel()
+ await processing_message_task
+
request_context.statbox(
action='cleaned',
len=len(file),
@@ -260,7 +274,8 @@ class DownloadTask:
banner=t("LOOKING_AT", request_context.chat.language),
header=f'⬇️ {filename}',
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
- throttle_secs=throttle_secs
+ throttle_secs=throttle_secs,
+ last_call=progress_bar_download.last_call,
)
uploaded_message = await self.delivery_service.send_file(
document_holder=self.document_holder,
@@ -393,11 +408,15 @@ class DownloadTask:
async def download(self, document_holder, progress_bar):
collected = bytearray()
- if document_holder.doi:
- try:
- params = {'doi': document_holder.doi}
- if document_holder.md5:
- params['md5'] = document_holder.md5
+ params = {}
+ try:
+ if document_holder.doi:
+ params['doi'] = document_holder.doi
+ if document_holder.md5:
+ params['md5'] = document_holder.md5
+ if document_holder.ipfs_multihashes:
+ params['ipfs_multihashes'] = [ipfs_multihash for ipfs_multihash in document_holder.ipfs_multihashes]
+ if params:
async for resp in self.delivery_service.pylon_client.download(params):
await self.process_resp(
resp=resp,
@@ -406,20 +425,8 @@ class DownloadTask:
filesize=document_holder.filesize,
)
return bytes(collected)
- except DownloadError:
- pass
- if document_holder.md5:
- try:
- async for resp in self.delivery_service.pylon_client.download({'md5': document_holder.md5}):
- await self.process_resp(
- resp=resp,
- progress_bar=progress_bar,
- collected=collected,
- filesize=document_holder.filesize,
- )
- return bytes(collected)
- except DownloadError:
- pass
+ except DownloadError:
+ pass
async def external_cancel(self):
self.request_context.statbox(action='externally_canceled')
diff --git a/nexus/ingest/BUILD.bazel b/nexus/ingest/BUILD.bazel
index a90c356..6d22fb3 100644
--- a/nexus/ingest/BUILD.bazel
+++ b/nexus/ingest/BUILD.bazel
@@ -27,7 +27,7 @@ py3_image(
requirement("aiokit"),
requirement("aiolibgen"),
"//library/aiopostgres",
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/jobber",
"//nexus/actions",
],
diff --git a/nexus/ingest/jobs/crossref_api.py b/nexus/ingest/jobs/crossref_api.py
index 1950e89..7a22899 100644
--- a/nexus/ingest/jobs/crossref_api.py
+++ b/nexus/ingest/jobs/crossref_api.py
@@ -39,8 +39,8 @@ class CrossrefApiJob(BaseJob):
})
count = 0
async for chunk in self.crossref_client.works_cursor(
- filter=f'from-index-date:{self.from_date}',
- rows=500,
+ filter=f'from-index-date:{self.from_date}',
+ rows=500,
):
for item in chunk['items']:
yield item
@@ -50,4 +50,4 @@ class CrossrefApiJob(BaseJob):
'mode': 'ingest',
'items': count,
'target_date': self.from_date,
- })
\ No newline at end of file
+ })
diff --git a/nexus/ingest/jobs/postgres.py b/nexus/ingest/jobs/postgres.py
index be3feae..9362bfb 100644
--- a/nexus/ingest/jobs/postgres.py
+++ b/nexus/ingest/jobs/postgres.py
@@ -34,6 +34,7 @@ class PostgresJob(BaseJob):
f'user={database["username"]} '
f'password={database["password"]} '
f'host={database["host"]}',
+ timeout=3600 * 2,
)
self.summa_client = SummaClient(endpoint=summa['endpoint'])
self.summa_config = summa
@@ -84,6 +85,7 @@ class PostgresJob(BaseJob):
# Mandatory for server side cursor
cursor_name='nexus_ingest_cursor',
itersize=50_000,
+ statement_timeout=3600 * 2,
):
loaded = True
yield row
@@ -95,8 +97,12 @@ class PostgresJob(BaseJob):
# Mandatory for server side cursor
cursor_name='nexus_ingest_cursor',
itersize=50_000,
+ statement_timeout=3600 * 2,
):
yield row
- await self.summa_client.commit_index(self.summa_config['name'], session_id=session_id)
+ await self.summa_client.commit_index(
+ self.summa_config['name'],
+ session_id=session_id,
+ )
await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id)
diff --git a/nexus/meta_api/BUILD.bazel b/nexus/meta_api/BUILD.bazel
index 6f2a18b..d3d67fc 100644
--- a/nexus/meta_api/BUILD.bazel
+++ b/nexus/meta_api/BUILD.bazel
@@ -25,7 +25,7 @@ DEPS = [
"//library/aiogrpctools",
requirement("aiokit"),
"//library/aiopostgres",
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/logging",
"//nexus/meta_api/proto:grpc_py",
"//nexus/models/proto:proto_py",
diff --git a/nexus/meta_api/configs/__init__.py b/nexus/meta_api/configs/__init__.py
index 0952810..5043832 100644
--- a/nexus/meta_api/configs/__init__.py
+++ b/nexus/meta_api/configs/__init__.py
@@ -1,5 +1,5 @@
+from izihawa_configurator import Configurator
from izihawa_utils import env
-from library.configurator import Configurator
def get_config():
diff --git a/nexus/meta_api/services/search.py b/nexus/meta_api/services/search.py
index 7f5a93f..35fbd0b 100644
--- a/nexus/meta_api/services/search.py
+++ b/nexus/meta_api/services/search.py
@@ -315,7 +315,7 @@ class SearchService(SearchServicer, BaseService):
with suppress(RetryError):
async for attempt in AsyncRetrying(
retry=retry_if_exception_type(NeedRetryError),
- wait=wait_fixed(5),
+ wait=wait_fixed(10),
stop=stop_after_attempt(6)
):
with attempt:
diff --git a/nexus/models/proto/scimag.proto b/nexus/models/proto/scimag.proto
index 66e3019..ba83b57 100644
--- a/nexus/models/proto/scimag.proto
+++ b/nexus/models/proto/scimag.proto
@@ -33,4 +33,5 @@ message Scimag {
string volume = 21;
int32 year = 30;
float page_rank = 34;
+ float series_page_rank = 35;
}
diff --git a/nexus/pipe/BUILD.bazel b/nexus/pipe/BUILD.bazel
index 9f6431c..4fbc938 100644
--- a/nexus/pipe/BUILD.bazel
+++ b/nexus/pipe/BUILD.bazel
@@ -26,7 +26,7 @@ py3_image(
requirement("aiocrossref"),
requirement("aiokit"),
"//library/aiopostgres",
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/logging",
"//nexus/actions",
"//nexus/models/proto:proto_py",
diff --git a/nexus/promotions/BUILD.bazel b/nexus/promotions/BUILD.bazel
index 57f4422..e670166 100644
--- a/nexus/promotions/BUILD.bazel
+++ b/nexus/promotions/BUILD.bazel
@@ -10,6 +10,6 @@ py_library(
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
- "//library/configurator",
+ requirement("izihawa_configurator"),
],
)
diff --git a/nexus/promotions/__init__.py b/nexus/promotions/__init__.py
index 4bd76b0..4ad4db9 100644
--- a/nexus/promotions/__init__.py
+++ b/nexus/promotions/__init__.py
@@ -1,4 +1,4 @@
-from library.configurator import Configurator
+from izihawa_configurator import Configurator
def get_promotions():
diff --git a/nexus/promotions/promotions.yaml b/nexus/promotions/promotions.yaml
index 0c8b20b..8c66e81 100644
--- a/nexus/promotions/promotions.yaml
+++ b/nexus/promotions/promotions.yaml
@@ -19,6 +19,9 @@ promotions:
- texts:
en: 💬 Research is the only and ultimate goal
weight: 1
+ - texts:
+ en: 💬 Intellectual property is not a valid form of property
+ weight: 1
- texts:
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group})
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group})
diff --git a/nexus/pylon/BUILD.bazel b/nexus/pylon/BUILD.bazel
index 4c3ab61..8815183 100644
--- a/nexus/pylon/BUILD.bazel
+++ b/nexus/pylon/BUILD.bazel
@@ -1,12 +1,16 @@
-load("@rules_python//python:defs.bzl", "py_binary", "py_library")
+load("@rules_python//python:defs.bzl", "py_library")
+load("@rules_python//python:packaging.bzl", "py_wheel")
load("@pip_modules//:requirements.bzl", "requirement")
+filegroup(
+ name = "data",
+ srcs = ["configs/pylon.yaml"],
+)
+
py_library(
name = "pylon",
srcs = glob(["**/*.py"]),
- data = [
- "configs/pylon.yaml",
- ],
+ data = [":data"],
visibility = ["//visibility:public"],
deps = [
requirement("aiodns"),
@@ -16,6 +20,7 @@ py_library(
requirement("brotli"),
requirement("cchardet"),
requirement("certifi"),
+ requirement("fire"),
requirement("jq"),
requirement("orjson"),
requirement("pypdf2"),
@@ -23,20 +28,38 @@ py_library(
requirement("selenium"),
requirement("tenacity"),
requirement("aiokit"),
- "//library/configurator",
+ requirement("izihawa_configurator"),
"//library/logging",
"//nexus/pylon/proto:pylon_proto_py",
],
)
-py_binary(
- name = "cli",
- srcs = ["cli.py"],
- main = "cli.py",
- srcs_version = "PY3",
- visibility = ["//visibility:public"],
+py_wheel(
+ name = "nexus-pylon-wheel",
+ author = "The Superpirate",
+ author_email = "fist.of.the.first.pirates@gmail.com",
+ classifiers = [
+ "Programming Language :: Python :: 3.10",
+ ],
+ description_file = ":README.md",
+ distribution = "nexus-pylon",
+ entry_points = {"console_scripts": ["pylon = nexus.pylon.cli:main"]},
+ homepage = "https://github.com/nexus-stc/hyperboria/tree/master/nexus/pylon",
+ license = "MIT License",
+ python_requires = ">=3.10",
+ python_tag = "py3",
+ requires = [
+ "aiokit >= 1.0.0",
+ "izihawa_configurator >= 1.0.0",
+ "selenium >= 4.3.0",
+ ],
+ strip_path_prefixes = [
+ "nexus/pylon/proto/pylon_proto_py_pb",
+ ],
+ version = "1.0.0",
deps = [
- requirement("fire"),
+ ":data",
":pylon",
+ "//nexus/pylon/proto:pylon_proto_py",
],
)
diff --git a/nexus/pylon/README.md b/nexus/pylon/README.md
index 57d3fd1..d8068e1 100644
--- a/nexus/pylon/README.md
+++ b/nexus/pylon/README.md
@@ -6,16 +6,51 @@
- Streams data by chunks
- GRPC-ready
+## Build
+
+```bash
+bazel build -c opt nexus-pylon-wheel
+```
+
+## Install
+
+### PIP
+```bash
+pip install nexus-pylon
+```
+
## Nexus Pylon CLI
-Casual download
-```bash
-bazel run -c opt cli -- doi 10.1056/NEJMoa2033700 --output article.pdf
+Download scientific publication:
+```bash
+pylon download --doi 10.1182/blood-2011-03-325258 --output article.pdf
```
-Download with proxies
-```bash
-bazel run -c opt cli -- md5 278C3A72B7B04717361501B8642857DF \
- --output file.pdf \
- --proxies socks5://127.0.0.1:9050
+Download file by its MD5:
+```bash
+pylon download --md5 f07707ee92fa675fd4ee53e3fee977d1 --output article.pdf
```
+
+Download file by its multihash:
+```bash
+pylon download --ipfs-multihashes '["bafykbzacea3vduqii3u52xkzdqan5oc54vsvedmed25dfybrqxyafahjl3rzu"]' --output article.pdf
+```
+
+### Using with Selenium
+
+Create directory for exchaning files between host and launched Selenium in Docker
+```bash
+mkdir downloads
+```
+
+Launch Selenium in Docker
+```bash
+docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest
+```
+
+Launch Pylon
+```bash
+pylon download --doi 10.1101/2022.09.09.507349 --output article.pdf \
+--wd-endpoint 'http://127.0.0.1:4444/wd/hub' \
+--wd-directory /downloads --wd-host-directory $(pwd)/downloads --debug
+```
\ No newline at end of file
diff --git a/nexus/pylon/cli.py b/nexus/pylon/cli.py
index 476d9eb..7dd459f 100644
--- a/nexus/pylon/cli.py
+++ b/nexus/pylon/cli.py
@@ -1,15 +1,17 @@
import logging
import os
import sys
+from typing import Optional
import fire
from aiokit.utils import sync_fu
-from nexus.pylon.client import (
+from izihawa_configurator import Configurator
+
+from .client import (
DownloadError,
PylonClient,
)
-from nexus.pylon.configs import get_config
-from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
+from .proto.file_pb2 import FileResponse as FileResponsePb
def resolve_path(filepath):
@@ -27,22 +29,20 @@ async def fetch(
collected = bytes()
try:
last_len = 0
- last_source = ''
async for resp in iter:
if resp.HasField('status'):
if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION:
- print(f'Started transmission from {resp.source}...', end='\r', file=sys.stderr)
+ print(f'Started transmission...', file=sys.stderr)
last_len = 0
- last_source = resp.source
collected = bytes()
elif resp.HasField('chunk'):
if len(collected) - last_len > 1024 * 100:
- print(f'Loaded {len(collected)} bytes from {resp.source}', end='\r', file=sys.stderr)
+ print(f'Loaded {len(collected)} bytes', end='\r', file=sys.stderr)
last_len = len(collected)
- last_source = resp.source
collected += resp.chunk.content
with open(resolve_path(output), 'wb') as f:
- print(f'Completed! Loaded {len(collected)} bytes from {last_source}', file=sys.stderr)
+ print()
+ print(f'Completed! Loaded {len(collected)} bytes', file=sys.stderr)
f.write(collected)
except DownloadError:
print('File not found')
@@ -50,25 +50,53 @@ async def fetch(
async def download(
output: str,
+ config: Optional[str] = None,
debug: bool = False,
+ wd_endpoint: Optional[str] = None,
+ wd_directory: Optional[str] = None,
+ wd_host_directory: Optional[str] = None,
**params,
):
+ """
+ Download scientific publications from various sources
+ Large portion of fresh articles could be retrieved only though publisher libraries through `BrowserDriver`, it
+ requires Selenium webdriver:
+ `docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest`
+ Args:
+ output: name of the output file
+ config: pylon config
+ debug: enable debug logging
+ wd_endpoint: web-driver
+ wd_directory: mounted directory inside Docker image
+ wd_host_directory: directory for downloads on host that should be mounter as `wd_directory` inside Docker image
+ """
if debug:
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
- c = get_config()['pylon']
- p = PylonClient(
- proxies=c['proxies'],
- source_configs=c['sources'],
- default_driver_proxy_list=c['default_driver_proxy_list'],
- downloads_directory=c['downloads_directory'],
- )
- return await fetch(iter=p.download(params=params), output=output)
+
+ default_config_path = os.path.join(os.path.dirname(__file__), 'configs/pylon.yaml')
+ config = Configurator([config if config else default_config_path], env_prefix='NEXUS_PYLON')
+ config = config['pylon']
+ if wd_endpoint:
+ config.setdefault('webdriver_hub', {})
+ config['webdriver_hub']['endpoint'] = wd_endpoint
+ if not wd_directory:
+ raise ValueError('Should pass --wd-directory with --wd-endpoint')
+ config['webdriver_hub']['downloads_directory'] = wd_directory
+ if not wd_host_directory:
+ raise ValueError('Should pass --wd-host-directory with --wd-endpoint')
+ config['webdriver_hub']['host_downloads_directory'] = wd_host_directory
+
+ pylon_client = PylonClient(config=config)
+ return await fetch(iter=pylon_client.download(params=params), output=output)
def main():
- fire.Fire({
- 'download': sync_fu(download),
- })
+ try:
+ fire.Fire({
+ 'download': sync_fu(download),
+ })
+ except KeyboardInterrupt:
+ sys.exit(1)
if __name__ == '__main__':
diff --git a/nexus/pylon/client.py b/nexus/pylon/client.py
index 9d3fbab..e02f6c9 100644
--- a/nexus/pylon/client.py
+++ b/nexus/pylon/client.py
@@ -1,12 +1,10 @@
+import logging
from typing import (
AsyncIterable,
Dict,
- List,
- Optional,
)
from aiokit import AioThing
-from library.logging import error_log
from nexus.pylon.exceptions import (
DownloadError,
NotFoundError,
@@ -17,30 +15,25 @@ from nexus.pylon.source import Source
class PylonClient(AioThing):
- def __init__(
- self,
- source_configs: Optional[List],
- proxies: Optional[List[str]] = None,
- downloads_directory: Optional[str] = None,
- default_driver_proxy_list: [Optional[List]] = None,
- default_resolver_proxy_list: [Optional[List]] = None,
- ):
+ def __init__(self, config):
super().__init__()
- self.proxy_manager = ProxyManager(proxies)
- self.downloads_directory = downloads_directory
- self.default_driver_proxy_list = default_driver_proxy_list
- self.default_resolver_proxy_list = default_resolver_proxy_list
+ self.config = config
+ self.proxy_manager = ProxyManager(config.get('proxies'))
self.sources = []
- for source_config in source_configs:
+ if config.get('webdriver_hub') is None:
+ logging.getLogger('nexus_pylon').warning({
+ 'action': 'missed_webdriver',
+ 'mode': 'pylon',
+ })
+ for source_config in config['sources']:
source = Source.from_config(
proxy_manager=self.proxy_manager,
+ config=self.config,
source_config=source_config,
- downloads_directory=downloads_directory,
- default_driver_proxy_list=default_driver_proxy_list,
- default_resolver_proxy_list=default_resolver_proxy_list,
)
- self.sources.append(source)
- self.starts.append(source)
+ if source:
+ self.sources.append(source)
+ self.starts.append(source)
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
for source in self.sources:
@@ -50,9 +43,10 @@ class PylonClient(AioThing):
async for resp in source.download(params):
yield resp
return
- except NotFoundError:
+ except NotFoundError as e:
+ logging.getLogger('nexus_pylon').debug(e)
continue
except DownloadError as e:
- error_log(e)
+ logging.getLogger('nexus_pylon').warning(e)
continue
- raise NotFoundError()
+ raise NotFoundError(params=params)
diff --git a/nexus/pylon/configs/__init__.py b/nexus/pylon/configs/__init__.py
index 834248b..6e32e46 100644
--- a/nexus/pylon/configs/__init__.py
+++ b/nexus/pylon/configs/__init__.py
@@ -1,5 +1,4 @@
-from izihawa_utils import env
-from library.configurator import Configurator
+from izihawa_configurator import Configurator
def get_config():
diff --git a/nexus/pylon/configs/pylon.yaml b/nexus/pylon/configs/pylon.yaml
index ed96b83..d7ef6a6 100644
--- a/nexus/pylon/configs/pylon.yaml
+++ b/nexus/pylon/configs/pylon.yaml
@@ -1,65 +1,25 @@
---
pylon:
- default_driver_proxy_list:
- - [proxy1]
- - [proxy2]
- - [proxy3]
- downloads_directory: /downloads
- proxies:
- - address: proxy1.net:7890
- name: proxy1
- tags: [proxy1]
- - address: proxy2.net:7990
- name: proxy2
- tags: [proxy2]
- - address: proxy3.net:8090
- name: proxy3
- tags: [proxy3]
+ default_driver_proxy_list: ~
+ default_resolver_proxy_list: ~
+ proxies: ~
sources:
- # LibGen.rocks
+ # IPFS
- driver:
args:
- proxy_list: ~
validator:
- class: nexus.pylon.validators.Md5Validator
+ class: nexus.pylon.validators.BaseValidator
class:
nexus.pylon.drivers.DirectDriver
matcher:
- md5: ^.*$
+ ipfs_multihashes: ^.*$
resolver:
args:
- extractors:
- - producer:
- format_string: 'http://libgen.rocks/{matched_group}'
- group: 0
- re: 'get\.php\?md5=.*&key=[A-Za-z\d]+'
- timeout: 25.0
- type: regex
- url: https://libgen.rocks/ads.php?md5={md5}
- class: nexus.pylon.resolvers.RequestResolver
- # LibGen.rocks
- - driver:
- args:
- proxy_list: ~
- class:
- nexus.pylon.drivers.DirectDriver
- matcher:
- doi: ^.*$
- resolver:
- args:
- extractors:
- - producer:
- format_string: 'http://libgen.rocks/{matched_group}'
- group: 0
- re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+'
- timeout: 25.0
- type: regex
- url: 'https://libgen.rocks/ads.php?doi={doi}'
- class: nexus.pylon.resolvers.RequestResolver
+ format_string: 'https://ipfs.io/ipfs/{ipfs_multihashes[0]}'
+ class: nexus.pylon.resolvers.TemplateResolver
# Library.lol
- driver:
args:
- proxy_list: ~
validator:
class: nexus.pylon.validators.Md5Validator
class:
@@ -70,27 +30,22 @@ pylon:
args:
extractors:
- producer:
- format_string: '{matched_group}'
- group: 1
- re: 'GET'
- timeout: 45.0
+ format_string: '{href}'
+ timeout: 45.0
+ re: 'GET'
type: regex
- producer:
- format_string: '{matched_group}'
- group: 0
- re: 'https://ipfs.io/ipfs/[A-Za-z\d]+'
+ format_string: '{url}'
+ re: '(?Phttps://ipfs.io/ipfs/[A-Za-z\d]+)'
type: regex
- producer:
- format_string: '{matched_group}'
- group: 0
- re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+'
+ format_string: '{url}'
+ re: '(?Phttps://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)'
type: regex
url: http://library.lol/main/{md5}
class: nexus.pylon.resolvers.RequestResolver
# library.lol
- driver:
- args:
- proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
@@ -99,13 +54,48 @@ pylon:
args:
extractors:
- producer:
- format_string: '{matched_group}'
- group: 1
- re: 'GET'
- timeout: 45.0
+ format_string: '{href}'
+ timeout: 45.0
+ re: 'GET'
type: regex
url: 'http://library.lol/scimag/{doi}'
class: nexus.pylon.resolvers.RequestResolver
+ # LibGen.rocks
+ - driver:
+ args:
+ validator:
+ class: nexus.pylon.validators.Md5Validator
+ class:
+ nexus.pylon.drivers.DirectDriver
+ matcher:
+ md5: ^.*$
+ resolver:
+ args:
+ extractors:
+ - producer:
+ format_string: 'http://libgen.rocks/{key}'
+ timeout: 25.0
+ re: '(?Pget\.php\?md5=.*&key=[A-Za-z\d]+)'
+ type: regex
+ resolve_timeout: 25.0
+ url: https://libgen.rocks/ads.php?md5={md5}
+ class: nexus.pylon.resolvers.RequestResolver
+ # LibGen.rocks
+ - driver:
+ class:
+ nexus.pylon.drivers.DirectDriver
+ matcher:
+ doi: ^.*$
+ resolver:
+ args:
+ extractors:
+ - producer:
+ format_string: 'http://libgen.rocks/{key}'
+ timeout: 25.0
+ re: '(?Pget\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)'
+ type: regex
+ url: 'https://libgen.rocks/ads.php?doi={doi}'
+ class: nexus.pylon.resolvers.RequestResolver
# jamanetwork.com
- driver:
args:
@@ -206,6 +196,13 @@ pylon:
format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# iopscience.iop.org
+ - matcher:
+ doi: ^10.1088/.*$
+ resolver:
+ args:
+ format_string: 'https://iopscience.iop.org/article/{doi}/pdf'
+ class: nexus.pylon.resolvers.TemplateResolver
+ # iopscience.iop.org
- matcher:
doi: ^10.1088/.*$
resolver:
@@ -220,10 +217,6 @@ pylon:
timeout: 30
type: wait_css_selector
- type: click
- proxy_list:
- - [proxy2]
- - [proxy1]
- - [proxy3]
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.1093/.*$
@@ -246,6 +239,13 @@ pylon:
args:
timeout: 30.0
class: nexus.pylon.resolvers.TemplateResolver
+ # biorxiv.org
+ - matcher:
+ doi: ^10.1101/.*$
+ resolver:
+ args:
+ format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf'
+ class: nexus.pylon.resolvers.TemplateResolver
# journals.aps.org
- matcher:
doi: ^10.1103/.*$
@@ -332,6 +332,13 @@ pylon:
args:
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
+ # www.ahajournals.org
+ - matcher:
+ doi: ^10.1161/.*$
+ resolver:
+ args:
+ format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
+ class: nexus.pylon.resolvers.TemplateResolver
# ajp.psychiatryonline.org
- matcher:
doi: ^10.1176/.*$
@@ -355,8 +362,6 @@ pylon:
class: nexus.pylon.resolvers.TemplateResolver
# journals.plos.org
- driver:
- args:
- proxy_list: ~
class: nexus.pylon.drivers.direct.DirectDriver
matcher:
doi: ^10.1371/.*$
@@ -364,6 +369,13 @@ pylon:
args:
format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable'
class: nexus.pylon.resolvers.TemplateResolver
+ # guilfordjournals.com
+ - matcher:
+ doi: ^10.1521/.*$
+ resolver:
+ args:
+ format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true'
+ class: nexus.pylon.resolvers.TemplateResolver
# bioone.org
- driver:
args:
@@ -396,8 +408,6 @@ pylon:
doi: ^10.2139/.*$
# www.afghandata.org
- driver:
- args:
- proxy_list: ~
class:
nexus.pylon.drivers.DirectDriver
matcher:
@@ -503,8 +513,6 @@ pylon:
doi: ^10.5334/.*$
# hess.copernicus.org
- driver:
- args:
- proxy_list: ~
class: nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.5194/.*$
@@ -524,7 +532,6 @@ pylon:
- selector: '.uxf-download'
type: wait_css_selector
- type: click
- proxy_list: ~
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.5585/.*
@@ -539,6 +546,22 @@ pylon:
args:
format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
+ # www.medwave.cl
+ - driver:
+ class:
+ nexus.pylon.drivers.DirectDriver
+ matcher:
+ doi: ^10.5867/.*$
+ resolver:
+ args:
+ extractors:
+ - producer:
+ format_string: 'https://www.medwave.cl/{path}'
+ timeout: 25.0
+ re: 'href=\"/(?P[\w/.\-_]+\.pdf)\">PDF'
+ type: regex
+ url: https://doi.org/{doi}
+ class: nexus.pylon.resolvers.RequestResolver
# journal.permsc.ru
- driver:
args:
@@ -584,8 +607,6 @@ pylon:
doi: ^10.21203/.*$
# www.ukm.my/
- driver:
- args:
- proxy_list: ~
class: nexus.pylon.drivers.DirectDriver
matcher:
doi: ^10.24035/.*$
@@ -599,6 +620,22 @@ pylon:
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.32920/.*$
+ # PKP Project
+ - driver:
+ class:
+ nexus.pylon.drivers.DirectDriver
+ matcher:
+ doi: ^10.(5399|24905|31004|32729|37934)/.*$
+ resolver:
+ args:
+ extractors:
+ - producer:
+ format_string: 'https://{host}/{prefix}/{journal}/article/download/{key}'
+ timeout: 25.0
+ re: 'href=\"(?:https?://[\w.]+)/(?P[\w./]+)/(?P[\w.]+)/article/view/(?P\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*'
+ type: regex
+ url: https://doi.org/{doi}
+ class: nexus.pylon.resolvers.RequestResolver
# papers.cumincad.org
- driver:
args:
@@ -606,11 +643,16 @@ pylon:
- selector: 'file.pdf'
type: wait_link_text
- type: click
- proxy_list: ~
class: nexus.pylon.drivers.BrowserDriver
matcher:
doi: ^10.52842/.*$
# ^.*$
+ - matcher:
+ doi: ^.*$
+ resolver:
+ args:
+ selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))'
+ class: nexus.pylon.resolvers.DoiOrgRequestResolver
- matcher:
doi: ^.*$
resolver:
diff --git a/nexus/pylon/drivers/base.py b/nexus/pylon/drivers/base.py
index 96fde80..addb6c7 100644
--- a/nexus/pylon/drivers/base.py
+++ b/nexus/pylon/drivers/base.py
@@ -4,22 +4,22 @@ from typing import (
Optional,
)
+from izihawa_utils.importlib import import_object
from nexus.pylon.network_agent import NetworkAgent
from nexus.pylon.prepared_request import PreparedRequest
from nexus.pylon.proxy_manager import ProxyManager
-from nexus.pylon.validators.base import BaseValidator
-from utils.izihawa_utils.importlib import import_object
class BaseDriver(NetworkAgent):
def __init__(
self,
+ config,
validator=None,
- downloads_directory: str = '/downloads',
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
+ self.config = config
validator_cls = 'nexus.pylon.validators.PdfValidator'
if validator and 'class' in validator:
@@ -27,7 +27,6 @@ class BaseDriver(NetworkAgent):
validator_cls = import_object(validator_cls)
self.validator = validator_cls
- self.downloads_directory = downloads_directory
def __str__(self):
return self.__class__.__name__
diff --git a/nexus/pylon/drivers/browser.py b/nexus/pylon/drivers/browser.py
index 8d4ad25..b0963e2 100644
--- a/nexus/pylon/drivers/browser.py
+++ b/nexus/pylon/drivers/browser.py
@@ -32,21 +32,20 @@ from selenium.webdriver.support.ui import WebDriverWait
class BrowserDriver(BaseDriver):
def __init__(
self,
+ config,
validator=None,
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
actions: Optional[List] = None,
- downloads_directory='/downloads',
- window_size: Tuple[int, int] = (1279, 833),
- erase_webdrive_property: bool = True,
- webdrive_hub_endpoint: str = "http://127.0.0.1:4444/wd/hub",
):
- super().__init__(validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
+ super().__init__(config=config, validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager)
self.actions = actions
- self.downloads_directory = Path(downloads_directory)
- self.window_size = window_size
- self.erase_webdrive_property = erase_webdrive_property
- self.webdrive_hub_endpoint = webdrive_hub_endpoint
+ self.downloads_directory = Path(config['webdriver_hub']['downloads_directory'])
+ self.host_downloads_directory = Path(config['webdriver_hub']['host_downloads_directory'])
+ self.window_size = tuple(config['webdriver_hub'].get('window_size', [1279, 833]))
+ self.erase_webdriver_property = config['webdriver_hub'].get('erase_webdriver_property', True)
+ self.webdriver_hub_endpoint = config['webdriver_hub']['endpoint']
+ self.file_poll_timeout = 2.0
async def get_chrome_sessions(self):
proxies = list(
@@ -55,15 +54,14 @@ class BrowserDriver(BaseDriver):
else [None]
)
for proxy in proxies:
- downloads_folder = self.downloads_directory / random_string(16)
- os.mkdir(downloads_folder)
- os.chmod(downloads_folder, 0o777)
- chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_folder))
- try:
- yield chrome, downloads_folder
- finally:
- shutil.rmtree(downloads_folder)
- chrome.quit()
+ subdirectory = random_string(16)
+ downloads_directory = self.downloads_directory / subdirectory
+ host_downloads_directory = self.host_downloads_directory / subdirectory
+ os.mkdir(host_downloads_directory)
+ os.chmod(host_downloads_directory, 0o777)
+ chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_directory))
+ yield chrome, host_downloads_directory
+
def setup_chrome(self, proxy, downloads_folder):
options = webdriver.ChromeOptions()
@@ -85,13 +83,13 @@ class BrowserDriver(BaseDriver):
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--disable-popup-blocking")
chrome = webdriver.Remote(
- self.webdrive_hub_endpoint,
+ self.webdriver_hub_endpoint,
DesiredCapabilities.CHROME,
options=options,
)
chrome.set_window_size(self.window_size[0], self.window_size[1])
- if self.erase_webdrive_property:
+ if self.erase_webdriver_property:
resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id
url = chrome.command_executor._url + resource
body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': {
@@ -103,7 +101,7 @@ class BrowserDriver(BaseDriver):
}})
chrome.command_executor._request('POST', url, body)
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'start_chrome',
'mode': 'pylon',
'proxy': str(proxy) if proxy is not None else None,
@@ -148,32 +146,19 @@ class BrowserDriver(BaseDriver):
and downloaded_offset == current_offset
and current_offset > 0
):
- logging.getLogger('debug').debug({
- 'action': 'sent',
- 'mode': 'pylon',
- 'filename': filename,
- })
return
-
- logging.getLogger('debug').debug({
- 'action': 'send_part',
- 'mode': 'pylon',
- 'current_offset': current_offset,
- 'downloaded_offset': downloaded_offset,
- 'filename': filename,
- })
await file.seek(current_offset)
yield await file.read(downloaded_offset - current_offset)
current_offset = downloaded_offset
- await asyncio.sleep(0.5)
+ await asyncio.sleep(self.file_poll_timeout)
raise NotFoundError()
finally:
await file.close()
def get(self, chrome, url, params):
- logging.getLogger('debug').debug({
- 'action': 'get',
+ logging.getLogger('nexus_pylon').debug({
+ 'action': 'download',
'mode': 'pylon',
'url': url,
})
@@ -190,11 +175,6 @@ class BrowserDriver(BaseDriver):
if not last_element:
raise RuntimeError('Nothing to click')
chrome.execute_script("arguments[0].click();", last_element)
- logging.getLogger('debug').debug({
- 'action': 'clicked',
- 'mode': 'pylon',
- 'element': str(last_element),
- })
case 'close_window':
current_window = previous_window
previous_window = None
@@ -204,11 +184,6 @@ class BrowserDriver(BaseDriver):
if not last_element:
raise RuntimeError('Nothing to click')
last_element.click()
- logging.getLogger('debug').debug({
- 'action': 'native_clicked',
- 'mode': 'pylon',
- 'element': str(last_element),
- })
case 'switch_to_new_window':
previous_window = current_window
current_window = chrome.window_handles[-1]
@@ -227,12 +202,6 @@ class BrowserDriver(BaseDriver):
action['selector'],
))
)
- logging.getLogger('debug').debug({
- 'action': 'waited_css_selector',
- 'mode': 'pylon',
- 'element': str(last_element),
- 'step': action
- })
case 'wait_link_text':
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
EC.presence_of_element_located((
@@ -240,12 +209,6 @@ class BrowserDriver(BaseDriver):
action['selector'],
))
)
- logging.getLogger('debug').debug({
- 'action': 'waited_link_text',
- 'mode': 'pylon',
- 'element': str(last_element),
- 'step': action
- })
case 'wait_xpath':
last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until(
EC.presence_of_element_located((
@@ -253,16 +216,10 @@ class BrowserDriver(BaseDriver):
action['selector'],
))
)
- logging.getLogger('debug').debug({
- 'action': 'waited_xpath',
- 'mode': 'pylon',
- 'element': str(last_element),
- 'step': action
- })
case _:
raise NotImplementedError('Not implemented action type')
except WebDriverException as e:
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'error',
'mode': 'pylon',
'error': str(e),
@@ -294,15 +251,17 @@ class BrowserDriver(BaseDriver):
source=chrome.current_url,
)
file_validator.validate()
- logging.getLogger('debug').debug({
- 'action': 'validated',
- 'mode': 'pylon',
- 'url': prepared_file_request.url,
- })
return
except NotFoundError:
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'no_response',
'mode': 'pylon',
})
+ finally:
+ logging.getLogger('nexus_pylon').debug({
+ 'action': 'quit_chrome',
+ 'mode': 'pylon',
+ })
+ chrome.quit()
+ shutil.rmtree(downloads_folder)
raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self))
diff --git a/nexus/pylon/drivers/direct.py b/nexus/pylon/drivers/direct.py
index 61db9fa..8b08546 100644
--- a/nexus/pylon/drivers/direct.py
+++ b/nexus/pylon/drivers/direct.py
@@ -1,3 +1,4 @@
+import logging
from typing import Dict
import aiohttp.client_exceptions
@@ -25,12 +26,27 @@ class DirectDriver(BaseDriver):
@retry(
reraise=True,
wait=wait_random(min=1, max=2),
- stop=stop_after_attempt(7),
+ stop=stop_after_attempt(3),
retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)),
)
async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict):
+ logging.debug({
+ 'action': 'download',
+ 'mode': 'pylon',
+ 'params': params,
+ 'source': str(self),
+ 'url': prepared_file_request.url,
+ })
async with self.get_session() as session:
async with prepared_file_request.execute_with(session=session) as resp:
+ logging.debug({
+ 'action': 'response',
+ 'mode': 'pylon',
+ 'params': params,
+ 'source': str(self),
+ 'url': prepared_file_request.url,
+ 'status': resp.status,
+ })
if resp.status == 404:
raise NotFoundError(url=prepared_file_request.url)
elif (
diff --git a/nexus/pylon/matcher.py b/nexus/pylon/matcher.py
index 45cf385..dabb05c 100644
--- a/nexus/pylon/matcher.py
+++ b/nexus/pylon/matcher.py
@@ -1,5 +1,8 @@
import re
-import sys
+from typing import (
+ List,
+ Tuple,
+)
class Matcher:
@@ -10,8 +13,11 @@ class Matcher:
def is_match(self, params) -> bool:
for param in params:
- if params[param]:
- if param_regex := self.param_regexes.get(param):
- if re.match(param_regex, params[param]):
- return True
- return False
+ param_value = params[param]
+ param_regex = self.param_regexes.get(param)
+ if param_value and param_regex:
+ if not isinstance(param_value, (List, Tuple)):
+ param_value = [param_value]
+ for el in param_value:
+ if re.match(param_regex, el):
+ return el
diff --git a/nexus/pylon/network_agent.py b/nexus/pylon/network_agent.py
index 3d57c35..37fce86 100644
--- a/nexus/pylon/network_agent.py
+++ b/nexus/pylon/network_agent.py
@@ -10,7 +10,7 @@ import aiohttp
from aiohttp import ClientSession
from aiohttp.client_reqrep import ClientRequest
from aiohttp_socks import ProxyConnector
-from library.aiokit.aiokit import AioThing
+from aiokit import AioThing
from nexus.pylon.proxy_manager import (
AllOf,
AnyOf,
diff --git a/nexus/pylon/pdftools/watermarks.py b/nexus/pylon/pdftools/watermarks.py
index d49a842..5784347 100644
--- a/nexus/pylon/pdftools/watermarks.py
+++ b/nexus/pylon/pdftools/watermarks.py
@@ -228,7 +228,7 @@ class BasePdfProcessor:
try:
page = self.process_page(page, pdf_reader)
except (PdfStreamError, binascii.Error) as e:
- logging.getLogger('warning').warning({
+ logging.getLogger('nexus_pylon').warning({
'action': 'pdf_stream_error',
'mode': 'pylon',
'error': str(e),
@@ -259,7 +259,7 @@ class WatermarkEraser1(BaseWatermarkEraser):
if self.is_watermark_predicate(text.encode()):
xobj_death_note.append(operands[0])
operations_death_note.append(op_i)
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'text': text,
@@ -289,7 +289,7 @@ class WatermarkEraser2(BaseWatermarkEraser):
if operation == b"Tj":
if isinstance(operands[0], bytes) and self.is_watermark_predicate(operands[0]):
operations_death_note.append(op_i)
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'text': operands[0].decode(),
@@ -319,7 +319,7 @@ class WatermarkEraser3(BaseWatermarkEraser):
text += operand
if self.is_watermark_predicate(text):
operations_death_note.append(op_i)
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'text': text.decode(),
@@ -402,7 +402,7 @@ class WatermarkEraser4(BaseWatermarkEraser):
text, matched = tc.match(self.regexp)
if matched:
operations_death_note.extend(matched)
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'watermark_removal',
'mode': 'pylon',
'matched': text,
diff --git a/nexus/pylon/prepared_request.py b/nexus/pylon/prepared_request.py
index 13f9149..21226b0 100644
--- a/nexus/pylon/prepared_request.py
+++ b/nexus/pylon/prepared_request.py
@@ -1,4 +1,5 @@
import asyncio
+import logging
from contextlib import asynccontextmanager
from typing import Optional
@@ -23,6 +24,7 @@ class PreparedRequest:
cookies: Optional[dict] = None,
ssl: bool = True,
timeout: Optional[float] = None,
+ headers_override: bool = False
):
self.method = method
self.url = url
@@ -32,6 +34,8 @@ class PreparedRequest:
}
if headers:
self.headers.update(headers)
+ if headers_override:
+ self.headers = headers or {}
self.params = params
self.cookies = cookies
self.ssl = ssl
@@ -49,6 +53,13 @@ class PreparedRequest:
@asynccontextmanager
async def execute_with(self, session):
try:
+ logging.getLogger('nexus_pylon').debug({
+ 'action': 'request',
+ 'mode': 'pylon',
+ 'url': self.url,
+ 'method': self.method,
+ 'headers': self.headers,
+ })
async with session.request(
method=self.method,
url=self.url,
diff --git a/nexus/pylon/proxy_manager.py b/nexus/pylon/proxy_manager.py
index c622a6c..6a04ec8 100644
--- a/nexus/pylon/proxy_manager.py
+++ b/nexus/pylon/proxy_manager.py
@@ -54,6 +54,8 @@ class Proxy:
class ProxyManager:
def __init__(self, proxies=None):
+ if proxies is None:
+ proxies = []
self.proxies = [Proxy(proxy) for proxy in proxies]
def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy:
diff --git a/nexus/pylon/resolvers/doi_org_request.py b/nexus/pylon/resolvers/doi_org_request.py
index 24f5d94..3f36cb2 100644
--- a/nexus/pylon/resolvers/doi_org_request.py
+++ b/nexus/pylon/resolvers/doi_org_request.py
@@ -1,5 +1,6 @@
import json
import logging
+import sys
from typing import (
AsyncIterable,
Dict,
@@ -50,20 +51,25 @@ class DoiOrgRequestResolver(BaseResolver):
method='get',
url=doi_url,
timeout=self.resolve_timeout,
- headers={'Accept': 'application/json'}
+ headers={
+ 'Accept': 'application/json',
+ }
).execute_with(session=session) as resp:
return await resp.json()
async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]:
body = await self.resolve_through_doi_org(params)
+ selected = None
try:
- selected = json.loads(self.selector.input(body).text())
+ if text := self.selector.input(body).text():
+ selected = json.loads(text)
except ValueError as e:
- logging.getLogger('error').error({
+ logging.getLogger('nexus_pylon').error({
'action': 'error',
'mode': 'pylon',
'params': params,
- 'error': str(e)
+ 'error': str(e),
+ 'selector': str(self.selector),
})
return
if selected:
@@ -73,7 +79,7 @@ class DoiOrgRequestResolver(BaseResolver):
timeout=self.timeout,
)
else:
- logging.getLogger('debug').error({
+ logging.getLogger('nexus_pylon').debug({
'action': 'missed_selector',
'mode': 'pylon',
'params': params,
diff --git a/nexus/pylon/resolvers/request.py b/nexus/pylon/resolvers/request.py
index a4b1289..a42232d 100644
--- a/nexus/pylon/resolvers/request.py
+++ b/nexus/pylon/resolvers/request.py
@@ -15,10 +15,12 @@ class RequestResolver(BaseResolver):
self,
url: str,
extractors: List,
+ resolve_timeout: float = 10.0,
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
+ self.resolve_timeout = resolve_timeout
self.url = url
self.extractors = extractors
@@ -31,9 +33,9 @@ class RequestResolver(BaseResolver):
async with PreparedRequest(
method='get',
url=url,
- timeout=10.0,
+ timeout=self.resolve_timeout,
).execute_with(session=session) as resp:
- # Sometimes sci-hub returns file
+ # Sometimes hosts return file URL
if resp.headers.get('Content-Type') == 'application/pdf':
yield PreparedRequest(method='get', url=url, timeout=10.0)
downloaded_page_bytes = await resp.read()
@@ -42,9 +44,11 @@ class RequestResolver(BaseResolver):
for extractor in self.extractors:
match = re.search(extractor['re'], downloaded_page, re.IGNORECASE)
if match:
- matched_group = match.group(extractor['producer']['group'])
yield PreparedRequest(
method='get',
- url=extractor['producer']['format_string'].format(matched_group=matched_group),
+ url=extractor['producer']['format_string'].format(
+ host=resp.real_url.host,
+ **match.groupdict()
+ ),
timeout=extractor['producer'].get('timeout', 10.0),
)
diff --git a/nexus/pylon/resolvers/template.py b/nexus/pylon/resolvers/template.py
index 397e453..e84523b 100644
--- a/nexus/pylon/resolvers/template.py
+++ b/nexus/pylon/resolvers/template.py
@@ -14,19 +14,27 @@ class TemplateResolver(BaseResolver):
self,
format_string: str = 'https://doi.org/{doi}',
timeout: float = 10.0,
+ method: str = 'GET',
+ headers: Optional[dict] = None,
+ headers_override: bool = False,
proxy_list: Optional[List] = None,
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.format_string = format_string
self.timeout = timeout
+ self.method = method
+ self.headers = headers
+ self.headers_override = headers_override
def __str__(self):
return f'{self.__class__.__name__}({self.format_string})'
async def resolve(self, params) -> AsyncIterable[PreparedRequest]:
yield PreparedRequest(
- method='GET',
+ method=self.method,
url=self.format_string.format(**params),
timeout=self.timeout,
+ headers=self.headers,
+ headers_override=self.headers_override,
)
diff --git a/nexus/pylon/source.py b/nexus/pylon/source.py
index 3a6be68..83840ac 100644
--- a/nexus/pylon/source.py
+++ b/nexus/pylon/source.py
@@ -2,12 +2,12 @@ import logging
from typing import (
AsyncIterable,
Dict,
- List,
+ Optional,
)
from aiohttp.client_exceptions import ClientPayloadError
-from library.aiokit.aiokit import AioThing
-from library.logging import error_log
+from aiokit import AioThing
+from izihawa_utils.importlib import import_object
from nexus.pylon.drivers.base import BaseDriver
from nexus.pylon.exceptions import (
DownloadError,
@@ -16,7 +16,6 @@ from nexus.pylon.exceptions import (
from nexus.pylon.matcher import Matcher
from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb
from nexus.pylon.resolvers.base import BaseResolver
-from utils.izihawa_utils.importlib import import_object
class Source(AioThing):
@@ -29,12 +28,15 @@ class Source(AioThing):
@classmethod
def from_config(
cls,
- proxy_manager,
+ config,
source_config,
- downloads_directory: str,
- default_driver_proxy_list: List,
- default_resolver_proxy_list: List,
- ) -> 'Source':
+ proxy_manager,
+ ) -> Optional['Source']:
+ driver_cls_name = source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver')
+
+ if driver_cls_name.endswith('BrowserDriver') and config.get('webdriver_hub') is None:
+ return None
+
matcher = Matcher(source_config['matcher'])
resolver_cls = import_object(
@@ -42,16 +44,16 @@ class Source(AioThing):
)
resolver_args = dict(
proxy_manager=proxy_manager,
- proxy_list=default_resolver_proxy_list,
+ proxy_list=config['default_resolver_proxy_list'],
)
resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
resolver = resolver_cls(**resolver_args)
- driver_cls = import_object(source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver'))
+ driver_cls = import_object(driver_cls_name)
driver_args = dict(
proxy_manager=proxy_manager,
- downloads_directory=downloads_directory,
- proxy_list=default_driver_proxy_list,
+ proxy_list=config['default_driver_proxy_list'],
+ config=config,
)
driver_args.update(**source_config.get('driver', {}).get('args', {}))
driver = driver_cls(**driver_args)
@@ -67,13 +69,6 @@ class Source(AioThing):
async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]:
yield FileResponsePb(status=FileResponsePb.Status.RESOLVING)
async for prepared_file_request in self.resolver.resolve(params):
- logging.debug({
- 'action': 'download',
- 'mode': 'pylon',
- 'params': params,
- 'source': str(self),
- 'url': prepared_file_request.url,
- })
try:
async for resp in self.driver.execute_prepared_file_request(
prepared_file_request=prepared_file_request,
@@ -82,11 +77,11 @@ class Source(AioThing):
yield resp
return
except ClientPayloadError as e:
- error_log(e, level=logging.WARNING)
+ logging.getLogger('nexus_pylon').warning(e)
continue
except NotFoundError:
continue
except DownloadError as e:
- error_log(e)
+ logging.getLogger('nexus_pylon').warning(e)
continue
raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver))
diff --git a/nexus/pylon/validators/__init__.py b/nexus/pylon/validators/__init__.py
index 33cc2a0..835a2da 100644
--- a/nexus/pylon/validators/__init__.py
+++ b/nexus/pylon/validators/__init__.py
@@ -1,4 +1,5 @@
+from .base import BaseValidator
from .md5 import Md5Validator
from .pdf import PdfValidator
-__all__ = ['Md5Validator', 'PdfValidator']
+__all__ = ['BaseValidator', 'Md5Validator', 'PdfValidator']
diff --git a/nexus/pylon/validators/base.py b/nexus/pylon/validators/base.py
index cbf6e1c..54bf504 100644
--- a/nexus/pylon/validators/base.py
+++ b/nexus/pylon/validators/base.py
@@ -1,4 +1,10 @@
+from typing import Dict
+
+
class BaseValidator:
+ def __init__(self, params: Dict):
+ self.params = params
+
def update(self, chunk):
pass
diff --git a/nexus/pylon/validators/md5.py b/nexus/pylon/validators/md5.py
index d57660f..b24a651 100644
--- a/nexus/pylon/validators/md5.py
+++ b/nexus/pylon/validators/md5.py
@@ -7,6 +7,7 @@ from nexus.pylon.validators.base import BaseValidator
class Md5Validator(BaseValidator):
def __init__(self, params: Dict):
+ super().__init__(params)
self.md5 = params['md5']
self.v = hashlib.md5()
diff --git a/nexus/pylon/validators/pdf.py b/nexus/pylon/validators/pdf.py
index d393419..6634a34 100644
--- a/nexus/pylon/validators/pdf.py
+++ b/nexus/pylon/validators/pdf.py
@@ -12,7 +12,7 @@ from PyPDF2.errors import PdfReadError
class PdfValidator(BaseValidator):
def __init__(self, params: Dict):
- self.params = params
+ super().__init__(params)
self.md5 = params.get('md5')
self.file = bytes()
self.v = hashlib.md5()
@@ -24,7 +24,7 @@ class PdfValidator(BaseValidator):
def validate(self):
if self.md5 and self.md5.lower() == self.v.hexdigest().lower():
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'md5_ok',
@@ -32,7 +32,7 @@ class PdfValidator(BaseValidator):
})
return
elif not is_pdf(f=self.file):
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'not_pdf',
@@ -41,28 +41,18 @@ class PdfValidator(BaseValidator):
raise BadResponseError(file=str(self.file[:100]))
try:
- logging.getLogger('debug').debug({
- 'action': 'open_pdf',
- 'mode': 'pylon',
- 'file_len': len(self.file),
- 'params': self.params,
- })
PyPDF2.PdfReader(BytesIO(self.file))
- logging.getLogger('debug').debug({
- 'action': 'opened_pdf',
- 'mode': 'pylon',
- 'file_len': len(self.file),
- 'params': self.params,
- })
except PdfReadError:
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'not_opened_as_pdf',
+ 'params': self.params,
})
raise BadResponseError(file=str(self.file[:100]))
- logging.getLogger('debug').debug({
+ logging.getLogger('nexus_pylon').debug({
'action': 'validation',
'mode': 'pylon',
'result': 'ok',
+ 'params': self.params,
})
diff --git a/nexus/translations/BUILD.bazel b/nexus/translations/BUILD.bazel
index b2a817a..96e44a2 100644
--- a/nexus/translations/BUILD.bazel
+++ b/nexus/translations/BUILD.bazel
@@ -10,6 +10,6 @@ py_library(
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
- "//library/configurator",
+ requirement("izihawa_configurator"),
],
)
diff --git a/nexus/translations/__init__.py b/nexus/translations/__init__.py
index 9b6964b..335bf23 100644
--- a/nexus/translations/__init__.py
+++ b/nexus/translations/__init__.py
@@ -1,4 +1,4 @@
-from library.configurator import Configurator
+from izihawa_configurator import Configurator
def get_translations():
diff --git a/nexus/views/telegram/progress_bar.py b/nexus/views/telegram/progress_bar.py
index 5d1bb6c..41e4106 100644
--- a/nexus/views/telegram/progress_bar.py
+++ b/nexus/views/telegram/progress_bar.py
@@ -33,7 +33,10 @@ class ProgressBar:
tail_text,
message=None,
source=None,
- throttle_secs: float = 0,
+ throttle_secs: float = 0.0,
+ hard_throttle_secs: float = 10.0,
+ last_call: float = 0.0,
+ done_threshold_size: int = 10 * 1024 * 1024,
):
self.telegram_client = telegram_client
self.request_context = request_context
@@ -45,9 +48,12 @@ class ProgressBar:
self.done = 0
self.total = 1
self.throttle_secs = throttle_secs
+ self.hard_throttle_secs = hard_throttle_secs
+ self.done_threshold_size = done_threshold_size
+ self.previous_done = 0
self.last_text = None
- self.last_call = 0
+ self.last_call = last_call
def share(self):
if self.total > 0:
@@ -56,6 +62,7 @@ class ProgressBar:
return f'{float(self.done / (1024 * 1024)):.1f}Mb'
def _set_progress(self, done, total):
+ self.previous_done = self.done
self.done = done
self.total = total
@@ -74,11 +81,20 @@ class ProgressBar:
progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| '
tail_text = self.tail_text.format(source=self.source)
- return f'`{self.header}\n{progress_bar}{self.share()} {tail_text}`'
+ return f'`{self.header}\n{progress_bar}{self.share().ljust(8)} {tail_text}`'
+
+ def should_send(self, now, ignore_last_call):
+ if ignore_last_call:
+ return True
+ if abs(now - self.last_call) > self.hard_throttle_secs:
+ return True
+ if abs(now - self.last_call) > self.throttle_secs and (self.done - self.previous_done) < self.done_threshold_size:
+ return True
+ return False
async def send_message(self, text, ignore_last_call=False):
now = time.time()
- if not ignore_last_call and abs(now - self.last_call) < self.throttle_secs:
+ if not self.should_send(now, ignore_last_call):
return
try:
if not self.message:
@@ -103,17 +119,3 @@ class ProgressBar:
async def callback(self, done, total, ignore_last_call=False):
self._set_progress(done, total)
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
-
-
-class ThrottlerWrapper:
- def __init__(self, callback: Callable, throttle_secs: Union[int, float]):
- self.callback = callback
- self.last_call = 0
- self.throttle_secs = throttle_secs
-
- async def __call__(self, *args, **kwargs):
- now = time.time()
- if abs(now - self.last_call) < self.throttle_secs:
- return
- self.last_call = now
- return await self.callback(*args, **kwargs)
diff --git a/nexus/views/telegram/scimag.py b/nexus/views/telegram/scimag.py
index ddd7fad..2915e15 100644
--- a/nexus/views/telegram/scimag.py
+++ b/nexus/views/telegram/scimag.py
@@ -63,7 +63,6 @@ class ScimagViewBuilder(BaseViewBuilder):
'chapter': '🔖',
'book-chapter': '🔖',
}
- multihash_ix = 0
def is_preprint(self):
return self.document_holder.doi.split('/')[0] in preprints
diff --git a/rules/python/requirements-lock.txt b/rules/python/requirements-lock.txt
index 4308082..b0ac838 100644
--- a/rules/python/requirements-lock.txt
+++ b/rules/python/requirements-lock.txt
@@ -13,7 +13,7 @@ aiohttp-socks==0.7.1
aiokafka==0.7.2
aiokit==1.1.2
aiosignal==1.2.0
-aiosumma==2.8.13
+aiosumma==2.10.4
asn1crypto==1.5.1
async-generator==1.10
async-timeout==4.0.2
@@ -55,7 +55,7 @@ h11==0.13.0
idna==3.3
iniconfig==1.1.1
isort==5.10.1
-izihawa-nlptools==1.1.7
+izihawa-nlptools==1.1.9
izihawa-types==0.1.3
izihawa-utils==1.0.7
Jinja2==3.1.2