diff --git a/idm/api/BUILD.bazel b/idm/api/BUILD.bazel index c5a805a..207bc87 100644 --- a/idm/api/BUILD.bazel +++ b/idm/api/BUILD.bazel @@ -43,7 +43,7 @@ py3_image( "//library/aiogrpctools", requirement("aiokit"), "//library/aiopostgres", - "//library/configurator", + requirement("izihawa_configurator"), "//library/telegram", requirement("izihawa_utils"), ], diff --git a/idm/api/configs/__init__.py b/idm/api/configs/__init__.py index 339e2f7..5d552f4 100644 --- a/idm/api/configs/__init__.py +++ b/idm/api/configs/__init__.py @@ -1,11 +1,13 @@ -from library.configurator import Configurator +from izihawa_configurator import Configurator +from izihawa_utils import env def get_config(): return Configurator([ 'idm/api/configs/base.yaml', + 'idm/api/configs/%s.yaml?' % env.type, 'idm/api/configs/logging.yaml', - ], env_prefix='NEXUS_IDM_API') + ], env_prefix='IDM_API') config = get_config() diff --git a/idm/api/configs/base.yaml b/idm/api/configs/base.yaml index 91819d3..95f4228 100644 --- a/idm/api/configs/base.yaml +++ b/idm/api/configs/base.yaml @@ -2,25 +2,25 @@ application: debug: true service_name: idm-api -database: - idm: - database: nexus - host: - password: - username: - drivername: postgresql - port: 5432 - nexus: - database: nexus - host: - password: - username: - drivername: postgresql - port: 5432 clickhouse: host: password: username: +database: + idm: + database: nexus + drivername: postgresql + host: + password: + port: 5432 + username: + nexus: + database: nexus + drivername: postgresql + host: + password: + port: 5432 + username: grpc: address: 0.0.0.0 port: 82 diff --git a/idm/api/main.py b/idm/api/main.py index 9496936..c86d2a5 100644 --- a/idm/api/main.py +++ b/idm/api/main.py @@ -7,9 +7,9 @@ from idm.api.configs import get_config from idm.api.services.chat_manager import ChatManagerService from idm.api.services.profile import ProfileService from idm.api.services.subscription_manager import SubscriptionManagerService +from izihawa_configurator import 
Configurator from library.aiogrpctools import AioGrpcServer from library.aiopostgres.pool_holder import AioPostgresPoolHolder -from library.configurator import Configurator from library.logging import configure_logging diff --git a/idm/api/services/profile.py b/idm/api/services/profile.py index 857deed..dfc78d4 100644 --- a/idm/api/services/profile.py +++ b/idm/api/services/profile.py @@ -105,20 +105,24 @@ class ProfileService(profile_service_pb2_grpc.ProfileServicer, BaseService): for tag in download_document.tags: tags_counter[tag] += 1 - most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:7] + most_popular_issns = sorted(issns_counter, key=issns_counter.get, reverse=True)[:14] most_popular_tags = sorted(tags_counter, key=tags_counter.get, reverse=True)[:7] most_popular_series = [] - async for row in self.application.pool_holder['nexus'].iterate( - f"select name, issns from series where issns && array[{most_popular_issns}]::text[]".format( - most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)), - ), - row_factory=dict_row, - ): - most_popular_series.append(profile_service_pb2.Series( - name=row['name'], - issns=row['issns'], - )) + if most_popular_issns: + async for row in self.application.pool_holder['nexus'].iterate( + "select name, array_agg(issn) as issns from series " + "where issn in ({most_popular_issns}) " + "group by name order by name " + "limit 7".format( + most_popular_issns=','.join(map(lambda x: "'" + x + "'", most_popular_issns)), + ), + row_factory=dict_row, + ): + most_popular_series.append(profile_service_pb2.Series( + name=row['name'], + issns=row['issns'], + )) return most_popular_series, most_popular_tags diff --git a/library/aiogrpctools/BUILD.bazel b/library/aiogrpctools/BUILD.bazel index d0dd791..1b637dd 100644 --- a/library/aiogrpctools/BUILD.bazel +++ b/library/aiogrpctools/BUILD.bazel @@ -13,7 +13,7 @@ py_library( requirement("grpcio"), requirement("pyyaml"), requirement("aiokit"), - 
"//library/configurator", + requirement("izihawa_configurator"), "//library/logging", requirement("izihawa_utils"), ], diff --git a/library/aiopostgres/pool_holder.py b/library/aiopostgres/pool_holder.py index 31830b2..31385d3 100644 --- a/library/aiopostgres/pool_holder.py +++ b/library/aiopostgres/pool_holder.py @@ -92,6 +92,7 @@ class AioPostgresPoolHolder(AioThing): row_factory=tuple_row, cursor_name: Optional[str] = None, itersize: Optional[int] = None, + statement_timeout: Optional[int] = None, ): if not self.pool: raise RuntimeError('AioPostgresPoolHolder has not been started') @@ -99,7 +100,9 @@ class AioPostgresPoolHolder(AioThing): async with conn.cursor(name=cursor_name, row_factory=row_factory) as cur: if itersize is not None: cur.itersize = itersize - await cur.execute(stmt, values) + await cur.execute(stmt + ';' if statement_timeout else '', values) + if statement_timeout: + await cur.execute(f'SET statement_timeout = {statement_timeout};') async for row in cur: yield row diff --git a/library/configurator/BUILD.bazel b/library/configurator/BUILD.bazel deleted file mode 100644 index 09efa31..0000000 --- a/library/configurator/BUILD.bazel +++ /dev/null @@ -1,17 +0,0 @@ -load("@pip_modules//:requirements.bzl", "requirement") -load("@rules_python//python:defs.bzl", "py_library") - -py_library( - name = "configurator", - srcs = glob( - ["**/*.py"], - exclude = ["tests/**"], - ), - srcs_version = "PY3", - visibility = ["//visibility:public"], - deps = [ - requirement("jinja2"), - requirement("pyyaml"), - requirement("izihawa_utils"), - ], -) diff --git a/library/configurator/__init__.py b/library/configurator/__init__.py deleted file mode 100644 index 3c77143..0000000 --- a/library/configurator/__init__.py +++ /dev/null @@ -1,170 +0,0 @@ -import json -import os -import os.path -from types import ModuleType - -import yaml -from izihawa_utils.common import ( - smart_merge_dicts, - unflatten, -) -from jinja2 import Template -from 
library.configurator.exceptions import UnknownConfigFormatError - - -class ConfigObject(dict): - def __getattr__(self, name): - try: - return self[name] - except KeyError as e: - raise AttributeError(e) - - -class AnyOf: - def __init__(self, *args): - self.args = args - - -class RichDict(dict): - def has(self, *args): - current = self - for c in args: - if c not in current: - return False - current = current[c] - return True - - def copy_if_exists(self, source_keys, target_key): - current = self - for c in source_keys: - if c not in current: - return False - current = current[c] - self[target_key] = current - return True - - -class Configurator(RichDict): - def __init__(self, configs: list, env_prefix: str = None, env_key_separator: str = '.'): - """ - Create Configurator object - - :param configs: list of paths to config files, dicts or modules. - End filepath with `?` to mark it as optional config. - """ - super().__init__() - - self._by_basenames = {} - self._omitted_files = [] - - env_dict = {} - - if env_prefix: - env_prefix = env_prefix.lower() - for name, value in os.environ.items(): - if name.lower().startswith(env_prefix): - stripped_name = name[len(env_prefix):].lstrip('_') - if stripped_name[-2:] == '[]': - if stripped_name not in env_dict: - env_dict[stripped_name[:-2]] = [] - env_dict[stripped_name[:-2]].append(value) - else: - env_dict[stripped_name] = value - env_dict = unflatten(env_dict, sep=env_key_separator) - - for config in ([os.environ] + configs + [env_dict]): - file_found = self.update(config) - if not file_found: - self._omitted_files.append(config) - - def _config_filename(self, filename): - return os.path.join(os.getcwd(), filename) - - def walk_and_render(self, c): - if isinstance(c, str): - return Template(c).render(**self) - elif isinstance(c, list): - return [self.walk_and_render(e) for e in c] - elif isinstance(c, dict): - for key in list(c.keys()): - c[key] = self.walk_and_render(c[key]) - if key.endswith('_filepath'): - with 
open(c[key]) as f: - if c[key].endswith('.json'): - c[key.replace('_filepath', '')] = json.loads(f.read()) - elif c[key].endswith('.yaml'): - c[key.replace('_filepath', '')] = yaml.safe_load(f.read()) - return c - - def update(self, new_config, basename=None, **kwargs): - if isinstance(new_config, AnyOf): - for config in new_config.args: - try: - return self.update(config.rstrip('?')) - except IOError: - pass - raise IOError('None of %s was found' % ', '.join(new_config.args)) - elif isinstance(new_config, str): - optional = new_config.endswith('?') - filename = new_config.rstrip('?') - basename = basename or os.path.basename(filename) - - config_filename = self._config_filename(filename) - - data = None - - if os.path.exists(config_filename) and os.access(config_filename, os.R_OK): - with open(config_filename) as f: - data = f.read() - - if data is None: - if optional: - return False - else: - raise IOError(f'File {config_filename} not found') - - if filename.endswith('.json'): - new_config = json.loads(data) - elif filename.endswith('.yaml'): - new_config = yaml.safe_load(data) - else: - raise UnknownConfigFormatError(filename) - - new_config = self.walk_and_render(new_config) - - elif isinstance(new_config, ModuleType): - new_config = new_config.__dict__ - - elif callable(new_config): - new_config = new_config(self) - - if not new_config: - new_config = {} - - for k in new_config: - if callable(new_config[k]): - new_config[k] = new_config[k](context=self) - - if 'log_path' in new_config: - new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/') - - smart_merge_dicts(self, new_config, list_policy='override', copy=False) - if basename: - self._by_basenames[basename] = new_config - - return True - - def get_config_by_basename(self, basename): - return self._by_basenames[basename] - - def get_object_by_basename(self, basename): - return ConfigObject(self._by_basenames[basename]) - - def has_missed_configs(self): - return 
bool(self._omitted_files) - - def has_file(self, basename): - return basename in self._by_basenames - - def get_files(self): - return self._by_basenames diff --git a/library/configurator/exceptions.py b/library/configurator/exceptions.py deleted file mode 100644 index cbc1f99..0000000 --- a/library/configurator/exceptions.py +++ /dev/null @@ -1,2 +0,0 @@ -class UnknownConfigFormatError(Exception): - pass diff --git a/nexus/actions/document_operations_pb/update_document_scimag_pb.py b/nexus/actions/document_operations_pb/update_document_scimag_pb.py index 1bec67b..7b35df4 100644 --- a/nexus/actions/document_operations_pb/update_document_scimag_pb.py +++ b/nexus/actions/document_operations_pb/update_document_scimag_pb.py @@ -159,7 +159,6 @@ class ToSummaAction(BaseAction): 'journal', 'journal-issue', 'journal-volume', - 'other', 'peer-review', 'proceedings', 'report-series', diff --git a/nexus/bot/BUILD.bazel b/nexus/bot/BUILD.bazel index b8666ac..951f102 100644 --- a/nexus/bot/BUILD.bazel +++ b/nexus/bot/BUILD.bazel @@ -35,7 +35,7 @@ py3_image( requirement("aiobaseclient"), requirement("aiocrossref"), requirement("aiokit"), - "//library/configurator", + requirement("izihawa_configurator"), "//library/logging", "//library/telegram", "//nexus/hub/aioclient", diff --git a/nexus/bot/configs/__init__.py b/nexus/bot/configs/__init__.py index a954719..91097a5 100644 --- a/nexus/bot/configs/__init__.py +++ b/nexus/bot/configs/__init__.py @@ -1,5 +1,5 @@ +from izihawa_configurator import Configurator from izihawa_utils import env -from library.configurator import Configurator def get_config(): diff --git a/nexus/bot/handlers/view.py b/nexus/bot/handlers/view.py index d562091..d10d00d 100644 --- a/nexus/bot/handlers/view.py +++ b/nexus/bot/handlers/view.py @@ -104,7 +104,7 @@ class ViewHandler(BaseHandler): ), event.delete(), ] - if not has_found_old_widget: + if not has_found_old_widget and is_earlier_than_2_days(old_message): async with 
safe_execution(error_log=request_context.error_log): await self.application.telegram_client.delete_messages(request_context.chat.chat_id, [old_message_id]) return await asyncio.gather(*actions) diff --git a/nexus/cognitron/configs/scimag.yaml b/nexus/cognitron/configs/scimag.yaml index b345b4c..594552a 100644 --- a/nexus/cognitron/configs/scimag.yaml +++ b/nexus/cognitron/configs/scimag.yaml @@ -189,6 +189,13 @@ schema: record: basic tokenizer: raw stored: true + - name: series_page_rank + type: f64 + options: + fast: single + fieldnorms: false + indexed: true + stored: true multi_fields: ["authors", "ipfs_multihashes", "isbns", "issns", "references", "tags"] primary_key: "id" stop_words: ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'if', 'in', 'is', 'it', 'of', 'on', 'or', diff --git a/nexus/hub/BUILD.bazel b/nexus/hub/BUILD.bazel index 9510659..b6b8996 100644 --- a/nexus/hub/BUILD.bazel +++ b/nexus/hub/BUILD.bazel @@ -43,7 +43,7 @@ py3_image( requirement("aioipfs-2"), requirement("aiokit"), "//library/aiopostgres", - "//library/configurator", + requirement("izihawa_configurator"), "//library/telegram", "//nexus/hub/proto:grpc_py", "//nexus/hub/proto:proto_py", diff --git a/nexus/hub/configs/__init__.py b/nexus/hub/configs/__init__.py index 02c7ff7..e282a6d 100644 --- a/nexus/hub/configs/__init__.py +++ b/nexus/hub/configs/__init__.py @@ -1,5 +1,5 @@ +from izihawa_configurator import Configurator from izihawa_utils import env -from library.configurator import Configurator def get_config(): diff --git a/nexus/hub/configs/pylon.yaml b/nexus/hub/configs/pylon.yaml deleted file mode 100644 index 5091ed1..0000000 --- a/nexus/hub/configs/pylon.yaml +++ /dev/null @@ -1,629 +0,0 @@ ---- -pylon: - default_driver_proxy_list: - - [cambridge] - - [edinburg] - - [southampton] - default_resolver_proxy_list: ~ - downloads_directory: /downloads - proxies: - - address: clash.default.svc.cluster.example.com:7890 - name: cambridge - tags: ['cambridge'] - - 
address: clash.default.svc.cluster.example.com:7990 - name: edinburg - tags: ['edinburg'] - - address: clash.default.svc.cluster.example.com:8090 - name: southampton - tags: ['southampton'] - - address: socks5://clash.default.svc.cluster.example.com:7991 - name: socks5 - tags: ['socks5'] - sources: - # LibGen.rocks - - driver: - args: - proxy_list: ~ - validator: - class: nexus.pylon.validators.Md5Validator - class: - nexus.pylon.drivers.DirectDriver - matcher: - md5: ^.*$ - resolver: - args: - extractors: - - producer: - format_string: 'http://libgen.rocks/{matched_group}' - group: 0 - re: 'get\.php\?md5=.*&key=[A-Za-z\d]+' - timeout: 25.0 - type: regex - url: https://libgen.rocks/ads.php?md5={md5} - class: nexus.pylon.resolvers.RequestResolver - # LibGen.rocks - - driver: - args: - proxy_list: ~ - class: - nexus.pylon.drivers.DirectDriver - matcher: - doi: ^.*$ - resolver: - args: - extractors: - - producer: - format_string: 'http://libgen.rocks/{matched_group}' - group: 0 - re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+' - timeout: 25.0 - type: regex - url: 'https://libgen.rocks/ads.php?doi={doi}' - class: nexus.pylon.resolvers.RequestResolver - # Library.lol - - driver: - args: - proxy_list: ~ - validator: - class: nexus.pylon.validators.Md5Validator - class: - nexus.pylon.drivers.DirectDriver - matcher: - md5: ^.*$ - resolver: - args: - extractors: - - producer: - format_string: '{matched_group}' - group: 1 - re: 'GET' - timeout: 45.0 - type: regex - - producer: - format_string: '{matched_group}' - group: 0 - re: 'https://ipfs.io/ipfs/[A-Za-z\d]+' - type: regex - - producer: - format_string: '{matched_group}' - group: 0 - re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+' - type: regex - url: http://library.lol/main/{md5} - class: nexus.pylon.resolvers.RequestResolver - # library.lol - - driver: - args: - proxy_list: ~ - class: - nexus.pylon.drivers.DirectDriver - matcher: - doi: ^.*$ - resolver: - args: - extractors: - - producer: - 
format_string: '{matched_group}' - group: 1 - re: 'GET' - timeout: 45.0 - type: regex - url: 'http://library.lol/scimag/{doi}' - class: nexus.pylon.resolvers.RequestResolver - # jamanetwork.com - - driver: - args: - actions: - - selector: '#pdf-link .toolbar-link-text-extra, .contents-tab-contents > #pdf-link .toolbar-link-text-extra' - timeout: 20 - type: wait_css_selector - - type: click - class: - nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.1001/.*$ - # wires.onlinelibrary.wiley.com - - matcher: - doi: ^10.1002/.*$ - resolver: - args: - format_string: 'https://onlinelibrary.wiley.com/doi/pdf/{doi}' - class: nexus.pylon.resolvers.TemplateResolver - # link.springer.com - - matcher: - doi: ^10.(1007|14283)/.*$ - resolver: - args: - format_string: 'https://link.springer.com/content/pdf/{doi}.pdf' - class: nexus.pylon.resolvers.TemplateResolver - # www.sciencedirect.com - - matcher: - doi: ^10.(1016|1053|4103)/.*$ - resolver: - args: - format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true' - selector: '(.resource.primary.URL | split("/"))[-1]' - timeout: 40.0 - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # www.clinicalkey.com - - driver: - args: - actions: - - selector: '.x-pdf' - timeout: 30.0 - type: wait_css_selector - - type: click - class: - nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.1016/.*$ - # www.cambridge.org - - matcher: - doi: ^10.1017/.*$ - resolver: - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # pubs.acs.org - - matcher: - doi: ^10.1021/.*$ - resolver: - args: - format_string: 'https://pubs.acs.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # www.nature.com - - driver: - args: - actions: - - selector: '#entitlement-box-right-column span' - timeout: 30 - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.1038/.*$ - # www.nejm.org - - matcher: - doi: ^10.1056/.*$ - 
resolver: - args: - format_string: 'https://www.nejm.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # ascelibrary.org - - matcher: - doi: ^10.1061/.*$ - resolver: - args: - format_string: 'https://ascelibrary.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # www.pnas.org - - matcher: - doi: ^10.1073/.*$ - resolver: - args: - format_string: 'https://www.pnas.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # www.tandfonline.com - - matcher: - doi: ^10.1080/.*$ - resolver: - args: - format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # iopscience.iop.org - - matcher: - doi: ^10.1088/.*$ - resolver: - args: - format_string: 'https://iopscience.iop.org/article/{doi}/pdf' - class: nexus.pylon.resolvers.TemplateResolver - # academic.oup.com - - driver: - args: - actions: - - selector: '.pdf-link-text' - timeout: 30 - type: wait_css_selector - - type: click - proxy_list: - - [edinburg] - - [cambridge] - - [southampton] - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.1093/.*$ - # journals.lww.com - - driver: - args: - actions: - - selector: '.ejp-article-tools__list-icon-holder > .icon-pdf' - timeout: 30.0 - type: wait_css_selector - - type: click - - selector: '.ejp-article-tools__dropdown-list-button > .icon-pdf' - timeout: 5.0 - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.(1097|1213|5435|14309)/.*$ - resolver: - args: - timeout: 30.0 - class: nexus.pylon.resolvers.TemplateResolver - # journals.aps.org - - matcher: - doi: ^10.1103/.*$ - resolver: - args: - selector: '[.link[] | select(.URL | ascii_downcase | endswith("fulltext"))][0].URL' - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # emerald.com - - matcher: - doi: ^10.1108/.*$ - resolver: - args: - format_string: 
'https://www.emerald.com/insight/content/doi/{doi}/full/pdf' - class: nexus.pylon.resolvers.TemplateResolver - # ieeexplore.ieee.org - - matcher: - doi: ^10.1109/.*$ - resolver: - args: - format_string: 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={selected}' - selector: '(.resource.primary.URL | split("/"))[-2]' - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # onlinelibrary.wiley.com - - matcher: - doi: ^10.1111/.*$ - resolver: - args: - format_string: 'https://onlinelibrary.wiley.com/doi/pdfdirect/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # asa.scitation.org - - matcher: - doi: ^10.1121/.*$ - resolver: - args: - format_string: 'https://asa.scitation.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # www.science.org - - matcher: - doi: ^10.1126/.*$ - resolver: - args: - format_string: 'https://www.science.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # ^10.1136/.*$ - - driver: - args: - actions: - - selector: '.article-pdf-download > img' - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.1136/.*$ - # ^10.1136/.*$ - - driver: - args: - actions: - - selector: '.icon' - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.1136/.*$ - resolver: - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # dl.acm.org - - matcher: - doi: ^10.1145/.*$ - resolver: - args: - format_string: 'https://dl.acm.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # www.annualreviews.org - - matcher: - doi: ^10.1146/.*$ - resolver: - args: - format_string: 'https://www.annualreviews.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # journals.physiology.org - - matcher: - doi: ^10.1152/.*$ - resolver: - args: - format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true' - class: 
nexus.pylon.resolvers.TemplateResolver - # www.ahajournals.org - - matcher: - doi: ^10.1161/.*$ - resolver: - args: - format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # ajp.psychiatryonline.org - - matcher: - doi: ^10.1176/.*$ - resolver: - args: - format_string: 'https://ajp.psychiatryonline.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # journals.sagepub.com - - matcher: - doi: ^10.1177/.*$ - resolver: - args: - format_string: 'https://journals.sagepub.com/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # bmcpsychiatry.biomedcentral.com - - matcher: - doi: ^10.1186/.*$ - resolver: - args: - format_string: 'https://bmcpsychiatry.biomedcentral.com/track/pdf/{doi}.pdf' - class: nexus.pylon.resolvers.TemplateResolver - # journals.plos.org - - driver: - args: - proxy_list: ~ - class: nexus.pylon.drivers.direct.DirectDriver - matcher: - doi: ^10.1371/.*$ - resolver: - args: - format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable' - class: nexus.pylon.resolvers.TemplateResolver - # bioone.org - - driver: - args: - actions: - - selector: 'DOWNLOAD PAPER' - type: wait_link_text - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.(1638|1654|1667|2108)/.*$ - # jasn.asnjournals.org - - driver: - args: - actions: - - selector: 'View PDF' - type: wait_link_text - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.1681/.*$ - # papers.ssrn.com - - driver: - args: - actions: - - selector: '.abstract-buttons:nth-child(1) .primary > span' - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.2139/.*$ - # www.afghandata.org - - driver: - args: - proxy_list: ~ - class: - nexus.pylon.drivers.DirectDriver - matcher: - doi: ^10.(2458|29171)/.*$ - # www.euppublishing.com - - matcher: - doi: ^10.3366/.*$ - 
resolver: - args: - format_string: 'https://www.euppublishing.com/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # www.frontiersin.org - - driver: - args: - actions: - - selector: '.paper > .dropdown-toggle' - type: wait_css_selector - - type: click - - selector: '.download-files-pdf' - type: wait_css_selector - - type: click - class: - nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.3389/.*$ - resolver: - args: - selector: '[.link[] | select(.URL | ascii_downcase | endswith("full"))][0].URL' - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # bjgp.org - - driver: - args: - actions: - - selector: 'PDF' - type: wait_link_text - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.3399/.*$ - # www.wjgnet.com - - driver: - args: - actions: - - selector: 'Full Article (PDF)' - type: wait_link_text - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.3748/.*$ - resolver: - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # journals.vilniustech.lt - - driver: - args: - actions: - - selector: '.label' - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.3846/.*$ - resolver: - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # isegoria.revistas.csic.es - - matcher: - doi: ^10.3989/.*$ - resolver: - args: - selector: '[.link[] | select(."intended-application" == "similarity-checking")][0].URL' - class: nexus.pylon.resolvers.DoiOrgRequestResolver - # www.psychiatrist.com - - driver: - args: - actions: - - selector: '.article-dwndpdf > img' - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.4088/.*$ - # www.scirp.org - - driver: - args: - actions: - - selector: 'PDF' - type: wait_link_text - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.4236/.*$ - # jsbr.be - - driver: - args: - actions: - - selector: 'h4 > .fa-download' - type: 
wait_css_selector - - type: click - - selector: 'PDF (EN)' - type: wait_link_text - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.5334/.*$ - # hess.copernicus.org - - driver: - args: - proxy_list: ~ - class: nexus.pylon.drivers.DirectDriver - matcher: - doi: ^10.5194/.*$ - # ProQuest - - driver: - args: - actions: - - selector: '#searchTerm' - type: wait_css_selector - - type: click - - selector: '#searchTerm' - text: '{doi}' - type: type - - selector: '.uxf-search' - type: wait_css_selector - - type: click - - selector: '.uxf-download' - type: wait_css_selector - - type: click - proxy_list: ~ - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.5585/.* - resolver: - args: - format_string: 'https://www.proquest.com/' - class: nexus.pylon.resolvers.TemplateResolver - # jcsm.aasm.org - - matcher: - doi: ^10.5664/.*$ - resolver: - args: - format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true' - class: nexus.pylon.resolvers.TemplateResolver - # journal.permsc.ru - - driver: - args: - actions: - - selector: '.obj_galley_link' - type: wait_css_selector - - type: click - - selector: '.label' - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.7242/.*$ - # amjcaserep.com - - driver: - args: - actions: - - selector: "//input[@value='Download PDF version']" - type: wait_xpath - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.12659/.*$ - # medcraveonline.com - - driver: - args: - actions: - - selector: 'Download PDF' - type: wait_link_text - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.15406/.*$ - # www.researchsquare.com - - driver: - args: - actions: - - selector: '.hidden > .text-blue-600 > .antialiased' - type: wait_css_selector - - type: native_click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.21203/.*$ - # www.ukm.my/ - - driver: - args: - proxy_list: ~ - class: 
nexus.pylon.drivers.DirectDriver - matcher: - doi: ^10.24035/.*$ - # journals.library.ryerson.ca - - driver: - args: - actions: - - selector: '#a11y-1-tab-tab-download' - type: wait_css_selector - - type: click - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.32920/.*$ - # papers.cumincad.org - - driver: - args: - actions: - - selector: 'file.pdf' - type: wait_link_text - - type: click - proxy_list: ~ - class: nexus.pylon.drivers.BrowserDriver - matcher: - doi: ^10.52842/.*$ - # ^.*$ - - matcher: - doi: ^.*$ - resolver: - args: - selector: '[(.link | if . == null then [] else . end)[] | select((."content-type" == "application/pdf") or (.URL | ascii_downcase | contains("pdf")))][0].URL' - class: nexus.pylon.resolvers.DoiOrgRequestResolver diff --git a/nexus/hub/main.py b/nexus/hub/main.py index 3799b17..03a3c06 100644 --- a/nexus/hub/main.py +++ b/nexus/hub/main.py @@ -5,9 +5,9 @@ import uvloop from aiogrobid import GrobidClient from aioipfs import AsyncIPFS as AsyncIPFS from idm.api.aioclient import IdmApiGrpcClient +from izihawa_configurator import Configurator from library.aiogrpctools import AioGrpcServer from library.aiopostgres import AioPostgresPoolHolder -from library.configurator import Configurator from library.logging import configure_logging from library.telegram.base import BaseTelegramClient from nexus.hub.configs import get_config @@ -16,6 +16,7 @@ from nexus.hub.services.mutual_aid_service import MutualAidService from nexus.hub.services.submitter import SubmitterService from nexus.hub.user_manager import UserManager from nexus.meta_api.aioclient import MetaApiGrpcClient +from nexus.pylon.configs import get_config as get_default_pylon_config class GrpcServer(AioGrpcServer): @@ -79,7 +80,7 @@ class GrpcServer(AioGrpcServer): should_parse_with_grobid=config['application']['should_parse_with_grobid'], should_store_hashes=config['application']['should_store_hashes'], telegram_bot_configs=config['telegram']['bots'], - 
pylon_config=config['pylon'], + pylon_config=config.get('pylon') or get_default_pylon_config()['pylon'], ) self.submitter_service = SubmitterService( application=self, diff --git a/nexus/hub/services/base.py b/nexus/hub/services/base.py index 0ff6939..8706dde 100644 --- a/nexus/hub/services/base.py +++ b/nexus/hub/services/base.py @@ -65,6 +65,7 @@ class BaseHubService(BaseService): await asyncio.gather( self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake2b-256', only_hash=True), self.application.ipfs_client.add_bytes(file, cid_version=0, hash='sha2-256', only_hash=True), + self.application.ipfs_client.add_bytes(file, cid_version=1, hash='blake3', only_hash=True), ) )) diff --git a/nexus/hub/services/delivery.py b/nexus/hub/services/delivery.py index 91f0519..fc0833c 100644 --- a/nexus/hub/services/delivery.py +++ b/nexus/hub/services/delivery.py @@ -73,13 +73,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService self.downloadings = set() self.is_sharience_enabled = is_sharience_enabled self.maintenance_picture_url = maintenance_picture_url - self.pylon_client = PylonClient( - proxies=pylon_config['proxies'], - source_configs=pylon_config['sources'], - default_driver_proxy_list=pylon_config['default_driver_proxy_list'], - default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'], - downloads_directory=pylon_config['downloads_directory'], - ) + self.pylon_client = PylonClient(config=pylon_config) self.should_parse_with_grobid = should_parse_with_grobid self.should_store_hashes = should_store_hashes self.telegram_bot_configs = telegram_bot_configs @@ -170,6 +164,15 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService return delivery_service_pb2.StartDeliveryResponse(status=delivery_service_pb2.StartDeliveryResponse.Status.OK) +async def delayed_task(create_task, t): + try: + await asyncio.sleep(t) + task = create_task() + await task + except asyncio.CancelledError: + pass 
+ + class DownloadTask: def __init__( self, @@ -204,7 +207,7 @@ class DownloadTask: ) async def download_task(self, request_context: RequestContext, document_holder): - throttle_secs = 2.0 + throttle_secs = 3.0 async def _on_fail(): await self.application.telegram_clients[request_context.bot_name].send_message( @@ -218,6 +221,7 @@ class DownloadTask: error_log=request_context.error_log, on_fail=_on_fail, ): + start_time = time.time() filename = document_holder.get_filename() progress_bar_download = ProgressBar( telegram_client=self.application.telegram_clients[request_context.bot_name], @@ -226,9 +230,9 @@ class DownloadTask: header=f'⬇️ {filename}', tail_text=t('TRANSMITTED_FROM', request_context.chat.language), throttle_secs=throttle_secs, + last_call=start_time, ) downloads_gauge.inc() - start_time = time.time() try: file = await self.download( document_holder=document_holder, @@ -242,11 +246,21 @@ class DownloadTask: ) if not document_holder.md5 and document_holder.get_extension() == 'pdf': try: - await progress_bar_download.send_message( - t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename), - ignore_last_call=True + processing_message_task = asyncio.create_task(delayed_task( + create_task=lambda: progress_bar_download.send_message( + t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename), + ignore_last_call=True + ), + t=5.0 + )) + file = await asyncio.get_running_loop().run_in_executor( + None, + lambda: clean_metadata(file, doi=document_holder.doi) ) - file = clean_metadata(file, doi=document_holder.doi) + + processing_message_task.cancel() + await processing_message_task + request_context.statbox( action='cleaned', len=len(file), @@ -260,7 +274,8 @@ class DownloadTask: banner=t("LOOKING_AT", request_context.chat.language), header=f'⬇️ {filename}', tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language), - throttle_secs=throttle_secs + throttle_secs=throttle_secs, + 
last_call=progress_bar_download.last_call, ) uploaded_message = await self.delivery_service.send_file( document_holder=self.document_holder, @@ -393,11 +408,15 @@ class DownloadTask: async def download(self, document_holder, progress_bar): collected = bytearray() - if document_holder.doi: - try: - params = {'doi': document_holder.doi} - if document_holder.md5: - params['md5'] = document_holder.md5 + params = {} + try: + if document_holder.doi: + params['doi'] = document_holder.doi + if document_holder.md5: + params['md5'] = document_holder.md5 + if document_holder.ipfs_multihashes: + params['ipfs_multihashes'] = [ipfs_multihash for ipfs_multihash in document_holder.ipfs_multihashes] + if params: async for resp in self.delivery_service.pylon_client.download(params): await self.process_resp( resp=resp, @@ -406,20 +425,8 @@ class DownloadTask: filesize=document_holder.filesize, ) return bytes(collected) - except DownloadError: - pass - if document_holder.md5: - try: - async for resp in self.delivery_service.pylon_client.download({'md5': document_holder.md5}): - await self.process_resp( - resp=resp, - progress_bar=progress_bar, - collected=collected, - filesize=document_holder.filesize, - ) - return bytes(collected) - except DownloadError: - pass + except DownloadError: + pass async def external_cancel(self): self.request_context.statbox(action='externally_canceled') diff --git a/nexus/ingest/BUILD.bazel b/nexus/ingest/BUILD.bazel index a90c356..6d22fb3 100644 --- a/nexus/ingest/BUILD.bazel +++ b/nexus/ingest/BUILD.bazel @@ -27,7 +27,7 @@ py3_image( requirement("aiokit"), requirement("aiolibgen"), "//library/aiopostgres", - "//library/configurator", + requirement("izihawa_configurator"), "//library/jobber", "//nexus/actions", ], diff --git a/nexus/ingest/jobs/crossref_api.py b/nexus/ingest/jobs/crossref_api.py index 1950e89..7a22899 100644 --- a/nexus/ingest/jobs/crossref_api.py +++ b/nexus/ingest/jobs/crossref_api.py @@ -39,8 +39,8 @@ class CrossrefApiJob(BaseJob): }) 
count = 0 async for chunk in self.crossref_client.works_cursor( - filter=f'from-index-date:{self.from_date}', - rows=500, + filter=f'from-index-date:{self.from_date}', + rows=500, ): for item in chunk['items']: yield item @@ -50,4 +50,4 @@ class CrossrefApiJob(BaseJob): 'mode': 'ingest', 'items': count, 'target_date': self.from_date, - }) \ No newline at end of file + }) diff --git a/nexus/ingest/jobs/postgres.py b/nexus/ingest/jobs/postgres.py index be3feae..9362bfb 100644 --- a/nexus/ingest/jobs/postgres.py +++ b/nexus/ingest/jobs/postgres.py @@ -34,6 +34,7 @@ class PostgresJob(BaseJob): f'user={database["username"]} ' f'password={database["password"]} ' f'host={database["host"]}', + timeout=3600 * 2, ) self.summa_client = SummaClient(endpoint=summa['endpoint']) self.summa_config = summa @@ -84,6 +85,7 @@ class PostgresJob(BaseJob): # Mandatory for server side cursor cursor_name='nexus_ingest_cursor', itersize=50_000, + statement_timeout=3600 * 2, ): loaded = True yield row @@ -95,8 +97,12 @@ class PostgresJob(BaseJob): # Mandatory for server side cursor cursor_name='nexus_ingest_cursor', itersize=50_000, + statement_timeout=3600 * 2, ): yield row - await self.summa_client.commit_index(self.summa_config['name'], session_id=session_id) + await self.summa_client.commit_index( + self.summa_config['name'], + session_id=session_id, + ) await self.summa_client.set_index_alias(self.summa_config['index_alias'], self.summa_config['name'], session_id=session_id) diff --git a/nexus/meta_api/BUILD.bazel b/nexus/meta_api/BUILD.bazel index 6f2a18b..d3d67fc 100644 --- a/nexus/meta_api/BUILD.bazel +++ b/nexus/meta_api/BUILD.bazel @@ -25,7 +25,7 @@ DEPS = [ "//library/aiogrpctools", requirement("aiokit"), "//library/aiopostgres", - "//library/configurator", + requirement("izihawa_configurator"), "//library/logging", "//nexus/meta_api/proto:grpc_py", "//nexus/models/proto:proto_py", diff --git a/nexus/meta_api/configs/__init__.py b/nexus/meta_api/configs/__init__.py index 
0952810..5043832 100644 --- a/nexus/meta_api/configs/__init__.py +++ b/nexus/meta_api/configs/__init__.py @@ -1,5 +1,5 @@ +from izihawa_configurator import Configurator from izihawa_utils import env -from library.configurator import Configurator def get_config(): diff --git a/nexus/meta_api/services/search.py b/nexus/meta_api/services/search.py index 7f5a93f..35fbd0b 100644 --- a/nexus/meta_api/services/search.py +++ b/nexus/meta_api/services/search.py @@ -315,7 +315,7 @@ class SearchService(SearchServicer, BaseService): with suppress(RetryError): async for attempt in AsyncRetrying( retry=retry_if_exception_type(NeedRetryError), - wait=wait_fixed(5), + wait=wait_fixed(10), stop=stop_after_attempt(6) ): with attempt: diff --git a/nexus/models/proto/scimag.proto b/nexus/models/proto/scimag.proto index 66e3019..ba83b57 100644 --- a/nexus/models/proto/scimag.proto +++ b/nexus/models/proto/scimag.proto @@ -33,4 +33,5 @@ message Scimag { string volume = 21; int32 year = 30; float page_rank = 34; + float series_page_rank = 35; } diff --git a/nexus/pipe/BUILD.bazel b/nexus/pipe/BUILD.bazel index 9f6431c..4fbc938 100644 --- a/nexus/pipe/BUILD.bazel +++ b/nexus/pipe/BUILD.bazel @@ -26,7 +26,7 @@ py3_image( requirement("aiocrossref"), requirement("aiokit"), "//library/aiopostgres", - "//library/configurator", + requirement("izihawa_configurator"), "//library/logging", "//nexus/actions", "//nexus/models/proto:proto_py", diff --git a/nexus/promotions/BUILD.bazel b/nexus/promotions/BUILD.bazel index 57f4422..e670166 100644 --- a/nexus/promotions/BUILD.bazel +++ b/nexus/promotions/BUILD.bazel @@ -10,6 +10,6 @@ py_library( srcs_version = "PY3", visibility = ["//visibility:public"], deps = [ - "//library/configurator", + requirement("izihawa_configurator"), ], ) diff --git a/nexus/promotions/__init__.py b/nexus/promotions/__init__.py index 4bd76b0..4ad4db9 100644 --- a/nexus/promotions/__init__.py +++ b/nexus/promotions/__init__.py @@ -1,4 +1,4 @@ -from library.configurator import 
Configurator +from izihawa_configurator import Configurator def get_promotions(): diff --git a/nexus/promotions/promotions.yaml b/nexus/promotions/promotions.yaml index 0c8b20b..8c66e81 100644 --- a/nexus/promotions/promotions.yaml +++ b/nexus/promotions/promotions.yaml @@ -19,6 +19,9 @@ promotions: - texts: en: 💬 Research is the only and ultimate goal weight: 1 + - texts: + en: 💬 Intellectual property is not a valid form of property + weight: 1 - texts: en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/{mutual_aid_group}) ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/{mutual_aid_group}) diff --git a/nexus/pylon/BUILD.bazel b/nexus/pylon/BUILD.bazel index 4c3ab61..8815183 100644 --- a/nexus/pylon/BUILD.bazel +++ b/nexus/pylon/BUILD.bazel @@ -1,12 +1,16 @@ -load("@rules_python//python:defs.bzl", "py_binary", "py_library") +load("@rules_python//python:defs.bzl", "py_library") +load("@rules_python//python:packaging.bzl", "py_wheel") load("@pip_modules//:requirements.bzl", "requirement") +filegroup( + name = "data", + srcs = ["configs/pylon.yaml"], +) + py_library( name = "pylon", srcs = glob(["**/*.py"]), - data = [ - "configs/pylon.yaml", - ], + data = [":data"], visibility = ["//visibility:public"], deps = [ requirement("aiodns"), @@ -16,6 +20,7 @@ py_library( requirement("brotli"), requirement("cchardet"), requirement("certifi"), + requirement("fire"), requirement("jq"), requirement("orjson"), requirement("pypdf2"), @@ -23,20 +28,38 @@ py_library( requirement("selenium"), requirement("tenacity"), requirement("aiokit"), - "//library/configurator", + requirement("izihawa_configurator"), "//library/logging", "//nexus/pylon/proto:pylon_proto_py", ], ) -py_binary( - name = "cli", - srcs = ["cli.py"], - main = "cli.py", - srcs_version = "PY3", - visibility = ["//visibility:public"], +py_wheel( + name = "nexus-pylon-wheel", + author = "The Superpirate", + author_email = "fist.of.the.first.pirates@gmail.com", + classifiers = 
[ + "Programming Language :: Python :: 3.10", + ], + description_file = ":README.md", + distribution = "nexus-pylon", + entry_points = {"console_scripts": ["pylon = nexus.pylon.cli:main"]}, + homepage = "https://github.com/nexus-stc/hyperboria/tree/master/nexus/pylon", + license = "MIT License", + python_requires = ">=3.10", + python_tag = "py3", + requires = [ + "aiokit >= 1.0.0", + "izihawa_configurator >= 1.0.0", + "selenium >= 4.3.0", + ], + strip_path_prefixes = [ + "nexus/pylon/proto/pylon_proto_py_pb", + ], + version = "1.0.0", deps = [ - requirement("fire"), + ":data", ":pylon", + "//nexus/pylon/proto:pylon_proto_py", ], ) diff --git a/nexus/pylon/README.md b/nexus/pylon/README.md index 57d3fd1..d8068e1 100644 --- a/nexus/pylon/README.md +++ b/nexus/pylon/README.md @@ -6,16 +6,51 @@ - Streams data by chunks - GRPC-ready +## Build + +```bash +bazel build -c opt nexus-pylon-wheel +``` + +## Install + +### PIP +```bash +pip install nexus-pylon +``` + ## Nexus Pylon CLI -Casual download -```bash -bazel run -c opt cli -- doi 10.1056/NEJMoa2033700 --output article.pdf +Download scientific publication: +```bash +pylon download --doi 10.1182/blood-2011-03-325258 --output article.pdf ``` -Download with proxies -```bash -bazel run -c opt cli -- md5 278C3A72B7B04717361501B8642857DF \ - --output file.pdf \ - --proxies socks5://127.0.0.1:9050 +Download file by its MD5: +```bash +pylon download --md5 f07707ee92fa675fd4ee53e3fee977d1 --output article.pdf ``` + +Download file by its multihash: +```bash +pylon download --ipfs-multihashes '["bafykbzacea3vduqii3u52xkzdqan5oc54vsvedmed25dfybrqxyafahjl3rzu"]' --output article.pdf +``` + +### Using with Selenium + +Create directory for exchanging files between host and launched Selenium in Docker +```bash +mkdir downloads +``` + +Launch Selenium in Docker +```bash +docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest +``` + +Launch Pylon +```bash +pylon download --doi
10.1101/2022.09.09.507349 --output article.pdf \ +--wd-endpoint 'http://127.0.0.1:4444/wd/hub' \ +--wd-directory /downloads --wd-host-directory $(pwd)/downloads --debug +``` \ No newline at end of file diff --git a/nexus/pylon/cli.py b/nexus/pylon/cli.py index 476d9eb..7dd459f 100644 --- a/nexus/pylon/cli.py +++ b/nexus/pylon/cli.py @@ -1,15 +1,17 @@ import logging import os import sys +from typing import Optional import fire from aiokit.utils import sync_fu -from nexus.pylon.client import ( +from izihawa_configurator import Configurator + +from .client import ( DownloadError, PylonClient, ) -from nexus.pylon.configs import get_config -from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb +from .proto.file_pb2 import FileResponse as FileResponsePb def resolve_path(filepath): @@ -27,22 +29,20 @@ async def fetch( collected = bytes() try: last_len = 0 - last_source = '' async for resp in iter: if resp.HasField('status'): if resp.status == FileResponsePb.Status.BEGIN_TRANSMISSION: - print(f'Started transmission from {resp.source}...', end='\r', file=sys.stderr) + print(f'Started transmission...', file=sys.stderr) last_len = 0 - last_source = resp.source collected = bytes() elif resp.HasField('chunk'): if len(collected) - last_len > 1024 * 100: - print(f'Loaded {len(collected)} bytes from {resp.source}', end='\r', file=sys.stderr) + print(f'Loaded {len(collected)} bytes', end='\r', file=sys.stderr) last_len = len(collected) - last_source = resp.source collected += resp.chunk.content with open(resolve_path(output), 'wb') as f: - print(f'Completed! Loaded {len(collected)} bytes from {last_source}', file=sys.stderr) + print() + print(f'Completed! 
Loaded {len(collected)} bytes', file=sys.stderr) f.write(collected) except DownloadError: print('File not found') @@ -50,25 +50,53 @@ async def fetch( async def download( output: str, + config: Optional[str] = None, debug: bool = False, + wd_endpoint: Optional[str] = None, + wd_directory: Optional[str] = None, + wd_host_directory: Optional[str] = None, **params, ): + """ + Download scientific publications from various sources + Large portion of fresh articles could be retrieved only through publisher libraries through `BrowserDriver`, it + requires Selenium webdriver: + `docker run -e SE_START_XVFB=false -v $(pwd)/downloads:/downloads -p 4444:4444 selenium/standalone-chrome:latest` + Args: + output: name of the output file + config: pylon config + debug: enable debug logging + wd_endpoint: web-driver + wd_directory: mounted directory inside Docker image + wd_host_directory: directory for downloads on host that should be mounted as `wd_directory` inside Docker image + """ if debug: logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) - c = get_config()['pylon'] - p = PylonClient( - proxies=c['proxies'], - source_configs=c['sources'], - default_driver_proxy_list=c['default_driver_proxy_list'], - downloads_directory=c['downloads_directory'], - ) - return await fetch(iter=p.download(params=params), output=output) + + default_config_path = os.path.join(os.path.dirname(__file__), 'configs/pylon.yaml') + config = Configurator([config if config else default_config_path], env_prefix='NEXUS_PYLON') + config = config['pylon'] + if wd_endpoint: + config.setdefault('webdriver_hub', {}) + config['webdriver_hub']['endpoint'] = wd_endpoint + if not wd_directory: + raise ValueError('Should pass --wd-directory with --wd-endpoint') + config['webdriver_hub']['downloads_directory'] = wd_directory + if not wd_host_directory: + raise ValueError('Should pass --wd-host-directory with --wd-endpoint') + config['webdriver_hub']['host_downloads_directory'] = wd_host_directory + +
pylon_client = PylonClient(config=config) + return await fetch(iter=pylon_client.download(params=params), output=output) def main(): - fire.Fire({ - 'download': sync_fu(download), - }) + try: + fire.Fire({ + 'download': sync_fu(download), + }) + except KeyboardInterrupt: + sys.exit(1) if __name__ == '__main__': diff --git a/nexus/pylon/client.py b/nexus/pylon/client.py index 9d3fbab..e02f6c9 100644 --- a/nexus/pylon/client.py +++ b/nexus/pylon/client.py @@ -1,12 +1,10 @@ +import logging from typing import ( AsyncIterable, Dict, - List, - Optional, ) from aiokit import AioThing -from library.logging import error_log from nexus.pylon.exceptions import ( DownloadError, NotFoundError, @@ -17,30 +15,25 @@ from nexus.pylon.source import Source class PylonClient(AioThing): - def __init__( - self, - source_configs: Optional[List], - proxies: Optional[List[str]] = None, - downloads_directory: Optional[str] = None, - default_driver_proxy_list: [Optional[List]] = None, - default_resolver_proxy_list: [Optional[List]] = None, - ): + def __init__(self, config): super().__init__() - self.proxy_manager = ProxyManager(proxies) - self.downloads_directory = downloads_directory - self.default_driver_proxy_list = default_driver_proxy_list - self.default_resolver_proxy_list = default_resolver_proxy_list + self.config = config + self.proxy_manager = ProxyManager(config.get('proxies')) self.sources = [] - for source_config in source_configs: + if config.get('webdriver_hub') is None: + logging.getLogger('nexus_pylon').warning({ + 'action': 'missed_webdriver', + 'mode': 'pylon', + }) + for source_config in config['sources']: source = Source.from_config( proxy_manager=self.proxy_manager, + config=self.config, source_config=source_config, - downloads_directory=downloads_directory, - default_driver_proxy_list=default_driver_proxy_list, - default_resolver_proxy_list=default_resolver_proxy_list, ) - self.sources.append(source) - self.starts.append(source) + if source: + 
self.sources.append(source) + self.starts.append(source) async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]: for source in self.sources: @@ -50,9 +43,10 @@ class PylonClient(AioThing): async for resp in source.download(params): yield resp return - except NotFoundError: + except NotFoundError as e: + logging.getLogger('nexus_pylon').debug(e) continue except DownloadError as e: - error_log(e) + logging.getLogger('nexus_pylon').warning(e) continue - raise NotFoundError() + raise NotFoundError(params=params) diff --git a/nexus/pylon/configs/__init__.py b/nexus/pylon/configs/__init__.py index 834248b..6e32e46 100644 --- a/nexus/pylon/configs/__init__.py +++ b/nexus/pylon/configs/__init__.py @@ -1,5 +1,4 @@ -from izihawa_utils import env -from library.configurator import Configurator +from izihawa_configurator import Configurator def get_config(): diff --git a/nexus/pylon/configs/pylon.yaml b/nexus/pylon/configs/pylon.yaml index ed96b83..d7ef6a6 100644 --- a/nexus/pylon/configs/pylon.yaml +++ b/nexus/pylon/configs/pylon.yaml @@ -1,65 +1,25 @@ --- pylon: - default_driver_proxy_list: - - [proxy1] - - [proxy2] - - [proxy3] - downloads_directory: /downloads - proxies: - - address: proxy1.net:7890 - name: proxy1 - tags: [proxy1] - - address: proxy2.net:7990 - name: proxy2 - tags: [proxy2] - - address: proxy3.net:8090 - name: proxy3 - tags: [proxy3] + default_driver_proxy_list: ~ + default_resolver_proxy_list: ~ + proxies: ~ sources: - # LibGen.rocks + # IPFS - driver: args: - proxy_list: ~ validator: - class: nexus.pylon.validators.Md5Validator + class: nexus.pylon.validators.BaseValidator class: nexus.pylon.drivers.DirectDriver matcher: - md5: ^.*$ + ipfs_multihashes: ^.*$ resolver: args: - extractors: - - producer: - format_string: 'http://libgen.rocks/{matched_group}' - group: 0 - re: 'get\.php\?md5=.*&key=[A-Za-z\d]+' - timeout: 25.0 - type: regex - url: https://libgen.rocks/ads.php?md5={md5} - class: nexus.pylon.resolvers.RequestResolver - # 
LibGen.rocks - - driver: - args: - proxy_list: ~ - class: - nexus.pylon.drivers.DirectDriver - matcher: - doi: ^.*$ - resolver: - args: - extractors: - - producer: - format_string: 'http://libgen.rocks/{matched_group}' - group: 0 - re: 'get\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+' - timeout: 25.0 - type: regex - url: 'https://libgen.rocks/ads.php?doi={doi}' - class: nexus.pylon.resolvers.RequestResolver + format_string: 'https://ipfs.io/ipfs/{ipfs_multihashes[0]}' + class: nexus.pylon.resolvers.TemplateResolver # Library.lol - driver: args: - proxy_list: ~ validator: class: nexus.pylon.validators.Md5Validator class: @@ -70,27 +30,22 @@ pylon: args: extractors: - producer: - format_string: '{matched_group}' - group: 1 - re: 'GET' - timeout: 45.0 + format_string: '{href}' + timeout: 45.0 + re: 'GET' type: regex - producer: - format_string: '{matched_group}' - group: 0 - re: 'https://ipfs.io/ipfs/[A-Za-z\d]+' + format_string: '{url}' + re: '(?Phttps://ipfs.io/ipfs/[A-Za-z\d]+)' type: regex - producer: - format_string: '{matched_group}' - group: 0 - re: 'https://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+' + format_string: '{url}' + re: '(?Phttps://cloudflare-ipfs.com/ipfs/[A-Za-z\d]+)' type: regex url: http://library.lol/main/{md5} class: nexus.pylon.resolvers.RequestResolver # library.lol - driver: - args: - proxy_list: ~ class: nexus.pylon.drivers.DirectDriver matcher: @@ -99,13 +54,48 @@ pylon: args: extractors: - producer: - format_string: '{matched_group}' - group: 1 - re: 'GET' - timeout: 45.0 + format_string: '{href}' + timeout: 45.0 + re: 'GET' type: regex url: 'http://library.lol/scimag/{doi}' class: nexus.pylon.resolvers.RequestResolver + # LibGen.rocks + - driver: + args: + validator: + class: nexus.pylon.validators.Md5Validator + class: + nexus.pylon.drivers.DirectDriver + matcher: + md5: ^.*$ + resolver: + args: + extractors: + - producer: + format_string: 'http://libgen.rocks/{key}' + timeout: 25.0 + re: '(?Pget\.php\?md5=.*&key=[A-Za-z\d]+)' + 
type: regex + resolve_timeout: 25.0 + url: https://libgen.rocks/ads.php?md5={md5} + class: nexus.pylon.resolvers.RequestResolver + # LibGen.rocks + - driver: + class: + nexus.pylon.drivers.DirectDriver + matcher: + doi: ^.*$ + resolver: + args: + extractors: + - producer: + format_string: 'http://libgen.rocks/{key}' + timeout: 25.0 + re: '(?Pget\.php\?md5=[a-fA-F\d]+&key=[A-Za-z\d]+(&doi=[^\"]*)+)' + type: regex + url: 'https://libgen.rocks/ads.php?doi={doi}' + class: nexus.pylon.resolvers.RequestResolver # jamanetwork.com - driver: args: @@ -206,6 +196,13 @@ pylon: format_string: 'https://www.tandfonline.com/doi/pdf/{doi}?download=true' class: nexus.pylon.resolvers.TemplateResolver # iopscience.iop.org + - matcher: + doi: ^10.1088/.*$ + resolver: + args: + format_string: 'https://iopscience.iop.org/article/{doi}/pdf' + class: nexus.pylon.resolvers.TemplateResolver + # iopscience.iop.org - matcher: doi: ^10.1088/.*$ resolver: @@ -220,10 +217,6 @@ pylon: timeout: 30 type: wait_css_selector - type: click - proxy_list: - - [proxy2] - - [proxy1] - - [proxy3] class: nexus.pylon.drivers.BrowserDriver matcher: doi: ^10.1093/.*$ @@ -246,6 +239,13 @@ pylon: args: timeout: 30.0 class: nexus.pylon.resolvers.TemplateResolver + # biorxiv.org + - matcher: + doi: ^10.1101/.*$ + resolver: + args: + format_string: 'https://www.biorxiv.org/content/{doi}.full.pdf' + class: nexus.pylon.resolvers.TemplateResolver # journals.aps.org - matcher: doi: ^10.1103/.*$ @@ -332,6 +332,13 @@ pylon: args: format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true' class: nexus.pylon.resolvers.TemplateResolver + # www.ahajournals.org + - matcher: + doi: ^10.1161/.*$ + resolver: + args: + format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true' + class: nexus.pylon.resolvers.TemplateResolver # ajp.psychiatryonline.org - matcher: doi: ^10.1176/.*$ @@ -355,8 +362,6 @@ pylon: class: nexus.pylon.resolvers.TemplateResolver # journals.plos.org - driver: - args: - 
proxy_list: ~ class: nexus.pylon.drivers.direct.DirectDriver matcher: doi: ^10.1371/.*$ @@ -364,6 +369,13 @@ pylon: args: format_string: 'https://journals.plos.org/plosone/article/file?id={doi}&type=printable' class: nexus.pylon.resolvers.TemplateResolver + # guilfordjournals.com + - matcher: + doi: ^10.1521/.*$ + resolver: + args: + format_string: 'https://guilfordjournals.com/doi/pdf/{doi}?download=true' + class: nexus.pylon.resolvers.TemplateResolver # bioone.org - driver: args: @@ -396,8 +408,6 @@ pylon: doi: ^10.2139/.*$ # www.afghandata.org - driver: - args: - proxy_list: ~ class: nexus.pylon.drivers.DirectDriver matcher: @@ -503,8 +513,6 @@ pylon: doi: ^10.5334/.*$ # hess.copernicus.org - driver: - args: - proxy_list: ~ class: nexus.pylon.drivers.DirectDriver matcher: doi: ^10.5194/.*$ @@ -524,7 +532,6 @@ pylon: - selector: '.uxf-download' type: wait_css_selector - type: click - proxy_list: ~ class: nexus.pylon.drivers.BrowserDriver matcher: doi: ^10.5585/.* @@ -539,6 +546,22 @@ pylon: args: format_string: 'https://jcsm.aasm.org/doi/pdf/{doi}?download=true' class: nexus.pylon.resolvers.TemplateResolver + # www.medwave.cl + - driver: + class: + nexus.pylon.drivers.DirectDriver + matcher: + doi: ^10.5867/.*$ + resolver: + args: + extractors: + - producer: + format_string: 'https://www.medwave.cl/{path}' + timeout: 25.0 + re: 'href=\"/(?P[\w/.\-_]+\.pdf)\">PDF' + type: regex + url: https://doi.org/{doi} + class: nexus.pylon.resolvers.RequestResolver # journal.permsc.ru - driver: args: @@ -584,8 +607,6 @@ pylon: doi: ^10.21203/.*$ # www.ukm.my/ - driver: - args: - proxy_list: ~ class: nexus.pylon.drivers.DirectDriver matcher: doi: ^10.24035/.*$ @@ -599,6 +620,22 @@ pylon: class: nexus.pylon.drivers.BrowserDriver matcher: doi: ^10.32920/.*$ + # PKP Project + - driver: + class: + nexus.pylon.drivers.DirectDriver + matcher: + doi: ^10.(5399|24905|31004|32729|37934)/.*$ + resolver: + args: + extractors: + - producer: + format_string: 
'https://{host}/{prefix}/{journal}/article/download/{key}' + timeout: 25.0 + re: 'href=\"(?:https?://[\w.]+)/(?P[\w./]+)/(?P[\w.]+)/article/view/(?P\w+/\w+)\"[^>]*>[Pp][Dd][Ff]\s*' + type: regex + url: https://doi.org/{doi} + class: nexus.pylon.resolvers.RequestResolver # papers.cumincad.org - driver: args: @@ -606,11 +643,16 @@ pylon: - selector: 'file.pdf' type: wait_link_text - type: click - proxy_list: ~ class: nexus.pylon.drivers.BrowserDriver matcher: doi: ^10.52842/.*$ # ^.*$ + - matcher: + doi: ^.*$ + resolver: + args: + selector: '.resource.primary.URL | select (. | ascii_downcase | contains("pdf"))' + class: nexus.pylon.resolvers.DoiOrgRequestResolver - matcher: doi: ^.*$ resolver: diff --git a/nexus/pylon/drivers/base.py b/nexus/pylon/drivers/base.py index 96fde80..addb6c7 100644 --- a/nexus/pylon/drivers/base.py +++ b/nexus/pylon/drivers/base.py @@ -4,22 +4,22 @@ from typing import ( Optional, ) +from izihawa_utils.importlib import import_object from nexus.pylon.network_agent import NetworkAgent from nexus.pylon.prepared_request import PreparedRequest from nexus.pylon.proxy_manager import ProxyManager -from nexus.pylon.validators.base import BaseValidator -from utils.izihawa_utils.importlib import import_object class BaseDriver(NetworkAgent): def __init__( self, + config, validator=None, - downloads_directory: str = '/downloads', proxy_list: Optional[List] = None, proxy_manager: Optional[ProxyManager] = None, ): super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) + self.config = config validator_cls = 'nexus.pylon.validators.PdfValidator' if validator and 'class' in validator: @@ -27,7 +27,6 @@ class BaseDriver(NetworkAgent): validator_cls = import_object(validator_cls) self.validator = validator_cls - self.downloads_directory = downloads_directory def __str__(self): return self.__class__.__name__ diff --git a/nexus/pylon/drivers/browser.py b/nexus/pylon/drivers/browser.py index 8d4ad25..b0963e2 100644 --- 
a/nexus/pylon/drivers/browser.py +++ b/nexus/pylon/drivers/browser.py @@ -32,21 +32,20 @@ from selenium.webdriver.support.ui import WebDriverWait class BrowserDriver(BaseDriver): def __init__( self, + config, validator=None, proxy_list: Optional[List] = None, proxy_manager: Optional[ProxyManager] = None, actions: Optional[List] = None, - downloads_directory='/downloads', - window_size: Tuple[int, int] = (1279, 833), - erase_webdrive_property: bool = True, - webdrive_hub_endpoint: str = "http://127.0.0.1:4444/wd/hub", ): - super().__init__(validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager) + super().__init__(config=config, validator=validator, proxy_list=proxy_list, proxy_manager=proxy_manager) self.actions = actions - self.downloads_directory = Path(downloads_directory) - self.window_size = window_size - self.erase_webdrive_property = erase_webdrive_property - self.webdrive_hub_endpoint = webdrive_hub_endpoint + self.downloads_directory = Path(config['webdriver_hub']['downloads_directory']) + self.host_downloads_directory = Path(config['webdriver_hub']['host_downloads_directory']) + self.window_size = tuple(config['webdriver_hub'].get('window_size', [1279, 833])) + self.erase_webdriver_property = config['webdriver_hub'].get('erase_webdriver_property', True) + self.webdriver_hub_endpoint = config['webdriver_hub']['endpoint'] + self.file_poll_timeout = 2.0 async def get_chrome_sessions(self): proxies = list( @@ -55,15 +54,14 @@ class BrowserDriver(BaseDriver): else [None] ) for proxy in proxies: - downloads_folder = self.downloads_directory / random_string(16) - os.mkdir(downloads_folder) - os.chmod(downloads_folder, 0o777) - chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_folder)) - try: - yield chrome, downloads_folder - finally: - shutil.rmtree(downloads_folder) - chrome.quit() + subdirectory = random_string(16) + downloads_directory = self.downloads_directory / subdirectory + 
host_downloads_directory = self.host_downloads_directory / subdirectory + os.mkdir(host_downloads_directory) + os.chmod(host_downloads_directory, 0o777) + chrome = await asyncio.get_running_loop().run_in_executor(None, lambda: self.setup_chrome(proxy, downloads_directory)) + yield chrome, host_downloads_directory + def setup_chrome(self, proxy, downloads_folder): options = webdriver.ChromeOptions() @@ -85,13 +83,13 @@ class BrowserDriver(BaseDriver): options.add_argument('--disable-dev-shm-usage') options.add_argument("--disable-popup-blocking") chrome = webdriver.Remote( - self.webdrive_hub_endpoint, + self.webdriver_hub_endpoint, DesiredCapabilities.CHROME, options=options, ) chrome.set_window_size(self.window_size[0], self.window_size[1]) - if self.erase_webdrive_property: + if self.erase_webdriver_property: resource = "/session/%s/chromium/send_command_and_get_result" % chrome.session_id url = chrome.command_executor._url + resource body = json.dumps({'cmd': "Page.addScriptToEvaluateOnNewDocument", 'params': { @@ -103,7 +101,7 @@ class BrowserDriver(BaseDriver): }}) chrome.command_executor._request('POST', url, body) - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'start_chrome', 'mode': 'pylon', 'proxy': str(proxy) if proxy is not None else None, @@ -148,32 +146,19 @@ class BrowserDriver(BaseDriver): and downloaded_offset == current_offset and current_offset > 0 ): - logging.getLogger('debug').debug({ - 'action': 'sent', - 'mode': 'pylon', - 'filename': filename, - }) return - - logging.getLogger('debug').debug({ - 'action': 'send_part', - 'mode': 'pylon', - 'current_offset': current_offset, - 'downloaded_offset': downloaded_offset, - 'filename': filename, - }) await file.seek(current_offset) yield await file.read(downloaded_offset - current_offset) current_offset = downloaded_offset - await asyncio.sleep(0.5) + await asyncio.sleep(self.file_poll_timeout) raise NotFoundError() finally: await file.close() def get(self, 
chrome, url, params): - logging.getLogger('debug').debug({ - 'action': 'get', + logging.getLogger('nexus_pylon').debug({ + 'action': 'download', 'mode': 'pylon', 'url': url, }) @@ -190,11 +175,6 @@ class BrowserDriver(BaseDriver): if not last_element: raise RuntimeError('Nothing to click') chrome.execute_script("arguments[0].click();", last_element) - logging.getLogger('debug').debug({ - 'action': 'clicked', - 'mode': 'pylon', - 'element': str(last_element), - }) case 'close_window': current_window = previous_window previous_window = None @@ -204,11 +184,6 @@ class BrowserDriver(BaseDriver): if not last_element: raise RuntimeError('Nothing to click') last_element.click() - logging.getLogger('debug').debug({ - 'action': 'native_clicked', - 'mode': 'pylon', - 'element': str(last_element), - }) case 'switch_to_new_window': previous_window = current_window current_window = chrome.window_handles[-1] @@ -227,12 +202,6 @@ class BrowserDriver(BaseDriver): action['selector'], )) ) - logging.getLogger('debug').debug({ - 'action': 'waited_css_selector', - 'mode': 'pylon', - 'element': str(last_element), - 'step': action - }) case 'wait_link_text': last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until( EC.presence_of_element_located(( @@ -240,12 +209,6 @@ class BrowserDriver(BaseDriver): action['selector'], )) ) - logging.getLogger('debug').debug({ - 'action': 'waited_link_text', - 'mode': 'pylon', - 'element': str(last_element), - 'step': action - }) case 'wait_xpath': last_element = WebDriverWait(chrome, action.get('timeout', 15.0)).until( EC.presence_of_element_located(( @@ -253,16 +216,10 @@ class BrowserDriver(BaseDriver): action['selector'], )) ) - logging.getLogger('debug').debug({ - 'action': 'waited_xpath', - 'mode': 'pylon', - 'element': str(last_element), - 'step': action - }) case _: raise NotImplementedError('Not implemented action type') except WebDriverException as e: - logging.getLogger('debug').debug({ + 
logging.getLogger('nexus_pylon').debug({ 'action': 'error', 'mode': 'pylon', 'error': str(e), @@ -294,15 +251,17 @@ class BrowserDriver(BaseDriver): source=chrome.current_url, ) file_validator.validate() - logging.getLogger('debug').debug({ - 'action': 'validated', - 'mode': 'pylon', - 'url': prepared_file_request.url, - }) return except NotFoundError: - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'no_response', 'mode': 'pylon', }) + finally: + logging.getLogger('nexus_pylon').debug({ + 'action': 'quit_chrome', + 'mode': 'pylon', + }) + chrome.quit() + shutil.rmtree(downloads_folder) raise NotFoundError(params=params, url=prepared_file_request.url, driver=str(self)) diff --git a/nexus/pylon/drivers/direct.py b/nexus/pylon/drivers/direct.py index 61db9fa..8b08546 100644 --- a/nexus/pylon/drivers/direct.py +++ b/nexus/pylon/drivers/direct.py @@ -1,3 +1,4 @@ +import logging from typing import Dict import aiohttp.client_exceptions @@ -25,12 +26,27 @@ class DirectDriver(BaseDriver): @retry( reraise=True, wait=wait_random(min=1, max=2), - stop=stop_after_attempt(7), + stop=stop_after_attempt(3), retry=retry_if_exception_type((ProxyError, aiohttp.client_exceptions.ClientPayloadError, ProxyTimeoutError)), ) async def execute_prepared_file_request(self, prepared_file_request: PreparedRequest, params: Dict): + logging.debug({ + 'action': 'download', + 'mode': 'pylon', + 'params': params, + 'source': str(self), + 'url': prepared_file_request.url, + }) async with self.get_session() as session: async with prepared_file_request.execute_with(session=session) as resp: + logging.debug({ + 'action': 'response', + 'mode': 'pylon', + 'params': params, + 'source': str(self), + 'url': prepared_file_request.url, + 'status': resp.status, + }) if resp.status == 404: raise NotFoundError(url=prepared_file_request.url) elif ( diff --git a/nexus/pylon/matcher.py b/nexus/pylon/matcher.py index 45cf385..dabb05c 100644 --- a/nexus/pylon/matcher.py +++ 
b/nexus/pylon/matcher.py @@ -1,5 +1,8 @@ import re -import sys +from typing import ( + List, + Tuple, +) class Matcher: @@ -10,8 +13,11 @@ class Matcher: def is_match(self, params) -> bool: for param in params: - if params[param]: - if param_regex := self.param_regexes.get(param): - if re.match(param_regex, params[param]): - return True - return False + param_value = params[param] + param_regex = self.param_regexes.get(param) + if param_value and param_regex: + if not isinstance(param_value, (List, Tuple)): + param_value = [param_value] + for el in param_value: + if re.match(param_regex, el): + return el diff --git a/nexus/pylon/network_agent.py b/nexus/pylon/network_agent.py index 3d57c35..37fce86 100644 --- a/nexus/pylon/network_agent.py +++ b/nexus/pylon/network_agent.py @@ -10,7 +10,7 @@ import aiohttp from aiohttp import ClientSession from aiohttp.client_reqrep import ClientRequest from aiohttp_socks import ProxyConnector -from library.aiokit.aiokit import AioThing +from aiokit import AioThing from nexus.pylon.proxy_manager import ( AllOf, AnyOf, diff --git a/nexus/pylon/pdftools/watermarks.py b/nexus/pylon/pdftools/watermarks.py index d49a842..5784347 100644 --- a/nexus/pylon/pdftools/watermarks.py +++ b/nexus/pylon/pdftools/watermarks.py @@ -228,7 +228,7 @@ class BasePdfProcessor: try: page = self.process_page(page, pdf_reader) except (PdfStreamError, binascii.Error) as e: - logging.getLogger('warning').warning({ + logging.getLogger('nexus_pylon').warning({ 'action': 'pdf_stream_error', 'mode': 'pylon', 'error': str(e), @@ -259,7 +259,7 @@ class WatermarkEraser1(BaseWatermarkEraser): if self.is_watermark_predicate(text.encode()): xobj_death_note.append(operands[0]) operations_death_note.append(op_i) - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'watermark_removal', 'mode': 'pylon', 'text': text, @@ -289,7 +289,7 @@ class WatermarkEraser2(BaseWatermarkEraser): if operation == b"Tj": if isinstance(operands[0], bytes) 
and self.is_watermark_predicate(operands[0]): operations_death_note.append(op_i) - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'watermark_removal', 'mode': 'pylon', 'text': operands[0].decode(), @@ -319,7 +319,7 @@ class WatermarkEraser3(BaseWatermarkEraser): text += operand if self.is_watermark_predicate(text): operations_death_note.append(op_i) - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'watermark_removal', 'mode': 'pylon', 'text': text.decode(), @@ -402,7 +402,7 @@ class WatermarkEraser4(BaseWatermarkEraser): text, matched = tc.match(self.regexp) if matched: operations_death_note.extend(matched) - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'watermark_removal', 'mode': 'pylon', 'matched': text, diff --git a/nexus/pylon/prepared_request.py b/nexus/pylon/prepared_request.py index 13f9149..21226b0 100644 --- a/nexus/pylon/prepared_request.py +++ b/nexus/pylon/prepared_request.py @@ -1,4 +1,5 @@ import asyncio +import logging from contextlib import asynccontextmanager from typing import Optional @@ -23,6 +24,7 @@ class PreparedRequest: cookies: Optional[dict] = None, ssl: bool = True, timeout: Optional[float] = None, + headers_override: bool = False ): self.method = method self.url = url @@ -32,6 +34,8 @@ class PreparedRequest: } if headers: self.headers.update(headers) + if headers_override: + self.headers = headers or {} self.params = params self.cookies = cookies self.ssl = ssl @@ -49,6 +53,13 @@ class PreparedRequest: @asynccontextmanager async def execute_with(self, session): try: + logging.getLogger('nexus_pylon').debug({ + 'action': 'request', + 'mode': 'pylon', + 'url': self.url, + 'method': self.method, + 'headers': self.headers, + }) async with session.request( method=self.method, url=self.url, diff --git a/nexus/pylon/proxy_manager.py b/nexus/pylon/proxy_manager.py index c622a6c..6a04ec8 100644 --- 
a/nexus/pylon/proxy_manager.py +++ b/nexus/pylon/proxy_manager.py @@ -54,6 +54,8 @@ class Proxy: class ProxyManager: def __init__(self, proxies=None): + if proxies is None: + proxies = [] self.proxies = [Proxy(proxy) for proxy in proxies] def get_proxy(self, tags: Optional[Union[AllOf, AnyOf, Set]] = None) -> Proxy: diff --git a/nexus/pylon/resolvers/doi_org_request.py b/nexus/pylon/resolvers/doi_org_request.py index 24f5d94..3f36cb2 100644 --- a/nexus/pylon/resolvers/doi_org_request.py +++ b/nexus/pylon/resolvers/doi_org_request.py @@ -1,5 +1,6 @@ import json import logging +import sys from typing import ( AsyncIterable, Dict, @@ -50,20 +51,25 @@ class DoiOrgRequestResolver(BaseResolver): method='get', url=doi_url, timeout=self.resolve_timeout, - headers={'Accept': 'application/json'} + headers={ + 'Accept': 'application/json', + } ).execute_with(session=session) as resp: return await resp.json() async def resolve(self, params: Dict) -> AsyncIterable[PreparedRequest]: body = await self.resolve_through_doi_org(params) + selected = None try: - selected = json.loads(self.selector.input(body).text()) + if text := self.selector.input(body).text(): + selected = json.loads(text) except ValueError as e: - logging.getLogger('error').error({ + logging.getLogger('nexus_pylon').error({ 'action': 'error', 'mode': 'pylon', 'params': params, - 'error': str(e) + 'error': str(e), + 'selector': str(self.selector), }) return if selected: @@ -73,7 +79,7 @@ class DoiOrgRequestResolver(BaseResolver): timeout=self.timeout, ) else: - logging.getLogger('debug').error({ + logging.getLogger('nexus_pylon').debug({ 'action': 'missed_selector', 'mode': 'pylon', 'params': params, diff --git a/nexus/pylon/resolvers/request.py b/nexus/pylon/resolvers/request.py index a4b1289..a42232d 100644 --- a/nexus/pylon/resolvers/request.py +++ b/nexus/pylon/resolvers/request.py @@ -15,10 +15,12 @@ class RequestResolver(BaseResolver): self, url: str, extractors: List, + resolve_timeout: float = 10.0, 
proxy_list: Optional[List] = None, proxy_manager: Optional[ProxyManager] = None, ): super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) + self.resolve_timeout = resolve_timeout self.url = url self.extractors = extractors @@ -31,9 +33,9 @@ class RequestResolver(BaseResolver): async with PreparedRequest( method='get', url=url, - timeout=10.0, + timeout=self.resolve_timeout, ).execute_with(session=session) as resp: - # Sometimes sci-hub returns file + # Sometimes hosts return file URL if resp.headers.get('Content-Type') == 'application/pdf': yield PreparedRequest(method='get', url=url, timeout=10.0) downloaded_page_bytes = await resp.read() @@ -42,9 +44,11 @@ class RequestResolver(BaseResolver): for extractor in self.extractors: match = re.search(extractor['re'], downloaded_page, re.IGNORECASE) if match: - matched_group = match.group(extractor['producer']['group']) yield PreparedRequest( method='get', - url=extractor['producer']['format_string'].format(matched_group=matched_group), + url=extractor['producer']['format_string'].format( + host=resp.real_url.host, + **match.groupdict() + ), timeout=extractor['producer'].get('timeout', 10.0), ) diff --git a/nexus/pylon/resolvers/template.py b/nexus/pylon/resolvers/template.py index 397e453..e84523b 100644 --- a/nexus/pylon/resolvers/template.py +++ b/nexus/pylon/resolvers/template.py @@ -14,19 +14,27 @@ class TemplateResolver(BaseResolver): self, format_string: str = 'https://doi.org/{doi}', timeout: float = 10.0, + method: str = 'GET', + headers: Optional[dict] = None, + headers_override: bool = False, proxy_list: Optional[List] = None, proxy_manager: Optional[ProxyManager] = None, ): super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) self.format_string = format_string self.timeout = timeout + self.method = method + self.headers = headers + self.headers_override = headers_override def __str__(self): return f'{self.__class__.__name__}({self.format_string})' async def resolve(self, params) 
-> AsyncIterable[PreparedRequest]: yield PreparedRequest( - method='GET', + method=self.method, url=self.format_string.format(**params), timeout=self.timeout, + headers=self.headers, + headers_override=self.headers_override, ) diff --git a/nexus/pylon/source.py b/nexus/pylon/source.py index 3a6be68..83840ac 100644 --- a/nexus/pylon/source.py +++ b/nexus/pylon/source.py @@ -2,12 +2,12 @@ import logging from typing import ( AsyncIterable, Dict, - List, + Optional, ) from aiohttp.client_exceptions import ClientPayloadError -from library.aiokit.aiokit import AioThing -from library.logging import error_log +from aiokit import AioThing +from izihawa_utils.importlib import import_object from nexus.pylon.drivers.base import BaseDriver from nexus.pylon.exceptions import ( DownloadError, @@ -16,7 +16,6 @@ from nexus.pylon.exceptions import ( from nexus.pylon.matcher import Matcher from nexus.pylon.proto.file_pb2 import FileResponse as FileResponsePb from nexus.pylon.resolvers.base import BaseResolver -from utils.izihawa_utils.importlib import import_object class Source(AioThing): @@ -29,12 +28,15 @@ class Source(AioThing): @classmethod def from_config( cls, - proxy_manager, + config, source_config, - downloads_directory: str, - default_driver_proxy_list: List, - default_resolver_proxy_list: List, - ) -> 'Source': + proxy_manager, + ) -> Optional['Source']: + driver_cls_name = source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver') + + if driver_cls_name.endswith('BrowserDriver') and config.get('webdriver_hub') is None: + return None + matcher = Matcher(source_config['matcher']) resolver_cls = import_object( @@ -42,16 +44,16 @@ class Source(AioThing): ) resolver_args = dict( proxy_manager=proxy_manager, - proxy_list=default_resolver_proxy_list, + proxy_list=config['default_resolver_proxy_list'], ) resolver_args.update(**source_config.get('resolver', {}).get('args', {})) resolver = resolver_cls(**resolver_args) - driver_cls = 
import_object(source_config.get('driver', {}).get('class', 'nexus.pylon.drivers.BrowserDriver')) + driver_cls = import_object(driver_cls_name) driver_args = dict( proxy_manager=proxy_manager, - downloads_directory=downloads_directory, - proxy_list=default_driver_proxy_list, + proxy_list=config['default_driver_proxy_list'], + config=config, ) driver_args.update(**source_config.get('driver', {}).get('args', {})) driver = driver_cls(**driver_args) @@ -67,13 +69,6 @@ class Source(AioThing): async def download(self, params: Dict) -> AsyncIterable[FileResponsePb]: yield FileResponsePb(status=FileResponsePb.Status.RESOLVING) async for prepared_file_request in self.resolver.resolve(params): - logging.debug({ - 'action': 'download', - 'mode': 'pylon', - 'params': params, - 'source': str(self), - 'url': prepared_file_request.url, - }) try: async for resp in self.driver.execute_prepared_file_request( prepared_file_request=prepared_file_request, @@ -82,11 +77,11 @@ class Source(AioThing): yield resp return except ClientPayloadError as e: - error_log(e, level=logging.WARNING) + logging.getLogger('nexus_pylon').warning(e) continue except NotFoundError: continue except DownloadError as e: - error_log(e) + logging.getLogger('nexus_pylon').warning(e) continue raise NotFoundError(params=params, resolver=str(self.resolver), driver=str(self.driver)) diff --git a/nexus/pylon/validators/__init__.py b/nexus/pylon/validators/__init__.py index 33cc2a0..835a2da 100644 --- a/nexus/pylon/validators/__init__.py +++ b/nexus/pylon/validators/__init__.py @@ -1,4 +1,5 @@ +from .base import BaseValidator from .md5 import Md5Validator from .pdf import PdfValidator -__all__ = ['Md5Validator', 'PdfValidator'] +__all__ = ['BaseValidator', 'Md5Validator', 'PdfValidator'] diff --git a/nexus/pylon/validators/base.py b/nexus/pylon/validators/base.py index cbf6e1c..54bf504 100644 --- a/nexus/pylon/validators/base.py +++ b/nexus/pylon/validators/base.py @@ -1,4 +1,10 @@ +from typing import Dict + + class 
BaseValidator: + def __init__(self, params: Dict): + self.params = params + def update(self, chunk): pass diff --git a/nexus/pylon/validators/md5.py b/nexus/pylon/validators/md5.py index d57660f..b24a651 100644 --- a/nexus/pylon/validators/md5.py +++ b/nexus/pylon/validators/md5.py @@ -7,6 +7,7 @@ from nexus.pylon.validators.base import BaseValidator class Md5Validator(BaseValidator): def __init__(self, params: Dict): + super().__init__(params) self.md5 = params['md5'] self.v = hashlib.md5() diff --git a/nexus/pylon/validators/pdf.py b/nexus/pylon/validators/pdf.py index d393419..6634a34 100644 --- a/nexus/pylon/validators/pdf.py +++ b/nexus/pylon/validators/pdf.py @@ -12,7 +12,7 @@ from PyPDF2.errors import PdfReadError class PdfValidator(BaseValidator): def __init__(self, params: Dict): - self.params = params + super().__init__(params) self.md5 = params.get('md5') self.file = bytes() self.v = hashlib.md5() @@ -24,7 +24,7 @@ class PdfValidator(BaseValidator): def validate(self): if self.md5 and self.md5.lower() == self.v.hexdigest().lower(): - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'validation', 'mode': 'pylon', 'result': 'md5_ok', @@ -32,7 +32,7 @@ class PdfValidator(BaseValidator): }) return elif not is_pdf(f=self.file): - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'validation', 'mode': 'pylon', 'result': 'not_pdf', @@ -41,28 +41,18 @@ class PdfValidator(BaseValidator): raise BadResponseError(file=str(self.file[:100])) try: - logging.getLogger('debug').debug({ - 'action': 'open_pdf', - 'mode': 'pylon', - 'file_len': len(self.file), - 'params': self.params, - }) PyPDF2.PdfReader(BytesIO(self.file)) - logging.getLogger('debug').debug({ - 'action': 'opened_pdf', - 'mode': 'pylon', - 'file_len': len(self.file), - 'params': self.params, - }) except PdfReadError: - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'validation', 'mode': 
'pylon', 'result': 'not_opened_as_pdf', + 'params': self.params, }) raise BadResponseError(file=str(self.file[:100])) - logging.getLogger('debug').debug({ + logging.getLogger('nexus_pylon').debug({ 'action': 'validation', 'mode': 'pylon', 'result': 'ok', + 'params': self.params, }) diff --git a/nexus/translations/BUILD.bazel b/nexus/translations/BUILD.bazel index b2a817a..96e44a2 100644 --- a/nexus/translations/BUILD.bazel +++ b/nexus/translations/BUILD.bazel @@ -10,6 +10,6 @@ py_library( srcs_version = "PY3", visibility = ["//visibility:public"], deps = [ - "//library/configurator", + requirement("izihawa_configurator"), ], ) diff --git a/nexus/translations/__init__.py b/nexus/translations/__init__.py index 9b6964b..335bf23 100644 --- a/nexus/translations/__init__.py +++ b/nexus/translations/__init__.py @@ -1,4 +1,4 @@ -from library.configurator import Configurator +from izihawa_configurator import Configurator def get_translations(): diff --git a/nexus/views/telegram/progress_bar.py b/nexus/views/telegram/progress_bar.py index 5d1bb6c..41e4106 100644 --- a/nexus/views/telegram/progress_bar.py +++ b/nexus/views/telegram/progress_bar.py @@ -33,7 +33,10 @@ class ProgressBar: tail_text, message=None, source=None, - throttle_secs: float = 0, + throttle_secs: float = 0.0, + hard_throttle_secs: float = 10.0, + last_call: float = 0.0, + done_threshold_size: int = 10 * 1024 * 1024, ): self.telegram_client = telegram_client self.request_context = request_context @@ -45,9 +48,12 @@ class ProgressBar: self.done = 0 self.total = 1 self.throttle_secs = throttle_secs + self.hard_throttle_secs = hard_throttle_secs + self.done_threshold_size = done_threshold_size + self.previous_done = 0 self.last_text = None - self.last_call = 0 + self.last_call = last_call def share(self): if self.total > 0: @@ -56,6 +62,7 @@ class ProgressBar: return f'{float(self.done / (1024 * 1024)):.1f}Mb' def _set_progress(self, done, total): + self.previous_done = self.done self.done = done self.total = 
total @@ -74,11 +81,20 @@ class ProgressBar: progress_bar = '|' + filled * bars['filled'] + (total_bars - filled) * bars['empty'] + '| ' tail_text = self.tail_text.format(source=self.source) - return f'`{self.header}\n{progress_bar}{self.share()} {tail_text}`' + return f'`{self.header}\n{progress_bar}{self.share().ljust(8)} {tail_text}`' + + def should_send(self, now, ignore_last_call): + if ignore_last_call: + return True + if abs(now - self.last_call) > self.hard_throttle_secs: + return True + if abs(now - self.last_call) > self.throttle_secs and (self.done - self.previous_done) < self.done_threshold_size: + return True + return False async def send_message(self, text, ignore_last_call=False): now = time.time() - if not ignore_last_call and abs(now - self.last_call) < self.throttle_secs: + if not self.should_send(now, ignore_last_call): return try: if not self.message: @@ -103,17 +119,3 @@ class ProgressBar: async def callback(self, done, total, ignore_last_call=False): self._set_progress(done, total) return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call) - - -class ThrottlerWrapper: - def __init__(self, callback: Callable, throttle_secs: Union[int, float]): - self.callback = callback - self.last_call = 0 - self.throttle_secs = throttle_secs - - async def __call__(self, *args, **kwargs): - now = time.time() - if abs(now - self.last_call) < self.throttle_secs: - return - self.last_call = now - return await self.callback(*args, **kwargs) diff --git a/nexus/views/telegram/scimag.py b/nexus/views/telegram/scimag.py index ddd7fad..2915e15 100644 --- a/nexus/views/telegram/scimag.py +++ b/nexus/views/telegram/scimag.py @@ -63,7 +63,6 @@ class ScimagViewBuilder(BaseViewBuilder): 'chapter': '🔖', 'book-chapter': '🔖', } - multihash_ix = 0 def is_preprint(self): return self.document_holder.doi.split('/')[0] in preprints diff --git a/rules/python/requirements-lock.txt b/rules/python/requirements-lock.txt index 4308082..b0ac838 100644 
--- a/rules/python/requirements-lock.txt +++ b/rules/python/requirements-lock.txt @@ -13,7 +13,7 @@ aiohttp-socks==0.7.1 aiokafka==0.7.2 aiokit==1.1.2 aiosignal==1.2.0 -aiosumma==2.8.13 +aiosumma==2.10.4 asn1crypto==1.5.1 async-generator==1.10 async-timeout==4.0.2 @@ -55,7 +55,7 @@ h11==0.13.0 idna==3.3 iniconfig==1.1.1 isort==5.10.1 -izihawa-nlptools==1.1.7 +izihawa-nlptools==1.1.9 izihawa-types==0.1.3 izihawa-utils==1.0.7 Jinja2==3.1.2