diff --git a/library/telegram/base.py b/library/telegram/base.py index 28a7fb2..d87f8b5 100644 --- a/library/telegram/base.py +++ b/library/telegram/base.py @@ -39,6 +39,7 @@ class BaseTelegramClient(AioThing): bot_token: Optional[str] = None, mtproxy: Optional[dict] = None, flood_sleep_threshold: int = 60, + catch_up: bool = False, ): super().__init__() if not app_id or not app_hash: @@ -50,6 +51,7 @@ class BaseTelegramClient(AioThing): self._get_session(database), app_id, app_hash, + catch_up=catch_up, flood_sleep_threshold=flood_sleep_threshold, **self._get_proxy(mtproxy=mtproxy), ) diff --git a/nexus/bot/promotioner.py b/library/telegram/promotioner.py similarity index 61% rename from nexus/bot/promotioner.py rename to library/telegram/promotioner.py index b6e4aab..e53bcc5 100644 --- a/nexus/bot/promotioner.py +++ b/library/telegram/promotioner.py @@ -5,9 +5,17 @@ class Promotioner: """ Promotioner is used to select promotion randomly based on weights of every promotion. """ - def __init__(self, promotions: list[dict], default_promotion_index: int = 0): + def __init__( + self, + promotions: list[dict], + default_promotion_index: int = 0, + promotion_vars: dict = None, + ): self.promotions = promotions self.default_promotion_index = default_promotion_index + if not promotion_vars: + promotion_vars = {} + self.promotion_vars = promotion_vars self.partial_sums: list = [self.promotions[0]['weight']] for promotion in self.promotions[1:]: self.partial_sums.append(promotion['weight'] + self.partial_sums[-1]) @@ -18,11 +26,11 @@ class Promotioner: if partial_sum <= pivot: continue if language in promotion['texts']: - return promotion['texts'][language] + return promotion['texts'][language].format(**self.promotion_vars) elif promotion.get('local', False): default_promotion = self.promotions[self.default_promotion_index] if language in default_promotion['texts']: - return default_promotion['texts'][language] - return default_promotion['texts']['en'] + return default_promotion['texts'][language].format(**self.promotion_vars) + return default_promotion['texts']['en'].format(**self.promotion_vars) else: - return promotion['texts']['en'] + return promotion['texts']['en'].format(**self.promotion_vars) diff --git a/library/telegram/utils.py b/library/telegram/utils.py index 008714b..ddc49c1 100644 --- a/library/telegram/utils.py +++ b/library/telegram/utils.py @@ -19,6 +19,7 @@ async def safe_execution( error_log=error_log, on_fail: Optional[Callable[[], Awaitable]] = None, level=logging.WARNING, + is_logging_enabled: bool = True ): try: try: @@ -34,13 +35,17 @@ async def safe_execution( errors.MessageIdInvalidError, errors.ChatAdminRequiredError, ) as e: - error_log(e, level=level) + if is_logging_enabled: + error_log(e, level=level) + traceback.print_exc() except Exception as e: - error_log(e, level=level) - traceback.print_exc() + if is_logging_enabled: + error_log(e, level=level) + traceback.print_exc() if on_fail: await on_fail() except events.StopPropagation: raise except Exception as e: - error_log(e, level=level) + if is_logging_enabled: + error_log(e, level=level) diff --git a/nexus/actions/crossref_api.py b/nexus/actions/crossref_api.py index 25d7be0..cfef142 100644 --- a/nexus/actions/crossref_api.py +++ b/nexus/actions/crossref_api.py @@ -76,6 +76,11 @@ def clean_issns(issns): return cleaned_issns +def clean_isbns(isbns): + if isbns: + return isbns + + def extract_title(title, subtitle): return ': '.join(filter(lambda x: bool(x), [title.strip(), subtitle.strip()])) @@ -90,6 +95,7 @@ class ToScimagPbAction(BaseAction): doi=item['DOI'], issue=item.get('issue'), issns=clean_issns(item.get('ISSN')), + isbns=clean_isbns(item.get('ISBN')), language=item.get('language'), referenced_by_count=item.get('is-referenced-by-count'), references=extract_references(item.get('reference')), diff --git a/nexus/bot/BUILD.bazel b/nexus/bot/BUILD.bazel index 94285c7..10f7aad 100644 --- a/nexus/bot/BUILD.bazel +++ b/nexus/bot/BUILD.bazel @@ -42,6 +42,8 @@ py3_image( "//nexus/hub/aioclient", "//nexus/meta_api/aioclient", "//nexus/models/proto:proto_py", + "//nexus/promotions", + "//nexus/translations", "//nexus/views/telegram", requirement("izihawa_nlptools"), requirement("izihawa_utils"), diff --git a/nexus/bot/application.py b/nexus/bot/application.py index 25d665f..7b815db 100644 --- a/nexus/bot/application.py +++ b/nexus/bot/application.py @@ -2,10 +2,11 @@ from aiokit import AioRootThing from idm.api.aioclient import IdmApiGrpcClient from izihawa_utils.importlib import import_object from library.telegram.base import BaseTelegramClient -from nexus.bot.promotioner import Promotioner +from library.telegram.promotioner import Promotioner from nexus.bot.user_manager import UserManager from nexus.hub.aioclient import HubGrpcClient from nexus.meta_api.aioclient import MetaApiGrpcClient +from nexus.promotions import get_promotions class TelegramApplication(AioRootThing): @@ -18,6 +19,7 @@ class TelegramApplication(AioRootThing): bot_token=self.config['telegram']['bot_token'], database=self.config['telegram'].get('database'), mtproxy=self.config['telegram'].get('mtproxy'), + catch_up=self.config['telegram'].get('catch_up', False) ) self.hub_client = HubGrpcClient(endpoint=self.config['hub']['endpoint']) self.starts.append(self.hub_client) @@ -28,7 +30,13 @@ class TelegramApplication(AioRootThing): self.meta_api_client = MetaApiGrpcClient(endpoint=self.config['meta_api']['endpoint']) self.starts.append(self.meta_api_client) - self.promotioner = Promotioner(promotions=self.config['promotions']) + self.promotioner = Promotioner( + promotions=get_promotions(), + promotion_vars=dict( + related_channel=self.config['telegram']['related_channel'], + twitter_contact_url=self.config['twitter']['contact_url'], + ) + ) self.user_manager = UserManager() self._handlers = [] diff --git a/nexus/bot/configs/__init__.py b/nexus/bot/configs/__init__.py index 4a7ab92..a954719 100644 --- a/nexus/bot/configs/__init__.py +++ b/nexus/bot/configs/__init__.py @@ -7,7 +7,6 @@ def get_config(): 'nexus/bot/configs/base.yaml', 'nexus/bot/configs/%s.yaml?' % env.type, 'nexus/bot/configs/logging.yaml', - 'nexus/bot/configs/promotions.yaml', ], env_prefix='NEXUS_BOT') diff --git a/nexus/bot/configs/promotions.yaml b/nexus/bot/configs/promotions.yaml deleted file mode 100644 index 233962a..0000000 --- a/nexus/bot/configs/promotions.yaml +++ /dev/null @@ -1,36 +0,0 @@ ---- - -promotions: - - texts: - en: 💬 The victory of humanity is inevitable - weight: 1 - - texts: - en: 💬 Shall build Standard Template Construct - weight: 1 - - texts: - en: 💬 Gaining knowledge is the only purpose of life - weight: 1 - - texts: - en: 💬 Knowledge cannot belong - weight: 1 - - texts: - en: 💬 Obey the path of discovery - weight: 1 - - texts: - en: 💬 Research is the only and ultimate goal - weight: 1 - - texts: - en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/nexus_aaron) - ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/nexus_aaron) - weight: 25 - - texts: - en: ✋ Help us, become a seeder of books. Learn how in /seed - ru: ✋ Сохрани наследие, раздавай книги нуждающимся. Узнай как в /seed - weight: 25 - - texts: - en: ⤴️ Stay tuned with us at @{related_channel} and [Twitter]({twitter_contact_url}) - es: ⤴️ Mantente en contacto con nosotros en @{related_channel} y [Twitter]({twitter_contact_url}) - it: ⤴️ Resta aggiornato con noi su @{related_channel} e [Twitter]({twitter_contact_url}) - pb: ⤴️ Fique ligado conosco em @{related_channel} e [Twitter]({twitter_contact_url}) - ru: ⤴️ Оставайся на связи с нами на @{related_channel} и в [Twitter]({twitter_contact_url}) - weight: 25 diff --git a/nexus/bot/handlers/close.py b/nexus/bot/handlers/close.py index caf6f5a..815f260 100644 --- a/nexus/bot/handlers/close.py +++ b/nexus/bot/handlers/close.py @@ -2,6 +2,7 @@ import asyncio import time from library.telegram.base import RequestContext +from library.telegram.utils import safe_execution from nexus.translations import t from telethon import events @@ -36,5 +37,6 @@ class CloseHandler(BaseCallbackQueryHandler): target_events.append(reply_message.delete()) target_events.append(message.delete()) else: - target_events.append(event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE'))) + async with safe_execution(is_logging_enabled=False): + await event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE')) await asyncio.gather(*target_events) diff --git a/nexus/bot/handlers/download.py b/nexus/bot/handlers/download.py index c0e66e9..4de2ac0 100644 --- a/nexus/bot/handlers/download.py +++ b/nexus/bot/handlers/download.py @@ -1,4 +1,5 @@ from library.telegram.base import RequestContext +from library.telegram.utils import safe_execution from nexus.hub.proto.delivery_service_pb2 import \ StartDeliveryResponse as StartDeliveryResponsePb from nexus.translations import t @@ -45,14 +46,16 @@ class DownloadHandler(BaseCallbackQueryHandler): bot_name=request_context.bot_name, ) if start_delivery_response_pb.status == StartDeliveryResponsePb.Status.ALREADY_DOWNLOADING: - await event.answer( - f'{t("ALREADY_DOWNLOADING", request_context.chat.language)}', - ) + async with safe_execution(is_logging_enabled=False): + await event.answer( + f'{t("ALREADY_DOWNLOADING", request_context.chat.language)}', + ) await remove_button(event, '⬇️', and_empty_too=True) elif start_delivery_response_pb.status == StartDeliveryResponsePb.Status.TOO_MANY_DOWNLOADS: - await event.answer( - f'{t("TOO_MANY_DOWNLOADS", request_context.chat.language)}', - ) + async with safe_execution(is_logging_enabled=False): + await event.answer( + f'{t("TOO_MANY_DOWNLOADS", request_context.chat.language)}', + ) else: await remove_button(event, '⬇️', and_empty_too=True) self.application.user_manager.last_widget[request_context.chat.chat_id] = None diff --git a/nexus/bot/handlers/profile.py b/nexus/bot/handlers/profile.py index 661a7cc..1cf92be 100644 --- a/nexus/bot/handlers/profile.py +++ b/nexus/bot/handlers/profile.py @@ -61,7 +61,7 @@ class ProfileHandler(BaseHandler): target_events.append(profile_reply_message.reply(rendered_widget, buttons=buttons, link_preview=False)) else: target_events.append(event.reply(rendered_widget, buttons=buttons, link_preview=False)) - return asyncio.gather(*target_events) + return await asyncio.gather(*target_events) class DigestHandler(BaseHandler): @@ -114,6 +114,7 @@ class DigestHandler(BaseHandler): document_holders=document_holders, bot_name=bot_name, header='✨ Nexus Discovery ✨', + promotioner=self.application.promotioner, ) view, buttons = await document_list_widget.render() diff --git a/nexus/bot/handlers/roll.py b/nexus/bot/handlers/roll.py index 5981a63..5f375e3 100644 --- a/nexus/bot/handlers/roll.py +++ b/nexus/bot/handlers/roll.py @@ -39,10 +39,7 @@ class RollHandler(BaseHandler): if random_documents: holder = BaseHolder.create_from_document(random_documents[0]) - promo = self.application.promotioner.choose_promotion(language).format( - related_channel=self.application.config['telegram']['related_channel'], - twitter_contact_url=self.application.config['twitter']['contact_url'], - ) + promo = self.application.promotioner.choose_promotion(language) view = holder.view_builder(language).add_view(bot_name=bot_name).add_new_line(2).add(promo, escaped=True).build() buttons_builder = holder.buttons_builder(language) @@ -54,5 +51,5 @@ class RollHandler(BaseHandler): request_context.statbox(action='show', duration=time.time() - start_time) await event.respond(view, buttons=buttons_builder.build()) - async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG): + async with safe_execution(is_logging_enabled=False): await event.delete() diff --git a/nexus/bot/handlers/search.py b/nexus/bot/handlers/search.py index 2099631..50ce5cf 100644 --- a/nexus/bot/handlers/search.py +++ b/nexus/bot/handlers/search.py @@ -210,6 +210,7 @@ class SearchEditHandler(BaseSearchHandler): if request_context.is_group_mode() and not search_prefix: return + if request_context.is_personal_mode() and search_prefix: query = event.raw_text @@ -288,7 +289,7 @@ class SearchPagingHandler(BaseCallbackQueryHandler): ) serp, buttons = await search_widget.render(message_id=message_id) - return await asyncio.gather( - event.answer(), - message.edit(serp, buttons=buttons, link_preview=False) - ) + + await message.edit(serp, buttons=buttons, link_preview=False) + async with safe_execution(is_logging_enabled=False): + await event.answer() diff --git a/nexus/bot/handlers/submit.py b/nexus/bot/handlers/submit.py index 34d5706..0a9188c 100644 --- a/nexus/bot/handlers/submit.py +++ b/nexus/bot/handlers/submit.py @@ -47,13 +47,13 @@ class SubmitHandler(BaseHandler): reply_to = reply_message.id doi_hint = self.get_doi_hint(message=message, reply_message=reply_message) - doi_hint_priority = '⚡' in message.raw_text + skip_analysis = '⚡️' in message.raw_text user_id = message.sender_id request_context.statbox( action='analyzed', mode='submit', doi_hint=doi_hint, - doi_hint_priority=doi_hint_priority, + skip_analysis=skip_analysis, reply_to=reply_to, ) @@ -71,12 +71,12 @@ class SubmitHandler(BaseHandler): request_id=request_context.request_id, session_id=session_id, doi_hint=doi_hint, - doi_hint_priority=doi_hint_priority, + skip_analysis=skip_analysis, uploader_id=user_id, ) case 'application/zip': - try: - if request_context.is_personal_mode(): + if request_context.is_personal_mode(): + try: file_data = await self.application.telegram_client.download_document( document=event.document, file=bytes, @@ -105,10 +105,8 @@ class SubmitHandler(BaseHandler): session_id=session_id, uploader_id=user_id, ) - else: - await event.reply(t('ZIP_FILES_ARE_NOT_SUPPORTED_IN_GROUP_MODE', request_context.chat.language)) - finally: - return await event.delete() + finally: + return await event.delete() case _: request_context.statbox(action='unknown_file_format') request_context.error_log(UnknownFileFormatError(format=event.document.mime_type)) diff --git a/nexus/bot/handlers/view.py b/nexus/bot/handlers/view.py index b36557a..d562091 100644 --- a/nexus/bot/handlers/view.py +++ b/nexus/bot/handlers/view.py @@ -85,10 +85,7 @@ class ViewHandler(BaseHandler): holder = BaseHolder.create(typed_document_pb=typed_document_pb) back_command = await self.compose_back_command(session_id=session_id, message_id=message_id, page=page) - promo = self.application.promotioner.choose_promotion(language).format( - related_channel=self.application.config['telegram']['related_channel'], - twitter_contact_url=self.application.config['twitter']['contact_url'], - ) + promo = self.application.promotioner.choose_promotion(language) view_builder = holder.view_builder(language).add_view( bot_name=self.application.config['telegram']['bot_name'] ).add_new_line(2).add(promo, escaped=True) diff --git a/nexus/bot/widgets/search_widget.py b/nexus/bot/widgets/search_widget.py index c3fc7ad..8d2c284 100644 --- a/nexus/bot/widgets/search_widget.py +++ b/nexus/bot/widgets/search_widget.py @@ -154,10 +154,7 @@ class SearchWidget(BaseSearchWidget): ) promotion_language = self.query_language or self.chat.language - promo = self.application.promotioner.choose_promotion(promotion_language).format( - related_channel=self.application.config['telegram']['related_channel'], - twitter_contact_url=self.application.config['twitter']['contact_url'], - ) + promo = self.application.promotioner.choose_promotion(promotion_language) serp = f'{serp}\n\n{promo}\n' buttons = None diff --git a/nexus/hub/aioclient/aioclient.py b/nexus/hub/aioclient/aioclient.py index 17b0e8f..8985ddb 100644 --- a/nexus/hub/aioclient/aioclient.py +++ b/nexus/hub/aioclient/aioclient.py @@ -58,7 +58,7 @@ class HubGrpcClient(BaseGrpcClient): bot_name: str, reply_to: Optional[int] = None, doi_hint: Optional[str] = None, - doi_hint_priority: bool = False, + skip_analysis: bool = False, request_id: Optional[str] = None, session_id: Optional[str] = None, uploader_id: Optional[int] = None @@ -68,7 +68,7 @@ class HubGrpcClient(BaseGrpcClient): bot_name=bot_name, reply_to=reply_to, doi_hint=doi_hint, - doi_hint_priority=doi_hint_priority, + skip_analysis=skip_analysis, uploader_id=uploader_id, ) if isinstance(file, submitter_service_pb2.PlainFile): diff --git a/nexus/hub/configs/pylon.yaml b/nexus/hub/configs/pylon.yaml index 9ec777b..5091ed1 100644 --- a/nexus/hub/configs/pylon.yaml +++ b/nexus/hub/configs/pylon.yaml @@ -4,6 +4,7 @@ pylon: - [cambridge] - [edinburg] - [southampton] + default_resolver_proxy_list: ~ downloads_directory: /downloads proxies: - address: clash.default.svc.cluster.example.com:7890 @@ -15,6 +16,9 @@ pylon: - address: clash.default.svc.cluster.example.com:8090 name: southampton tags: ['southampton'] + - address: socks5://clash.default.svc.cluster.example.com:7991 + name: socks5 + tags: ['socks5'] sources: # LibGen.rocks - driver: @@ -331,6 +335,13 @@ pylon: args: format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true' class: nexus.pylon.resolvers.TemplateResolver + # www.ahajournals.org + - matcher: + doi: ^10.1161/.*$ + resolver: + args: + format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true' + class: nexus.pylon.resolvers.TemplateResolver # ajp.psychiatryonline.org - matcher: doi: ^10.1176/.*$ diff --git a/nexus/hub/proto/submitter_service.proto b/nexus/hub/proto/submitter_service.proto index 4c20e4d..7457a9d 100644 --- a/nexus/hub/proto/submitter_service.proto +++ b/nexus/hub/proto/submitter_service.proto @@ -23,7 +23,7 @@ message SubmitRequest { string bot_name = 4; optional int64 reply_to = 5; optional string doi_hint = 6; - bool doi_hint_priority = 7; + bool skip_analysis = 7; int64 uploader_id = 8; } message SubmitResponse { } diff --git a/nexus/hub/services/base.py b/nexus/hub/services/base.py index c94a748..0ff6939 100644 --- a/nexus/hub/services/base.py +++ b/nexus/hub/services/base.py @@ -1,5 +1,6 @@ import asyncio +from aiobaseclient.exceptions import BadRequestError from library.aiogrpctools.base import BaseService from library.telegram.common import close_button from nexus.views.telegram.common import vote_button @@ -16,8 +17,39 @@ def is_group_or_channel(chat_id: int): return chat_id < 0 +class ProcessedDocument: + def __init__(self, processed_document): + self.processed_document = processed_document + + @staticmethod + async def setup(file_data, grobid_client, request_context): + try: + processed_document = await grobid_client.process_fulltext_document(pdf_file=file_data) + except BadRequestError as e: + request_context.statbox(action='unparsable_document') + request_context.error_log(e) + processed_document = {} + return ProcessedDocument(processed_document) + + @property + def doi(self): + return self.processed_document.get('doi') + + @property + def title(self): + return self.processed_document.get('title') + + @property + def abstract(self): + return self.processed_document.get('abstract') + + @property + def body(self): + return self.processed_document.get('body') + + class BaseHubService(BaseService): - async def item_found(self, bot_name, doi): + async def found_item(self, bot_name, doi): if mutual_aid_service := self.application.mutual_aid_services.get(bot_name): await mutual_aid_service.delete_request(doi) await self.application.idm_client.reschedule_subscriptions( @@ -36,13 +68,13 @@ class BaseHubService(BaseService): ) )) - def set_fields_from_processed(self, document_pb, processed_document): + def set_fields_from_processed(self, document_pb, processed_document: ProcessedDocument): new_fields = [] - if processed_document.get('abstract') and not document_pb.abstract: - document_pb.abstract = processed_document['abstract'] + if processed_document.abstract and not document_pb.abstract: + document_pb.abstract = processed_document.abstract new_fields.append('abstract') - if processed_document.get('body') and not document_pb.content: - document_pb.content = processed_document['body'] + if processed_document.body and not document_pb.content: + document_pb.content = processed_document.body new_fields.append('content') return new_fields diff --git a/nexus/hub/services/delivery.py b/nexus/hub/services/delivery.py index e817398..91f0519 100644 --- a/nexus/hub/services/delivery.py +++ b/nexus/hub/services/delivery.py @@ -46,6 +46,7 @@ from pypika import ( from .base import ( BaseHubService, + ProcessedDocument, is_group_or_channel, ) @@ -76,6 +77,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService proxies=pylon_config['proxies'], source_configs=pylon_config['sources'], default_driver_proxy_list=pylon_config['default_driver_proxy_list'], + default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'], downloads_directory=pylon_config['downloads_directory'], ) self.should_parse_with_grobid = should_parse_with_grobid @@ -216,11 +218,12 @@ class DownloadTask: error_log=request_context.error_log, on_fail=_on_fail, ): + filename = document_holder.get_filename() progress_bar_download = ProgressBar( telegram_client=self.application.telegram_clients[request_context.bot_name], request_context=request_context, banner=t("LOOKING_AT", request_context.chat.language), - header=f'⬇️ {document_holder.get_filename()}', + header=f'⬇️ {filename}', tail_text=t('TRANSMITTED_FROM', request_context.chat.language), throttle_secs=throttle_secs, ) @@ -239,6 +242,10 @@ class DownloadTask: ) if not document_holder.md5 and document_holder.get_extension() == 'pdf': try: + await progress_bar_download.send_message( + t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename), + ignore_last_call=True + ) file = clean_metadata(file, doi=document_holder.doi) request_context.statbox( action='cleaned', @@ -251,7 +258,7 @@ class DownloadTask: request_context=request_context, message=progress_bar_download.message, banner=t("LOOKING_AT", request_context.chat.language), - header=f'⬇️ {document_holder.get_filename()}', + header=f'⬇️ {filename}', tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language), throttle_secs=throttle_secs ) @@ -264,7 +271,7 @@ class DownloadTask: voting=not is_group_or_channel(self.request_context.chat.chat_id), ) if self.document_holder.doi: - await self.delivery_service.item_found( + await self.delivery_service.found_item( bot_name=request_context.bot_name, doi=self.document_holder.doi, ) @@ -277,6 +284,7 @@ class DownloadTask: document_holder=document_holder, telegram_file_id=uploaded_message.file.id, file=file, + request_context=request_context, )) else: request_context.statbox( @@ -425,7 +433,7 @@ class DownloadTask: self.task.cancel() await self.task - async def store_new_data(self, bot_name, document_holder, telegram_file_id, file): + async def store_new_data(self, bot_name, document_holder, telegram_file_id, file, request_context): document_pb = document_holder.document_pb new_fields = [] if self.delivery_service.should_store_hashes: @@ -448,7 +456,11 @@ class DownloadTask: and document_holder.index_alias == 'scimag' ): try: - processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file) + processed_document = await ProcessedDocument.setup( + file, + grobid_client=self.application.grobid_client, + request_context=request_context, + ) new_fields += self.delivery_service.set_fields_from_processed(document_pb, processed_document) except BadRequestError as e: self.request_context.statbox(action='unparsable_document') diff --git a/nexus/hub/services/submitter.py b/nexus/hub/services/submitter.py index d45c94a..56d7830 100644 --- a/nexus/hub/services/submitter.py +++ b/nexus/hub/services/submitter.py @@ -34,7 +34,10 @@ from nexus.views.telegram.base_holder import ScimagHolder from telethon.errors import ChatAdminRequiredError from telethon.extensions import BinaryReader -from .base import BaseHubService +from .base import ( + BaseHubService, + ProcessedDocument, +) async def operation_log(document_operation_pb): @@ -87,22 +90,48 @@ class PlainFile: def fuzzy_compare(a, b): - if a is None or b is None: - return False a = re.sub(r'[^a-z\d]', '', a.lower()) b = re.sub(r'[^a-z\d]', '', b.lower()) return SequenceMatcher(None, a, b).ratio() > 0.9 -async def delayed_task(task, seconds): - await asyncio.sleep(seconds) - await task - - class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubService): async def start(self): submitter_service_pb2_grpc.add_SubmitterServicer_to_server(self, self.application.server) + def wrap_request_file(self, request, request_context): + match str(request.WhichOneof('file')): + case 'plain': + return PlainFile(request.plain) + case 'telegram': + return TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram) + case _: + raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}") + + async def retrieve_metadata(self, doi, title, session_id, request_context): + if doi: + meta_search_response = await self.application.meta_api_client.meta_search( + index_aliases=['scimag', ], + query=doi, + collectors=[{'top_docs': {'limit': 1}}], + session_id=session_id, + request_id=request_context.request_id, + user_id=str(request_context.chat.chat_id), + query_tags=['submitter'], + ) + scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents + if len(scored_documents) == 1: + scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document)) + if title is not None and not fuzzy_compare(scimag_pb.title, title): + request_context.statbox( + action='mismatched_title', + doi=doi, + processed_title=title, + title=scimag_pb.title, + ) + return None + return scimag_pb + @aiogrpc_request_wrapper(log=False) async def submit( self, @@ -125,17 +154,10 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ ) buttons = None if request_context.is_group_mode() else [close_button()] + wrapped_file = self.wrap_request_file(request, request_context) - match str(request.WhichOneof('file')): - case 'plain': - file = PlainFile(request.plain) - case 'telegram': - file = TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram) - case _: - raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}") - - if file.size > 30 * 1024 * 1024: - request_context.error_log(FileTooBigError(size=file.size)) + if wrapped_file.size > 300 * 1024 * 1024: + request_context.error_log(FileTooBigError(size=wrapped_file.size)) request_context.statbox(action='file_too_big') async with safe_execution(error_log=request_context.error_log): await self.application.telegram_clients[request_context.bot_name].send_message( @@ -149,109 +171,59 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ try: processing_message = await self.application.telegram_clients[request_context.bot_name].send_message( request_context.chat.chat_id, - t("PROCESSING_PAPER", request_context.chat.language).format(filename=file.filename), + t("PROCESSING_PAPER", request_context.chat.language).format(filename=wrapped_file.filename), reply_to=request.reply_to, ) except ChatAdminRequiredError: return submitter_service_pb2.SubmitResponse() try: - file_data = await file.read() - processed_document = None - pdf_doi = None - pdf_title = None - try: - processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file_data) - pdf_doi = processed_document.get('doi') - pdf_title = processed_document.get('title') - request_context.add_default_fields(pdf_doi=pdf_doi) - except BadRequestError as e: - request_context.statbox(action='unparsable_document') - request_context.error_log(e) - if not request.doi_hint: - await self.application.telegram_clients[request_context.bot_name].send_message( - request_context.chat.chat_id, - t('UNPARSABLE_DOCUMENT_ERROR', request_context.chat.language).format( - filename=file.filename, - ), - buttons=buttons, - reply_to=request.reply_to, - ) - return submitter_service_pb2.SubmitResponse() + file_data = await wrapped_file.read() + if not request.skip_analysis: + processed_document = await ProcessedDocument.setup( + file_data, + grobid_client=self.application.grobid_client, + request_context=request_context, + ) + else: + processed_document = ProcessedDocument({}) - if request.doi_hint and pdf_doi != request.doi_hint: - request_context.statbox(action='mismatched_doi', doi_hint_priority=request.doi_hint_priority) - if request.doi_hint_priority: - pdf_doi = request.doi_hint - - if not pdf_doi and not request.doi_hint: + if not processed_document.doi and not request.doi_hint: request_context.statbox(action='unparsable_doi') request_context.error_log(UnparsableDoiError()) await self.application.telegram_clients[request_context.bot_name].send_message( request_context.chat.chat_id, t('UNPARSABLE_DOI_ERROR', request_context.chat.language).format( - filename=file.filename, + filename=wrapped_file.filename, ), buttons=buttons, reply_to=request.reply_to, ) return submitter_service_pb2.SubmitResponse() - scimag_pb = None - - if pdf_doi: - meta_search_response = await self.application.meta_api_client.meta_search( - index_aliases=['scimag',], - query=pdf_doi, - collectors=[{'top_docs': {'limit': 1}}], - session_id=session_id, - request_id=request_context.request_id, - user_id=str(request_context.chat.chat_id), - query_tags=['submitter'], - ) - scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents - if len(scored_documents) == 1: - scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document)) - if not fuzzy_compare(scimag_pb.title, pdf_title): - request_context.statbox( - action='mismatched_title', - doi=pdf_doi, - pdf_title=pdf_title, - title=scimag_pb.title, - ) - scimag_pb = None - + scimag_pb = await self.retrieve_metadata( + processed_document.doi, + processed_document.title, + session_id=session_id, + request_context=request_context, + ) if not scimag_pb and request.doi_hint: - meta_search_response = await self.application.meta_api_client.meta_search( - index_aliases=['scimag', ], - query=request.doi_hint, - collectors=[{'top_docs': {'limit': 1}}], + scimag_pb = await self.retrieve_metadata( + request.doi_hint, + processed_document.title, session_id=session_id, - request_id=request_context.request_id, - user_id=str(request_context.chat.chat_id), - query_tags=['submitter'], + request_context=request_context, ) - scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents - if len(scored_documents) == 1: - scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document)) - if not fuzzy_compare(scimag_pb.title, pdf_title): - request_context.statbox( - action='mismatched_title', - doi=request.doi_hint, - pdf_title=pdf_title, - title=scimag_pb.title, - ) - # ToDo: add trust mechanics if not scimag_pb: request_context.statbox(action='unavailable_metadata') - request_context.error_log(UnavailableMetadataError(doi=pdf_doi)) + request_context.error_log(UnavailableMetadataError(doi=processed_document.doi)) await self.application.telegram_clients[request_context.bot_name].send_message( request_context.chat.chat_id, t( 'UNAVAILABLE_METADATA_ERROR', language=request_context.chat.language - ).format(doi=pdf_doi or request.doi_hint), + ).format(doi=processed_document.doi or request.doi_hint), buttons=buttons, reply_to=request.reply_to, ) @@ -260,10 +232,7 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ request_context.add_default_fields(doi=scimag_pb.doi, document_id=scimag_pb.id) try: file_data = clean_metadata(file_data, doi=scimag_pb.doi) - request_context.statbox( - action='cleaned', - len=len(file_data), - ) + request_context.statbox(action='cleaned', len=len(file_data)) except ValueError as e: request_context.error_log(e) uploaded_message = await self.send_file( @@ -276,13 +245,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ if processed_document: sharience_pb = sharience_pb2.Sharience( - abstract=processed_document.get('abstract', ''), - content=processed_document.get('body', ''), + abstract=processed_document.abstract or '', + content=processed_document.body or '', parent_id=scimag_pb.id, uploader_id=request.uploader_id or request_context.chat.chat_id, updated_at=int(time.time()), md5=hashlib.md5(file_data).hexdigest(), - filesize=file.size, + filesize=wrapped_file.size, ipfs_multihashes=await self.get_ipfs_hashes(file=file_data), ) update_sharience_pb = operation_pb2.DocumentOperation( @@ -313,13 +282,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ await operation_log(store_telegram_file_id_operation_pb) request_context.statbox(action='successfully_stored') - if file.message_id: + if wrapped_file.message_id: async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG): await self.application.telegram_clients[request_context.bot_name].delete_messages( request_context.chat.chat_id, - file.message_id, + wrapped_file.message_id, ) - await self.item_found(bot_name=request_context.bot_name, doi=scimag_pb.doi) + await self.found_item(bot_name=request_context.bot_name, doi=scimag_pb.doi) finally: await processing_message.delete() return submitter_service_pb2.SubmitResponse() diff --git a/nexus/meta_api/main.py b/nexus/meta_api/main.py index a40f28f..d2b6351 100644 --- a/nexus/meta_api/main.py +++ b/nexus/meta_api/main.py @@ -45,7 +45,7 @@ from nexus.meta_api.word_transformers import ( ) -def create_query_transformer(valid_fields, invalid_fields): +def create_query_transformer(valid_fields, invalid_fields, order_by_valid_fields): return QueryProcessor( tree_transformers=[ OrderByTreeTransformer( @@ -55,14 +55,7 @@ def create_query_transformer(valid_fields, invalid_fields): 'pr': 'page_rank', 'refc': 'referenced_by_count', }, - valid_fields=frozenset([ - 'id', - 'referenced_by_count', - 'issued_at', - 'page_rank', - 'pages', - 'updated_at' - ]) + valid_fields=frozenset(order_by_valid_fields) ), FieldTreeTransformer( field_aliases={ @@ -135,20 +128,35 @@ class GrpcServer(AioGrpcServer): 'doi', 'ipfs_multihashes', 'issns', 'isbns', 'issued_at', 'language', 'original_id', 'page_rank', 'referenced_by_count', 'references', 'tags', 'title', 'year', } + order_by_scimag_fields = { + 'id', + 'referenced_by_count', + 'issued_at', + 'page_rank', + 'updated_at' + } scitech_fields = { 'id', 'authors', 'doi', 'description', 'extension', 'ipfs_multihashes', 'isbns', 'issued_at', 'language', 'original_id', 'pages', 'tags', 'title', 'updated_at', 'year', } + order_by_scitech_fields = { + 'id', + 'issued_at', + 'pages', + 'updated_at' + } self.query_transformers = { 'scimag': create_query_transformer( valid_fields=scimag_fields, invalid_fields=scitech_fields.difference(scimag_fields), + order_by_valid_fields=order_by_scimag_fields, ), 'scitech': create_query_transformer( valid_fields=scitech_fields, invalid_fields=scimag_fields.difference(scitech_fields), + order_by_valid_fields=order_by_scitech_fields, ) } self.summa_client = SummaClient( diff --git a/nexus/meta_api/services/base.py b/nexus/meta_api/services/base.py index c4f507a..5cd2650 100644 --- a/nexus/meta_api/services/base.py +++ b/nexus/meta_api/services/base.py @@ -31,7 +31,6 @@ class BaseService(LibraryBaseService): new_scored_documents = [] for scored_document in scored_documents: document = json.loads(scored_document.document) - document = self.enrich_document_with_stat_provider(document) new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument( typed_document=typed_document_pb2.TypedDocument( **{scored_document.index_alias: self.pb_registry[scored_document.index_alias](**document)}, @@ -46,7 +45,6 @@ class BaseService(LibraryBaseService): new_scored_documents = [] for position, document in enumerate(documents): document = json.loads(document) - document = self.enrich_document_with_stat_provider(document) new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument( typed_document=TypedDocumentPb( **{index_alias: self.pb_registry[index_alias](**document)}, @@ -55,14 +53,3 @@ class BaseService(LibraryBaseService): score=1.0, )) return new_scored_documents - - def enrich_document_with_stat_provider(self, document): - if self.stat_provider: - original_id = ( - document.get('original_id') - or document['id'] - ) - download_stats = self.stat_provider.get_download_stats(original_id) - if download_stats and download_stats.downloads_count: - document['downloads_count'] = download_stats.downloads_count - return document diff --git a/nexus/meta_api/services/documents.py b/nexus/meta_api/services/documents.py index c731247..4561f3d 100644 --- a/nexus/meta_api/services/documents.py +++ b/nexus/meta_api/services/documents.py @@ -57,19 +57,6 @@ class DocumentsService(DocumentsServicer, BaseService): document_pb = getattr(typed_document_pb, typed_document_pb.WhichOneof('document')) - if hasattr(document_pb, 'original_id') and document_pb.original_id: - original_document_pb = await self.get_document( - index_alias=request.index_alias, - document_id=document_pb.original_id, - context=context, - request_id=metadata['request-id'], - session_id=metadata['session-id'], - ) - for to_remove in ('doi', 'fiction_id', 'filesize', 'libgen_id',): - original_document_pb.ClearField(to_remove) - original_document_pb.MergeFrom(document_pb) - document_pb = original_document_pb - logging.getLogger('query').info({ 'action': 'get', 'cache_hit': False, diff --git a/nexus/meta_api/services/search.py b/nexus/meta_api/services/search.py index 01eee7b..7f5a93f 100644 --- a/nexus/meta_api/services/search.py +++ b/nexus/meta_api/services/search.py @@ -270,6 +270,10 @@ class SearchService(SearchServicer, BaseService): @aiogrpc_request_wrapper(log=False) async def search(self, request, context, metadata): preprocessed_query = await self.process_query(query=request.query, languages=request.language, context=context) + logging.getLogger('debug').info({ + 'action': 'preprocess_query', + 'preprocessed_query': str(preprocessed_query), + }) index_aliases = self.resolve_index_aliases( request_index_aliases=request.index_aliases, processed_query=preprocessed_query, @@ -280,19 +284,27 @@ class SearchService(SearchServicer, BaseService): right_offset = left_offset + page_size search_requests = [] + processed_queries = {} for index_alias in index_aliases: - processed_query = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query) + processed_queries[index_alias] = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query) + logging.getLogger('debug').info({ + 'action': 'process_query', + 'index_alias': index_alias, + 'processed_query': str(processed_queries[index_alias]), + 'order_by': processed_queries[index_alias].context.order_by, + 'has_invalid_fields': processed_queries[index_alias].context.has_invalid_fields, + }) search_requests.append( SearchRequest( index_alias=index_alias, - query=processed_query.to_summa_query(), + query=processed_queries[index_alias].to_summa_query(), collectors=[ search_service_pb2.Collector( top_docs=search_service_pb2.TopDocsCollector( limit=50, - scorer=self.scorer(processed_query, index_alias), + scorer=self.scorer(processed_queries[index_alias], index_alias), snippets=self.snippets[index_alias], - explain=processed_query.context.explain, + explain=processed_queries[index_alias].context.explain, ) ), search_service_pb2.Collector(count=search_service_pb2.CountCollector()) @@ -322,9 +334,9 @@ class SearchService(SearchServicer, BaseService): meta_search_response.collector_outputs[0].top_docs.scored_documents, ) has_next = len(new_scored_documents) > right_offset - if 'scimag' in index_aliases: + if 'scimag' in processed_queries: await self.check_if_need_new_documents_by_dois( - requested_dois=processed_query.context.dois, + requested_dois=processed_queries['scimag'].context.dois, scored_documents=new_scored_documents, should_request=attempt.retry_state.attempt_number == 1 ) @@ -333,7 +345,7 @@ class SearchService(SearchServicer, BaseService): scored_documents=new_scored_documents[left_offset:right_offset], has_next=has_next, count=meta_search_response.collector_outputs[1].count.count, - query_language=processed_query.context.query_language, + query_language=preprocessed_query.context.query_language, ) return search_response_pb diff --git a/nexus/pipe/configs/base.yaml b/nexus/pipe/configs/base.yaml index 7900787..0b24d04 100644 --- a/nexus/pipe/configs/base.yaml +++ b/nexus/pipe/configs/base.yaml @@ -6,7 +6,7 @@ pipe: kafka-0.example.net schema: - consumers: - - class: nexus.pipe.consumers.CrossReferencesBulkConsumer + - class: nexus.pipe.consumers.CrossReferencesConsumer topics: - name: cross_references workers: 4 diff --git a/nexus/pipe/consumers/__init__.py b/nexus/pipe/consumers/__init__.py index 25a1bd6..7fd6652 100644 --- a/nexus/pipe/consumers/__init__.py +++ b/nexus/pipe/consumers/__init__.py @@ -1,17 +1,11 @@ -from .cross_references_consumer import ( - CrossReferencesBulkConsumer, - CrossReferencesConsumer, -) +from .cross_references_consumer import CrossReferencesConsumer from .document_operations_consumer import ( - DocumentOperationsBulkConsumer, DocumentOperationsConsumer, DocumentOperationsJsonConsumer, ) __all__ = [ - 'CrossReferencesBulkConsumer', 'CrossReferencesConsumer', 'DocumentOperationsConsumer', - 'DocumentOperationsBulkConsumer', 'DocumentOperationsJsonConsumer', ] diff --git a/nexus/pipe/consumers/base.py b/nexus/pipe/consumers/base.py index e711074..b19319d 100644 --- a/nexus/pipe/consumers/base.py +++ b/nexus/pipe/consumers/base.py @@ -66,9 +66,10 @@ class BaseConsumer(AioRootThing): except (ConflictError, InterruptProcessing) as e: logging.getLogger('statbox').info(e) except (asyncio.CancelledError, ConsumerStoppedError): - pass + return finally: await consumer.stop() + asyncio.create_task(self.stop()) async def start(self): logging.getLogger('statbox').info({ @@ -87,8 +88,10 @@ class BaseConsumer(AioRootThing): async def stop(self): if self.consumer_task: - self.consumer_task.cancel() - await self.consumer_task + consumer_task = self.consumer_task + self.consumer_task = None + consumer_task.cancel() + await consumer_task class BasePbConsumer(BaseConsumer): @@ -108,52 +111,3 @@ class BaseJsonConsumer(BaseConsumer): message = json.loads(msg.value) ParseDict(message, pb, ignore_unknown_fields=True) return pb - - -class BaseBulkConsumer(BaseConsumer): - auto_commit = False - bulk_size = 20 - timeout = 1 - - async def consume(self, consumer): - try: - while self.started: - try: - result = await consumer.getmany(timeout_ms=self.timeout * 1000, max_records=self.bulk_size) - except (ConsumerStoppedError, asyncio.CancelledError): - break - collector = [] - for tp, messages in result.items(): - if messages: - for message in messages: - preprocessed_msg = self.preprocess(message) - if preprocessed_msg: - collector.append(preprocessed_msg) - for processor in self.processors: - filtered = filter(processor.filter, collector) - try: - await processor.process_bulk(filtered) - except InterruptProcessing as e: - logging.getLogger('statbox').info(e) - try: - await consumer.commit() - except CommitFailedError as e: - error_log(e) - continue - finally: - await consumer.stop() - - async def start(self): - logging.getLogger('statbox').info({ - 'action': 'start', - 'group_id': self.group_id, - 'topic_names': self.topic_names, - }) - consumer = self.create_consumer() - await consumer.start() - logging.getLogger('statbox').info({ - 'action': 'started', - 'group_id': self.group_id, - 'topic_names': self.topic_names, - }) - self.consumer_task = asyncio.create_task(self.consume(consumer)) diff --git a/nexus/pipe/consumers/cross_references_consumer.py b/nexus/pipe/consumers/cross_references_consumer.py index d745b03..91ddcf5 100644 --- a/nexus/pipe/consumers/cross_references_consumer.py +++ b/nexus/pipe/consumers/cross_references_consumer.py @@ -1,15 +1,8 @@ from nexus.models.proto.operation_pb2 import \ CrossReferenceOperation as CrossReferenceOperationPb -from .base import ( - BaseBulkConsumer, - BasePbConsumer, -) +from .base import BasePbConsumer class CrossReferencesConsumer(BasePbConsumer): pb_class = CrossReferenceOperationPb - - -class CrossReferencesBulkConsumer(BaseBulkConsumer, CrossReferencesConsumer): - pass diff --git a/nexus/pipe/consumers/document_operations_consumer.py b/nexus/pipe/consumers/document_operations_consumer.py index 05a9169..492a99e 100644 --- a/nexus/pipe/consumers/document_operations_consumer.py +++ b/nexus/pipe/consumers/document_operations_consumer.py @@ -2,7 +2,6 @@ from nexus.models.proto.operation_pb2 import \ DocumentOperation as DocumentOperationPb from .base import ( - BaseBulkConsumer, BaseJsonConsumer, BasePbConsumer, ) @@ -14,7 +13,3 @@ class DocumentOperationsConsumer(BasePbConsumer): class DocumentOperationsJsonConsumer(BaseJsonConsumer): pb_class = DocumentOperationPb - - -class DocumentOperationsBulkConsumer(BaseBulkConsumer, DocumentOperationsConsumer): - pass diff --git a/nexus/pylon/client.py b/nexus/pylon/client.py index df46d2d..9d3fbab 100644 --- a/nexus/pylon/client.py +++ b/nexus/pylon/client.py @@ -22,12 +22,14 @@ class PylonClient(AioThing): source_configs: Optional[List], proxies: Optional[List[str]] = None, downloads_directory: Optional[str] = None, - default_driver_proxy_list: [Optional[List]] = None + default_driver_proxy_list: [Optional[List]] = None, + default_resolver_proxy_list: [Optional[List]] = None, ): super().__init__() self.proxy_manager = ProxyManager(proxies) self.downloads_directory = downloads_directory self.default_driver_proxy_list = default_driver_proxy_list + self.default_resolver_proxy_list = default_resolver_proxy_list self.sources = [] for source_config in source_configs: source = Source.from_config( @@ -35,6 +37,7 @@ class PylonClient(AioThing): source_config=source_config, downloads_directory=downloads_directory, default_driver_proxy_list=default_driver_proxy_list, + default_resolver_proxy_list=default_resolver_proxy_list, ) self.sources.append(source) self.starts.append(source) diff --git a/nexus/pylon/configs/pylon.yaml b/nexus/pylon/configs/pylon.yaml index a455776..ed96b83 100644 --- a/nexus/pylon/configs/pylon.yaml +++ b/nexus/pylon/configs/pylon.yaml @@ -138,6 +138,7 @@ pylon: resolver: args: format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true' + resolve_timeout: 25.0 selector: '(.resource.primary.URL | split("/"))[-1]' timeout: 40.0 class: nexus.pylon.resolvers.DoiOrgRequestResolver diff --git a/nexus/pylon/drivers/base.py b/nexus/pylon/drivers/base.py index d59863c..96fde80 100644 --- a/nexus/pylon/drivers/base.py +++ b/nexus/pylon/drivers/base.py @@ -26,7 +26,7 @@ class BaseDriver(NetworkAgent): validator_cls = validator['class'] validator_cls = import_object(validator_cls) - self.validator = validator_cls or BaseValidator + self.validator = validator_cls self.downloads_directory = downloads_directory def __str__(self): diff --git a/nexus/pylon/drivers/browser.py b/nexus/pylon/drivers/browser.py index 8f86458..8d4ad25 100644 --- a/nexus/pylon/drivers/browser.py +++ b/nexus/pylon/drivers/browser.py @@ -3,7 +3,6 @@ import json import logging import os.path import shutil -import sys import time from pathlib import Path from typing import ( diff --git a/nexus/pylon/matcher.py b/nexus/pylon/matcher.py index 9797fa4..45cf385 100644 --- a/nexus/pylon/matcher.py +++ b/nexus/pylon/matcher.py @@ -1,4 +1,5 @@ import re +import sys class Matcher: diff --git a/nexus/pylon/resolvers/doi_org_request.py b/nexus/pylon/resolvers/doi_org_request.py index e0c2b81..24f5d94 100644 --- a/nexus/pylon/resolvers/doi_org_request.py +++ b/nexus/pylon/resolvers/doi_org_request.py @@ -11,6 +11,11 @@ import jq from nexus.pylon.prepared_request import PreparedRequest from nexus.pylon.proxy_manager import ProxyManager from nexus.pylon.resolvers.base import BaseResolver +from tenacity import ( + retry, + stop_after_attempt, + wait_random, +) class DoiOrgRequestResolver(BaseResolver): @@ -24,6 +29,7 @@ class DoiOrgRequestResolver(BaseResolver): proxy_manager: Optional[ProxyManager] = None, ): super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) + self.text_selector = selector self.selector = jq.compile(selector) self.format_string = format_string self.timeout = timeout @@ -32,6 +38,11 @@ class DoiOrgRequestResolver(BaseResolver): def __str__(self): return f'{self.__class__.__name__}(selector = {self.selector}, format_string = {self.format_string})' + @retry( + reraise=True, + wait=wait_random(min=1, max=3), + stop=stop_after_attempt(4), + ) async def resolve_through_doi_org(self, params): async with self.get_session() as session: doi_url = f'https://doi.org/{params["doi"]}' @@ -51,6 +62,7 @@ class DoiOrgRequestResolver(BaseResolver): logging.getLogger('error').error({ 'action': 'error', 'mode': 'pylon', + 'params': params, 'error': str(e) }) return @@ -60,3 +72,11 @@ class DoiOrgRequestResolver(BaseResolver): url=self.format_string.format(selected=selected), timeout=self.timeout, ) + else: + logging.getLogger('debug').error({ + 'action': 'missed_selector', + 'mode': 'pylon', + 'params': params, + 'selector': self.text_selector, + 'format_string': self.format_string, + }) diff --git a/nexus/pylon/source.py b/nexus/pylon/source.py index 24efe4a..3a6be68 100644 --- a/nexus/pylon/source.py +++ b/nexus/pylon/source.py @@ -33,13 +33,17 @@ class Source(AioThing): source_config, downloads_directory: str, default_driver_proxy_list: List, + default_resolver_proxy_list: List, ) -> 'Source': matcher = Matcher(source_config['matcher']) resolver_cls = import_object( source_config.get('resolver', {}).get('class', 'nexus.pylon.resolvers.TemplateResolver') ) - resolver_args = dict(proxy_manager=proxy_manager) + resolver_args = dict( + proxy_manager=proxy_manager, + proxy_list=default_resolver_proxy_list, + ) resolver_args.update(**source_config.get('resolver', {}).get('args', {})) resolver = resolver_cls(**resolver_args) diff --git a/nexus/translations/translations.yaml b/nexus/translations/translations.yaml index 45bc5e8..593a53a 100644 --- a/nexus/translations/translations.yaml +++ b/nexus/translations/translations.yaml @@ -4,7 +4,7 @@ en: ABOUT_US: | **About us** - Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Uwe of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization. + Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Use of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization. Why printing provoked such dramatic changes in society? Sane beings and knowledge live in a natural symbiosis: knowledge are fruiting and growing while circulated among people and people do the same. We may even note that complexity of human civilization, its achievements and technological progress during all its history have been correlated with the amount of accumulated knowledge and possibility to share and navigate through it. @@ -14,7 +14,7 @@ en: However, there is one remaining issue: knowledge is usurped by large publishers, corporations such as Elsevier, Springer, Wiley, T&F etc. Held by copyright, the knowledge remains out of reach for many researchers and for many enterprises that could launch the new Age of Enlightenment. Knowledge is not just usurped, an entire system has been built which is putting in chains researchers, academia and readers: - - Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact. Everybody knows who owns journals of high "impact" + - Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact - Publishers make authors to pay for publishing tax-funded researches, then readers to pay for access to published tax-funded researches, while peer-reviewing is done for free by other researchers and this activity is considered as "honorable" in academia - Publishers such as Elsevier repeatedly carried and carry campaigns against open access diff --git a/nexus/views/telegram/document_list_widget.py b/nexus/views/telegram/document_list_widget.py index a2cf4d4..d2f2e70 100644 --- a/nexus/views/telegram/document_list_widget.py +++ b/nexus/views/telegram/document_list_widget.py @@ -17,6 +17,7 @@ class DocumentListWidget: document_holders: List[BaseHolder], bot_name, header: Optional[str] = None, + promotioner=None, has_next: bool = False, session_id: Optional[str] = None, message_id: Optional[int] = None, @@ -29,6 +30,7 @@ class DocumentListWidget: self.document_holders = document_holders self.bot_name = bot_name self.header = header + self.promotioner = promotioner self.cmd = cmd self.has_next = has_next self.session_id = session_id @@ -57,6 +59,10 @@ class DocumentListWidget: if self.header: serp = f'**{self.header}**\n\n{serp}' + promotion_language = self.chat.language + promo = self.promotioner.choose_promotion(promotion_language) + serp = f'{serp}\n\n{promo}\n' + buttons = [] if self.cmd and self.message_id and self.session_id and (self.has_next or self.page > 0): buttons = [ diff --git a/nexus/views/telegram/progress_bar.py b/nexus/views/telegram/progress_bar.py index d2914f6..5d1bb6c 100644 --- a/nexus/views/telegram/progress_bar.py +++ b/nexus/views/telegram/progress_bar.py @@ -100,9 +100,9 @@ class ProgressBar: async def show_banner(self): return await self.send_message(await self.render_banner(), ignore_last_call=True) - async def callback(self, done, total): + async def callback(self, done, total, ignore_last_call=False): self._set_progress(done, total) - return await self.send_message(await self.render_progress()) + return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call) class ThrottlerWrapper: diff --git a/rules/python/requirements-lock.txt b/rules/python/requirements-lock.txt index 98c8b30..4308082 100644 --- a/rules/python/requirements-lock.txt +++ b/rules/python/requirements-lock.txt @@ -139,7 +139,7 @@ spacy-loggers==1.0.3 SQLAlchemy==1.4.39 sqlparse==0.4.2 srsly==2.4.4 -Telethon==1.24.0 +Telethon==1.25.0 tenacity==8.0.1 termcolor==1.1.0 textblob==0.17.1