- [nexus] Development

GitOrigin-RevId: ccfd55db266862ed70f1299aaf62500765b03cc4
This commit is contained in:
the-superpirate 2022-09-06 19:33:57 +03:00
parent 51ae1fc5d9
commit 7c1bb06b1b
42 changed files with 300 additions and 322 deletions

View File

@ -39,6 +39,7 @@ class BaseTelegramClient(AioThing):
bot_token: Optional[str] = None, bot_token: Optional[str] = None,
mtproxy: Optional[dict] = None, mtproxy: Optional[dict] = None,
flood_sleep_threshold: int = 60, flood_sleep_threshold: int = 60,
catch_up: bool = False,
): ):
super().__init__() super().__init__()
if not app_id or not app_hash: if not app_id or not app_hash:
@ -50,6 +51,7 @@ class BaseTelegramClient(AioThing):
self._get_session(database), self._get_session(database),
app_id, app_id,
app_hash, app_hash,
catch_up=catch_up,
flood_sleep_threshold=flood_sleep_threshold, flood_sleep_threshold=flood_sleep_threshold,
**self._get_proxy(mtproxy=mtproxy), **self._get_proxy(mtproxy=mtproxy),
) )

View File

@ -5,9 +5,17 @@ class Promotioner:
""" """
Promotioner is used to select promotion randomly based on weights of every promotion. Promotioner is used to select promotion randomly based on weights of every promotion.
""" """
def __init__(self, promotions: list[dict], default_promotion_index: int = 0): def __init__(
self,
promotions: list[dict],
default_promotion_index: int = 0,
promotion_vars: dict = None,
):
self.promotions = promotions self.promotions = promotions
self.default_promotion_index = default_promotion_index self.default_promotion_index = default_promotion_index
if not promotion_vars:
promotion_vars = {}
self.promotion_vars = promotion_vars
self.partial_sums: list = [self.promotions[0]['weight']] self.partial_sums: list = [self.promotions[0]['weight']]
for promotion in self.promotions[1:]: for promotion in self.promotions[1:]:
self.partial_sums.append(promotion['weight'] + self.partial_sums[-1]) self.partial_sums.append(promotion['weight'] + self.partial_sums[-1])
@ -18,11 +26,11 @@ class Promotioner:
if partial_sum <= pivot: if partial_sum <= pivot:
continue continue
if language in promotion['texts']: if language in promotion['texts']:
return promotion['texts'][language] return promotion['texts'][language].format(**self.promotion_vars)
elif promotion.get('local', False): elif promotion.get('local', False):
default_promotion = self.promotions[self.default_promotion_index] default_promotion = self.promotions[self.default_promotion_index]
if language in default_promotion['texts']: if language in default_promotion['texts']:
return default_promotion['texts'][language] return default_promotion['texts'][language].format(**self.promotion_vars)
return default_promotion['texts']['en'] return default_promotion['texts']['en'].format(**self.promotion_vars)
else: else:
return promotion['texts']['en'] return promotion['texts']['en'].format(**self.promotion_vars)

View File

@ -19,6 +19,7 @@ async def safe_execution(
error_log=error_log, error_log=error_log,
on_fail: Optional[Callable[[], Awaitable]] = None, on_fail: Optional[Callable[[], Awaitable]] = None,
level=logging.WARNING, level=logging.WARNING,
is_logging_enabled: bool = True
): ):
try: try:
try: try:
@ -34,8 +35,11 @@ async def safe_execution(
errors.MessageIdInvalidError, errors.MessageIdInvalidError,
errors.ChatAdminRequiredError, errors.ChatAdminRequiredError,
) as e: ) as e:
if is_logging_enabled:
error_log(e, level=level) error_log(e, level=level)
traceback.print_exc()
except Exception as e: except Exception as e:
if is_logging_enabled:
error_log(e, level=level) error_log(e, level=level)
traceback.print_exc() traceback.print_exc()
if on_fail: if on_fail:
@ -43,4 +47,5 @@ async def safe_execution(
except events.StopPropagation: except events.StopPropagation:
raise raise
except Exception as e: except Exception as e:
if is_logging_enabled:
error_log(e, level=level) error_log(e, level=level)

View File

@ -76,6 +76,11 @@ def clean_issns(issns):
return cleaned_issns return cleaned_issns
def clean_isbns(isbns):
if isbns:
return isbns
def extract_title(title, subtitle): def extract_title(title, subtitle):
return ': '.join(filter(lambda x: bool(x), [title.strip(), subtitle.strip()])) return ': '.join(filter(lambda x: bool(x), [title.strip(), subtitle.strip()]))
@ -90,6 +95,7 @@ class ToScimagPbAction(BaseAction):
doi=item['DOI'], doi=item['DOI'],
issue=item.get('issue'), issue=item.get('issue'),
issns=clean_issns(item.get('ISSN')), issns=clean_issns(item.get('ISSN')),
isbns=clean_isbns(item.get('ISBN')),
language=item.get('language'), language=item.get('language'),
referenced_by_count=item.get('is-referenced-by-count'), referenced_by_count=item.get('is-referenced-by-count'),
references=extract_references(item.get('reference')), references=extract_references(item.get('reference')),

View File

@ -42,6 +42,8 @@ py3_image(
"//nexus/hub/aioclient", "//nexus/hub/aioclient",
"//nexus/meta_api/aioclient", "//nexus/meta_api/aioclient",
"//nexus/models/proto:proto_py", "//nexus/models/proto:proto_py",
"//nexus/promotions",
"//nexus/translations",
"//nexus/views/telegram", "//nexus/views/telegram",
requirement("izihawa_nlptools"), requirement("izihawa_nlptools"),
requirement("izihawa_utils"), requirement("izihawa_utils"),

View File

@ -2,10 +2,11 @@ from aiokit import AioRootThing
from idm.api.aioclient import IdmApiGrpcClient from idm.api.aioclient import IdmApiGrpcClient
from izihawa_utils.importlib import import_object from izihawa_utils.importlib import import_object
from library.telegram.base import BaseTelegramClient from library.telegram.base import BaseTelegramClient
from nexus.bot.promotioner import Promotioner from library.telegram.promotioner import Promotioner
from nexus.bot.user_manager import UserManager from nexus.bot.user_manager import UserManager
from nexus.hub.aioclient import HubGrpcClient from nexus.hub.aioclient import HubGrpcClient
from nexus.meta_api.aioclient import MetaApiGrpcClient from nexus.meta_api.aioclient import MetaApiGrpcClient
from nexus.promotions import get_promotions
class TelegramApplication(AioRootThing): class TelegramApplication(AioRootThing):
@ -18,6 +19,7 @@ class TelegramApplication(AioRootThing):
bot_token=self.config['telegram']['bot_token'], bot_token=self.config['telegram']['bot_token'],
database=self.config['telegram'].get('database'), database=self.config['telegram'].get('database'),
mtproxy=self.config['telegram'].get('mtproxy'), mtproxy=self.config['telegram'].get('mtproxy'),
catch_up=self.config['telegram'].get('catch_up', False)
) )
self.hub_client = HubGrpcClient(endpoint=self.config['hub']['endpoint']) self.hub_client = HubGrpcClient(endpoint=self.config['hub']['endpoint'])
self.starts.append(self.hub_client) self.starts.append(self.hub_client)
@ -28,7 +30,13 @@ class TelegramApplication(AioRootThing):
self.meta_api_client = MetaApiGrpcClient(endpoint=self.config['meta_api']['endpoint']) self.meta_api_client = MetaApiGrpcClient(endpoint=self.config['meta_api']['endpoint'])
self.starts.append(self.meta_api_client) self.starts.append(self.meta_api_client)
self.promotioner = Promotioner(promotions=self.config['promotions']) self.promotioner = Promotioner(
promotions=get_promotions(),
promotion_vars=dict(
related_channel=self.config['telegram']['related_channel'],
twitter_contact_url=self.config['twitter']['contact_url'],
)
)
self.user_manager = UserManager() self.user_manager = UserManager()
self._handlers = [] self._handlers = []

View File

@ -7,7 +7,6 @@ def get_config():
'nexus/bot/configs/base.yaml', 'nexus/bot/configs/base.yaml',
'nexus/bot/configs/%s.yaml?' % env.type, 'nexus/bot/configs/%s.yaml?' % env.type,
'nexus/bot/configs/logging.yaml', 'nexus/bot/configs/logging.yaml',
'nexus/bot/configs/promotions.yaml',
], env_prefix='NEXUS_BOT') ], env_prefix='NEXUS_BOT')

View File

@ -1,36 +0,0 @@
---
promotions:
- texts:
en: 💬 The victory of humanity is inevitable
weight: 1
- texts:
en: 💬 Shall build Standard Template Construct
weight: 1
- texts:
en: 💬 Gaining knowledge is the only purpose of life
weight: 1
- texts:
en: 💬 Knowledge cannot belong
weight: 1
- texts:
en: 💬 Obey the path of discovery
weight: 1
- texts:
en: 💬 Research is the only and ultimate goal
weight: 1
- texts:
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/nexus_aaron)
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/nexus_aaron)
weight: 25
- texts:
en: ✋ Help us, become a seeder of books. Learn how in /seed
ru: ✋ Сохрани наследие, раздавай книги нуждающимся. Узнай как в /seed
weight: 25
- texts:
en: ⤴️ Stay tuned with us at @{related_channel} and [Twitter]({twitter_contact_url})
es: ⤴️ Mantente en contacto con nosotros en @{related_channel} y [Twitter]({twitter_contact_url})
it: ⤴️ Resta aggiornato con noi su @{related_channel} e [Twitter]({twitter_contact_url})
pb: ⤴️ Fique ligado conosco em @{related_channel} e [Twitter]({twitter_contact_url})
ru: ⤴️ Оставайся на связи с нами на @{related_channel} и в [Twitter]({twitter_contact_url})
weight: 25

View File

@ -2,6 +2,7 @@ import asyncio
import time import time
from library.telegram.base import RequestContext from library.telegram.base import RequestContext
from library.telegram.utils import safe_execution
from nexus.translations import t from nexus.translations import t
from telethon import events from telethon import events
@ -36,5 +37,6 @@ class CloseHandler(BaseCallbackQueryHandler):
target_events.append(reply_message.delete()) target_events.append(reply_message.delete())
target_events.append(message.delete()) target_events.append(message.delete())
else: else:
target_events.append(event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE'))) async with safe_execution(is_logging_enabled=False):
await event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE'))
await asyncio.gather(*target_events) await asyncio.gather(*target_events)

View File

@ -1,4 +1,5 @@
from library.telegram.base import RequestContext from library.telegram.base import RequestContext
from library.telegram.utils import safe_execution
from nexus.hub.proto.delivery_service_pb2 import \ from nexus.hub.proto.delivery_service_pb2 import \
StartDeliveryResponse as StartDeliveryResponsePb StartDeliveryResponse as StartDeliveryResponsePb
from nexus.translations import t from nexus.translations import t
@ -45,11 +46,13 @@ class DownloadHandler(BaseCallbackQueryHandler):
bot_name=request_context.bot_name, bot_name=request_context.bot_name,
) )
if start_delivery_response_pb.status == StartDeliveryResponsePb.Status.ALREADY_DOWNLOADING: if start_delivery_response_pb.status == StartDeliveryResponsePb.Status.ALREADY_DOWNLOADING:
async with safe_execution(is_logging_enabled=False):
await event.answer( await event.answer(
f'{t("ALREADY_DOWNLOADING", request_context.chat.language)}', f'{t("ALREADY_DOWNLOADING", request_context.chat.language)}',
) )
await remove_button(event, '⬇️', and_empty_too=True) await remove_button(event, '⬇️', and_empty_too=True)
elif start_delivery_response_pb.status == StartDeliveryResponsePb.Status.TOO_MANY_DOWNLOADS: elif start_delivery_response_pb.status == StartDeliveryResponsePb.Status.TOO_MANY_DOWNLOADS:
async with safe_execution(is_logging_enabled=False):
await event.answer( await event.answer(
f'{t("TOO_MANY_DOWNLOADS", request_context.chat.language)}', f'{t("TOO_MANY_DOWNLOADS", request_context.chat.language)}',
) )

View File

@ -61,7 +61,7 @@ class ProfileHandler(BaseHandler):
target_events.append(profile_reply_message.reply(rendered_widget, buttons=buttons, link_preview=False)) target_events.append(profile_reply_message.reply(rendered_widget, buttons=buttons, link_preview=False))
else: else:
target_events.append(event.reply(rendered_widget, buttons=buttons, link_preview=False)) target_events.append(event.reply(rendered_widget, buttons=buttons, link_preview=False))
return asyncio.gather(*target_events) return await asyncio.gather(*target_events)
class DigestHandler(BaseHandler): class DigestHandler(BaseHandler):
@ -114,6 +114,7 @@ class DigestHandler(BaseHandler):
document_holders=document_holders, document_holders=document_holders,
bot_name=bot_name, bot_name=bot_name,
header='✨ Nexus Discovery ✨', header='✨ Nexus Discovery ✨',
promotioner=self.application.promotioner,
) )
view, buttons = await document_list_widget.render() view, buttons = await document_list_widget.render()

View File

@ -39,10 +39,7 @@ class RollHandler(BaseHandler):
if random_documents: if random_documents:
holder = BaseHolder.create_from_document(random_documents[0]) holder = BaseHolder.create_from_document(random_documents[0])
promo = self.application.promotioner.choose_promotion(language).format( promo = self.application.promotioner.choose_promotion(language)
related_channel=self.application.config['telegram']['related_channel'],
twitter_contact_url=self.application.config['twitter']['contact_url'],
)
view = holder.view_builder(language).add_view(bot_name=bot_name).add_new_line(2).add(promo, escaped=True).build() view = holder.view_builder(language).add_view(bot_name=bot_name).add_new_line(2).add(promo, escaped=True).build()
buttons_builder = holder.buttons_builder(language) buttons_builder = holder.buttons_builder(language)
@ -54,5 +51,5 @@ class RollHandler(BaseHandler):
request_context.statbox(action='show', duration=time.time() - start_time) request_context.statbox(action='show', duration=time.time() - start_time)
await event.respond(view, buttons=buttons_builder.build()) await event.respond(view, buttons=buttons_builder.build())
async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG): async with safe_execution(is_logging_enabled=False):
await event.delete() await event.delete()

View File

@ -210,6 +210,7 @@ class SearchEditHandler(BaseSearchHandler):
if request_context.is_group_mode() and not search_prefix: if request_context.is_group_mode() and not search_prefix:
return return
if request_context.is_personal_mode() and search_prefix: if request_context.is_personal_mode() and search_prefix:
query = event.raw_text query = event.raw_text
@ -288,7 +289,7 @@ class SearchPagingHandler(BaseCallbackQueryHandler):
) )
serp, buttons = await search_widget.render(message_id=message_id) serp, buttons = await search_widget.render(message_id=message_id)
return await asyncio.gather(
event.answer(), await message.edit(serp, buttons=buttons, link_preview=False)
message.edit(serp, buttons=buttons, link_preview=False) async with safe_execution(is_logging_enabled=False):
) await event.answer()

View File

@ -47,13 +47,13 @@ class SubmitHandler(BaseHandler):
reply_to = reply_message.id reply_to = reply_message.id
doi_hint = self.get_doi_hint(message=message, reply_message=reply_message) doi_hint = self.get_doi_hint(message=message, reply_message=reply_message)
doi_hint_priority = '' in message.raw_text skip_analysis = '⚡️' in message.raw_text
user_id = message.sender_id user_id = message.sender_id
request_context.statbox( request_context.statbox(
action='analyzed', action='analyzed',
mode='submit', mode='submit',
doi_hint=doi_hint, doi_hint=doi_hint,
doi_hint_priority=doi_hint_priority, skip_analysis=skip_analysis,
reply_to=reply_to, reply_to=reply_to,
) )
@ -71,12 +71,12 @@ class SubmitHandler(BaseHandler):
request_id=request_context.request_id, request_id=request_context.request_id,
session_id=session_id, session_id=session_id,
doi_hint=doi_hint, doi_hint=doi_hint,
doi_hint_priority=doi_hint_priority, skip_analysis=skip_analysis,
uploader_id=user_id, uploader_id=user_id,
) )
case 'application/zip': case 'application/zip':
try:
if request_context.is_personal_mode(): if request_context.is_personal_mode():
try:
file_data = await self.application.telegram_client.download_document( file_data = await self.application.telegram_client.download_document(
document=event.document, document=event.document,
file=bytes, file=bytes,
@ -105,8 +105,6 @@ class SubmitHandler(BaseHandler):
session_id=session_id, session_id=session_id,
uploader_id=user_id, uploader_id=user_id,
) )
else:
await event.reply(t('ZIP_FILES_ARE_NOT_SUPPORTED_IN_GROUP_MODE', request_context.chat.language))
finally: finally:
return await event.delete() return await event.delete()
case _: case _:

View File

@ -85,10 +85,7 @@ class ViewHandler(BaseHandler):
holder = BaseHolder.create(typed_document_pb=typed_document_pb) holder = BaseHolder.create(typed_document_pb=typed_document_pb)
back_command = await self.compose_back_command(session_id=session_id, message_id=message_id, page=page) back_command = await self.compose_back_command(session_id=session_id, message_id=message_id, page=page)
promo = self.application.promotioner.choose_promotion(language).format( promo = self.application.promotioner.choose_promotion(language)
related_channel=self.application.config['telegram']['related_channel'],
twitter_contact_url=self.application.config['twitter']['contact_url'],
)
view_builder = holder.view_builder(language).add_view( view_builder = holder.view_builder(language).add_view(
bot_name=self.application.config['telegram']['bot_name'] bot_name=self.application.config['telegram']['bot_name']
).add_new_line(2).add(promo, escaped=True) ).add_new_line(2).add(promo, escaped=True)

View File

@ -154,10 +154,7 @@ class SearchWidget(BaseSearchWidget):
) )
promotion_language = self.query_language or self.chat.language promotion_language = self.query_language or self.chat.language
promo = self.application.promotioner.choose_promotion(promotion_language).format( promo = self.application.promotioner.choose_promotion(promotion_language)
related_channel=self.application.config['telegram']['related_channel'],
twitter_contact_url=self.application.config['twitter']['contact_url'],
)
serp = f'{serp}\n\n{promo}\n' serp = f'{serp}\n\n{promo}\n'
buttons = None buttons = None

View File

@ -58,7 +58,7 @@ class HubGrpcClient(BaseGrpcClient):
bot_name: str, bot_name: str,
reply_to: Optional[int] = None, reply_to: Optional[int] = None,
doi_hint: Optional[str] = None, doi_hint: Optional[str] = None,
doi_hint_priority: bool = False, skip_analysis: bool = False,
request_id: Optional[str] = None, request_id: Optional[str] = None,
session_id: Optional[str] = None, session_id: Optional[str] = None,
uploader_id: Optional[int] = None uploader_id: Optional[int] = None
@ -68,7 +68,7 @@ class HubGrpcClient(BaseGrpcClient):
bot_name=bot_name, bot_name=bot_name,
reply_to=reply_to, reply_to=reply_to,
doi_hint=doi_hint, doi_hint=doi_hint,
doi_hint_priority=doi_hint_priority, skip_analysis=skip_analysis,
uploader_id=uploader_id, uploader_id=uploader_id,
) )
if isinstance(file, submitter_service_pb2.PlainFile): if isinstance(file, submitter_service_pb2.PlainFile):

View File

@ -4,6 +4,7 @@ pylon:
- [cambridge] - [cambridge]
- [edinburg] - [edinburg]
- [southampton] - [southampton]
default_resolver_proxy_list: ~
downloads_directory: /downloads downloads_directory: /downloads
proxies: proxies:
- address: clash.default.svc.cluster.example.com:7890 - address: clash.default.svc.cluster.example.com:7890
@ -15,6 +16,9 @@ pylon:
- address: clash.default.svc.cluster.example.com:8090 - address: clash.default.svc.cluster.example.com:8090
name: southampton name: southampton
tags: ['southampton'] tags: ['southampton']
- address: socks5://clash.default.svc.cluster.example.com:7991
name: socks5
tags: ['socks5']
sources: sources:
# LibGen.rocks # LibGen.rocks
- driver: - driver:
@ -331,6 +335,13 @@ pylon:
args: args:
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true' format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver class: nexus.pylon.resolvers.TemplateResolver
# www.ahajournals.org
- matcher:
doi: ^10.1161/.*$
resolver:
args:
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# ajp.psychiatryonline.org # ajp.psychiatryonline.org
- matcher: - matcher:
doi: ^10.1176/.*$ doi: ^10.1176/.*$

View File

@ -23,7 +23,7 @@ message SubmitRequest {
string bot_name = 4; string bot_name = 4;
optional int64 reply_to = 5; optional int64 reply_to = 5;
optional string doi_hint = 6; optional string doi_hint = 6;
bool doi_hint_priority = 7; bool skip_analysis = 7;
int64 uploader_id = 8; int64 uploader_id = 8;
} }
message SubmitResponse { } message SubmitResponse { }

View File

@ -1,5 +1,6 @@
import asyncio import asyncio
from aiobaseclient.exceptions import BadRequestError
from library.aiogrpctools.base import BaseService from library.aiogrpctools.base import BaseService
from library.telegram.common import close_button from library.telegram.common import close_button
from nexus.views.telegram.common import vote_button from nexus.views.telegram.common import vote_button
@ -16,8 +17,39 @@ def is_group_or_channel(chat_id: int):
return chat_id < 0 return chat_id < 0
class ProcessedDocument:
def __init__(self, processed_document):
self.processed_document = processed_document
@staticmethod
async def setup(file_data, grobid_client, request_context):
try:
processed_document = await grobid_client.process_fulltext_document(pdf_file=file_data)
except BadRequestError as e:
request_context.statbox(action='unparsable_document')
request_context.error_log(e)
processed_document = {}
return ProcessedDocument(processed_document)
@property
def doi(self):
return self.processed_document.get('doi')
@property
def title(self):
return self.processed_document.get('title')
@property
def abstract(self):
return self.processed_document.get('abstract')
@property
def body(self):
return self.processed_document.get('body')
class BaseHubService(BaseService): class BaseHubService(BaseService):
async def item_found(self, bot_name, doi): async def found_item(self, bot_name, doi):
if mutual_aid_service := self.application.mutual_aid_services.get(bot_name): if mutual_aid_service := self.application.mutual_aid_services.get(bot_name):
await mutual_aid_service.delete_request(doi) await mutual_aid_service.delete_request(doi)
await self.application.idm_client.reschedule_subscriptions( await self.application.idm_client.reschedule_subscriptions(
@ -36,13 +68,13 @@ class BaseHubService(BaseService):
) )
)) ))
def set_fields_from_processed(self, document_pb, processed_document): def set_fields_from_processed(self, document_pb, processed_document: ProcessedDocument):
new_fields = [] new_fields = []
if processed_document.get('abstract') and not document_pb.abstract: if processed_document.abstract and not document_pb.abstract:
document_pb.abstract = processed_document['abstract'] document_pb.abstract = processed_document.abstract
new_fields.append('abstract') new_fields.append('abstract')
if processed_document.get('body') and not document_pb.content: if processed_document.body and not document_pb.content:
document_pb.content = processed_document['body'] document_pb.content = processed_document.body
new_fields.append('content') new_fields.append('content')
return new_fields return new_fields

View File

@ -46,6 +46,7 @@ from pypika import (
from .base import ( from .base import (
BaseHubService, BaseHubService,
ProcessedDocument,
is_group_or_channel, is_group_or_channel,
) )
@ -76,6 +77,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
proxies=pylon_config['proxies'], proxies=pylon_config['proxies'],
source_configs=pylon_config['sources'], source_configs=pylon_config['sources'],
default_driver_proxy_list=pylon_config['default_driver_proxy_list'], default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
downloads_directory=pylon_config['downloads_directory'], downloads_directory=pylon_config['downloads_directory'],
) )
self.should_parse_with_grobid = should_parse_with_grobid self.should_parse_with_grobid = should_parse_with_grobid
@ -216,11 +218,12 @@ class DownloadTask:
error_log=request_context.error_log, error_log=request_context.error_log,
on_fail=_on_fail, on_fail=_on_fail,
): ):
filename = document_holder.get_filename()
progress_bar_download = ProgressBar( progress_bar_download = ProgressBar(
telegram_client=self.application.telegram_clients[request_context.bot_name], telegram_client=self.application.telegram_clients[request_context.bot_name],
request_context=request_context, request_context=request_context,
banner=t("LOOKING_AT", request_context.chat.language), banner=t("LOOKING_AT", request_context.chat.language),
header=f'⬇️ {document_holder.get_filename()}', header=f'⬇️ {filename}',
tail_text=t('TRANSMITTED_FROM', request_context.chat.language), tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
throttle_secs=throttle_secs, throttle_secs=throttle_secs,
) )
@ -239,6 +242,10 @@ class DownloadTask:
) )
if not document_holder.md5 and document_holder.get_extension() == 'pdf': if not document_holder.md5 and document_holder.get_extension() == 'pdf':
try: try:
await progress_bar_download.send_message(
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
ignore_last_call=True
)
file = clean_metadata(file, doi=document_holder.doi) file = clean_metadata(file, doi=document_holder.doi)
request_context.statbox( request_context.statbox(
action='cleaned', action='cleaned',
@ -251,7 +258,7 @@ class DownloadTask:
request_context=request_context, request_context=request_context,
message=progress_bar_download.message, message=progress_bar_download.message,
banner=t("LOOKING_AT", request_context.chat.language), banner=t("LOOKING_AT", request_context.chat.language),
header=f'⬇️ {document_holder.get_filename()}', header=f'⬇️ {filename}',
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language), tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
throttle_secs=throttle_secs throttle_secs=throttle_secs
) )
@ -264,7 +271,7 @@ class DownloadTask:
voting=not is_group_or_channel(self.request_context.chat.chat_id), voting=not is_group_or_channel(self.request_context.chat.chat_id),
) )
if self.document_holder.doi: if self.document_holder.doi:
await self.delivery_service.item_found( await self.delivery_service.found_item(
bot_name=request_context.bot_name, bot_name=request_context.bot_name,
doi=self.document_holder.doi, doi=self.document_holder.doi,
) )
@ -277,6 +284,7 @@ class DownloadTask:
document_holder=document_holder, document_holder=document_holder,
telegram_file_id=uploaded_message.file.id, telegram_file_id=uploaded_message.file.id,
file=file, file=file,
request_context=request_context,
)) ))
else: else:
request_context.statbox( request_context.statbox(
@ -425,7 +433,7 @@ class DownloadTask:
self.task.cancel() self.task.cancel()
await self.task await self.task
async def store_new_data(self, bot_name, document_holder, telegram_file_id, file): async def store_new_data(self, bot_name, document_holder, telegram_file_id, file, request_context):
document_pb = document_holder.document_pb document_pb = document_holder.document_pb
new_fields = [] new_fields = []
if self.delivery_service.should_store_hashes: if self.delivery_service.should_store_hashes:
@ -448,7 +456,11 @@ class DownloadTask:
and document_holder.index_alias == 'scimag' and document_holder.index_alias == 'scimag'
): ):
try: try:
processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file) processed_document = await ProcessedDocument.setup(
file,
grobid_client=self.application.grobid_client,
request_context=request_context,
)
new_fields += self.delivery_service.set_fields_from_processed(document_pb, processed_document) new_fields += self.delivery_service.set_fields_from_processed(document_pb, processed_document)
except BadRequestError as e: except BadRequestError as e:
self.request_context.statbox(action='unparsable_document') self.request_context.statbox(action='unparsable_document')

View File

@ -34,7 +34,10 @@ from nexus.views.telegram.base_holder import ScimagHolder
from telethon.errors import ChatAdminRequiredError from telethon.errors import ChatAdminRequiredError
from telethon.extensions import BinaryReader from telethon.extensions import BinaryReader
from .base import BaseHubService from .base import (
BaseHubService,
ProcessedDocument,
)
async def operation_log(document_operation_pb): async def operation_log(document_operation_pb):
@ -87,22 +90,48 @@ class PlainFile:
def fuzzy_compare(a, b): def fuzzy_compare(a, b):
if a is None or b is None:
return False
a = re.sub(r'[^a-z\d]', '', a.lower()) a = re.sub(r'[^a-z\d]', '', a.lower())
b = re.sub(r'[^a-z\d]', '', b.lower()) b = re.sub(r'[^a-z\d]', '', b.lower())
return SequenceMatcher(None, a, b).ratio() > 0.9 return SequenceMatcher(None, a, b).ratio() > 0.9
async def delayed_task(task, seconds):
await asyncio.sleep(seconds)
await task
class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubService): class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubService):
async def start(self): async def start(self):
submitter_service_pb2_grpc.add_SubmitterServicer_to_server(self, self.application.server) submitter_service_pb2_grpc.add_SubmitterServicer_to_server(self, self.application.server)
def wrap_request_file(self, request, request_context):
match str(request.WhichOneof('file')):
case 'plain':
return PlainFile(request.plain)
case 'telegram':
return TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram)
case _:
raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}")
async def retrieve_metadata(self, doi, title, session_id, request_context):
if doi:
meta_search_response = await self.application.meta_api_client.meta_search(
index_aliases=['scimag', ],
query=doi,
collectors=[{'top_docs': {'limit': 1}}],
session_id=session_id,
request_id=request_context.request_id,
user_id=str(request_context.chat.chat_id),
query_tags=['submitter'],
)
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
if len(scored_documents) == 1:
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
if title is not None and not fuzzy_compare(scimag_pb.title, title):
request_context.statbox(
action='mismatched_title',
doi=doi,
processed_title=title,
title=scimag_pb.title,
)
return None
return scimag_pb
@aiogrpc_request_wrapper(log=False) @aiogrpc_request_wrapper(log=False)
async def submit( async def submit(
self, self,
@ -125,17 +154,10 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
) )
buttons = None if request_context.is_group_mode() else [close_button()] buttons = None if request_context.is_group_mode() else [close_button()]
wrapped_file = self.wrap_request_file(request, request_context)
match str(request.WhichOneof('file')): if wrapped_file.size > 300 * 1024 * 1024:
case 'plain': request_context.error_log(FileTooBigError(size=wrapped_file.size))
file = PlainFile(request.plain)
case 'telegram':
file = TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram)
case _:
raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}")
if file.size > 30 * 1024 * 1024:
request_context.error_log(FileTooBigError(size=file.size))
request_context.statbox(action='file_too_big') request_context.statbox(action='file_too_big')
async with safe_execution(error_log=request_context.error_log): async with safe_execution(error_log=request_context.error_log):
await self.application.telegram_clients[request_context.bot_name].send_message( await self.application.telegram_clients[request_context.bot_name].send_message(
@ -149,109 +171,59 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
try: try:
processing_message = await self.application.telegram_clients[request_context.bot_name].send_message( processing_message = await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id, request_context.chat.chat_id,
t("PROCESSING_PAPER", request_context.chat.language).format(filename=file.filename), t("PROCESSING_PAPER", request_context.chat.language).format(filename=wrapped_file.filename),
reply_to=request.reply_to, reply_to=request.reply_to,
) )
except ChatAdminRequiredError: except ChatAdminRequiredError:
return submitter_service_pb2.SubmitResponse() return submitter_service_pb2.SubmitResponse()
try: try:
file_data = await file.read() file_data = await wrapped_file.read()
processed_document = None if not request.skip_analysis:
pdf_doi = None processed_document = await ProcessedDocument.setup(
pdf_title = None file_data,
try: grobid_client=self.application.grobid_client,
processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file_data) request_context=request_context,
pdf_doi = processed_document.get('doi')
pdf_title = processed_document.get('title')
request_context.add_default_fields(pdf_doi=pdf_doi)
except BadRequestError as e:
request_context.statbox(action='unparsable_document')
request_context.error_log(e)
if not request.doi_hint:
await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id,
t('UNPARSABLE_DOCUMENT_ERROR', request_context.chat.language).format(
filename=file.filename,
),
buttons=buttons,
reply_to=request.reply_to,
) )
return submitter_service_pb2.SubmitResponse() else:
processed_document = ProcessedDocument({})
if request.doi_hint and pdf_doi != request.doi_hint: if not processed_document.doi and not request.doi_hint:
request_context.statbox(action='mismatched_doi', doi_hint_priority=request.doi_hint_priority)
if request.doi_hint_priority:
pdf_doi = request.doi_hint
if not pdf_doi and not request.doi_hint:
request_context.statbox(action='unparsable_doi') request_context.statbox(action='unparsable_doi')
request_context.error_log(UnparsableDoiError()) request_context.error_log(UnparsableDoiError())
await self.application.telegram_clients[request_context.bot_name].send_message( await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id, request_context.chat.chat_id,
t('UNPARSABLE_DOI_ERROR', request_context.chat.language).format( t('UNPARSABLE_DOI_ERROR', request_context.chat.language).format(
filename=file.filename, filename=wrapped_file.filename,
), ),
buttons=buttons, buttons=buttons,
reply_to=request.reply_to, reply_to=request.reply_to,
) )
return submitter_service_pb2.SubmitResponse() return submitter_service_pb2.SubmitResponse()
scimag_pb = None scimag_pb = await self.retrieve_metadata(
processed_document.doi,
if pdf_doi: processed_document.title,
meta_search_response = await self.application.meta_api_client.meta_search(
index_aliases=['scimag',],
query=pdf_doi,
collectors=[{'top_docs': {'limit': 1}}],
session_id=session_id, session_id=session_id,
request_id=request_context.request_id, request_context=request_context,
user_id=str(request_context.chat.chat_id),
query_tags=['submitter'],
) )
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
if len(scored_documents) == 1:
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
if not fuzzy_compare(scimag_pb.title, pdf_title):
request_context.statbox(
action='mismatched_title',
doi=pdf_doi,
pdf_title=pdf_title,
title=scimag_pb.title,
)
scimag_pb = None
if not scimag_pb and request.doi_hint: if not scimag_pb and request.doi_hint:
meta_search_response = await self.application.meta_api_client.meta_search( scimag_pb = await self.retrieve_metadata(
index_aliases=['scimag', ], request.doi_hint,
query=request.doi_hint, processed_document.title,
collectors=[{'top_docs': {'limit': 1}}],
session_id=session_id, session_id=session_id,
request_id=request_context.request_id, request_context=request_context,
user_id=str(request_context.chat.chat_id),
query_tags=['submitter'],
) )
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
if len(scored_documents) == 1:
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
if not fuzzy_compare(scimag_pb.title, pdf_title):
request_context.statbox(
action='mismatched_title',
doi=request.doi_hint,
pdf_title=pdf_title,
title=scimag_pb.title,
)
# ToDo: add trust mechanics
if not scimag_pb: if not scimag_pb:
request_context.statbox(action='unavailable_metadata') request_context.statbox(action='unavailable_metadata')
request_context.error_log(UnavailableMetadataError(doi=pdf_doi)) request_context.error_log(UnavailableMetadataError(doi=processed_document.doi))
await self.application.telegram_clients[request_context.bot_name].send_message( await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id, request_context.chat.chat_id,
t( t(
'UNAVAILABLE_METADATA_ERROR', 'UNAVAILABLE_METADATA_ERROR',
language=request_context.chat.language language=request_context.chat.language
).format(doi=pdf_doi or request.doi_hint), ).format(doi=processed_document.doi or request.doi_hint),
buttons=buttons, buttons=buttons,
reply_to=request.reply_to, reply_to=request.reply_to,
) )
@ -260,10 +232,7 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
request_context.add_default_fields(doi=scimag_pb.doi, document_id=scimag_pb.id) request_context.add_default_fields(doi=scimag_pb.doi, document_id=scimag_pb.id)
try: try:
file_data = clean_metadata(file_data, doi=scimag_pb.doi) file_data = clean_metadata(file_data, doi=scimag_pb.doi)
request_context.statbox( request_context.statbox(action='cleaned', len=len(file_data))
action='cleaned',
len=len(file_data),
)
except ValueError as e: except ValueError as e:
request_context.error_log(e) request_context.error_log(e)
uploaded_message = await self.send_file( uploaded_message = await self.send_file(
@ -276,13 +245,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
if processed_document: if processed_document:
sharience_pb = sharience_pb2.Sharience( sharience_pb = sharience_pb2.Sharience(
abstract=processed_document.get('abstract', ''), abstract=processed_document.abstract or '',
content=processed_document.get('body', ''), content=processed_document.body or '',
parent_id=scimag_pb.id, parent_id=scimag_pb.id,
uploader_id=request.uploader_id or request_context.chat.chat_id, uploader_id=request.uploader_id or request_context.chat.chat_id,
updated_at=int(time.time()), updated_at=int(time.time()),
md5=hashlib.md5(file_data).hexdigest(), md5=hashlib.md5(file_data).hexdigest(),
filesize=file.size, filesize=wrapped_file.size,
ipfs_multihashes=await self.get_ipfs_hashes(file=file_data), ipfs_multihashes=await self.get_ipfs_hashes(file=file_data),
) )
update_sharience_pb = operation_pb2.DocumentOperation( update_sharience_pb = operation_pb2.DocumentOperation(
@ -313,13 +282,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
await operation_log(store_telegram_file_id_operation_pb) await operation_log(store_telegram_file_id_operation_pb)
request_context.statbox(action='successfully_stored') request_context.statbox(action='successfully_stored')
if file.message_id: if wrapped_file.message_id:
async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG): async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG):
await self.application.telegram_clients[request_context.bot_name].delete_messages( await self.application.telegram_clients[request_context.bot_name].delete_messages(
request_context.chat.chat_id, request_context.chat.chat_id,
file.message_id, wrapped_file.message_id,
) )
await self.item_found(bot_name=request_context.bot_name, doi=scimag_pb.doi) await self.found_item(bot_name=request_context.bot_name, doi=scimag_pb.doi)
finally: finally:
await processing_message.delete() await processing_message.delete()
return submitter_service_pb2.SubmitResponse() return submitter_service_pb2.SubmitResponse()

View File

@ -45,7 +45,7 @@ from nexus.meta_api.word_transformers import (
) )
def create_query_transformer(valid_fields, invalid_fields): def create_query_transformer(valid_fields, invalid_fields, order_by_valid_fields):
return QueryProcessor( return QueryProcessor(
tree_transformers=[ tree_transformers=[
OrderByTreeTransformer( OrderByTreeTransformer(
@ -55,14 +55,7 @@ def create_query_transformer(valid_fields, invalid_fields):
'pr': 'page_rank', 'pr': 'page_rank',
'refc': 'referenced_by_count', 'refc': 'referenced_by_count',
}, },
valid_fields=frozenset([ valid_fields=frozenset(order_by_valid_fields)
'id',
'referenced_by_count',
'issued_at',
'page_rank',
'pages',
'updated_at'
])
), ),
FieldTreeTransformer( FieldTreeTransformer(
field_aliases={ field_aliases={
@ -135,20 +128,35 @@ class GrpcServer(AioGrpcServer):
'doi', 'ipfs_multihashes', 'issns', 'isbns', 'issued_at', 'language', 'original_id', 'doi', 'ipfs_multihashes', 'issns', 'isbns', 'issued_at', 'language', 'original_id',
'page_rank', 'referenced_by_count', 'references', 'tags', 'title', 'year', 'page_rank', 'referenced_by_count', 'references', 'tags', 'title', 'year',
} }
order_by_scimag_fields = {
'id',
'referenced_by_count',
'issued_at',
'page_rank',
'updated_at'
}
scitech_fields = { scitech_fields = {
'id', 'authors', 'doi', 'description', 'extension', 'id', 'authors', 'doi', 'description', 'extension',
'ipfs_multihashes', 'isbns', 'issued_at', 'language', 'original_id', 'pages', 'ipfs_multihashes', 'isbns', 'issued_at', 'language', 'original_id', 'pages',
'tags', 'title', 'updated_at', 'year', 'tags', 'title', 'updated_at', 'year',
} }
order_by_scitech_fields = {
'id',
'issued_at',
'pages',
'updated_at'
}
self.query_transformers = { self.query_transformers = {
'scimag': create_query_transformer( 'scimag': create_query_transformer(
valid_fields=scimag_fields, valid_fields=scimag_fields,
invalid_fields=scitech_fields.difference(scimag_fields), invalid_fields=scitech_fields.difference(scimag_fields),
order_by_valid_fields=order_by_scimag_fields,
), ),
'scitech': create_query_transformer( 'scitech': create_query_transformer(
valid_fields=scitech_fields, valid_fields=scitech_fields,
invalid_fields=scimag_fields.difference(scitech_fields), invalid_fields=scimag_fields.difference(scitech_fields),
order_by_valid_fields=order_by_scitech_fields,
) )
} }
self.summa_client = SummaClient( self.summa_client = SummaClient(

View File

@ -31,7 +31,6 @@ class BaseService(LibraryBaseService):
new_scored_documents = [] new_scored_documents = []
for scored_document in scored_documents: for scored_document in scored_documents:
document = json.loads(scored_document.document) document = json.loads(scored_document.document)
document = self.enrich_document_with_stat_provider(document)
new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument( new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument(
typed_document=typed_document_pb2.TypedDocument( typed_document=typed_document_pb2.TypedDocument(
**{scored_document.index_alias: self.pb_registry[scored_document.index_alias](**document)}, **{scored_document.index_alias: self.pb_registry[scored_document.index_alias](**document)},
@ -46,7 +45,6 @@ class BaseService(LibraryBaseService):
new_scored_documents = [] new_scored_documents = []
for position, document in enumerate(documents): for position, document in enumerate(documents):
document = json.loads(document) document = json.loads(document)
document = self.enrich_document_with_stat_provider(document)
new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument( new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument(
typed_document=TypedDocumentPb( typed_document=TypedDocumentPb(
**{index_alias: self.pb_registry[index_alias](**document)}, **{index_alias: self.pb_registry[index_alias](**document)},
@ -55,14 +53,3 @@ class BaseService(LibraryBaseService):
score=1.0, score=1.0,
)) ))
return new_scored_documents return new_scored_documents
def enrich_document_with_stat_provider(self, document):
if self.stat_provider:
original_id = (
document.get('original_id')
or document['id']
)
download_stats = self.stat_provider.get_download_stats(original_id)
if download_stats and download_stats.downloads_count:
document['downloads_count'] = download_stats.downloads_count
return document

View File

@ -57,19 +57,6 @@ class DocumentsService(DocumentsServicer, BaseService):
document_pb = getattr(typed_document_pb, typed_document_pb.WhichOneof('document')) document_pb = getattr(typed_document_pb, typed_document_pb.WhichOneof('document'))
if hasattr(document_pb, 'original_id') and document_pb.original_id:
original_document_pb = await self.get_document(
index_alias=request.index_alias,
document_id=document_pb.original_id,
context=context,
request_id=metadata['request-id'],
session_id=metadata['session-id'],
)
for to_remove in ('doi', 'fiction_id', 'filesize', 'libgen_id',):
original_document_pb.ClearField(to_remove)
original_document_pb.MergeFrom(document_pb)
document_pb = original_document_pb
logging.getLogger('query').info({ logging.getLogger('query').info({
'action': 'get', 'action': 'get',
'cache_hit': False, 'cache_hit': False,

View File

@ -270,6 +270,10 @@ class SearchService(SearchServicer, BaseService):
@aiogrpc_request_wrapper(log=False) @aiogrpc_request_wrapper(log=False)
async def search(self, request, context, metadata): async def search(self, request, context, metadata):
preprocessed_query = await self.process_query(query=request.query, languages=request.language, context=context) preprocessed_query = await self.process_query(query=request.query, languages=request.language, context=context)
logging.getLogger('debug').info({
'action': 'preprocess_query',
'preprocessed_query': str(preprocessed_query),
})
index_aliases = self.resolve_index_aliases( index_aliases = self.resolve_index_aliases(
request_index_aliases=request.index_aliases, request_index_aliases=request.index_aliases,
processed_query=preprocessed_query, processed_query=preprocessed_query,
@ -280,19 +284,27 @@ class SearchService(SearchServicer, BaseService):
right_offset = left_offset + page_size right_offset = left_offset + page_size
search_requests = [] search_requests = []
processed_queries = {}
for index_alias in index_aliases: for index_alias in index_aliases:
processed_query = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query) processed_queries[index_alias] = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query)
logging.getLogger('debug').info({
'action': 'process_query',
'index_alias': index_alias,
'processed_query': str(processed_queries[index_alias]),
'order_by': processed_queries[index_alias].context.order_by,
'has_invalid_fields': processed_queries[index_alias].context.has_invalid_fields,
})
search_requests.append( search_requests.append(
SearchRequest( SearchRequest(
index_alias=index_alias, index_alias=index_alias,
query=processed_query.to_summa_query(), query=processed_queries[index_alias].to_summa_query(),
collectors=[ collectors=[
search_service_pb2.Collector( search_service_pb2.Collector(
top_docs=search_service_pb2.TopDocsCollector( top_docs=search_service_pb2.TopDocsCollector(
limit=50, limit=50,
scorer=self.scorer(processed_query, index_alias), scorer=self.scorer(processed_queries[index_alias], index_alias),
snippets=self.snippets[index_alias], snippets=self.snippets[index_alias],
explain=processed_query.context.explain, explain=processed_queries[index_alias].context.explain,
) )
), ),
search_service_pb2.Collector(count=search_service_pb2.CountCollector()) search_service_pb2.Collector(count=search_service_pb2.CountCollector())
@ -322,9 +334,9 @@ class SearchService(SearchServicer, BaseService):
meta_search_response.collector_outputs[0].top_docs.scored_documents, meta_search_response.collector_outputs[0].top_docs.scored_documents,
) )
has_next = len(new_scored_documents) > right_offset has_next = len(new_scored_documents) > right_offset
if 'scimag' in index_aliases: if 'scimag' in processed_queries:
await self.check_if_need_new_documents_by_dois( await self.check_if_need_new_documents_by_dois(
requested_dois=processed_query.context.dois, requested_dois=processed_queries['scimag'].context.dois,
scored_documents=new_scored_documents, scored_documents=new_scored_documents,
should_request=attempt.retry_state.attempt_number == 1 should_request=attempt.retry_state.attempt_number == 1
) )
@ -333,7 +345,7 @@ class SearchService(SearchServicer, BaseService):
scored_documents=new_scored_documents[left_offset:right_offset], scored_documents=new_scored_documents[left_offset:right_offset],
has_next=has_next, has_next=has_next,
count=meta_search_response.collector_outputs[1].count.count, count=meta_search_response.collector_outputs[1].count.count,
query_language=processed_query.context.query_language, query_language=preprocessed_query.context.query_language,
) )
return search_response_pb return search_response_pb

View File

@ -6,7 +6,7 @@ pipe:
kafka-0.example.net kafka-0.example.net
schema: schema:
- consumers: - consumers:
- class: nexus.pipe.consumers.CrossReferencesBulkConsumer - class: nexus.pipe.consumers.CrossReferencesConsumer
topics: topics:
- name: cross_references - name: cross_references
workers: 4 workers: 4

View File

@ -1,17 +1,11 @@
from .cross_references_consumer import ( from .cross_references_consumer import CrossReferencesConsumer
CrossReferencesBulkConsumer,
CrossReferencesConsumer,
)
from .document_operations_consumer import ( from .document_operations_consumer import (
DocumentOperationsBulkConsumer,
DocumentOperationsConsumer, DocumentOperationsConsumer,
DocumentOperationsJsonConsumer, DocumentOperationsJsonConsumer,
) )
__all__ = [ __all__ = [
'CrossReferencesBulkConsumer',
'CrossReferencesConsumer', 'CrossReferencesConsumer',
'DocumentOperationsConsumer', 'DocumentOperationsConsumer',
'DocumentOperationsBulkConsumer',
'DocumentOperationsJsonConsumer', 'DocumentOperationsJsonConsumer',
] ]

View File

@ -66,9 +66,10 @@ class BaseConsumer(AioRootThing):
except (ConflictError, InterruptProcessing) as e: except (ConflictError, InterruptProcessing) as e:
logging.getLogger('statbox').info(e) logging.getLogger('statbox').info(e)
except (asyncio.CancelledError, ConsumerStoppedError): except (asyncio.CancelledError, ConsumerStoppedError):
pass return
finally: finally:
await consumer.stop() await consumer.stop()
asyncio.create_task(self.stop())
async def start(self): async def start(self):
logging.getLogger('statbox').info({ logging.getLogger('statbox').info({
@ -87,8 +88,10 @@ class BaseConsumer(AioRootThing):
async def stop(self): async def stop(self):
if self.consumer_task: if self.consumer_task:
self.consumer_task.cancel() consumer_task = self.consumer_task
await self.consumer_task self.consumer_task = None
consumer_task.cancel()
await consumer_task
class BasePbConsumer(BaseConsumer): class BasePbConsumer(BaseConsumer):
@ -108,52 +111,3 @@ class BaseJsonConsumer(BaseConsumer):
message = json.loads(msg.value) message = json.loads(msg.value)
ParseDict(message, pb, ignore_unknown_fields=True) ParseDict(message, pb, ignore_unknown_fields=True)
return pb return pb
class BaseBulkConsumer(BaseConsumer):
auto_commit = False
bulk_size = 20
timeout = 1
async def consume(self, consumer):
try:
while self.started:
try:
result = await consumer.getmany(timeout_ms=self.timeout * 1000, max_records=self.bulk_size)
except (ConsumerStoppedError, asyncio.CancelledError):
break
collector = []
for tp, messages in result.items():
if messages:
for message in messages:
preprocessed_msg = self.preprocess(message)
if preprocessed_msg:
collector.append(preprocessed_msg)
for processor in self.processors:
filtered = filter(processor.filter, collector)
try:
await processor.process_bulk(filtered)
except InterruptProcessing as e:
logging.getLogger('statbox').info(e)
try:
await consumer.commit()
except CommitFailedError as e:
error_log(e)
continue
finally:
await consumer.stop()
async def start(self):
logging.getLogger('statbox').info({
'action': 'start',
'group_id': self.group_id,
'topic_names': self.topic_names,
})
consumer = self.create_consumer()
await consumer.start()
logging.getLogger('statbox').info({
'action': 'started',
'group_id': self.group_id,
'topic_names': self.topic_names,
})
self.consumer_task = asyncio.create_task(self.consume(consumer))

View File

@ -1,15 +1,8 @@
from nexus.models.proto.operation_pb2 import \ from nexus.models.proto.operation_pb2 import \
CrossReferenceOperation as CrossReferenceOperationPb CrossReferenceOperation as CrossReferenceOperationPb
from .base import ( from .base import BasePbConsumer
BaseBulkConsumer,
BasePbConsumer,
)
class CrossReferencesConsumer(BasePbConsumer): class CrossReferencesConsumer(BasePbConsumer):
pb_class = CrossReferenceOperationPb pb_class = CrossReferenceOperationPb
class CrossReferencesBulkConsumer(BaseBulkConsumer, CrossReferencesConsumer):
pass

View File

@ -2,7 +2,6 @@ from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb DocumentOperation as DocumentOperationPb
from .base import ( from .base import (
BaseBulkConsumer,
BaseJsonConsumer, BaseJsonConsumer,
BasePbConsumer, BasePbConsumer,
) )
@ -14,7 +13,3 @@ class DocumentOperationsConsumer(BasePbConsumer):
class DocumentOperationsJsonConsumer(BaseJsonConsumer): class DocumentOperationsJsonConsumer(BaseJsonConsumer):
pb_class = DocumentOperationPb pb_class = DocumentOperationPb
class DocumentOperationsBulkConsumer(BaseBulkConsumer, DocumentOperationsConsumer):
pass

View File

@ -22,12 +22,14 @@ class PylonClient(AioThing):
source_configs: Optional[List], source_configs: Optional[List],
proxies: Optional[List[str]] = None, proxies: Optional[List[str]] = None,
downloads_directory: Optional[str] = None, downloads_directory: Optional[str] = None,
default_driver_proxy_list: [Optional[List]] = None default_driver_proxy_list: [Optional[List]] = None,
default_resolver_proxy_list: [Optional[List]] = None,
): ):
super().__init__() super().__init__()
self.proxy_manager = ProxyManager(proxies) self.proxy_manager = ProxyManager(proxies)
self.downloads_directory = downloads_directory self.downloads_directory = downloads_directory
self.default_driver_proxy_list = default_driver_proxy_list self.default_driver_proxy_list = default_driver_proxy_list
self.default_resolver_proxy_list = default_resolver_proxy_list
self.sources = [] self.sources = []
for source_config in source_configs: for source_config in source_configs:
source = Source.from_config( source = Source.from_config(
@ -35,6 +37,7 @@ class PylonClient(AioThing):
source_config=source_config, source_config=source_config,
downloads_directory=downloads_directory, downloads_directory=downloads_directory,
default_driver_proxy_list=default_driver_proxy_list, default_driver_proxy_list=default_driver_proxy_list,
default_resolver_proxy_list=default_resolver_proxy_list,
) )
self.sources.append(source) self.sources.append(source)
self.starts.append(source) self.starts.append(source)

View File

@ -138,6 +138,7 @@ pylon:
resolver: resolver:
args: args:
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true' format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
resolve_timeout: 25.0
selector: '(.resource.primary.URL | split("/"))[-1]' selector: '(.resource.primary.URL | split("/"))[-1]'
timeout: 40.0 timeout: 40.0
class: nexus.pylon.resolvers.DoiOrgRequestResolver class: nexus.pylon.resolvers.DoiOrgRequestResolver

View File

@ -26,7 +26,7 @@ class BaseDriver(NetworkAgent):
validator_cls = validator['class'] validator_cls = validator['class']
validator_cls = import_object(validator_cls) validator_cls = import_object(validator_cls)
self.validator = validator_cls or BaseValidator self.validator = validator_cls
self.downloads_directory = downloads_directory self.downloads_directory = downloads_directory
def __str__(self): def __str__(self):

View File

@ -3,7 +3,6 @@ import json
import logging import logging
import os.path import os.path
import shutil import shutil
import sys
import time import time
from pathlib import Path from pathlib import Path
from typing import ( from typing import (

View File

@ -1,4 +1,5 @@
import re import re
import sys
class Matcher: class Matcher:

View File

@ -11,6 +11,11 @@ import jq
from nexus.pylon.prepared_request import PreparedRequest from nexus.pylon.prepared_request import PreparedRequest
from nexus.pylon.proxy_manager import ProxyManager from nexus.pylon.proxy_manager import ProxyManager
from nexus.pylon.resolvers.base import BaseResolver from nexus.pylon.resolvers.base import BaseResolver
from tenacity import (
retry,
stop_after_attempt,
wait_random,
)
class DoiOrgRequestResolver(BaseResolver): class DoiOrgRequestResolver(BaseResolver):
@ -24,6 +29,7 @@ class DoiOrgRequestResolver(BaseResolver):
proxy_manager: Optional[ProxyManager] = None, proxy_manager: Optional[ProxyManager] = None,
): ):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager) super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.text_selector = selector
self.selector = jq.compile(selector) self.selector = jq.compile(selector)
self.format_string = format_string self.format_string = format_string
self.timeout = timeout self.timeout = timeout
@ -32,6 +38,11 @@ class DoiOrgRequestResolver(BaseResolver):
def __str__(self): def __str__(self):
return f'{self.__class__.__name__}(selector = {self.selector}, format_string = {self.format_string})' return f'{self.__class__.__name__}(selector = {self.selector}, format_string = {self.format_string})'
@retry(
reraise=True,
wait=wait_random(min=1, max=3),
stop=stop_after_attempt(4),
)
async def resolve_through_doi_org(self, params): async def resolve_through_doi_org(self, params):
async with self.get_session() as session: async with self.get_session() as session:
doi_url = f'https://doi.org/{params["doi"]}' doi_url = f'https://doi.org/{params["doi"]}'
@ -51,6 +62,7 @@ class DoiOrgRequestResolver(BaseResolver):
logging.getLogger('error').error({ logging.getLogger('error').error({
'action': 'error', 'action': 'error',
'mode': 'pylon', 'mode': 'pylon',
'params': params,
'error': str(e) 'error': str(e)
}) })
return return
@ -60,3 +72,11 @@ class DoiOrgRequestResolver(BaseResolver):
url=self.format_string.format(selected=selected), url=self.format_string.format(selected=selected),
timeout=self.timeout, timeout=self.timeout,
) )
else:
logging.getLogger('debug').error({
'action': 'missed_selector',
'mode': 'pylon',
'params': params,
'selector': self.text_selector,
'format_string': self.format_string,
})

View File

@ -33,13 +33,17 @@ class Source(AioThing):
source_config, source_config,
downloads_directory: str, downloads_directory: str,
default_driver_proxy_list: List, default_driver_proxy_list: List,
default_resolver_proxy_list: List,
) -> 'Source': ) -> 'Source':
matcher = Matcher(source_config['matcher']) matcher = Matcher(source_config['matcher'])
resolver_cls = import_object( resolver_cls = import_object(
source_config.get('resolver', {}).get('class', 'nexus.pylon.resolvers.TemplateResolver') source_config.get('resolver', {}).get('class', 'nexus.pylon.resolvers.TemplateResolver')
) )
resolver_args = dict(proxy_manager=proxy_manager) resolver_args = dict(
proxy_manager=proxy_manager,
proxy_list=default_resolver_proxy_list,
)
resolver_args.update(**source_config.get('resolver', {}).get('args', {})) resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
resolver = resolver_cls(**resolver_args) resolver = resolver_cls(**resolver_args)

View File

@ -4,7 +4,7 @@
en: en:
ABOUT_US: | ABOUT_US: |
**About us** **About us**
Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Uwe of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization. Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Use of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization.
Why printing provoked such dramatic changes in society? Why printing provoked such dramatic changes in society?
Sane beings and knowledge live in a natural symbiosis: knowledge are fruiting and growing while circulated among people and people do the same. We may even note that complexity of human civilization, its achievements and technological progress during all its history have been correlated with the amount of accumulated knowledge and possibility to share and navigate through it. Sane beings and knowledge live in a natural symbiosis: knowledge are fruiting and growing while circulated among people and people do the same. We may even note that complexity of human civilization, its achievements and technological progress during all its history have been correlated with the amount of accumulated knowledge and possibility to share and navigate through it.
@ -14,7 +14,7 @@ en:
However, there is one remaining issue: knowledge is usurped by large publishers, corporations such as Elsevier, Springer, Wiley, T&F etc. Held by copyright, the knowledge remains out of reach for many researchers and for many enterprises that could launch the new Age of Enlightenment. However, there is one remaining issue: knowledge is usurped by large publishers, corporations such as Elsevier, Springer, Wiley, T&F etc. Held by copyright, the knowledge remains out of reach for many researchers and for many enterprises that could launch the new Age of Enlightenment.
Knowledge is not just usurped, an entire system has been built which is putting in chains researchers, academia and readers: Knowledge is not just usurped, an entire system has been built which is putting in chains researchers, academia and readers:
- Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact. Everybody knows who owns journals of high "impact" - Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact
- Publishers make authors to pay for publishing tax-funded researches, then readers to pay for access to published tax-funded researches, while peer-reviewing is done for free by other researchers and this activity is considered as "honorable" in academia - Publishers make authors to pay for publishing tax-funded researches, then readers to pay for access to published tax-funded researches, while peer-reviewing is done for free by other researchers and this activity is considered as "honorable" in academia
- Publishers such as Elsevier repeatedly carried and carry campaigns against open access - Publishers such as Elsevier repeatedly carried and carry campaigns against open access

View File

@ -17,6 +17,7 @@ class DocumentListWidget:
document_holders: List[BaseHolder], document_holders: List[BaseHolder],
bot_name, bot_name,
header: Optional[str] = None, header: Optional[str] = None,
promotioner=None,
has_next: bool = False, has_next: bool = False,
session_id: Optional[str] = None, session_id: Optional[str] = None,
message_id: Optional[int] = None, message_id: Optional[int] = None,
@ -29,6 +30,7 @@ class DocumentListWidget:
self.document_holders = document_holders self.document_holders = document_holders
self.bot_name = bot_name self.bot_name = bot_name
self.header = header self.header = header
self.promotioner = promotioner
self.cmd = cmd self.cmd = cmd
self.has_next = has_next self.has_next = has_next
self.session_id = session_id self.session_id = session_id
@ -57,6 +59,10 @@ class DocumentListWidget:
if self.header: if self.header:
serp = f'**{self.header}**\n\n{serp}' serp = f'**{self.header}**\n\n{serp}'
promotion_language = self.chat.language
promo = self.promotioner.choose_promotion(promotion_language)
serp = f'{serp}\n\n{promo}\n'
buttons = [] buttons = []
if self.cmd and self.message_id and self.session_id and (self.has_next or self.page > 0): if self.cmd and self.message_id and self.session_id and (self.has_next or self.page > 0):
buttons = [ buttons = [

View File

@ -100,9 +100,9 @@ class ProgressBar:
async def show_banner(self): async def show_banner(self):
return await self.send_message(await self.render_banner(), ignore_last_call=True) return await self.send_message(await self.render_banner(), ignore_last_call=True)
async def callback(self, done, total): async def callback(self, done, total, ignore_last_call=False):
self._set_progress(done, total) self._set_progress(done, total)
return await self.send_message(await self.render_progress()) return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
class ThrottlerWrapper: class ThrottlerWrapper:

View File

@ -139,7 +139,7 @@ spacy-loggers==1.0.3
SQLAlchemy==1.4.39 SQLAlchemy==1.4.39
sqlparse==0.4.2 sqlparse==0.4.2
srsly==2.4.4 srsly==2.4.4
Telethon==1.24.0 Telethon==1.25.0
tenacity==8.0.1 tenacity==8.0.1
termcolor==1.1.0 termcolor==1.1.0
textblob==0.17.1 textblob==0.17.1