mirror of
https://github.com/nexus-stc/hyperboria
synced 2025-01-11 11:16:10 +01:00
- [nexus] Development
GitOrigin-RevId: ccfd55db266862ed70f1299aaf62500765b03cc4
This commit is contained in:
parent
51ae1fc5d9
commit
7c1bb06b1b
@ -39,6 +39,7 @@ class BaseTelegramClient(AioThing):
|
||||
bot_token: Optional[str] = None,
|
||||
mtproxy: Optional[dict] = None,
|
||||
flood_sleep_threshold: int = 60,
|
||||
catch_up: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
if not app_id or not app_hash:
|
||||
@ -50,6 +51,7 @@ class BaseTelegramClient(AioThing):
|
||||
self._get_session(database),
|
||||
app_id,
|
||||
app_hash,
|
||||
catch_up=catch_up,
|
||||
flood_sleep_threshold=flood_sleep_threshold,
|
||||
**self._get_proxy(mtproxy=mtproxy),
|
||||
)
|
||||
|
@ -5,9 +5,17 @@ class Promotioner:
|
||||
"""
|
||||
Promotioner is used to select promotion randomly based on weights of every promotion.
|
||||
"""
|
||||
def __init__(self, promotions: list[dict], default_promotion_index: int = 0):
|
||||
def __init__(
|
||||
self,
|
||||
promotions: list[dict],
|
||||
default_promotion_index: int = 0,
|
||||
promotion_vars: dict = None,
|
||||
):
|
||||
self.promotions = promotions
|
||||
self.default_promotion_index = default_promotion_index
|
||||
if not promotion_vars:
|
||||
promotion_vars = {}
|
||||
self.promotion_vars = promotion_vars
|
||||
self.partial_sums: list = [self.promotions[0]['weight']]
|
||||
for promotion in self.promotions[1:]:
|
||||
self.partial_sums.append(promotion['weight'] + self.partial_sums[-1])
|
||||
@ -18,11 +26,11 @@ class Promotioner:
|
||||
if partial_sum <= pivot:
|
||||
continue
|
||||
if language in promotion['texts']:
|
||||
return promotion['texts'][language]
|
||||
return promotion['texts'][language].format(**self.promotion_vars)
|
||||
elif promotion.get('local', False):
|
||||
default_promotion = self.promotions[self.default_promotion_index]
|
||||
if language in default_promotion['texts']:
|
||||
return default_promotion['texts'][language]
|
||||
return default_promotion['texts']['en']
|
||||
return default_promotion['texts'][language].format(**self.promotion_vars)
|
||||
return default_promotion['texts']['en'].format(**self.promotion_vars)
|
||||
else:
|
||||
return promotion['texts']['en']
|
||||
return promotion['texts']['en'].format(**self.promotion_vars)
|
@ -19,6 +19,7 @@ async def safe_execution(
|
||||
error_log=error_log,
|
||||
on_fail: Optional[Callable[[], Awaitable]] = None,
|
||||
level=logging.WARNING,
|
||||
is_logging_enabled: bool = True
|
||||
):
|
||||
try:
|
||||
try:
|
||||
@ -34,13 +35,17 @@ async def safe_execution(
|
||||
errors.MessageIdInvalidError,
|
||||
errors.ChatAdminRequiredError,
|
||||
) as e:
|
||||
error_log(e, level=level)
|
||||
if is_logging_enabled:
|
||||
error_log(e, level=level)
|
||||
traceback.print_exc()
|
||||
except Exception as e:
|
||||
error_log(e, level=level)
|
||||
traceback.print_exc()
|
||||
if is_logging_enabled:
|
||||
error_log(e, level=level)
|
||||
traceback.print_exc()
|
||||
if on_fail:
|
||||
await on_fail()
|
||||
except events.StopPropagation:
|
||||
raise
|
||||
except Exception as e:
|
||||
error_log(e, level=level)
|
||||
if is_logging_enabled:
|
||||
error_log(e, level=level)
|
||||
|
@ -76,6 +76,11 @@ def clean_issns(issns):
|
||||
return cleaned_issns
|
||||
|
||||
|
||||
def clean_isbns(isbns):
|
||||
if isbns:
|
||||
return isbns
|
||||
|
||||
|
||||
def extract_title(title, subtitle):
|
||||
return ': '.join(filter(lambda x: bool(x), [title.strip(), subtitle.strip()]))
|
||||
|
||||
@ -90,6 +95,7 @@ class ToScimagPbAction(BaseAction):
|
||||
doi=item['DOI'],
|
||||
issue=item.get('issue'),
|
||||
issns=clean_issns(item.get('ISSN')),
|
||||
isbns=clean_isbns(item.get('ISBN')),
|
||||
language=item.get('language'),
|
||||
referenced_by_count=item.get('is-referenced-by-count'),
|
||||
references=extract_references(item.get('reference')),
|
||||
|
@ -42,6 +42,8 @@ py3_image(
|
||||
"//nexus/hub/aioclient",
|
||||
"//nexus/meta_api/aioclient",
|
||||
"//nexus/models/proto:proto_py",
|
||||
"//nexus/promotions",
|
||||
"//nexus/translations",
|
||||
"//nexus/views/telegram",
|
||||
requirement("izihawa_nlptools"),
|
||||
requirement("izihawa_utils"),
|
||||
|
@ -2,10 +2,11 @@ from aiokit import AioRootThing
|
||||
from idm.api.aioclient import IdmApiGrpcClient
|
||||
from izihawa_utils.importlib import import_object
|
||||
from library.telegram.base import BaseTelegramClient
|
||||
from nexus.bot.promotioner import Promotioner
|
||||
from library.telegram.promotioner import Promotioner
|
||||
from nexus.bot.user_manager import UserManager
|
||||
from nexus.hub.aioclient import HubGrpcClient
|
||||
from nexus.meta_api.aioclient import MetaApiGrpcClient
|
||||
from nexus.promotions import get_promotions
|
||||
|
||||
|
||||
class TelegramApplication(AioRootThing):
|
||||
@ -18,6 +19,7 @@ class TelegramApplication(AioRootThing):
|
||||
bot_token=self.config['telegram']['bot_token'],
|
||||
database=self.config['telegram'].get('database'),
|
||||
mtproxy=self.config['telegram'].get('mtproxy'),
|
||||
catch_up=self.config['telegram'].get('catch_up', False)
|
||||
)
|
||||
self.hub_client = HubGrpcClient(endpoint=self.config['hub']['endpoint'])
|
||||
self.starts.append(self.hub_client)
|
||||
@ -28,7 +30,13 @@ class TelegramApplication(AioRootThing):
|
||||
self.meta_api_client = MetaApiGrpcClient(endpoint=self.config['meta_api']['endpoint'])
|
||||
self.starts.append(self.meta_api_client)
|
||||
|
||||
self.promotioner = Promotioner(promotions=self.config['promotions'])
|
||||
self.promotioner = Promotioner(
|
||||
promotions=get_promotions(),
|
||||
promotion_vars=dict(
|
||||
related_channel=self.config['telegram']['related_channel'],
|
||||
twitter_contact_url=self.config['twitter']['contact_url'],
|
||||
)
|
||||
)
|
||||
self.user_manager = UserManager()
|
||||
self._handlers = []
|
||||
|
||||
|
@ -7,7 +7,6 @@ def get_config():
|
||||
'nexus/bot/configs/base.yaml',
|
||||
'nexus/bot/configs/%s.yaml?' % env.type,
|
||||
'nexus/bot/configs/logging.yaml',
|
||||
'nexus/bot/configs/promotions.yaml',
|
||||
], env_prefix='NEXUS_BOT')
|
||||
|
||||
|
||||
|
@ -1,36 +0,0 @@
|
||||
---
|
||||
|
||||
promotions:
|
||||
- texts:
|
||||
en: 💬 The victory of humanity is inevitable
|
||||
weight: 1
|
||||
- texts:
|
||||
en: 💬 Shall build Standard Template Construct
|
||||
weight: 1
|
||||
- texts:
|
||||
en: 💬 Gaining knowledge is the only purpose of life
|
||||
weight: 1
|
||||
- texts:
|
||||
en: 💬 Knowledge cannot belong
|
||||
weight: 1
|
||||
- texts:
|
||||
en: 💬 Obey the path of discovery
|
||||
weight: 1
|
||||
- texts:
|
||||
en: 💬 Research is the only and ultimate goal
|
||||
weight: 1
|
||||
- texts:
|
||||
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/nexus_aaron)
|
||||
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/nexus_aaron)
|
||||
weight: 25
|
||||
- texts:
|
||||
en: ✋ Help us, become a seeder of books. Learn how in /seed
|
||||
ru: ✋ Сохрани наследие, раздавай книги нуждающимся. Узнай как в /seed
|
||||
weight: 25
|
||||
- texts:
|
||||
en: ⤴️ Stay tuned with us at @{related_channel} and [Twitter]({twitter_contact_url})
|
||||
es: ⤴️ Mantente en contacto con nosotros en @{related_channel} y [Twitter]({twitter_contact_url})
|
||||
it: ⤴️ Resta aggiornato con noi su @{related_channel} e [Twitter]({twitter_contact_url})
|
||||
pb: ⤴️ Fique ligado conosco em @{related_channel} e [Twitter]({twitter_contact_url})
|
||||
ru: ⤴️ Оставайся на связи с нами на @{related_channel} и в [Twitter]({twitter_contact_url})
|
||||
weight: 25
|
@ -2,6 +2,7 @@ import asyncio
|
||||
import time
|
||||
|
||||
from library.telegram.base import RequestContext
|
||||
from library.telegram.utils import safe_execution
|
||||
from nexus.translations import t
|
||||
from telethon import events
|
||||
|
||||
@ -36,5 +37,6 @@ class CloseHandler(BaseCallbackQueryHandler):
|
||||
target_events.append(reply_message.delete())
|
||||
target_events.append(message.delete())
|
||||
else:
|
||||
target_events.append(event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE')))
|
||||
async with safe_execution(is_logging_enabled=False):
|
||||
await event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE'))
|
||||
await asyncio.gather(*target_events)
|
||||
|
@ -1,4 +1,5 @@
|
||||
from library.telegram.base import RequestContext
|
||||
from library.telegram.utils import safe_execution
|
||||
from nexus.hub.proto.delivery_service_pb2 import \
|
||||
StartDeliveryResponse as StartDeliveryResponsePb
|
||||
from nexus.translations import t
|
||||
@ -45,14 +46,16 @@ class DownloadHandler(BaseCallbackQueryHandler):
|
||||
bot_name=request_context.bot_name,
|
||||
)
|
||||
if start_delivery_response_pb.status == StartDeliveryResponsePb.Status.ALREADY_DOWNLOADING:
|
||||
await event.answer(
|
||||
f'{t("ALREADY_DOWNLOADING", request_context.chat.language)}',
|
||||
)
|
||||
async with safe_execution(is_logging_enabled=False):
|
||||
await event.answer(
|
||||
f'{t("ALREADY_DOWNLOADING", request_context.chat.language)}',
|
||||
)
|
||||
await remove_button(event, '⬇️', and_empty_too=True)
|
||||
elif start_delivery_response_pb.status == StartDeliveryResponsePb.Status.TOO_MANY_DOWNLOADS:
|
||||
await event.answer(
|
||||
f'{t("TOO_MANY_DOWNLOADS", request_context.chat.language)}',
|
||||
)
|
||||
async with safe_execution(is_logging_enabled=False):
|
||||
await event.answer(
|
||||
f'{t("TOO_MANY_DOWNLOADS", request_context.chat.language)}',
|
||||
)
|
||||
else:
|
||||
await remove_button(event, '⬇️', and_empty_too=True)
|
||||
self.application.user_manager.last_widget[request_context.chat.chat_id] = None
|
||||
|
@ -61,7 +61,7 @@ class ProfileHandler(BaseHandler):
|
||||
target_events.append(profile_reply_message.reply(rendered_widget, buttons=buttons, link_preview=False))
|
||||
else:
|
||||
target_events.append(event.reply(rendered_widget, buttons=buttons, link_preview=False))
|
||||
return asyncio.gather(*target_events)
|
||||
return await asyncio.gather(*target_events)
|
||||
|
||||
|
||||
class DigestHandler(BaseHandler):
|
||||
@ -114,6 +114,7 @@ class DigestHandler(BaseHandler):
|
||||
document_holders=document_holders,
|
||||
bot_name=bot_name,
|
||||
header='✨ Nexus Discovery ✨',
|
||||
promotioner=self.application.promotioner,
|
||||
)
|
||||
|
||||
view, buttons = await document_list_widget.render()
|
||||
|
@ -39,10 +39,7 @@ class RollHandler(BaseHandler):
|
||||
|
||||
if random_documents:
|
||||
holder = BaseHolder.create_from_document(random_documents[0])
|
||||
promo = self.application.promotioner.choose_promotion(language).format(
|
||||
related_channel=self.application.config['telegram']['related_channel'],
|
||||
twitter_contact_url=self.application.config['twitter']['contact_url'],
|
||||
)
|
||||
promo = self.application.promotioner.choose_promotion(language)
|
||||
view = holder.view_builder(language).add_view(bot_name=bot_name).add_new_line(2).add(promo, escaped=True).build()
|
||||
buttons_builder = holder.buttons_builder(language)
|
||||
|
||||
@ -54,5 +51,5 @@ class RollHandler(BaseHandler):
|
||||
|
||||
request_context.statbox(action='show', duration=time.time() - start_time)
|
||||
await event.respond(view, buttons=buttons_builder.build())
|
||||
async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG):
|
||||
async with safe_execution(is_logging_enabled=False):
|
||||
await event.delete()
|
||||
|
@ -210,6 +210,7 @@ class SearchEditHandler(BaseSearchHandler):
|
||||
|
||||
if request_context.is_group_mode() and not search_prefix:
|
||||
return
|
||||
|
||||
if request_context.is_personal_mode() and search_prefix:
|
||||
query = event.raw_text
|
||||
|
||||
@ -288,7 +289,7 @@ class SearchPagingHandler(BaseCallbackQueryHandler):
|
||||
)
|
||||
|
||||
serp, buttons = await search_widget.render(message_id=message_id)
|
||||
return await asyncio.gather(
|
||||
event.answer(),
|
||||
message.edit(serp, buttons=buttons, link_preview=False)
|
||||
)
|
||||
|
||||
await message.edit(serp, buttons=buttons, link_preview=False)
|
||||
async with safe_execution(is_logging_enabled=False):
|
||||
await event.answer()
|
||||
|
@ -47,13 +47,13 @@ class SubmitHandler(BaseHandler):
|
||||
reply_to = reply_message.id
|
||||
|
||||
doi_hint = self.get_doi_hint(message=message, reply_message=reply_message)
|
||||
doi_hint_priority = '⚡' in message.raw_text
|
||||
skip_analysis = '⚡️' in message.raw_text
|
||||
user_id = message.sender_id
|
||||
request_context.statbox(
|
||||
action='analyzed',
|
||||
mode='submit',
|
||||
doi_hint=doi_hint,
|
||||
doi_hint_priority=doi_hint_priority,
|
||||
skip_analysis=skip_analysis,
|
||||
reply_to=reply_to,
|
||||
)
|
||||
|
||||
@ -71,12 +71,12 @@ class SubmitHandler(BaseHandler):
|
||||
request_id=request_context.request_id,
|
||||
session_id=session_id,
|
||||
doi_hint=doi_hint,
|
||||
doi_hint_priority=doi_hint_priority,
|
||||
skip_analysis=skip_analysis,
|
||||
uploader_id=user_id,
|
||||
)
|
||||
case 'application/zip':
|
||||
try:
|
||||
if request_context.is_personal_mode():
|
||||
if request_context.is_personal_mode():
|
||||
try:
|
||||
file_data = await self.application.telegram_client.download_document(
|
||||
document=event.document,
|
||||
file=bytes,
|
||||
@ -105,10 +105,8 @@ class SubmitHandler(BaseHandler):
|
||||
session_id=session_id,
|
||||
uploader_id=user_id,
|
||||
)
|
||||
else:
|
||||
await event.reply(t('ZIP_FILES_ARE_NOT_SUPPORTED_IN_GROUP_MODE', request_context.chat.language))
|
||||
finally:
|
||||
return await event.delete()
|
||||
finally:
|
||||
return await event.delete()
|
||||
case _:
|
||||
request_context.statbox(action='unknown_file_format')
|
||||
request_context.error_log(UnknownFileFormatError(format=event.document.mime_type))
|
||||
|
@ -85,10 +85,7 @@ class ViewHandler(BaseHandler):
|
||||
holder = BaseHolder.create(typed_document_pb=typed_document_pb)
|
||||
back_command = await self.compose_back_command(session_id=session_id, message_id=message_id, page=page)
|
||||
|
||||
promo = self.application.promotioner.choose_promotion(language).format(
|
||||
related_channel=self.application.config['telegram']['related_channel'],
|
||||
twitter_contact_url=self.application.config['twitter']['contact_url'],
|
||||
)
|
||||
promo = self.application.promotioner.choose_promotion(language)
|
||||
view_builder = holder.view_builder(language).add_view(
|
||||
bot_name=self.application.config['telegram']['bot_name']
|
||||
).add_new_line(2).add(promo, escaped=True)
|
||||
|
@ -154,10 +154,7 @@ class SearchWidget(BaseSearchWidget):
|
||||
)
|
||||
|
||||
promotion_language = self.query_language or self.chat.language
|
||||
promo = self.application.promotioner.choose_promotion(promotion_language).format(
|
||||
related_channel=self.application.config['telegram']['related_channel'],
|
||||
twitter_contact_url=self.application.config['twitter']['contact_url'],
|
||||
)
|
||||
promo = self.application.promotioner.choose_promotion(promotion_language)
|
||||
serp = f'{serp}\n\n{promo}\n'
|
||||
|
||||
buttons = None
|
||||
|
@ -58,7 +58,7 @@ class HubGrpcClient(BaseGrpcClient):
|
||||
bot_name: str,
|
||||
reply_to: Optional[int] = None,
|
||||
doi_hint: Optional[str] = None,
|
||||
doi_hint_priority: bool = False,
|
||||
skip_analysis: bool = False,
|
||||
request_id: Optional[str] = None,
|
||||
session_id: Optional[str] = None,
|
||||
uploader_id: Optional[int] = None
|
||||
@ -68,7 +68,7 @@ class HubGrpcClient(BaseGrpcClient):
|
||||
bot_name=bot_name,
|
||||
reply_to=reply_to,
|
||||
doi_hint=doi_hint,
|
||||
doi_hint_priority=doi_hint_priority,
|
||||
skip_analysis=skip_analysis,
|
||||
uploader_id=uploader_id,
|
||||
)
|
||||
if isinstance(file, submitter_service_pb2.PlainFile):
|
||||
|
@ -4,6 +4,7 @@ pylon:
|
||||
- [cambridge]
|
||||
- [edinburg]
|
||||
- [southampton]
|
||||
default_resolver_proxy_list: ~
|
||||
downloads_directory: /downloads
|
||||
proxies:
|
||||
- address: clash.default.svc.cluster.example.com:7890
|
||||
@ -15,6 +16,9 @@ pylon:
|
||||
- address: clash.default.svc.cluster.example.com:8090
|
||||
name: southampton
|
||||
tags: ['southampton']
|
||||
- address: socks5://clash.default.svc.cluster.example.com:7991
|
||||
name: socks5
|
||||
tags: ['socks5']
|
||||
sources:
|
||||
# LibGen.rocks
|
||||
- driver:
|
||||
@ -331,6 +335,13 @@ pylon:
|
||||
args:
|
||||
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# www.ahajournals.org
|
||||
- matcher:
|
||||
doi: ^10.1161/.*$
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
|
||||
class: nexus.pylon.resolvers.TemplateResolver
|
||||
# ajp.psychiatryonline.org
|
||||
- matcher:
|
||||
doi: ^10.1176/.*$
|
||||
|
@ -23,7 +23,7 @@ message SubmitRequest {
|
||||
string bot_name = 4;
|
||||
optional int64 reply_to = 5;
|
||||
optional string doi_hint = 6;
|
||||
bool doi_hint_priority = 7;
|
||||
bool skip_analysis = 7;
|
||||
int64 uploader_id = 8;
|
||||
}
|
||||
message SubmitResponse { }
|
||||
|
@ -1,5 +1,6 @@
|
||||
import asyncio
|
||||
|
||||
from aiobaseclient.exceptions import BadRequestError
|
||||
from library.aiogrpctools.base import BaseService
|
||||
from library.telegram.common import close_button
|
||||
from nexus.views.telegram.common import vote_button
|
||||
@ -16,8 +17,39 @@ def is_group_or_channel(chat_id: int):
|
||||
return chat_id < 0
|
||||
|
||||
|
||||
class ProcessedDocument:
|
||||
def __init__(self, processed_document):
|
||||
self.processed_document = processed_document
|
||||
|
||||
@staticmethod
|
||||
async def setup(file_data, grobid_client, request_context):
|
||||
try:
|
||||
processed_document = await grobid_client.process_fulltext_document(pdf_file=file_data)
|
||||
except BadRequestError as e:
|
||||
request_context.statbox(action='unparsable_document')
|
||||
request_context.error_log(e)
|
||||
processed_document = {}
|
||||
return ProcessedDocument(processed_document)
|
||||
|
||||
@property
|
||||
def doi(self):
|
||||
return self.processed_document.get('doi')
|
||||
|
||||
@property
|
||||
def title(self):
|
||||
return self.processed_document.get('title')
|
||||
|
||||
@property
|
||||
def abstract(self):
|
||||
return self.processed_document.get('abstract')
|
||||
|
||||
@property
|
||||
def body(self):
|
||||
return self.processed_document.get('body')
|
||||
|
||||
|
||||
class BaseHubService(BaseService):
|
||||
async def item_found(self, bot_name, doi):
|
||||
async def found_item(self, bot_name, doi):
|
||||
if mutual_aid_service := self.application.mutual_aid_services.get(bot_name):
|
||||
await mutual_aid_service.delete_request(doi)
|
||||
await self.application.idm_client.reschedule_subscriptions(
|
||||
@ -36,13 +68,13 @@ class BaseHubService(BaseService):
|
||||
)
|
||||
))
|
||||
|
||||
def set_fields_from_processed(self, document_pb, processed_document):
|
||||
def set_fields_from_processed(self, document_pb, processed_document: ProcessedDocument):
|
||||
new_fields = []
|
||||
if processed_document.get('abstract') and not document_pb.abstract:
|
||||
document_pb.abstract = processed_document['abstract']
|
||||
if processed_document.abstract and not document_pb.abstract:
|
||||
document_pb.abstract = processed_document.abstract
|
||||
new_fields.append('abstract')
|
||||
if processed_document.get('body') and not document_pb.content:
|
||||
document_pb.content = processed_document['body']
|
||||
if processed_document.body and not document_pb.content:
|
||||
document_pb.content = processed_document.body
|
||||
new_fields.append('content')
|
||||
return new_fields
|
||||
|
||||
|
@ -46,6 +46,7 @@ from pypika import (
|
||||
|
||||
from .base import (
|
||||
BaseHubService,
|
||||
ProcessedDocument,
|
||||
is_group_or_channel,
|
||||
)
|
||||
|
||||
@ -76,6 +77,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
|
||||
proxies=pylon_config['proxies'],
|
||||
source_configs=pylon_config['sources'],
|
||||
default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
|
||||
default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
|
||||
downloads_directory=pylon_config['downloads_directory'],
|
||||
)
|
||||
self.should_parse_with_grobid = should_parse_with_grobid
|
||||
@ -216,11 +218,12 @@ class DownloadTask:
|
||||
error_log=request_context.error_log,
|
||||
on_fail=_on_fail,
|
||||
):
|
||||
filename = document_holder.get_filename()
|
||||
progress_bar_download = ProgressBar(
|
||||
telegram_client=self.application.telegram_clients[request_context.bot_name],
|
||||
request_context=request_context,
|
||||
banner=t("LOOKING_AT", request_context.chat.language),
|
||||
header=f'⬇️ {document_holder.get_filename()}',
|
||||
header=f'⬇️ {filename}',
|
||||
tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
|
||||
throttle_secs=throttle_secs,
|
||||
)
|
||||
@ -239,6 +242,10 @@ class DownloadTask:
|
||||
)
|
||||
if not document_holder.md5 and document_holder.get_extension() == 'pdf':
|
||||
try:
|
||||
await progress_bar_download.send_message(
|
||||
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
|
||||
ignore_last_call=True
|
||||
)
|
||||
file = clean_metadata(file, doi=document_holder.doi)
|
||||
request_context.statbox(
|
||||
action='cleaned',
|
||||
@ -251,7 +258,7 @@ class DownloadTask:
|
||||
request_context=request_context,
|
||||
message=progress_bar_download.message,
|
||||
banner=t("LOOKING_AT", request_context.chat.language),
|
||||
header=f'⬇️ {document_holder.get_filename()}',
|
||||
header=f'⬇️ {filename}',
|
||||
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
|
||||
throttle_secs=throttle_secs
|
||||
)
|
||||
@ -264,7 +271,7 @@ class DownloadTask:
|
||||
voting=not is_group_or_channel(self.request_context.chat.chat_id),
|
||||
)
|
||||
if self.document_holder.doi:
|
||||
await self.delivery_service.item_found(
|
||||
await self.delivery_service.found_item(
|
||||
bot_name=request_context.bot_name,
|
||||
doi=self.document_holder.doi,
|
||||
)
|
||||
@ -277,6 +284,7 @@ class DownloadTask:
|
||||
document_holder=document_holder,
|
||||
telegram_file_id=uploaded_message.file.id,
|
||||
file=file,
|
||||
request_context=request_context,
|
||||
))
|
||||
else:
|
||||
request_context.statbox(
|
||||
@ -425,7 +433,7 @@ class DownloadTask:
|
||||
self.task.cancel()
|
||||
await self.task
|
||||
|
||||
async def store_new_data(self, bot_name, document_holder, telegram_file_id, file):
|
||||
async def store_new_data(self, bot_name, document_holder, telegram_file_id, file, request_context):
|
||||
document_pb = document_holder.document_pb
|
||||
new_fields = []
|
||||
if self.delivery_service.should_store_hashes:
|
||||
@ -448,7 +456,11 @@ class DownloadTask:
|
||||
and document_holder.index_alias == 'scimag'
|
||||
):
|
||||
try:
|
||||
processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file)
|
||||
processed_document = await ProcessedDocument.setup(
|
||||
file,
|
||||
grobid_client=self.application.grobid_client,
|
||||
request_context=request_context,
|
||||
)
|
||||
new_fields += self.delivery_service.set_fields_from_processed(document_pb, processed_document)
|
||||
except BadRequestError as e:
|
||||
self.request_context.statbox(action='unparsable_document')
|
||||
|
@ -34,7 +34,10 @@ from nexus.views.telegram.base_holder import ScimagHolder
|
||||
from telethon.errors import ChatAdminRequiredError
|
||||
from telethon.extensions import BinaryReader
|
||||
|
||||
from .base import BaseHubService
|
||||
from .base import (
|
||||
BaseHubService,
|
||||
ProcessedDocument,
|
||||
)
|
||||
|
||||
|
||||
async def operation_log(document_operation_pb):
|
||||
@ -87,22 +90,48 @@ class PlainFile:
|
||||
|
||||
|
||||
def fuzzy_compare(a, b):
|
||||
if a is None or b is None:
|
||||
return False
|
||||
a = re.sub(r'[^a-z\d]', '', a.lower())
|
||||
b = re.sub(r'[^a-z\d]', '', b.lower())
|
||||
return SequenceMatcher(None, a, b).ratio() > 0.9
|
||||
|
||||
|
||||
async def delayed_task(task, seconds):
|
||||
await asyncio.sleep(seconds)
|
||||
await task
|
||||
|
||||
|
||||
class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubService):
|
||||
async def start(self):
|
||||
submitter_service_pb2_grpc.add_SubmitterServicer_to_server(self, self.application.server)
|
||||
|
||||
def wrap_request_file(self, request, request_context):
|
||||
match str(request.WhichOneof('file')):
|
||||
case 'plain':
|
||||
return PlainFile(request.plain)
|
||||
case 'telegram':
|
||||
return TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram)
|
||||
case _:
|
||||
raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}")
|
||||
|
||||
async def retrieve_metadata(self, doi, title, session_id, request_context):
|
||||
if doi:
|
||||
meta_search_response = await self.application.meta_api_client.meta_search(
|
||||
index_aliases=['scimag', ],
|
||||
query=doi,
|
||||
collectors=[{'top_docs': {'limit': 1}}],
|
||||
session_id=session_id,
|
||||
request_id=request_context.request_id,
|
||||
user_id=str(request_context.chat.chat_id),
|
||||
query_tags=['submitter'],
|
||||
)
|
||||
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
|
||||
if len(scored_documents) == 1:
|
||||
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
|
||||
if title is not None and not fuzzy_compare(scimag_pb.title, title):
|
||||
request_context.statbox(
|
||||
action='mismatched_title',
|
||||
doi=doi,
|
||||
processed_title=title,
|
||||
title=scimag_pb.title,
|
||||
)
|
||||
return None
|
||||
return scimag_pb
|
||||
|
||||
@aiogrpc_request_wrapper(log=False)
|
||||
async def submit(
|
||||
self,
|
||||
@ -125,17 +154,10 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
|
||||
)
|
||||
|
||||
buttons = None if request_context.is_group_mode() else [close_button()]
|
||||
wrapped_file = self.wrap_request_file(request, request_context)
|
||||
|
||||
match str(request.WhichOneof('file')):
|
||||
case 'plain':
|
||||
file = PlainFile(request.plain)
|
||||
case 'telegram':
|
||||
file = TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram)
|
||||
case _:
|
||||
raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}")
|
||||
|
||||
if file.size > 30 * 1024 * 1024:
|
||||
request_context.error_log(FileTooBigError(size=file.size))
|
||||
if wrapped_file.size > 300 * 1024 * 1024:
|
||||
request_context.error_log(FileTooBigError(size=wrapped_file.size))
|
||||
request_context.statbox(action='file_too_big')
|
||||
async with safe_execution(error_log=request_context.error_log):
|
||||
await self.application.telegram_clients[request_context.bot_name].send_message(
|
||||
@ -149,109 +171,59 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
|
||||
try:
|
||||
processing_message = await self.application.telegram_clients[request_context.bot_name].send_message(
|
||||
request_context.chat.chat_id,
|
||||
t("PROCESSING_PAPER", request_context.chat.language).format(filename=file.filename),
|
||||
t("PROCESSING_PAPER", request_context.chat.language).format(filename=wrapped_file.filename),
|
||||
reply_to=request.reply_to,
|
||||
)
|
||||
except ChatAdminRequiredError:
|
||||
return submitter_service_pb2.SubmitResponse()
|
||||
|
||||
try:
|
||||
file_data = await file.read()
|
||||
processed_document = None
|
||||
pdf_doi = None
|
||||
pdf_title = None
|
||||
try:
|
||||
processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file_data)
|
||||
pdf_doi = processed_document.get('doi')
|
||||
pdf_title = processed_document.get('title')
|
||||
request_context.add_default_fields(pdf_doi=pdf_doi)
|
||||
except BadRequestError as e:
|
||||
request_context.statbox(action='unparsable_document')
|
||||
request_context.error_log(e)
|
||||
if not request.doi_hint:
|
||||
await self.application.telegram_clients[request_context.bot_name].send_message(
|
||||
request_context.chat.chat_id,
|
||||
t('UNPARSABLE_DOCUMENT_ERROR', request_context.chat.language).format(
|
||||
filename=file.filename,
|
||||
),
|
||||
buttons=buttons,
|
||||
reply_to=request.reply_to,
|
||||
)
|
||||
return submitter_service_pb2.SubmitResponse()
|
||||
file_data = await wrapped_file.read()
|
||||
if not request.skip_analysis:
|
||||
processed_document = await ProcessedDocument.setup(
|
||||
file_data,
|
||||
grobid_client=self.application.grobid_client,
|
||||
request_context=request_context,
|
||||
)
|
||||
else:
|
||||
processed_document = ProcessedDocument({})
|
||||
|
||||
if request.doi_hint and pdf_doi != request.doi_hint:
|
||||
request_context.statbox(action='mismatched_doi', doi_hint_priority=request.doi_hint_priority)
|
||||
if request.doi_hint_priority:
|
||||
pdf_doi = request.doi_hint
|
||||
|
||||
if not pdf_doi and not request.doi_hint:
|
||||
if not processed_document.doi and not request.doi_hint:
|
||||
request_context.statbox(action='unparsable_doi')
|
||||
request_context.error_log(UnparsableDoiError())
|
||||
await self.application.telegram_clients[request_context.bot_name].send_message(
|
||||
request_context.chat.chat_id,
|
||||
t('UNPARSABLE_DOI_ERROR', request_context.chat.language).format(
|
||||
filename=file.filename,
|
||||
filename=wrapped_file.filename,
|
||||
),
|
||||
buttons=buttons,
|
||||
reply_to=request.reply_to,
|
||||
)
|
||||
return submitter_service_pb2.SubmitResponse()
|
||||
|
||||
scimag_pb = None
|
||||
|
||||
if pdf_doi:
|
||||
meta_search_response = await self.application.meta_api_client.meta_search(
|
||||
index_aliases=['scimag',],
|
||||
query=pdf_doi,
|
||||
collectors=[{'top_docs': {'limit': 1}}],
|
||||
session_id=session_id,
|
||||
request_id=request_context.request_id,
|
||||
user_id=str(request_context.chat.chat_id),
|
||||
query_tags=['submitter'],
|
||||
)
|
||||
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
|
||||
if len(scored_documents) == 1:
|
||||
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
|
||||
if not fuzzy_compare(scimag_pb.title, pdf_title):
|
||||
request_context.statbox(
|
||||
action='mismatched_title',
|
||||
doi=pdf_doi,
|
||||
pdf_title=pdf_title,
|
||||
title=scimag_pb.title,
|
||||
)
|
||||
scimag_pb = None
|
||||
|
||||
scimag_pb = await self.retrieve_metadata(
|
||||
processed_document.doi,
|
||||
processed_document.title,
|
||||
session_id=session_id,
|
||||
request_context=request_context,
|
||||
)
|
||||
if not scimag_pb and request.doi_hint:
|
||||
meta_search_response = await self.application.meta_api_client.meta_search(
|
||||
index_aliases=['scimag', ],
|
||||
query=request.doi_hint,
|
||||
collectors=[{'top_docs': {'limit': 1}}],
|
||||
scimag_pb = await self.retrieve_metadata(
|
||||
request.doi_hint,
|
||||
processed_document.title,
|
||||
session_id=session_id,
|
||||
request_id=request_context.request_id,
|
||||
user_id=str(request_context.chat.chat_id),
|
||||
query_tags=['submitter'],
|
||||
request_context=request_context,
|
||||
)
|
||||
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
|
||||
if len(scored_documents) == 1:
|
||||
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
|
||||
if not fuzzy_compare(scimag_pb.title, pdf_title):
|
||||
request_context.statbox(
|
||||
action='mismatched_title',
|
||||
doi=request.doi_hint,
|
||||
pdf_title=pdf_title,
|
||||
title=scimag_pb.title,
|
||||
)
|
||||
# ToDo: add trust mechanics
|
||||
|
||||
if not scimag_pb:
|
||||
request_context.statbox(action='unavailable_metadata')
|
||||
request_context.error_log(UnavailableMetadataError(doi=pdf_doi))
|
||||
request_context.error_log(UnavailableMetadataError(doi=processed_document.doi))
|
||||
await self.application.telegram_clients[request_context.bot_name].send_message(
|
||||
request_context.chat.chat_id,
|
||||
t(
|
||||
'UNAVAILABLE_METADATA_ERROR',
|
||||
language=request_context.chat.language
|
||||
).format(doi=pdf_doi or request.doi_hint),
|
||||
).format(doi=processed_document.doi or request.doi_hint),
|
||||
buttons=buttons,
|
||||
reply_to=request.reply_to,
|
||||
)
|
||||
@ -260,10 +232,7 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
|
||||
request_context.add_default_fields(doi=scimag_pb.doi, document_id=scimag_pb.id)
|
||||
try:
|
||||
file_data = clean_metadata(file_data, doi=scimag_pb.doi)
|
||||
request_context.statbox(
|
||||
action='cleaned',
|
||||
len=len(file_data),
|
||||
)
|
||||
request_context.statbox(action='cleaned', len=len(file_data))
|
||||
except ValueError as e:
|
||||
request_context.error_log(e)
|
||||
uploaded_message = await self.send_file(
|
||||
@ -276,13 +245,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
|
||||
|
||||
if processed_document:
|
||||
sharience_pb = sharience_pb2.Sharience(
|
||||
abstract=processed_document.get('abstract', ''),
|
||||
content=processed_document.get('body', ''),
|
||||
abstract=processed_document.abstract or '',
|
||||
content=processed_document.body or '',
|
||||
parent_id=scimag_pb.id,
|
||||
uploader_id=request.uploader_id or request_context.chat.chat_id,
|
||||
updated_at=int(time.time()),
|
||||
md5=hashlib.md5(file_data).hexdigest(),
|
||||
filesize=file.size,
|
||||
filesize=wrapped_file.size,
|
||||
ipfs_multihashes=await self.get_ipfs_hashes(file=file_data),
|
||||
)
|
||||
update_sharience_pb = operation_pb2.DocumentOperation(
|
||||
@ -313,13 +282,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
|
||||
await operation_log(store_telegram_file_id_operation_pb)
|
||||
request_context.statbox(action='successfully_stored')
|
||||
|
||||
if file.message_id:
|
||||
if wrapped_file.message_id:
|
||||
async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG):
|
||||
await self.application.telegram_clients[request_context.bot_name].delete_messages(
|
||||
request_context.chat.chat_id,
|
||||
file.message_id,
|
||||
wrapped_file.message_id,
|
||||
)
|
||||
await self.item_found(bot_name=request_context.bot_name, doi=scimag_pb.doi)
|
||||
await self.found_item(bot_name=request_context.bot_name, doi=scimag_pb.doi)
|
||||
finally:
|
||||
await processing_message.delete()
|
||||
return submitter_service_pb2.SubmitResponse()
|
||||
|
@ -45,7 +45,7 @@ from nexus.meta_api.word_transformers import (
|
||||
)
|
||||
|
||||
|
||||
def create_query_transformer(valid_fields, invalid_fields):
|
||||
def create_query_transformer(valid_fields, invalid_fields, order_by_valid_fields):
|
||||
return QueryProcessor(
|
||||
tree_transformers=[
|
||||
OrderByTreeTransformer(
|
||||
@ -55,14 +55,7 @@ def create_query_transformer(valid_fields, invalid_fields):
|
||||
'pr': 'page_rank',
|
||||
'refc': 'referenced_by_count',
|
||||
},
|
||||
valid_fields=frozenset([
|
||||
'id',
|
||||
'referenced_by_count',
|
||||
'issued_at',
|
||||
'page_rank',
|
||||
'pages',
|
||||
'updated_at'
|
||||
])
|
||||
valid_fields=frozenset(order_by_valid_fields)
|
||||
),
|
||||
FieldTreeTransformer(
|
||||
field_aliases={
|
||||
@ -135,20 +128,35 @@ class GrpcServer(AioGrpcServer):
|
||||
'doi', 'ipfs_multihashes', 'issns', 'isbns', 'issued_at', 'language', 'original_id',
|
||||
'page_rank', 'referenced_by_count', 'references', 'tags', 'title', 'year',
|
||||
}
|
||||
order_by_scimag_fields = {
|
||||
'id',
|
||||
'referenced_by_count',
|
||||
'issued_at',
|
||||
'page_rank',
|
||||
'updated_at'
|
||||
}
|
||||
scitech_fields = {
|
||||
'id', 'authors', 'doi', 'description', 'extension',
|
||||
'ipfs_multihashes', 'isbns', 'issued_at', 'language', 'original_id', 'pages',
|
||||
'tags', 'title', 'updated_at', 'year',
|
||||
}
|
||||
order_by_scitech_fields = {
|
||||
'id',
|
||||
'issued_at',
|
||||
'pages',
|
||||
'updated_at'
|
||||
}
|
||||
|
||||
self.query_transformers = {
|
||||
'scimag': create_query_transformer(
|
||||
valid_fields=scimag_fields,
|
||||
invalid_fields=scitech_fields.difference(scimag_fields),
|
||||
order_by_valid_fields=order_by_scimag_fields,
|
||||
),
|
||||
'scitech': create_query_transformer(
|
||||
valid_fields=scitech_fields,
|
||||
invalid_fields=scimag_fields.difference(scitech_fields),
|
||||
order_by_valid_fields=order_by_scitech_fields,
|
||||
)
|
||||
}
|
||||
self.summa_client = SummaClient(
|
||||
|
@ -31,7 +31,6 @@ class BaseService(LibraryBaseService):
|
||||
new_scored_documents = []
|
||||
for scored_document in scored_documents:
|
||||
document = json.loads(scored_document.document)
|
||||
document = self.enrich_document_with_stat_provider(document)
|
||||
new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument(
|
||||
typed_document=typed_document_pb2.TypedDocument(
|
||||
**{scored_document.index_alias: self.pb_registry[scored_document.index_alias](**document)},
|
||||
@ -46,7 +45,6 @@ class BaseService(LibraryBaseService):
|
||||
new_scored_documents = []
|
||||
for position, document in enumerate(documents):
|
||||
document = json.loads(document)
|
||||
document = self.enrich_document_with_stat_provider(document)
|
||||
new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument(
|
||||
typed_document=TypedDocumentPb(
|
||||
**{index_alias: self.pb_registry[index_alias](**document)},
|
||||
@ -55,14 +53,3 @@ class BaseService(LibraryBaseService):
|
||||
score=1.0,
|
||||
))
|
||||
return new_scored_documents
|
||||
|
||||
def enrich_document_with_stat_provider(self, document):
|
||||
if self.stat_provider:
|
||||
original_id = (
|
||||
document.get('original_id')
|
||||
or document['id']
|
||||
)
|
||||
download_stats = self.stat_provider.get_download_stats(original_id)
|
||||
if download_stats and download_stats.downloads_count:
|
||||
document['downloads_count'] = download_stats.downloads_count
|
||||
return document
|
||||
|
@ -57,19 +57,6 @@ class DocumentsService(DocumentsServicer, BaseService):
|
||||
|
||||
document_pb = getattr(typed_document_pb, typed_document_pb.WhichOneof('document'))
|
||||
|
||||
if hasattr(document_pb, 'original_id') and document_pb.original_id:
|
||||
original_document_pb = await self.get_document(
|
||||
index_alias=request.index_alias,
|
||||
document_id=document_pb.original_id,
|
||||
context=context,
|
||||
request_id=metadata['request-id'],
|
||||
session_id=metadata['session-id'],
|
||||
)
|
||||
for to_remove in ('doi', 'fiction_id', 'filesize', 'libgen_id',):
|
||||
original_document_pb.ClearField(to_remove)
|
||||
original_document_pb.MergeFrom(document_pb)
|
||||
document_pb = original_document_pb
|
||||
|
||||
logging.getLogger('query').info({
|
||||
'action': 'get',
|
||||
'cache_hit': False,
|
||||
|
@ -270,6 +270,10 @@ class SearchService(SearchServicer, BaseService):
|
||||
@aiogrpc_request_wrapper(log=False)
|
||||
async def search(self, request, context, metadata):
|
||||
preprocessed_query = await self.process_query(query=request.query, languages=request.language, context=context)
|
||||
logging.getLogger('debug').info({
|
||||
'action': 'preprocess_query',
|
||||
'preprocessed_query': str(preprocessed_query),
|
||||
})
|
||||
index_aliases = self.resolve_index_aliases(
|
||||
request_index_aliases=request.index_aliases,
|
||||
processed_query=preprocessed_query,
|
||||
@ -280,19 +284,27 @@ class SearchService(SearchServicer, BaseService):
|
||||
right_offset = left_offset + page_size
|
||||
|
||||
search_requests = []
|
||||
processed_queries = {}
|
||||
for index_alias in index_aliases:
|
||||
processed_query = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query)
|
||||
processed_queries[index_alias] = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query)
|
||||
logging.getLogger('debug').info({
|
||||
'action': 'process_query',
|
||||
'index_alias': index_alias,
|
||||
'processed_query': str(processed_queries[index_alias]),
|
||||
'order_by': processed_queries[index_alias].context.order_by,
|
||||
'has_invalid_fields': processed_queries[index_alias].context.has_invalid_fields,
|
||||
})
|
||||
search_requests.append(
|
||||
SearchRequest(
|
||||
index_alias=index_alias,
|
||||
query=processed_query.to_summa_query(),
|
||||
query=processed_queries[index_alias].to_summa_query(),
|
||||
collectors=[
|
||||
search_service_pb2.Collector(
|
||||
top_docs=search_service_pb2.TopDocsCollector(
|
||||
limit=50,
|
||||
scorer=self.scorer(processed_query, index_alias),
|
||||
scorer=self.scorer(processed_queries[index_alias], index_alias),
|
||||
snippets=self.snippets[index_alias],
|
||||
explain=processed_query.context.explain,
|
||||
explain=processed_queries[index_alias].context.explain,
|
||||
)
|
||||
),
|
||||
search_service_pb2.Collector(count=search_service_pb2.CountCollector())
|
||||
@ -322,9 +334,9 @@ class SearchService(SearchServicer, BaseService):
|
||||
meta_search_response.collector_outputs[0].top_docs.scored_documents,
|
||||
)
|
||||
has_next = len(new_scored_documents) > right_offset
|
||||
if 'scimag' in index_aliases:
|
||||
if 'scimag' in processed_queries:
|
||||
await self.check_if_need_new_documents_by_dois(
|
||||
requested_dois=processed_query.context.dois,
|
||||
requested_dois=processed_queries['scimag'].context.dois,
|
||||
scored_documents=new_scored_documents,
|
||||
should_request=attempt.retry_state.attempt_number == 1
|
||||
)
|
||||
@ -333,7 +345,7 @@ class SearchService(SearchServicer, BaseService):
|
||||
scored_documents=new_scored_documents[left_offset:right_offset],
|
||||
has_next=has_next,
|
||||
count=meta_search_response.collector_outputs[1].count.count,
|
||||
query_language=processed_query.context.query_language,
|
||||
query_language=preprocessed_query.context.query_language,
|
||||
)
|
||||
|
||||
return search_response_pb
|
||||
|
@ -6,7 +6,7 @@ pipe:
|
||||
kafka-0.example.net
|
||||
schema:
|
||||
- consumers:
|
||||
- class: nexus.pipe.consumers.CrossReferencesBulkConsumer
|
||||
- class: nexus.pipe.consumers.CrossReferencesConsumer
|
||||
topics:
|
||||
- name: cross_references
|
||||
workers: 4
|
||||
|
@ -1,17 +1,11 @@
|
||||
from .cross_references_consumer import (
|
||||
CrossReferencesBulkConsumer,
|
||||
CrossReferencesConsumer,
|
||||
)
|
||||
from .cross_references_consumer import CrossReferencesConsumer
|
||||
from .document_operations_consumer import (
|
||||
DocumentOperationsBulkConsumer,
|
||||
DocumentOperationsConsumer,
|
||||
DocumentOperationsJsonConsumer,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'CrossReferencesBulkConsumer',
|
||||
'CrossReferencesConsumer',
|
||||
'DocumentOperationsConsumer',
|
||||
'DocumentOperationsBulkConsumer',
|
||||
'DocumentOperationsJsonConsumer',
|
||||
]
|
||||
|
@ -66,9 +66,10 @@ class BaseConsumer(AioRootThing):
|
||||
except (ConflictError, InterruptProcessing) as e:
|
||||
logging.getLogger('statbox').info(e)
|
||||
except (asyncio.CancelledError, ConsumerStoppedError):
|
||||
pass
|
||||
return
|
||||
finally:
|
||||
await consumer.stop()
|
||||
asyncio.create_task(self.stop())
|
||||
|
||||
async def start(self):
|
||||
logging.getLogger('statbox').info({
|
||||
@ -87,8 +88,10 @@ class BaseConsumer(AioRootThing):
|
||||
|
||||
async def stop(self):
|
||||
if self.consumer_task:
|
||||
self.consumer_task.cancel()
|
||||
await self.consumer_task
|
||||
consumer_task = self.consumer_task
|
||||
self.consumer_task = None
|
||||
consumer_task.cancel()
|
||||
await consumer_task
|
||||
|
||||
|
||||
class BasePbConsumer(BaseConsumer):
|
||||
@ -108,52 +111,3 @@ class BaseJsonConsumer(BaseConsumer):
|
||||
message = json.loads(msg.value)
|
||||
ParseDict(message, pb, ignore_unknown_fields=True)
|
||||
return pb
|
||||
|
||||
|
||||
class BaseBulkConsumer(BaseConsumer):
|
||||
auto_commit = False
|
||||
bulk_size = 20
|
||||
timeout = 1
|
||||
|
||||
async def consume(self, consumer):
|
||||
try:
|
||||
while self.started:
|
||||
try:
|
||||
result = await consumer.getmany(timeout_ms=self.timeout * 1000, max_records=self.bulk_size)
|
||||
except (ConsumerStoppedError, asyncio.CancelledError):
|
||||
break
|
||||
collector = []
|
||||
for tp, messages in result.items():
|
||||
if messages:
|
||||
for message in messages:
|
||||
preprocessed_msg = self.preprocess(message)
|
||||
if preprocessed_msg:
|
||||
collector.append(preprocessed_msg)
|
||||
for processor in self.processors:
|
||||
filtered = filter(processor.filter, collector)
|
||||
try:
|
||||
await processor.process_bulk(filtered)
|
||||
except InterruptProcessing as e:
|
||||
logging.getLogger('statbox').info(e)
|
||||
try:
|
||||
await consumer.commit()
|
||||
except CommitFailedError as e:
|
||||
error_log(e)
|
||||
continue
|
||||
finally:
|
||||
await consumer.stop()
|
||||
|
||||
async def start(self):
|
||||
logging.getLogger('statbox').info({
|
||||
'action': 'start',
|
||||
'group_id': self.group_id,
|
||||
'topic_names': self.topic_names,
|
||||
})
|
||||
consumer = self.create_consumer()
|
||||
await consumer.start()
|
||||
logging.getLogger('statbox').info({
|
||||
'action': 'started',
|
||||
'group_id': self.group_id,
|
||||
'topic_names': self.topic_names,
|
||||
})
|
||||
self.consumer_task = asyncio.create_task(self.consume(consumer))
|
||||
|
@ -1,15 +1,8 @@
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
CrossReferenceOperation as CrossReferenceOperationPb
|
||||
|
||||
from .base import (
|
||||
BaseBulkConsumer,
|
||||
BasePbConsumer,
|
||||
)
|
||||
from .base import BasePbConsumer
|
||||
|
||||
|
||||
class CrossReferencesConsumer(BasePbConsumer):
|
||||
pb_class = CrossReferenceOperationPb
|
||||
|
||||
|
||||
class CrossReferencesBulkConsumer(BaseBulkConsumer, CrossReferencesConsumer):
|
||||
pass
|
||||
|
@ -2,7 +2,6 @@ from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
|
||||
from .base import (
|
||||
BaseBulkConsumer,
|
||||
BaseJsonConsumer,
|
||||
BasePbConsumer,
|
||||
)
|
||||
@ -14,7 +13,3 @@ class DocumentOperationsConsumer(BasePbConsumer):
|
||||
|
||||
class DocumentOperationsJsonConsumer(BaseJsonConsumer):
|
||||
pb_class = DocumentOperationPb
|
||||
|
||||
|
||||
class DocumentOperationsBulkConsumer(BaseBulkConsumer, DocumentOperationsConsumer):
|
||||
pass
|
||||
|
@ -22,12 +22,14 @@ class PylonClient(AioThing):
|
||||
source_configs: Optional[List],
|
||||
proxies: Optional[List[str]] = None,
|
||||
downloads_directory: Optional[str] = None,
|
||||
default_driver_proxy_list: [Optional[List]] = None
|
||||
default_driver_proxy_list: [Optional[List]] = None,
|
||||
default_resolver_proxy_list: [Optional[List]] = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.proxy_manager = ProxyManager(proxies)
|
||||
self.downloads_directory = downloads_directory
|
||||
self.default_driver_proxy_list = default_driver_proxy_list
|
||||
self.default_resolver_proxy_list = default_resolver_proxy_list
|
||||
self.sources = []
|
||||
for source_config in source_configs:
|
||||
source = Source.from_config(
|
||||
@ -35,6 +37,7 @@ class PylonClient(AioThing):
|
||||
source_config=source_config,
|
||||
downloads_directory=downloads_directory,
|
||||
default_driver_proxy_list=default_driver_proxy_list,
|
||||
default_resolver_proxy_list=default_resolver_proxy_list,
|
||||
)
|
||||
self.sources.append(source)
|
||||
self.starts.append(source)
|
||||
|
@ -138,6 +138,7 @@ pylon:
|
||||
resolver:
|
||||
args:
|
||||
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
|
||||
resolve_timeout: 25.0
|
||||
selector: '(.resource.primary.URL | split("/"))[-1]'
|
||||
timeout: 40.0
|
||||
class: nexus.pylon.resolvers.DoiOrgRequestResolver
|
||||
|
@ -26,7 +26,7 @@ class BaseDriver(NetworkAgent):
|
||||
validator_cls = validator['class']
|
||||
validator_cls = import_object(validator_cls)
|
||||
|
||||
self.validator = validator_cls or BaseValidator
|
||||
self.validator = validator_cls
|
||||
self.downloads_directory = downloads_directory
|
||||
|
||||
def __str__(self):
|
||||
|
@ -3,7 +3,6 @@ import json
|
||||
import logging
|
||||
import os.path
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
|
@ -1,4 +1,5 @@
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
class Matcher:
|
||||
|
@ -11,6 +11,11 @@ import jq
|
||||
from nexus.pylon.prepared_request import PreparedRequest
|
||||
from nexus.pylon.proxy_manager import ProxyManager
|
||||
from nexus.pylon.resolvers.base import BaseResolver
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_random,
|
||||
)
|
||||
|
||||
|
||||
class DoiOrgRequestResolver(BaseResolver):
|
||||
@ -24,6 +29,7 @@ class DoiOrgRequestResolver(BaseResolver):
|
||||
proxy_manager: Optional[ProxyManager] = None,
|
||||
):
|
||||
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
|
||||
self.text_selector = selector
|
||||
self.selector = jq.compile(selector)
|
||||
self.format_string = format_string
|
||||
self.timeout = timeout
|
||||
@ -32,6 +38,11 @@ class DoiOrgRequestResolver(BaseResolver):
|
||||
def __str__(self):
|
||||
return f'{self.__class__.__name__}(selector = {self.selector}, format_string = {self.format_string})'
|
||||
|
||||
@retry(
|
||||
reraise=True,
|
||||
wait=wait_random(min=1, max=3),
|
||||
stop=stop_after_attempt(4),
|
||||
)
|
||||
async def resolve_through_doi_org(self, params):
|
||||
async with self.get_session() as session:
|
||||
doi_url = f'https://doi.org/{params["doi"]}'
|
||||
@ -51,6 +62,7 @@ class DoiOrgRequestResolver(BaseResolver):
|
||||
logging.getLogger('error').error({
|
||||
'action': 'error',
|
||||
'mode': 'pylon',
|
||||
'params': params,
|
||||
'error': str(e)
|
||||
})
|
||||
return
|
||||
@ -60,3 +72,11 @@ class DoiOrgRequestResolver(BaseResolver):
|
||||
url=self.format_string.format(selected=selected),
|
||||
timeout=self.timeout,
|
||||
)
|
||||
else:
|
||||
logging.getLogger('debug').error({
|
||||
'action': 'missed_selector',
|
||||
'mode': 'pylon',
|
||||
'params': params,
|
||||
'selector': self.text_selector,
|
||||
'format_string': self.format_string,
|
||||
})
|
||||
|
@ -33,13 +33,17 @@ class Source(AioThing):
|
||||
source_config,
|
||||
downloads_directory: str,
|
||||
default_driver_proxy_list: List,
|
||||
default_resolver_proxy_list: List,
|
||||
) -> 'Source':
|
||||
matcher = Matcher(source_config['matcher'])
|
||||
|
||||
resolver_cls = import_object(
|
||||
source_config.get('resolver', {}).get('class', 'nexus.pylon.resolvers.TemplateResolver')
|
||||
)
|
||||
resolver_args = dict(proxy_manager=proxy_manager)
|
||||
resolver_args = dict(
|
||||
proxy_manager=proxy_manager,
|
||||
proxy_list=default_resolver_proxy_list,
|
||||
)
|
||||
resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
|
||||
resolver = resolver_cls(**resolver_args)
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
en:
|
||||
ABOUT_US: |
|
||||
**About us**
|
||||
Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Uwe of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization.
|
||||
Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Use of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization.
|
||||
|
||||
Why printing provoked such dramatic changes in society?
|
||||
Sane beings and knowledge live in a natural symbiosis: knowledge are fruiting and growing while circulated among people and people do the same. We may even note that complexity of human civilization, its achievements and technological progress during all its history have been correlated with the amount of accumulated knowledge and possibility to share and navigate through it.
|
||||
@ -14,7 +14,7 @@ en:
|
||||
However, there is one remaining issue: knowledge is usurped by large publishers, corporations such as Elsevier, Springer, Wiley, T&F etc. Held by copyright, the knowledge remains out of reach for many researchers and for many enterprises that could launch the new Age of Enlightenment.
|
||||
|
||||
Knowledge is not just usurped, an entire system has been built which is putting in chains researchers, academia and readers:
|
||||
- Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact. Everybody knows who owns journals of high "impact"
|
||||
- Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact
|
||||
- Publishers make authors to pay for publishing tax-funded researches, then readers to pay for access to published tax-funded researches, while peer-reviewing is done for free by other researchers and this activity is considered as "honorable" in academia
|
||||
- Publishers such as Elsevier repeatedly carried and carry campaigns against open access
|
||||
|
||||
|
@ -17,6 +17,7 @@ class DocumentListWidget:
|
||||
document_holders: List[BaseHolder],
|
||||
bot_name,
|
||||
header: Optional[str] = None,
|
||||
promotioner=None,
|
||||
has_next: bool = False,
|
||||
session_id: Optional[str] = None,
|
||||
message_id: Optional[int] = None,
|
||||
@ -29,6 +30,7 @@ class DocumentListWidget:
|
||||
self.document_holders = document_holders
|
||||
self.bot_name = bot_name
|
||||
self.header = header
|
||||
self.promotioner = promotioner
|
||||
self.cmd = cmd
|
||||
self.has_next = has_next
|
||||
self.session_id = session_id
|
||||
@ -57,6 +59,10 @@ class DocumentListWidget:
|
||||
if self.header:
|
||||
serp = f'**{self.header}**\n\n{serp}'
|
||||
|
||||
promotion_language = self.chat.language
|
||||
promo = self.promotioner.choose_promotion(promotion_language)
|
||||
serp = f'{serp}\n\n{promo}\n'
|
||||
|
||||
buttons = []
|
||||
if self.cmd and self.message_id and self.session_id and (self.has_next or self.page > 0):
|
||||
buttons = [
|
||||
|
@ -100,9 +100,9 @@ class ProgressBar:
|
||||
async def show_banner(self):
|
||||
return await self.send_message(await self.render_banner(), ignore_last_call=True)
|
||||
|
||||
async def callback(self, done, total):
|
||||
async def callback(self, done, total, ignore_last_call=False):
|
||||
self._set_progress(done, total)
|
||||
return await self.send_message(await self.render_progress())
|
||||
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
|
||||
|
||||
|
||||
class ThrottlerWrapper:
|
||||
|
@ -139,7 +139,7 @@ spacy-loggers==1.0.3
|
||||
SQLAlchemy==1.4.39
|
||||
sqlparse==0.4.2
|
||||
srsly==2.4.4
|
||||
Telethon==1.24.0
|
||||
Telethon==1.25.0
|
||||
tenacity==8.0.1
|
||||
termcolor==1.1.0
|
||||
textblob==0.17.1
|
||||
|
Loading…
Reference in New Issue
Block a user