- [nexus] Development

GitOrigin-RevId: ccfd55db266862ed70f1299aaf62500765b03cc4
This commit is contained in:
the-superpirate 2022-09-06 19:33:57 +03:00
parent 51ae1fc5d9
commit 7c1bb06b1b
42 changed files with 300 additions and 322 deletions

View File

@ -39,6 +39,7 @@ class BaseTelegramClient(AioThing):
bot_token: Optional[str] = None,
mtproxy: Optional[dict] = None,
flood_sleep_threshold: int = 60,
catch_up: bool = False,
):
super().__init__()
if not app_id or not app_hash:
@ -50,6 +51,7 @@ class BaseTelegramClient(AioThing):
self._get_session(database),
app_id,
app_hash,
catch_up=catch_up,
flood_sleep_threshold=flood_sleep_threshold,
**self._get_proxy(mtproxy=mtproxy),
)

View File

@ -5,9 +5,17 @@ class Promotioner:
"""
Promotioner is used to select promotion randomly based on weights of every promotion.
"""
def __init__(self, promotions: list[dict], default_promotion_index: int = 0):
def __init__(
self,
promotions: list[dict],
default_promotion_index: int = 0,
promotion_vars: dict = None,
):
self.promotions = promotions
self.default_promotion_index = default_promotion_index
if not promotion_vars:
promotion_vars = {}
self.promotion_vars = promotion_vars
self.partial_sums: list = [self.promotions[0]['weight']]
for promotion in self.promotions[1:]:
self.partial_sums.append(promotion['weight'] + self.partial_sums[-1])
@ -18,11 +26,11 @@ class Promotioner:
if partial_sum <= pivot:
continue
if language in promotion['texts']:
return promotion['texts'][language]
return promotion['texts'][language].format(**self.promotion_vars)
elif promotion.get('local', False):
default_promotion = self.promotions[self.default_promotion_index]
if language in default_promotion['texts']:
return default_promotion['texts'][language]
return default_promotion['texts']['en']
return default_promotion['texts'][language].format(**self.promotion_vars)
return default_promotion['texts']['en'].format(**self.promotion_vars)
else:
return promotion['texts']['en']
return promotion['texts']['en'].format(**self.promotion_vars)

View File

@ -19,6 +19,7 @@ async def safe_execution(
error_log=error_log,
on_fail: Optional[Callable[[], Awaitable]] = None,
level=logging.WARNING,
is_logging_enabled: bool = True
):
try:
try:
@ -34,8 +35,11 @@ async def safe_execution(
errors.MessageIdInvalidError,
errors.ChatAdminRequiredError,
) as e:
if is_logging_enabled:
error_log(e, level=level)
traceback.print_exc()
except Exception as e:
if is_logging_enabled:
error_log(e, level=level)
traceback.print_exc()
if on_fail:
@ -43,4 +47,5 @@ async def safe_execution(
except events.StopPropagation:
raise
except Exception as e:
if is_logging_enabled:
error_log(e, level=level)

View File

@ -76,6 +76,11 @@ def clean_issns(issns):
return cleaned_issns
def clean_isbns(isbns):
if isbns:
return isbns
def extract_title(title, subtitle):
return ': '.join(filter(lambda x: bool(x), [title.strip(), subtitle.strip()]))
@ -90,6 +95,7 @@ class ToScimagPbAction(BaseAction):
doi=item['DOI'],
issue=item.get('issue'),
issns=clean_issns(item.get('ISSN')),
isbns=clean_isbns(item.get('ISBN')),
language=item.get('language'),
referenced_by_count=item.get('is-referenced-by-count'),
references=extract_references(item.get('reference')),

View File

@ -42,6 +42,8 @@ py3_image(
"//nexus/hub/aioclient",
"//nexus/meta_api/aioclient",
"//nexus/models/proto:proto_py",
"//nexus/promotions",
"//nexus/translations",
"//nexus/views/telegram",
requirement("izihawa_nlptools"),
requirement("izihawa_utils"),

View File

@ -2,10 +2,11 @@ from aiokit import AioRootThing
from idm.api.aioclient import IdmApiGrpcClient
from izihawa_utils.importlib import import_object
from library.telegram.base import BaseTelegramClient
from nexus.bot.promotioner import Promotioner
from library.telegram.promotioner import Promotioner
from nexus.bot.user_manager import UserManager
from nexus.hub.aioclient import HubGrpcClient
from nexus.meta_api.aioclient import MetaApiGrpcClient
from nexus.promotions import get_promotions
class TelegramApplication(AioRootThing):
@ -18,6 +19,7 @@ class TelegramApplication(AioRootThing):
bot_token=self.config['telegram']['bot_token'],
database=self.config['telegram'].get('database'),
mtproxy=self.config['telegram'].get('mtproxy'),
catch_up=self.config['telegram'].get('catch_up', False)
)
self.hub_client = HubGrpcClient(endpoint=self.config['hub']['endpoint'])
self.starts.append(self.hub_client)
@ -28,7 +30,13 @@ class TelegramApplication(AioRootThing):
self.meta_api_client = MetaApiGrpcClient(endpoint=self.config['meta_api']['endpoint'])
self.starts.append(self.meta_api_client)
self.promotioner = Promotioner(promotions=self.config['promotions'])
self.promotioner = Promotioner(
promotions=get_promotions(),
promotion_vars=dict(
related_channel=self.config['telegram']['related_channel'],
twitter_contact_url=self.config['twitter']['contact_url'],
)
)
self.user_manager = UserManager()
self._handlers = []

View File

@ -7,7 +7,6 @@ def get_config():
'nexus/bot/configs/base.yaml',
'nexus/bot/configs/%s.yaml?' % env.type,
'nexus/bot/configs/logging.yaml',
'nexus/bot/configs/promotions.yaml',
], env_prefix='NEXUS_BOT')

View File

@ -1,36 +0,0 @@
---
promotions:
- texts:
en: 💬 The victory of humanity is inevitable
weight: 1
- texts:
en: 💬 Shall build Standard Template Construct
weight: 1
- texts:
en: 💬 Gaining knowledge is the only purpose of life
weight: 1
- texts:
en: 💬 Knowledge cannot belong
weight: 1
- texts:
en: 💬 Obey the path of discovery
weight: 1
- texts:
en: 💬 Research is the only and ultimate goal
weight: 1
- texts:
en: ✋ Have a subscription to paid articles? [Help researchers!](https://t.me/nexus_aaron)
ru: ✋ Есть доступ к платным статьям? [Помоги ученым!](https://t.me/nexus_aaron)
weight: 25
- texts:
en: ✋ Help us, become a seeder of books. Learn how in /seed
ru: ✋ Сохрани наследие, раздавай книги нуждающимся. Узнай как в /seed
weight: 25
- texts:
en: ⤴️ Stay tuned with us at @{related_channel} and [Twitter]({twitter_contact_url})
es: ⤴️ Mantente en contacto con nosotros en @{related_channel} y [Twitter]({twitter_contact_url})
it: ⤴️ Resta aggiornato con noi su @{related_channel} e [Twitter]({twitter_contact_url})
pb: ⤴️ Fique ligado conosco em @{related_channel} e [Twitter]({twitter_contact_url})
ru: ⤴️ Оставайся на связи с нами на @{related_channel} и в [Twitter]({twitter_contact_url})
weight: 25

View File

@ -2,6 +2,7 @@ import asyncio
import time
from library.telegram.base import RequestContext
from library.telegram.utils import safe_execution
from nexus.translations import t
from telethon import events
@ -36,5 +37,6 @@ class CloseHandler(BaseCallbackQueryHandler):
target_events.append(reply_message.delete())
target_events.append(message.delete())
else:
target_events.append(event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE')))
async with safe_execution(is_logging_enabled=False):
await event.answer(t('DELETION_FORBIDDEN_DUE_TO_AGE'))
await asyncio.gather(*target_events)

View File

@ -1,4 +1,5 @@
from library.telegram.base import RequestContext
from library.telegram.utils import safe_execution
from nexus.hub.proto.delivery_service_pb2 import \
StartDeliveryResponse as StartDeliveryResponsePb
from nexus.translations import t
@ -45,11 +46,13 @@ class DownloadHandler(BaseCallbackQueryHandler):
bot_name=request_context.bot_name,
)
if start_delivery_response_pb.status == StartDeliveryResponsePb.Status.ALREADY_DOWNLOADING:
async with safe_execution(is_logging_enabled=False):
await event.answer(
f'{t("ALREADY_DOWNLOADING", request_context.chat.language)}',
)
await remove_button(event, '⬇️', and_empty_too=True)
elif start_delivery_response_pb.status == StartDeliveryResponsePb.Status.TOO_MANY_DOWNLOADS:
async with safe_execution(is_logging_enabled=False):
await event.answer(
f'{t("TOO_MANY_DOWNLOADS", request_context.chat.language)}',
)

View File

@ -61,7 +61,7 @@ class ProfileHandler(BaseHandler):
target_events.append(profile_reply_message.reply(rendered_widget, buttons=buttons, link_preview=False))
else:
target_events.append(event.reply(rendered_widget, buttons=buttons, link_preview=False))
return asyncio.gather(*target_events)
return await asyncio.gather(*target_events)
class DigestHandler(BaseHandler):
@ -114,6 +114,7 @@ class DigestHandler(BaseHandler):
document_holders=document_holders,
bot_name=bot_name,
header='✨ Nexus Discovery ✨',
promotioner=self.application.promotioner,
)
view, buttons = await document_list_widget.render()

View File

@ -39,10 +39,7 @@ class RollHandler(BaseHandler):
if random_documents:
holder = BaseHolder.create_from_document(random_documents[0])
promo = self.application.promotioner.choose_promotion(language).format(
related_channel=self.application.config['telegram']['related_channel'],
twitter_contact_url=self.application.config['twitter']['contact_url'],
)
promo = self.application.promotioner.choose_promotion(language)
view = holder.view_builder(language).add_view(bot_name=bot_name).add_new_line(2).add(promo, escaped=True).build()
buttons_builder = holder.buttons_builder(language)
@ -54,5 +51,5 @@ class RollHandler(BaseHandler):
request_context.statbox(action='show', duration=time.time() - start_time)
await event.respond(view, buttons=buttons_builder.build())
async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG):
async with safe_execution(is_logging_enabled=False):
await event.delete()

View File

@ -210,6 +210,7 @@ class SearchEditHandler(BaseSearchHandler):
if request_context.is_group_mode() and not search_prefix:
return
if request_context.is_personal_mode() and search_prefix:
query = event.raw_text
@ -288,7 +289,7 @@ class SearchPagingHandler(BaseCallbackQueryHandler):
)
serp, buttons = await search_widget.render(message_id=message_id)
return await asyncio.gather(
event.answer(),
message.edit(serp, buttons=buttons, link_preview=False)
)
await message.edit(serp, buttons=buttons, link_preview=False)
async with safe_execution(is_logging_enabled=False):
await event.answer()

View File

@ -47,13 +47,13 @@ class SubmitHandler(BaseHandler):
reply_to = reply_message.id
doi_hint = self.get_doi_hint(message=message, reply_message=reply_message)
doi_hint_priority = '' in message.raw_text
skip_analysis = '⚡️' in message.raw_text
user_id = message.sender_id
request_context.statbox(
action='analyzed',
mode='submit',
doi_hint=doi_hint,
doi_hint_priority=doi_hint_priority,
skip_analysis=skip_analysis,
reply_to=reply_to,
)
@ -71,12 +71,12 @@ class SubmitHandler(BaseHandler):
request_id=request_context.request_id,
session_id=session_id,
doi_hint=doi_hint,
doi_hint_priority=doi_hint_priority,
skip_analysis=skip_analysis,
uploader_id=user_id,
)
case 'application/zip':
try:
if request_context.is_personal_mode():
try:
file_data = await self.application.telegram_client.download_document(
document=event.document,
file=bytes,
@ -105,8 +105,6 @@ class SubmitHandler(BaseHandler):
session_id=session_id,
uploader_id=user_id,
)
else:
await event.reply(t('ZIP_FILES_ARE_NOT_SUPPORTED_IN_GROUP_MODE', request_context.chat.language))
finally:
return await event.delete()
case _:

View File

@ -85,10 +85,7 @@ class ViewHandler(BaseHandler):
holder = BaseHolder.create(typed_document_pb=typed_document_pb)
back_command = await self.compose_back_command(session_id=session_id, message_id=message_id, page=page)
promo = self.application.promotioner.choose_promotion(language).format(
related_channel=self.application.config['telegram']['related_channel'],
twitter_contact_url=self.application.config['twitter']['contact_url'],
)
promo = self.application.promotioner.choose_promotion(language)
view_builder = holder.view_builder(language).add_view(
bot_name=self.application.config['telegram']['bot_name']
).add_new_line(2).add(promo, escaped=True)

View File

@ -154,10 +154,7 @@ class SearchWidget(BaseSearchWidget):
)
promotion_language = self.query_language or self.chat.language
promo = self.application.promotioner.choose_promotion(promotion_language).format(
related_channel=self.application.config['telegram']['related_channel'],
twitter_contact_url=self.application.config['twitter']['contact_url'],
)
promo = self.application.promotioner.choose_promotion(promotion_language)
serp = f'{serp}\n\n{promo}\n'
buttons = None

View File

@ -58,7 +58,7 @@ class HubGrpcClient(BaseGrpcClient):
bot_name: str,
reply_to: Optional[int] = None,
doi_hint: Optional[str] = None,
doi_hint_priority: bool = False,
skip_analysis: bool = False,
request_id: Optional[str] = None,
session_id: Optional[str] = None,
uploader_id: Optional[int] = None
@ -68,7 +68,7 @@ class HubGrpcClient(BaseGrpcClient):
bot_name=bot_name,
reply_to=reply_to,
doi_hint=doi_hint,
doi_hint_priority=doi_hint_priority,
skip_analysis=skip_analysis,
uploader_id=uploader_id,
)
if isinstance(file, submitter_service_pb2.PlainFile):

View File

@ -4,6 +4,7 @@ pylon:
- [cambridge]
- [edinburg]
- [southampton]
default_resolver_proxy_list: ~
downloads_directory: /downloads
proxies:
- address: clash.default.svc.cluster.example.com:7890
@ -15,6 +16,9 @@ pylon:
- address: clash.default.svc.cluster.example.com:8090
name: southampton
tags: ['southampton']
- address: socks5://clash.default.svc.cluster.example.com:7991
name: socks5
tags: ['socks5']
sources:
# LibGen.rocks
- driver:
@ -331,6 +335,13 @@ pylon:
args:
format_string: 'https://journals.physiology.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# www.ahajournals.org
- matcher:
doi: ^10.1161/.*$
resolver:
args:
format_string: 'https://www.ahajournals.org/doi/pdf/{doi}?download=true'
class: nexus.pylon.resolvers.TemplateResolver
# ajp.psychiatryonline.org
- matcher:
doi: ^10.1176/.*$

View File

@ -23,7 +23,7 @@ message SubmitRequest {
string bot_name = 4;
optional int64 reply_to = 5;
optional string doi_hint = 6;
bool doi_hint_priority = 7;
bool skip_analysis = 7;
int64 uploader_id = 8;
}
message SubmitResponse { }

View File

@ -1,5 +1,6 @@
import asyncio
from aiobaseclient.exceptions import BadRequestError
from library.aiogrpctools.base import BaseService
from library.telegram.common import close_button
from nexus.views.telegram.common import vote_button
@ -16,8 +17,39 @@ def is_group_or_channel(chat_id: int):
return chat_id < 0
class ProcessedDocument:
def __init__(self, processed_document):
self.processed_document = processed_document
@staticmethod
async def setup(file_data, grobid_client, request_context):
try:
processed_document = await grobid_client.process_fulltext_document(pdf_file=file_data)
except BadRequestError as e:
request_context.statbox(action='unparsable_document')
request_context.error_log(e)
processed_document = {}
return ProcessedDocument(processed_document)
@property
def doi(self):
return self.processed_document.get('doi')
@property
def title(self):
return self.processed_document.get('title')
@property
def abstract(self):
return self.processed_document.get('abstract')
@property
def body(self):
return self.processed_document.get('body')
class BaseHubService(BaseService):
async def item_found(self, bot_name, doi):
async def found_item(self, bot_name, doi):
if mutual_aid_service := self.application.mutual_aid_services.get(bot_name):
await mutual_aid_service.delete_request(doi)
await self.application.idm_client.reschedule_subscriptions(
@ -36,13 +68,13 @@ class BaseHubService(BaseService):
)
))
def set_fields_from_processed(self, document_pb, processed_document):
def set_fields_from_processed(self, document_pb, processed_document: ProcessedDocument):
new_fields = []
if processed_document.get('abstract') and not document_pb.abstract:
document_pb.abstract = processed_document['abstract']
if processed_document.abstract and not document_pb.abstract:
document_pb.abstract = processed_document.abstract
new_fields.append('abstract')
if processed_document.get('body') and not document_pb.content:
document_pb.content = processed_document['body']
if processed_document.body and not document_pb.content:
document_pb.content = processed_document.body
new_fields.append('content')
return new_fields

View File

@ -46,6 +46,7 @@ from pypika import (
from .base import (
BaseHubService,
ProcessedDocument,
is_group_or_channel,
)
@ -76,6 +77,7 @@ class DeliveryService(delivery_service_pb2_grpc.DeliveryServicer, BaseHubService
proxies=pylon_config['proxies'],
source_configs=pylon_config['sources'],
default_driver_proxy_list=pylon_config['default_driver_proxy_list'],
default_resolver_proxy_list=pylon_config['default_resolver_proxy_list'],
downloads_directory=pylon_config['downloads_directory'],
)
self.should_parse_with_grobid = should_parse_with_grobid
@ -216,11 +218,12 @@ class DownloadTask:
error_log=request_context.error_log,
on_fail=_on_fail,
):
filename = document_holder.get_filename()
progress_bar_download = ProgressBar(
telegram_client=self.application.telegram_clients[request_context.bot_name],
request_context=request_context,
banner=t("LOOKING_AT", request_context.chat.language),
header=f'⬇️ {document_holder.get_filename()}',
header=f'⬇️ {filename}',
tail_text=t('TRANSMITTED_FROM', request_context.chat.language),
throttle_secs=throttle_secs,
)
@ -239,6 +242,10 @@ class DownloadTask:
)
if not document_holder.md5 and document_holder.get_extension() == 'pdf':
try:
await progress_bar_download.send_message(
t("PROCESSING_PAPER", request_context.chat.language).format(filename=filename),
ignore_last_call=True
)
file = clean_metadata(file, doi=document_holder.doi)
request_context.statbox(
action='cleaned',
@ -251,7 +258,7 @@ class DownloadTask:
request_context=request_context,
message=progress_bar_download.message,
banner=t("LOOKING_AT", request_context.chat.language),
header=f'⬇️ {document_holder.get_filename()}',
header=f'⬇️ {filename}',
tail_text=t('UPLOADED_TO_TELEGRAM', request_context.chat.language),
throttle_secs=throttle_secs
)
@ -264,7 +271,7 @@ class DownloadTask:
voting=not is_group_or_channel(self.request_context.chat.chat_id),
)
if self.document_holder.doi:
await self.delivery_service.item_found(
await self.delivery_service.found_item(
bot_name=request_context.bot_name,
doi=self.document_holder.doi,
)
@ -277,6 +284,7 @@ class DownloadTask:
document_holder=document_holder,
telegram_file_id=uploaded_message.file.id,
file=file,
request_context=request_context,
))
else:
request_context.statbox(
@ -425,7 +433,7 @@ class DownloadTask:
self.task.cancel()
await self.task
async def store_new_data(self, bot_name, document_holder, telegram_file_id, file):
async def store_new_data(self, bot_name, document_holder, telegram_file_id, file, request_context):
document_pb = document_holder.document_pb
new_fields = []
if self.delivery_service.should_store_hashes:
@ -448,7 +456,11 @@ class DownloadTask:
and document_holder.index_alias == 'scimag'
):
try:
processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file)
processed_document = await ProcessedDocument.setup(
file,
grobid_client=self.application.grobid_client,
request_context=request_context,
)
new_fields += self.delivery_service.set_fields_from_processed(document_pb, processed_document)
except BadRequestError as e:
self.request_context.statbox(action='unparsable_document')

View File

@ -34,7 +34,10 @@ from nexus.views.telegram.base_holder import ScimagHolder
from telethon.errors import ChatAdminRequiredError
from telethon.extensions import BinaryReader
from .base import BaseHubService
from .base import (
BaseHubService,
ProcessedDocument,
)
async def operation_log(document_operation_pb):
@ -87,22 +90,48 @@ class PlainFile:
def fuzzy_compare(a, b):
if a is None or b is None:
return False
a = re.sub(r'[^a-z\d]', '', a.lower())
b = re.sub(r'[^a-z\d]', '', b.lower())
return SequenceMatcher(None, a, b).ratio() > 0.9
async def delayed_task(task, seconds):
await asyncio.sleep(seconds)
await task
class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubService):
async def start(self):
submitter_service_pb2_grpc.add_SubmitterServicer_to_server(self, self.application.server)
def wrap_request_file(self, request, request_context):
match str(request.WhichOneof('file')):
case 'plain':
return PlainFile(request.plain)
case 'telegram':
return TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram)
case _:
raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}")
async def retrieve_metadata(self, doi, title, session_id, request_context):
if doi:
meta_search_response = await self.application.meta_api_client.meta_search(
index_aliases=['scimag', ],
query=doi,
collectors=[{'top_docs': {'limit': 1}}],
session_id=session_id,
request_id=request_context.request_id,
user_id=str(request_context.chat.chat_id),
query_tags=['submitter'],
)
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
if len(scored_documents) == 1:
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
if title is not None and not fuzzy_compare(scimag_pb.title, title):
request_context.statbox(
action='mismatched_title',
doi=doi,
processed_title=title,
title=scimag_pb.title,
)
return None
return scimag_pb
@aiogrpc_request_wrapper(log=False)
async def submit(
self,
@ -125,17 +154,10 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
)
buttons = None if request_context.is_group_mode() else [close_button()]
wrapped_file = self.wrap_request_file(request, request_context)
match str(request.WhichOneof('file')):
case 'plain':
file = PlainFile(request.plain)
case 'telegram':
file = TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram)
case _:
raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}")
if file.size > 30 * 1024 * 1024:
request_context.error_log(FileTooBigError(size=file.size))
if wrapped_file.size > 300 * 1024 * 1024:
request_context.error_log(FileTooBigError(size=wrapped_file.size))
request_context.statbox(action='file_too_big')
async with safe_execution(error_log=request_context.error_log):
await self.application.telegram_clients[request_context.bot_name].send_message(
@ -149,109 +171,59 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
try:
processing_message = await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id,
t("PROCESSING_PAPER", request_context.chat.language).format(filename=file.filename),
t("PROCESSING_PAPER", request_context.chat.language).format(filename=wrapped_file.filename),
reply_to=request.reply_to,
)
except ChatAdminRequiredError:
return submitter_service_pb2.SubmitResponse()
try:
file_data = await file.read()
processed_document = None
pdf_doi = None
pdf_title = None
try:
processed_document = await self.application.grobid_client.process_fulltext_document(pdf_file=file_data)
pdf_doi = processed_document.get('doi')
pdf_title = processed_document.get('title')
request_context.add_default_fields(pdf_doi=pdf_doi)
except BadRequestError as e:
request_context.statbox(action='unparsable_document')
request_context.error_log(e)
if not request.doi_hint:
await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id,
t('UNPARSABLE_DOCUMENT_ERROR', request_context.chat.language).format(
filename=file.filename,
),
buttons=buttons,
reply_to=request.reply_to,
file_data = await wrapped_file.read()
if not request.skip_analysis:
processed_document = await ProcessedDocument.setup(
file_data,
grobid_client=self.application.grobid_client,
request_context=request_context,
)
return submitter_service_pb2.SubmitResponse()
else:
processed_document = ProcessedDocument({})
if request.doi_hint and pdf_doi != request.doi_hint:
request_context.statbox(action='mismatched_doi', doi_hint_priority=request.doi_hint_priority)
if request.doi_hint_priority:
pdf_doi = request.doi_hint
if not pdf_doi and not request.doi_hint:
if not processed_document.doi and not request.doi_hint:
request_context.statbox(action='unparsable_doi')
request_context.error_log(UnparsableDoiError())
await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id,
t('UNPARSABLE_DOI_ERROR', request_context.chat.language).format(
filename=file.filename,
filename=wrapped_file.filename,
),
buttons=buttons,
reply_to=request.reply_to,
)
return submitter_service_pb2.SubmitResponse()
scimag_pb = None
if pdf_doi:
meta_search_response = await self.application.meta_api_client.meta_search(
index_aliases=['scimag',],
query=pdf_doi,
collectors=[{'top_docs': {'limit': 1}}],
scimag_pb = await self.retrieve_metadata(
processed_document.doi,
processed_document.title,
session_id=session_id,
request_id=request_context.request_id,
user_id=str(request_context.chat.chat_id),
query_tags=['submitter'],
request_context=request_context,
)
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
if len(scored_documents) == 1:
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
if not fuzzy_compare(scimag_pb.title, pdf_title):
request_context.statbox(
action='mismatched_title',
doi=pdf_doi,
pdf_title=pdf_title,
title=scimag_pb.title,
)
scimag_pb = None
if not scimag_pb and request.doi_hint:
meta_search_response = await self.application.meta_api_client.meta_search(
index_aliases=['scimag', ],
query=request.doi_hint,
collectors=[{'top_docs': {'limit': 1}}],
scimag_pb = await self.retrieve_metadata(
request.doi_hint,
processed_document.title,
session_id=session_id,
request_id=request_context.request_id,
user_id=str(request_context.chat.chat_id),
query_tags=['submitter'],
request_context=request_context,
)
scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
if len(scored_documents) == 1:
scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
if not fuzzy_compare(scimag_pb.title, pdf_title):
request_context.statbox(
action='mismatched_title',
doi=request.doi_hint,
pdf_title=pdf_title,
title=scimag_pb.title,
)
# ToDo: add trust mechanics
if not scimag_pb:
request_context.statbox(action='unavailable_metadata')
request_context.error_log(UnavailableMetadataError(doi=pdf_doi))
request_context.error_log(UnavailableMetadataError(doi=processed_document.doi))
await self.application.telegram_clients[request_context.bot_name].send_message(
request_context.chat.chat_id,
t(
'UNAVAILABLE_METADATA_ERROR',
language=request_context.chat.language
).format(doi=pdf_doi or request.doi_hint),
).format(doi=processed_document.doi or request.doi_hint),
buttons=buttons,
reply_to=request.reply_to,
)
@ -260,10 +232,7 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
request_context.add_default_fields(doi=scimag_pb.doi, document_id=scimag_pb.id)
try:
file_data = clean_metadata(file_data, doi=scimag_pb.doi)
request_context.statbox(
action='cleaned',
len=len(file_data),
)
request_context.statbox(action='cleaned', len=len(file_data))
except ValueError as e:
request_context.error_log(e)
uploaded_message = await self.send_file(
@ -276,13 +245,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
if processed_document:
sharience_pb = sharience_pb2.Sharience(
abstract=processed_document.get('abstract', ''),
content=processed_document.get('body', ''),
abstract=processed_document.abstract or '',
content=processed_document.body or '',
parent_id=scimag_pb.id,
uploader_id=request.uploader_id or request_context.chat.chat_id,
updated_at=int(time.time()),
md5=hashlib.md5(file_data).hexdigest(),
filesize=file.size,
filesize=wrapped_file.size,
ipfs_multihashes=await self.get_ipfs_hashes(file=file_data),
)
update_sharience_pb = operation_pb2.DocumentOperation(
@ -313,13 +282,13 @@ class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubServ
await operation_log(store_telegram_file_id_operation_pb)
request_context.statbox(action='successfully_stored')
if file.message_id:
if wrapped_file.message_id:
async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG):
await self.application.telegram_clients[request_context.bot_name].delete_messages(
request_context.chat.chat_id,
file.message_id,
wrapped_file.message_id,
)
await self.item_found(bot_name=request_context.bot_name, doi=scimag_pb.doi)
await self.found_item(bot_name=request_context.bot_name, doi=scimag_pb.doi)
finally:
await processing_message.delete()
return submitter_service_pb2.SubmitResponse()

View File

@ -45,7 +45,7 @@ from nexus.meta_api.word_transformers import (
)
def create_query_transformer(valid_fields, invalid_fields):
def create_query_transformer(valid_fields, invalid_fields, order_by_valid_fields):
return QueryProcessor(
tree_transformers=[
OrderByTreeTransformer(
@ -55,14 +55,7 @@ def create_query_transformer(valid_fields, invalid_fields):
'pr': 'page_rank',
'refc': 'referenced_by_count',
},
valid_fields=frozenset([
'id',
'referenced_by_count',
'issued_at',
'page_rank',
'pages',
'updated_at'
])
valid_fields=frozenset(order_by_valid_fields)
),
FieldTreeTransformer(
field_aliases={
@ -135,20 +128,35 @@ class GrpcServer(AioGrpcServer):
'doi', 'ipfs_multihashes', 'issns', 'isbns', 'issued_at', 'language', 'original_id',
'page_rank', 'referenced_by_count', 'references', 'tags', 'title', 'year',
}
order_by_scimag_fields = {
'id',
'referenced_by_count',
'issued_at',
'page_rank',
'updated_at'
}
scitech_fields = {
'id', 'authors', 'doi', 'description', 'extension',
'ipfs_multihashes', 'isbns', 'issued_at', 'language', 'original_id', 'pages',
'tags', 'title', 'updated_at', 'year',
}
order_by_scitech_fields = {
'id',
'issued_at',
'pages',
'updated_at'
}
self.query_transformers = {
'scimag': create_query_transformer(
valid_fields=scimag_fields,
invalid_fields=scitech_fields.difference(scimag_fields),
order_by_valid_fields=order_by_scimag_fields,
),
'scitech': create_query_transformer(
valid_fields=scitech_fields,
invalid_fields=scimag_fields.difference(scitech_fields),
order_by_valid_fields=order_by_scitech_fields,
)
}
self.summa_client = SummaClient(

View File

@ -31,7 +31,6 @@ class BaseService(LibraryBaseService):
new_scored_documents = []
for scored_document in scored_documents:
document = json.loads(scored_document.document)
document = self.enrich_document_with_stat_provider(document)
new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument(
typed_document=typed_document_pb2.TypedDocument(
**{scored_document.index_alias: self.pb_registry[scored_document.index_alias](**document)},
@ -46,7 +45,6 @@ class BaseService(LibraryBaseService):
new_scored_documents = []
for position, document in enumerate(documents):
document = json.loads(document)
document = self.enrich_document_with_stat_provider(document)
new_scored_documents.append(nexus_meta_api_search_service_pb.ScoredDocument(
typed_document=TypedDocumentPb(
**{index_alias: self.pb_registry[index_alias](**document)},
@ -55,14 +53,3 @@ class BaseService(LibraryBaseService):
score=1.0,
))
return new_scored_documents
def enrich_document_with_stat_provider(self, document):
if self.stat_provider:
original_id = (
document.get('original_id')
or document['id']
)
download_stats = self.stat_provider.get_download_stats(original_id)
if download_stats and download_stats.downloads_count:
document['downloads_count'] = download_stats.downloads_count
return document

View File

@ -57,19 +57,6 @@ class DocumentsService(DocumentsServicer, BaseService):
document_pb = getattr(typed_document_pb, typed_document_pb.WhichOneof('document'))
if hasattr(document_pb, 'original_id') and document_pb.original_id:
original_document_pb = await self.get_document(
index_alias=request.index_alias,
document_id=document_pb.original_id,
context=context,
request_id=metadata['request-id'],
session_id=metadata['session-id'],
)
for to_remove in ('doi', 'fiction_id', 'filesize', 'libgen_id',):
original_document_pb.ClearField(to_remove)
original_document_pb.MergeFrom(document_pb)
document_pb = original_document_pb
logging.getLogger('query').info({
'action': 'get',
'cache_hit': False,

View File

@ -270,6 +270,10 @@ class SearchService(SearchServicer, BaseService):
@aiogrpc_request_wrapper(log=False)
async def search(self, request, context, metadata):
preprocessed_query = await self.process_query(query=request.query, languages=request.language, context=context)
logging.getLogger('debug').info({
'action': 'preprocess_query',
'preprocessed_query': str(preprocessed_query),
})
index_aliases = self.resolve_index_aliases(
request_index_aliases=request.index_aliases,
processed_query=preprocessed_query,
@ -280,19 +284,27 @@ class SearchService(SearchServicer, BaseService):
right_offset = left_offset + page_size
search_requests = []
processed_queries = {}
for index_alias in index_aliases:
processed_query = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query)
processed_queries[index_alias] = self.query_transformers[index_alias].apply_tree_transformers(preprocessed_query)
logging.getLogger('debug').info({
'action': 'process_query',
'index_alias': index_alias,
'processed_query': str(processed_queries[index_alias]),
'order_by': processed_queries[index_alias].context.order_by,
'has_invalid_fields': processed_queries[index_alias].context.has_invalid_fields,
})
search_requests.append(
SearchRequest(
index_alias=index_alias,
query=processed_query.to_summa_query(),
query=processed_queries[index_alias].to_summa_query(),
collectors=[
search_service_pb2.Collector(
top_docs=search_service_pb2.TopDocsCollector(
limit=50,
scorer=self.scorer(processed_query, index_alias),
scorer=self.scorer(processed_queries[index_alias], index_alias),
snippets=self.snippets[index_alias],
explain=processed_query.context.explain,
explain=processed_queries[index_alias].context.explain,
)
),
search_service_pb2.Collector(count=search_service_pb2.CountCollector())
@ -322,9 +334,9 @@ class SearchService(SearchServicer, BaseService):
meta_search_response.collector_outputs[0].top_docs.scored_documents,
)
has_next = len(new_scored_documents) > right_offset
if 'scimag' in index_aliases:
if 'scimag' in processed_queries:
await self.check_if_need_new_documents_by_dois(
requested_dois=processed_query.context.dois,
requested_dois=processed_queries['scimag'].context.dois,
scored_documents=new_scored_documents,
should_request=attempt.retry_state.attempt_number == 1
)
@ -333,7 +345,7 @@ class SearchService(SearchServicer, BaseService):
scored_documents=new_scored_documents[left_offset:right_offset],
has_next=has_next,
count=meta_search_response.collector_outputs[1].count.count,
query_language=processed_query.context.query_language,
query_language=preprocessed_query.context.query_language,
)
return search_response_pb

View File

@ -6,7 +6,7 @@ pipe:
kafka-0.example.net
schema:
- consumers:
- class: nexus.pipe.consumers.CrossReferencesBulkConsumer
- class: nexus.pipe.consumers.CrossReferencesConsumer
topics:
- name: cross_references
workers: 4

View File

@ -1,17 +1,11 @@
from .cross_references_consumer import (
CrossReferencesBulkConsumer,
CrossReferencesConsumer,
)
from .cross_references_consumer import CrossReferencesConsumer
from .document_operations_consumer import (
DocumentOperationsBulkConsumer,
DocumentOperationsConsumer,
DocumentOperationsJsonConsumer,
)
__all__ = [
'CrossReferencesBulkConsumer',
'CrossReferencesConsumer',
'DocumentOperationsConsumer',
'DocumentOperationsBulkConsumer',
'DocumentOperationsJsonConsumer',
]

View File

@ -66,9 +66,10 @@ class BaseConsumer(AioRootThing):
except (ConflictError, InterruptProcessing) as e:
logging.getLogger('statbox').info(e)
except (asyncio.CancelledError, ConsumerStoppedError):
pass
return
finally:
await consumer.stop()
asyncio.create_task(self.stop())
async def start(self):
logging.getLogger('statbox').info({
@ -87,8 +88,10 @@ class BaseConsumer(AioRootThing):
async def stop(self):
if self.consumer_task:
self.consumer_task.cancel()
await self.consumer_task
consumer_task = self.consumer_task
self.consumer_task = None
consumer_task.cancel()
await consumer_task
class BasePbConsumer(BaseConsumer):
@ -108,52 +111,3 @@ class BaseJsonConsumer(BaseConsumer):
message = json.loads(msg.value)
ParseDict(message, pb, ignore_unknown_fields=True)
return pb
class BaseBulkConsumer(BaseConsumer):
auto_commit = False
bulk_size = 20
timeout = 1
async def consume(self, consumer):
try:
while self.started:
try:
result = await consumer.getmany(timeout_ms=self.timeout * 1000, max_records=self.bulk_size)
except (ConsumerStoppedError, asyncio.CancelledError):
break
collector = []
for tp, messages in result.items():
if messages:
for message in messages:
preprocessed_msg = self.preprocess(message)
if preprocessed_msg:
collector.append(preprocessed_msg)
for processor in self.processors:
filtered = filter(processor.filter, collector)
try:
await processor.process_bulk(filtered)
except InterruptProcessing as e:
logging.getLogger('statbox').info(e)
try:
await consumer.commit()
except CommitFailedError as e:
error_log(e)
continue
finally:
await consumer.stop()
async def start(self):
logging.getLogger('statbox').info({
'action': 'start',
'group_id': self.group_id,
'topic_names': self.topic_names,
})
consumer = self.create_consumer()
await consumer.start()
logging.getLogger('statbox').info({
'action': 'started',
'group_id': self.group_id,
'topic_names': self.topic_names,
})
self.consumer_task = asyncio.create_task(self.consume(consumer))

View File

@ -1,15 +1,8 @@
from nexus.models.proto.operation_pb2 import \
CrossReferenceOperation as CrossReferenceOperationPb
from .base import (
BaseBulkConsumer,
BasePbConsumer,
)
from .base import BasePbConsumer
class CrossReferencesConsumer(BasePbConsumer):
pb_class = CrossReferenceOperationPb
class CrossReferencesBulkConsumer(BaseBulkConsumer, CrossReferencesConsumer):
pass

View File

@ -2,7 +2,6 @@ from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from .base import (
BaseBulkConsumer,
BaseJsonConsumer,
BasePbConsumer,
)
@ -14,7 +13,3 @@ class DocumentOperationsConsumer(BasePbConsumer):
class DocumentOperationsJsonConsumer(BaseJsonConsumer):
pb_class = DocumentOperationPb
class DocumentOperationsBulkConsumer(BaseBulkConsumer, DocumentOperationsConsumer):
pass

View File

@ -22,12 +22,14 @@ class PylonClient(AioThing):
source_configs: Optional[List],
proxies: Optional[List[str]] = None,
downloads_directory: Optional[str] = None,
default_driver_proxy_list: [Optional[List]] = None
default_driver_proxy_list: [Optional[List]] = None,
default_resolver_proxy_list: [Optional[List]] = None,
):
super().__init__()
self.proxy_manager = ProxyManager(proxies)
self.downloads_directory = downloads_directory
self.default_driver_proxy_list = default_driver_proxy_list
self.default_resolver_proxy_list = default_resolver_proxy_list
self.sources = []
for source_config in source_configs:
source = Source.from_config(
@ -35,6 +37,7 @@ class PylonClient(AioThing):
source_config=source_config,
downloads_directory=downloads_directory,
default_driver_proxy_list=default_driver_proxy_list,
default_resolver_proxy_list=default_resolver_proxy_list,
)
self.sources.append(source)
self.starts.append(source)

View File

@ -138,6 +138,7 @@ pylon:
resolver:
args:
format_string: 'https://www.sciencedirect.com/science/article/pii/{selected}/pdfft?isDTMRedir=true&download=true'
resolve_timeout: 25.0
selector: '(.resource.primary.URL | split("/"))[-1]'
timeout: 40.0
class: nexus.pylon.resolvers.DoiOrgRequestResolver

View File

@ -26,7 +26,7 @@ class BaseDriver(NetworkAgent):
validator_cls = validator['class']
validator_cls = import_object(validator_cls)
self.validator = validator_cls or BaseValidator
self.validator = validator_cls
self.downloads_directory = downloads_directory
def __str__(self):

View File

@ -3,7 +3,6 @@ import json
import logging
import os.path
import shutil
import sys
import time
from pathlib import Path
from typing import (

View File

@ -1,4 +1,5 @@
import re
import sys
class Matcher:

View File

@ -11,6 +11,11 @@ import jq
from nexus.pylon.prepared_request import PreparedRequest
from nexus.pylon.proxy_manager import ProxyManager
from nexus.pylon.resolvers.base import BaseResolver
from tenacity import (
retry,
stop_after_attempt,
wait_random,
)
class DoiOrgRequestResolver(BaseResolver):
@ -24,6 +29,7 @@ class DoiOrgRequestResolver(BaseResolver):
proxy_manager: Optional[ProxyManager] = None,
):
super().__init__(proxy_list=proxy_list, proxy_manager=proxy_manager)
self.text_selector = selector
self.selector = jq.compile(selector)
self.format_string = format_string
self.timeout = timeout
@ -32,6 +38,11 @@ class DoiOrgRequestResolver(BaseResolver):
def __str__(self):
return f'{self.__class__.__name__}(selector = {self.selector}, format_string = {self.format_string})'
@retry(
reraise=True,
wait=wait_random(min=1, max=3),
stop=stop_after_attempt(4),
)
async def resolve_through_doi_org(self, params):
async with self.get_session() as session:
doi_url = f'https://doi.org/{params["doi"]}'
@ -51,6 +62,7 @@ class DoiOrgRequestResolver(BaseResolver):
logging.getLogger('error').error({
'action': 'error',
'mode': 'pylon',
'params': params,
'error': str(e)
})
return
@ -60,3 +72,11 @@ class DoiOrgRequestResolver(BaseResolver):
url=self.format_string.format(selected=selected),
timeout=self.timeout,
)
else:
logging.getLogger('debug').error({
'action': 'missed_selector',
'mode': 'pylon',
'params': params,
'selector': self.text_selector,
'format_string': self.format_string,
})

View File

@ -33,13 +33,17 @@ class Source(AioThing):
source_config,
downloads_directory: str,
default_driver_proxy_list: List,
default_resolver_proxy_list: List,
) -> 'Source':
matcher = Matcher(source_config['matcher'])
resolver_cls = import_object(
source_config.get('resolver', {}).get('class', 'nexus.pylon.resolvers.TemplateResolver')
)
resolver_args = dict(proxy_manager=proxy_manager)
resolver_args = dict(
proxy_manager=proxy_manager,
proxy_list=default_resolver_proxy_list,
)
resolver_args.update(**source_config.get('resolver', {}).get('args', {}))
resolver = resolver_cls(**resolver_args)

View File

@ -4,7 +4,7 @@
en:
ABOUT_US: |
**About us**
Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Uwe of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization.
Among the most impactful inventions of humanity such as the wheel, semiconductors or the wing were the invention of writing and advent of mass printing. Use of printing press made possible wide spread of ideas and knowledge. This burst increased the number of educated people and in its turn people started the Age of Enlightenment and shaped modern civilization.
Why printing provoked such dramatic changes in society?
Sane beings and knowledge live in a natural symbiosis: knowledge are fruiting and growing while circulated among people and people do the same. We may even note that complexity of human civilization, its achievements and technological progress during all its history have been correlated with the amount of accumulated knowledge and possibility to share and navigate through it.
@ -14,7 +14,7 @@ en:
However, there is one remaining issue: knowledge is usurped by large publishers, corporations such as Elsevier, Springer, Wiley, T&F etc. Held by copyright, the knowledge remains out of reach for many researchers and for many enterprises that could launch the new Age of Enlightenment.
Knowledge is not just usurped, an entire system has been built which is putting in chains researchers, academia and readers:
- Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact. Everybody knows who owns journals of high "impact"
- Large publishers promote and lobby approaches that encourage evaluation of researchers using the number of published articles in journals having high-impact
- Publishers make authors to pay for publishing tax-funded researches, then readers to pay for access to published tax-funded researches, while peer-reviewing is done for free by other researchers and this activity is considered as "honorable" in academia
- Publishers such as Elsevier repeatedly carried and carry campaigns against open access

View File

@ -17,6 +17,7 @@ class DocumentListWidget:
document_holders: List[BaseHolder],
bot_name,
header: Optional[str] = None,
promotioner=None,
has_next: bool = False,
session_id: Optional[str] = None,
message_id: Optional[int] = None,
@ -29,6 +30,7 @@ class DocumentListWidget:
self.document_holders = document_holders
self.bot_name = bot_name
self.header = header
self.promotioner = promotioner
self.cmd = cmd
self.has_next = has_next
self.session_id = session_id
@ -57,6 +59,10 @@ class DocumentListWidget:
if self.header:
serp = f'**{self.header}**\n\n{serp}'
promotion_language = self.chat.language
promo = self.promotioner.choose_promotion(promotion_language)
serp = f'{serp}\n\n{promo}\n'
buttons = []
if self.cmd and self.message_id and self.session_id and (self.has_next or self.page > 0):
buttons = [

View File

@ -100,9 +100,9 @@ class ProgressBar:
async def show_banner(self):
return await self.send_message(await self.render_banner(), ignore_last_call=True)
async def callback(self, done, total):
async def callback(self, done, total, ignore_last_call=False):
self._set_progress(done, total)
return await self.send_message(await self.render_progress())
return await self.send_message(await self.render_progress(), ignore_last_call=ignore_last_call)
class ThrottlerWrapper:

View File

@ -139,7 +139,7 @@ spacy-loggers==1.0.3
SQLAlchemy==1.4.39
sqlparse==0.4.2
srsly==2.4.4
Telethon==1.24.0
Telethon==1.25.0
tenacity==8.0.1
termcolor==1.1.0
textblob==0.17.1