Mirror of https://github.com/nexus-stc/hyperboria, synced 2025-02-21 14:31:08 +01:00

import asyncio
import hashlib
import logging
import re
import time
from difflib import SequenceMatcher

import orjson as json
from aiogrobid.exceptions import BadRequestError
from grpc import ServicerContext
from izihawa_utils.pb_to_json import MessageToDict
from library.aiogrpctools.base import aiogrpc_request_wrapper
from library.telegram.base import RequestContext
from library.telegram.common import close_button
from library.telegram.utils import safe_execution
from nexus.hub.exceptions import (
    FileTooBigError,
    UnavailableMetadataError,
    UnparsableDoiError,
)
from nexus.hub.proto import (
    submitter_service_pb2,
    submitter_service_pb2_grpc,
)
from nexus.models.proto import (
    operation_pb2,
    scimag_pb2,
    sharience_pb2,
    typed_document_pb2,
)
from nexus.pylon.pdftools import clean_metadata
from nexus.translations import t
from nexus.views.telegram.base_holder import ScimagHolder
from telethon.errors import ChatAdminRequiredError
from telethon.extensions import BinaryReader

from .base import (
    BaseHubService,
    ProcessedDocument,
)


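# Every document operation produced by this service is serialized to JSON and
# written to the dedicated 'operation' logger.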
async def operation_log(document_operation_pb):
    logging.getLogger('operation').info(
        msg=MessageToDict(document_operation_pb, preserving_proto_field_name=True),
    )


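# A file that already lives in Telegram: metadata is read from the serialized TL
# document and the payload is downloaded through the owning Telegram client.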
class TelegramFile:
    def __init__(self, telegram_client, telegram_file):
        self.telegram_client = telegram_client
        self.telegram_file = telegram_file
        self.document = BinaryReader(telegram_file.document).tgread_object()

    @property
    def size(self):
        return self.document.size

    @property
    def message_id(self):
        return self.telegram_file.message_id

    @property
    def filename(self):
        return self.document.attributes[0].file_name

    async def read(self):
        return await self.telegram_client.download_document(
            document=self.document,
            file=bytes,
        )


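# Raw bytes passed inline in the request; exposes the same interface as
# TelegramFile so the submit pipeline can treat both sources uniformly.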
class PlainFile:
    def __init__(self, plain_file):
        self.plain_file = plain_file

    @property
    def size(self):
        return len(self.plain_file.data)

    @property
    def message_id(self):
        return None

    @property
    def filename(self):
        return self.plain_file.filename

    async def read(self):
        return self.plain_file.data


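# Case- and punctuation-insensitive comparison: both strings are reduced to
# lowercase alphanumerics and matched with a similarity ratio threshold of 0.9.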
def fuzzy_compare(a, b):
    a = re.sub(r'[^a-z\d]', '', a.lower())
    b = re.sub(r'[^a-z\d]', '', b.lower())
    return SequenceMatcher(None, a, b).ratio() > 0.9


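# gRPC service that accepts user-submitted papers: it verifies the file, resolves
# its metadata by DOI, re-uploads the cleaned PDF and emits index update operations.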
class SubmitterService(submitter_service_pb2_grpc.SubmitterServicer, BaseHubService):
    async def start(self):
        submitter_service_pb2_grpc.add_SubmitterServicer_to_server(self, self.application.server)

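    # Dispatch on the 'file' oneof of SubmitRequest: inline bytes ('plain') or a
    # reference to a message already stored in Telegram ('telegram').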
    def wrap_request_file(self, request, request_context):
        match str(request.WhichOneof('file')):
            case 'plain':
                return PlainFile(request.plain)
            case 'telegram':
                return TelegramFile(self.application.telegram_clients[request_context.bot_name], request.telegram)
            case _:
                raise RuntimeError(f"Unknown file type {request.WhichOneof('file')}")

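    # Look the DOI up in the 'scimag' index; when a title was extracted from the PDF
    # it must fuzzily match the indexed title, otherwise the hit is discarded.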
    async def retrieve_metadata(self, doi, title, session_id, request_context):
        if doi:
            meta_search_response = await self.application.meta_api_client.meta_search(
                index_aliases=['scimag'],
                query=doi,
                collectors=[{'top_docs': {'limit': 1}}],
                session_id=session_id,
                request_id=request_context.request_id,
                user_id=str(request_context.chat.chat_id),
                query_tags=['submitter'],
            )
            scored_documents = meta_search_response.collector_outputs[0].top_docs.scored_documents
            if len(scored_documents) == 1:
                scimag_pb = scimag_pb2.Scimag(**json.loads(scored_documents[0].document))
                if title is not None and not fuzzy_compare(scimag_pb.title, title):
                    request_context.statbox(
                        action='mismatched_title',
                        doi=doi,
                        processed_title=title,
                        title=scimag_pb.title,
                    )
                    return None
                return scimag_pb

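    # Main entry point: validate the uploaded file, resolve its metadata by DOI,
    # clean the PDF, re-upload it to the chat and record index update operations.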
    @aiogrpc_request_wrapper(log=False)
    async def submit(
        self,
        request: submitter_service_pb2.SubmitRequest,
        context: ServicerContext,
        metadata: dict,
    ) -> submitter_service_pb2.SubmitResponse:
        session_id = metadata.get('session-id')
        request_context = RequestContext(
            bot_name=request.bot_name,
            chat=request.chat,
            request_id=metadata.get('request-id'),
        )
        request_context.add_default_fields(
            mode='submit',
            index_alias='scimag',
            session_id=session_id,
            doi_hint=request.doi_hint,
            **self.get_default_service_fields(),
        )

        buttons = None if request_context.is_group_mode() else [close_button()]
        wrapped_file = self.wrap_request_file(request, request_context)

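        # Files above 300 MiB are rejected before any processing is attempted.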
        if wrapped_file.size > 300 * 1024 * 1024:
            request_context.error_log(FileTooBigError(size=wrapped_file.size))
            request_context.statbox(action='file_too_big')
            async with safe_execution(error_log=request_context.error_log):
                await self.application.telegram_clients[request_context.bot_name].send_message(
                    request_context.chat.chat_id,
                    t('FILE_TOO_BIG_ERROR', request_context.chat.language),
                    buttons=buttons,
                    reply_to=request.reply_to,
                )
            return submitter_service_pb2.SubmitResponse()

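        # Post a 'processing' status message; if the bot lacks the rights to post in
        # this chat, give up silently.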
        try:
            processing_message = await self.application.telegram_clients[request_context.bot_name].send_message(
                request_context.chat.chat_id,
                t("PROCESSING_PAPER", request_context.chat.language).format(filename=wrapped_file.filename),
                reply_to=request.reply_to,
            )
        except ChatAdminRequiredError:
            return submitter_service_pb2.SubmitResponse()

        try:
            file_data = await wrapped_file.read()
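            # Run the PDF through GROBID to extract DOI, title, abstract and body text,
            # unless the caller explicitly asked to skip analysis.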
            if not request.skip_analysis:
                processed_document = await ProcessedDocument.setup(
                    file_data,
                    grobid_client=self.application.grobid_client,
                    request_context=request_context,
                )
            else:
                processed_document = ProcessedDocument({})

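            # Without a DOI (neither extracted from the PDF nor hinted by the caller)
            # the submission cannot be matched to a record.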
            if not processed_document.doi and not request.doi_hint:
                request_context.statbox(action='unparsable_doi')
                request_context.error_log(UnparsableDoiError())
                await self.application.telegram_clients[request_context.bot_name].send_message(
                    request_context.chat.chat_id,
                    t('UNPARSABLE_DOI_ERROR', request_context.chat.language).format(
                        filename=wrapped_file.filename,
                    ),
                    buttons=buttons,
                    reply_to=request.reply_to,
                )
                return submitter_service_pb2.SubmitResponse()

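            # Prefer the DOI extracted from the document; fall back to the DOI hint
            # supplied with the request.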
            scimag_pb = await self.retrieve_metadata(
                processed_document.doi,
                processed_document.title,
                session_id=session_id,
                request_context=request_context,
            )
            if not scimag_pb and request.doi_hint:
                scimag_pb = await self.retrieve_metadata(
                    request.doi_hint,
                    processed_document.title,
                    session_id=session_id,
                    request_context=request_context,
                )

            if not scimag_pb:
                request_context.statbox(action='unavailable_metadata')
                request_context.error_log(UnavailableMetadataError(doi=processed_document.doi))
                await self.application.telegram_clients[request_context.bot_name].send_message(
                    request_context.chat.chat_id,
                    t(
                        'UNAVAILABLE_METADATA_ERROR',
                        language=request_context.chat.language,
                    ).format(doi=processed_document.doi or request.doi_hint),
                    buttons=buttons,
                    reply_to=request.reply_to,
                )
                return submitter_service_pb2.SubmitResponse()

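            # Try to clean the embedded PDF metadata; if cleaning fails the original
            # bytes are kept and the error is only logged.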
            request_context.add_default_fields(doi=scimag_pb.doi, document_id=scimag_pb.id)
            try:
                file_data = clean_metadata(file_data, doi=scimag_pb.doi)
                request_context.statbox(action='cleaned', len=len(file_data))
            except ValueError as e:
                request_context.error_log(e)
            uploaded_message = await self.send_file(
                document_holder=ScimagHolder(scimag_pb),
                file=file_data,
                request_context=request_context,
                session_id=session_id,
                voting=False,
            )

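            # Record the submitted copy (sharience) with its hashes next to the
            # canonical record so its full text can be indexed.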
            if processed_document:
                sharience_pb = sharience_pb2.Sharience(
                    abstract=processed_document.abstract or '',
                    content=processed_document.body or '',
                    parent_id=scimag_pb.id,
                    uploader_id=request.uploader_id or request_context.chat.chat_id,
                    updated_at=int(time.time()),
                    md5=hashlib.md5(file_data).hexdigest(),
                    filesize=wrapped_file.size,
                    ipfs_multihashes=await self.get_ipfs_hashes(file=file_data),
                )
                update_sharience_pb = operation_pb2.DocumentOperation(
                    update_document=operation_pb2.UpdateDocument(
                        full_text_index=True,
                        typed_document=typed_document_pb2.TypedDocument(sharience=sharience_pb),
                    ),
                )
                await operation_log(update_sharience_pb)

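            # Merge any fields recovered from the PDF into the canonical scimag record.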
            new_fields = self.set_fields_from_processed(scimag_pb, processed_document)
            if new_fields:
                update_scimag_pb = operation_pb2.DocumentOperation(
                    update_document=operation_pb2.UpdateDocument(
                        full_text_index=True,
                        typed_document=typed_document_pb2.TypedDocument(scimag=scimag_pb),
                        fields=new_fields,
                    ),
                )
                await operation_log(update_scimag_pb)
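            # Persist the Telegram file id of the uploaded copy for this document.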
            store_telegram_file_id_operation_pb = operation_pb2.DocumentOperation(
                store_telegram_file_id=operation_pb2.StoreTelegramFileId(
                    document_id=scimag_pb.id,
                    telegram_file_id=uploaded_message.file.id,
                    bot_name=request_context.bot_name,
                ),
            )
            await operation_log(store_telegram_file_id_operation_pb)
            request_context.statbox(action='successfully_stored')

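            # Delete the original upload message, if any, now that the clean copy has
            # been posted.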
            if wrapped_file.message_id:
                async with safe_execution(error_log=request_context.error_log, level=logging.DEBUG):
                    await self.application.telegram_clients[request_context.bot_name].delete_messages(
                        request_context.chat.chat_id,
                        wrapped_file.message_id,
                    )
            await self.found_item(bot_name=request_context.bot_name, doi=scimag_pb.doi)
        finally:
            await processing_message.delete()
        return submitter_service_pb2.SubmitResponse()