hyperboria/nexus/actions/scitech_pb.py

59 lines
2.2 KiB
Python

from html import unescape
from bs4 import BeautifulSoup
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
from nexus.models.proto.scitech_pb2 import Scitech as ScitechPb
from nexus.models.proto.typed_document_pb2 import \
TypedDocument as TypedDocumentPb
from nexus.nlptools.language_detect import detect_language
from nexus.nlptools.utils import (
despace,
despace_full,
)
from .base import BaseAction
from .common import canonize_doi
class CleanAction(BaseAction):
    """Tidy up the text and identifier fields of a ``ScitechPb`` message."""

    @staticmethod
    def _flatten_description(raw_description: str) -> str:
        """Turn a marked-up description into plain text.

        HTML entities are unescaped and the result is parsed with the
        ``lxml`` parser. Every ``p``, ``title``, ``jats:title`` and
        ``jats:p`` element is replaced by its stripped text wrapped in
        newlines; the remaining text is stripped and run through
        ``despace``.
        """
        soup = BeautifulSoup(unescape(raw_description), 'lxml')
        for tag in soup.select(r'p, title, jats\:title, jats\:p'):
            tag.replace_with(f'\n{tag.text.strip()}\n')
        return despace(soup.text.strip())

    async def do(self, scitech_pb: ScitechPb) -> ScitechPb:
        """Clean ``scitech_pb`` in place and return the same message.

        Authors, series and title go through ``despace_full``; a non-empty
        description is flattened to plain text; ``md5`` and ``extension``
        are lower-cased; ``doi`` goes through ``canonize_doi``; an edition
        equal to the literal string ``'None'`` is blanked.
        """
        # Rewrite each author entry by index so the repeated field is
        # updated in place; an empty field simply skips the loop.
        authors = scitech_pb.authors
        for position, name in enumerate(authors):
            authors[position] = despace_full(name)

        if scitech_pb.description:
            scitech_pb.description = self._flatten_description(scitech_pb.description)

        scitech_pb.series = despace_full(scitech_pb.series)
        scitech_pb.title = despace_full(scitech_pb.title)

        # Only ask the detector when no meta language is recorded yet and
        # there is some text (title or description) to feed it.
        has_text = bool(scitech_pb.title or scitech_pb.description)
        if has_text and not scitech_pb.meta_language:
            guess = detect_language(f'{scitech_pb.title} {scitech_pb.description}')
            if guess:
                scitech_pb.meta_language = guess
        # NOTE(review): indentation was lost in the reviewed copy; this
        # fallback is treated as a method-level statement - confirm.
        if not scitech_pb.language:
            scitech_pb.language = scitech_pb.meta_language

        scitech_pb.md5 = scitech_pb.md5.lower()
        scitech_pb.extension = scitech_pb.extension.lower()
        scitech_pb.doi = canonize_doi(scitech_pb.doi)

        # The literal text 'None' in the edition field is replaced by an
        # empty string.
        if scitech_pb.edition == 'None':
            scitech_pb.edition = ''
        return scitech_pb
class ToDocumentOperationPbAction(BaseAction):
    """Wrap a ``ScitechPb`` into a serialized update-document operation."""

    async def do(self, item: ScitechPb) -> bytes:
        """Return the bytes of a ``DocumentOperation`` carrying ``item``.

        The message is built inside-out: ``item`` becomes the ``scitech``
        field of a ``TypedDocument``, which is placed in an
        ``UpdateDocument`` with ``reindex=True``, which in turn is the
        ``update_document`` field of the ``DocumentOperation`` that gets
        serialized.
        """
        typed_document = TypedDocumentPb(scitech=item)
        update = UpdateDocumentPb(reindex=True, typed_document=typed_document)
        operation = DocumentOperationPb(update_document=update)
        return operation.SerializeToString()