2021-01-04 11:35:31 +03:00
|
|
|
from html import unescape
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
2022-09-02 18:44:56 +03:00
|
|
|
from izihawa_nlptools.language_detect import detect_language
|
|
|
|
from izihawa_nlptools.utils import (
|
|
|
|
despace,
|
|
|
|
despace_full,
|
|
|
|
)
|
2021-01-04 11:35:31 +03:00
|
|
|
from nexus.models.proto.operation_pb2 import \
|
|
|
|
DocumentOperation as DocumentOperationPb
|
|
|
|
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
|
|
|
|
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
|
|
|
|
from nexus.models.proto.typed_document_pb2 import \
|
|
|
|
TypedDocument as TypedDocumentPb
|
|
|
|
|
|
|
|
from .base import BaseAction
|
2022-03-28 17:39:36 +03:00
|
|
|
from .common import canonize_doi
|
2021-01-04 11:35:31 +03:00
|
|
|
|
|
|
|
|
2022-09-02 18:44:56 +03:00
|
|
|
class DetectLanguageAction(BaseAction):
    """Populate the language fields of a scimag record from its own text."""

    async def do(self, scimag_pb: ScimagPb) -> ScimagPb:
        """Detect the language of the record and store it on the record.

        The title, abstract and content are joined with single spaces and
        handed to ``detect_language``. A truthy result is always written to
        ``meta_language``; it is written to ``language`` as well only when the
        record has content. If ``language`` is still empty afterwards, it is
        set to whatever ``meta_language`` holds.

        Args:
            scimag_pb: the record to update; it is modified in place.

        Returns:
            The same ``scimag_pb`` object that was passed in.
        """
        guess = None
        # Detection is skipped entirely when there is no text to look at.
        if scimag_pb.title or scimag_pb.abstract or scimag_pb.content:
            guess = detect_language(f'{scimag_pb.title} {scimag_pb.abstract} {scimag_pb.content}')

        if guess:
            scimag_pb.meta_language = guess
            # An already set `language` is overwritten only when content
            # took part in the detection.
            if scimag_pb.content:
                scimag_pb.language = guess

        if not scimag_pb.language:
            scimag_pb.language = scimag_pb.meta_language
        return scimag_pb
|
|
|
|
|
|
|
|
|
2022-03-28 17:39:36 +03:00
|
|
|
class CleanAction(BaseAction):
    """Strip markup from the textual fields of a scimag record and canonize its DOIs."""

    # Ordered (needle, replacement) pairs applied to `container_title` before
    # it is parsed as markup. The order must be kept: the later needles are
    # substrings of the earlier ones, so the long forms have to go first.
    _CONTAINER_TITLE_FIXUPS = (
        (
            '<html_ent glyph="@lt;" ascii="<"/>'
            'html_ent glyph="@amp;" ascii="<html_ent glyph="@amp;" ascii="&"/>"/'
            '<html_ent glyph="@gt;" ascii=">"/>',
            '&',
        ),
        ('<html_ent glyph="@amp;" ascii="&"/>', '&'),
        (
            '<html_ent glyph="@lt;" ascii="<"/>'
            'html_ent glyph="@amp;" ascii="&"/'
            '<html_ent glyph="@gt;" ascii=">"/>',
            '&',
        ),
        ('<html_ent glyph="@lt;" ascii="<"/>', ''),
        ('<html_ent glyph="@gt;" ascii=">"/>', ''),
    )

    @staticmethod
    def _plain_text(markup: str) -> str:
        """Unescape HTML entities, parse the result with lxml and return its stripped text."""
        return BeautifulSoup(unescape(markup), 'lxml').text.strip()

    async def do(self, scimag_pb: ScimagPb) -> ScimagPb:
        """Clean the record in place and return it.

        Only non-empty fields are touched:

        * ``abstract``: entities are unescaped, every ``p``/``title``/
          ``jats:p``/``jats:title`` element is replaced by its stripped text
          wrapped in newlines, and the remaining text goes through ``despace``.
        * ``title`` and each entry of ``authors``: reduced to plain text and
          passed through ``despace_full``.
        * ``container_title``: the ``html_ent`` leftovers are rewritten first,
          then the value is reduced to plain text.
        * ``doi`` and each entry of ``references``: passed through
          ``canonize_doi``.

        Args:
            scimag_pb: the record to clean; it is modified in place.

        Returns:
            The same ``scimag_pb`` object that was passed in.
        """
        if scimag_pb.abstract:
            soup = BeautifulSoup(unescape(scimag_pb.abstract), 'lxml')
            # Put paragraph-like elements on their own lines so that they do
            # not run together once the markup is dropped.
            for node in soup.select(r'p, title, jats\:title, jats\:p'):
                node.replace_with(f'\n{node.text.strip()}\n')
            scimag_pb.abstract = despace(soup.text.strip())

        if scimag_pb.title:
            scimag_pb.title = despace_full(self._plain_text(scimag_pb.title))

        for position, raw_author in enumerate(scimag_pb.authors):
            scimag_pb.authors[position] = despace_full(self._plain_text(raw_author))

        if scimag_pb.container_title:
            container_title = scimag_pb.container_title
            for needle, replacement in self._CONTAINER_TITLE_FIXUPS:
                container_title = container_title.replace(needle, replacement)
            scimag_pb.container_title = self._plain_text(container_title)

        if scimag_pb.doi:
            scimag_pb.doi = canonize_doi(scimag_pb.doi)

        if scimag_pb.references:
            # Materialize the new values before clearing the field they are
            # computed from.
            canonized = [canonize_doi(reference) for reference in scimag_pb.references]
            del scimag_pb.references[:]
            scimag_pb.references.extend(canonized)

        return scimag_pb
|
|
|
|
|
|
|
|
|
2022-09-02 18:44:56 +03:00
|
|
|
class ToDocumentOperationBytesAction(BaseAction):
    """Wrap a scimag record into a serialized update-document operation."""

    def __init__(self, full_text_index: bool, should_fill_from_external_source: bool):
        """Remember the two flags that are copied into every emitted operation.

        Args:
            full_text_index: value for the ``full_text_index`` field of the
                update message.
            should_fill_from_external_source: value for the
                ``should_fill_from_external_source`` field of the update
                message.
        """
        super().__init__()
        self.full_text_index = full_text_index
        self.should_fill_from_external_source = should_fill_from_external_source

    async def do(self, item: ScimagPb) -> bytes:
        """Build the operation message around ``item`` and serialize it.

        Args:
            item: the scimag record to embed as the typed document.

        Returns:
            The ``DocumentOperation`` message serialized to bytes.
        """
        typed_document_pb = TypedDocumentPb(scimag=item)
        update_document_pb = UpdateDocumentPb(
            full_text_index=self.full_text_index,
            should_fill_from_external_source=self.should_fill_from_external_source,
            typed_document=typed_document_pb,
        )
        operation_pb = DocumentOperationPb(update_document=update_document_pb)
        return operation_pb.SerializeToString()
|