325 lines
12 KiB
Python
325 lines
12 KiB
Python
import asyncio
|
|
import logging
|
|
from typing import (
|
|
Optional,
|
|
Set,
|
|
)
|
|
|
|
import orjson as json
|
|
from aiocrossref import CrossrefClient
|
|
from aiocrossref.exceptions import (
|
|
NotFoundError,
|
|
WrongContentTypeError,
|
|
)
|
|
from aiokafka import AIOKafkaProducer
|
|
from aiosumma import SummaClient
|
|
from izihawa_utils.common import filter_none
|
|
from izihawa_utils.pb_to_json import MessageToDict
|
|
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
|
from nexus.actions import scimag_pb
|
|
from nexus.actions.base import BaseAction
|
|
from nexus.actions.common import canonize_doi
|
|
from nexus.actions.crossref_api import ToScimagPbAction
|
|
from nexus.actions.exceptions import InterruptProcessing
|
|
from nexus.models.proto.operation_pb2 import \
|
|
CrossReferenceOperation as CrossReferenceOperationPb
|
|
from nexus.models.proto.operation_pb2 import \
|
|
DocumentOperation as DocumentOperationPb
|
|
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
|
|
from pypika import (
|
|
PostgreSQLQuery,
|
|
Table,
|
|
)
|
|
from pypika.terms import Array
|
|
from summa.proto import index_service_pb2 as index_service_pb
|
|
|
|
|
|
class ToPostgresAction(BaseAction):
    """Write the scimag document of a document operation to PostgreSQL.

    A document that already carries an ``id`` is updated by that id. A
    document without one is inserted (upserting on ``doi``) and the id
    returned by the database is stored back on the document.
    """

    scimag_table = Table('scimag')
    # Fields whose values are wrapped in a pypika ``Array`` before being
    # written (see ``cast_field_value``).
    db_multi_fields = {
        'authors',
        'ipfs_multihashes',
        'isbns',
        'issns',
        'tags',
    }
    # Fields whose values are passed to the query builder unchanged.
    db_single_fields = {
        'id',
        'abstract',
        'container_title',
        'content',
        'doi',
        'embedding',
        'filesize',
        'first_page',
        'is_deleted',
        'issued_at',
        'issue',
        'journal_id',
        'language',
        'last_page',
        'meta_language',
        'md5',
        'page_rank',
        'referenced_by_count',
        'scimag_bulk_id',
        'title',
        'type',
        'updated_at',
        'volume',
    }
    db_fields = db_single_fields | db_multi_fields

    def __init__(self, database):
        """Create the connection pool holder and register it for start-up.

        Args:
            database: mapping with the keys ``database``, ``username``,
                ``password`` and ``host``.
        """
        super().__init__()
        # Every value is quoted: in a keyword/value connection string an
        # unquoted empty value or a value containing whitespace, a quote or
        # a backslash would be parsed incorrectly.
        conninfo_parts = (
            ('dbname', database['database']),
            ('user', database['username']),
            ('password', database['password']),
            ('host', database['host']),
        )
        self.pool_holder = AioPostgresPoolHolder(
            conninfo=' '.join(
                f'{keyword}={self._quote_conninfo_value(value)}'
                for keyword, value in conninfo_parts
            ),
        )
        self.starts.append(self.pool_holder)

    @staticmethod
    def _quote_conninfo_value(value) -> str:
        """Return *value* single-quoted with ``\\`` and ``'`` backslash-escaped."""
        escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
        return f"'{escaped}'"

    def cast_field_value(self, field_name: str, field_value):
        """Return ``(field_name, value)`` with multi-valued fields wrapped in ``Array``."""
        if field_name in self.db_multi_fields:
            field_value = Array(*field_value)
        return field_name, field_value

    def is_field_set(self, scimag_pb: ScimagPb, field_name: str):
        """Return a truthy value when *field_name* should be written.

        ``scimag_bulk_id`` and ``issued_at`` are checked with ``HasField``;
        every other field counts as set when its value is truthy.
        """
        # NOTE(review): presumably these two fields track presence explicitly
        # so that a zero value is still written - confirm against the proto.
        if field_name in {'scimag_bulk_id', 'issued_at'}:
            return scimag_pb.HasField(field_name)
        return getattr(scimag_pb, field_name)

    def _collect_set_fields(self, scimag_pb: ScimagPb, fields: Optional[Set[str]]) -> list:
        """Return ``(column, value)`` pairs for every requested field that is set.

        Falls back to ``db_fields`` when *fields* is empty or ``None``.
        """
        return [
            self.cast_field_value(field_name, getattr(scimag_pb, field_name))
            for field_name in (fields or self.db_fields)
            if self.is_field_set(scimag_pb, field_name)
        ]

    def generate_insert_sql(self, scimag_pb: ScimagPb, fields: Optional[Set[str]] = None):
        """Build an ``INSERT ... ON CONFLICT (doi) DO UPDATE ... RETURNING id`` statement.

        Args:
            scimag_pb: document providing the values.
            fields: names of the fields to write; all ``db_fields`` when empty.

        Returns:
            The SQL text produced by the query builder.
        """
        assignments = self._collect_set_fields(scimag_pb, fields)
        columns = [column for column, _ in assignments]
        params = [value for _, value in assignments]

        query = PostgreSQLQuery.into(self.scimag_table).columns(*columns).insert(*params)
        if assignments:
            # On a DOI clash overwrite the existing row with the same values.
            query = query.on_conflict('doi')
            for column, value in assignments:
                query = query.do_update(column, value)

        return query.returning(self.scimag_table.id).get_sql()

    def generate_update_sql(
        self,
        scimag_pb: ScimagPb,
        fields: Optional[Set[str]] = None,
    ) -> str:
        """Build an ``UPDATE`` statement for the row whose id is ``scimag_pb.id``.

        Args:
            scimag_pb: document providing the values and the target id.
            fields: names of the fields to write; all ``db_fields`` when empty.

        Returns:
            The SQL text produced by the query builder.
        """
        query = PostgreSQLQuery.update(self.scimag_table)
        for column, value in self._collect_set_fields(scimag_pb, fields):
            query = query.set(column, value)
        return query.where(self.scimag_table.id == scimag_pb.id).get_sql()

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Update or insert the document and return the same operation message.

        After an insert, the id from the first returned row is written to
        ``scimag.id`` on the operation's document.
        """
        update_document_pb = document_operation_pb.update_document
        scimag_pb = update_document_pb.typed_document.scimag
        fields = update_document_pb.fields or self.db_fields

        if scimag_pb.id:
            sql = self.generate_update_sql(
                scimag_pb,
                fields=fields,
            )
            await self.pool_holder.execute(sql)
        else:
            sql = self.generate_insert_sql(
                scimag_pb=scimag_pb,
                fields=fields,
            )
            result = [row async for row in self.pool_holder.iterate(sql)]
            scimag_pb.id = result[0][0]
        return document_operation_pb
|
|
|
|
|
|
class ToSummaAction(BaseAction):
    """Send the scimag document of a document operation to the search index.

    Depending on the flags of the update request the document is either
    indexed and committed directly through the Summa client, or published
    to the configured Kafka topics as a serialized ``IndexOperation``.
    """

    # Documents whose ``type`` is listed here are never indexed.
    forbidden_types = {
        'book-series',
        'book-set',
        'book-track',
        'component',
        'dataset',
        'journal',
        'journal-issue',
        'journal-volume',
        'peer-review',
        'proceedings',
        'report-series',
    }

    def __init__(self, kafka, summa):
        """Store the configuration and create the Summa client.

        Args:
            kafka: mapping with ``bootstrap_servers``, ``max_request_size``
                and ``topic_names``.
            summa: mapping with ``endpoint`` and ``index_alias``.
        """
        super().__init__()
        self.kafka = kafka
        # Created in ``start`` because the producer needs the running loop.
        self.producer = None
        self.summa_config = summa
        self.summa_client = SummaClient(endpoint=summa['endpoint'])

    async def start(self):
        """Create and start the Kafka producer, then start the Summa client."""
        self.producer = self.get_producer()
        await self.producer.start()
        await self.summa_client.start()

    async def stop(self):
        """Stop the Summa client and the Kafka producer.

        The producer is stopped even when stopping the Summa client raises,
        so that it is not left running; the exception still propagates.
        """
        try:
            await self.summa_client.stop()
        finally:
            if self.producer:
                await self.producer.stop()
                self.producer = None

    def get_producer(self):
        """Build a Kafka producer bound to the currently running event loop."""
        return AIOKafkaProducer(
            loop=asyncio.get_running_loop(),
            bootstrap_servers=self.kafka['bootstrap_servers'],
            max_request_size=self.kafka['max_request_size'],
        )

    async def async_index(self, scimag_pb: ScimagPb):
        """Publish the document as an ``IndexOperation`` to every configured topic."""
        # The message does not depend on the topic, so build and serialize
        # it once instead of once per topic.
        payload = index_service_pb.IndexOperation(
            index_document=index_service_pb.IndexDocumentOperation(
                document=json.dumps(filter_none(MessageToDict(scimag_pb, preserving_proto_field_name=True))),
            ),
        ).SerializeToString()
        for topic_name in self.kafka['topic_names']:
            await self.producer.send_and_wait(topic_name, payload)

    async def sync_index(self, scimag_pb: ScimagPb):
        """Index the document through the Summa client and commit the index."""
        document = filter_none(MessageToDict(scimag_pb, preserving_proto_field_name=True))
        logging.getLogger('statbox').info({'action': 'sync_index', 'document': document})
        await self.summa_client.index_document(index_alias=self.summa_config['index_alias'], document=document)
        await self.summa_client.commit_index(index_alias=self.summa_config['index_alias'])

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Index the document when requested and return the same operation.

        Documents with a forbidden type are returned untouched. A document
        without ``issued_at`` gets a placeholder value before indexing.
        """
        update_document_pb = document_operation_pb.update_document
        scimag_pb = update_document_pb.typed_document.scimag
        if scimag_pb.type in self.forbidden_types:
            return document_operation_pb
        if not scimag_pb.HasField('issued_at'):
            # -62135596800 is the Unix timestamp of 0001-01-01T00:00:00 UTC.
            # NOTE(review): presumably used as an "unknown date" marker that
            # sorts before every real date - confirm with index consumers.
            scimag_pb.issued_at = -62135596800
        if update_document_pb.full_text_index:
            if update_document_pb.full_text_index_commit:
                await self.sync_index(scimag_pb=scimag_pb)
            else:
                await self.async_index(scimag_pb=scimag_pb)
        return document_operation_pb
|
|
|
|
|
|
class ReferencesToKafkaAction(BaseAction):
    """Publish one ``CrossReferenceOperation`` per document reference to Kafka."""

    def __init__(self, kafka):
        """Store the Kafka configuration.

        Args:
            kafka: mapping with ``bootstrap_servers`` and ``topic_names``.
        """
        super().__init__()
        self.kafka = kafka
        # Created in ``start`` because the producer needs the running loop.
        self.producer = None

    async def start(self):
        """Create and start the Kafka producer."""
        self.producer = self.get_producer()
        await self.producer.start()

    async def stop(self):
        """Stop the Kafka producer if one was started."""
        if self.producer:
            await self.producer.stop()
            self.producer = None

    def get_producer(self):
        """Build a Kafka producer bound to the currently running event loop."""
        return AIOKafkaProducer(
            loop=asyncio.get_running_loop(),
            bootstrap_servers=self.kafka['bootstrap_servers'],
        )

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Send a source -> target message for every reference of the document.

        The source is the document's ``doi`` and the target is the reference
        entry. Each message goes to every configured topic. The operation is
        returned unchanged.
        """
        update_document_pb = document_operation_pb.update_document
        scimag_pb = update_document_pb.typed_document.scimag
        topic_names = self.kafka['topic_names']
        for reference in scimag_pb.references:
            # Serialize once per reference; the bytes are identical for
            # every topic.
            payload = CrossReferenceOperationPb(
                source=scimag_pb.doi,
                target=reference,
            ).SerializeToString()
            for topic_name in topic_names:
                await self.producer.send_and_wait(topic_name, payload)

        return document_operation_pb
|
|
|
|
|
|
class FillFromExternalSourceAction(BaseAction):
    """Merge metadata fetched from Crossref into the document of an operation."""

    def __init__(self, crossref):
        """Create the Crossref client and register it for start-up.

        Args:
            crossref: mapping of client settings. ``max_retries`` defaults
                to 15 and ``retry_delay`` to 0.5; ``proxy_url``, ``timeout``,
                ``user_agent`` and ``ttl_dns_cache`` default to ``None``.
        """
        super().__init__()
        self.crossref_client = CrossrefClient(
            max_retries=crossref.get('max_retries', 15),
            proxy_url=crossref.get('proxy_url'),
            retry_delay=crossref.get('retry_delay', 0.5),
            timeout=crossref.get('timeout'),
            user_agent=crossref.get('user_agent'),
            ttl_dns_cache=crossref.get('ttl_dns_cache'),
        )
        # The DOI lookup goes through the very same client instance.
        self.doi_client = self.crossref_client
        self.crossref_api_to_scimag_pb_action = ToScimagPbAction()
        self.starts.append(self.crossref_client)

    async def try_resolve(self, doi, look_at_doi_org=False):
        """Fetch the Crossref work for *doi*, optionally retrying via a DOI lookup.

        Args:
            doi: DOI to look up.
            look_at_doi_org: when true and the first lookup fails, query
                ``doi_client`` for the DOI and retry with the DOI found
                under ``published-print`` in its response.

        Returns:
            The response of ``crossref_client.works``.

        Raises:
            InterruptProcessing: when the lookup (and the retry, if one was
                attempted) fails with ``WrongContentTypeError`` or
                ``NotFoundError``; the reason is the text of the last such
                error and the original error is attached as the cause.
        """
        try:
            return await self.crossref_client.works(doi=doi)
        except (WrongContentTypeError, NotFoundError) as primary_error:
            resolved_doi = None
            if look_at_doi_org:
                doi_org_response = await self.doi_client.get(doi=doi)
                if doi_org_response:
                    resolved_doi = canonize_doi(doi_org_response.get('published-print', {}).get('DOI'))
            if not resolved_doi:
                # Nothing to retry with: report the original failure.
                raise InterruptProcessing(document_id=doi, reason=str(primary_error)) from primary_error
            try:
                return await self.crossref_client.works(doi=resolved_doi)
            except (WrongContentTypeError, NotFoundError) as fallback_error:
                # Separate name so the first error is never shadowed.
                raise InterruptProcessing(document_id=doi, reason=str(fallback_error)) from fallback_error

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Merge Crossref data into the document when the request asks for it.

        Returns the operation untouched unless
        ``should_fill_from_external_source`` is set on the update request.
        """
        update_document_pb = document_operation_pb.update_document
        scimag_pb = update_document_pb.typed_document.scimag
        if not update_document_pb.should_fill_from_external_source:
            return document_operation_pb
        crossref_api_response = await self.try_resolve(doi=scimag_pb.doi)
        new_scimag_pb = await self.crossref_api_to_scimag_pb_action.do(crossref_api_response)
        scimag_pb.MergeFrom(new_scimag_pb)
        return document_operation_pb
|
|
|
|
|
|
class CleanAction(BaseAction):
    """Run the scimag clean and language-detection actions on a document."""

    def __init__(self):
        """Create both sub-actions; only the cleaner is registered for start-up."""
        super().__init__()
        self.cleaner = scimag_pb.CleanAction()
        self.language_detect = scimag_pb.DetectLanguageAction()
        self.starts.append(self.cleaner)

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Process the document and store the result back on the operation.

        When the update request names explicit fields, ``language`` and
        ``meta_language`` are added to that list if the processed document
        has a value for them.
        """
        update_request = document_operation_pb.update_document
        document = await self.cleaner.do(update_request.typed_document.scimag)
        document = await self.language_detect.do(document)

        # Language fields that have a value after processing.
        detected = [
            name
            for name in ('language', 'meta_language')
            if getattr(document, name)
        ]
        if update_request.fields and detected:
            # Rebuild the field list as a de-duplicated set that includes
            # the detected language fields.
            merged = set(update_request.fields)
            merged.update(detected)
            del update_request.fields[:]
            update_request.fields.extend(merged)

        update_request.typed_document.scimag.CopyFrom(document)
        return document_operation_pb
|