mirror of
https://github.com/nexus-stc/hyperboria
synced 2025-01-10 10:45:56 +01:00
No description
GitOrigin-RevId: ddf02e70d2827c048db49b687ebbcdcc67807ca6
This commit is contained in:
parent
66ecaf0d41
commit
8472f27ec5
37
BUILD.bazel
Normal file
37
BUILD.bazel
Normal file
@ -0,0 +1,37 @@
|
||||
load("@io_bazel_rules_k8s//k8s:objects.bzl", "k8s_objects")
|
||||
|
||||
# System-wide settings
|
||||
config_setting(
|
||||
name = "osx",
|
||||
constraint_values = ["@bazel_tools//platforms:osx"],
|
||||
)
|
||||
|
||||
config_setting(
|
||||
name = "linux",
|
||||
constraint_values = ["@bazel_tools//platforms:linux"],
|
||||
)
|
||||
|
||||
platform(
|
||||
name = "linux_x86",
|
||||
constraint_values = [
|
||||
"@io_bazel_rules_rust//rust/platform:linux",
|
||||
"@bazel_tools//platforms:linux",
|
||||
"@bazel_tools//platforms:x86_64",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
load("@io_bazel_rules_rust//proto:toolchain.bzl", "rust_proto_toolchain")
|
||||
|
||||
rust_proto_toolchain(
|
||||
name = "proto-toolchain-impl",
|
||||
grpc_plugin = "//rules/rust/cargo:cargo_bin_protoc_gen_rust_grpc",
|
||||
proto_plugin = "//rules/rust/cargo:cargo_bin_protoc_gen_rust",
|
||||
protoc = "@com_google_protobuf//:protoc",
|
||||
)
|
||||
|
||||
toolchain(
|
||||
name = "proto-toolchain",
|
||||
toolchain = ":proto-toolchain-impl",
|
||||
toolchain_type = "@io_bazel_rules_rust//proto:toolchain",
|
||||
)
|
1
LICENSE
1
LICENSE
@ -1,3 +1,4 @@
|
||||
|
||||
This is free and unencumbered software released into the public domain.
|
||||
|
||||
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
|
54
README.md
Normal file
54
README.md
Normal file
@ -0,0 +1,54 @@
|
||||
# Hyperboria
|
||||
|
||||
## Introduction
|
||||
|
||||
The Hyperboria repository is a pack of tools for dealing with SciMag and SciTech collections.
|
||||
|
||||
It consists of a configurable [`search engine`](nexus/summa) and a [`pipeline`](nexus/pipe) for [`ingesting`](nexus/ingest) data
|
||||
from upstream sources. So-called [`actions`](nexus/actions) are aimed at converting data from external APIs
|
||||
into the [`internal Protobuf format`](nexus/models) and landing the converted data in databases and/or search engines.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
### Ubuntu 20.04
|
||||
|
||||
#### Docker
|
||||
[Installation Guide](https://docs.docker.com/engine/install/ubuntu/)
|
||||
|
||||
#### System Compilers
|
||||
```shell script
|
||||
sudo apt-get install -y --no-install-recommends g++ python3.9 protobuf-compiler libprotobuf-dev
|
||||
```
|
||||
|
||||
#### Bazel Build System
|
||||
[Installation Guide](https://docs.bazel.build/versions/master/install-ubuntu.html) or _one-liner_:
|
||||
```shell script
|
||||
sudo apt install curl gnupg
|
||||
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor > bazel.gpg
|
||||
sudo mv bazel.gpg /etc/apt/trusted.gpg.d/
|
||||
echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
|
||||
sudo apt update && sudo apt install bazel
|
||||
```
|
||||
|
||||
### MacOS
|
||||
|
||||
#### Docker
|
||||
[Installation Guide](https://docs.docker.com/docker-for-mac/install/)
|
||||
|
||||
#### System Compilers
|
||||
```shell script
|
||||
brew install llvm protobuf python@3.9
|
||||
```
|
||||
|
||||
#### Bazel Build System
|
||||
[Installation Guide](https://docs.bazel.build/versions/master/install-os-x.html) or _one-liner_:
|
||||
```shell script
|
||||
brew install bazel
|
||||
```
|
||||
|
||||
## Content
|
||||
|
||||
- [`images`](images) - base docker images for [`nexus`](nexus)
|
||||
- [`library`](library) - shared libraries
|
||||
- [`nexus`](nexus) - processing and searching in scientific text collections
|
||||
- [`rules`](rules) - build rules
|
285
WORKSPACE
Normal file
285
WORKSPACE
Normal file
@ -0,0 +1,285 @@
|
||||
workspace(
|
||||
name = "hyperboria",
|
||||
managed_directories = {"@npm": ["rules/nodejs/node_modules"]},
|
||||
)
|
||||
|
||||
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
|
||||
|
||||
http_archive(
|
||||
name = "bazel_skylib",
|
||||
sha256 = "11b0e65ec07113b2ea81be554c7471bb80fc5766aba6239c91d071602c46d50f",
|
||||
strip_prefix = "bazel-skylib-dc080e95161964a1ff841bfd0b871a1123c027a8",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/bazel-skylib/archive/dc080e95161964a1ff841bfd0b871a1123c027a8.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "build_bazel_rules_nodejs",
|
||||
sha256 = "6a67a8a1bf6fddc9113f73471029b819eef4575c3a936a4a01d57e411894d692",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/rules_nodejs/releases/download/2.0.2/rules_nodejs-2.0.2.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "com_github_grpc_grpc",
|
||||
sha256 = "f046d4cb4d60d4f2a2087e9d46c7ec0c523cd54ebf68eda6272de4ce65e20ac7",
|
||||
strip_prefix = "grpc-ae7f520358d7145a7484db693376fdebbd72662d",
|
||||
urls = [
|
||||
"https://github.com/grpc/grpc/archive/ae7f520358d7145a7484db693376fdebbd72662d.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "com_google_protobuf",
|
||||
sha256 = "7d663c8dc81d282dc92e884b38e9c179671e31ccacce311154420e65f7d142c6",
|
||||
strip_prefix = "protobuf-3.13.0.1",
|
||||
urls = [
|
||||
"https://github.com/protocolbuffers/protobuf/archive/v3.13.0.1.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "io_bazel_rules_docker",
|
||||
sha256 = "ba415feb61f7dd08051c7096df9feeb2109bc918878ef924ad9262fe0fcdf6f9",
|
||||
strip_prefix = "rules_docker-9bfcd7dbf0294ed9d11a99da6363fc28df904502",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/rules_docker/archive/9bfcd7dbf0294ed9d11a99da6363fc28df904502.zip",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "io_bazel_rules_k8s",
|
||||
sha256 = "95addfd2b7b07b5a4e75663d15aa57dc271f7b831ec404109322288e1b6bf126",
|
||||
strip_prefix = "rules_k8s-9f9886c7252d66bb2e2206842b149a6ceebe6fe5",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/rules_k8s/archive/9f9886c7252d66bb2e2206842b149a6ceebe6fe5.zip",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "io_bazel_rules_rust",
|
||||
sha256 = "50a772198877e21a61823fa292d28539f8bc99d72463e55b5b09942394ec370e",
|
||||
strip_prefix = "rules_rust-9a8ef691b8e8f682d767189c38339cbee16d0a16",
|
||||
urls = [
|
||||
# Master branch as of 2020-10-16
|
||||
"https://github.com/bazelbuild/rules_rust/archive/9a8ef691b8e8f682d767189c38339cbee16d0a16.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "rules_jvm_external",
|
||||
sha256 = "d85951a92c0908c80bd8551002d66cb23c3434409c814179c0ff026b53544dab",
|
||||
strip_prefix = "rules_jvm_external-3.3",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/rules_jvm_external/archive/3.3.zip",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "rules_pkg",
|
||||
sha256 = "0a33148c4957e666a29443f75b2c0db1fe3e0baf7256742fc47a35731f7a1d2e",
|
||||
strip_prefix = "rules_pkg-4b0b9f4679484f107f750a60190ff5ec6b164a5f/pkg",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/rules_pkg/archive/4b0b9f4679484f107f750a60190ff5ec6b164a5f.zip",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "rules_proto",
|
||||
sha256 = "aa1ee19226f707d44bee44c720915199c20c84a23318bb0597ed4e5c873ccbd5",
|
||||
strip_prefix = "rules_proto-40298556293ae502c66579620a7ce867d5f57311",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/rules_proto/archive/40298556293ae502c66579620a7ce867d5f57311.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "rules_python",
|
||||
sha256 = "ae3c1380c3c19d47fb474f201862dde7c14601130be2befa73bb02211267e960",
|
||||
strip_prefix = "rules_python-e3df8bcf0f675d20aaf752c8ba32a0259dd79996",
|
||||
urls = [
|
||||
"https://github.com/bazelbuild/rules_python/archive/e3df8bcf0f675d20aaf752c8ba32a0259dd79996.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "rules_python_external",
|
||||
sha256 = "30987e33c0b00ef75d11dec756db6a5d57ccd4085525f8888d5237ef798f8d16",
|
||||
strip_prefix = "rules_python_external-2c78da5b5beb78c4a96b8b4d84e9c34de8178efb",
|
||||
urls = [
|
||||
"https://github.com/dillon-giacoppo/rules_python_external/archive/2c78da5b5beb78c4a96b8b4d84e9c34de8178efb.zip",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "subpar",
|
||||
sha256 = "e6e4332bf9af36c4165ad6cc7b2c76288e9f156eba35dc95b739e58c46f30a50",
|
||||
strip_prefix = "subpar-9fae6b63cfeace2e0fb93c9c1ebdc28d3991b16f",
|
||||
urls = [
|
||||
"https://github.com/google/subpar/archive/9fae6b63cfeace2e0fb93c9c1ebdc28d3991b16f.zip",
|
||||
],
|
||||
)
|
||||
|
||||
http_archive(
|
||||
name = "cython",
|
||||
build_file = "@com_github_grpc_grpc//third_party:cython.BUILD",
|
||||
sha256 = "e2e38e1f0572ca54d6085df3dec8b607d20e81515fb80215aed19c81e8fe2079",
|
||||
strip_prefix = "cython-0.29.21",
|
||||
urls = [
|
||||
"https://github.com/cython/cython/archive/0.29.21.tar.gz",
|
||||
],
|
||||
)
|
||||
|
||||
# Java
|
||||
|
||||
load("//rules/java:artifacts.bzl", "maven_fetch_remote_artifacts")
|
||||
|
||||
maven_fetch_remote_artifacts()
|
||||
|
||||
# Rust
|
||||
|
||||
load("@io_bazel_rules_rust//rust:repositories.bzl", "rust_repository_set")
|
||||
|
||||
rust_version = "1.48.0"
|
||||
|
||||
rustfmt_version = "1.4.20"
|
||||
|
||||
rust_repository_set(
|
||||
name = "rust_linux_x86_64",
|
||||
edition = "2018",
|
||||
exec_triple = "x86_64-unknown-linux-gnu",
|
||||
extra_target_triples = ["wasm32-unknown-unknown"],
|
||||
rustfmt_version = rustfmt_version,
|
||||
version = rust_version,
|
||||
)
|
||||
|
||||
rust_repository_set(
|
||||
name = "rust_darwin_x86_64",
|
||||
edition = "2018",
|
||||
exec_triple = "x86_64-apple-darwin",
|
||||
extra_target_triples = ["wasm32-unknown-unknown"],
|
||||
rustfmt_version = rustfmt_version,
|
||||
version = rust_version,
|
||||
)
|
||||
|
||||
load("@io_bazel_rules_rust//:workspace.bzl", "bazel_version")
|
||||
|
||||
bazel_version(name = "bazel_version")
|
||||
|
||||
load("//rules/rust:crates.bzl", "raze_fetch_remote_crates")
|
||||
|
||||
raze_fetch_remote_crates()
|
||||
|
||||
register_toolchains("//:proto-toolchain")
|
||||
|
||||
# NodeJS
|
||||
load("@build_bazel_rules_nodejs//:index.bzl", "yarn_install")
|
||||
|
||||
yarn_install(
|
||||
name = "npm",
|
||||
package_json = "//rules/nodejs:package.json",
|
||||
symlink_node_modules = True,
|
||||
use_global_yarn_cache = True,
|
||||
yarn_lock = "//rules/nodejs:yarn.lock",
|
||||
)
|
||||
|
||||
# Packaging
|
||||
|
||||
load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
|
||||
|
||||
rules_pkg_dependencies()
|
||||
|
||||
# Docker Setup
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_docker//toolchains/docker:toolchain.bzl",
|
||||
docker_toolchain_configure = "toolchain_configure",
|
||||
)
|
||||
|
||||
docker_toolchain_configure(
|
||||
name = "docker_config",
|
||||
client_config = "/docker",
|
||||
)
|
||||
|
||||
load("@io_bazel_rules_docker//repositories:repositories.bzl", container_repositories = "repositories")
|
||||
|
||||
container_repositories()
|
||||
|
||||
load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
|
||||
|
||||
container_deps()
|
||||
|
||||
load("@io_bazel_rules_docker//repositories:pip_repositories.bzl", "pip_deps")
|
||||
|
||||
pip_deps()
|
||||
|
||||
load("@io_bazel_rules_docker//java:image.bzl", java_image_repos = "repositories")
|
||||
load("@io_bazel_rules_docker//python3:image.bzl", py3_image_repos = "repositories")
|
||||
load("@io_bazel_rules_docker//nodejs:image.bzl", nodejs_image_repos = "repositories")
|
||||
load("@io_bazel_rules_docker//rust:image.bzl", rust_image_repos = "repositories")
|
||||
|
||||
java_image_repos()
|
||||
|
||||
nodejs_image_repos()
|
||||
|
||||
py3_image_repos()
|
||||
|
||||
rust_image_repos()
|
||||
|
||||
# Python
|
||||
register_toolchains("//rules/python:py_toolchain")
|
||||
|
||||
load("@rules_python_external//:defs.bzl", "pip_install")
|
||||
|
||||
pip_install(
|
||||
name = "pip_modules_external",
|
||||
requirements = "//rules/python:requirements.txt",
|
||||
)
|
||||
|
||||
load("@rules_python_external//:repositories.bzl", "rules_python_external_dependencies")
|
||||
|
||||
rules_python_external_dependencies()
|
||||
|
||||
# K8s
|
||||
|
||||
load("@io_bazel_rules_k8s//k8s:k8s.bzl", "k8s_repositories")
|
||||
|
||||
k8s_repositories()
|
||||
|
||||
load("@io_bazel_rules_k8s//k8s:k8s_go_deps.bzl", k8s_go_deps = "deps")
|
||||
|
||||
k8s_go_deps()
|
||||
|
||||
# Miscellaneous
|
||||
|
||||
load("//rules/misc:setup.bzl", "rules_misc_setup_internal")
|
||||
|
||||
rules_misc_setup_internal()
|
||||
|
||||
load("//rules/misc:install.bzl", "rules_misc_install_internal")
|
||||
|
||||
rules_misc_install_internal()
|
||||
|
||||
# Images Install
|
||||
|
||||
load("//images:install.bzl", "images_install")
|
||||
|
||||
images_install()
|
||||
|
||||
# Proto / gRPC
|
||||
|
||||
load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
|
||||
|
||||
rules_proto_dependencies()
|
||||
|
||||
rules_proto_toolchains()
|
||||
|
||||
load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
|
||||
|
||||
grpc_deps()
|
||||
|
||||
load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
|
||||
|
||||
grpc_extra_deps()
|
8
images/BUILD.bazel
Normal file
8
images/BUILD.bazel
Normal file
@ -0,0 +1,8 @@
|
||||
load("@io_bazel_rules_docker//container:container.bzl", "container_image")
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
container_image(
|
||||
name = "base-image",
|
||||
base = "@ubuntu//image",
|
||||
)
|
18
images/install.bzl
Normal file
18
images/install.bzl
Normal file
@ -0,0 +1,18 @@
|
||||
"""
|
||||
Install various images
|
||||
"""
|
||||
|
||||
load("@io_bazel_rules_docker//container:pull.bzl", "container_pull")
|
||||
|
||||
def images_install():
    """
    Declare the external Docker images used by the repository.

    Currently this is a single `container_pull` named `ubuntu` for
    `library/ubuntu` from `index.docker.io`, given both a tag and a digest.
    """
    container_pull(
        name = "ubuntu",
        digest = "sha256:4e4bc990609ed865e07afc8427c30ffdddca5153fd4e82c20d8f0783a291e241",
        registry = "index.docker.io",
        repository = "library/ubuntu",
        tag = "20.04",
    )
|
39
images/production/BUILD.bazel
Normal file
39
images/production/BUILD.bazel
Normal file
@ -0,0 +1,39 @@
|
||||
load("@io_bazel_rules_docker//container:container.bzl", "container_image", "container_push")
|
||||
load("@io_bazel_rules_docker//docker/package_managers:download_pkgs.bzl", "download_pkgs")
|
||||
load("@io_bazel_rules_docker//docker/package_managers:install_pkgs.bzl", "install_pkgs")
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
download_pkgs(
|
||||
name = "download-base-python-image",
|
||||
image_tar = "//images:base-image.tar",
|
||||
packages = [
|
||||
"bash",
|
||||
"libev4",
|
||||
"libgomp1",
|
||||
"libprotobuf17",
|
||||
"libssl1.1",
|
||||
"python3.9",
|
||||
],
|
||||
)
|
||||
|
||||
install_pkgs(
|
||||
name = "install-base-python-image",
|
||||
image_tar = "//images:base-image.tar",
|
||||
installables_tar = ":download-base-python-image.tar",
|
||||
installation_cleanup_commands = "rm -rf /var/lib/apt/lists/*",
|
||||
output_image_name = "installed-base-python-image",
|
||||
)
|
||||
|
||||
container_image(
|
||||
name = "base-python-image",
|
||||
base = ":install-base-python-image",
|
||||
entrypoint = ["/usr/bin/python3.9"],
|
||||
env = {"LANG": "C.UTF-8"},
|
||||
symlinks = {
|
||||
"/usr/bin/python": "/usr/bin/python3.9",
|
||||
"/usr/bin/python3": "/usr/bin/python3.9",
|
||||
},
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
0
library/__init__.py
Normal file
0
library/__init__.py
Normal file
19
library/aiogrpctools/BUILD.bazel
Normal file
19
library/aiogrpctools/BUILD.bazel
Normal file
@ -0,0 +1,19 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
py_library(
|
||||
name = "aiogrpctools",
|
||||
srcs = glob(
|
||||
["**/*.py"],
|
||||
exclude = ["tests/**"],
|
||||
),
|
||||
srcs_version = "PY3ONLY",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("grpcio"),
|
||||
requirement("aiokit"),
|
||||
"//library/configurator",
|
||||
"//library/logging",
|
||||
requirement("izihawa_utils"),
|
||||
],
|
||||
)
|
7
library/aiogrpctools/__init__.py
Normal file
7
library/aiogrpctools/__init__.py
Normal file
@ -0,0 +1,7 @@
|
||||
from .base import (
|
||||
AioGrpcServer,
|
||||
aiogrpc_request_wrapper,
|
||||
aiogrpc_streaming_request_wrapper,
|
||||
)
|
||||
|
||||
__all__ = ['AioGrpcServer', 'aiogrpc_streaming_request_wrapper', 'aiogrpc_request_wrapper']
|
100
library/aiogrpctools/base.py
Normal file
100
library/aiogrpctools/base.py
Normal file
@ -0,0 +1,100 @@
|
||||
import logging
|
||||
from functools import wraps
|
||||
|
||||
from aiokit import (
|
||||
AioRootThing,
|
||||
AioThing,
|
||||
)
|
||||
from google.protobuf.json_format import MessageToDict
|
||||
from grpc import aio
|
||||
from izihawa_utils.text import camel_to_snake
|
||||
from library.logging import error_log
|
||||
|
||||
|
||||
class AioGrpcServer(AioRootThing):
    """
    Root component that owns a `grpc.aio` server listening on one insecure port.
    """

    def __init__(self, address, port):
        """
        Create the server and register `address:port` as an insecure port.

        :param address: host or interface part of the listening endpoint
        :param port: port part of the listening endpoint
        """
        super().__init__()
        self.server = aio.server()
        endpoint = f'{address}:{port}'
        self.server.add_insecure_port(endpoint)

    async def start(self):
        """
        Start the server and then wait until it terminates.
        """
        grpc_server = self.server
        await grpc_server.start()
        await grpc_server.wait_for_termination()

    async def stop(self):
        """
        Stop the server, passing `None` as the grace argument.
        """
        await self.server.stop(None)
|
||||
|
||||
|
||||
class BaseService(AioThing):
    """
    Base class for gRPC services that emit structured `statbox` log records.
    """

    # Exception class -> arguments for `context.abort(...)`; consulted by the
    # request wrappers in this module. Subclasses are expected to override it.
    error_mapping = {}

    def __init__(self, service_name):
        """
        :param service_name: name reported in every statbox record
        """
        super().__init__()
        self.service_name = service_name
        # snake_case form of the concrete class name, reported as `view`
        self.class_name = camel_to_snake(self.__class__.__name__)

    def get_default_service_fields(self):
        """
        Return the fields that every statbox record of this service carries.
        """
        return {
            'service_name': self.service_name,
            'view': self.class_name,
        }

    def statbox(self, **kwargs):
        """
        Log the default service fields merged with `kwargs` to the `statbox` logger.

        Keys given in `kwargs` override the default fields.
        """
        statbox_logger = logging.getLogger('statbox')
        record = self.get_default_service_fields() | kwargs
        statbox_logger.info(record)
|
||||
|
||||
|
||||
def aiogrpc_request_wrapper(log=True):
    """
    Build a decorator for unary gRPC handler methods of a service.

    The decorated handler is called as `func(self, request, context, metadata)`,
    where `metadata` is the invocation metadata of the call converted to a dict.
    Unexpected exceptions are reported through `error_log`; if the exception class
    is a key of `self.error_mapping`, the mapped value is unpacked into
    `context.abort(...)`, otherwise the exception is re-raised.

    :param log: when true, write `enter`/`exit` statbox records around the call
    :return: decorator producing an `async def wrapped(self, request, context)`
    """
    def _aiogrpc_request_wrapper(func):
        @wraps(func)
        async def wrapped(self, request, context):
            metadata = dict(context.invocation_metadata())
            # `.get` rather than indexing: a client that omits the `request-id`
            # header must not turn the call (and the error handler below, which
            # would raise a second KeyError) into a failure.
            request_id = metadata.get('request-id')
            try:
                if log:
                    self.statbox(
                        action='enter',
                        mode=func.__name__,
                        request_id=request_id,
                    )
                r = await func(self, request, context, metadata)
                if log:
                    self.statbox(
                        action='exit',
                        mode=func.__name__,
                        request_id=request_id,
                    )
                return r
            except aio.AbortError:
                # The handler already aborted the call itself; pass it through.
                raise
            except Exception as e:
                serialized_request = MessageToDict(request, preserving_proto_field_name=True)
                error_log(e, request=serialized_request, request_id=request_id)
                # Exact class match only: subclasses of a mapped error are not translated.
                if e.__class__ in self.error_mapping:
                    await context.abort(*self.error_mapping[e.__class__])
                raise
        return wrapped
    return _aiogrpc_request_wrapper
|
||||
|
||||
|
||||
def aiogrpc_streaming_request_wrapper(func):
    """
    Decorate a server-streaming gRPC handler method of a service.

    The decorated handler is iterated as `func(self, request, context, metadata)`,
    where `metadata` is the invocation metadata of the call converted to a dict,
    and every item it yields is re-yielded. `enter`/`exit` statbox records are
    written around the stream. Unexpected exceptions are reported through
    `error_log`; if the exception class is a key of `self.error_mapping`, the
    mapped value is unpacked into `context.abort(...)`, otherwise it is re-raised.

    :param func: async generator method to wrap
    :return: async generator `wrapped(self, request, context)`
    """
    @wraps(func)
    async def wrapped(self, request, context):
        metadata = dict(context.invocation_metadata())
        # `.get` rather than indexing: a missing `request-id` header must not
        # break the stream or raise a second KeyError inside the error handler.
        request_id = metadata.get('request-id')
        try:
            self.statbox(
                action='enter',
                mode=func.__name__,
                request_id=request_id,
            )
            async for item in func(self, request, context, metadata):
                yield item
            self.statbox(
                action='exit',
                mode=func.__name__,
                request_id=request_id,
            )
        except aio.AbortError:
            # The handler already aborted the call itself; pass it through.
            raise
        except Exception as e:
            serialized_request = MessageToDict(request, preserving_proto_field_name=True)
            error_log(e, request=serialized_request, request_id=request_id)
            # Exact class match only: subclasses of a mapped error are not translated.
            if e.__class__ in self.error_mapping:
                await context.abort(*self.error_mapping[e.__class__])
            raise
    return wrapped
|
17
library/aiopostgres/BUILD.bazel
Normal file
17
library/aiopostgres/BUILD.bazel
Normal file
@ -0,0 +1,17 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
py_library(
|
||||
name = "aiopostgres",
|
||||
srcs = glob(
|
||||
["**/*.py"],
|
||||
exclude = ["tests/**"],
|
||||
),
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("aiopg"),
|
||||
requirement("tenacity"),
|
||||
requirement("aiokit"),
|
||||
],
|
||||
)
|
3
library/aiopostgres/__init__.py
Normal file
3
library/aiopostgres/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from .pool_holder import AioPostgresPoolHolder
|
||||
|
||||
__all__ = ['AioPostgresPoolHolder']
|
41
library/aiopostgres/pool_holder.py
Normal file
41
library/aiopostgres/pool_holder.py
Normal file
@ -0,0 +1,41 @@
|
||||
import psycopg2.extras
|
||||
from aiokit import AioThing
|
||||
from psycopg2 import OperationalError
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
stop_after_attempt,
|
||||
wait_fixed,
|
||||
)
|
||||
|
||||
|
||||
class AioPostgresPoolHolder(AioThing):
    """
    Holds a connection pool that is created on `start` by awaiting
    `fn(*args, **kwargs)` and closed on `stop`.
    """

    def __init__(self, fn, *args, **kwargs):
        """
        :param fn: coroutine function that returns the pool when awaited
        :param args: positional arguments forwarded to `fn`
        :param kwargs: keyword arguments forwarded to `fn`
        """
        super().__init__()
        self.fn = fn
        self.args = args
        self.kwargs = kwargs
        self.pool = None

    @retry(
        retry=retry_if_exception_type(OperationalError),
        stop=stop_after_attempt(3),
        wait=wait_fixed(1.0),
    )
    async def start(self):
        """
        Create the pool unless one already exists.

        Retried on `OperationalError`: up to 3 attempts with a fixed 1.0 wait.
        """
        if self.pool:
            return
        self.pool = await self.fn(*self.args, **self.kwargs)

    async def stop(self):
        """
        Close the pool, wait until it is closed and forget it.
        """
        if not self.pool:
            return
        self.pool.close()
        await self.pool.wait_closed()
        self.pool = None

    async def execute(self, stmt, values=None, fetch=False, timeout=None, cursor_factory=psycopg2.extras.DictCursor):
        """
        Run one statement on a connection acquired from the pool.

        :param stmt: statement passed to `cursor.execute`
        :param values: parameters passed to `cursor.execute`
        :param fetch: return `fetchall()` instead of the row count
        :param timeout: forwarded to `cursor.execute`
        :param cursor_factory: cursor class used for the cursor
        :return: all fetched rows when `fetch` is true, otherwise `cursor.rowcount`
        """
        async with self.pool.acquire() as connection, connection.cursor(cursor_factory=cursor_factory) as cursor:
            await cursor.execute(stmt, values, timeout=timeout)
            if not fetch:
                return cursor.rowcount
            return await cursor.fetchall()
|
18
library/configurator/BUILD.bazel
Normal file
18
library/configurator/BUILD.bazel
Normal file
@ -0,0 +1,18 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
py_library(
|
||||
name = "configurator",
|
||||
srcs = glob(
|
||||
["**/*.py"],
|
||||
exclude = ["tests/**"],
|
||||
),
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("jinja2"),
|
||||
requirement("orjson"),
|
||||
requirement("pyyaml"),
|
||||
requirement("izihawa_utils"),
|
||||
],
|
||||
)
|
148
library/configurator/__init__.py
Normal file
148
library/configurator/__init__.py
Normal file
@ -0,0 +1,148 @@
|
||||
import os
|
||||
import os.path
|
||||
from types import ModuleType
|
||||
|
||||
import orjson as json
|
||||
import yaml
|
||||
from izihawa_utils.common import smart_merge_dicts
|
||||
from jinja2 import Template
|
||||
from library.configurator.exceptions import UnknownConfigFormatError
|
||||
|
||||
|
||||
class ConfigObject(dict):
    """
    Dictionary whose keys can also be read as attributes.
    """

    def __getattr__(self, name):
        """
        Return `self[name]`; a missing key is reported as `AttributeError`.
        """
        try:
            value = self[name]
        except KeyError as e:
            raise AttributeError(e)
        return value
|
||||
|
||||
|
||||
class AnyOf:
    """
    Group of alternative configs; `Configurator.update` uses the first one that loads.
    """

    def __init__(self, *args):
        """
        :param args: the alternatives, in the order they should be tried
        """
        self.args = args
|
||||
|
||||
|
||||
class RichDict(dict):
    """
    Dictionary with helpers for probing and copying nested values.
    """

    def _resolve(self, keys):
        """
        Descend from this dictionary through `keys`, one level per key.

        :param keys: iterable of keys forming the path
        :return: `(True, value)` when every key was found, `(False, None)` otherwise
        """
        current = self
        for key in keys:
            try:
                if key not in current:
                    return False, None
                current = current[key]
            except (TypeError, LookupError):
                # The intermediate value is a scalar, a string or a sequence that
                # cannot be descended into with `key`: the path does not exist.
                return False, None
        return True, current

    def has(self, *args):
        """
        Tell whether the nested path `args[0] -> args[1] -> ...` exists.

        :return: `True` if every key is present along the path, else `False`
        """
        found, _ = self._resolve(args)
        return found

    def copy_if_exists(self, source_keys, target_key):
        """
        Copy the value at the nested path `source_keys` to top-level `target_key`.

        :param source_keys: iterable of keys forming the source path
        :param target_key: key of this dictionary that receives the value
        :return: `True` if the value was found and copied, else `False`
        """
        found, value = self._resolve(source_keys)
        if found:
            self[target_key] = value
        return found
|
||||
|
||||
|
||||
class Configurator(RichDict):
    """
    Configuration dictionary assembled from several sources.

    Every source is merged into this dictionary with `smart_merge_dicts`
    (list policy `override`), in the order the sources are given.
    """

    def __init__(self, configs: list):
        """
        Create Configurator object

        The process environment is merged first, then `configs` in order, then
        the YAML document stored in the `CONFIGURATOR` environment variable (if set).

        :param configs: list of paths to config files, dicts or modules.
            End filepath with `?` to mark it as optional config.
        """
        super().__init__()

        self._by_basenames = {}
        self._omitted_files = []

        env_config = {}
        env_config_var = os.environ.get('CONFIGURATOR', '')
        if env_config_var:
            env_config = yaml.safe_load(env_config_var)

        # Unpacking instead of list concatenation accepts any iterable of configs.
        for config in [os.environ, *configs, env_config]:
            file_found = self.update(config)
            if not file_found:
                self._omitted_files.append(config)

    def _config_filename(self, filename):
        """
        Resolve `filename` against the current working directory.
        """
        return os.path.join(os.getcwd(), filename)

    def walk_and_render(self, c):
        """
        Render every string inside `c` as a Jinja2 template.

        The current content of this configurator is the template context.
        Lists are rebuilt, dicts are updated in place, other values are returned as is.

        :param c: string, list, dict or any other value
        :return: the rendered value
        """
        if isinstance(c, str):
            return Template(c).render(**self)
        elif isinstance(c, list):
            return [self.walk_and_render(e) for e in c]
        elif isinstance(c, dict):
            for key in c:
                c[key] = self.walk_and_render(c[key])
        return c

    def update(self, new_config, basename=None, **kwargs):
        """
        Merge one more config source into this configurator.

        :param new_config: path to a `.json`/`.yaml`/`.yml` file (a trailing `?`
            marks it optional), an `AnyOf` group, a module, a callable taking this
            configurator, or a mapping
        :param basename: name to register the loaded config under; defaults to
            the file's basename for path sources
        :return: `False` if an optional file is missing, `True` otherwise
        :raises IOError: a required file, or every alternative of an `AnyOf`, is missing
        :raises UnknownConfigFormatError: the file extension is not supported
        """
        if isinstance(new_config, AnyOf):
            for config in new_config.args:
                # Inside AnyOf each file is tried as required, so the optional
                # marker is dropped; non-string alternatives are passed unchanged.
                candidate = config.rstrip('?') if isinstance(config, str) else config
                try:
                    return self.update(candidate)
                except IOError:
                    pass
            raise IOError('None of %s was found' % ', '.join(map(str, new_config.args)))
        elif isinstance(new_config, str):
            optional = new_config.endswith('?')
            filename = new_config.rstrip('?')
            basename = basename or os.path.basename(filename)

            config_filename = self._config_filename(filename)

            if not (os.path.exists(config_filename) and os.access(config_filename, os.R_OK)):
                if optional:
                    return False
                raise IOError(f'File {config_filename} not found')

            # JSON and YAML documents are UTF-8; do not depend on the locale encoding.
            with open(config_filename, encoding='utf-8') as f:
                data = f.read()

            if filename.endswith('.json'):
                new_config = json.loads(data)
            elif filename.endswith(('.yaml', '.yml')):
                new_config = yaml.safe_load(data)
            else:
                raise UnknownConfigFormatError(filename)

            new_config = self.walk_and_render(new_config)

        elif isinstance(new_config, ModuleType):
            # Work on a copy: the loop below replaces callables with their
            # results and must not rewrite the attributes of the module itself.
            new_config = dict(vars(new_config))

        elif callable(new_config):
            new_config = new_config(self)

        if not new_config:
            new_config = {}

        # Callable values are resolved against the configuration merged so far.
        for k in new_config:
            if callable(new_config[k]):
                new_config[k] = new_config[k](context=self)

        if 'log_path' in new_config:
            new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/')

        smart_merge_dicts(self, new_config, list_policy='override', copy=False)
        if basename:
            self._by_basenames[basename] = new_config

        return True

    def get_config_by_basename(self, basename):
        """
        Return the config registered under `basename`.

        :raises KeyError: nothing was registered under `basename`
        """
        return self._by_basenames[basename]

    def get_object_by_basename(self, basename):
        """
        Return the config registered under `basename` wrapped in `ConfigObject`.

        :raises KeyError: nothing was registered under `basename`
        """
        return ConfigObject(self._by_basenames[basename])

    def has_missed_configs(self):
        """
        Tell whether any source passed to the constructor was an absent optional file.
        """
        return bool(self._omitted_files)

    def has_file(self, basename):
        """
        Tell whether a config is registered under `basename`.
        """
        return basename in self._by_basenames
|
2
library/configurator/exceptions.py
Normal file
2
library/configurator/exceptions.py
Normal file
@ -0,0 +1,2 @@
|
||||
class UnknownConfigFormatError(Exception):
    """
    Raised for a config file whose extension matches no supported format.
    """
|
15
library/logging/BUILD.bazel
Normal file
15
library/logging/BUILD.bazel
Normal file
@ -0,0 +1,15 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
py_library(
|
||||
name = "logging",
|
||||
srcs = glob(["**/*.py"]),
|
||||
srcs_version = "PY3ONLY",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("orjson"),
|
||||
requirement("prometheus_client"),
|
||||
requirement("izihawa_types"),
|
||||
requirement("izihawa_utils"),
|
||||
],
|
||||
)
|
44
library/logging/__init__.py
Normal file
44
library/logging/__init__.py
Normal file
@ -0,0 +1,44 @@
|
||||
import logging
|
||||
import logging.config
|
||||
import sys
|
||||
|
||||
from izihawa_utils.exceptions import BaseError
|
||||
from izihawa_utils.file import mkdir_p
|
||||
from library.logging.formatters import (
|
||||
DefaultFormatter,
|
||||
DefaultHttpFormatter,
|
||||
)
|
||||
from library.logging.handlers import QueueHandler
|
||||
from prometheus_client import Counter
|
||||
|
||||
error_counter = Counter('errors_total', 'counter for error.log')
|
||||
|
||||
|
||||
def configure_logging(config, make_path=True):
    """
    Configure the `logging` module from an application config.

    With `application.debug` set, or without a `logging` section, everything
    from DEBUG up is sent to stdout. Otherwise the `logging` section is passed
    to `logging.config.dictConfig`.

    :param config: mapping with optional `application`, `logging` and `log_path` keys
    :param make_path: create the `log_path` directory before applying the `logging` section
    """
    debug_enabled = config.get('application', {}).get('debug', False)
    if debug_enabled or 'logging' not in config:
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
        return
    if make_path:
        mkdir_p(config['log_path'])
    logging.config.dictConfig(config['logging'])
|
||||
|
||||
|
||||
def error_log(e, level=logging.ERROR, **fields):
    """
    Write an error to the `error` logger and count it.

    :param e: the error; its `level` attribute, when present, overrides `level`
    :param level: logging level used when `e` has no `level` attribute
    :param fields: extra fields attached to the logged record
    """
    severity = getattr(e, 'level', level)
    if severity == logging.ERROR:
        error_counter.inc()

    # Without extra fields a plain exception is logged as the object itself.
    payload = e
    if isinstance(e, BaseError):
        payload = e.as_internal_dict()
        payload.update(fields)
    elif fields:
        payload = {'error': str(e), **fields}

    error_logger = logging.getLogger('error')
    error_logger.log(level=severity, msg=payload)
|
||||
|
||||
|
||||
__all__ = [
|
||||
'DefaultFormatter', 'DefaultHttpFormatter',
|
||||
'QueueHandler', 'configure_logging', 'error_log',
|
||||
]
|
94
library/logging/formatters.py
Normal file
94
library/logging/formatters.py
Normal file
@ -0,0 +1,94 @@
|
||||
import dataclasses
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import pprint
|
||||
import time
|
||||
import traceback
|
||||
import typing
|
||||
|
||||
import orjson as json
|
||||
from izihawa_utils.exceptions import BaseError
|
||||
|
||||
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
|
||||
|
||||
|
||||
class BaseFormatter(logging.Formatter):
    """
    Formatter that serializes every log record as one JSON object.
    """

    def _prepare(self, record):
        """
        Turn the message of `record` into a dictionary owned by the formatter.

        :param record: `logging.LogRecord` being formatted
        :return: dict that callers may freely extend with more fields
        """
        message = record.msg
        if isinstance(message, BaseError):
            return message.as_internal_dict()
        if isinstance(message, dict):
            # Shallow copy: subclasses add timestamp fields with `.update(...)`
            # and must not leak them into the dictionary the caller logged.
            return dict(message)
        if dataclasses.is_dataclass(message) and not isinstance(message, type):
            # Converted to a dict so that subclasses can `.update(...)` it too.
            return dataclasses.asdict(message)
        return dict(message=super().format(record))

    def format(self, record):
        """
        Return the prepared record serialized to a JSON string.
        """
        log_record = self._prepare(record)
        return json.dumps(log_record).decode()
|
||||
|
||||
|
||||
class DefaultHttpFormatter(BaseFormatter):
    """
    JSON formatter adding time, process id and optional HTTP request fields.
    """

    def _prepare(self, record):
        """
        Extend the base record with `unixtime`, `timestamp` (microseconds),
        `datetime` and `process`, plus `user_ip`, `request_id`, `method` and
        `path` for each of those attributes that is set and truthy on `record`.
        """
        log_record = super()._prepare(record)

        # The time of formatting is used, not the creation time of the record.
        now = time.time()
        log_record.update(
            unixtime=int(now),
            timestamp=int(now * 1_000_000),
            datetime=datetime.datetime.fromtimestamp(now).strftime(DATETIME_FORMAT),
            process=os.getpid(),
        )

        for attribute in ('user_ip', 'request_id', 'method', 'path'):
            value = getattr(record, attribute, None)
            if value:
                log_record[attribute] = value

        return log_record

    def format(self, record):
        """
        Return the prepared record serialized to a JSON string.
        """
        return json.dumps(self._prepare(record)).decode()
|
||||
|
||||
|
||||
class DefaultFormatter(BaseFormatter):
    """
    JSON formatter adding time and process id fields to every record.
    """

    def _prepare(self, record):
        """
        Extend the base record with `unixtime`, `timestamp` (microseconds),
        `datetime` and `process`.
        """
        log_record = super()._prepare(record)

        # The time of formatting is used, not the creation time of the record.
        now = time.time()
        log_record.update(
            unixtime=int(now),
            timestamp=int(now * 1_000_000),
            datetime=datetime.datetime.fromtimestamp(now).strftime(DATETIME_FORMAT),
            process=os.getpid(),
        )
        return log_record

    def format(self, record):
        """
        Return the prepared record serialized to a JSON string.
        """
        return json.dumps(self._prepare(record)).decode()
|
||||
|
||||
|
||||
class TracebackFormatter(DefaultFormatter):
    """Human-readable formatter: pretty-printed record plus active traceback."""

    def format(self, record):
        """Return the pretty-printed record.

        When an exception is currently being handled, its formatted
        traceback is appended on a new line.
        """
        rendered = pprint.pformat(self._prepare(record), indent=2)
        exception_type = traceback.sys.exc_info()[0]
        if exception_type is None:
            return rendered
        return rendered + '\n' + traceback.format_exc()
|
||||
|
||||
|
||||
default_formatter = DefaultFormatter()
|
||||
default_traceback_formatter = TracebackFormatter()
|
42
library/logging/handlers.py
Normal file
42
library/logging/handlers.py
Normal file
@ -0,0 +1,42 @@
|
||||
import logging.handlers
|
||||
import os
|
||||
import queue
|
||||
|
||||
from izihawa_types.var import varstr
|
||||
|
||||
|
||||
class QueueHandler(logging.handlers.QueueHandler):
    """Queue-backed handler that owns its queue and its ``QueueListener``.

    Records are put on an unbounded in-process queue; a listener started in
    the constructor forwards them to ``handlers``, honouring each handler's
    own level.
    """

    def __init__(self, *handlers):
        """Create the queue, start the listener and set the level to INFO.

        :param handlers: handlers the listener dispatches queued records to.
        """
        self._queue = queue.Queue(-1)
        # ``logging.Handler.__init__`` resets ``self.level`` to NOTSET, so the
        # base initializer must run *before* the level is configured;
        # otherwise the INFO threshold is silently discarded.
        super().__init__(self._queue)
        self.setLevel('INFO')

        self._listener = logging.handlers.QueueListener(self._queue, *handlers, respect_handler_level=True)
        self._listener.start()

    def stop(self):
        """Stop the listener."""
        self._listener.stop()

    def prepare(self, record):
        """Enqueue the record unchanged.

        The stock ``QueueHandler.prepare`` formats the record before
        enqueueing; this override hands the original record (including a
        non-string ``msg``) to the downstream handlers.
        """
        return record
|
||||
|
||||
|
||||
class BaseFileHandler(logging.handlers.WatchedFileHandler):
    """Watched file handler that forces mode ``0o644`` on its log file."""

    def _open(self):
        """Open the log file, set its permissions to ``0o644`` and return the stream."""
        stream = super()._open()
        os.chmod(self.baseFilename, 0o644)
        return stream
|
||||
|
||||
|
||||
class BaseBinaryFileHandler(BaseFileHandler):
    """File handler that appends ``varstr``-encoded record messages in binary mode."""

    def __init__(self, *args, **kwargs):
        """Open the target file in ``'ab+'`` mode; other arguments are passed through."""
        super().__init__(*args, **kwargs, mode='ab+')

    def emit(self, record):
        """Write ``varstr(record.msg)`` to the file and flush.

        Errors other than ``RecursionError`` are routed to ``handleError``.
        """
        try:
            # ``WatchedFileHandler.emit`` normally performs this check; since
            # ``emit`` is overridden here it has to be done explicitly, or the
            # handler would keep writing to a moved/deleted file.
            self.reopenIfNeeded()
            self.stream.write(varstr(record.msg))
            self.flush()
        except RecursionError:
            raise
        except Exception:
            self.handleError(record)
|
18
nexus/README.md
Normal file
18
nexus/README.md
Normal file
@ -0,0 +1,18 @@
|
||||
# Nexus
|
||||
|
||||
## Content
|
||||
|
||||
- ✅ [`actions`](actions) - shared code for ingesting data from external APIs (LibGen/CrossrefAPI)
|
||||
- 🛑 `bot` - telegram bot for Summa
|
||||
- 🛑 `cognitron` - bundled app for IPFS, search server and web frontend
|
||||
- 🛑 `hub` - downloading & sending
|
||||
- ✅ [`ingest`](ingest) - retrieving metadata from external APIs and putting it onto Kafka
|
||||
- 🛑 `meta_api` - rescoring and merging API for Summa backends
|
||||
- ✅ [`models`](models) - shared Protobuf models
|
||||
- ✅ [`nlptools`](nlptools) - text routines
|
||||
- ✅ [`pipe`](pipe) - processing pipeline based on Kafka
|
||||
- 🛑 `pylon` - smart proxy for downloading files from the Internet/IPFS
|
||||
- ✅ [`summa`](summa) - scripts for setting up Summa
|
||||
- 🛑 `translations` - text translations used in `bot` and `hub`
|
||||
- 🛑 `views` - shared views for [`models`](models)
|
||||
- 🛑 `web` - web frontend for Summa
|
0
nexus/__init__.py
Normal file
0
nexus/__init__.py
Normal file
26
nexus/actions/BUILD.bazel
Normal file
26
nexus/actions/BUILD.bazel
Normal file
@ -0,0 +1,26 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
py_library(
|
||||
name = "actions",
|
||||
srcs = glob(
|
||||
["**/*.py"],
|
||||
exclude = ["tests/**"],
|
||||
),
|
||||
imports = ["."],
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("beautifulsoup4"),
|
||||
requirement("lxml"),
|
||||
requirement("pypika"),
|
||||
requirement("numpy"),
|
||||
requirement("aiocrossref"),
|
||||
requirement("aiolibgen"),
|
||||
"//library/aiopostgres",
|
||||
"//nexus/models/proto:models_proto_py",
|
||||
"//nexus/nlptools",
|
||||
"//nexus/summa/schema",
|
||||
requirement("aiosumma"),
|
||||
],
|
||||
)
|
5
nexus/actions/README.md
Normal file
5
nexus/actions/README.md
Normal file
@ -0,0 +1,5 @@
|
||||
# Nexus Actions
|
||||
|
||||
`Actions` is segregated dirty code for processing Crossref API and LibGen API responses.
|
||||
The module also contains the parts required for landing data in databases and/or search engines.
|
||||
|
21
nexus/actions/__init__.py
Normal file
21
nexus/actions/__init__.py
Normal file
@ -0,0 +1,21 @@
|
||||
from .update_document import SendDocumentOperationUpdateDocumentPbToSummaAction
|
||||
from .update_document_scimag import (
|
||||
CleanDocumentOperationUpdateDocumentScimagPbAction,
|
||||
FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction,
|
||||
SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction,
|
||||
SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction,
|
||||
)
|
||||
from .update_document_scitech import (
|
||||
CleanDocumentOperationUpdateDocumentScitechPbAction,
|
||||
SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'CleanDocumentOperationUpdateDocumentScimagPbAction',
|
||||
'CleanDocumentOperationUpdateDocumentScitechPbAction',
|
||||
'FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction',
|
||||
'SendDocumentOperationUpdateDocumentPbToSummaAction',
|
||||
'SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction',
|
||||
'SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction',
|
||||
'SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction',
|
||||
]
|
6
nexus/actions/base.py
Normal file
6
nexus/actions/base.py
Normal file
@ -0,0 +1,6 @@
|
||||
from aiokit import AioThing
|
||||
|
||||
|
||||
class BaseAction(AioThing):
    """Base class for actions; subclasses override :meth:`do`."""

    async def do(self, item):
        """Process ``item``. The base implementation does nothing and returns ``None``."""
        return None
|
5
nexus/actions/common.py
Normal file
5
nexus/actions/common.py
Normal file
@ -0,0 +1,5 @@
|
||||
from urllib.parse import unquote
|
||||
|
||||
|
||||
def canonize_doi(doi):
    """Return the canonical form of ``doi``: percent-decoded and lowercased.

    Decoding happens *before* lowercasing so that percent-encoded uppercase
    characters (e.g. ``%41`` -> ``A``) are lowercased as well; lowercasing
    first would leave them uppercase in the result.
    """
    return unquote(doi).lower()
|
108
nexus/actions/crossref_api.py
Normal file
108
nexus/actions/crossref_api.py
Normal file
@ -0,0 +1,108 @@
|
||||
import time
|
||||
from datetime import date
|
||||
|
||||
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
|
||||
|
||||
from .base import BaseAction
|
||||
|
||||
|
||||
def extract_authors(authors):
    """Return ``'<family>, <given>'`` strings for the given author dicts.

    Entries lacking either the ``family`` or the ``given`` key are skipped.
    A falsy ``authors`` value yields an empty list.
    """
    if not authors:
        return []
    return [
        f'{person["family"]}, {person["given"]}'
        for person in authors
        if 'family' in person and 'given' in person
    ]
|
||||
|
||||
|
||||
def extract_dates(date_parts):
    """Convert ``date-parts`` (a list whose first item is ``[year, month, day]``,
    possibly shorter) into ``(year_string, issued_at)``.

    Missing or zero month/day default to 1. ``issued_at`` is the Unix
    timestamp of midnight **UTC** of that date, so the value does not depend
    on the timezone of the machine running the code.

    :return: ``('', None)`` when no usable year is present, otherwise
        ``(str(year), issued_at)``.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    import calendar

    if not date_parts or not date_parts[0]:
        return '', None
    # Pad short entries with zeros and ignore anything beyond year/month/day.
    year, month, day = (list(date_parts[0]) + [0, 0, 0])[:3]
    if not year:
        return '', None
    issued = date(year=year, month=month or 1, day=day or 1)
    return str(year), calendar.timegm(issued.timetuple())
|
||||
|
||||
|
||||
def extract_first(arr, default=''):
    """Return the first element of ``arr``, or ``default`` when ``arr`` is falsy."""
    return arr[0] if arr else default
|
||||
|
||||
|
||||
def extract_page(page, default=0):
    """Parse a page number out of ``page`` by keeping only its digit characters.

    Returns ``default`` when there are no digits or when the number does not
    fit below ``2**31``.
    """
    digits = ''.join(symbol for symbol in page if symbol.isdigit())
    if not digits:
        return default
    number = int(digits)
    return number if number < 2**31 else default
|
||||
|
||||
|
||||
def extract_pages(pages, default=0):
    """Split a ``'first-last'`` page string into ``(first_page, last_page)``.

    A single value yields ``(first_page, default)``. ``None``, more than one
    hyphen, or a ``ValueError`` while parsing yields ``(default, default)``.
    """
    try:
        if pages is None:
            return default, default
        chunks = pages.split('-')
        first, last = default, default
        if len(chunks) in (1, 2):
            first = extract_page(chunks[0], default=default)
        if len(chunks) == 2:
            last = extract_page(chunks[1], default=default)
        return first, last
    except ValueError:
        return default, default
|
||||
|
||||
|
||||
def extract_references(references):
    """Return the non-empty ``DOI`` values of the given reference dicts.

    Returns ``None`` (not an empty list) when ``references`` is falsy.
    """
    if not references:
        return None
    return [entry['DOI'] for entry in references if entry.get('DOI')]
|
||||
|
||||
|
||||
def extract_title(title, subtitle):
    """Join the stripped title and subtitle with ``': '``, skipping empty parts."""
    parts = (title.strip(), subtitle.strip())
    return ': '.join(part for part in parts if part)
|
||||
|
||||
|
||||
class CrossrefApiToThinScimagPbAction(BaseAction):
    """Build a ``ScimagPb`` carrying only the DOI of a Crossref item."""

    async def do(self, item: dict) -> ScimagPb:
        """Return ``ScimagPb(doi=item['DOI'])``; raises ``KeyError`` if ``DOI`` is absent."""
        doi = item['DOI']
        return ScimagPb(doi=doi)
|
||||
|
||||
|
||||
class CrossrefApiToScimagPbAction(BaseAction):
    """Convert a Crossref API work item (a dict) into a full ``ScimagPb``."""

    async def do(self, item: dict) -> ScimagPb:
        """Map the item's fields onto a new ``ScimagPb`` and return it.

        ``DOI`` is required (``KeyError`` otherwise); every other field is
        read with ``dict.get``. Authors come from ``author`` or, when that is
        missing/empty, from ``editor``.
        """
        title = extract_title(
            extract_first(item.get('title')),
            extract_first(item.get('subtitle')),
        )
        scimag_pb = ScimagPb(
            abstract=item.get('abstract'),
            container_title=extract_first(item.get('container-title')),
            doi=item['DOI'],
            issue=item.get('issue'),
            issns=item.get('ISSN'),
            language=item.get('language'),
            ref_by_count=item.get('is-referenced-by-count'),
            references=extract_references(item.get('reference')),
            tags=item.get('subject'),
            title=title,
            type=item.get('type'),
            volume=item.get('volume'),
        )

        contributors = item.get('author') or item.get('editor')
        if contributors:
            scimag_pb.authors.extend(extract_authors(contributors))

        first_page, last_page = extract_pages(item.get('page'))
        scimag_pb.first_page = first_page
        scimag_pb.last_page = last_page

        year, issued_at = extract_dates(item.get('issued', {}).get('date-parts'))
        scimag_pb.year = year
        # ``issued_at`` stays unset when no date could be extracted.
        if issued_at is not None:
            scimag_pb.issued_at = issued_at

        return scimag_pb
|
17
nexus/actions/exceptions.py
Normal file
17
nexus/actions/exceptions.py
Normal file
@ -0,0 +1,17 @@
|
||||
from typing import List
|
||||
|
||||
from izihawa_utils.exceptions import BaseError
|
||||
|
||||
|
||||
class InterruptProcessing(BaseError):
    """Error carrying the id of a document and the reason its processing is interrupted."""

    code = 'interrupt_processing'

    def __init__(self, doc_id, reason):
        payload = dict(doc_id=doc_id, reason=reason)
        super().__init__(**payload)
|
||||
|
||||
|
||||
class ConflictError(BaseError):
    """Error carrying a document together with the list of its duplicates."""

    code = 'conflict_error'

    def __init__(self, document, duplicates: List[dict]):
        payload = dict(document=document, duplicates=duplicates)
        super().__init__(**payload)
|
8
nexus/actions/golden_postgres.py
Normal file
8
nexus/actions/golden_postgres.py
Normal file
@ -0,0 +1,8 @@
|
||||
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
|
||||
|
||||
from .base import BaseAction
|
||||
|
||||
|
||||
class GoldenPostgresToThinScimagPbAction(BaseAction):
    """Build a ``ScimagPb`` carrying only the ``doi`` of the given row dict."""

    async def do(self, item: dict) -> ScimagPb:
        """Return ``ScimagPb(doi=item['doi'])``; raises ``KeyError`` if ``doi`` is absent."""
        doi = item['doi']
        return ScimagPb(doi=doi)
|
125
nexus/actions/libgen_api.py
Normal file
125
nexus/actions/libgen_api.py
Normal file
@ -0,0 +1,125 @@
|
||||
import numpy as np
|
||||
from izihawa_types.safecast import safe_int
|
||||
from nexus.models.proto.scitech_pb2 import Scitech as ScitechPb
|
||||
|
||||
from .base import BaseAction
|
||||
|
||||
LANGUAGE_TRANSLATION = {
|
||||
'English': 'en',
|
||||
'Russian': 'ru',
|
||||
'German': 'de',
|
||||
'Ukrainian': 'uk',
|
||||
'French': 'fr',
|
||||
'Italian': 'it',
|
||||
'Spanish': 'es',
|
||||
'Portuguese': 'pt',
|
||||
'Chinese': 'cn',
|
||||
'Polish': 'pl',
|
||||
'english': 'en',
|
||||
'Russian-Ukrainian': 'ru,uk',
|
||||
'Russian-Ukrainian-English': 'en,ru,uk',
|
||||
'Russian(Old)': 'ru',
|
||||
'English-Russian': 'en,ru',
|
||||
'Turkish': 'tr',
|
||||
'Greek': 'el',
|
||||
'Romanian': 'ro',
|
||||
'Russian (Old)': 'ru',
|
||||
'Arabic': 'ar',
|
||||
'Français': 'fr',
|
||||
'Dutch': 'nl',
|
||||
'Japanese': 'ja',
|
||||
'Persian': 'fa',
|
||||
'Hungarian': 'hu',
|
||||
'Latin': 'la',
|
||||
'Serbian': 'sr',
|
||||
'Spanish,Castilian': 'es',
|
||||
'German-Russian': 'de,ru',
|
||||
'Croatian': 'hr',
|
||||
'Lithuanian': 'lt',
|
||||
'Hebrew': 'iw',
|
||||
'French-Russian': 'fr,ru',
|
||||
'Czech': 'cs',
|
||||
'Kazakh': 'kz',
|
||||
'Swedish': 'sv',
|
||||
'Indonesian': 'id',
|
||||
'Greek(Modern)': 'el',
|
||||
'Chinese(PRC)': 'cn',
|
||||
'Belorussian': 'by',
|
||||
'Deutsch': 'de',
|
||||
'German-English': 'de,en',
|
||||
'English, German': 'de,en',
|
||||
'English-Ukrainian': 'en,uk',
|
||||
'English, French': 'en,fr',
|
||||
'Bulgarian': 'bg',
|
||||
'Romanian,Moldavian,Moldovan': 'mo',
|
||||
'Belarusian': 'by',
|
||||
'Finnish': 'fi',
|
||||
'Azerbaijani': 'az',
|
||||
'Bengali': 'bn',
|
||||
'English-French': 'en,fr',
|
||||
'English-German': 'de,en',
|
||||
'Chinese-English': 'cn,en',
|
||||
}
|
||||
|
||||
|
||||
def create_cu(libgen_id, coverurl, md5):
    """Compress a cover URL into ``(coverurl, cu_suf)``.

    When ``coverurl`` equals ``'<bulk_id>/<md5>.jpg'``,
    ``'<bulk_id>/<md5>-d.jpg'`` or ``'<bulk_id>/<md5>-g.jpg'`` (``bulk_id``
    being ``libgen_id`` rounded down to a multiple of 1000), the URL is
    replaced by an empty string and only the suffix (``''``, ``'d'`` or
    ``'g'``) is returned. Any other URL is returned unchanged with an empty
    suffix.
    """
    bulk_id = libgen_id - (libgen_id % 1000)
    for suffix in ('', 'd', 'g'):
        tail = f'-{suffix}' if suffix else ''
        if coverurl == f"{bulk_id}/{md5}{tail}.jpg":
            return '', suffix
    return coverurl, ''
|
||||
|
||||
|
||||
class LibgenApiToScitechPbAction(BaseAction):
    """Convert a LibGen API record (a dict of strings) into a ``ScitechPb``."""

    async def do(self, item: dict) -> ScitechPb:
        """Map the record's fields onto a new ``ScitechPb`` and return it.

        ``filesize``, ``identifier``, ``language``, ``id``, ``md5``,
        ``pages``, ``tags``, ``title``, ``coverurl`` and ``year`` are
        required keys (``KeyError`` otherwise); the remaining fields are read
        with ``dict.get``.
        """
        # ``''.split('; ')`` yields ``['']``; drop blank names so a record
        # without authors produces an empty list rather than one empty author.
        authors = [
            name
            for name in (item.get('author') or '').split('; ')
            if name.strip()
        ]
        # Identifiers are separated by ';' or ','; hyphens are removed.
        isbns = [
            isbn
            for isbn in (
                raw.replace('-', '').strip()
                for raw in item['identifier'].replace(';', ',').split(',')
            )
            if isbn
        ]
        tags = [
            tag
            for tag in (raw.strip() for raw in item['tags'].split(';'))
            if tag
        ]

        scitech_pb = ScitechPb(
            authors=authors,
            description=item.get('descr'),
            doi=item.get('doi'),
            edition=item.get('edition'),
            extension=item.get('extension'),
            filesize=safe_int(item['filesize']) or 0,
            # Any non-empty ``visible`` value marks the record as deleted.
            is_deleted=item.get('visible', '') != '',
            isbns=isbns,
            language=LANGUAGE_TRANSLATION.get(item['language']),
            libgen_id=int(item['id']),
            md5=item['md5'].lower(),
            pages=safe_int(item['pages']),
            series=item.get('series'),
            tags=tags,
            title=item['title'],
        )

        scitech_pb.cu, scitech_pb.cu_suf = create_cu(
            libgen_id=scitech_pb.libgen_id,
            coverurl=item['coverurl'].lower(),
            md5=scitech_pb.md5
        )
        year = safe_int(item['year'])
        if year and year < 9999:
            scitech_pb.year = str(year)
            # January 1st of ``year`` as seconds since the Unix epoch.
            scitech_pb.issued_at = np.datetime64(scitech_pb.year).astype('<M8[s]').astype(np.int64)
        return scitech_pb
|
73
nexus/actions/scimag.py
Normal file
73
nexus/actions/scimag.py
Normal file
@ -0,0 +1,73 @@
|
||||
from html import unescape
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from nexus.actions.common import canonize_doi
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
|
||||
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
|
||||
from nexus.models.proto.typed_document_pb2 import \
|
||||
TypedDocument as TypedDocumentPb
|
||||
from nexus.nlptools.language_detect import detect_language
|
||||
from nexus.nlptools.utils import (
|
||||
despace,
|
||||
despace_full,
|
||||
)
|
||||
|
||||
from .base import BaseAction
|
||||
|
||||
|
||||
class CleanScimagPbAction(BaseAction):
    """Normalize the text fields, DOIs and languages of a ``ScimagPb`` in place."""

    # ``html_ent`` artifacts found in container titles, applied in order as
    # (pattern, replacement) pairs.
    _CONTAINER_TITLE_REPLACEMENTS = (
        (
            '<html_ent glyph="@lt;" ascii="<"/>'
            'html_ent glyph="@amp;" ascii="<html_ent glyph="@amp;" ascii="&"/>"/'
            '<html_ent glyph="@gt;" ascii=">"/>',
            '&',
        ),
        ('<html_ent glyph="@amp;" ascii="&"/>', '&'),
        (
            '<html_ent glyph="@lt;" ascii="<"/>'
            'html_ent glyph="@amp;" ascii="&"/'
            '<html_ent glyph="@gt;" ascii=">"/>',
            '&',
        ),
        ('<html_ent glyph="@lt;" ascii="<"/>', ''),
        ('<html_ent glyph="@gt;" ascii=">"/>', ''),
    )

    async def do(self, scimag_pb: ScimagPb) -> ScimagPb:
        """Clean ``scimag_pb`` and return the same (mutated) message.

        Markup is stripped from abstract, title, authors and container
        title; the DOI and the references are canonized; ``meta_language`` is
        detected from title + abstract when unset, and ``language`` falls
        back to ``meta_language``.
        """
        if scimag_pb.abstract:
            abstract_soup = BeautifulSoup(unescape(scimag_pb.abstract), 'lxml')
            # Put every paragraph/title element on its own line before the
            # markup is dropped.
            for element in abstract_soup.select(r'p, title, jats\:title, jats\:p'):
                element.replace_with(f'\n{element.text.strip()}\n')
            scimag_pb.abstract = despace(abstract_soup.text.strip())

        if scimag_pb.title:
            title_text = BeautifulSoup(unescape(scimag_pb.title), 'lxml').text
            scimag_pb.title = despace_full(title_text.strip())

        if scimag_pb.authors:
            for index, author in enumerate(scimag_pb.authors):
                author_text = BeautifulSoup(unescape(author), 'lxml').text
                scimag_pb.authors[index] = despace_full(author_text.strip())

        if scimag_pb.container_title:
            for pattern, replacement in self._CONTAINER_TITLE_REPLACEMENTS:
                scimag_pb.container_title = scimag_pb.container_title.replace(pattern, replacement)
            scimag_pb.container_title = BeautifulSoup(unescape(scimag_pb.container_title), 'lxml').text.strip()

        if scimag_pb.doi:
            scimag_pb.doi = canonize_doi(scimag_pb.doi)

        if scimag_pb.references:
            canonized_references = [canonize_doi(reference) for reference in scimag_pb.references]
            del scimag_pb.references[:]
            scimag_pb.references.extend(canonized_references)

        if not scimag_pb.meta_language and (scimag_pb.title or scimag_pb.abstract):
            detected_language = detect_language(f'{scimag_pb.title} {scimag_pb.abstract}')
            if detected_language:
                scimag_pb.meta_language = detected_language

        if not scimag_pb.language:
            scimag_pb.language = scimag_pb.meta_language

        return scimag_pb
|
||||
|
||||
|
||||
class ScimagPbToDocumentOperationBytesAction(BaseAction):
    """Wrap a ``ScimagPb`` into a serialized update ``DocumentOperation``."""

    async def do(self, item: ScimagPb) -> bytes:
        """Return the serialized bytes of an update operation for ``item``.

        The operation has ``reindex`` and
        ``should_fill_from_external_source`` set to ``True``.
        """
        typed_document = TypedDocumentPb(scimag=item)
        update_document = UpdateDocumentPb(
            reindex=True,
            should_fill_from_external_source=True,
            typed_document=typed_document,
        )
        return DocumentOperationPb(update_document=update_document).SerializeToString()
|
58
nexus/actions/scitech.py
Normal file
58
nexus/actions/scitech.py
Normal file
@ -0,0 +1,58 @@
|
||||
from html import unescape
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from nexus.actions.common import canonize_doi
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
|
||||
from nexus.models.proto.scitech_pb2 import Scitech as ScitechPb
|
||||
from nexus.models.proto.typed_document_pb2 import \
|
||||
TypedDocument as TypedDocumentPb
|
||||
from nexus.nlptools.language_detect import detect_language
|
||||
from nexus.nlptools.utils import (
|
||||
despace,
|
||||
despace_full,
|
||||
)
|
||||
|
||||
from .base import BaseAction
|
||||
|
||||
|
||||
class CleanScitechAction(BaseAction):
    """Normalize the text fields, identifiers and languages of a ``ScitechPb`` in place."""

    async def do(self, scitech_pb: ScitechPb) -> ScitechPb:
        """Clean ``scitech_pb`` and return the same (mutated) message."""
        if scitech_pb.authors:
            for index, author in enumerate(scitech_pb.authors):
                scitech_pb.authors[index] = despace_full(author)

        if scitech_pb.description:
            description_soup = BeautifulSoup(unescape(scitech_pb.description), 'lxml')
            # Put every paragraph/title element on its own line before the
            # markup is dropped.
            for element in description_soup.select(r'p, title, jats\:title, jats\:p'):
                element.replace_with(f'\n{element.text.strip()}\n')
            scitech_pb.description = despace(description_soup.text.strip())

        scitech_pb.series = despace_full(scitech_pb.series)
        scitech_pb.title = despace_full(scitech_pb.title)

        has_text = scitech_pb.title or scitech_pb.description
        if not scitech_pb.meta_language and has_text:
            detected_language = detect_language(f'{scitech_pb.title} {scitech_pb.description}')
            if detected_language:
                scitech_pb.meta_language = detected_language
        if not scitech_pb.language:
            scitech_pb.language = scitech_pb.meta_language

        scitech_pb.md5 = scitech_pb.md5.lower()
        scitech_pb.extension = scitech_pb.extension.lower()
        scitech_pb.doi = canonize_doi(scitech_pb.doi)
        # The literal string 'None' is treated as "no edition".
        if scitech_pb.edition == 'None':
            scitech_pb.edition = ''
        return scitech_pb
|
||||
|
||||
|
||||
class ScitechPbToDocumentOperationBytesAction(BaseAction):
    """Wrap a ``ScitechPb`` into a serialized update ``DocumentOperation``."""

    async def do(self, item: ScitechPb) -> bytes:
        """Return the serialized bytes of an update operation (``reindex=True``) for ``item``."""
        typed_document = TypedDocumentPb(scitech=item)
        update_document = UpdateDocumentPb(
            reindex=True,
            typed_document=typed_document,
        )
        return DocumentOperationPb(update_document=update_document).SerializeToString()
|
26
nexus/actions/update_document.py
Normal file
26
nexus/actions/update_document.py
Normal file
@ -0,0 +1,26 @@
|
||||
from aiosumma import SummaHttpClient
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
from nexus.summa.schema import coders
|
||||
|
||||
from .base import BaseAction
|
||||
|
||||
|
||||
class SendDocumentOperationUpdateDocumentPbToSummaAction(BaseAction):
    """Send the document of an update operation to Summa over HTTP."""

    def __init__(self, summa):
        """Create the Summa client from the ``summa`` keyword-argument mapping."""
        super().__init__()
        self.summa_client = SummaHttpClient(**summa)
        self.waits.append(self.summa_client)

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Put the typed document into Summa and return the operation unchanged.

        Nothing is sent when ``reindex`` is false or when the document has a
        truthy ``original_id``. A commit is issued when ``commit`` is set.
        """
        update_document_pb = document_operation_pb.update_document
        typed_document = update_document_pb.typed_document
        schema = typed_document.WhichOneof('document')
        document = getattr(typed_document, schema)

        should_skip = not update_document_pb.reindex or getattr(document, 'original_id', None)
        if should_skip:
            return document_operation_pb

        encoded_document = coders[schema].encode_document(document)
        await self.summa_client.put_document(schema, encoded_document)
        if update_document_pb.commit:
            await self.summa_client.commit(schema)
        return document_operation_pb
|
232
nexus/actions/update_document_scimag.py
Normal file
232
nexus/actions/update_document_scimag.py
Normal file
@ -0,0 +1,232 @@
|
||||
import asyncio
|
||||
from typing import (
|
||||
Optional,
|
||||
Set,
|
||||
)
|
||||
|
||||
import aiopg
|
||||
from aiocrossref import CrossrefClient
|
||||
from aiocrossref.exceptions import (
|
||||
NotFoundError,
|
||||
WrongContentTypeError,
|
||||
)
|
||||
from aiokafka import AIOKafkaProducer
|
||||
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
CrossReferenceOperation as CrossReferenceOperationPb
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
|
||||
from pypika import (
|
||||
PostgreSQLQuery,
|
||||
Table,
|
||||
)
|
||||
from pypika.terms import Array
|
||||
|
||||
from .base import BaseAction
|
||||
from .crossref_api import CrossrefApiToScimagPbAction
|
||||
from .exceptions import InterruptProcessing
|
||||
from .scimag import CleanScimagPbAction
|
||||
|
||||
|
||||
class SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction(BaseAction):
    """Insert, update or delete the ``scimag`` row that mirrors a ``ScimagPb``."""

    scimag_table = Table('scimag')
    # Repeated fields, written as SQL arrays.
    db_multi_fields = {
        'authors',
        'ipfs_multihashes',
        'issns',
        'tags',
    }
    # Scalar fields, written as plain values.
    db_single_fields = {
        'id',
        'abstract',
        'container_title',
        'doi',
        'embedding',
        'filesize',
        'first_page',
        'is_deleted',
        'issued_at',
        'issue',
        'journal_id',
        'language',
        'last_page',
        'meta_language',
        'md5',
        'ref_by_count',
        'scimag_bulk_id',
        'telegram_file_id',
        'title',
        'type',
        'updated_at',
        'volume',
    }
    db_fields = db_single_fields | db_multi_fields

    def __init__(self, database):
        """Create the connection pool holder from the ``database`` settings.

        ``database`` must provide ``database``, ``username``, ``password``
        and ``host``.
        """
        super().__init__()
        dsn = ' '.join((
            f'dbname={database["database"]}',
            f'user={database["username"]}',
            f'password={database["password"]}',
            f'host={database["host"]}',
        ))
        self.pool_holder = AioPostgresPoolHolder(
            fn=aiopg.create_pool,
            dsn=dsn,
            timeout=30,
            pool_recycle=60,
            maxsize=4,
        )
        self.waits.append(self.pool_holder)

    def cast_field_value(self, field_name: str, field_value):
        """Return ``(field_name, value)`` with multi-valued fields wrapped in ``Array``."""
        if field_name not in self.db_multi_fields:
            return field_name, field_value
        return field_name, Array(*field_value)

    def is_field_set(self, scimag_pb: ScimagPb, field_name: str):
        """Tell whether ``field_name`` should be written.

        ``scimag_bulk_id`` and ``issued_at`` are checked with ``HasField``;
        every other field is considered set when its value is truthy (the
        value itself is returned).
        """
        if field_name in {'scimag_bulk_id', 'issued_at'}:
            return scimag_pb.HasField(field_name)
        return getattr(scimag_pb, field_name)

    def _set_fields(self, scimag_pb: ScimagPb, fields):
        """Yield cast ``(name, value)`` pairs for every set field among ``fields``."""
        for field_name in fields:
            if self.is_field_set(scimag_pb, field_name):
                yield self.cast_field_value(field_name, getattr(scimag_pb, field_name))

    def generate_delete_sql(self, scimag_pb: ScimagPb):
        """Return the ``DELETE`` statement for the row whose id is ``scimag_pb.id``."""
        query = PostgreSQLQuery.from_('scimag').where(self.scimag_table.id == scimag_pb.id)
        return query.delete().get_sql()

    def generate_insert_sql(self, scimag_pb: ScimagPb, fields: Optional[Set[str]] = None):
        """Return an upsert statement (conflict target ``doi``) that returns the row id.

        Only the set fields among ``fields`` (default: all known fields) are
        written; on conflict the same columns are updated.
        """
        columns = []
        inserts = []
        for field_name, field_value in self._set_fields(scimag_pb, fields or self.db_fields):
            columns.append(field_name)
            inserts.append(field_value)

        query = PostgreSQLQuery.into(self.scimag_table).columns(*columns).insert(*inserts)
        if columns:
            query = query.on_conflict('doi')
            for column, value in zip(columns, inserts):
                query = query.do_update(column, value)

        return query.returning(self.scimag_table.id).get_sql()

    def generate_update_sql(
        self,
        scimag_pb: ScimagPb,
        fields: Optional[Set[str]] = None,
    ) -> str:
        """Return the ``UPDATE`` statement setting the set fields of the row with ``scimag_pb.id``."""
        query = PostgreSQLQuery.update(self.scimag_table)
        for field_name, field_value in self._set_fields(scimag_pb, fields or self.db_fields):
            query = query.set(field_name, field_value)
        return query.where(self.scimag_table.id == scimag_pb.id).get_sql()

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Apply the operation to the database and return it.

        With an id: the row is deleted when ``is_deleted`` is set, otherwise
        updated. Without an id: the document is upserted and the returned id
        is stored back into the message.
        """
        update_document_pb = document_operation_pb.update_document
        scimag_pb = update_document_pb.typed_document.scimag
        fields = update_document_pb.fields or self.db_fields

        if not scimag_pb.id:
            sql = self.generate_insert_sql(
                scimag_pb=scimag_pb,
                fields=fields,
            )
            result = await self.pool_holder.execute(sql, fetch=True)
            scimag_pb.id = result[0][0]
            return document_operation_pb

        if scimag_pb.is_deleted:
            sql = self.generate_delete_sql(scimag_pb)
        else:
            sql = self.generate_update_sql(
                scimag_pb,
                fields=fields,
            )
        await self.pool_holder.execute(sql)
        return document_operation_pb
|
||||
|
||||
|
||||
class SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction(BaseAction):
    """Publish one ``CrossReferenceOperation`` per reference of a scimag document to Kafka."""

    def __init__(self, topic, brokers):
        """Remember the target ``topic`` and ``brokers``; the producer is created in ``start``."""
        super().__init__()
        self.topic = topic
        self.brokers = brokers
        self.producer = None

    async def start(self):
        """Create and start the Kafka producer."""
        self.producer = self.get_producer()
        await self.producer.start()

    async def stop(self):
        """Stop the Kafka producer and drop the reference to it."""
        await self.producer.stop()
        self.producer = None

    def get_producer(self):
        """Build an ``AIOKafkaProducer`` bound to the running event loop."""
        running_loop = asyncio.get_running_loop()
        return AIOKafkaProducer(
            loop=running_loop,
            bootstrap_servers=self.brokers,
        )

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Send every reference as a (source DOI -> target) message and return the operation.

        Messages are sent sequentially, each one awaited before the next.
        """
        scimag_pb = document_operation_pb.update_document.typed_document.scimag
        for target_doi in scimag_pb.references:
            payload = CrossReferenceOperationPb(
                source=scimag_pb.doi,
                target=target_doi,
            ).SerializeToString()
            await self.producer.send_and_wait(self.topic, payload)
        return document_operation_pb
|
||||
|
||||
|
||||
class FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction(BaseAction):
    """Enrich a scimag document with the metadata returned by the Crossref API."""

    def __init__(self, crossref):
        """Create the Crossref client from the ``crossref`` settings.

        ``crossref['rps']`` is required and is turned into the delay between
        requests; ``timeout`` and ``user_agent`` are optional.
        """
        super().__init__()
        request_delay = 1.0 / crossref['rps']
        self.crossref_client = CrossrefClient(
            delay=request_delay,
            max_retries=60,
            timeout=crossref.get('timeout'),
            user_agent=crossref.get('user_agent'),
        )
        self.crossref_api_to_scimag_pb_action = CrossrefApiToScimagPbAction()
        self.waits.append(self.crossref_client)

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Merge the Crossref record for the document's DOI into the document.

        Does nothing unless ``should_fill_from_external_source`` is set.

        :raises InterruptProcessing: when Crossref answers with
            ``WrongContentTypeError`` or ``NotFoundError``.
        """
        update_document_pb = document_operation_pb.update_document
        if update_document_pb.should_fill_from_external_source:
            scimag_pb = update_document_pb.typed_document.scimag
            try:
                crossref_api_response = await self.crossref_client.works(doi=scimag_pb.doi)
            except (WrongContentTypeError, NotFoundError) as e:
                raise InterruptProcessing(doc_id=scimag_pb.doi, reason=str(e))
            fetched_scimag_pb = await self.crossref_api_to_scimag_pb_action.do(crossref_api_response)
            scimag_pb.MergeFrom(fetched_scimag_pb)
        return document_operation_pb
|
||||
|
||||
|
||||
class CleanDocumentOperationUpdateDocumentScimagPbAction(BaseAction):
    """Run ``CleanScimagPbAction`` on the scimag document of an update operation."""

    def __init__(self):
        super().__init__()
        self.cleaner = CleanScimagPbAction()
        self.waits.append(self.cleaner)

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Replace the operation's scimag document with its cleaned version and return the operation."""
        scimag_pb = document_operation_pb.update_document.typed_document.scimag
        cleaned_scimag_pb = await self.cleaner.do(scimag_pb)
        scimag_pb.CopyFrom(cleaned_scimag_pb)
        return document_operation_pb
|
161
nexus/actions/update_document_scitech.py
Normal file
161
nexus/actions/update_document_scitech.py
Normal file
@ -0,0 +1,161 @@
|
||||
import aiopg
|
||||
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
from nexus.models.proto.scitech_pb2 import Scitech as ScitechPb
|
||||
from pypika import (
|
||||
PostgreSQLQuery,
|
||||
Table,
|
||||
functions,
|
||||
)
|
||||
from pypika.terms import Array
|
||||
|
||||
from .base import BaseAction
|
||||
from .exceptions import ConflictError
|
||||
from .scitech import CleanScitechAction
|
||||
|
||||
|
||||
class UuidFunction(functions.Function):
    """pypika function term rendering a call to SQL ``UUID`` with one argument."""

    def __init__(self, uuid, alias=None):
        super().__init__('UUID', uuid, alias=alias)
|
||||
|
||||
|
||||
class SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction(BaseAction):
    """Write the scitech document of an update operation into the ``scitech`` table.

    The document is matched against existing rows by ``id``, ``libgen_id``,
    ``fiction_id`` or ``doi``. Exactly one match is updated, more than one match
    raises ``ConflictError``, otherwise a row is inserted.
    """

    scitech_table = Table('scitech')

    # Columns written as plain values.
    db_single_fields = {
        'id',
        'cu',
        'cu_suf',
        'description',
        'doi',
        'edition',
        'extension',
        'fiction_id',
        'filesize',
        'is_deleted',
        'issued_at',
        'language',
        'libgen_id',
        'meta_language',
        'md5',
        'original_id',
        'pages',
        'series',
        'telegram_file_id',
        'title',
        'updated_at',
        'volume',
    }

    # Columns whose values are wrapped into a pypika ``Array`` before writing.
    db_multi_fields = {
        'authors',
        'ipfs_multihashes',
        'isbns',
        'tags',
    }

    db_fields = db_single_fields | db_multi_fields

    def __init__(self, database):
        """Create the connection pool holder.

        Args:
            database: mapping with ``database``, ``username``, ``password``
                and ``host`` keys used to build the DSN.
        """
        super().__init__()
        dsn = (
            f'dbname={database["database"]} '
            f'user={database["username"]} '
            f'password={database["password"]} '
            f'host={database["host"]}'
        )
        self.pool_holder = AioPostgresPoolHolder(
            fn=aiopg.create_pool,
            dsn=dsn,
            timeout=30,
            pool_recycle=60,
            maxsize=4,
        )
        self.waits.append(self.pool_holder)

    def cast_field_value(self, field_name, field_value):
        """Return ``(field_name, value)`` with multi-valued fields wrapped in ``Array``."""
        if field_name not in self.db_multi_fields:
            return field_name, field_value
        return field_name, Array(*field_value)

    def is_field_set(self, scitech_pb: ScitechPb, field_name: str):
        """Tell whether ``field_name`` carries a value worth writing.

        ``issued_at`` is checked through ``HasField``; every other field is
        judged by the truthiness of its value (the value itself is returned).
        """
        current_value = getattr(scitech_pb, field_name)
        # NOTE(review): presumably issued_at is an explicit-presence field so
        # that a zero value still counts as set - confirm against the schema.
        if field_name in {'issued_at'}:
            return scitech_pb.HasField(field_name)
        return current_value

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Update or insert the scitech row and copy the stored ids back.

        Returns the same operation message; when a write happened,
        ``scitech.id`` and ``scitech.original_id`` are refreshed from the
        database (a NULL ``original_id`` becomes ``0``).

        Raises:
            ConflictError: more than one existing row matches the document.
        """
        update_document_pb = document_operation_pb.update_document
        scitech_pb = update_document_pb.typed_document.scitech
        fields = update_document_pb.fields or self.db_fields

        # NOTE: md5 is deliberately not a lookup key; matching on it was
        # disabled in the original code.
        conditions = [
            getattr(self.scitech_table, key) == getattr(scitech_pb, key)
            for key in ('id', 'libgen_id', 'fiction_id', 'doi')
            if getattr(scitech_pb, key)
        ]
        if not conditions:
            # Nothing to identify the document by: leave the operation as is.
            return document_operation_pb

        # OR all lookup conditions together.
        match_any = conditions[0]
        for extra_condition in conditions[1:]:
            match_any = match_any | extra_condition

        count_sql = (
            PostgreSQLQuery
            .from_(self.scitech_table)
            .select(functions.Count('*'))
            .where(match_any)
            .get_sql()
        )
        count_rows = await self.pool_holder.execute(
            count_sql,
            fetch=True
        )
        count = count_rows[0][0]

        if count > 1:
            raise ConflictError(scitech_pb, duplicates=[])

        # (column, value) pairs for every requested field that is set.
        assignments = [
            self.cast_field_value(field_name, getattr(scitech_pb, field_name))
            for field_name in fields
            if self.is_field_set(scitech_pb, field_name)
        ]

        if count == 1:
            query = PostgreSQLQuery.update(self.scitech_table)
            for column, value in assignments:
                query = query.set(column, value)
            sql = query.where(match_any).returning('id', 'original_id').get_sql()
        else:
            columns = [column for column, _ in assignments]
            inserts = [value for _, value in assignments]
            query = (
                PostgreSQLQuery
                .into(self.scitech_table)
                .columns(*columns)
                .insert(*inserts)
                .on_conflict('libgen_id', 'doi')
            )
            for column, value in assignments:
                query = query.do_update(column, value)
            sql = query.returning('id', 'original_id').get_sql()

        stored_rows = await self.pool_holder.execute(sql, fetch=True)
        scitech_pb.id, scitech_pb.original_id = stored_rows[0][0], stored_rows[0][1] or 0
        return document_operation_pb
|
||||
|
||||
|
||||
class CleanDocumentOperationUpdateDocumentScitechPbAction(BaseAction):
    """Apply ``CleanScitechAction`` to the scitech document of an update operation."""

    def __init__(self):
        super().__init__()
        self.cleaner = CleanScitechAction()
        self.waits.append(self.cleaner)

    async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
        """Replace the embedded scitech document with its cleaned version.

        The cleaned message is copied into the existing submessage in place
        and the same operation message is returned.
        """
        scitech_pb = document_operation_pb.update_document.typed_document.scitech
        cleaned_pb = await self.cleaner.do(scitech_pb)
        scitech_pb.CopyFrom(cleaned_pb)
        return document_operation_pb
|
34
nexus/ingest/BUILD.bazel
Normal file
34
nexus/ingest/BUILD.bazel
Normal file
@ -0,0 +1,34 @@
|
||||
load("@io_bazel_rules_docker//python3:image.bzl", "py3_image")
|
||||
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
|
||||
alias(
|
||||
name = "binary",
|
||||
actual = ":image.binary",
|
||||
)
|
||||
|
||||
py3_image(
|
||||
name = "image",
|
||||
srcs = glob(["**/*.py"]),
|
||||
base = "//images/production:base-python-image",
|
||||
data = [
|
||||
"configs/base.yaml",
|
||||
"configs/logging.yaml",
|
||||
],
|
||||
main = "main.py",
|
||||
srcs_version = "PY3ONLY",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("aiokafka"),
|
||||
requirement("aiopg"),
|
||||
requirement("fire"),
|
||||
requirement("aiocrossref"),
|
||||
requirement("aiokit"),
|
||||
requirement("aiolibgen"),
|
||||
"//library/aiopostgres",
|
||||
"//library/configurator",
|
||||
"//library/logging",
|
||||
"//nexus/actions",
|
||||
],
|
||||
)
|
||||
|
45
nexus/ingest/README.md
Normal file
45
nexus/ingest/README.md
Normal file
@ -0,0 +1,45 @@
|
||||
# Nexus Ingest
|
||||
|
||||
`Ingest` fetches data from the Internet and sends the retrieved data to the Kafka queue of operations.
|
||||
The `configs` subdirectory has been cut from this version because the configs depend heavily on the network infrastructure you are using.
|
||||
You have to write your own configs, using the example below as a reference.
|
||||
|
||||
## Sample `configs/base.yaml`
|
||||
|
||||
```yaml
|
||||
---
|
||||
jobs:
|
||||
crossref-api:
|
||||
class: nexus.ingest.jobs.CrossrefApiJob
|
||||
kwargs:
|
||||
actions:
|
||||
- class: nexus.actions.crossref_api.CrossrefApiToThinScimagPbAction
|
||||
- class: nexus.actions.scimag.ScimagPbToDocumentOperationBytesAction
|
||||
base_url: https://api.crossref.org/
|
||||
max_retries: 60
|
||||
retry_delay: 10
|
||||
sinks:
|
||||
- class: nexus.ingest.sinks.KafkaSink
|
||||
kwargs:
|
||||
kafka_hosts:
|
||||
- kafka-0.example.net
|
||||
- kafka-1.example.net
|
||||
topic_name: operations_binary
|
||||
libgen-api:
|
||||
class: nexus.ingest.jobs.LibgenApiJob
|
||||
kwargs:
|
||||
actions:
|
||||
- class: nexus.actions.libgen_api.LibgenApiToScitechPbAction
|
||||
- class: nexus.actions.scitech.ScitechPbToDocumentOperationBytesAction
|
||||
base_url: libgen.example.net
|
||||
max_retries: 60
|
||||
retry_delay: 10
|
||||
sinks:
|
||||
- class: nexus.ingest.sinks.KafkaSink
|
||||
kwargs:
|
||||
kafka_hosts:
|
||||
- kafka-0.example.net
|
||||
- kafka-1.example.net
|
||||
topic_name: operations_binary
|
||||
log_path: '/var/log/nexus-ingest/{{ ENV_TYPE }}'
|
||||
```
|
6
nexus/ingest/__init__.py
Normal file
6
nexus/ingest/__init__.py
Normal file
@ -0,0 +1,6 @@
|
||||
from . import (
|
||||
jobs,
|
||||
sinks,
|
||||
)
|
||||
|
||||
__all__ = ['jobs', 'sinks']
|
5
nexus/ingest/jobs/__init__.py
Normal file
5
nexus/ingest/jobs/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from .crossref_api import CrossrefApiJob
|
||||
from .libgen_api import LibgenApiJob
|
||||
from .self_feed import SelfFeedJob
|
||||
|
||||
__all__ = ['CrossrefApiJob', 'LibgenApiJob', 'SelfFeedJob']
|
47
nexus/ingest/jobs/base.py
Normal file
47
nexus/ingest/jobs/base.py
Normal file
@ -0,0 +1,47 @@
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncIterable,
|
||||
Iterable,
|
||||
)
|
||||
|
||||
from aiokit import AioRootThing
|
||||
from izihawa_utils.importlib import import_object
|
||||
|
||||
from ..sinks.base import BaseSink
|
||||
|
||||
|
||||
class BaseJob(AioRootThing):
    """Base class for ingest jobs.

    A job takes items from :meth:`iterator`, passes each one through the
    configured actions in order and sends the result to every configured sink.
    Subclasses override :meth:`iterator` with an async generator.
    """

    name = None

    def __init__(self, actions: Iterable[dict], sinks: Iterable[dict]):
        """Instantiate sinks and actions from their configs.

        Args:
            actions: action configs; each is a dict with a ``class`` import
                path and optional constructor ``kwargs``.
            sinks: ready ``BaseSink`` instances, or sink configs with the same
                ``class``/``kwargs`` shape.
        """
        super().__init__()
        real_sinks = []
        for sink in sinks:
            if isinstance(sink, BaseSink):
                real_sinks.append(sink)
            else:
                real_sinks.append(import_object(sink['class'])(**sink.get('kwargs', {})))
        self.sinks = real_sinks

        real_actions = []
        for action in actions:
            real_actions.append(import_object(action['class'])(**action.get('kwargs', {})))
        self.actions = real_actions

        self.waits.extend(self.sinks)
        self.waits.extend(self.actions)

    async def iterator(self) -> AsyncIterable[Any]:
        """Yield raw items to process; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, on the first iteration step.
        """
        raise NotImplementedError()
        # The unreachable ``yield`` makes this an async generator. Without it
        # the method is a plain coroutine, and ``async for`` in
        # ``action_iterator`` fails with TypeError instead of
        # NotImplementedError.
        yield  # pragma: no cover

    async def action_iterator(self) -> AsyncIterable[Any]:
        """Yield every item from :meth:`iterator` after applying all actions in order."""
        async for item in self.iterator():
            processed_item = item
            for action in self.actions:
                processed_item = await action.do(processed_item)
            yield processed_item

    async def start(self):
        """Send every processed item to each sink, one sink after another."""
        async for data in self.action_iterator():
            for sink in self.sinks:
                await sink.send(data)
|
40
nexus/ingest/jobs/crossref_api.py
Normal file
40
nexus/ingest/jobs/crossref_api.py
Normal file
@ -0,0 +1,40 @@
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncIterable,
|
||||
Iterable,
|
||||
Optional,
|
||||
)
|
||||
|
||||
from aiocrossref import CrossrefClient
|
||||
from nexus.ingest.jobs.base import BaseJob
|
||||
|
||||
|
||||
class CrossrefApiJob(BaseJob):
    """Job that iterates over works returned by the Crossref client cursor."""

    name = 'crossref-api'

    def __init__(
        self,
        base_url: str,
        max_retries: int,
        retry_delay: int,
        actions: Iterable[dict],
        sinks: Iterable[dict],
        from_date: Optional[str] = None,
    ):
        """Set up the Crossref client and the lower date bound.

        Args:
            base_url, max_retries, retry_delay: forwarded to ``CrossrefClient``.
            actions, sinks: forwarded to ``BaseJob``.
            from_date: value for the ``from-index-date`` filter; when empty,
                yesterday's local date is used.
        """
        super().__init__(actions=actions, sinks=sinks)
        self.crossref_client = CrossrefClient(
            base_url=base_url,
            max_retries=max_retries,
            retry_delay=retry_delay,
        )
        if not from_date:
            # str() of a date gives the YYYY-MM-DD form.
            yesterday = datetime.now().date() - timedelta(days=1)
            from_date = str(yesterday)
        self.from_date = from_date
        self.starts.append(self.crossref_client)

    async def iterator(self) -> AsyncIterable[Any]:
        """Yield each entry of ``chunk['items']`` for every chunk of the works cursor."""
        cursor = self.crossref_client.works_cursor(
            filter=f'from-index-date:{self.from_date}',
            rows=1000,
            select='DOI',
        )
        async for chunk in cursor:
            for work in chunk['items']:
                yield work
|
35
nexus/ingest/jobs/libgen_api.py
Normal file
35
nexus/ingest/jobs/libgen_api.py
Normal file
@ -0,0 +1,35 @@
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncIterable,
|
||||
Iterable,
|
||||
Optional,
|
||||
)
|
||||
|
||||
from aiolibgen import LibgenClient
|
||||
from nexus.ingest.jobs.base import BaseJob
|
||||
|
||||
|
||||
class LibgenApiJob(BaseJob):
    """Job that iterates over items the Libgen client reports as newer than a date."""

    name = 'libgen-api'

    def __init__(
        self,
        base_url: str,
        max_retries: int,
        retry_delay: int,
        actions: Iterable[dict],
        sinks: Iterable[dict],
        from_date: Optional[str] = None,
    ):
        """Set up the Libgen client and the lower date bound.

        Args:
            base_url, max_retries, retry_delay: forwarded to ``LibgenClient``.
            actions, sinks: forwarded to ``BaseJob``.
            from_date: date part of the ``timenewer`` argument; when empty,
                yesterday's local date is used.
        """
        super().__init__(sinks=sinks, actions=actions)
        self.libgen_client = LibgenClient(
            base_url=base_url,
            max_retries=max_retries,
            retry_delay=retry_delay,
        )
        if not from_date:
            # str() of a date gives the YYYY-MM-DD form.
            yesterday = datetime.now().date() - timedelta(days=1)
            from_date = str(yesterday)
        self.from_date = from_date
        self.starts.append(self.libgen_client)

    async def iterator(self) -> AsyncIterable[Any]:
        """Yield every item returned by ``LibgenClient.newer`` since midnight of ``from_date``."""
        newer_items = self.libgen_client.newer(timenewer=f'{self.from_date} 00:00:00')
        async for entry in newer_items:
            yield entry
|
39
nexus/ingest/jobs/self_feed.py
Normal file
39
nexus/ingest/jobs/self_feed.py
Normal file
@ -0,0 +1,39 @@
|
||||
from typing import (
|
||||
Any,
|
||||
AsyncIterable,
|
||||
Iterable,
|
||||
)
|
||||
|
||||
import aiopg
|
||||
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
||||
from nexus.ingest.jobs.base import BaseJob
|
||||
|
||||
|
||||
class SelfFeedJob(BaseJob):
    """Job that feeds the pipeline with the rows returned by a configured SQL query."""

    name = 'self-feed-job'

    def __init__(
        self,
        database: dict,
        sql: str,
        actions: Iterable[dict],
        sinks: Iterable[dict],
    ):
        """Store the query and create the connection pool holder.

        Args:
            database: mapping with ``database``, ``username``, ``password``
                and ``host`` keys used to build the DSN.
            sql: query executed by :meth:`iterator`.
            actions, sinks: forwarded to ``BaseJob``.
        """
        super().__init__(actions=actions, sinks=sinks)
        self.sql = sql
        dsn = (
            f'dbname={database["database"]} '
            f'user={database["username"]} '
            f'password={database["password"]} '
            f'host={database["host"]}'
        )
        self.pool_holder = AioPostgresPoolHolder(
            fn=aiopg.create_pool,
            dsn=dsn,
            timeout=30,
            pool_recycle=60,
            maxsize=4,
        )
        self.waits.append(self.pool_holder)

    async def iterator(self) -> AsyncIterable[Any]:
        """Execute the configured query once and yield its rows one by one."""
        fetched_rows = await self.pool_holder.execute(self.sql, fetch=True, timeout=3600)
        for record in fetched_rows:
            yield record
|
35
nexus/ingest/main.py
Normal file
35
nexus/ingest/main.py
Normal file
@ -0,0 +1,35 @@
|
||||
import fire
|
||||
from aiokit.utils import sync_fu
|
||||
from izihawa_utils.importlib import import_object
|
||||
from library.logging import (
|
||||
configure_logging,
|
||||
error_log,
|
||||
)
|
||||
from nexus.ingest.configs import get_config
|
||||
|
||||
|
||||
async def run_job(name, **kwargs):
    """Build the job called ``name`` from the config and run it to completion.

    Keyword arguments override the ``kwargs`` configured for the job. Any
    exception is logged through ``error_log`` and re-raised; the job is
    stopped in every case.
    """
    config = get_config()
    configure_logging(config)

    job_settings = config['jobs'][name]
    job_factory = import_object(job_settings['class'])
    # Copy first so the overrides do not leak into the loaded config.
    constructor_kwargs = job_settings['kwargs'].copy()
    constructor_kwargs.update(kwargs)
    job = job_factory(**constructor_kwargs)

    try:
        await job.start_and_wait()
    except Exception as error:
        error_log(error)
        raise
    finally:
        await job.stop()
|
||||
|
||||
|
||||
def main():
    """CLI entry point: expose ``run_job`` as the ``run-job`` Fire command."""
    commands = {'run-job': sync_fu(run_job)}
    fire.Fire(commands)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
3
nexus/ingest/sinks/__init__.py
Normal file
3
nexus/ingest/sinks/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from .kafka import KafkaSink
|
||||
|
||||
__all__ = ['KafkaSink', ]
|
12
nexus/ingest/sinks/base.py
Normal file
12
nexus/ingest/sinks/base.py
Normal file
@ -0,0 +1,12 @@
|
||||
from aiokit import AioThing
|
||||
|
||||
|
||||
class BaseSink(AioThing):
    """Base class for destinations that processed job data is sent to."""

    def __init__(self):
        super().__init__()

    async def send(self, data: bytes):
        """Deliver ``data`` to the sink; must be overridden by subclasses.

        Declared as a coroutine because callers await it (``BaseJob.start``)
        and concrete sinks such as ``KafkaSink`` implement it with
        ``async def``.

        Raises:
            NotImplementedError: always, when awaited on the base class.
        """
        raise NotImplementedError()

    async def on_shutdown(self):
        """Shutdown hook; the base implementation does nothing."""
        pass
|
23
nexus/ingest/sinks/kafka.py
Normal file
23
nexus/ingest/sinks/kafka.py
Normal file
@ -0,0 +1,23 @@
|
||||
import asyncio
|
||||
from typing import Iterable
|
||||
|
||||
from aiokafka import AIOKafkaProducer
|
||||
|
||||
from .base import BaseSink
|
||||
|
||||
|
||||
class KafkaSink(BaseSink):
    """Sink that publishes every payload to a single Kafka topic."""

    def __init__(self, kafka_hosts: Iterable[str], topic_name: str):
        """Create the producer and register it to be started with the sink.

        Args:
            kafka_hosts: passed to the producer as ``bootstrap_servers``.
            topic_name: topic every payload is sent to.
        """
        super().__init__()
        event_loop = asyncio.get_event_loop()
        self.producer = AIOKafkaProducer(
            loop=event_loop,
            bootstrap_servers=kafka_hosts,
        )
        self.topic_name = topic_name
        self.starts.append(self.producer)

    async def send(self, data: bytes):
        """Send ``data`` to the configured topic and wait for the send to complete."""
        topic = self.topic_name
        await self.producer.send_and_wait(topic, data)
|
4
nexus/models/README.md
Normal file
4
nexus/models/README.md
Normal file
@ -0,0 +1,4 @@
|
||||
# Nexus Models
|
||||
|
||||
## Warning
|
||||
Do not rely heavily on the format; it is still subject to redesign and experimentation.
|
0
nexus/models/__init__.py
Normal file
0
nexus/models/__init__.py
Normal file
24
nexus/models/proto/BUILD.bazel
Normal file
24
nexus/models/proto/BUILD.bazel
Normal file
@ -0,0 +1,24 @@
|
||||
load("@com_github_grpc_grpc//bazel:python_rules.bzl", "py_proto_library")
|
||||
load("@io_bazel_rules_rust//proto:proto.bzl", "rust_proto_library")
|
||||
load("@rules_proto//proto:defs.bzl", "proto_library")
|
||||
|
||||
proto_library(
|
||||
name = "models_proto",
|
||||
srcs = glob([
|
||||
"*.proto",
|
||||
]),
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
py_proto_library(
|
||||
name = "models_proto_py",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":models_proto"],
|
||||
)
|
||||
|
||||
rust_proto_library(
|
||||
name = "models_proto_rust",
|
||||
rust_deps = ["//rules/rust/cargo:protobuf"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":models_proto"],
|
||||
)
|
26
nexus/models/proto/operation.proto
Normal file
26
nexus/models/proto/operation.proto
Normal file
@ -0,0 +1,26 @@
|
||||
syntax = "proto3";
|
||||
package nexus.models.proto;
|
||||
|
||||
import "nexus/models/proto/typed_document.proto";
|
||||
|
||||
|
||||
message CrossReferenceOperation {
|
||||
string source = 1;
|
||||
string target = 2;
|
||||
uint32 last_retry_unixtime = 3;
|
||||
uint32 retry_count = 4;
|
||||
}
|
||||
|
||||
message DocumentOperation {
|
||||
oneof operation {
|
||||
UpdateDocument update_document = 3;
|
||||
};
|
||||
}
|
||||
|
||||
message UpdateDocument {
|
||||
repeated string fields = 1;
|
||||
bool should_fill_from_external_source = 2;
|
||||
bool commit = 3;
|
||||
bool reindex = 4;
|
||||
TypedDocument typed_document = 5;
|
||||
}
|
212
nexus/models/proto/operation_pb2.py
Normal file
212
nexus/models/proto/operation_pb2.py
Normal file
@ -0,0 +1,212 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: nexus/models/proto/operation.proto
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from google.protobuf import reflection as _reflection
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
from nexus.models.proto import \
|
||||
typed_document_pb2 as nexus_dot_models_dot_proto_dot_typed__document__pb2
|
||||
|
||||
DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
name='nexus/models/proto/operation.proto',
|
||||
package='nexus.models.proto',
|
||||
syntax='proto3',
|
||||
serialized_options=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
serialized_pb=b'\n\"nexus/models/proto/operation.proto\x12\x12nexus.models.proto\x1a\'nexus/models/proto/typed_document.proto\"k\n\x17\x43rossReferenceOperation\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0e\n\x06target\x18\x02 \x01(\t\x12\x1b\n\x13last_retry_unixtime\x18\x03 \x01(\r\x12\x13\n\x0bretry_count\x18\x04 \x01(\r\"_\n\x11\x44ocumentOperation\x12=\n\x0fupdate_document\x18\x03 \x01(\x0b\x32\".nexus.models.proto.UpdateDocumentH\x00\x42\x0b\n\toperation\"\xa6\x01\n\x0eUpdateDocument\x12\x0e\n\x06\x66ields\x18\x01 \x03(\t\x12(\n should_fill_from_external_source\x18\x02 \x01(\x08\x12\x0e\n\x06\x63ommit\x18\x03 \x01(\x08\x12\x0f\n\x07reindex\x18\x04 \x01(\x08\x12\x39\n\x0etyped_document\x18\x05 \x01(\x0b\x32!.nexus.models.proto.TypedDocumentb\x06proto3'
|
||||
,
|
||||
dependencies=[nexus_dot_models_dot_proto_dot_typed__document__pb2.DESCRIPTOR,])
|
||||
|
||||
|
||||
|
||||
|
||||
_CROSSREFERENCEOPERATION = _descriptor.Descriptor(
|
||||
name='CrossReferenceOperation',
|
||||
full_name='nexus.models.proto.CrossReferenceOperation',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='source', full_name='nexus.models.proto.CrossReferenceOperation.source', index=0,
|
||||
number=1, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='target', full_name='nexus.models.proto.CrossReferenceOperation.target', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='last_retry_unixtime', full_name='nexus.models.proto.CrossReferenceOperation.last_retry_unixtime', index=2,
|
||||
number=3, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='retry_count', full_name='nexus.models.proto.CrossReferenceOperation.retry_count', index=3,
|
||||
number=4, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
serialized_options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=99,
|
||||
serialized_end=206,
|
||||
)
|
||||
|
||||
|
||||
_DOCUMENTOPERATION = _descriptor.Descriptor(
|
||||
name='DocumentOperation',
|
||||
full_name='nexus.models.proto.DocumentOperation',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='update_document', full_name='nexus.models.proto.DocumentOperation.update_document', index=0,
|
||||
number=3, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
serialized_options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
_descriptor.OneofDescriptor(
|
||||
name='operation', full_name='nexus.models.proto.DocumentOperation.operation',
|
||||
index=0, containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[]),
|
||||
],
|
||||
serialized_start=208,
|
||||
serialized_end=303,
|
||||
)
|
||||
|
||||
|
||||
_UPDATEDOCUMENT = _descriptor.Descriptor(
|
||||
name='UpdateDocument',
|
||||
full_name='nexus.models.proto.UpdateDocument',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='fields', full_name='nexus.models.proto.UpdateDocument.fields', index=0,
|
||||
number=1, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='should_fill_from_external_source', full_name='nexus.models.proto.UpdateDocument.should_fill_from_external_source', index=1,
|
||||
number=2, type=8, cpp_type=7, label=1,
|
||||
has_default_value=False, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='commit', full_name='nexus.models.proto.UpdateDocument.commit', index=2,
|
||||
number=3, type=8, cpp_type=7, label=1,
|
||||
has_default_value=False, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='reindex', full_name='nexus.models.proto.UpdateDocument.reindex', index=3,
|
||||
number=4, type=8, cpp_type=7, label=1,
|
||||
has_default_value=False, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='typed_document', full_name='nexus.models.proto.UpdateDocument.typed_document', index=4,
|
||||
number=5, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
serialized_options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
],
|
||||
serialized_start=306,
|
||||
serialized_end=472,
|
||||
)
|
||||
|
||||
_DOCUMENTOPERATION.fields_by_name['update_document'].message_type = _UPDATEDOCUMENT
|
||||
_DOCUMENTOPERATION.oneofs_by_name['operation'].fields.append(
|
||||
_DOCUMENTOPERATION.fields_by_name['update_document'])
|
||||
_DOCUMENTOPERATION.fields_by_name['update_document'].containing_oneof = _DOCUMENTOPERATION.oneofs_by_name['operation']
|
||||
_UPDATEDOCUMENT.fields_by_name['typed_document'].message_type = nexus_dot_models_dot_proto_dot_typed__document__pb2._TYPEDDOCUMENT
|
||||
DESCRIPTOR.message_types_by_name['CrossReferenceOperation'] = _CROSSREFERENCEOPERATION
|
||||
DESCRIPTOR.message_types_by_name['DocumentOperation'] = _DOCUMENTOPERATION
|
||||
DESCRIPTOR.message_types_by_name['UpdateDocument'] = _UPDATEDOCUMENT
|
||||
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
|
||||
|
||||
CrossReferenceOperation = _reflection.GeneratedProtocolMessageType('CrossReferenceOperation', (_message.Message,), {
|
||||
'DESCRIPTOR' : _CROSSREFERENCEOPERATION,
|
||||
'__module__' : 'nexus.models.proto.operation_pb2'
|
||||
# @@protoc_insertion_point(class_scope:nexus.models.proto.CrossReferenceOperation)
|
||||
})
|
||||
_sym_db.RegisterMessage(CrossReferenceOperation)
|
||||
|
||||
DocumentOperation = _reflection.GeneratedProtocolMessageType('DocumentOperation', (_message.Message,), {
|
||||
'DESCRIPTOR' : _DOCUMENTOPERATION,
|
||||
'__module__' : 'nexus.models.proto.operation_pb2'
|
||||
# @@protoc_insertion_point(class_scope:nexus.models.proto.DocumentOperation)
|
||||
})
|
||||
_sym_db.RegisterMessage(DocumentOperation)
|
||||
|
||||
UpdateDocument = _reflection.GeneratedProtocolMessageType('UpdateDocument', (_message.Message,), {
|
||||
'DESCRIPTOR' : _UPDATEDOCUMENT,
|
||||
'__module__' : 'nexus.models.proto.operation_pb2'
|
||||
# @@protoc_insertion_point(class_scope:nexus.models.proto.UpdateDocument)
|
||||
})
|
||||
_sym_db.RegisterMessage(UpdateDocument)
|
||||
|
||||
|
||||
# @@protoc_insertion_point(module_scope)
|
38
nexus/models/proto/scimag.proto
Normal file
38
nexus/models/proto/scimag.proto
Normal file
@ -0,0 +1,38 @@
|
||||
syntax = "proto3";
|
||||
package nexus.models.proto;
|
||||
|
||||
message Scimag {
|
||||
int64 id = 1;
|
||||
string abstract = 2;
|
||||
repeated string authors = 3;
|
||||
string container_title = 11;
|
||||
string doi = 4;
|
||||
uint32 downloads_count = 27;
|
||||
bytes embedding = 22;
|
||||
uint32 filesize = 5;
|
||||
uint32 first_page = 6;
|
||||
repeated string ipfs_multihashes = 31;
|
||||
bool is_deleted = 7;
|
||||
repeated string issns = 25;
|
||||
string issue = 10;
|
||||
oneof optional_issued_at {
|
||||
int64 issued_at = 26;
|
||||
}
|
||||
uint32 journal_id = 12;
|
||||
string language = 13;
|
||||
uint32 last_page = 14;
|
||||
string meta_language = 15;
|
||||
string md5 = 16;
|
||||
int32 ref_by_count = 23;
|
||||
repeated string references = 28;
|
||||
oneof optional_scimag_bulk_id {
|
||||
int32 scimag_bulk_id = 24;
|
||||
}
|
||||
repeated string tags = 17;
|
||||
string telegram_file_id = 18;
|
||||
string title = 19;
|
||||
string type = 29;
|
||||
int32 updated_at = 20;
|
||||
string volume = 21;
|
||||
string year = 30;
|
||||
}
|
283
nexus/models/proto/scimag_pb2.py
Normal file
283
nexus/models/proto/scimag_pb2.py
Normal file
@ -0,0 +1,283 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: nexus/models/proto/scimag.proto
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from google.protobuf import reflection as _reflection
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
name='nexus/models/proto/scimag.proto',
|
||||
package='nexus.models.proto',
|
||||
syntax='proto3',
|
||||
serialized_options=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
serialized_pb=b'\n\x1fnexus/models/proto/scimag.proto\x12\x12nexus.models.proto\"\xd9\x04\n\x06Scimag\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x10\n\x08\x61\x62stract\x18\x02 \x01(\t\x12\x0f\n\x07\x61uthors\x18\x03 \x03(\t\x12\x17\n\x0f\x63ontainer_title\x18\x0b \x01(\t\x12\x0b\n\x03\x64oi\x18\x04 \x01(\t\x12\x17\n\x0f\x64ownloads_count\x18\x1b \x01(\r\x12\x11\n\tembedding\x18\x16 \x01(\x0c\x12\x10\n\x08\x66ilesize\x18\x05 \x01(\r\x12\x12\n\nfirst_page\x18\x06 \x01(\r\x12\x18\n\x10ipfs_multihashes\x18\x1f \x03(\t\x12\x12\n\nis_deleted\x18\x07 \x01(\x08\x12\r\n\x05issns\x18\x19 \x03(\t\x12\r\n\x05issue\x18\n \x01(\t\x12\x13\n\tissued_at\x18\x1a \x01(\x03H\x00\x12\x12\n\njournal_id\x18\x0c \x01(\r\x12\x10\n\x08language\x18\r \x01(\t\x12\x11\n\tlast_page\x18\x0e \x01(\r\x12\x15\n\rmeta_language\x18\x0f \x01(\t\x12\x0b\n\x03md5\x18\x10 \x01(\t\x12\x14\n\x0cref_by_count\x18\x17 \x01(\x05\x12\x12\n\nreferences\x18\x1c \x03(\t\x12\x18\n\x0escimag_bulk_id\x18\x18 \x01(\x05H\x01\x12\x0c\n\x04tags\x18\x11 \x03(\t\x12\x18\n\x10telegram_file_id\x18\x12 \x01(\t\x12\r\n\x05title\x18\x13 \x01(\t\x12\x0c\n\x04type\x18\x1d \x01(\t\x12\x12\n\nupdated_at\x18\x14 \x01(\x05\x12\x0e\n\x06volume\x18\x15 \x01(\t\x12\x0c\n\x04year\x18\x1e \x01(\tB\x14\n\x12optional_issued_atB\x19\n\x17optional_scimag_bulk_idb\x06proto3'
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
_SCIMAG = _descriptor.Descriptor(
|
||||
name='Scimag',
|
||||
full_name='nexus.models.proto.Scimag',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='id', full_name='nexus.models.proto.Scimag.id', index=0,
|
||||
number=1, type=3, cpp_type=2, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='abstract', full_name='nexus.models.proto.Scimag.abstract', index=1,
|
||||
number=2, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='authors', full_name='nexus.models.proto.Scimag.authors', index=2,
|
||||
number=3, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='container_title', full_name='nexus.models.proto.Scimag.container_title', index=3,
|
||||
number=11, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='doi', full_name='nexus.models.proto.Scimag.doi', index=4,
|
||||
number=4, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='downloads_count', full_name='nexus.models.proto.Scimag.downloads_count', index=5,
|
||||
number=27, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='embedding', full_name='nexus.models.proto.Scimag.embedding', index=6,
|
||||
number=22, type=12, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"",
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='filesize', full_name='nexus.models.proto.Scimag.filesize', index=7,
|
||||
number=5, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='first_page', full_name='nexus.models.proto.Scimag.first_page', index=8,
|
||||
number=6, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='ipfs_multihashes', full_name='nexus.models.proto.Scimag.ipfs_multihashes', index=9,
|
||||
number=31, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='is_deleted', full_name='nexus.models.proto.Scimag.is_deleted', index=10,
|
||||
number=7, type=8, cpp_type=7, label=1,
|
||||
has_default_value=False, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='issns', full_name='nexus.models.proto.Scimag.issns', index=11,
|
||||
number=25, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='issue', full_name='nexus.models.proto.Scimag.issue', index=12,
|
||||
number=10, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='issued_at', full_name='nexus.models.proto.Scimag.issued_at', index=13,
|
||||
number=26, type=3, cpp_type=2, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='journal_id', full_name='nexus.models.proto.Scimag.journal_id', index=14,
|
||||
number=12, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='language', full_name='nexus.models.proto.Scimag.language', index=15,
|
||||
number=13, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='last_page', full_name='nexus.models.proto.Scimag.last_page', index=16,
|
||||
number=14, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='meta_language', full_name='nexus.models.proto.Scimag.meta_language', index=17,
|
||||
number=15, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='md5', full_name='nexus.models.proto.Scimag.md5', index=18,
|
||||
number=16, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='ref_by_count', full_name='nexus.models.proto.Scimag.ref_by_count', index=19,
|
||||
number=23, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='references', full_name='nexus.models.proto.Scimag.references', index=20,
|
||||
number=28, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='scimag_bulk_id', full_name='nexus.models.proto.Scimag.scimag_bulk_id', index=21,
|
||||
number=24, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='tags', full_name='nexus.models.proto.Scimag.tags', index=22,
|
||||
number=17, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='telegram_file_id', full_name='nexus.models.proto.Scimag.telegram_file_id', index=23,
|
||||
number=18, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='title', full_name='nexus.models.proto.Scimag.title', index=24,
|
||||
number=19, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='type', full_name='nexus.models.proto.Scimag.type', index=25,
|
||||
number=29, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='updated_at', full_name='nexus.models.proto.Scimag.updated_at', index=26,
|
||||
number=20, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='volume', full_name='nexus.models.proto.Scimag.volume', index=27,
|
||||
number=21, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='year', full_name='nexus.models.proto.Scimag.year', index=28,
|
||||
number=30, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
serialized_options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
_descriptor.OneofDescriptor(
|
||||
name='optional_issued_at', full_name='nexus.models.proto.Scimag.optional_issued_at',
|
||||
index=0, containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[]),
|
||||
_descriptor.OneofDescriptor(
|
||||
name='optional_scimag_bulk_id', full_name='nexus.models.proto.Scimag.optional_scimag_bulk_id',
|
||||
index=1, containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[]),
|
||||
],
|
||||
serialized_start=56,
|
||||
serialized_end=657,
|
||||
)
|
||||
|
||||
_SCIMAG.oneofs_by_name['optional_issued_at'].fields.append(
|
||||
_SCIMAG.fields_by_name['issued_at'])
|
||||
_SCIMAG.fields_by_name['issued_at'].containing_oneof = _SCIMAG.oneofs_by_name['optional_issued_at']
|
||||
_SCIMAG.oneofs_by_name['optional_scimag_bulk_id'].fields.append(
|
||||
_SCIMAG.fields_by_name['scimag_bulk_id'])
|
||||
_SCIMAG.fields_by_name['scimag_bulk_id'].containing_oneof = _SCIMAG.oneofs_by_name['optional_scimag_bulk_id']
|
||||
DESCRIPTOR.message_types_by_name['Scimag'] = _SCIMAG
|
||||
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
|
||||
|
||||
Scimag = _reflection.GeneratedProtocolMessageType('Scimag', (_message.Message,), {
|
||||
'DESCRIPTOR' : _SCIMAG,
|
||||
'__module__' : 'nexus.models.proto.scimag_pb2'
|
||||
# @@protoc_insertion_point(class_scope:nexus.models.proto.Scimag)
|
||||
})
|
||||
_sym_db.RegisterMessage(Scimag)
|
||||
|
||||
|
||||
# @@protoc_insertion_point(module_scope)
|
36
nexus/models/proto/scitech.proto
Normal file
36
nexus/models/proto/scitech.proto
Normal file
@ -0,0 +1,36 @@
|
||||
syntax = "proto3";
|
||||
package nexus.models.proto;
|
||||
|
||||
// Scitech describes a single record of the SciTech collection.
//
// Fields are declared roughly in alphabetical order, not in tag order, so the
// tag numbers below are intentionally non-sequential. Tags 26 and 27 are not
// used by any field in this message.
// NOTE(review): presumably 26 and 27 belonged to removed fields; if so they
// should be declared `reserved` so they are never reused — confirm.
message Scitech {
|
||||
int64 id = 1;
|
||||
repeated string authors = 2;
|
||||
string cu = 3;
|
||||
string cu_suf = 4;
|
||||
string description = 5;
|
||||
string doi = 6;
|
||||
uint32 downloads_count = 28;
|
||||
string edition = 7;
|
||||
string extension = 8;
|
||||
int64 fiction_id = 9;
|
||||
uint64 filesize = 10;
|
||||
repeated string ipfs_multihashes = 30;
|
||||
bool is_deleted = 11;
|
||||
repeated string isbns = 12;
|
||||
bool has_duplicates = 31;
|
||||
// Single-member oneof: presumably used so that an unset issued_at can be
// told apart from an explicit 0 (check with WhichOneof) — confirm.
oneof optional_issued_at {
|
||||
int64 issued_at = 25;
|
||||
}
|
||||
string language = 13;
|
||||
int64 libgen_id = 14;
|
||||
string meta_language = 15;
|
||||
string md5 = 16;
|
||||
int64 original_id = 23;
|
||||
uint32 pages = 17;
|
||||
string series = 18;
|
||||
repeated string tags = 19;
|
||||
string telegram_file_id = 20;
|
||||
string title = 21;
|
||||
int32 updated_at = 22; // NOTE(review): 32-bit, while issued_at is int64; if this is a Unix timestamp it overflows in 2038 — confirm.
|
||||
string volume = 24;
|
||||
string year = 29;
|
||||
}
|
275
nexus/models/proto/scitech_pb2.py
Normal file
275
nexus/models/proto/scitech_pb2.py
Normal file
@ -0,0 +1,275 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: nexus/models/proto/scitech.proto
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from google.protobuf import reflection as _reflection
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
name='nexus/models/proto/scitech.proto',
|
||||
package='nexus.models.proto',
|
||||
syntax='proto3',
|
||||
serialized_options=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
serialized_pb=b'\n nexus/models/proto/scitech.proto\x12\x12nexus.models.proto\"\xad\x04\n\x07Scitech\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x0f\n\x07\x61uthors\x18\x02 \x03(\t\x12\n\n\x02\x63u\x18\x03 \x01(\t\x12\x0e\n\x06\x63u_suf\x18\x04 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x05 \x01(\t\x12\x0b\n\x03\x64oi\x18\x06 \x01(\t\x12\x17\n\x0f\x64ownloads_count\x18\x1c \x01(\r\x12\x0f\n\x07\x65\x64ition\x18\x07 \x01(\t\x12\x11\n\textension\x18\x08 \x01(\t\x12\x12\n\nfiction_id\x18\t \x01(\x03\x12\x10\n\x08\x66ilesize\x18\n \x01(\x04\x12\x18\n\x10ipfs_multihashes\x18\x1e \x03(\t\x12\x12\n\nis_deleted\x18\x0b \x01(\x08\x12\r\n\x05isbns\x18\x0c \x03(\t\x12\x16\n\x0ehas_duplicates\x18\x1f \x01(\x08\x12\x13\n\tissued_at\x18\x19 \x01(\x03H\x00\x12\x10\n\x08language\x18\r \x01(\t\x12\x11\n\tlibgen_id\x18\x0e \x01(\x03\x12\x15\n\rmeta_language\x18\x0f \x01(\t\x12\x0b\n\x03md5\x18\x10 \x01(\t\x12\x13\n\x0boriginal_id\x18\x17 \x01(\x03\x12\r\n\x05pages\x18\x11 \x01(\r\x12\x0e\n\x06series\x18\x12 \x01(\t\x12\x0c\n\x04tags\x18\x13 \x03(\t\x12\x18\n\x10telegram_file_id\x18\x14 \x01(\t\x12\r\n\x05title\x18\x15 \x01(\t\x12\x12\n\nupdated_at\x18\x16 \x01(\x05\x12\x0e\n\x06volume\x18\x18 \x01(\t\x12\x0c\n\x04year\x18\x1d \x01(\tB\x14\n\x12optional_issued_atb\x06proto3'
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
_SCITECH = _descriptor.Descriptor(
|
||||
name='Scitech',
|
||||
full_name='nexus.models.proto.Scitech',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='id', full_name='nexus.models.proto.Scitech.id', index=0,
|
||||
number=1, type=3, cpp_type=2, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='authors', full_name='nexus.models.proto.Scitech.authors', index=1,
|
||||
number=2, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='cu', full_name='nexus.models.proto.Scitech.cu', index=2,
|
||||
number=3, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='cu_suf', full_name='nexus.models.proto.Scitech.cu_suf', index=3,
|
||||
number=4, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='description', full_name='nexus.models.proto.Scitech.description', index=4,
|
||||
number=5, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='doi', full_name='nexus.models.proto.Scitech.doi', index=5,
|
||||
number=6, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='downloads_count', full_name='nexus.models.proto.Scitech.downloads_count', index=6,
|
||||
number=28, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='edition', full_name='nexus.models.proto.Scitech.edition', index=7,
|
||||
number=7, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='extension', full_name='nexus.models.proto.Scitech.extension', index=8,
|
||||
number=8, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='fiction_id', full_name='nexus.models.proto.Scitech.fiction_id', index=9,
|
||||
number=9, type=3, cpp_type=2, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='filesize', full_name='nexus.models.proto.Scitech.filesize', index=10,
|
||||
number=10, type=4, cpp_type=4, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='ipfs_multihashes', full_name='nexus.models.proto.Scitech.ipfs_multihashes', index=11,
|
||||
number=30, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='is_deleted', full_name='nexus.models.proto.Scitech.is_deleted', index=12,
|
||||
number=11, type=8, cpp_type=7, label=1,
|
||||
has_default_value=False, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='isbns', full_name='nexus.models.proto.Scitech.isbns', index=13,
|
||||
number=12, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='has_duplicates', full_name='nexus.models.proto.Scitech.has_duplicates', index=14,
|
||||
number=31, type=8, cpp_type=7, label=1,
|
||||
has_default_value=False, default_value=False,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='issued_at', full_name='nexus.models.proto.Scitech.issued_at', index=15,
|
||||
number=25, type=3, cpp_type=2, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='language', full_name='nexus.models.proto.Scitech.language', index=16,
|
||||
number=13, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='libgen_id', full_name='nexus.models.proto.Scitech.libgen_id', index=17,
|
||||
number=14, type=3, cpp_type=2, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='meta_language', full_name='nexus.models.proto.Scitech.meta_language', index=18,
|
||||
number=15, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='md5', full_name='nexus.models.proto.Scitech.md5', index=19,
|
||||
number=16, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='original_id', full_name='nexus.models.proto.Scitech.original_id', index=20,
|
||||
number=23, type=3, cpp_type=2, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='pages', full_name='nexus.models.proto.Scitech.pages', index=21,
|
||||
number=17, type=13, cpp_type=3, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='series', full_name='nexus.models.proto.Scitech.series', index=22,
|
||||
number=18, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='tags', full_name='nexus.models.proto.Scitech.tags', index=23,
|
||||
number=19, type=9, cpp_type=9, label=3,
|
||||
has_default_value=False, default_value=[],
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='telegram_file_id', full_name='nexus.models.proto.Scitech.telegram_file_id', index=24,
|
||||
number=20, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='title', full_name='nexus.models.proto.Scitech.title', index=25,
|
||||
number=21, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='updated_at', full_name='nexus.models.proto.Scitech.updated_at', index=26,
|
||||
number=22, type=5, cpp_type=1, label=1,
|
||||
has_default_value=False, default_value=0,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='volume', full_name='nexus.models.proto.Scitech.volume', index=27,
|
||||
number=24, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='year', full_name='nexus.models.proto.Scitech.year', index=28,
|
||||
number=29, type=9, cpp_type=9, label=1,
|
||||
has_default_value=False, default_value=b"".decode('utf-8'),
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
serialized_options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
_descriptor.OneofDescriptor(
|
||||
name='optional_issued_at', full_name='nexus.models.proto.Scitech.optional_issued_at',
|
||||
index=0, containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[]),
|
||||
],
|
||||
serialized_start=57,
|
||||
serialized_end=614,
|
||||
)
|
||||
|
||||
_SCITECH.oneofs_by_name['optional_issued_at'].fields.append(
|
||||
_SCITECH.fields_by_name['issued_at'])
|
||||
_SCITECH.fields_by_name['issued_at'].containing_oneof = _SCITECH.oneofs_by_name['optional_issued_at']
|
||||
DESCRIPTOR.message_types_by_name['Scitech'] = _SCITECH
|
||||
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
|
||||
|
||||
Scitech = _reflection.GeneratedProtocolMessageType('Scitech', (_message.Message,), {
|
||||
'DESCRIPTOR' : _SCITECH,
|
||||
'__module__' : 'nexus.models.proto.scitech_pb2'
|
||||
# @@protoc_insertion_point(class_scope:nexus.models.proto.Scitech)
|
||||
})
|
||||
_sym_db.RegisterMessage(Scitech)
|
||||
|
||||
|
||||
# @@protoc_insertion_point(module_scope)
|
12
nexus/models/proto/typed_document.proto
Normal file
12
nexus/models/proto/typed_document.proto
Normal file
@ -0,0 +1,12 @@
|
||||
syntax = "proto3";
|
||||
package nexus.models.proto;
|
||||
|
||||
import "nexus/models/proto/scimag.proto";
|
||||
import "nexus/models/proto/scitech.proto";
|
||||
|
||||
// Envelope that carries one document of either supported kind.
message TypedDocument {
  // The two fields below share a oneof, so at most one is set at a time.
  oneof document {
    // Message type imported from scimag.proto.
    Scimag scimag = 1;
    // Message type imported from scitech.proto.
    Scitech scitech = 2;
  }
}
|
95
nexus/models/proto/typed_document_pb2.py
Normal file
95
nexus/models/proto/typed_document_pb2.py
Normal file
@ -0,0 +1,95 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: nexus/models/proto/typed_document.proto
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import message as _message
|
||||
from google.protobuf import reflection as _reflection
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
from nexus.models.proto import \
|
||||
scimag_pb2 as nexus_dot_models_dot_proto_dot_scimag__pb2
|
||||
from nexus.models.proto import \
|
||||
scitech_pb2 as nexus_dot_models_dot_proto_dot_scitech__pb2
|
||||
|
||||
DESCRIPTOR = _descriptor.FileDescriptor(
|
||||
name='nexus/models/proto/typed_document.proto',
|
||||
package='nexus.models.proto',
|
||||
syntax='proto3',
|
||||
serialized_options=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
serialized_pb=b'\n\'nexus/models/proto/typed_document.proto\x12\x12nexus.models.proto\x1a\x1fnexus/models/proto/scimag.proto\x1a nexus/models/proto/scitech.proto\"y\n\rTypedDocument\x12,\n\x06scimag\x18\x01 \x01(\x0b\x32\x1a.nexus.models.proto.ScimagH\x00\x12.\n\x07scitech\x18\x02 \x01(\x0b\x32\x1b.nexus.models.proto.ScitechH\x00\x42\n\n\x08\x64ocumentb\x06proto3'
|
||||
,
|
||||
dependencies=[nexus_dot_models_dot_proto_dot_scimag__pb2.DESCRIPTOR,nexus_dot_models_dot_proto_dot_scitech__pb2.DESCRIPTOR,])
|
||||
|
||||
|
||||
|
||||
|
||||
_TYPEDDOCUMENT = _descriptor.Descriptor(
|
||||
name='TypedDocument',
|
||||
full_name='nexus.models.proto.TypedDocument',
|
||||
filename=None,
|
||||
file=DESCRIPTOR,
|
||||
containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[
|
||||
_descriptor.FieldDescriptor(
|
||||
name='scimag', full_name='nexus.models.proto.TypedDocument.scimag', index=0,
|
||||
number=1, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
_descriptor.FieldDescriptor(
|
||||
name='scitech', full_name='nexus.models.proto.TypedDocument.scitech', index=1,
|
||||
number=2, type=11, cpp_type=10, label=1,
|
||||
has_default_value=False, default_value=None,
|
||||
message_type=None, enum_type=None, containing_type=None,
|
||||
is_extension=False, extension_scope=None,
|
||||
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
|
||||
],
|
||||
extensions=[
|
||||
],
|
||||
nested_types=[],
|
||||
enum_types=[
|
||||
],
|
||||
serialized_options=None,
|
||||
is_extendable=False,
|
||||
syntax='proto3',
|
||||
extension_ranges=[],
|
||||
oneofs=[
|
||||
_descriptor.OneofDescriptor(
|
||||
name='document', full_name='nexus.models.proto.TypedDocument.document',
|
||||
index=0, containing_type=None,
|
||||
create_key=_descriptor._internal_create_key,
|
||||
fields=[]),
|
||||
],
|
||||
serialized_start=130,
|
||||
serialized_end=251,
|
||||
)
|
||||
|
||||
_TYPEDDOCUMENT.fields_by_name['scimag'].message_type = nexus_dot_models_dot_proto_dot_scimag__pb2._SCIMAG
|
||||
_TYPEDDOCUMENT.fields_by_name['scitech'].message_type = nexus_dot_models_dot_proto_dot_scitech__pb2._SCITECH
|
||||
_TYPEDDOCUMENT.oneofs_by_name['document'].fields.append(
|
||||
_TYPEDDOCUMENT.fields_by_name['scimag'])
|
||||
_TYPEDDOCUMENT.fields_by_name['scimag'].containing_oneof = _TYPEDDOCUMENT.oneofs_by_name['document']
|
||||
_TYPEDDOCUMENT.oneofs_by_name['document'].fields.append(
|
||||
_TYPEDDOCUMENT.fields_by_name['scitech'])
|
||||
_TYPEDDOCUMENT.fields_by_name['scitech'].containing_oneof = _TYPEDDOCUMENT.oneofs_by_name['document']
|
||||
DESCRIPTOR.message_types_by_name['TypedDocument'] = _TYPEDDOCUMENT
|
||||
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
|
||||
|
||||
TypedDocument = _reflection.GeneratedProtocolMessageType('TypedDocument', (_message.Message,), {
|
||||
'DESCRIPTOR' : _TYPEDDOCUMENT,
|
||||
'__module__' : 'nexus.models.proto.typed_document_pb2'
|
||||
# @@protoc_insertion_point(class_scope:nexus.models.proto.TypedDocument)
|
||||
})
|
||||
_sym_db.RegisterMessage(TypedDocument)
|
||||
|
||||
|
||||
# @@protoc_insertion_point(module_scope)
|
20
nexus/nlptools/BUILD.bazel
Normal file
20
nexus/nlptools/BUILD.bazel
Normal file
@ -0,0 +1,20 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
# Library target containing every Python module of this package except tests.
py_library(
    name = "nlptools",
    srcs = glob(
        ["**/*.py"],
        # Everything under tests/ is left out of the library sources.
        exclude = ["tests/**"],
    ),
    srcs_version = "PY3",
    visibility = ["//visibility:public"],
    # Third-party pip packages resolved through requirement().
    deps = [
        requirement("DAWG"),
        requirement("emoji"),
        requirement("lemminflect"),
        requirement("pycld3"),
        requirement("pymorphy2"),
        requirement("spacy"),
    ],
)
|
0
nexus/nlptools/__init__.py
Normal file
0
nexus/nlptools/__init__.py
Normal file
9
nexus/nlptools/language_detect.py
Normal file
9
nexus/nlptools/language_detect.py
Normal file
@ -0,0 +1,9 @@
|
||||
import cld3
|
||||
|
||||
|
||||
def detect_language(text: str) -> str:
    """Return the language code cld3 predicts for *text*.

    Codes ending in ``-Latn`` are reduced to their first two characters.
    ``None`` is returned when cld3 yields no prediction or flags the
    prediction as unreliable.
    """
    guess = cld3.get_language(text)
    if not guess or not guess.is_reliable:
        return None
    code = guess.language
    return code[:2] if code.endswith('-Latn') else code
|
24
nexus/nlptools/languages.py
Normal file
24
nexus/nlptools/languages.py
Normal file
@ -0,0 +1,24 @@
|
||||
import enum
|
||||
|
||||
|
||||
class Language(enum.IntEnum):
    """Integer identifiers for the language codes known to this package.

    Member names are two-letter language codes; ``unknown_language`` (0) is
    the only non-code member.
    """
    # Zero is reserved for "no language".
    unknown_language = 0
    # Remaining members are listed alphabetically by code with values 1..19.
    am = 1
    ar = 2
    bn = 3
    de = 4
    en = 5
    es = 6
    fa = 7
    fr = 8
    hi = 9
    id = 10
    it = 11
    ja = 12
    ms = 13
    pt = 14
    ru = 15
    tg = 16
    uk = 17
    uz = 18
    zh = 19
|
50
nexus/nlptools/morph.py
Normal file
50
nexus/nlptools/morph.py
Normal file
@ -0,0 +1,50 @@
|
||||
import math
|
||||
|
||||
import lemminflect # noqa
|
||||
import pymorphy2
|
||||
import spacy
|
||||
|
||||
|
||||
class EnglishMorphology:
    """Derive inflected forms of an English word.

    Uses a spaCy pipeline whose tokens expose ``_.inflect(tag)``; this file
    imports ``lemminflect`` for that extension.
    """
    # Tag groups; a word is inflected into every tag of the group(s)
    # that contain its own tag.
    VERBS = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    ADJS = {'JJ', 'JJR', 'JJS'}
    NOUNS = {'NN', 'NNP', 'NNPS', 'NNS'}
    ADVERBS = {'RB', 'RBR', 'RBS'}

    WORD_KINDS = [VERBS, ADJS, NOUNS, ADVERBS]

    def __init__(self, name):
        """Load the spaCy pipeline called *name*."""
        self.nlp = spacy.load(name)

    def derive_forms(self, word):
        """Return the sorted list of distinct forms derived from *word*.

        Only the first token of the parsed input is considered. If no
        inflection is produced, the token text itself is returned as the
        single form. Input that parses to zero tokens yields an empty list.
        """
        doc = self.nlp(word)
        if not len(doc):
            # Nothing to inflect; indexing [0] would raise IndexError.
            return []
        token = doc[0]
        forms = set()
        for kind in self.WORD_KINDS:
            if token.tag_ not in kind:
                continue
            for tag in kind:
                # Inflect once and reuse the result.
                inflection = token._.inflect(tag)
                if inflection:
                    forms.add(inflection)
        if not forms and token:
            forms.add(str(token))
        return sorted(forms)
|
||||
|
||||
|
||||
class RussianMorphology:
    """Derive weighted word forms from the lexeme of a pymorphy2 parse."""

    def __init__(self):
        """Create the pymorphy2 analyzer used by :meth:`derive_forms`."""
        self.morph_analyzer = pymorphy2.MorphAnalyzer()

    def derive_forms(self, word):
        """Return sorted ``'<form>^<weight>'`` strings for *word*.

        Forms come from the lexeme of the first parse. The form equal to
        *word* gets weight 1.00; every other form gets
        ``1 / log1p(len(lexeme))``. Forms tagged ``Abbr`` are skipped.
        """
        # Read the lexeme once instead of on every loop iteration.
        lexemes = self.morph_analyzer.parse(word)[0].lexeme
        # Same weight for all non-identical forms; guard avoids 1 / log1p(0).
        other_coef = 1.0 / math.log1p(len(lexemes)) if lexemes else 1.0
        words = set()
        for lexeme in lexemes:
            if 'Abbr' in lexeme.tag:
                continue
            coef = 1.0 if lexeme.word == word else other_coef
            words.add(f'{lexeme.word}^{coef:.2f}')
        return sorted(words)
|
32
nexus/nlptools/regex.py
Normal file
32
nexus/nlptools/regex.py
Normal file
@ -0,0 +1,32 @@
|
||||
import re
|
||||
|
||||
from emoji import get_emoji_regexp
|
||||
|
||||
# Run of characters that are neither whitespace nor word characters.
ALNUMWHITESPACE_REGEX = re.compile(r'([^\s\w])+')
# Three groups: local part, domain, and a 2-5 letter final label.
EMAIL_REGEX = re.compile(r'([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})')
# Pattern provided by the `emoji` package.
EMOJI_REGEX = get_emoji_regexp()
# Group 1: the run of '#'/'@' markers; group 2: the name that follows.
HASHTAG_REGEX = re.compile(r'([#@]+)([A-Za-z0-9_]+)')
# Any run of whitespace.
MULTIWHITESPACE_REGEX = re.compile(r"\s+")
# Anchored at both ends around a single character class, so it matches a
# string consisting of one code point from the listed ranges.
STICKER_REGEX = re.compile(
    '^[\U0001F1E0-\U0001F1FF'
    '\U0001F300-\U0001F5FF'
    '\U0001F600-\U0001F64F'
    '\U0001F680-\U0001F6FF'
    '\U0001F700-\U0001F77F'
    '\U0001F780-\U0001F7FF'
    '\U0001F800-\U0001F8FF'
    '\U0001F900-\U0001F9FF'
    '\U0001FA00-\U0001FA6F'
    '\U0001FA70-\U0001FAFF'
    '\U00002702-\U000027B0]$',
    flags=re.UNICODE,
)
# Anchored with ^ and $: the scheme is optional, "://" is required, and the
# rest of the string must be free of whitespace.
URL_REGEX = re.compile(r'^(https?|ftp)?:\/\/[^\s\/$.?#]+\.[^\s]*$')
|
||||
HIDDEN_CHAR = ''
|
||||
# Optional scheme followed by t.me/<name>; t.me/joinchat/... links are excluded.
TELEGRAM_LINK_REGEX = re.compile('(?:https?://)?t\\.me/(?!joinchat/)([A-Za-z0-9_]+)')

# Group 1: "10.<4-9 digits>" prefix; group 2: suffix. The dot is escaped so
# that only a literal "10." is accepted (an unescaped dot let "1001234/..."
# pass as a DOI).
DOI_REGEX = re.compile(r'(10\.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])')
# Whole-string match: optional "ISBN" label, optional 978/979 prefix, then
# nine digits and a final digit or X.
ISBN_REGEX = re.compile(r'^(?:[iI][sS][bB][nN]\:?\s*)?((97(8|9))?\-?\d{9}(\d|X))$')
# 32 hexadecimal characters.
MD5_REGEX = re.compile(r'([A-Fa-f0-9]{32})')
# "NID" label in any letter case followed by digits (captured).
NID_REGEX = re.compile(r'(?:[Nn][Ii][Dd]\s?:?\s*)([0-9]+)')
# PubMed URL or "PMID" label followed by digits (captured). Host dots are
# escaped so that they match only a literal ".".
PUBMED_ID_REGEX = re.compile(r'(?:(?:https?://)?(?:www\.)?ncbi\.nlm\.nih\.gov/pubmed/|[Pp][Mm][Ii][Dd]\s?:?\s*)([0-9]+)')
|
12
nexus/nlptools/tests/BUILD.bazel
Normal file
12
nexus/nlptools/tests/BUILD.bazel
Normal file
@ -0,0 +1,12 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_test")
|
||||
|
||||
# Test target for the nlptools package; test.py is the entry point.
py_test(
    name = "tests",
    srcs = glob(["**/*.py"]),
    main = "test.py",
    deps = [
        requirement("pytest"),
        # The library under test.
        "//nexus/nlptools",
    ],
)
|
0
nexus/nlptools/tests/__init__.py
Normal file
0
nexus/nlptools/tests/__init__.py
Normal file
7
nexus/nlptools/tests/test.py
Normal file
7
nexus/nlptools/tests/test.py
Normal file
@ -0,0 +1,7 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
if __name__ == '__main__':
    # Run pytest on the directory holding this file and exit with its status.
    tests_dir = os.path.dirname(__file__)
    pytest_args = [tests_dir, '-vvv', '-W', 'ignore::DeprecationWarning']
    sys.exit(pytest.main(pytest_args))
|
20
nexus/nlptools/tests/test_utils.py
Normal file
20
nexus/nlptools/tests/test_utils.py
Normal file
@ -0,0 +1,20 @@
|
||||
from nexus.nlptools.utils import (
|
||||
cast_string_to_single_string,
|
||||
despace,
|
||||
remove_hashtags,
|
||||
)
|
||||
|
||||
|
||||
def test_cast_string_to_single_string():
    """Spaces and newlines between words end up as single hyphens."""
    raw = 'kek kek 123\nkek'
    expected = 'kek-kek-123-kek'
    assert cast_string_to_single_string(raw) == expected
|
||||
|
||||
|
||||
def test_despace():
    """A run of newlines is collapsed into a single newline."""
    raw = (
        'ArXiv Papers Related to Computer Science, AI , Deep Learning, Computer Vision, NLP, etc\n\n\n'
        'From: @ai_python'
    )
    expected = 'ArXiv Papers Related to Computer Science, AI , Deep Learning, Computer Vision, NLP, etc\nFrom: @ai_python'
    assert despace(raw) == expected
|
||||
|
||||
|
||||
def test_remove_hashtags():
    """The hashtag is removed; the space that followed it stays."""
    stripped = remove_hashtags('#ny riot')
    assert stripped == ' riot'
|
109
nexus/nlptools/utils.py
Normal file
109
nexus/nlptools/utils.py
Normal file
@ -0,0 +1,109 @@
|
||||
import re
|
||||
import struct
|
||||
import unicodedata
|
||||
|
||||
from .regex import (
|
||||
ALNUMWHITESPACE_REGEX,
|
||||
EMAIL_REGEX,
|
||||
EMOJI_REGEX,
|
||||
HASHTAG_REGEX,
|
||||
MULTIWHITESPACE_REGEX,
|
||||
TELEGRAM_LINK_REGEX,
|
||||
URL_REGEX,
|
||||
)
|
||||
|
||||
|
||||
def add_surrogate(text):
    """Replace every code point above the BMP with its UTF-16 surrogate pair.

    SMP -> Surrogate Pairs (Telegram offsets are calculated with these).
    See https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview for more.
    """
    pieces = []
    for char in text:
        if 0x10000 <= ord(char) <= 0x10FFFF:
            # Two little-endian 16-bit units: high surrogate, then low.
            high, low = struct.unpack('<HH', char.encode('utf-16le'))
            pieces.append(chr(high) + chr(low))
        else:
            pieces.append(char)
    return ''.join(pieces)
|
||||
|
||||
|
||||
def cast_string_to_single_string(s):
    """Turn *s* into a single hyphen-joined token.

    Matches of ALNUMWHITESPACE_REGEX become spaces, whitespace runs are
    collapsed, the ends are stripped and the remaining spaces become '-'.
    """
    without_symbols = ALNUMWHITESPACE_REGEX.sub(' ', s)
    single_spaced = MULTIWHITESPACE_REGEX.sub(' ', without_symbols)
    return single_spaced.strip().replace(' ', '-')
|
||||
|
||||
|
||||
def clean_text(text):
    """Strip emoji, markdown and URLs from *text*, then compact whitespace."""
    # Order matters: each step receives the output of the previous one.
    for step in (remove_emoji, remove_markdown, remove_url, despace_smart):
        text = step(text)
    return text.strip()
|
||||
|
||||
|
||||
def despace(text):
    """Collapse newline runs and space/tab runs, keeping line structure."""
    # Applied in order: newline runs, space/tab runs, then indentation
    # left right after a newline.
    substitutions = (
        (r'\n+', '\n'),
        (r'[ \t]+', ' '),
        (r'\n[ \t]+', '\n'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
|
||||
|
||||
|
||||
def despace_full(text):
    """Replace every whitespace run with one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
||||
|
||||
|
||||
def despace_smart(text):
    """Flatten multi-line text into one line of sentence-like fragments.

    Leading list markers ('-' or '•') are dropped, blank lines removed,
    each remaining line break becomes '. ' and whitespace is collapsed.
    """
    without_bullets = re.sub(r'\n\s*[-•]+\s*', r'\n', text)
    single_breaks = re.sub(r'\n{2,}', r'\n', without_bullets).strip()
    as_sentences = re.sub(r'\.?(\s+)?\n', r'. ', single_breaks)
    return re.sub(r'\s+', ' ', as_sentences)
|
||||
|
||||
|
||||
def escape_format(text):
    """Neutralise markup characters in *text*.

    Doubled '_' and '*' are halved, backticks become apostrophes and square
    brackets are wrapped in backticks.
    """
    # Order matters: backticks are rewritten before new ones are added
    # around the brackets.
    replacements = (
        ("__", "_"),
        ("**", "*"),
        ("`", "'"),
        ('[', r'`[`'),
        (']', r'`]`'),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
|
||||
|
||||
|
||||
def remove_markdown(text):
    """Strip emphasis markers, backticks and link targets from *text*.

    For ``[label](target)`` only the label (and its trailing whitespace)
    is kept.
    """
    without_emphasis = re.sub('[*_~]{2,}', '', text)
    without_backticks = re.sub('[`]+', '', without_emphasis)
    return re.sub(
        r'\[\s*(.*?)(\s*)\]\(.*?\)',
        r'\g<1>\g<2>',
        without_backticks,
        flags=re.MULTILINE,
    )
|
||||
|
||||
|
||||
def normalize_string(string):
    """Return an ASCII slug of *string*.

    Accented characters are folded to their base letters (NFKD, then
    non-ASCII dropped), the text is lowercased and trimmed, spaces become
    '-' and anything outside ``[a-zA-Z0-9_-]`` is removed.
    """
    # Fold to ASCII first: running the ASCII-only filter before the NFKD
    # step deleted accented letters outright ("café" -> "caf").
    ascii_only = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')
    slug = ascii_only.lower().strip().replace(' ', '-')
    return re.sub('[^a-zA-Z0-9_\\-]+', '', slug)
|
||||
|
||||
|
||||
def remove_emails(text):
    """Delete every EMAIL_REGEX match from *text*."""
    return EMAIL_REGEX.sub('', text)
|
||||
|
||||
|
||||
def remove_emoji(text):
    """Delete EMOJI_REGEX matches and U+FE0F characters from *text*."""
    without_emoji = EMOJI_REGEX.sub('', text)
    # U+FE0F is removed separately from the emoji pattern.
    return without_emoji.replace(u'\ufe0f', '')
|
||||
|
||||
|
||||
def remove_hashtags(text):
    """Delete every HASHTAG_REGEX match (marker and name) from *text*."""
    return HASHTAG_REGEX.sub('', text)
|
||||
|
||||
|
||||
def remove_url(text):
    """Delete URL_REGEX matches from *text*."""
    return URL_REGEX.sub('', text)
|
||||
|
||||
|
||||
def replace_telegram_link(text):
    """Rewrite each TELEGRAM_LINK_REGEX match as ``@<captured name>``."""
    return TELEGRAM_LINK_REGEX.sub(r'@\1', text)
|
||||
|
||||
|
||||
def split_at(s, pos):
    """Truncate *s* near *pos* and append '...'.

    Strings shorter than *pos* are returned unchanged. Otherwise the cut is
    made at the first separator found in a 20-character window starting
    10 characters before *pos*, or at the window start if none is found.
    """
    if len(s) < pos:
        return s
    start = max(0, pos - 10)
    stop = min(start + 20, len(s) - 1)
    separators = (' ', '\n', '.', ',', ':', ';', '-')
    cut = next((i for i in range(start, stop) if s[i] in separators), start)
    return s[:cut] + '...'
|
||||
|
||||
|
||||
def unwind_hashtags(text):
    """Drop the '#'/'@' markers of each HASHTAG_REGEX match, keeping the name."""
    return HASHTAG_REGEX.sub(r'\2', text)
|
37
nexus/pipe/BUILD.bazel
Normal file
37
nexus/pipe/BUILD.bazel
Normal file
@ -0,0 +1,37 @@
|
||||
load("@io_bazel_rules_docker//python3:image.bzl", "py3_image")
|
||||
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
|
||||
# Short name that points at the `:image.binary` target.
alias(
    name = "binary",
    actual = ":image.binary",
)

# Python 3 container image for the pipe; main.py is the entry point.
py3_image(
    name = "image",
    srcs = glob(["**/*.py"]),
    base = "//images/production:base-python-image",
    # Config files shipped with the image as data dependencies.
    data = [
        "configs/base.yaml",
        "configs/logging.yaml",
    ],
    main = "main.py",
    srcs_version = "PY3ONLY",
    visibility = ["//visibility:public"],
    # Mix of pip requirements and in-repo library targets.
    deps = [
        requirement("aiokafka"),
        requirement("orjson"),
        requirement("pypika"),
        requirement("aiocrossref"),
        requirement("aiokit"),
        "//library/aiopostgres",
        "//library/configurator",
        "//library/logging",
        "//nexus/actions",
        "//nexus/models/proto:models_proto_py",
        "//nexus/summa/schema",
        requirement("aiosumma"),
        requirement("izihawa_utils"),
    ],
)
|
||||
|
102
nexus/pipe/README.md
Normal file
102
nexus/pipe/README.md
Normal file
@ -0,0 +1,102 @@
|
||||
# Nexus Pipe
|
||||
|
||||
`Pipe` processes a Kafka queue of operations. The `configs` subdirectory has been
removed from this version because the configs depend heavily on the network infrastructure you are using.
You have to write your own configs, taking the example below into account.
|
||||
|
||||
## Sample `configs/base.yaml`
|
||||
|
||||
```yaml
|
||||
---
|
||||
log_path: '/var/log/nexus-pipe/{{ ENV_TYPE }}'
|
||||
pipe:
|
||||
brokers: |
|
||||
kafka-0.example.net,
|
||||
kafka-1.example.net
|
||||
schema:
|
||||
- consumers:
|
||||
- class: nexus.pipe.consumers.CrossReferencesBulkConsumer
|
||||
topics:
|
||||
- name: cross_references
|
||||
workers: 4
|
||||
group_id: pipe
|
||||
processors:
|
||||
- class: nexus.pipe.processors.CrossReferencesProcessor
|
||||
kwargs:
|
||||
brokers: |
|
||||
kafka-0.example.net,
|
||||
kafka-1.example.net
|
||||
database:
|
||||
database: nexus
|
||||
host: postgres.example.net
|
||||
password: '{{ DATABASE_PASSWORD }}'
|
||||
username: '{{ DATABASE_USERNAME }}'
|
||||
- consumers:
|
||||
- class: nexus.pipe.consumers.DocumentOperationsJsonConsumer
|
||||
topics:
|
||||
- name: operations
|
||||
workers: 2
|
||||
- class: nexus.pipe.consumers.DocumentOperationsConsumer
|
||||
topics:
|
||||
- name: operations_binary_hp
|
||||
workers: 4
|
||||
- name: operations_binary
|
||||
workers: 14
|
||||
group_id: pipe
|
||||
processors:
|
||||
- class: nexus.pipe.processors.ActionProcessor
|
||||
kwargs:
|
||||
actions:
|
||||
- class: nexus.actions.FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction
|
||||
kwargs:
|
||||
crossref:
|
||||
rps: 50
|
||||
user_agent: 'ScienceLegion/1.0 (Linux x86_64; ) ScienceLegion/1.0.0'
|
||||
- class: nexus.actions.CleanDocumentOperationUpdateDocumentScimagPbAction
|
||||
- class: nexus.actions.SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction
|
||||
kwargs:
|
||||
database:
|
||||
database: nexus
|
||||
host: postgres.example.net
|
||||
password: '{{ DATABASE_PASSWORD }}'
|
||||
username: '{{ DATABASE_USERNAME }}'
|
||||
- class: nexus.actions.SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction
|
||||
kwargs:
|
||||
brokers: |
|
||||
kafka-0.example.net,
|
||||
kafka-1.example.net
|
||||
topic: cross_references
|
||||
- class: nexus.actions.SendDocumentOperationUpdateDocumentPbToSummaAction
|
||||
kwargs:
|
||||
summa:
|
||||
base_url: http://summa.example.net
|
||||
timeout: 15
|
||||
ttl_dns_cache: 30
|
||||
filter:
|
||||
class: nexus.pipe.filters.DocumentOperationFilter
|
||||
kwargs:
|
||||
document: scimag
|
||||
operation: update_document
|
||||
- class: nexus.pipe.processors.ActionProcessor
|
||||
kwargs:
|
||||
actions:
|
||||
- class: nexus.actions.CleanDocumentOperationUpdateDocumentScitechPbAction
|
||||
- class: nexus.actions.SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction
|
||||
kwargs:
|
||||
database:
|
||||
database: nexus
|
||||
host: postgres.example.net
|
||||
password: '{{ DATABASE_PASSWORD }}'
|
||||
username: '{{ DATABASE_USERNAME }}'
|
||||
- class: nexus.actions.SendDocumentOperationUpdateDocumentPbToSummaAction
|
||||
kwargs:
|
||||
summa:
|
||||
base_url: http://summa.example.net
|
||||
timeout: 15
|
||||
ttl_dns_cache: 30
|
||||
filter:
|
||||
class: nexus.pipe.filters.DocumentOperationFilter
|
||||
kwargs:
|
||||
document: scitech
|
||||
operation: update_document
|
||||
```
|
6
nexus/pipe/__init__.py
Normal file
6
nexus/pipe/__init__.py
Normal file
@ -0,0 +1,6 @@
|
||||
from . import (
|
||||
consumers,
|
||||
processors,
|
||||
)
|
||||
|
||||
__all__ = ['consumers', 'processors']
|
17
nexus/pipe/consumers/__init__.py
Normal file
17
nexus/pipe/consumers/__init__.py
Normal file
@ -0,0 +1,17 @@
|
||||
from .cross_references_consumer import (
|
||||
CrossReferencesBulkConsumer,
|
||||
CrossReferencesConsumer,
|
||||
)
|
||||
from .document_operations_consumer import (
|
||||
DocumentOperationsBulkConsumer,
|
||||
DocumentOperationsConsumer,
|
||||
DocumentOperationsJsonConsumer,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'CrossReferencesBulkConsumer',
|
||||
'CrossReferencesConsumer',
|
||||
'DocumentOperationsConsumer',
|
||||
'DocumentOperationsBulkConsumer',
|
||||
'DocumentOperationsJsonConsumer',
|
||||
]
|
142
nexus/pipe/consumers/base.py
Normal file
142
nexus/pipe/consumers/base.py
Normal file
@ -0,0 +1,142 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import (
|
||||
List,
|
||||
Union,
|
||||
)
|
||||
|
||||
import orjson as json
|
||||
from aiokafka import AIOKafkaConsumer
|
||||
from aiokafka.errors import (
|
||||
CommitFailedError,
|
||||
ConsumerStoppedError,
|
||||
)
|
||||
from aiokit import AioRootThing
|
||||
from google.protobuf.json_format import ParseDict
|
||||
from nexus.actions.exceptions import (
|
||||
ConflictError,
|
||||
InterruptProcessing,
|
||||
)
|
||||
from nexus.pipe.processors.base import Processor
|
||||
|
||||
|
||||
class BaseConsumer(AioRootThing):
    """Kafka consumer that feeds each message through a list of processors.

    Offsets are committed manually (auto-commit is disabled in
    :meth:`create_consumer`).
    """

    def __init__(self, processors: List[Processor],
                 topic_names: Union[str, List[str]], bootstrap_servers: str, group_id: str):
        """Store the configuration; the Kafka consumer is created in :meth:`start`.

        :param processors: processors applied to every preprocessed message
        :param topic_names: one topic name or a list of them
        :param bootstrap_servers: passed through to ``AIOKafkaConsumer``
        :param group_id: Kafka consumer group id
        """
        super().__init__()
        self.processors = processors
        # Accept a bare string for the single-topic case.
        if isinstance(topic_names, str):
            topic_names = [topic_names]
        self.topic_names = topic_names
        self.bootstrap_servers = bootstrap_servers
        self.group_id = group_id
        self.consumer = None
        # Register processors in the `starts` list provided by the base class.
        self.starts.extend(self.processors)

    def create_consumer(self):
        """Build an ``AIOKafkaConsumer`` subscribed to ``self.topic_names``."""
        return AIOKafkaConsumer(
            *self.topic_names,
            auto_offset_reset='earliest',
            loop=asyncio.get_event_loop(),
            bootstrap_servers=self.bootstrap_servers,
            group_id=self.group_id,
            enable_auto_commit=False,
        )

    def preprocess(self, msg):
        """Hook for subclasses; the base version returns *msg* unchanged."""
        return msg

    async def start(self):
        """Consume messages one by one until the consumer is stopped."""
        logging.getLogger('statbox').info({
            'action': 'started',
            'group_id': self.group_id,
            'topic_names': self.topic_names,
        })
        self.consumer = self.create_consumer()
        await self.consumer.start()
        try:
            async for msg in self.consumer:
                preprocessed_msg = self.preprocess(msg)
                # Falsy preprocess results are not handed to processors.
                if preprocessed_msg:
                    for processor in self.processors:
                        if not processor.filter(preprocessed_msg):
                            continue
                        try:
                            await processor.process(preprocessed_msg)
                        except (ConflictError, InterruptProcessing) as e:
                            # Logged at info level and not re-raised.
                            logging.getLogger('statbox').info(e)
                        except Exception as e:
                            # Anything else is logged and propagated.
                            logging.getLogger('error').error(e)
                            raise
                try:
                    await self.consumer.commit()
                except CommitFailedError as e:
                    # A failed commit is logged; consumption goes on.
                    logging.getLogger('error').error(e)
        except ConsumerStoppedError:
            # Raised once the consumer has been stopped; ends the loop quietly.
            pass

    async def stop(self):
        """Stop the Kafka consumer if one was created."""
        if not self.consumer:
            return
        await self.consumer.stop()
|
||||
|
||||
|
||||
class BasePbConsumer(BaseConsumer):
    """Consumer that decodes each message value as a serialized protobuf."""
    # Protobuf message class to instantiate; None here, set by subclasses.
    pb_class = None

    def preprocess(self, msg) -> pb_class:
        """Return a new ``pb_class`` instance parsed from ``msg.value``."""
        parsed = self.pb_class()
        parsed.ParseFromString(msg.value)
        return parsed
|
||||
|
||||
|
||||
class BaseJsonConsumer(BaseConsumer):
    """Consumer that decodes each message value as JSON into a protobuf."""
    # Protobuf message class to instantiate; None here, set by subclasses.
    pb_class = None

    def preprocess(self, msg) -> pb_class:
        """Return a new ``pb_class`` instance filled from the JSON in ``msg.value``.

        Keys that do not correspond to a field of the message are ignored.
        """
        target = self.pb_class()
        payload = json.loads(msg.value)
        ParseDict(payload, target, ignore_unknown_fields=True)
        return target
|
||||
|
||||
|
||||
class BaseBulkConsumer(BaseConsumer):
    """Consumer that fetches messages in batches and processes them in bulk."""
    # Maximum number of records requested per getmany() call.
    bulk_size = 20
    # Seconds to wait for records; converted to milliseconds for getmany().
    timeout = 1

    async def start(self):
        """Fetch, preprocess and bulk-process batches while ``self.started``.

        Each processor receives the batch members accepted by its own
        ``filter``. Offsets are committed after every batch.
        """
        logging.getLogger('statbox').info({
            'action': 'started',
            'group_id': self.group_id,
            'topic_names': self.topic_names,
        })
        self.consumer = self.create_consumer()
        await self.consumer.start()
        while self.started:
            try:
                result = await self.consumer.getmany(timeout_ms=self.timeout * 1000, max_records=self.bulk_size)
            except ConsumerStoppedError:
                break
            collector = []
            # Only the message lists are needed, not the partition keys.
            for messages in result.values():
                for message in messages:
                    preprocessed_msg = self.preprocess(message)
                    if preprocessed_msg:
                        collector.append(preprocessed_msg)
            for processor in self.processors:
                # Materialize the selection: a lazy filter() iterator can be
                # consumed only once, so a process_bulk() that is re-invoked
                # (e.g. by a retry decorator) would see no messages.
                filtered = [message for message in collector if processor.filter(message)]
                try:
                    await processor.process_bulk(filtered)
                except InterruptProcessing as e:
                    # Logged at info level and not re-raised.
                    logging.getLogger('statbox').info(e)
                except Exception as e:
                    logging.getLogger('error').error(e)
                    raise
            try:
                await self.consumer.commit()
            except CommitFailedError as e:
                # A failed commit is logged; the loop moves on to the next batch.
                logging.getLogger('error').error(e)
|
15
nexus/pipe/consumers/cross_references_consumer.py
Normal file
15
nexus/pipe/consumers/cross_references_consumer.py
Normal file
@ -0,0 +1,15 @@
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
CrossReferenceOperation as CrossReferenceOperationPb
|
||||
|
||||
from .base import (
|
||||
BaseBulkConsumer,
|
||||
BasePbConsumer,
|
||||
)
|
||||
|
||||
|
||||
class CrossReferencesConsumer(BasePbConsumer):
    """Protobuf consumer whose messages are ``CrossReferenceOperation``."""
    pb_class = CrossReferenceOperationPb
|
||||
|
||||
|
||||
class CrossReferencesBulkConsumer(BaseBulkConsumer, CrossReferencesConsumer):
    """Combines ``BaseBulkConsumer`` with ``CrossReferencesConsumer``."""
    pass
|
20
nexus/pipe/consumers/document_operations_consumer.py
Normal file
20
nexus/pipe/consumers/document_operations_consumer.py
Normal file
@ -0,0 +1,20 @@
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
|
||||
from .base import (
|
||||
BaseBulkConsumer,
|
||||
BaseJsonConsumer,
|
||||
BasePbConsumer,
|
||||
)
|
||||
|
||||
|
||||
class DocumentOperationsConsumer(BasePbConsumer):
    """Protobuf consumer whose messages are ``DocumentOperation``."""
    pb_class = DocumentOperationPb
|
||||
|
||||
|
||||
class DocumentOperationsJsonConsumer(BaseJsonConsumer):
    """JSON consumer whose messages are parsed into ``DocumentOperation``."""
    pb_class = DocumentOperationPb
|
||||
|
||||
|
||||
class DocumentOperationsBulkConsumer(BaseBulkConsumer, DocumentOperationsConsumer):
    """Combines ``BaseBulkConsumer`` with ``DocumentOperationsConsumer``."""
    pass
|
16
nexus/pipe/filters/__init__.py
Normal file
16
nexus/pipe/filters/__init__.py
Normal file
@ -0,0 +1,16 @@
|
||||
from aiokit import AioThing
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
|
||||
|
||||
class DocumentOperationFilter(AioThing):
    """Accept document operations of one operation kind and one document kind."""

    def __init__(self, operation, document):
        """Remember the expected ``operation`` oneof name and document field name."""
        super().__init__()
        self.operation = operation
        self.document = document

    def filter(self, document_operation_pb: DocumentOperationPb) -> bool:
        """Return True when the message matches both configured names.

        The set member of the ``operation`` oneof must equal
        ``self.operation`` and its ``typed_document`` must have the
        ``self.document`` field set.
        """
        active = document_operation_pb.WhichOneof('operation')
        if active != self.operation:
            return False
        payload = getattr(document_operation_pb, active)
        return payload.typed_document.HasField(self.document)
|
65
nexus/pipe/main.py
Normal file
65
nexus/pipe/main.py
Normal file
@ -0,0 +1,65 @@
|
||||
import logging
|
||||
import ssl
|
||||
from functools import partial
|
||||
|
||||
from aiokit import MultiprocessAsyncExecutor
|
||||
from izihawa_utils.env import node_name
|
||||
from izihawa_utils.importlib import (
|
||||
import_object,
|
||||
instantiate_object,
|
||||
)
|
||||
from library.logging import configure_logging
|
||||
from nexus.pipe.configs import config
|
||||
|
||||
|
||||
def create_aiothing(consumer_cls, topic_names, group_id, processors, shard):
    """Instantiate *consumer_cls* with freshly built processors.

    :param consumer_cls: consumer class to construct
    :param topic_names: topic name(s) handed to the consumer
    :param group_id: Kafka consumer group id
    :param processors: processor specs passed to ``instantiate_object``
    :param shard: accepted but not used by this function
    """
    built_processors = [instantiate_object(spec) for spec in processors]
    consumer_kwargs = {
        'topic_names': topic_names,
        'processors': built_processors,
        'bootstrap_servers': config['pipe']['brokers'],
        'group_id': group_id,
    }
    return consumer_cls(**consumer_kwargs)
|
||||
|
||||
|
||||
# OpenSSL issue: https://github.com/psf/requests/issues/4775
def set_ssl_hack():
    """Build and return a TLS 1.2 context with a relaxed cipher configuration.

    The context is returned because nothing in this function installs it
    globally; without the return value the configured context would be
    unreachable and the call would have no effect.

    NOTE(review): callers that ignore the return value still get no change
    in SSL behaviour - confirm where the context is meant to be applied.
    """
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
    ssl_context.set_ciphers('HIGH:!DH:!aNULL')
    # This second call replaces the cipher list set on the previous line.
    ssl_context.set_ciphers('DEFAULT@SECLEVEL=1')
    return ssl_context
|
||||
|
||||
|
||||
def main():
    """Start one consumer factory per configured worker and run them all.

    Walks ``config['pipe']['schema']``. Entries whose ``node_names`` list is
    non-empty and does not contain this node's name are skipped.
    """
    configure_logging(config)
    set_ssl_hack()

    statbox = logging.getLogger('statbox')
    statbox.info({
        'action': 'started',
        'mode': 'startup',
    })

    create_aiothings = []
    for instance_config in config['pipe']['schema']:
        allowed_nodes = instance_config.get('node_names', [])
        if allowed_nodes and node_name not in allowed_nodes:
            continue
        for consumer_config in instance_config['consumers']:
            consumer_cls = import_object(consumer_config['class'])
            for topic_config in consumer_config['topics']:
                # One factory per worker of the topic.
                create_aiothings.extend(
                    partial(
                        create_aiothing,
                        consumer_cls,
                        topic_config['name'],
                        instance_config['group_id'],
                        instance_config['processors'],
                    )
                    for _ in range(topic_config['workers'])
                )

    executor = MultiprocessAsyncExecutor(create_aiothings=create_aiothings)
    executor.start()
    executor.join()


if __name__ == '__main__':
    main()
|
7
nexus/pipe/processors/__init__.py
Normal file
7
nexus/pipe/processors/__init__.py
Normal file
@ -0,0 +1,7 @@
|
||||
from .base import ActionProcessor
|
||||
from .cross_references_processor import CrossReferencesProcessor
|
||||
|
||||
__all__ = [
|
||||
'ActionProcessor',
|
||||
'CrossReferencesProcessor',
|
||||
]
|
41
nexus/pipe/processors/base.py
Normal file
41
nexus/pipe/processors/base.py
Normal file
@ -0,0 +1,41 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio.exceptions
|
||||
from typing import Iterable
|
||||
|
||||
from aiokit import AioThing
|
||||
from izihawa_utils.importlib import instantiate_object
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
wait_fixed,
|
||||
)
|
||||
|
||||
|
||||
class Processor(AioThing):
    """Base processor: accepts every message and passes it through unchanged."""

    def filter(self, message) -> bool:
        """Tell whether *message* should be processed; the base accepts all."""
        return True

    async def process(self, message):
        """Handle one message; the base returns it untouched."""
        return message

    async def process_bulk(self, messages: Iterable):
        """Process *messages* one after another via :meth:`process`."""
        for item in messages:
            await self.process(item)
|
||||
|
||||
|
||||
class ActionProcessor(Processor):
    """Processor that pipes a message through a configured chain of actions."""

    def __init__(self, actions, filter):
        """Instantiate the actions and the filter from their specs.

        :param actions: specs passed one by one to ``instantiate_object``
        :param filter: spec of the object whose ``filter`` method decides
            whether a message is processed
        """
        super().__init__()
        self.actions = [instantiate_object(action) for action in actions]
        self.filter_object = instantiate_object(filter)
        # Register the filter and the actions in the base-class `waits` list.
        self.waits.append(self.filter_object)
        self.waits.extend(self.actions)

    def filter(self, message) -> bool:
        """Delegate the decision to the configured filter object."""
        return self.filter_object.filter(message)

    @retry(retry=retry_if_exception_type(asyncio.exceptions.TimeoutError), wait=wait_fixed(5))
    async def process(self, message):
        """Run every action in order, feeding each the previous result.

        Retried every 5 seconds on ``asyncio`` timeouts. Returns the output
        of the last action, matching ``Processor.process``, which returns
        the message it handled.
        """
        for action in self.actions:
            message = await action.do(message)
        return message
|
145
nexus/pipe/processors/cross_references_processor.py
Normal file
145
nexus/pipe/processors/cross_references_processor.py
Normal file
@ -0,0 +1,145 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Iterable
|
||||
|
||||
import aiopg
|
||||
from aiokafka import AIOKafkaProducer
|
||||
from izihawa_utils.exceptions import NeedRetryError
|
||||
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
|
||||
from nexus.actions.common import canonize_doi
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
CrossReferenceOperation as CrossReferenceOperationPb
|
||||
from nexus.models.proto.operation_pb2 import \
|
||||
DocumentOperation as DocumentOperationPb
|
||||
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
|
||||
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
|
||||
from nexus.models.proto.typed_document_pb2 import \
|
||||
TypedDocument as TypedDocumentPb
|
||||
from pypika import (
|
||||
PostgreSQLQuery,
|
||||
Table,
|
||||
)
|
||||
from tenacity import (
|
||||
retry,
|
||||
retry_if_exception_type,
|
||||
wait_fixed,
|
||||
)
|
||||
|
||||
from .base import Processor
|
||||
|
||||
|
||||
class CrossReferencesProcessor(Processor):
    """Store DOI-to-DOI cross references as rows of the ``cross_references`` table.

    Every incoming ``CrossReferenceOperation`` names a ``source`` and a
    ``target`` DOI. When the target is already present in ``scimag`` the pair
    of document ids is inserted into ``cross_references``. Otherwise the
    message is sent back to Kafka for a later attempt and, on the first
    miss, an update operation for the target DOI is published as well.
    """
    scimag_table = Table('scimag')
    cross_references_table = Table('cross_references')
    # Topic that postponed and retried messages are published back to.
    topic = 'cross_references'
    # Topic that receives the update operation for a missing target document.
    operations_topic = 'operations_binary_hp'
    # Messages whose retry_count is greater than this are logged and dropped.
    max_retry_count = 1
    # Minimum number of seconds between two attempts for the same message.
    retry_interval_secs = 60
    # Pause after a bulk in which at least one message had to be postponed.
    postpone_delay_secs = 1.0

    def __init__(self, brokers, database):
        """Prepare the Postgres pool holder; the Kafka producer is created in ``start``.

        Args:
            brokers: passed to ``AIOKafkaProducer`` as ``bootstrap_servers``.
            database: mapping with the keys ``database``, ``username``,
                ``password`` and ``host`` used to build the Postgres DSN.
        """
        super().__init__()
        self.pool_holder = AioPostgresPoolHolder(
            fn=aiopg.create_pool,
            dsn=f'dbname={database["database"]} '
            f'user={database["username"]} '
            f'password={database["password"]} '
            f'host={database["host"]}',
            timeout=30,
            pool_recycle=60,
            maxsize=4,
        )
        self.brokers = brokers
        self.producer = None
        # NOTE(review): `waits` comes from the base class; presumably it lists
        # the resources started and stopped together with the processor.
        self.waits.append(self.pool_holder)

    async def start(self):
        """Create and start the Kafka producer."""
        self.producer = self.get_producer()
        await self.producer.start()

    async def stop(self):
        """Stop the Kafka producer; a no-op when the producer was never started."""
        # Detach first so the attribute is cleared even if stopping fails.
        producer, self.producer = self.producer, None
        if producer is not None:
            await producer.stop()

    def get_producer(self):
        """Build (but do not start) a Kafka producer bound to the current event loop."""
        return AIOKafkaProducer(
            loop=asyncio.get_event_loop(),
            bootstrap_servers=self.brokers,
        )

    async def _request_target_document(self, target):
        """Publish an update operation for the document with DOI ``target``."""
        document_operation = DocumentOperationPb(
            update_document=UpdateDocumentPb(
                commit=True,
                reindex=True,
                should_fill_from_external_source=True,
                typed_document=TypedDocumentPb(scimag=ScimagPb(doi=target)),
            ),
        )
        await self.producer.send_and_wait(
            self.operations_topic,
            document_operation.SerializeToString(),
        )

    async def _schedule_retry(self, message):
        """Republish a copy of ``message`` with a bumped retry counter and timestamp."""
        new_message = CrossReferenceOperationPb()
        new_message.CopyFrom(message)
        new_message.retry_count += 1
        new_message.last_retry_unixtime = int(time.time())
        await self.producer.send_and_wait(
            self.topic,
            new_message.SerializeToString(),
        )

    # The whole bulk is re-run every 15 seconds for as long as NeedRetryError
    # is raised; what raises it is not visible in this module.
    @retry(retry=retry_if_exception_type(NeedRetryError), wait=wait_fixed(15))
    async def process_bulk(self, messages: Iterable[CrossReferenceOperationPb]):
        """Resolve each cross reference in ``messages`` or reschedule it.

        For every message, in order:

        * ``retry_count`` above ``max_retry_count``: log a ``not_found``
          warning and drop the message.
        * last retry less than ``retry_interval_secs`` ago: publish the
          message back unchanged and remember to pause after the bulk.
        * target DOI missing from ``scimag``: on the first miss publish an
          update operation for the target, then publish the message back with
          an incremented ``retry_count``.
        * otherwise: insert ``(source_id, target_id)`` into
          ``cross_references``, ignoring already existing pairs.
        """
        need_delay = False
        for message in messages:
            if message.retry_count > self.max_retry_count:
                logging.getLogger('error').warning({
                    'status': 'error',
                    'error': 'not_found',
                    'source': message.source,
                    'target': message.target,
                })
                continue

            now = time.time()
            if now - message.last_retry_unixtime < self.retry_interval_secs:
                # Too early for another attempt: put the message back untouched.
                need_delay = True
                await self.producer.send_and_wait(
                    self.topic,
                    message.SerializeToString(),
                )
                continue

            source = canonize_doi(message.source)
            target = canonize_doi(message.target)
            target_row = await self.pool_holder.execute(
                PostgreSQLQuery
                .from_(self.scimag_table)
                .select('id')
                .where(self.scimag_table.doi == target)
                .get_sql(),
                fetch=True,
            )

            if not target_row:
                if message.retry_count == 0:
                    await self._request_target_document(target)
                await self._schedule_retry(message)
                continue

            target_id = target_row[0][0]
            # The source id is resolved by a subquery inside the INSERT itself.
            source_subquery = (
                PostgreSQLQuery
                .from_(self.scimag_table)
                .select('id')
                .where(self.scimag_table.doi == source)
            )
            await self.pool_holder.execute(
                PostgreSQLQuery
                .into(self.cross_references_table)
                .columns(
                    'source_id',
                    'target_id',
                )
                .insert(source_subquery, target_id)
                .on_conflict(self.cross_references_table.source_id, self.cross_references_table.target_id)
                .do_nothing()
                .get_sql()
            )
        if need_delay:
            # Postponed messages come straight back from the topic; pause so
            # that waiting for them does not turn into a busy loop.
            await asyncio.sleep(self.postpone_delay_secs)
|
3
nexus/summa/BUILD.bazel
Normal file
3
nexus/summa/BUILD.bazel
Normal file
@ -0,0 +1,3 @@
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
115
nexus/summa/README.md
Normal file
115
nexus/summa/README.md
Normal file
@ -0,0 +1,115 @@
|
||||
# Summa Setup Scripts
|
||||
|
||||
## Guide
|
||||
|
||||
#### 1. Find data dumps
|
||||
|
||||
Current version: `20210103.1`
|
||||
|
||||
| File | IPFS |
|
||||
| --------------------|:------------------------------------------------:|
|
||||
| `scitech.index.tar` | `QmVaWFRNTHC3ser4ViHybcD7nuhv2CUAorhXs4JbYYHYm7` |
|
||||
| `scitech.store.tar` | `QmP3p577gRokXXtusRYXXV7MtF3pVmGSdNEUE5TwFzRtAm` |
|
||||
| `scimag.index.tar` | `<upcoming>` |
|
||||
| `scimag.store.tar` | `<upcoming>` |
|
||||
|
||||
If the files are not available, ask the guys from beyond the blackwall.
|
||||
|
||||
#### 2. Deploy data dumps to Summa
|
||||
|
||||
```shell script
|
||||
bazel run -c opt installer -- import-to-summa \
|
||||
--store-filepath scimag.store.tar \
|
||||
--index-filepath scimag.index.tar \
|
||||
--schema-filepath schema/scimag.yaml \
|
||||
--database-path /tmp/summa
|
||||
bazel run -c opt installer -- import-to-summa \
|
||||
--store-filepath scitech.store.tar \
|
||||
--index-filepath scitech.index.tar \
|
||||
--schema-filepath schema/scitech.yaml \
|
||||
--database-path /tmp/summa
|
||||
```
|
||||
|
||||
#### 3. Launch Summa
|
||||
|
||||
```shell script
|
||||
docker run -e ENV_TYPE=production \
|
||||
-v /tmp/summa:/summa -v $(realpath configs/config.yaml):/summa/config.yaml \
|
||||
-p 50000:80 izihawa/summa:latest -c /summa/config.yaml
|
||||
```
|
||||
|
||||
#### 4. Use it
|
||||
|
||||
```shell script
|
||||
curl "localhost:50000/v1/scitech/search/?query=covid&page_size=2" | python3 -m json.tool
|
||||
```
|
||||
```json
|
||||
{
|
||||
"has_next": true,
|
||||
"scored_documents": [
|
||||
{
|
||||
"schema": "scitech",
|
||||
"document": {
|
||||
"authors": [
|
||||
"National committee for Management of COVID-19 Cases (Dubai Health Authority)"
|
||||
],
|
||||
"cu_suf": "g",
|
||||
"description": "Objectives\r\nThe objectives of this document are:\r\n\u2022 To provide guidance on clinical management of the COVID-19 infection\r\n\u2022 To provide a protocol on the practical steps to deal with COVID-19 cases\r\n\u2022 To detail the measures necessary to protect hospital staff, patients and visitors\r\n\u2022 This guideline is not intended to override the clinical decisions that will be made by clinicians providing individualized patient care.\r\n\u2022 This guideline will be updated as more information becomes available.\r\nIntroduction to Coronaviruses (CoV)\r\n\u2022 Corona virus is a large family of viruses that cause illness in humans and animals\r\n\u2022 In people, CoV can cause illness ranging in severity from the common cold to SARS.\r\n\u2022 SARS COV2 is one of seven types of known human coronaviruses. SARS COV2 like the MERS and SARS coronaviruses, likely evolved from a virus previously found in animals\r\n\u2022 The estimated incubation period is unknown and currently considered to be up to 14 days\r\nCase Definition:\r\nSuspected COVID-19 case is defined as:\r\n1. Please refer to the local health authority websites for updated information on local case definition.\r\nMOHAP, DoH, SEHA and DHA\r\nConfirmed COVID-19 is defined as:\r\nA person with confirmed positive COVID-19 test by a reference laboratory.",
|
||||
"extension": "pdf",
|
||||
"filesize": 2240001,
|
||||
"id": 100126757,
|
||||
"ipfs_multihashes": [
|
||||
"bafykbzacebasnsyh4sypqcojwmsd7ujw3ymogwhnx5vhywk7syptxovkyyzvk",
|
||||
"QmSd3tYXxJnWzm8vxpW1M6uxLhvBSpSLQd7cHjdsaoE38D"
|
||||
],
|
||||
"issued_at": 1577836800,
|
||||
"language": "en",
|
||||
"libgen_id": 2492432,
|
||||
"md5": "faf8bcab6ce58a59b3ed09f1e1d9270e",
|
||||
"tags": [
|
||||
"COVID-19 Treatment"
|
||||
],
|
||||
"title": "National Guidelines for Clinical Management and Treatment of COVID-19 (March 19, 2020) Version 1.1"
|
||||
},
|
||||
"score": 36.404663
|
||||
},
|
||||
{
|
||||
"schema": "scitech",
|
||||
"document": {
|
||||
"authors": [
|
||||
"Dr. Tinku Joseph, Dr. Mohammed Ashkan"
|
||||
],
|
||||
"cu_suf": "g",
|
||||
"description": "Corona virus comprises of a large family of viruses that are common in human beings as\r\nwell animals (camels, cattle, cats, and bats). There are seven different strains of corona\r\nvirus. [15]\r\n229E (alpha coronavirus)\r\nNL63 (alpha coronavirus)\r\nOC43 (beta coronavirus)\r\nHKU1 (beta coronavirus)\r\nMERS-CoV (the beta coronavirus that causes Middle East Respiratory Syndrome, or\r\nMERS)\r\nSARS-CoV (the beta coronavirus that causes severe acute respiratory syndrome, or\r\nSARS)\r\nSARS-CoV-2 (the novel coronavirus that causes coronavirus disease 2019, or\r\nCOVID-19)\r\nSometimes corona virus from animals infect people and spread further via human to human\r\ntransmission such as with MERS-CoV, SARS-CoV, and now with this COVID 19 (Corona\r\ndisease 2019). The virus that causes COVID-19 is designated severe acute respiratory\r\nsyndrome corona virus 2 (SARS-CoV-2); previously, referred to as 2019-nCoV.\r\nTowards December 2019, this novel corona virus was identified as a cause of upper and\r\nlower respiratory tract infections in Wuhan, a city in the Hubei Province of China. It rapidly\r\nspread, resulting in an epidemic throughout China and then gradually spreading to other\r\nparts of the world in pandemic proportions. It has affected almost every continent in this\r\nworld, except Antarctica. In February 2020, the World Health Organization designated the\r\ndisease COVID-19, which stands for corona virus disease 2019 [1].",
|
||||
"extension": "pdf",
|
||||
"filesize": 1512761,
|
||||
"id": 100110426,
|
||||
"issued_at": 1577836800,
|
||||
"language": "en",
|
||||
"libgen_id": 2494250,
|
||||
"md5": "23015d4934b216fe797b18b561267fe4",
|
||||
"pages": 43,
|
||||
"tags": [
|
||||
"COVID-19"
|
||||
],
|
||||
"title": "International Pulmonologist\u2019s Consensus on COVID-19"
|
||||
},
|
||||
"score": 32.969494
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### 5. (Optional) Deploy data dumps into your database
|
||||
|
||||
There is a function `work` in [`traversing script`](installer/scripts/iterate.py)
|
||||
that you can reimplement to iterate over the whole dataset and insert it into your
|
||||
own database or do whatever you want in parallel mode.
|
||||
|
||||
By default, this script just prints the documents.
|
||||
|
||||
```shell script
|
||||
bazel run -c opt installer -- iterate \
|
||||
--store-filepath scitech.store.tar \
|
||||
--schema-filepath schema/scitech.yaml
|
||||
```
|
0
nexus/summa/__init__.py
Normal file
0
nexus/summa/__init__.py
Normal file
14
nexus/summa/configs/config.yaml
Normal file
14
nexus/summa/configs/config.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
---
|
||||
|
||||
http:
|
||||
bind_addr: 0.0.0.0:80
|
||||
keep_alive_secs: 75
|
||||
max_body_size_mb: 32
|
||||
workers: 4
|
||||
log_path: /var/log/summa/{{ ENV_TYPE }}
|
||||
search_engine:
|
||||
data_path: /summa
|
||||
default_page_size: 5
|
||||
timeout_secs: 5
|
||||
writer_memory_mb: 4096
|
||||
writer_threads: 4
|
19
nexus/summa/installer/BUILD.bazel
Normal file
19
nexus/summa/installer/BUILD.bazel
Normal file
@ -0,0 +1,19 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_binary")
|
||||
|
||||
# Command-line installer: imports data dumps into Summa and iterates over them.
py_binary(
    name = "installer",
    srcs = glob([
        "**/*.py",
    ]),
    imports = ["."],
    main = "main.py",
    srcs_version = "PY3",
    visibility = ["//visibility:public"],
    deps = [
        requirement("psycopg2-binary"),
        requirement("fire"),
        # scripts/import_to_summa.py and scripts/iterate.py import yaml directly.
        requirement("pyyaml"),
        requirement("tantipy"),
        requirement("izihawa_utils"),
    ],
)
|
0
nexus/summa/installer/__init__.py
Normal file
0
nexus/summa/installer/__init__.py
Normal file
13
nexus/summa/installer/main.py
Normal file
13
nexus/summa/installer/main.py
Normal file
@ -0,0 +1,13 @@
|
||||
import time
|
||||
|
||||
import fire
|
||||
from nexus.summa.installer.scripts.import_to_summa import import_to_summa
|
||||
from nexus.summa.installer.scripts.iterate import iterate
|
||||
|
||||
if __name__ == '__main__':
    # perf_counter is monotonic, so the reported duration cannot be distorted
    # by wall-clock adjustments that happen while a command is running.
    started_at = time.perf_counter()
    # Expose the installer scripts as sub-commands of the command line.
    fire.Fire({
        'import-to-summa': import_to_summa,
        'iterate': iterate,
    })
    print(f'Elapsed {time.perf_counter() - started_at:.2f} secs')
|
0
nexus/summa/installer/scripts/__init__.py
Normal file
0
nexus/summa/installer/scripts/__init__.py
Normal file
9
nexus/summa/installer/scripts/common.py
Normal file
9
nexus/summa/installer/scripts/common.py
Normal file
@ -0,0 +1,9 @@
|
||||
import os
|
||||
|
||||
|
||||
def resolve_path(filepath):
    """Return an absolute version of ``filepath``.

    A leading ``~`` is expanded first, because shells leave it untouched in
    arguments of the form ``--flag=~/file``. Absolute paths are returned as
    they are. Relative paths are joined to ``BUILD_WORKING_DIRECTORY`` (set
    by ``bazel run`` to the directory the command was started from) or, when
    that variable is unset or empty, to the current working directory.
    """
    filepath = os.path.expanduser(filepath)
    if os.path.isabs(filepath):
        return filepath
    base_dir = os.environ.get('BUILD_WORKING_DIRECTORY') or os.getcwd()
    return os.path.join(base_dir, filepath)
|
0
nexus/summa/installer/scripts/download.py
Normal file
0
nexus/summa/installer/scripts/download.py
Normal file
26
nexus/summa/installer/scripts/import_to_summa.py
Normal file
26
nexus/summa/installer/scripts/import_to_summa.py
Normal file
@ -0,0 +1,26 @@
|
||||
import os
|
||||
import shutil
|
||||
import tarfile
|
||||
|
||||
import yaml
|
||||
from izihawa_utils.file import mkdir_p
|
||||
|
||||
from .common import resolve_path
|
||||
|
||||
|
||||
def _extract_archive(archive_path, destination):
    """Extract the tar archive at ``archive_path`` into ``destination``.

    Raises:
        ValueError: if a member, or the target of a link member, would end up
            outside ``destination``.
    """
    root = os.path.realpath(destination)
    with tarfile.open(archive_path) as archive:
        for member in archive.getmembers():
            member_path = os.path.join(root, member.name)
            checked_paths = [member_path]
            if member.issym():
                # Symbolic link targets are relative to the link's own directory.
                checked_paths.append(os.path.join(os.path.dirname(member_path), member.linkname))
            elif member.islnk():
                # Hard link targets are relative to the archive root.
                checked_paths.append(os.path.join(root, member.linkname))
            for path in checked_paths:
                if os.path.commonpath([root, os.path.realpath(path)]) != root:
                    raise ValueError(
                        f'Refusing to extract {member.name!r} from {archive_path}: '
                        f'it points outside of {destination}'
                    )
        archive.extractall(destination)


def import_to_summa(store_filepath, index_filepath, schema_filepath, database_path):
    """Unpack a store dump and an index dump into a Summa data directory.

    The schema file is copied to ``<database_path>/schema/`` and both archives
    are extracted into ``<database_path>/index/<name>``, where ``<name>`` is
    the ``name`` key of the schema file.

    Args:
        store_filepath: tar archive with the document store.
        index_filepath: tar archive with the index.
        schema_filepath: YAML schema describing the index.
        database_path: root of the Summa data directory.

    All paths are passed through ``resolve_path`` first.

    Raises:
        ValueError: if an archive contains a member that would be extracted
            outside of the index directory.
    """
    store_filepath = resolve_path(store_filepath)
    index_filepath = resolve_path(index_filepath)
    schema_filepath = resolve_path(schema_filepath)
    database_path = resolve_path(database_path)

    schema_dir = os.path.join(database_path, 'schema')
    index_dir = os.path.join(database_path, 'index')
    mkdir_p(schema_dir)
    mkdir_p(index_dir)
    shutil.copy(schema_filepath, os.path.join(schema_dir, os.path.basename(schema_filepath)))
    with open(schema_filepath, 'r') as f:
        index_path = os.path.join(index_dir, yaml.safe_load(f)['name'])

    # The dumps are downloaded files, so every member is validated before
    # anything is written to disk.
    for archive_path in (store_filepath, index_filepath):
        _extract_archive(archive_path, index_path)
|
51
nexus/summa/installer/scripts/iterate.py
Normal file
51
nexus/summa/installer/scripts/iterate.py
Normal file
@ -0,0 +1,51 @@
|
||||
import multiprocessing
|
||||
import tarfile
|
||||
from functools import partial
|
||||
|
||||
import yaml
|
||||
from izihawa_utils.itertools import ichunks
|
||||
from tantipy import (
|
||||
TantivyCoder,
|
||||
TantivyReader,
|
||||
)
|
||||
|
||||
from .common import resolve_path
|
||||
|
||||
|
||||
def work(document):
    """Handle one document; the default implementation prints it.

    `_do_work` calls this function for every document it reads from a store
    segment, so replacing the body is the way to plug in custom processing.
    """
    # ToDo: Replace this function to what you want to do with document
    print(document)
|
||||
|
||||
|
||||
def _do_work(coder, filepath, chunk_size, limit, member):
    """Read one segment from the store archive and pass its documents to ``work``.

    Args:
        coder: ``TantivyCoder`` used by ``TantivyReader`` to decode the segment.
        filepath: path of the tar archive that holds the segment.
        chunk_size: number of documents requested from ``ichunks`` at a time.
        limit: maximum number of documents to process from this segment;
            ``None`` disables the limit.
        member: tar member of the segment to process.
    """
    with tarfile.open(filepath, 'r') as tar_file:
        # The segment is read into memory at once, so the archive can be
        # closed before the documents are processed.
        data = tar_file.extractfile(member).read()
    print(f'Processing segment {member.name}, size: {len(data) / (1024 * 1024):.2f} Mb ...')
    reader = TantivyReader(data, coder=coder)
    for chunk_num, documents in enumerate(ichunks(reader.documents(), chunk_size)):
        for doc_num, document in enumerate(documents):
            # `>=` so that exactly `limit` documents are processed.
            if limit is not None and chunk_num * chunk_size + doc_num >= limit:
                print(f'Segment {member.name} early terminated due to limits')
                return
            work(document)
    print(f'Segment {member.name} successfully processed')
|
||||
|
||||
|
||||
def iterate(store_filepath, schema_filepath, processes=8, chunk_size=100, limit=1):
    """Process every store segment of a dump in a pool of worker processes.

    Args:
        store_filepath: tar archive with the document store.
        schema_filepath: YAML schema used to build the ``TantivyCoder``.
        processes: number of worker processes.
        chunk_size: forwarded to ``_do_work``.
        limit: forwarded to ``_do_work`` and applied to each segment
            separately.

    Both paths are passed through ``resolve_path`` first.
    """
    store_filepath = resolve_path(store_filepath)
    schema_filepath = resolve_path(schema_filepath)

    with open(schema_filepath) as schema_file:
        coder = TantivyCoder(yaml.safe_load(schema_file.read()))

    # Only members whose name ends with 'store' are processed.
    with tarfile.open(store_filepath, 'r') as tar_file:
        members = [member for member in tar_file.getmembers() if member.name.endswith('store')]

    print(f'Total segments: {len(members)}')
    # `map` blocks until every segment is done; the context manager then
    # releases the worker processes instead of leaving the pool open.
    with multiprocessing.Pool(processes) as pool:
        pool.map(partial(_do_work, coder, store_filepath, chunk_size, limit), members)
|
24
nexus/summa/schema/BUILD.bazel
Normal file
24
nexus/summa/schema/BUILD.bazel
Normal file
@ -0,0 +1,24 @@
|
||||
load("@pip_modules_external//:requirements.bzl", "requirement")
|
||||
load("@rules_python//python:defs.bzl", "py_library")
|
||||
|
||||
exports_files([
|
||||
"scimag.yaml",
|
||||
"scitech.yaml",
|
||||
])
|
||||
|
||||
py_library(
|
||||
name = "schema",
|
||||
srcs = glob([
|
||||
"**/*.py",
|
||||
]),
|
||||
data = [
|
||||
"scimag.yaml",
|
||||
"scitech.yaml",
|
||||
],
|
||||
srcs_version = "PY3",
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [
|
||||
requirement("tantipy"),
|
||||
requirement("pyyaml"),
|
||||
],
|
||||
)
|
7
nexus/summa/schema/__init__.py
Normal file
7
nexus/summa/schema/__init__.py
Normal file
@ -0,0 +1,7 @@
|
||||
from .scimag import scimag_coder
|
||||
from .scitech import scitech_coder
|
||||
|
||||
# Maps a schema name to the coder object created for it by the sibling
# `scimag` / `scitech` modules.
coders = {
    'scimag': scimag_coder,
    'scitech': scitech_coder,
}
|
5
nexus/summa/schema/scimag.py
Normal file
5
nexus/summa/schema/scimag.py
Normal file
@ -0,0 +1,5 @@
|
||||
import os

import yaml
from tantipy import TantivyCoder

# The schema file lives next to this module; resolving it from __file__ keeps
# the import independent of the process working directory.
_SCHEMA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'scimag.yaml')

with open(_SCHEMA_PATH) as file:
    scimag_coder = TantivyCoder(yaml.safe_load(file.read()))
|
122
nexus/summa/schema/scimag.yaml
Normal file
122
nexus/summa/schema/scimag.yaml
Normal file
@ -0,0 +1,122 @@
|
||||
---
|
||||
# yamllint disable rule:key-ordering
|
||||
default_fields: ["abstract", "authors", "language", "title", "tags", "year"]
|
||||
enabled: true
|
||||
key_field: "id"
|
||||
multi_fields: ["authors", "ipfs_multihashes", "issns", "references", "tags"]
|
||||
name: scimag
|
||||
schema:
|
||||
- name: id
|
||||
type: i64
|
||||
options:
|
||||
fast: single
|
||||
indexed: true
|
||||
stored: true
|
||||
- name: abstract
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: position
|
||||
tokenizer: default
|
||||
stored: true
|
||||
- name: authors
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: position
|
||||
tokenizer: default
|
||||
stored: true
|
||||
- name: doi
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: basic
|
||||
tokenizer: raw
|
||||
stored: true
|
||||
- name: first_page
|
||||
type: i64
|
||||
options:
|
||||
indexed: false
|
||||
stored: true
|
||||
- name: container_title
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: position
|
||||
tokenizer: default
|
||||
stored: true
|
||||
- name: issns
|
||||
type: text
|
||||
options:
|
||||
indexing: null
|
||||
stored: true
|
||||
- name: issue
|
||||
type: text
|
||||
options:
|
||||
indexing: null
|
||||
stored: true
|
||||
- name: issued_at
|
||||
type: i64
|
||||
options:
|
||||
indexed: true
|
||||
stored: true
|
||||
- name: language
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: basic
|
||||
tokenizer: raw
|
||||
stored: true
|
||||
- name: last_page
|
||||
type: i64
|
||||
options:
|
||||
indexed: false
|
||||
stored: true
|
||||
- name: ref_by_count
|
||||
type: i64
|
||||
options:
|
||||
indexed: false
|
||||
stored: true
|
||||
- name: references
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: basic
|
||||
tokenizer: raw
|
||||
stored: false
|
||||
- name: scimag_bulk_id
|
||||
type: i64
|
||||
options:
|
||||
indexed: false
|
||||
stored: true
|
||||
- name: tags
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: position
|
||||
tokenizer: default
|
||||
stored: true
|
||||
- name: title
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: position
|
||||
tokenizer: default
|
||||
stored: true
|
||||
- name: updated_at
|
||||
type: i64
|
||||
options:
|
||||
indexed: true
|
||||
stored: true
|
||||
- name: volume
|
||||
type: text
|
||||
options:
|
||||
indexing: null
|
||||
stored: true
|
||||
- name: year
|
||||
type: text
|
||||
options:
|
||||
indexing:
|
||||
record: basic
|
||||
tokenizer: raw
|
||||
stored: false
|
5
nexus/summa/schema/scitech.py
Normal file
5
nexus/summa/schema/scitech.py
Normal file
@ -0,0 +1,5 @@
|
||||
import os

import yaml
from tantipy import TantivyCoder

# The schema file lives next to this module; resolving it from __file__ keeps
# the import independent of the process working directory.
_SCHEMA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'scitech.yaml')

with open(_SCHEMA_PATH) as file:
    scitech_coder = TantivyCoder(yaml.safe_load(file.read()))
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user