
GitOrigin-RevId: ddf02e70d2827c048db49b687ebbcdcc67807ca6
the-superpirate 2021-01-04 11:35:31 +03:00
parent 66ecaf0d41
commit 8472f27ec5
517 changed files with 55814 additions and 0 deletions

BUILD.bazel Normal file

@@ -0,0 +1,37 @@
load("@io_bazel_rules_k8s//k8s:objects.bzl", "k8s_objects")
# System-wide settings
config_setting(
name = "osx",
constraint_values = ["@bazel_tools//platforms:osx"],
)
config_setting(
name = "linux",
constraint_values = ["@bazel_tools//platforms:linux"],
)
platform(
name = "linux_x86",
constraint_values = [
"@io_bazel_rules_rust//rust/platform:linux",
"@bazel_tools//platforms:linux",
"@bazel_tools//platforms:x86_64",
],
)
load("@io_bazel_rules_rust//proto:toolchain.bzl", "rust_proto_toolchain")
rust_proto_toolchain(
name = "proto-toolchain-impl",
grpc_plugin = "//rules/rust/cargo:cargo_bin_protoc_gen_rust_grpc",
proto_plugin = "//rules/rust/cargo:cargo_bin_protoc_gen_rust",
protoc = "@com_google_protobuf//:protoc",
)
toolchain(
name = "proto-toolchain",
toolchain = ":proto-toolchain-impl",
toolchain_type = "@io_bazel_rules_rust//proto:toolchain",
)

@@ -1,3 +1,4 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or

README.md Normal file

@@ -0,0 +1,54 @@
# Hyperboria
## Introduction
The Hyperboria repository is a pack of tools for dealing with the SciMag and SciTech collections.
It consists of a configurable [`search engine`](nexus/summa) and a [`pipeline`](nexus/pipe) for [`ingesting`](nexus/ingest) data
from upstream sources. So-called [`actions`](nexus/actions) convert data from external APIs
into an [`internal Protobuf format`](nexus/models) and land the converted data in databases and/or search engines.
## Prerequisites
### Ubuntu 20.04
#### Docker
[Installation Guide](https://docs.docker.com/engine/install/ubuntu/)
#### System Compilers
```shell script
sudo apt-get install -y --no-install-recommends g++ python3.9 protobuf-compiler libprotobuf-dev
```
#### Bazel Build System
[Installation Guide](https://docs.bazel.build/versions/master/install-ubuntu.html) or _one-liner_:
```shell script
sudo apt install curl gnupg
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor > bazel.gpg
sudo mv bazel.gpg /etc/apt/trusted.gpg.d/
echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
sudo apt update && sudo apt install bazel
```
### macOS
#### Docker
[Installation Guide](https://docs.docker.com/docker-for-mac/install/)
#### System Compilers
```shell script
brew install llvm protobuf python3.9
```
#### Bazel Build System
[Installation Guide](https://docs.bazel.build/versions/master/install-os-x.html) or _one-liner_:
```shell script
brew install bazel
```
## Content
- [`images`](images) - base docker images for [`nexus`](nexus)
- [`library`](library) - shared libraries
- [`nexus`](nexus) - processing and searching in scientific text collections
- [`rules`](rules) - build rules

WORKSPACE Normal file

@@ -0,0 +1,285 @@
workspace(
name = "hyperboria",
managed_directories = {"@npm": ["rules/nodejs/node_modules"]},
)
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
http_archive(
name = "bazel_skylib",
sha256 = "11b0e65ec07113b2ea81be554c7471bb80fc5766aba6239c91d071602c46d50f",
strip_prefix = "bazel-skylib-dc080e95161964a1ff841bfd0b871a1123c027a8",
urls = [
"https://github.com/bazelbuild/bazel-skylib/archive/dc080e95161964a1ff841bfd0b871a1123c027a8.tar.gz",
],
)
http_archive(
name = "build_bazel_rules_nodejs",
sha256 = "6a67a8a1bf6fddc9113f73471029b819eef4575c3a936a4a01d57e411894d692",
urls = [
"https://github.com/bazelbuild/rules_nodejs/releases/download/2.0.2/rules_nodejs-2.0.2.tar.gz",
],
)
http_archive(
name = "com_github_grpc_grpc",
sha256 = "f046d4cb4d60d4f2a2087e9d46c7ec0c523cd54ebf68eda6272de4ce65e20ac7",
strip_prefix = "grpc-ae7f520358d7145a7484db693376fdebbd72662d",
urls = [
"https://github.com/grpc/grpc/archive/ae7f520358d7145a7484db693376fdebbd72662d.tar.gz",
],
)
http_archive(
name = "com_google_protobuf",
sha256 = "7d663c8dc81d282dc92e884b38e9c179671e31ccacce311154420e65f7d142c6",
strip_prefix = "protobuf-3.13.0.1",
urls = [
"https://github.com/protocolbuffers/protobuf/archive/v3.13.0.1.tar.gz",
],
)
http_archive(
name = "io_bazel_rules_docker",
sha256 = "ba415feb61f7dd08051c7096df9feeb2109bc918878ef924ad9262fe0fcdf6f9",
strip_prefix = "rules_docker-9bfcd7dbf0294ed9d11a99da6363fc28df904502",
urls = [
"https://github.com/bazelbuild/rules_docker/archive/9bfcd7dbf0294ed9d11a99da6363fc28df904502.zip",
],
)
http_archive(
name = "io_bazel_rules_k8s",
sha256 = "95addfd2b7b07b5a4e75663d15aa57dc271f7b831ec404109322288e1b6bf126",
strip_prefix = "rules_k8s-9f9886c7252d66bb2e2206842b149a6ceebe6fe5",
urls = [
"https://github.com/bazelbuild/rules_k8s/archive/9f9886c7252d66bb2e2206842b149a6ceebe6fe5.zip",
],
)
http_archive(
name = "io_bazel_rules_rust",
sha256 = "50a772198877e21a61823fa292d28539f8bc99d72463e55b5b09942394ec370e",
strip_prefix = "rules_rust-9a8ef691b8e8f682d767189c38339cbee16d0a16",
urls = [
# Master branch as of 2020-10-16
"https://github.com/bazelbuild/rules_rust/archive/9a8ef691b8e8f682d767189c38339cbee16d0a16.tar.gz",
],
)
http_archive(
name = "rules_jvm_external",
sha256 = "d85951a92c0908c80bd8551002d66cb23c3434409c814179c0ff026b53544dab",
strip_prefix = "rules_jvm_external-3.3",
urls = [
"https://github.com/bazelbuild/rules_jvm_external/archive/3.3.zip",
],
)
http_archive(
name = "rules_pkg",
sha256 = "0a33148c4957e666a29443f75b2c0db1fe3e0baf7256742fc47a35731f7a1d2e",
strip_prefix = "rules_pkg-4b0b9f4679484f107f750a60190ff5ec6b164a5f/pkg",
urls = [
"https://github.com/bazelbuild/rules_pkg/archive/4b0b9f4679484f107f750a60190ff5ec6b164a5f.zip",
],
)
http_archive(
name = "rules_proto",
sha256 = "aa1ee19226f707d44bee44c720915199c20c84a23318bb0597ed4e5c873ccbd5",
strip_prefix = "rules_proto-40298556293ae502c66579620a7ce867d5f57311",
urls = [
"https://github.com/bazelbuild/rules_proto/archive/40298556293ae502c66579620a7ce867d5f57311.tar.gz",
],
)
http_archive(
name = "rules_python",
sha256 = "ae3c1380c3c19d47fb474f201862dde7c14601130be2befa73bb02211267e960",
strip_prefix = "rules_python-e3df8bcf0f675d20aaf752c8ba32a0259dd79996",
urls = [
"https://github.com/bazelbuild/rules_python/archive/e3df8bcf0f675d20aaf752c8ba32a0259dd79996.tar.gz",
],
)
http_archive(
name = "rules_python_external",
sha256 = "30987e33c0b00ef75d11dec756db6a5d57ccd4085525f8888d5237ef798f8d16",
strip_prefix = "rules_python_external-2c78da5b5beb78c4a96b8b4d84e9c34de8178efb",
urls = [
"https://github.com/dillon-giacoppo/rules_python_external/archive/2c78da5b5beb78c4a96b8b4d84e9c34de8178efb.zip",
],
)
http_archive(
name = "subpar",
sha256 = "e6e4332bf9af36c4165ad6cc7b2c76288e9f156eba35dc95b739e58c46f30a50",
strip_prefix = "subpar-9fae6b63cfeace2e0fb93c9c1ebdc28d3991b16f",
urls = [
"https://github.com/google/subpar/archive/9fae6b63cfeace2e0fb93c9c1ebdc28d3991b16f.zip",
],
)
http_archive(
name = "cython",
build_file = "@com_github_grpc_grpc//third_party:cython.BUILD",
sha256 = "e2e38e1f0572ca54d6085df3dec8b607d20e81515fb80215aed19c81e8fe2079",
strip_prefix = "cython-0.29.21",
urls = [
"https://github.com/cython/cython/archive/0.29.21.tar.gz",
],
)
# Java
load("//rules/java:artifacts.bzl", "maven_fetch_remote_artifacts")
maven_fetch_remote_artifacts()
# Rust
load("@io_bazel_rules_rust//rust:repositories.bzl", "rust_repository_set")
rust_version = "1.48.0"
rustfmt_version = "1.4.20"
rust_repository_set(
name = "rust_linux_x86_64",
edition = "2018",
exec_triple = "x86_64-unknown-linux-gnu",
extra_target_triples = ["wasm32-unknown-unknown"],
rustfmt_version = rustfmt_version,
version = rust_version,
)
rust_repository_set(
name = "rust_darwin_x86_64",
edition = "2018",
exec_triple = "x86_64-apple-darwin",
extra_target_triples = ["wasm32-unknown-unknown"],
rustfmt_version = rustfmt_version,
version = rust_version,
)
load("@io_bazel_rules_rust//:workspace.bzl", "bazel_version")
bazel_version(name = "bazel_version")
load("//rules/rust:crates.bzl", "raze_fetch_remote_crates")
raze_fetch_remote_crates()
register_toolchains("//:proto-toolchain")
# NodeJS
load("@build_bazel_rules_nodejs//:index.bzl", "yarn_install")
yarn_install(
name = "npm",
package_json = "//rules/nodejs:package.json",
symlink_node_modules = True,
use_global_yarn_cache = True,
yarn_lock = "//rules/nodejs:yarn.lock",
)
# Packaging
load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
rules_pkg_dependencies()
# Docker Setup
load(
"@io_bazel_rules_docker//toolchains/docker:toolchain.bzl",
docker_toolchain_configure = "toolchain_configure",
)
docker_toolchain_configure(
name = "docker_config",
client_config = "/docker",
)
load("@io_bazel_rules_docker//repositories:repositories.bzl", container_repositories = "repositories")
container_repositories()
load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
container_deps()
load("@io_bazel_rules_docker//repositories:pip_repositories.bzl", "pip_deps")
pip_deps()
load("@io_bazel_rules_docker//java:image.bzl", java_image_repos = "repositories")
load("@io_bazel_rules_docker//python3:image.bzl", py3_image_repos = "repositories")
load("@io_bazel_rules_docker//nodejs:image.bzl", nodejs_image_repos = "repositories")
load("@io_bazel_rules_docker//rust:image.bzl", rust_image_repos = "repositories")
java_image_repos()
nodejs_image_repos()
py3_image_repos()
rust_image_repos()
# Python
register_toolchains("//rules/python:py_toolchain")
load("@rules_python_external//:defs.bzl", "pip_install")
pip_install(
name = "pip_modules_external",
requirements = "//rules/python:requirements.txt",
)
load("@rules_python_external//:repositories.bzl", "rules_python_external_dependencies")
rules_python_external_dependencies()
# K8s
load("@io_bazel_rules_k8s//k8s:k8s.bzl", "k8s_repositories")
k8s_repositories()
load("@io_bazel_rules_k8s//k8s:k8s_go_deps.bzl", k8s_go_deps = "deps")
k8s_go_deps()
# Miscellaneous
load("//rules/misc:setup.bzl", "rules_misc_setup_internal")
rules_misc_setup_internal()
load("//rules/misc:install.bzl", "rules_misc_install_internal")
rules_misc_install_internal()
# Images Install
load("//images:install.bzl", "images_install")
images_install()
# Proto / gRPC
load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
rules_proto_dependencies()
rules_proto_toolchains()
load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
grpc_deps()
load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
grpc_extra_deps()

images/BUILD.bazel Normal file

@@ -0,0 +1,8 @@
load("@io_bazel_rules_docker//container:container.bzl", "container_image")
package(default_visibility = ["//visibility:public"])
container_image(
name = "base-image",
base = "@ubuntu//image",
)

images/install.bzl Normal file

@@ -0,0 +1,18 @@
"""
Install various images
"""
load("@io_bazel_rules_docker//container:pull.bzl", "container_pull")
def images_install():
"""
Docker predefined images
"""
container_pull(
name = "ubuntu",
registry = "index.docker.io",
repository = "library/ubuntu",
digest = "sha256:4e4bc990609ed865e07afc8427c30ffdddca5153fd4e82c20d8f0783a291e241",
tag = "20.04",
)

@@ -0,0 +1,39 @@
load("@io_bazel_rules_docker//container:container.bzl", "container_image", "container_push")
load("@io_bazel_rules_docker//docker/package_managers:download_pkgs.bzl", "download_pkgs")
load("@io_bazel_rules_docker//docker/package_managers:install_pkgs.bzl", "install_pkgs")
package(default_visibility = ["//visibility:public"])
download_pkgs(
name = "download-base-python-image",
image_tar = "//images:base-image.tar",
packages = [
"bash",
"libev4",
"libgomp1",
"libprotobuf17",
"libssl1.1",
"python3.9",
],
)
install_pkgs(
name = "install-base-python-image",
image_tar = "//images:base-image.tar",
installables_tar = ":download-base-python-image.tar",
installation_cleanup_commands = "rm -rf /var/lib/apt/lists/*",
output_image_name = "installed-base-python-image",
)
container_image(
name = "base-python-image",
base = ":install-base-python-image",
entrypoint = ["/usr/bin/python3.9"],
env = {"LANG": "C.UTF-8"},
symlinks = {
"/usr/bin/python": "/usr/bin/python3.9",
"/usr/bin/python3": "/usr/bin/python3.9",
},
visibility = ["//visibility:public"],
)

library/__init__.py Normal file

@@ -0,0 +1,19 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
name = "aiogrpctools",
srcs = glob(
["**/*.py"],
exclude = ["tests/**"],
),
srcs_version = "PY3ONLY",
visibility = ["//visibility:public"],
deps = [
requirement("grpcio"),
requirement("aiokit"),
"//library/configurator",
"//library/logging",
requirement("izihawa_utils"),
],
)

@@ -0,0 +1,7 @@
from .base import (
AioGrpcServer,
aiogrpc_request_wrapper,
aiogrpc_streaming_request_wrapper,
)
__all__ = ['AioGrpcServer', 'aiogrpc_streaming_request_wrapper', 'aiogrpc_request_wrapper']

@@ -0,0 +1,100 @@
import logging
from functools import wraps
from aiokit import (
AioRootThing,
AioThing,
)
from google.protobuf.json_format import MessageToDict
from grpc import aio
from izihawa_utils.text import camel_to_snake
from library.logging import error_log
class AioGrpcServer(AioRootThing):
def __init__(self, address, port):
super().__init__()
self.server = aio.server()
self.server.add_insecure_port(f'{address}:{port}')
async def start(self):
await self.server.start()
await self.server.wait_for_termination()
async def stop(self):
await self.server.stop(None)
class BaseService(AioThing):
error_mapping = {}
def __init__(self, service_name):
super().__init__()
self.service_name = service_name
self.class_name = camel_to_snake(self.__class__.__name__)
def get_default_service_fields(self):
return {'service_name': self.service_name, 'view': self.class_name}
def statbox(self, **kwargs):
logging.getLogger('statbox').info(self.get_default_service_fields() | kwargs)
def aiogrpc_request_wrapper(log=True):
def _aiogrpc_request_wrapper(func):
@wraps(func)
async def wrapped(self, request, context):
metadata = dict(context.invocation_metadata())
try:
if log:
self.statbox(
action='enter',
mode=func.__name__,
request_id=metadata['request-id'],
)
r = await func(self, request, context, metadata)
if log:
self.statbox(
action='exit',
mode=func.__name__,
request_id=metadata['request-id'],
)
return r
except aio.AbortError:
raise
except Exception as e:
serialized_request = MessageToDict(request, preserving_proto_field_name=True)
error_log(e, request=serialized_request, request_id=metadata['request-id'])
if e.__class__ in self.error_mapping:
await context.abort(*self.error_mapping[e.__class__])
raise e
return wrapped
return _aiogrpc_request_wrapper
def aiogrpc_streaming_request_wrapper(func):
@wraps(func)
async def wrapped(self, request, context):
metadata = dict(context.invocation_metadata())
try:
self.statbox(
action='enter',
mode=func.__name__,
request_id=metadata['request-id'],
)
async for item in func(self, request, context, metadata):
yield item
self.statbox(
action='exit',
mode=func.__name__,
request_id=metadata['request-id'],
)
except aio.AbortError:
raise
except Exception as e:
serialized_request = MessageToDict(request, preserving_proto_field_name=True)
error_log(e, request=serialized_request, request_id=metadata['request-id'])
if e.__class__ in self.error_mapping:
await context.abort(*self.error_mapping[e.__class__])
raise e
return wrapped
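
For orientation, a minimal sketch of how these helpers compose; the `SearchService` name and the commented-out registration call are hypothetical stand-ins for generated gRPC code:

```python
from library.aiogrpctools import AioGrpcServer, aiogrpc_request_wrapper
from library.aiogrpctools.base import BaseService

class SearchService(BaseService):  # hypothetical example service
    error_mapping = {}  # exception class -> (grpc.StatusCode, details) pairs used on abort

    @aiogrpc_request_wrapper()
    async def search(self, request, context, metadata):
        # 'enter'/'exit' statbox records and error logging come from the decorator;
        # callers are expected to send a 'request-id' metadata entry
        ...

async def serve():
    server = AioGrpcServer(address='0.0.0.0', port=50051)
    # add_SearchServicer_to_server(SearchService('search'), server.server)  # generated helper
    await server.start()
```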

@@ -0,0 +1,17 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
name = "aiopostgres",
srcs = glob(
["**/*.py"],
exclude = ["tests/**"],
),
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
requirement("aiopg"),
requirement("tenacity"),
requirement("aiokit"),
],
)

@@ -0,0 +1,3 @@
from .pool_holder import AioPostgresPoolHolder
__all__ = ['AioPostgresPoolHolder']

@@ -0,0 +1,41 @@
import psycopg2.extras
from aiokit import AioThing
from psycopg2 import OperationalError
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_fixed,
)
class AioPostgresPoolHolder(AioThing):
def __init__(self, fn, *args, **kwargs):
super().__init__()
self.fn = fn
self.args = args
self.kwargs = kwargs
self.pool = None
@retry(
retry=retry_if_exception_type(OperationalError),
stop=stop_after_attempt(3),
wait=wait_fixed(1.0),
)
async def start(self):
if not self.pool:
self.pool = await self.fn(*self.args, **self.kwargs)
async def stop(self):
if self.pool:
self.pool.close()
await self.pool.wait_closed()
self.pool = None
async def execute(self, stmt, values=None, fetch=False, timeout=None, cursor_factory=psycopg2.extras.DictCursor):
async with self.pool.acquire() as conn:
async with conn.cursor(cursor_factory=cursor_factory) as cur:
await cur.execute(stmt, values, timeout=timeout)
if fetch:
return await cur.fetchall()
return cur.rowcount
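
A minimal usage sketch, assuming a reachable Postgres instance (DSN values and the query are placeholders):

```python
import asyncio

import aiopg
from library.aiopostgres.pool_holder import AioPostgresPoolHolder

async def main():
    holder = AioPostgresPoolHolder(fn=aiopg.create_pool, dsn='dbname=nexus user=nexus host=127.0.0.1')
    await holder.start()  # retried up to 3 times on OperationalError, 1 second apart
    rows = await holder.execute('SELECT id FROM scimag LIMIT %s', (10,), fetch=True)
    print(rows)
    await holder.stop()

asyncio.run(main())
```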

@@ -0,0 +1,18 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
name = "configurator",
srcs = glob(
["**/*.py"],
exclude = ["tests/**"],
),
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
requirement("jinja2"),
requirement("orjson"),
requirement("pyyaml"),
requirement("izihawa_utils"),
],
)

@@ -0,0 +1,148 @@
import os
import os.path
from types import ModuleType
import orjson as json
import yaml
from izihawa_utils.common import smart_merge_dicts
from jinja2 import Template
from library.configurator.exceptions import UnknownConfigFormatError
class ConfigObject(dict):
def __getattr__(self, name):
try:
return self[name]
except KeyError as e:
raise AttributeError(e)
class AnyOf:
def __init__(self, *args):
self.args = args
class RichDict(dict):
def has(self, *args):
current = self
for c in args:
if c not in current:
return False
current = current[c]
return True
def copy_if_exists(self, source_keys, target_key):
current = self
for c in source_keys:
if c not in current:
return False
current = current[c]
self[target_key] = current
return True
class Configurator(RichDict):
def __init__(self, configs: list):
"""
Create Configurator object
:param configs: list of paths to config files, dicts or modules.
End filepath with `?` to mark it as optional config.
"""
super().__init__()
self._by_basenames = {}
self._omitted_files = []
env_config = {}
env_config_var = os.environ.get('CONFIGURATOR', '')
if env_config_var:
env_config = yaml.safe_load(env_config_var)
for config in ([os.environ] + configs + [env_config]):
file_found = self.update(config)
if not file_found:
self._omitted_files.append(config)
def _config_filename(self, filename):
return os.path.join(os.getcwd(), filename)
def walk_and_render(self, c):
if isinstance(c, str):
return Template(c).render(**self)
elif isinstance(c, list):
return [self.walk_and_render(e) for e in c]
elif isinstance(c, dict):
for key in c:
c[key] = self.walk_and_render(c[key])
return c
def update(self, new_config, basename=None, **kwargs):
if isinstance(new_config, AnyOf):
for config in new_config.args:
try:
return self.update(config.rstrip('?'))
except IOError:
pass
raise IOError('None of %s was found' % ', '.join(new_config.args))
elif isinstance(new_config, str):
optional = new_config.endswith('?')
filename = new_config.rstrip('?')
basename = basename or os.path.basename(filename)
config_filename = self._config_filename(filename)
data = None
if os.path.exists(config_filename) and os.access(config_filename, os.R_OK):
with open(config_filename) as f:
data = f.read()
if data is None:
if optional:
return False
else:
raise IOError(f'File {config_filename} not found')
if filename.endswith('.json'):
new_config = json.loads(data)
elif filename.endswith('.yaml'):
new_config = yaml.safe_load(data)
else:
raise UnknownConfigFormatError(filename)
new_config = self.walk_and_render(new_config)
elif isinstance(new_config, ModuleType):
new_config = new_config.__dict__
elif callable(new_config):
new_config = new_config(self)
if not new_config:
new_config = {}
for k in new_config:
if callable(new_config[k]):
new_config[k] = new_config[k](context=self)
if 'log_path' in new_config:
new_config['log_path'] = os.path.expanduser(new_config['log_path']).rstrip('/')
smart_merge_dicts(self, new_config, list_policy='override', copy=False)
if basename:
self._by_basenames[basename] = new_config
return True
def get_config_by_basename(self, basename):
return self._by_basenames[basename]
def get_object_by_basename(self, basename):
return ConfigObject(self._by_basenames[basename])
def has_missed_configs(self):
return bool(self._omitted_files)
def has_file(self, basename):
return basename in self._by_basenames
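
A minimal usage sketch (file names are hypothetical, and the package-level export of `Configurator` is assumed). String values are rendered as Jinja2 templates against already-loaded keys, and `os.environ` is merged in first, so a value like `{{ ENV_TYPE }}` resolves from the environment:

```python
from library.configurator import Configurator  # assumed package-level export

config = Configurator([
    'configs/base.yaml',      # required: raises IOError if missing
    'configs/logging.yaml?',  # a trailing '?' marks the file as optional
])
logging_config = config.get_config_by_basename('logging.yaml')
```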

@@ -0,0 +1,2 @@
class UnknownConfigFormatError(Exception):
pass

@@ -0,0 +1,15 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
name = "logging",
srcs = glob(["**/*.py"]),
srcs_version = "PY3ONLY",
visibility = ["//visibility:public"],
deps = [
requirement("orjson"),
requirement("prometheus_client"),
requirement("izihawa_types"),
requirement("izihawa_utils"),
],
)

@@ -0,0 +1,44 @@
import logging
import logging.config
import sys
from izihawa_utils.exceptions import BaseError
from izihawa_utils.file import mkdir_p
from library.logging.formatters import (
DefaultFormatter,
DefaultHttpFormatter,
)
from library.logging.handlers import QueueHandler
from prometheus_client import Counter
error_counter = Counter('errors_total', 'counter for error.log')
def configure_logging(config, make_path=True):
if config.get('application', {}).get('debug', False) or 'logging' not in config:
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
else:
if make_path:
mkdir_p(config['log_path'])
logging.config.dictConfig(config['logging'])
def error_log(e, level=logging.ERROR, **fields):
level = getattr(e, 'level', level)
if level == logging.ERROR:
error_counter.inc()
if isinstance(e, BaseError):
e = e.as_internal_dict()
e.update(fields)
elif fields:
e = {'error': str(e), **fields}
logging.getLogger('error').log(
msg=e,
level=level
)
__all__ = [
'DefaultFormatter', 'DefaultHttpFormatter',
'QueueHandler', 'configure_logging', 'error_log',
]
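
A sketch of a config accepted by `configure_logging`: the `logging` key is a standard `logging.config.dictConfig` dictionary; the paths and handler/logger names below are illustrative placeholders:

```python
from library.logging import configure_logging

config = {
    'log_path': '/var/log/nexus-example',  # created with mkdir_p before dictConfig is applied
    'logging': {
        'version': 1,
        'formatters': {'default': {'()': 'library.logging.formatters.DefaultFormatter'}},
        'handlers': {'stdout': {'class': 'logging.StreamHandler', 'formatter': 'default'}},
        'loggers': {
            'error': {'handlers': ['stdout'], 'level': 'WARNING'},
            'statbox': {'handlers': ['stdout'], 'level': 'INFO'},
        },
    },
}
configure_logging(config)
```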

@@ -0,0 +1,94 @@
import dataclasses
import datetime
import logging
import os
import pprint
import sys
import time
import traceback
import orjson as json
from izihawa_utils.exceptions import BaseError
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S.%f'
class BaseFormatter(logging.Formatter):
def _prepare(self, record):
if isinstance(record.msg, BaseError):
return record.msg.as_internal_dict()
elif isinstance(record.msg, dict) or dataclasses.is_dataclass(record.msg):
return record.msg
else:
return dict(message=super().format(record))
def format(self, record):
log_record = self._prepare(record)
return json.dumps(log_record).decode()
class DefaultHttpFormatter(BaseFormatter):
def _prepare(self, record):
log_record = super()._prepare(record)
timestamp = time.time()
formatted_datetime = datetime.datetime.fromtimestamp(timestamp).strftime(DATETIME_FORMAT)
user_ip = getattr(record, 'user_ip', None)
request_id = getattr(record, 'request_id', None)
method = getattr(record, 'method', None)
path = getattr(record, 'path', None)
log_record.update(
unixtime=int(timestamp),
timestamp=int(timestamp * 1_000_000),
datetime=formatted_datetime,
process=os.getpid(),
)
if user_ip:
log_record['user_ip'] = user_ip
if request_id:
log_record['request_id'] = request_id
if method:
log_record['method'] = method
if path:
log_record['path'] = path
return log_record
def format(self, record):
log_record = self._prepare(record)
return json.dumps(log_record).decode()
class DefaultFormatter(BaseFormatter):
def _prepare(self, record):
log_record = super()._prepare(record)
timestamp = time.time()
formatted_datetime = datetime.datetime.fromtimestamp(timestamp).strftime(DATETIME_FORMAT)
log_record.update(
unixtime=int(timestamp),
timestamp=int(timestamp * 1_000_000),
datetime=formatted_datetime,
process=os.getpid(),
)
return log_record
def format(self, record):
log_record = self._prepare(record)
return json.dumps(log_record).decode()
class TracebackFormatter(DefaultFormatter):
def format(self, record):
log_record = self._prepare(record)
value = pprint.pformat(log_record, indent=2)
if sys.exc_info()[0] is not None:
value += '\n' + traceback.format_exc()
return value
default_formatter = DefaultFormatter()
default_traceback_formatter = TracebackFormatter()

@@ -0,0 +1,42 @@
import logging.handlers
import os
import queue
from izihawa_types.var import varstr
class QueueHandler(logging.handlers.QueueHandler):
def __init__(self, *handlers):
self._queue = queue.Queue(-1)
self._listener = logging.handlers.QueueListener(self._queue, *handlers, respect_handler_level=True)
self.setLevel('INFO')
super().__init__(self._queue)
self._listener.start()
def stop(self):
self._listener.stop()
def prepare(self, record):
return record
class BaseFileHandler(logging.handlers.WatchedFileHandler):
def _open(self):
file = super()._open()
os.chmod(self.baseFilename, 0o644)
return file
class BaseBinaryFileHandler(BaseFileHandler):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs, mode='ab+')
def emit(self, record):
try:
self.stream.write(varstr(record.msg))
self.flush()
except RecursionError:
raise
except Exception:
self.handleError(record)
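
A minimal sketch of the queue handler: records are fanned out to the wrapped handlers on a background listener thread, so the emitting code never blocks on handler I/O:

```python
import logging

from library.logging.handlers import QueueHandler

statbox = logging.getLogger('statbox')
statbox.setLevel(logging.INFO)
queue_handler = QueueHandler(logging.StreamHandler())  # wraps any number of real handlers
statbox.addHandler(queue_handler)
statbox.info({'action': 'started'})
queue_handler.stop()  # stops the listener thread, flushing queued records
```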

nexus/README.md Normal file

@@ -0,0 +1,18 @@
# Nexus
## Content
- ✅ [`actions`](actions) - shared code for ingesting data from external APIs (LibGen/CrossrefAPI)
- 🛑 `bot` - telegram bot for Summa
- 🛑 `cognitron` - bundled app for IPFS, search server and web frontend
- 🛑 `hub` - downloading & sending
- ✅ [`ingest`](ingest) - retrieving metadata from external APIs and putting it onto Kafka
- 🛑 `meta_api` - rescoring and merging API for Summa backends
- ✅ [`models`](models) - shared Protobuf models
- ✅ [`nlptools`](nlptools) - text routines
- ✅ [`pipe`](pipe) - processing pipeline based on Kafka
- 🛑 `pylon` - smart proxy for downloading files from the Internet/IPFS
- ✅ [`summa`](summa) - scripts for setting up Summa
- 🛑 `translations` - text translations used in `bot` and `hub`
- 🛑 `views` - shared views for [`models`](models)
- 🛑 `web` - web frontend for Summa

nexus/__init__.py Normal file

nexus/actions/BUILD.bazel Normal file

@@ -0,0 +1,26 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
name = "actions",
srcs = glob(
["**/*.py"],
exclude = ["tests/**"],
),
imports = ["."],
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
requirement("beautifulsoup4"),
requirement("lxml"),
requirement("pypika"),
requirement("numpy"),
requirement("aiocrossref"),
requirement("aiolibgen"),
"//library/aiopostgres",
"//nexus/models/proto:models_proto_py",
"//nexus/nlptools",
"//nexus/summa/schema",
requirement("aiosumma"),
],
)

nexus/actions/README.md Normal file

@@ -0,0 +1,5 @@
# Nexus Actions
`Actions` is the segregated dirty code for processing Crossref API and LibGen API responses.
The module also has the parts required for landing data in databases and/or search engines.
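
A sketch of how actions chain together (the wiring below is hypothetical; the jobs in [`ingest`](../ingest) assemble such chains from config): the output of each action's `do` feeds the next action.

```python
from nexus.actions.crossref_api import CrossrefApiToScimagPbAction
from nexus.actions.scimag import CleanScimagPbAction

async def process(crossref_item: dict):
    scimag_pb = await CrossrefApiToScimagPbAction().do(crossref_item)
    return await CleanScimagPbAction().do(scimag_pb)
```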

nexus/actions/__init__.py Normal file

@@ -0,0 +1,21 @@
from .update_document import SendDocumentOperationUpdateDocumentPbToSummaAction
from .update_document_scimag import (
CleanDocumentOperationUpdateDocumentScimagPbAction,
FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction,
SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction,
SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction,
)
from .update_document_scitech import (
CleanDocumentOperationUpdateDocumentScitechPbAction,
SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction,
)
__all__ = [
'CleanDocumentOperationUpdateDocumentScimagPbAction',
'CleanDocumentOperationUpdateDocumentScitechPbAction',
'FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction',
'SendDocumentOperationUpdateDocumentPbToSummaAction',
'SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction',
'SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction',
'SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction',
]

nexus/actions/base.py Normal file

@@ -0,0 +1,6 @@
from aiokit import AioThing
class BaseAction(AioThing):
async def do(self, item):
pass

nexus/actions/common.py Normal file

@@ -0,0 +1,5 @@
from urllib.parse import unquote
def canonize_doi(doi):
return unquote(doi.lower())
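
An illustrative check of the helper above: `canonize_doi` both lower-cases and URL-unquotes, so percent-encoded DOIs compare equal to their plain forms (the DOI is a made-up example):

```python
from nexus.actions.common import canonize_doi

assert canonize_doi('10.1016%2FJ.CELL.2020.01.001') == '10.1016/j.cell.2020.01.001'
```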

@@ -0,0 +1,108 @@
import time
from datetime import date
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
from .base import BaseAction
def extract_authors(authors):
result = []
if authors:
for author in authors:
if 'family' in author and 'given' in author:
result.append(f'{author["family"]}, {author["given"]}')
return result
def extract_dates(date_parts):
if not date_parts or not date_parts[0]:
return '', None
year, month, day = date_parts[0] + [0] * (3 - len(date_parts[0]))
if year:
issued_at = int(time.mktime(date(
year=year,
month=month if month else 1,
day=day if day else 1,
).timetuple()))
return str(year), issued_at
return '', None
def extract_first(arr, default=''):
if arr and len(arr) > 0:
return arr[0]
return default
def extract_page(page, default=0):
np = ''
for c in page:
if c.isdigit():
np += c
if np:
np = int(np)
if np < 2**31:
return np
return default
def extract_pages(pages, default=0):
try:
if pages is None:
return default, default
pages = pages.split('-')
if len(pages) == 2:
return extract_page(pages[0], default=default), extract_page(pages[1], default=default)
elif len(pages) == 1:
return extract_page(pages[0], default=default), default
return default, default
except ValueError:
return default, default
def extract_references(references):
if references:
dois = []
for reference in references:
if reference.get('DOI'):
dois.append(reference['DOI'])
return dois
def extract_title(title, subtitle):
return ': '.join(filter(lambda x: bool(x), [title.strip(), subtitle.strip()]))
class CrossrefApiToThinScimagPbAction(BaseAction):
async def do(self, item: dict) -> ScimagPb:
return ScimagPb(doi=item['DOI'])
class CrossrefApiToScimagPbAction(BaseAction):
async def do(self, item: dict) -> ScimagPb:
scimag_pb = ScimagPb(
abstract=item.get('abstract'),
container_title=extract_first(item.get('container-title')),
doi=item['DOI'],
issue=item.get('issue'),
issns=item.get('ISSN'),
language=item.get('language'),
ref_by_count=item.get('is-referenced-by-count'),
references=extract_references(item.get('reference')),
tags=item.get('subject'),
title=extract_title(extract_first(item.get('title')), extract_first(item.get('subtitle'))),
type=item.get('type'),
volume=item.get('volume'),
)
if item.get('author'):
scimag_pb.authors.extend(extract_authors(item['author']))
elif item.get('editor'):
scimag_pb.authors.extend(extract_authors(item['editor']))
scimag_pb.first_page, scimag_pb.last_page = extract_pages(item.get('page'))
scimag_pb.year, issued_at = extract_dates(item.get('issued', {}).get('date-parts'))
if issued_at is not None:
scimag_pb.issued_at = issued_at
return scimag_pb
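
Illustrative inputs and outputs for the extraction helpers above (values made up for the example):

```python
from nexus.actions.crossref_api import (
    extract_authors,
    extract_dates,
    extract_pages,
)

assert extract_authors([{'family': 'Doe', 'given': 'Jane'}]) == ['Doe, Jane']
assert extract_pages('117-128') == (117, 128)
assert extract_pages(None) == (0, 0)
year, issued_at = extract_dates([[2020, 5]])  # a missing day defaults to the 1st
assert year == '2020'
```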

@@ -0,0 +1,17 @@
from typing import List
from izihawa_utils.exceptions import BaseError
class InterruptProcessing(BaseError):
code = 'interrupt_processing'
def __init__(self, doc_id, reason):
super().__init__(doc_id=doc_id, reason=reason)
class ConflictError(BaseError):
code = 'conflict_error'
def __init__(self, document, duplicates: List[dict]):
super().__init__(document=document, duplicates=duplicates)

@@ -0,0 +1,8 @@
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
from .base import BaseAction
class GoldenPostgresToThinScimagPbAction(BaseAction):
async def do(self, item: dict) -> ScimagPb:
return ScimagPb(doi=item['doi'])

nexus/actions/libgen_api.py Normal file

@@ -0,0 +1,125 @@
import numpy as np
from izihawa_types.safecast import safe_int
from nexus.models.proto.scitech_pb2 import Scitech as ScitechPb
from .base import BaseAction
LANGUAGE_TRANSLATION = {
'English': 'en',
'Russian': 'ru',
'German': 'de',
'Ukrainian': 'uk',
'French': 'fr',
'Italian': 'it',
'Spanish': 'es',
'Portuguese': 'pt',
'Chinese': 'cn',
'Polish': 'pl',
'english': 'en',
'Russian-Ukrainian': 'ru,uk',
'Russian-Ukrainian-English': 'en,ru,uk',
'Russian(Old)': 'ru',
'English-Russian': 'en,ru',
'Turkish': 'tr',
'Greek': 'el',
'Romanian': 'ro',
'Russian (Old)': 'ru',
'Arabic': 'ar',
'Français': 'fr',
'Dutch': 'nl',
'Japanese': 'ja',
'Persian': 'fa',
'Hungarian': 'hu',
'Latin': 'la',
'Serbian': 'sr',
'Spanish,Castilian': 'es',
'German-Russian': 'de,ru',
'Croatian': 'hr',
'Lithuanian': 'lt',
'Hebrew': 'iw',
'French-Russian': 'fr,ru',
'Czech': 'cs',
'Kazakh': 'kz',
'Swedish': 'sv',
'Indonesian': 'id',
'Greek(Modern)': 'el',
'Chinese(PRC)': 'cn',
'Belorussian': 'by',
'Deutsch': 'de',
'German-English': 'de,en',
'English, German': 'de,en',
'English-Ukrainian': 'en,uk',
'English, French': 'en,fr',
'Bulgarian': 'bg',
'Romanian,Moldavian,Moldovan': 'mo',
'Belarusian': 'by',
'Finnish': 'fi',
'Azerbaijani': 'az',
'Bengali': 'bn',
'English-French': 'en,fr',
'English-German': 'de,en',
'Chinese-English': 'cn,en',
}
def create_cu(libgen_id, coverurl, md5):
cu_suf = ''
bulk_id = (libgen_id - (libgen_id % 1000))
proposed_coverurl = f"{bulk_id}/{md5}.jpg"
proposed_coverurl_d = f"{bulk_id}/{md5}-d.jpg"
proposed_coverurl_g = f"{bulk_id}/{md5}-g.jpg"
if coverurl == proposed_coverurl:
coverurl = ''
elif coverurl == proposed_coverurl_d:
cu_suf = 'd'
coverurl = ''
elif coverurl == proposed_coverurl_g:
cu_suf = 'g'
coverurl = ''
return coverurl, cu_suf
class LibgenApiToScitechPbAction(BaseAction):
async def do(self, item: dict) -> ScitechPb:
scitech_pb = ScitechPb(
authors=(item.get('author') or '').split('; '),
description=item.get('descr'),
doi=item.get('doi'),
edition=item.get('edition'),
extension=item.get('extension'),
filesize=safe_int(item['filesize']) or 0,
is_deleted=item.get('visible', '') != '',
isbns=list(filter(
lambda x: bool(x),
map(
lambda x: x.replace('-', '').strip(),
item['identifier'].replace(';', ',').split(',')
),
)),
language=LANGUAGE_TRANSLATION.get(item['language']),
libgen_id=int(item['id']),
md5=item['md5'].lower(),
pages=safe_int(item['pages']),
series=item.get('series'),
tags=list(filter(
lambda x: bool(x),
map(
lambda x: x.strip(),
item['tags'].split(';')
),
)),
title=item['title'],
)
scitech_pb.cu, scitech_pb.cu_suf = create_cu(
libgen_id=scitech_pb.libgen_id,
coverurl=item['coverurl'].lower(),
md5=scitech_pb.md5
)
year = safe_int(item['year'])
if year and year < 9999:
scitech_pb.year = str(year)
scitech_pb.issued_at = np.datetime64(scitech_pb.year).astype('<M8[s]').astype(np.int64)
return scitech_pb
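
An illustrative check of `create_cu` (made-up values): when the stored cover URL matches the canonical `{bulk_id}/{md5}.jpg` pattern it is dropped and only the suffix is kept; otherwise it is passed through unchanged:

```python
from nexus.actions.libgen_api import create_cu

assert create_cu(libgen_id=100500, coverurl='100000/abcdef-d.jpg', md5='abcdef') == ('', 'd')
assert create_cu(libgen_id=100500, coverurl='custom/cover.jpg', md5='abcdef') == ('custom/cover.jpg', '')
```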

nexus/actions/scimag.py Normal file

@@ -0,0 +1,73 @@
from html import unescape
from bs4 import BeautifulSoup
from nexus.actions.common import canonize_doi
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
from nexus.models.proto.typed_document_pb2 import \
TypedDocument as TypedDocumentPb
from nexus.nlptools.language_detect import detect_language
from nexus.nlptools.utils import (
despace,
despace_full,
)
from .base import BaseAction
class CleanScimagPbAction(BaseAction):
async def do(self, scimag_pb: ScimagPb) -> ScimagPb:
if scimag_pb.abstract:
abstract_soup = BeautifulSoup(unescape(scimag_pb.abstract), 'lxml')
for line in abstract_soup.select(r'p, title, jats\:title, jats\:p'):
line.replace_with(f'\n{line.text.strip()}\n')
scimag_pb.abstract = despace(abstract_soup.text.strip())
if scimag_pb.title:
scimag_pb.title = despace_full(BeautifulSoup(unescape(scimag_pb.title), 'lxml').text.strip())
if scimag_pb.authors:
for i, author in enumerate(scimag_pb.authors):
scimag_pb.authors[i] = despace_full(BeautifulSoup(unescape(author), 'lxml').text.strip())
if scimag_pb.container_title:
scimag_pb.container_title = scimag_pb.container_title.replace(
'<html_ent glyph="@lt;" ascii="&lt;"/>'
'html_ent glyph="@amp;" ascii="<html_ent glyph="@amp;" ascii="&amp;"/>"/'
'<html_ent glyph="@gt;" ascii="&gt;"/>',
'&'
)
scimag_pb.container_title = scimag_pb.container_title.replace('<html_ent glyph="@amp;" ascii="&amp;"/>', '&')
scimag_pb.container_title = scimag_pb.container_title.replace(
'<html_ent glyph="@lt;" ascii="&lt;"/>'
'html_ent glyph="@amp;" ascii="&amp;"/'
'<html_ent glyph="@gt;" ascii="&gt;"/>',
'&'
)
scimag_pb.container_title = scimag_pb.container_title.replace('<html_ent glyph="@lt;" ascii="&lt;"/>', '')
scimag_pb.container_title = scimag_pb.container_title.replace('<html_ent glyph="@gt;" ascii="&gt;"/>', '')
scimag_pb.container_title = BeautifulSoup(unescape(scimag_pb.container_title), 'lxml').text.strip()
if scimag_pb.doi:
scimag_pb.doi = canonize_doi(scimag_pb.doi)
if scimag_pb.references:
canonized_references = list(map(canonize_doi, scimag_pb.references))
del scimag_pb.references[:]
scimag_pb.references.extend(canonized_references)
if not scimag_pb.meta_language and (scimag_pb.title or scimag_pb.abstract):
detected_language = detect_language(f'{scimag_pb.title} {scimag_pb.abstract}')
if detected_language:
scimag_pb.meta_language = detected_language
if not scimag_pb.language:
scimag_pb.language = scimag_pb.meta_language
return scimag_pb
class ScimagPbToDocumentOperationBytesAction(BaseAction):
async def do(self, item: ScimagPb) -> bytes:
document_operation_pb = DocumentOperationPb(
update_document=UpdateDocumentPb(
reindex=True,
should_fill_from_external_source=True,
typed_document=TypedDocumentPb(scimag=item),
),
)
return document_operation_pb.SerializeToString()

nexus/actions/scitech.py Normal file

@@ -0,0 +1,58 @@
from html import unescape
from bs4 import BeautifulSoup
from nexus.actions.common import canonize_doi
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
from nexus.models.proto.scitech_pb2 import Scitech as ScitechPb
from nexus.models.proto.typed_document_pb2 import \
TypedDocument as TypedDocumentPb
from nexus.nlptools.language_detect import detect_language
from nexus.nlptools.utils import (
despace,
despace_full,
)
from .base import BaseAction
class CleanScitechAction(BaseAction):
async def do(self, scitech_pb: ScitechPb) -> ScitechPb:
if scitech_pb.authors:
for i, author in enumerate(scitech_pb.authors):
scitech_pb.authors[i] = despace_full(author)
if scitech_pb.description:
description_soup = BeautifulSoup(unescape(scitech_pb.description), 'lxml')
for line in description_soup.select(r'p, title, jats\:title, jats\:p'):
line.replace_with(f'\n{line.text.strip()}\n')
scitech_pb.description = despace(description_soup.text.strip())
scitech_pb.series = despace_full(scitech_pb.series)
scitech_pb.title = despace_full(scitech_pb.title)
if not scitech_pb.meta_language and (scitech_pb.title or scitech_pb.description):
detected_language = detect_language(f'{scitech_pb.title} {scitech_pb.description}')
if detected_language:
scitech_pb.meta_language = detected_language
if not scitech_pb.language:
scitech_pb.language = scitech_pb.meta_language
scitech_pb.md5 = scitech_pb.md5.lower()
scitech_pb.extension = scitech_pb.extension.lower()
scitech_pb.doi = canonize_doi(scitech_pb.doi)
if scitech_pb.edition == 'None':
scitech_pb.edition = ''
return scitech_pb
class ScitechPbToDocumentOperationBytesAction(BaseAction):
async def do(self, item: ScitechPb) -> bytes:
document_operation_pb = DocumentOperationPb(
update_document=UpdateDocumentPb(
reindex=True,
typed_document=TypedDocumentPb(scitech=item),
),
)
return document_operation_pb.SerializeToString()

@@ -0,0 +1,26 @@
from aiosumma import SummaHttpClient
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from nexus.summa.schema import coders
from .base import BaseAction
class SendDocumentOperationUpdateDocumentPbToSummaAction(BaseAction):
def __init__(self, summa):
super().__init__()
self.summa_client = SummaHttpClient(**summa)
self.waits.append(self.summa_client)
async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
update_document_pb = document_operation_pb.update_document
schema = update_document_pb.typed_document.WhichOneof('document')
document = getattr(update_document_pb.typed_document, schema)
original_id = getattr(document, 'original_id', None)
if not update_document_pb.reindex or original_id:
return document_operation_pb
document_tantivy = coders[schema].encode_document(document)
await self.summa_client.put_document(schema, document_tantivy)
if update_document_pb.commit:
await self.summa_client.commit(schema)
return document_operation_pb

@@ -0,0 +1,232 @@
import asyncio
from typing import (
Optional,
Set,
)
import aiopg
from aiocrossref import CrossrefClient
from aiocrossref.exceptions import (
NotFoundError,
WrongContentTypeError,
)
from aiokafka import AIOKafkaProducer
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
from nexus.models.proto.operation_pb2 import \
CrossReferenceOperation as CrossReferenceOperationPb
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
from pypika import (
PostgreSQLQuery,
Table,
)
from pypika.terms import Array
from .base import BaseAction
from .crossref_api import CrossrefApiToScimagPbAction
from .exceptions import InterruptProcessing
from .scimag import CleanScimagPbAction
class SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction(BaseAction):
scimag_table = Table('scimag')
db_multi_fields = {
'authors',
'ipfs_multihashes',
'issns',
'tags',
}
db_single_fields = {
'id',
'abstract',
'container_title',
'doi',
'embedding',
'filesize',
'first_page',
'is_deleted',
'issued_at',
'issue',
'journal_id',
'language',
'last_page',
'meta_language',
'md5',
'ref_by_count',
'scimag_bulk_id',
'telegram_file_id',
'title',
'type',
'updated_at',
'volume',
}
db_fields = db_single_fields | db_multi_fields
def __init__(self, database):
super().__init__()
self.pool_holder = AioPostgresPoolHolder(
fn=aiopg.create_pool,
dsn=f'dbname={database["database"]} '
f'user={database["username"]} '
f'password={database["password"]} '
f'host={database["host"]}',
timeout=30,
pool_recycle=60,
maxsize=4,
)
self.waits.append(self.pool_holder)
def cast_field_value(self, field_name: str, field_value):
if field_name in self.db_multi_fields:
field_value = Array(*field_value)
return field_name, field_value
def is_field_set(self, scimag_pb: ScimagPb, field_name: str):
field_value = getattr(scimag_pb, field_name)
if field_name in {'scimag_bulk_id', 'issued_at'}:
return scimag_pb.HasField(field_name)
return field_value
def generate_delete_sql(self, scimag_pb: ScimagPb):
return (
PostgreSQLQuery
.from_('scimag')
.where(self.scimag_table.id == scimag_pb.id)
.delete()
.get_sql()
)
def generate_insert_sql(self, scimag_pb: ScimagPb, fields: Optional[Set[str]] = None):
columns = []
inserts = []
fields = fields or self.db_fields
for field_name in fields:
if self.is_field_set(scimag_pb, field_name):
field_value = getattr(scimag_pb, field_name)
field_name, field_value = self.cast_field_value(field_name, field_value)
columns.append(field_name)
inserts.append(field_value)
query = PostgreSQLQuery.into(self.scimag_table).columns(*columns).insert(*inserts)
if columns:
query = query.on_conflict('doi')
for field, val in zip(columns, inserts):
query = query.do_update(field, val)
return query.returning(self.scimag_table.id).get_sql()
def generate_update_sql(
self,
scimag_pb: ScimagPb,
fields: Optional[Set[str]] = None,
) -> str:
query = (
PostgreSQLQuery
.update(self.scimag_table)
)
fields = fields or self.db_fields
for field_name in fields:
if self.is_field_set(scimag_pb, field_name):
field_value = getattr(scimag_pb, field_name)
field_name, field_value = self.cast_field_value(field_name, field_value)
query = query.set(field_name, field_value)
return query.where(self.scimag_table.id == scimag_pb.id).get_sql()
async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
update_document_pb = document_operation_pb.update_document
scimag_pb = update_document_pb.typed_document.scimag
fields = update_document_pb.fields or self.db_fields
if scimag_pb.id:
if not scimag_pb.is_deleted:
sql = self.generate_update_sql(
scimag_pb,
fields=fields,
)
else:
sql = self.generate_delete_sql(scimag_pb)
await self.pool_holder.execute(sql)
else:
sql = self.generate_insert_sql(
scimag_pb=scimag_pb,
fields=fields,
)
result = await self.pool_holder.execute(sql, fetch=True)
scimag_pb.id = result[0][0]
return document_operation_pb
class SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction(BaseAction):
def __init__(self, topic, brokers):
super().__init__()
self.topic = topic
self.brokers = brokers
self.producer = None
async def start(self):
self.producer = self.get_producer()
await self.producer.start()
async def stop(self):
await self.producer.stop()
self.producer = None
def get_producer(self):
return AIOKafkaProducer(
loop=asyncio.get_running_loop(),
bootstrap_servers=self.brokers,
)
async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
update_document_pb = document_operation_pb.update_document
scimag_pb = update_document_pb.typed_document.scimag
for reference in scimag_pb.references:
reference_operation = CrossReferenceOperationPb(
source=scimag_pb.doi,
target=reference,
)
await self.producer.send_and_wait(
self.topic,
reference_operation.SerializeToString(),
)
return document_operation_pb
class FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction(BaseAction):
def __init__(self, crossref):
super().__init__()
self.crossref_client = CrossrefClient(
delay=1.0 / crossref['rps'],
max_retries=60,
timeout=crossref.get('timeout'),
user_agent=crossref.get('user_agent'),
)
self.crossref_api_to_scimag_pb_action = CrossrefApiToScimagPbAction()
self.waits.append(self.crossref_client)
async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
update_document_pb = document_operation_pb.update_document
if not update_document_pb.should_fill_from_external_source:
return document_operation_pb
scimag_pb = update_document_pb.typed_document.scimag
try:
crossref_api_response = await self.crossref_client.works(doi=scimag_pb.doi)
except (WrongContentTypeError, NotFoundError) as e:
raise InterruptProcessing(doc_id=scimag_pb.doi, reason=str(e))
new_scimag_pb = await self.crossref_api_to_scimag_pb_action.do(crossref_api_response)
scimag_pb.MergeFrom(new_scimag_pb)
return document_operation_pb
class CleanDocumentOperationUpdateDocumentScimagPbAction(BaseAction):
def __init__(self):
super().__init__()
self.cleaner = CleanScimagPbAction()
self.waits.append(self.cleaner)
async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
update_document_pb = document_operation_pb.update_document
update_document_pb.typed_document.scimag.CopyFrom(await self.cleaner.do(update_document_pb.typed_document.scimag))
return document_operation_pb

@@ -0,0 +1,161 @@
import aiopg
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from nexus.models.proto.scitech_pb2 import Scitech as ScitechPb
from pypika import (
PostgreSQLQuery,
Table,
functions,
)
from pypika.terms import Array
from .base import BaseAction
from .exceptions import ConflictError
from .scitech import CleanScitechAction
class UuidFunction(functions.Function):
def __init__(self, uuid, alias=None):
super(UuidFunction, self).__init__('UUID', uuid, alias=alias)
class SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction(BaseAction):
scitech_table = Table('scitech')
db_single_fields = {
'id',
'cu',
'cu_suf',
'description',
'doi',
'edition',
'extension',
'fiction_id',
'filesize',
'is_deleted',
'issued_at',
'language',
'libgen_id',
'meta_language',
'md5',
'original_id',
'pages',
'series',
'telegram_file_id',
'title',
'updated_at',
'volume',
}
db_multi_fields = {
'authors',
'ipfs_multihashes',
'isbns',
'tags',
}
db_fields = db_single_fields | db_multi_fields
def __init__(self, database):
super().__init__()
self.pool_holder = AioPostgresPoolHolder(
fn=aiopg.create_pool,
dsn=f'dbname={database["database"]} '
f'user={database["username"]} '
f'password={database["password"]} '
f'host={database["host"]}',
timeout=30,
pool_recycle=60,
maxsize=4,
)
self.waits.append(self.pool_holder)
def cast_field_value(self, field_name, field_value):
if field_name in self.db_multi_fields:
field_value = Array(*field_value)
return field_name, field_value
def is_field_set(self, scitech_pb: ScitechPb, field_name: str):
field_value = getattr(scitech_pb, field_name)
if field_name in {'issued_at'}:
return scitech_pb.HasField(field_name)
return field_value
async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
update_document_pb = document_operation_pb.update_document
scitech_pb = update_document_pb.typed_document.scitech
fields = update_document_pb.fields or self.db_fields
conditions = []
if scitech_pb.id:
conditions.append(self.scitech_table.id == scitech_pb.id)
if scitech_pb.libgen_id:
conditions.append(self.scitech_table.libgen_id == scitech_pb.libgen_id)
if scitech_pb.fiction_id:
conditions.append(self.scitech_table.fiction_id == scitech_pb.fiction_id)
if scitech_pb.doi:
conditions.append(self.scitech_table.doi == scitech_pb.doi)
# if scitech_pb.md5:
# conditions.append(self.scitech_table.md5 == UuidFunction(scitech_pb.md5))
if conditions:
casted_conditions = conditions[0]
for condition in conditions[1:]:
casted_conditions = casted_conditions | condition
sql = (
PostgreSQLQuery
.from_(self.scitech_table)
.select(functions.Count('*'))
.where(casted_conditions)
.get_sql()
)
result = await self.pool_holder.execute(
sql,
fetch=True
)
count = result[0][0]
if count > 1:
raise ConflictError(scitech_pb, duplicates=[])
if count == 1:
query = PostgreSQLQuery.update(self.scitech_table)
for field_name in fields:
if self.is_field_set(scitech_pb, field_name):
field_value = getattr(scitech_pb, field_name)
field_name, field_value = self.cast_field_value(field_name, field_value)
query = query.set(field_name, field_value)
sql = query.where(casted_conditions).returning('id', 'original_id').get_sql()
else:
columns = []
inserts = []
for field_name in fields:
if self.is_field_set(scitech_pb, field_name):
field_value = getattr(scitech_pb, field_name)
field_name, field_value = self.cast_field_value(field_name, field_value)
columns.append(field_name)
inserts.append(field_value)
query = (
PostgreSQLQuery
.into(self.scitech_table)
.columns(*columns)
.insert(*inserts)
.on_conflict('libgen_id', 'doi')
)
for col, val in zip(columns, inserts):
query = query.do_update(col, val)
sql = query.returning('id', 'original_id').get_sql()
result = await self.pool_holder.execute(sql, fetch=True)
scitech_pb.id, scitech_pb.original_id = result[0][0], result[0][1] or 0
return document_operation_pb
class CleanDocumentOperationUpdateDocumentScitechPbAction(BaseAction):
def __init__(self):
super().__init__()
self.cleaner = CleanScitechAction()
self.waits.append(self.cleaner)
async def do(self, document_operation_pb: DocumentOperationPb) -> DocumentOperationPb:
update_document_pb = document_operation_pb.update_document
update_document_pb.typed_document.scitech.CopyFrom(await self.cleaner.do(update_document_pb.typed_document.scitech))
return document_operation_pb

nexus/ingest/BUILD.bazel Normal file

@@ -0,0 +1,34 @@
load("@io_bazel_rules_docker//python3:image.bzl", "py3_image")
load("@pip_modules_external//:requirements.bzl", "requirement")
alias(
name = "binary",
actual = ":image.binary",
)
py3_image(
name = "image",
srcs = glob(["**/*.py"]),
base = "//images/production:base-python-image",
data = [
"configs/base.yaml",
"configs/logging.yaml",
],
main = "main.py",
srcs_version = "PY3ONLY",
visibility = ["//visibility:public"],
deps = [
requirement("aiokafka"),
requirement("aiopg"),
requirement("fire"),
requirement("aiocrossref"),
requirement("aiokit"),
requirement("aiolibgen"),
"//library/aiopostgres",
"//library/configurator",
"//library/logging",
"//nexus/actions",
],
)

nexus/ingest/README.md Normal file

@@ -0,0 +1,45 @@
# Nexus Ingest
`Ingest` goes to the Internet and sends the retrieved data to a Kafka queue of operations.
The `configs` subdirectory has been cut from this version because the configs rely heavily on the network infrastructure you are using.
You have to write your own configs, taking the example below into account.
## Sample `configs/base.yaml`
```yaml
---
jobs:
crossref-api:
class: nexus.ingest.jobs.CrossrefApiJob
kwargs:
actions:
- class: nexus.actions.crossref_api.CrossrefApiToThinScimagPbAction
- class: nexus.actions.scimag.ScimagPbToDocumentOperationBytesAction
base_url: https://api.crossref.org/
max_retries: 60
retry_delay: 10
sinks:
- class: nexus.ingest.sinks.KafkaSink
kwargs:
kafka_hosts:
- kafka-0.example.net
- kafka-1.example.net
topic_name: operations_binary
libgen-api:
class: nexus.ingest.jobs.LibgenApiJob
kwargs:
actions:
- class: nexus.actions.libgen_api.LibgenApiToScitechPbAction
- class: nexus.actions.scitech.ScitechPbToDocumentOperationBytesAction
base_url: libgen.example.net
max_retries: 60
retry_delay: 10
sinks:
- class: nexus.ingest.sinks.KafkaSink
kwargs:
kafka_hosts:
- kafka-0.example.net
- kafka-1.example.net
topic_name: operations_binary
log_path: '/var/log/nexus-ingest/{{ ENV_TYPE }}'
```
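
## Running
An example invocation, assuming the Bazel target from `BUILD.bazel` above and the `run-job` entrypoint defined in `main.py`; extra flags are merged into the job constructor's kwargs, so `--from_date` overrides the default of yesterday:
```shell script
bazel run //nexus/ingest:binary -- run-job crossref-api --from_date 2021-01-01
```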

nexus/ingest/__init__.py Normal file

@@ -0,0 +1,6 @@
from . import (
jobs,
sinks,
)
__all__ = ['jobs', 'sinks']

@@ -0,0 +1,5 @@
from .crossref_api import CrossrefApiJob
from .libgen_api import LibgenApiJob
from .self_feed import SelfFeedJob
__all__ = ['CrossrefApiJob', 'LibgenApiJob', 'SelfFeedJob']

nexus/ingest/jobs/base.py Normal file

@@ -0,0 +1,47 @@
from typing import (
Any,
AsyncIterable,
Iterable,
)
from aiokit import AioRootThing
from izihawa_utils.importlib import import_object
from ..sinks.base import BaseSink
class BaseJob(AioRootThing):
name = None
def __init__(self, actions: Iterable[dict], sinks: Iterable[dict]):
super().__init__()
real_sinks = []
for sink in sinks:
if isinstance(sink, BaseSink):
real_sinks.append(sink)
else:
real_sinks.append(import_object(sink['class'])(**sink.get('kwargs', {})))
self.sinks = real_sinks
real_actions = []
for action in actions:
real_actions.append(import_object(action['class'])(**action.get('kwargs', {})))
self.actions = real_actions
self.waits.extend(self.sinks)
self.waits.extend(self.actions)
async def iterator(self) -> AsyncIterable[Any]:
raise NotImplementedError()
async def action_iterator(self) -> AsyncIterable[Any]:
async for item in self.iterator():
processed_item = item
for action in self.actions:
processed_item = await action.do(processed_item)
yield processed_item
async def start(self):
async for data in self.action_iterator():
for sink in self.sinks:
await sink.send(data)
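
A minimal hypothetical job built on `BaseJob`: items yielded by `iterator` flow through the configured actions and are then sent to every sink:

```python
from typing import Any, AsyncIterable

from nexus.ingest.jobs.base import BaseJob

class StaticJob(BaseJob):  # hypothetical, for illustration only
    name = 'static-job'

    async def iterator(self) -> AsyncIterable[Any]:
        for doi in ('10.1000/1', '10.1000/2'):  # made-up DOIs
            yield {'DOI': doi}
```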

@@ -0,0 +1,40 @@
from datetime import (
datetime,
timedelta,
)
from typing import (
Any,
AsyncIterable,
Iterable,
Optional,
)
from aiocrossref import CrossrefClient
from nexus.ingest.jobs.base import BaseJob
class CrossrefApiJob(BaseJob):
name = 'crossref-api'
def __init__(
self,
base_url: str,
max_retries: int,
retry_delay: int,
actions: Iterable[dict],
sinks: Iterable[dict],
from_date: Optional[str] = None,
):
super().__init__(actions=actions, sinks=sinks)
self.crossref_client = CrossrefClient(base_url=base_url, max_retries=max_retries, retry_delay=retry_delay)
self.from_date = from_date or str(datetime.date(datetime.now()) - timedelta(days=1))
self.starts.append(self.crossref_client)
async def iterator(self) -> AsyncIterable[Any]:
async for chunk in self.crossref_client.works_cursor(
filter=f'from-index-date:{self.from_date}',
rows=1000,
select='DOI',
):
for item in chunk['items']:
yield item

@@ -0,0 +1,35 @@
from datetime import (
datetime,
timedelta,
)
from typing import (
Any,
AsyncIterable,
Iterable,
Optional,
)
from aiolibgen import LibgenClient
from nexus.ingest.jobs.base import BaseJob
class LibgenApiJob(BaseJob):
name = 'libgen-api'
def __init__(
self,
base_url: str,
max_retries: int,
retry_delay: int,
actions: Iterable[dict],
sinks: Iterable[dict],
from_date: Optional[str] = None,
):
super().__init__(sinks=sinks, actions=actions)
self.libgen_client = LibgenClient(base_url=base_url, max_retries=max_retries, retry_delay=retry_delay)
self.from_date = from_date or str(datetime.date(datetime.now()) - timedelta(days=1))
self.starts.append(self.libgen_client)
async def iterator(self) -> AsyncIterable[Any]:
async for item in self.libgen_client.newer(timenewer=f'{self.from_date} 00:00:00'):
yield item

39
nexus/ingest/jobs/self_feed.py Normal file
View File

@ -0,0 +1,39 @@
from typing import (
Any,
AsyncIterable,
Iterable,
)
import aiopg
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
from nexus.ingest.jobs.base import BaseJob
class SelfFeedJob(BaseJob):
name = 'self-feed-job'
def __init__(
self,
database: dict,
sql: str,
actions: Iterable[dict],
sinks: Iterable[dict],
):
super().__init__(actions=actions, sinks=sinks)
self.sql = sql
self.pool_holder = AioPostgresPoolHolder(
fn=aiopg.create_pool,
dsn=f'dbname={database["database"]} '
f'user={database["username"]} '
f'password={database["password"]} '
f'host={database["host"]}',
timeout=30,
pool_recycle=60,
maxsize=4,
)
self.waits.append(self.pool_holder)
async def iterator(self) -> AsyncIterable[Any]:
rows = await self.pool_holder.execute(self.sql, fetch=True, timeout=3600)
for row in rows:
yield row
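`SelfFeedJob` re-emits rows selected from the database, which is useful for reprocessing or reindexing documents that are already stored. A hypothetical `jobs` entry in the config format shown earlier (all hosts and credentials are placeholders):

```yaml
self-feed:
  class: nexus.ingest.jobs.SelfFeedJob
  kwargs:
    database:
      database: nexus
      host: postgres.example.net
      password: '{{ DATABASE_PASSWORD }}'
      username: '{{ DATABASE_USERNAME }}'
    sql: SELECT id FROM scitech
    actions: []
    sinks:
      - class: nexus.ingest.sinks.KafkaSink
        kwargs:
          kafka_hosts:
            - kafka-0.example.net
          topic_name: operations_binary
```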

35
nexus/ingest/main.py Normal file
View File

@ -0,0 +1,35 @@
import fire
from aiokit.utils import sync_fu
from izihawa_utils.importlib import import_object
from library.logging import (
configure_logging,
error_log,
)
from nexus.ingest.configs import get_config
async def run_job(name, **kwargs):
config = get_config()
configure_logging(config)
job_config = config['jobs'][name]
job_class = import_object(job_config['class'])
real_kwargs = job_config['kwargs'].copy()
real_kwargs.update(kwargs)
job = job_class(**real_kwargs)
try:
await job.start_and_wait()
except Exception as e:
error_log(e)
raise
finally:
await job.stop()
def main():
fire.Fire({'run-job': sync_fu(run_job)})
if __name__ == '__main__':
main()

3
nexus/ingest/sinks/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .kafka import KafkaSink
__all__ = ['KafkaSink', ]

12
nexus/ingest/sinks/base.py Normal file
View File

@ -0,0 +1,12 @@
from aiokit import AioThing
class BaseSink(AioThing):
    async def send(self, data: bytes):
        # Declared async because BaseJob awaits sink.send(data) and
        # concrete sinks such as KafkaSink implement it as a coroutine.
        raise NotImplementedError()
    async def on_shutdown(self):
        pass

23
nexus/ingest/sinks/kafka.py Normal file
View File

@ -0,0 +1,23 @@
import asyncio
from typing import Iterable
from aiokafka import AIOKafkaProducer
from .base import BaseSink
class KafkaSink(BaseSink):
def __init__(self, kafka_hosts: Iterable[str], topic_name: str):
super().__init__()
self.producer = AIOKafkaProducer(
loop=asyncio.get_event_loop(),
bootstrap_servers=kafka_hosts,
)
self.topic_name = topic_name
self.starts.append(self.producer)
async def send(self, data: bytes):
await self.producer.send_and_wait(
self.topic_name,
data,
)
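A standalone usage sketch; in practice sinks are constructed from config dicts by `BaseJob`, and the `start`/`stop` lifecycle is assumed to behave as `aiokit` does elsewhere in this commit:

```python
sink = KafkaSink(
    kafka_hosts=['kafka-0.example.net'],
    topic_name='operations_binary',
)
await sink.start()  # assumed to start the underlying AIOKafkaProducer
await sink.send(b'...')  # any serialized DocumentOperation payload
await sink.stop()
```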

4
nexus/models/README.md Normal file
View File

@ -0,0 +1,4 @@
# Nexus Models
## Warning
Do not rely heavily on the format: it is still subject to redesign and experimentation.
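## Example
A minimal sketch of building an update operation with the generated Python bindings; message and field names are taken from the protos below, the values are placeholders:

```python
from nexus.models.proto.operation_pb2 import DocumentOperation, UpdateDocument
from nexus.models.proto.scimag_pb2 import Scimag
from nexus.models.proto.typed_document_pb2 import TypedDocument

operation = DocumentOperation(
    update_document=UpdateDocument(
        commit=True,
        reindex=True,
        typed_document=TypedDocument(
            scimag=Scimag(doi='10.0000/example', title='Example Title'),
        ),
    ),
)
# Bytes in this form are what the binary Kafka topics carry.
data = operation.SerializeToString()
```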

0
nexus/models/__init__.py Normal file
View File

24
nexus/models/proto/BUILD.bazel Normal file
View File

@ -0,0 +1,24 @@
load("@com_github_grpc_grpc//bazel:python_rules.bzl", "py_proto_library")
load("@io_bazel_rules_rust//proto:proto.bzl", "rust_proto_library")
load("@rules_proto//proto:defs.bzl", "proto_library")
proto_library(
name = "models_proto",
srcs = glob([
"*.proto",
]),
visibility = ["//visibility:public"],
)
py_proto_library(
name = "models_proto_py",
visibility = ["//visibility:public"],
deps = [":models_proto"],
)
rust_proto_library(
name = "models_proto_rust",
rust_deps = ["//rules/rust/cargo:protobuf"],
visibility = ["//visibility:public"],
deps = [":models_proto"],
)

26
nexus/models/proto/operation.proto Normal file
View File

@ -0,0 +1,26 @@
syntax = "proto3";
package nexus.models.proto;
import "nexus/models/proto/typed_document.proto";
message CrossReferenceOperation {
string source = 1;
string target = 2;
uint32 last_retry_unixtime = 3;
uint32 retry_count = 4;
}
message DocumentOperation {
oneof operation {
UpdateDocument update_document = 3;
};
}
message UpdateDocument {
repeated string fields = 1;
bool should_fill_from_external_source = 2;
bool commit = 3;
bool reindex = 4;
TypedDocument typed_document = 5;
}

212
nexus/models/proto/operation_pb2.py Normal file
View File

@ -0,0 +1,212 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: nexus/models/proto/operation.proto
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
from nexus.models.proto import \
typed_document_pb2 as nexus_dot_models_dot_proto_dot_typed__document__pb2
DESCRIPTOR = _descriptor.FileDescriptor(
name='nexus/models/proto/operation.proto',
package='nexus.models.proto',
syntax='proto3',
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\"nexus/models/proto/operation.proto\x12\x12nexus.models.proto\x1a\'nexus/models/proto/typed_document.proto\"k\n\x17\x43rossReferenceOperation\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0e\n\x06target\x18\x02 \x01(\t\x12\x1b\n\x13last_retry_unixtime\x18\x03 \x01(\r\x12\x13\n\x0bretry_count\x18\x04 \x01(\r\"_\n\x11\x44ocumentOperation\x12=\n\x0fupdate_document\x18\x03 \x01(\x0b\x32\".nexus.models.proto.UpdateDocumentH\x00\x42\x0b\n\toperation\"\xa6\x01\n\x0eUpdateDocument\x12\x0e\n\x06\x66ields\x18\x01 \x03(\t\x12(\n should_fill_from_external_source\x18\x02 \x01(\x08\x12\x0e\n\x06\x63ommit\x18\x03 \x01(\x08\x12\x0f\n\x07reindex\x18\x04 \x01(\x08\x12\x39\n\x0etyped_document\x18\x05 \x01(\x0b\x32!.nexus.models.proto.TypedDocumentb\x06proto3'
,
dependencies=[nexus_dot_models_dot_proto_dot_typed__document__pb2.DESCRIPTOR,])
_CROSSREFERENCEOPERATION = _descriptor.Descriptor(
name='CrossReferenceOperation',
full_name='nexus.models.proto.CrossReferenceOperation',
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name='source', full_name='nexus.models.proto.CrossReferenceOperation.source', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='target', full_name='nexus.models.proto.CrossReferenceOperation.target', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='last_retry_unixtime', full_name='nexus.models.proto.CrossReferenceOperation.last_retry_unixtime', index=2,
number=3, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='retry_count', full_name='nexus.models.proto.CrossReferenceOperation.retry_count', index=3,
number=4, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
],
extensions=[
],
nested_types=[],
enum_types=[
],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=99,
serialized_end=206,
)
_DOCUMENTOPERATION = _descriptor.Descriptor(
name='DocumentOperation',
full_name='nexus.models.proto.DocumentOperation',
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name='update_document', full_name='nexus.models.proto.DocumentOperation.update_document', index=0,
number=3, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
],
extensions=[
],
nested_types=[],
enum_types=[
],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
_descriptor.OneofDescriptor(
name='operation', full_name='nexus.models.proto.DocumentOperation.operation',
index=0, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[]),
],
serialized_start=208,
serialized_end=303,
)
_UPDATEDOCUMENT = _descriptor.Descriptor(
name='UpdateDocument',
full_name='nexus.models.proto.UpdateDocument',
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name='fields', full_name='nexus.models.proto.UpdateDocument.fields', index=0,
number=1, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='should_fill_from_external_source', full_name='nexus.models.proto.UpdateDocument.should_fill_from_external_source', index=1,
number=2, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='commit', full_name='nexus.models.proto.UpdateDocument.commit', index=2,
number=3, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='reindex', full_name='nexus.models.proto.UpdateDocument.reindex', index=3,
number=4, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='typed_document', full_name='nexus.models.proto.UpdateDocument.typed_document', index=4,
number=5, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
],
extensions=[
],
nested_types=[],
enum_types=[
],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=306,
serialized_end=472,
)
_DOCUMENTOPERATION.fields_by_name['update_document'].message_type = _UPDATEDOCUMENT
_DOCUMENTOPERATION.oneofs_by_name['operation'].fields.append(
_DOCUMENTOPERATION.fields_by_name['update_document'])
_DOCUMENTOPERATION.fields_by_name['update_document'].containing_oneof = _DOCUMENTOPERATION.oneofs_by_name['operation']
_UPDATEDOCUMENT.fields_by_name['typed_document'].message_type = nexus_dot_models_dot_proto_dot_typed__document__pb2._TYPEDDOCUMENT
DESCRIPTOR.message_types_by_name['CrossReferenceOperation'] = _CROSSREFERENCEOPERATION
DESCRIPTOR.message_types_by_name['DocumentOperation'] = _DOCUMENTOPERATION
DESCRIPTOR.message_types_by_name['UpdateDocument'] = _UPDATEDOCUMENT
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
CrossReferenceOperation = _reflection.GeneratedProtocolMessageType('CrossReferenceOperation', (_message.Message,), {
'DESCRIPTOR' : _CROSSREFERENCEOPERATION,
'__module__' : 'nexus.models.proto.operation_pb2'
# @@protoc_insertion_point(class_scope:nexus.models.proto.CrossReferenceOperation)
})
_sym_db.RegisterMessage(CrossReferenceOperation)
DocumentOperation = _reflection.GeneratedProtocolMessageType('DocumentOperation', (_message.Message,), {
'DESCRIPTOR' : _DOCUMENTOPERATION,
'__module__' : 'nexus.models.proto.operation_pb2'
# @@protoc_insertion_point(class_scope:nexus.models.proto.DocumentOperation)
})
_sym_db.RegisterMessage(DocumentOperation)
UpdateDocument = _reflection.GeneratedProtocolMessageType('UpdateDocument', (_message.Message,), {
'DESCRIPTOR' : _UPDATEDOCUMENT,
'__module__' : 'nexus.models.proto.operation_pb2'
# @@protoc_insertion_point(class_scope:nexus.models.proto.UpdateDocument)
})
_sym_db.RegisterMessage(UpdateDocument)
# @@protoc_insertion_point(module_scope)

38
nexus/models/proto/scimag.proto Normal file
View File

@ -0,0 +1,38 @@
syntax = "proto3";
package nexus.models.proto;
message Scimag {
int64 id = 1;
string abstract = 2;
repeated string authors = 3;
string container_title = 11;
string doi = 4;
uint32 downloads_count = 27;
bytes embedding = 22;
uint32 filesize = 5;
uint32 first_page = 6;
repeated string ipfs_multihashes = 31;
bool is_deleted = 7;
repeated string issns = 25;
string issue = 10;
oneof optional_issued_at {
int64 issued_at = 26;
}
uint32 journal_id = 12;
string language = 13;
uint32 last_page = 14;
string meta_language = 15;
string md5 = 16;
int32 ref_by_count = 23;
repeated string references = 28;
oneof optional_scimag_bulk_id {
int32 scimag_bulk_id = 24;
}
repeated string tags = 17;
string telegram_file_id = 18;
string title = 19;
string type = 29;
int32 updated_at = 20;
string volume = 21;
string year = 30;
}

283
nexus/models/proto/scimag_pb2.py Normal file
View File

@ -0,0 +1,283 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: nexus/models/proto/scimag.proto
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='nexus/models/proto/scimag.proto',
package='nexus.models.proto',
syntax='proto3',
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\x1fnexus/models/proto/scimag.proto\x12\x12nexus.models.proto\"\xd9\x04\n\x06Scimag\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x10\n\x08\x61\x62stract\x18\x02 \x01(\t\x12\x0f\n\x07\x61uthors\x18\x03 \x03(\t\x12\x17\n\x0f\x63ontainer_title\x18\x0b \x01(\t\x12\x0b\n\x03\x64oi\x18\x04 \x01(\t\x12\x17\n\x0f\x64ownloads_count\x18\x1b \x01(\r\x12\x11\n\tembedding\x18\x16 \x01(\x0c\x12\x10\n\x08\x66ilesize\x18\x05 \x01(\r\x12\x12\n\nfirst_page\x18\x06 \x01(\r\x12\x18\n\x10ipfs_multihashes\x18\x1f \x03(\t\x12\x12\n\nis_deleted\x18\x07 \x01(\x08\x12\r\n\x05issns\x18\x19 \x03(\t\x12\r\n\x05issue\x18\n \x01(\t\x12\x13\n\tissued_at\x18\x1a \x01(\x03H\x00\x12\x12\n\njournal_id\x18\x0c \x01(\r\x12\x10\n\x08language\x18\r \x01(\t\x12\x11\n\tlast_page\x18\x0e \x01(\r\x12\x15\n\rmeta_language\x18\x0f \x01(\t\x12\x0b\n\x03md5\x18\x10 \x01(\t\x12\x14\n\x0cref_by_count\x18\x17 \x01(\x05\x12\x12\n\nreferences\x18\x1c \x03(\t\x12\x18\n\x0escimag_bulk_id\x18\x18 \x01(\x05H\x01\x12\x0c\n\x04tags\x18\x11 \x03(\t\x12\x18\n\x10telegram_file_id\x18\x12 \x01(\t\x12\r\n\x05title\x18\x13 \x01(\t\x12\x0c\n\x04type\x18\x1d \x01(\t\x12\x12\n\nupdated_at\x18\x14 \x01(\x05\x12\x0e\n\x06volume\x18\x15 \x01(\t\x12\x0c\n\x04year\x18\x1e \x01(\tB\x14\n\x12optional_issued_atB\x19\n\x17optional_scimag_bulk_idb\x06proto3'
)
_SCIMAG = _descriptor.Descriptor(
name='Scimag',
full_name='nexus.models.proto.Scimag',
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name='id', full_name='nexus.models.proto.Scimag.id', index=0,
number=1, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='abstract', full_name='nexus.models.proto.Scimag.abstract', index=1,
number=2, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='authors', full_name='nexus.models.proto.Scimag.authors', index=2,
number=3, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='container_title', full_name='nexus.models.proto.Scimag.container_title', index=3,
number=11, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='doi', full_name='nexus.models.proto.Scimag.doi', index=4,
number=4, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='downloads_count', full_name='nexus.models.proto.Scimag.downloads_count', index=5,
number=27, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='embedding', full_name='nexus.models.proto.Scimag.embedding', index=6,
number=22, type=12, cpp_type=9, label=1,
has_default_value=False, default_value=b"",
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='filesize', full_name='nexus.models.proto.Scimag.filesize', index=7,
number=5, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='first_page', full_name='nexus.models.proto.Scimag.first_page', index=8,
number=6, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='ipfs_multihashes', full_name='nexus.models.proto.Scimag.ipfs_multihashes', index=9,
number=31, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='is_deleted', full_name='nexus.models.proto.Scimag.is_deleted', index=10,
number=7, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='issns', full_name='nexus.models.proto.Scimag.issns', index=11,
number=25, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='issue', full_name='nexus.models.proto.Scimag.issue', index=12,
number=10, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='issued_at', full_name='nexus.models.proto.Scimag.issued_at', index=13,
number=26, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='journal_id', full_name='nexus.models.proto.Scimag.journal_id', index=14,
number=12, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='language', full_name='nexus.models.proto.Scimag.language', index=15,
number=13, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='last_page', full_name='nexus.models.proto.Scimag.last_page', index=16,
number=14, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='meta_language', full_name='nexus.models.proto.Scimag.meta_language', index=17,
number=15, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='md5', full_name='nexus.models.proto.Scimag.md5', index=18,
number=16, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='ref_by_count', full_name='nexus.models.proto.Scimag.ref_by_count', index=19,
number=23, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='references', full_name='nexus.models.proto.Scimag.references', index=20,
number=28, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='scimag_bulk_id', full_name='nexus.models.proto.Scimag.scimag_bulk_id', index=21,
number=24, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='tags', full_name='nexus.models.proto.Scimag.tags', index=22,
number=17, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='telegram_file_id', full_name='nexus.models.proto.Scimag.telegram_file_id', index=23,
number=18, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='title', full_name='nexus.models.proto.Scimag.title', index=24,
number=19, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='type', full_name='nexus.models.proto.Scimag.type', index=25,
number=29, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='updated_at', full_name='nexus.models.proto.Scimag.updated_at', index=26,
number=20, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='volume', full_name='nexus.models.proto.Scimag.volume', index=27,
number=21, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='year', full_name='nexus.models.proto.Scimag.year', index=28,
number=30, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
],
extensions=[
],
nested_types=[],
enum_types=[
],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
_descriptor.OneofDescriptor(
name='optional_issued_at', full_name='nexus.models.proto.Scimag.optional_issued_at',
index=0, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[]),
_descriptor.OneofDescriptor(
name='optional_scimag_bulk_id', full_name='nexus.models.proto.Scimag.optional_scimag_bulk_id',
index=1, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[]),
],
serialized_start=56,
serialized_end=657,
)
_SCIMAG.oneofs_by_name['optional_issued_at'].fields.append(
_SCIMAG.fields_by_name['issued_at'])
_SCIMAG.fields_by_name['issued_at'].containing_oneof = _SCIMAG.oneofs_by_name['optional_issued_at']
_SCIMAG.oneofs_by_name['optional_scimag_bulk_id'].fields.append(
_SCIMAG.fields_by_name['scimag_bulk_id'])
_SCIMAG.fields_by_name['scimag_bulk_id'].containing_oneof = _SCIMAG.oneofs_by_name['optional_scimag_bulk_id']
DESCRIPTOR.message_types_by_name['Scimag'] = _SCIMAG
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
Scimag = _reflection.GeneratedProtocolMessageType('Scimag', (_message.Message,), {
'DESCRIPTOR' : _SCIMAG,
'__module__' : 'nexus.models.proto.scimag_pb2'
# @@protoc_insertion_point(class_scope:nexus.models.proto.Scimag)
})
_sym_db.RegisterMessage(Scimag)
# @@protoc_insertion_point(module_scope)

36
nexus/models/proto/scitech.proto Normal file
View File

@ -0,0 +1,36 @@
syntax = "proto3";
package nexus.models.proto;
message Scitech {
int64 id = 1;
repeated string authors = 2;
string cu = 3;
string cu_suf = 4;
string description = 5;
string doi = 6;
uint32 downloads_count = 28;
string edition = 7;
string extension = 8;
int64 fiction_id = 9;
uint64 filesize = 10;
repeated string ipfs_multihashes = 30;
bool is_deleted = 11;
repeated string isbns = 12;
bool has_duplicates = 31;
oneof optional_issued_at {
int64 issued_at = 25;
}
string language = 13;
int64 libgen_id = 14;
string meta_language = 15;
string md5 = 16;
int64 original_id = 23;
uint32 pages = 17;
string series = 18;
repeated string tags = 19;
string telegram_file_id = 20;
string title = 21;
int32 updated_at = 22;
string volume = 24;
string year = 29;
}

275
nexus/models/proto/scitech_pb2.py Normal file
View File

@ -0,0 +1,275 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: nexus/models/proto/scitech.proto
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='nexus/models/proto/scitech.proto',
package='nexus.models.proto',
syntax='proto3',
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n nexus/models/proto/scitech.proto\x12\x12nexus.models.proto\"\xad\x04\n\x07Scitech\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x0f\n\x07\x61uthors\x18\x02 \x03(\t\x12\n\n\x02\x63u\x18\x03 \x01(\t\x12\x0e\n\x06\x63u_suf\x18\x04 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x05 \x01(\t\x12\x0b\n\x03\x64oi\x18\x06 \x01(\t\x12\x17\n\x0f\x64ownloads_count\x18\x1c \x01(\r\x12\x0f\n\x07\x65\x64ition\x18\x07 \x01(\t\x12\x11\n\textension\x18\x08 \x01(\t\x12\x12\n\nfiction_id\x18\t \x01(\x03\x12\x10\n\x08\x66ilesize\x18\n \x01(\x04\x12\x18\n\x10ipfs_multihashes\x18\x1e \x03(\t\x12\x12\n\nis_deleted\x18\x0b \x01(\x08\x12\r\n\x05isbns\x18\x0c \x03(\t\x12\x16\n\x0ehas_duplicates\x18\x1f \x01(\x08\x12\x13\n\tissued_at\x18\x19 \x01(\x03H\x00\x12\x10\n\x08language\x18\r \x01(\t\x12\x11\n\tlibgen_id\x18\x0e \x01(\x03\x12\x15\n\rmeta_language\x18\x0f \x01(\t\x12\x0b\n\x03md5\x18\x10 \x01(\t\x12\x13\n\x0boriginal_id\x18\x17 \x01(\x03\x12\r\n\x05pages\x18\x11 \x01(\r\x12\x0e\n\x06series\x18\x12 \x01(\t\x12\x0c\n\x04tags\x18\x13 \x03(\t\x12\x18\n\x10telegram_file_id\x18\x14 \x01(\t\x12\r\n\x05title\x18\x15 \x01(\t\x12\x12\n\nupdated_at\x18\x16 \x01(\x05\x12\x0e\n\x06volume\x18\x18 \x01(\t\x12\x0c\n\x04year\x18\x1d \x01(\tB\x14\n\x12optional_issued_atb\x06proto3'
)
_SCITECH = _descriptor.Descriptor(
name='Scitech',
full_name='nexus.models.proto.Scitech',
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name='id', full_name='nexus.models.proto.Scitech.id', index=0,
number=1, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='authors', full_name='nexus.models.proto.Scitech.authors', index=1,
number=2, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='cu', full_name='nexus.models.proto.Scitech.cu', index=2,
number=3, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='cu_suf', full_name='nexus.models.proto.Scitech.cu_suf', index=3,
number=4, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='description', full_name='nexus.models.proto.Scitech.description', index=4,
number=5, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='doi', full_name='nexus.models.proto.Scitech.doi', index=5,
number=6, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='downloads_count', full_name='nexus.models.proto.Scitech.downloads_count', index=6,
number=28, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='edition', full_name='nexus.models.proto.Scitech.edition', index=7,
number=7, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='extension', full_name='nexus.models.proto.Scitech.extension', index=8,
number=8, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='fiction_id', full_name='nexus.models.proto.Scitech.fiction_id', index=9,
number=9, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='filesize', full_name='nexus.models.proto.Scitech.filesize', index=10,
number=10, type=4, cpp_type=4, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='ipfs_multihashes', full_name='nexus.models.proto.Scitech.ipfs_multihashes', index=11,
number=30, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='is_deleted', full_name='nexus.models.proto.Scitech.is_deleted', index=12,
number=11, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='isbns', full_name='nexus.models.proto.Scitech.isbns', index=13,
number=12, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='has_duplicates', full_name='nexus.models.proto.Scitech.has_duplicates', index=14,
number=31, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='issued_at', full_name='nexus.models.proto.Scitech.issued_at', index=15,
number=25, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='language', full_name='nexus.models.proto.Scitech.language', index=16,
number=13, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='libgen_id', full_name='nexus.models.proto.Scitech.libgen_id', index=17,
number=14, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='meta_language', full_name='nexus.models.proto.Scitech.meta_language', index=18,
number=15, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='md5', full_name='nexus.models.proto.Scitech.md5', index=19,
number=16, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='original_id', full_name='nexus.models.proto.Scitech.original_id', index=20,
number=23, type=3, cpp_type=2, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='pages', full_name='nexus.models.proto.Scitech.pages', index=21,
number=17, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='series', full_name='nexus.models.proto.Scitech.series', index=22,
number=18, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='tags', full_name='nexus.models.proto.Scitech.tags', index=23,
number=19, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='telegram_file_id', full_name='nexus.models.proto.Scitech.telegram_file_id', index=24,
number=20, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='title', full_name='nexus.models.proto.Scitech.title', index=25,
number=21, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='updated_at', full_name='nexus.models.proto.Scitech.updated_at', index=26,
number=22, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='volume', full_name='nexus.models.proto.Scitech.volume', index=27,
number=24, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='year', full_name='nexus.models.proto.Scitech.year', index=28,
number=29, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
],
extensions=[
],
nested_types=[],
enum_types=[
],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
_descriptor.OneofDescriptor(
name='optional_issued_at', full_name='nexus.models.proto.Scitech.optional_issued_at',
index=0, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[]),
],
serialized_start=57,
serialized_end=614,
)
_SCITECH.oneofs_by_name['optional_issued_at'].fields.append(
_SCITECH.fields_by_name['issued_at'])
_SCITECH.fields_by_name['issued_at'].containing_oneof = _SCITECH.oneofs_by_name['optional_issued_at']
DESCRIPTOR.message_types_by_name['Scitech'] = _SCITECH
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
Scitech = _reflection.GeneratedProtocolMessageType('Scitech', (_message.Message,), {
'DESCRIPTOR' : _SCITECH,
'__module__' : 'nexus.models.proto.scitech_pb2'
# @@protoc_insertion_point(class_scope:nexus.models.proto.Scitech)
})
_sym_db.RegisterMessage(Scitech)
# @@protoc_insertion_point(module_scope)

12
nexus/models/proto/typed_document.proto Normal file
View File

@ -0,0 +1,12 @@
syntax = "proto3";
package nexus.models.proto;
import "nexus/models/proto/scimag.proto";
import "nexus/models/proto/scitech.proto";
message TypedDocument {
oneof document {
Scimag scimag = 1;
Scitech scitech = 2;
}
}

95
nexus/models/proto/typed_document_pb2.py Normal file
View File

@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: nexus/models/proto/typed_document.proto
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
from nexus.models.proto import \
scimag_pb2 as nexus_dot_models_dot_proto_dot_scimag__pb2
from nexus.models.proto import \
scitech_pb2 as nexus_dot_models_dot_proto_dot_scitech__pb2
DESCRIPTOR = _descriptor.FileDescriptor(
name='nexus/models/proto/typed_document.proto',
package='nexus.models.proto',
syntax='proto3',
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\'nexus/models/proto/typed_document.proto\x12\x12nexus.models.proto\x1a\x1fnexus/models/proto/scimag.proto\x1a nexus/models/proto/scitech.proto\"y\n\rTypedDocument\x12,\n\x06scimag\x18\x01 \x01(\x0b\x32\x1a.nexus.models.proto.ScimagH\x00\x12.\n\x07scitech\x18\x02 \x01(\x0b\x32\x1b.nexus.models.proto.ScitechH\x00\x42\n\n\x08\x64ocumentb\x06proto3'
,
dependencies=[nexus_dot_models_dot_proto_dot_scimag__pb2.DESCRIPTOR,nexus_dot_models_dot_proto_dot_scitech__pb2.DESCRIPTOR,])
_TYPEDDOCUMENT = _descriptor.Descriptor(
name='TypedDocument',
full_name='nexus.models.proto.TypedDocument',
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name='scimag', full_name='nexus.models.proto.TypedDocument.scimag', index=0,
number=1, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
_descriptor.FieldDescriptor(
name='scitech', full_name='nexus.models.proto.TypedDocument.scitech', index=1,
number=2, type=11, cpp_type=10, label=1,
has_default_value=False, default_value=None,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
],
extensions=[
],
nested_types=[],
enum_types=[
],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
_descriptor.OneofDescriptor(
name='document', full_name='nexus.models.proto.TypedDocument.document',
index=0, containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[]),
],
serialized_start=130,
serialized_end=251,
)
_TYPEDDOCUMENT.fields_by_name['scimag'].message_type = nexus_dot_models_dot_proto_dot_scimag__pb2._SCIMAG
_TYPEDDOCUMENT.fields_by_name['scitech'].message_type = nexus_dot_models_dot_proto_dot_scitech__pb2._SCITECH
_TYPEDDOCUMENT.oneofs_by_name['document'].fields.append(
_TYPEDDOCUMENT.fields_by_name['scimag'])
_TYPEDDOCUMENT.fields_by_name['scimag'].containing_oneof = _TYPEDDOCUMENT.oneofs_by_name['document']
_TYPEDDOCUMENT.oneofs_by_name['document'].fields.append(
_TYPEDDOCUMENT.fields_by_name['scitech'])
_TYPEDDOCUMENT.fields_by_name['scitech'].containing_oneof = _TYPEDDOCUMENT.oneofs_by_name['document']
DESCRIPTOR.message_types_by_name['TypedDocument'] = _TYPEDDOCUMENT
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
TypedDocument = _reflection.GeneratedProtocolMessageType('TypedDocument', (_message.Message,), {
'DESCRIPTOR' : _TYPEDDOCUMENT,
'__module__' : 'nexus.models.proto.typed_document_pb2'
# @@protoc_insertion_point(class_scope:nexus.models.proto.TypedDocument)
})
_sym_db.RegisterMessage(TypedDocument)
# @@protoc_insertion_point(module_scope)

20
nexus/nlptools/BUILD.bazel Normal file
View File

@ -0,0 +1,20 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
py_library(
name = "nlptools",
srcs = glob(
["**/*.py"],
exclude = ["tests/**"],
),
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
requirement("DAWG"),
requirement("emoji"),
requirement("lemminflect"),
requirement("pycld3"),
requirement("pymorphy2"),
requirement("spacy"),
],
)

View File

View File

@ -0,0 +1,9 @@
from typing import Optional
import cld3
def detect_language(text: str) -> Optional[str]:
    prediction = cld3.get_language(text)
    if prediction and prediction.is_reliable:
        if prediction.language.endswith('-Latn'):
            return prediction.language[:2]
        return prediction.language
    return None
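A usage sketch; the sample strings and expected outputs are illustrative:

```python
print(detect_language('The quick brown fox jumps over the lazy dog'))  # expected: 'en'
print(detect_language('mmm'))  # likely None: too short for a reliable prediction
```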

View File

@ -0,0 +1,24 @@
import enum
class Language(enum.IntEnum):
unknown_language = 0
am = 1
ar = 2
bn = 3
de = 4
en = 5
es = 6
fa = 7
fr = 8
hi = 9
id = 10
it = 11
ja = 12
ms = 13
pt = 14
ru = 15
tg = 16
uk = 17
uz = 18
zh = 19
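Hypothetical glue between `detect_language` (defined above) and this enum, assuming both are importable:

```python
code = detect_language('Это русский текст, написанный для примера.')  # e.g. 'ru'
language = Language[code] if code in Language.__members__ else Language.unknown_language
```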

50
nexus/nlptools/morph.py Normal file
View File

@ -0,0 +1,50 @@
import math
import lemminflect # noqa
import pymorphy2
import spacy
class EnglishMorphology:
VERBS = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
ADJS = {'JJ', 'JJR', 'JJS'}
NOUNS = {'NN', 'NNP', 'NNPS', 'NNS'}
ADVERBS = {'RB', 'RBR', 'RBS'}
WORD_KINDS = [VERBS, ADJS, NOUNS, ADVERBS]
def __init__(self, name):
self.nlp = spacy.load(name)
    def derive_forms(self, word):
        forms = set()
        word = self.nlp(word)[0]
        inflected = False
        for kind in self.WORD_KINDS:
            if word.tag_ in kind:
                for w in kind:
                    inflection = word._.inflect(w)
                    if inflection:
                        inflected = True
                        forms.add(inflection)
        if not inflected and word:
            forms.add(str(word))
        return sorted(forms)
class RussianMorphology:
def __init__(self):
self.morph_analyzer = pymorphy2.MorphAnalyzer()
def derive_forms(self, word):
words = set()
phrase_word_form = self.morph_analyzer.parse(word)[0]
for lexeme in phrase_word_form.lexeme:
if lexeme.word == word:
coef = 1.0
else:
coef = 1.0 / math.log1p(len(phrase_word_form.lexeme))
if 'Abbr' in lexeme.tag:
continue
words.add(f'{lexeme.word}^{coef:.2f}')
return list(sorted(words))
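A usage sketch; it assumes an installed spaCy English model such as `en_core_web_sm`, and the outputs shown are illustrative:

```python
english = EnglishMorphology('en_core_web_sm')
print(english.derive_forms('running'))
# e.g. ['ran', 'run', 'running', 'runs']

russian = RussianMorphology()
print(russian.derive_forms('книга'))
# e.g. ['книга^1.00', 'книгам^0.26', ...] (forms weighted against the lexeme size)
```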

32
nexus/nlptools/regex.py Normal file
View File

@ -0,0 +1,32 @@
import re
from emoji import get_emoji_regexp
ALNUMWHITESPACE_REGEX = re.compile(r'([^\s\w])+')
EMAIL_REGEX = re.compile(r'([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})')
EMOJI_REGEX = get_emoji_regexp()
HASHTAG_REGEX = re.compile(r'([#@]+)([A-Za-z0-9_]+)')
MULTIWHITESPACE_REGEX = re.compile(r"\s+")
STICKER_REGEX = re.compile(
'^[\U0001F1E0-\U0001F1FF'
'\U0001F300-\U0001F5FF'
'\U0001F600-\U0001F64F'
'\U0001F680-\U0001F6FF'
'\U0001F700-\U0001F77F'
'\U0001F780-\U0001F7FF'
'\U0001F800-\U0001F8FF'
'\U0001F900-\U0001F9FF'
'\U0001FA00-\U0001FA6F'
'\U0001FA70-\U0001FAFF'
'\U00002702-\U000027B0]$',
flags=re.UNICODE,
)
URL_REGEX = re.compile(r'^(https?|ftp)?:\/\/[^\s\/$.?#]+\.[^\s]*$')
HIDDEN_CHAR = ''
TELEGRAM_LINK_REGEX = re.compile('(?:https?://)?t\\.me/(?!joinchat/)([A-Za-z0-9_]+)')
DOI_REGEX = re.compile(r'(10.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])')
ISBN_REGEX = re.compile(r'^(?:[iI][sS][bB][nN]\:?\s*)?((97(8|9))?\-?\d{9}(\d|X))$')
MD5_REGEX = re.compile(r'([A-Fa-f0-9]{32})')
NID_REGEX = re.compile(r'(?:[Nn][Ii][Dd]\s?:?\s*)([0-9]+)')
PUBMED_ID_REGEX = re.compile(r'(?:(?:https?://)?(?:www.)?ncbi.nlm.nih.gov/pubmed/|[Pp][Mm][Ii][Dd]\s?:?\s*)([0-9]+)')
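Illustrative extraction with the patterns above (the sample identifiers are made up):

```python
text = 'PMID: 12345678 md5 d41d8cd98f00b204e9800998ecf8427e doi:10.1000/182'
print(PUBMED_ID_REGEX.search(text).group(1))  # '12345678'
print(MD5_REGEX.search(text).group(1))  # 'd41d8cd98f00b204e9800998ecf8427e'
print(DOI_REGEX.search(text).groups())  # ('10.1000', '182')
```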

12
nexus/nlptools/tests/BUILD.bazel Normal file
View File

@ -0,0 +1,12 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_test")
py_test(
name = "tests",
srcs = glob(["**/*.py"]),
main = "test.py",
deps = [
requirement("pytest"),
"//nexus/nlptools",
],
)

View File

View File

@ -0,0 +1,7 @@
import os
import sys
import pytest
if __name__ == '__main__':
sys.exit(pytest.main([os.path.dirname(__file__), '-vvv', '-W', 'ignore::DeprecationWarning']))

View File

@ -0,0 +1,20 @@
from nexus.nlptools.utils import (
cast_string_to_single_string,
despace,
remove_hashtags,
)
def test_cast_string_to_single_string():
assert cast_string_to_single_string('kek kek 123\nkek') == 'kek-kek-123-kek'
def test_despace():
assert despace(
'ArXiv Papers Related to Computer Science, AI , Deep Learning, Computer Vision, NLP, etc\n\n\n'
'From: @ai_python'
) == 'ArXiv Papers Related to Computer Science, AI , Deep Learning, Computer Vision, NLP, etc\nFrom: @ai_python'
def test_remove_hashtags():
assert remove_hashtags('#ny riot') == ' riot'

109
nexus/nlptools/utils.py Normal file
View File

@ -0,0 +1,109 @@
import re
import struct
import unicodedata
from .regex import (
ALNUMWHITESPACE_REGEX,
EMAIL_REGEX,
EMOJI_REGEX,
HASHTAG_REGEX,
MULTIWHITESPACE_REGEX,
TELEGRAM_LINK_REGEX,
URL_REGEX,
)
def add_surrogate(text):
return ''.join(
# SMP -> Surrogate Pairs (Telegram offsets are calculated with these).
# See https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview for more.
''.join(chr(y) for y in struct.unpack('<HH', x.encode('utf-16le')))
if (0x10000 <= ord(x) <= 0x10FFFF) else x for x in text
)
def cast_string_to_single_string(s):
processed = MULTIWHITESPACE_REGEX.sub(' ', ALNUMWHITESPACE_REGEX.sub(' ', s))
processed = processed.strip().replace(' ', '-')
return processed
def clean_text(text):
text = remove_markdown(remove_emoji(text))
text = remove_url(text)
text = despace_smart(text)
return text.strip()
def despace(text):
text = re.sub(r'\n+', '\n', text)
text = re.sub(r'[ \t]+', ' ', text)
text = re.sub(r'\n[ \t]+', '\n', text)
return text
def despace_full(text):
return re.sub(r'\s+', ' ', text).strip()
def despace_smart(text):
text = re.sub(r'\n\s*[-•]+\s*', r'\n', text)
text = re.sub(r'\n{2,}', r'\n', text).strip()
text = re.sub(r'\.?(\s+)?\n', r'. ', text)
text = re.sub(r'\s+', ' ', text)
return text
def escape_format(text):
text = text.replace("__", "_").replace("**", "*").replace("`", "'")
text = text.replace('[', r'`[`').replace(']', r'`]`')
return text
def remove_markdown(text):
text = re.sub('[*_~]{2,}', '', text)
text = re.sub('[`]+', '', text)
text = re.sub(r'\[\s*(.*?)(\s*)\]\(.*?\)', r'\g<1>\g<2>', text, flags=re.MULTILINE)
return text
def normalize_string(string):
string = re.sub('[^a-zA-Z0-9_\\-]+', '', string.lower().strip().replace(' ', '-'))
return unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')
def remove_emails(text):
return re.sub(EMAIL_REGEX, '', text)
def remove_emoji(text):
text = re.sub(EMOJI_REGEX, '', text)
text = re.sub(u'\ufe0f', '', text)
return text
def remove_hashtags(text):
return re.sub(HASHTAG_REGEX, '', text)
def remove_url(text):
return re.sub(URL_REGEX, '', text)
def replace_telegram_link(text):
return re.sub(TELEGRAM_LINK_REGEX, r'@\1', text)
def split_at(s, pos):
if len(s) < pos:
return s
pos -= 10
pos = max(0, pos)
for p in range(pos, min(pos + 20, len(s) - 1)):
if s[p] in [' ', '\n', '.', ',', ':', ';', '-']:
return s[:p] + '...'
return s[:pos] + '...'
def unwind_hashtags(text):
return re.sub(HASHTAG_REGEX, r'\2', text)

37
nexus/pipe/BUILD.bazel Normal file
View File

@ -0,0 +1,37 @@
load("@io_bazel_rules_docker//python3:image.bzl", "py3_image")
load("@pip_modules_external//:requirements.bzl", "requirement")
alias(
name = "binary",
actual = ":image.binary",
)
py3_image(
name = "image",
srcs = glob(["**/*.py"]),
base = "//images/production:base-python-image",
data = [
"configs/base.yaml",
"configs/logging.yaml",
],
main = "main.py",
srcs_version = "PY3ONLY",
visibility = ["//visibility:public"],
deps = [
requirement("aiokafka"),
requirement("orjson"),
requirement("pypika"),
requirement("aiocrossref"),
requirement("aiokit"),
"//library/aiopostgres",
"//library/configurator",
"//library/logging",
"//nexus/actions",
"//nexus/models/proto:models_proto_py",
"//nexus/summa/schema",
requirement("aiosumma"),
requirement("izihawa_utils"),
],
)

102
nexus/pipe/README.md Normal file
View File

@ -0,0 +1,102 @@
# Nexus Pipe
`Pipe` processes the Kafka queue of operations. The `configs` subdirectory has been cut
from this version because the configs rely heavily on the particular network infrastructure in use.
You have to write your own configs, taking the example below into account.
## Sample `configs/base.yaml`
```yaml
---
log_path: '/var/log/nexus-pipe/{{ ENV_TYPE }}'
pipe:
brokers: |
kafka-0.example.net,
kafka-1.example.net
schema:
- consumers:
- class: nexus.pipe.consumers.CrossReferencesBulkConsumer
topics:
- name: cross_references
workers: 4
group_id: pipe
processors:
- class: nexus.pipe.processors.CrossReferencesProcessor
kwargs:
brokers: |
kafka-0.example.net,
kafka-1.example.net
database:
database: nexus
host: postgres.example.net
password: '{{ DATABASE_PASSWORD }}'
username: '{{ DATABASE_USERNAME }}'
- consumers:
- class: nexus.pipe.consumers.DocumentOperationsJsonConsumer
topics:
- name: operations
workers: 2
- class: nexus.pipe.consumers.DocumentOperationsConsumer
topics:
- name: operations_binary_hp
workers: 4
- name: operations_binary
workers: 14
group_id: pipe
processors:
- class: nexus.pipe.processors.ActionProcessor
kwargs:
actions:
- class: nexus.actions.FillDocumentOperationUpdateDocumentScimagPbFromExternalSourceAction
kwargs:
crossref:
rps: 50
user_agent: 'ScienceLegion/1.0 (Linux x86_64; ) ScienceLegion/1.0.0'
- class: nexus.actions.CleanDocumentOperationUpdateDocumentScimagPbAction
- class: nexus.actions.SendDocumentOperationUpdateDocumentScimagPbToGoldenPostgresAction
kwargs:
database:
database: nexus
host: postgres.example.net
password: '{{ DATABASE_PASSWORD }}'
username: '{{ DATABASE_USERNAME }}'
- class: nexus.actions.SendDocumentOperationUpdateDocumentScimagPbReferencesToKafkaAction
kwargs:
brokers: |
kafka-0.example.net,
kafka-1.example.net
topic: cross_references
- class: nexus.actions.SendDocumentOperationUpdateDocumentPbToSummaAction
kwargs:
summa:
base_url: http://summa.example.net
timeout: 15
ttl_dns_cache: 30
filter:
class: nexus.pipe.filters.DocumentOperationFilter
kwargs:
document: scimag
operation: update_document
- class: nexus.pipe.processors.ActionProcessor
kwargs:
actions:
- class: nexus.actions.CleanDocumentOperationUpdateDocumentScitechPbAction
- class: nexus.actions.SendDocumentOperationUpdateDocumentScitechPbToGoldenPostgresAction
kwargs:
database:
database: nexus
host: postgres.example.net
password: '{{ DATABASE_PASSWORD }}'
username: '{{ DATABASE_USERNAME }}'
- class: nexus.actions.SendDocumentOperationUpdateDocumentPbToSummaAction
kwargs:
summa:
base_url: http://summa.example.net
timeout: 15
ttl_dns_cache: 30
filter:
class: nexus.pipe.filters.DocumentOperationFilter
kwargs:
document: scitech
operation: update_document
```
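Once the pipe is running, you can feed it operations directly. `DocumentOperationsJsonConsumer` reads plain JSON messages from the `operations` topic and parses them into `DocumentOperation` protobufs via `ParseDict`. Below is a minimal producer sketch; the broker address and DOI are placeholders, and the field names follow the protobuf usage elsewhere in this repository:

```python
import asyncio
import json

from aiokafka import AIOKafkaProducer


async def send_test_operation():
    # Placeholder broker address; substitute your own cluster
    producer = AIOKafkaProducer(bootstrap_servers='kafka-0.example.net')
    await producer.start()
    try:
        # Mirrors DocumentOperation(update_document=UpdateDocument(...)) as used
        # by the pipe; the DOI is a placeholder
        operation = {
            'update_document': {
                'commit': True,
                'reindex': True,
                'should_fill_from_external_source': True,
                'typed_document': {'scimag': {'doi': '10.1000/xyz123'}},
            },
        }
        await producer.send_and_wait('operations', json.dumps(operation).encode())
    finally:
        await producer.stop()

asyncio.run(send_test_operation())
```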

6
nexus/pipe/__init__.py Normal file
View File

@ -0,0 +1,6 @@
from . import (
consumers,
processors,
)
__all__ = ['consumers', 'processors']

17
nexus/pipe/consumers/__init__.py Normal file
View File

@ -0,0 +1,17 @@
from .cross_references_consumer import (
CrossReferencesBulkConsumer,
CrossReferencesConsumer,
)
from .document_operations_consumer import (
DocumentOperationsBulkConsumer,
DocumentOperationsConsumer,
DocumentOperationsJsonConsumer,
)
__all__ = [
'CrossReferencesBulkConsumer',
'CrossReferencesConsumer',
'DocumentOperationsConsumer',
'DocumentOperationsBulkConsumer',
'DocumentOperationsJsonConsumer',
]

142
nexus/pipe/consumers/base.py Normal file
View File

@ -0,0 +1,142 @@
from __future__ import annotations
import asyncio
import logging
from typing import (
List,
Union,
)
import orjson as json
from aiokafka import AIOKafkaConsumer
from aiokafka.errors import (
CommitFailedError,
ConsumerStoppedError,
)
from aiokit import AioRootThing
from google.protobuf.json_format import ParseDict
from nexus.actions.exceptions import (
ConflictError,
InterruptProcessing,
)
from nexus.pipe.processors.base import Processor
class BaseConsumer(AioRootThing):
def __init__(self, processors: List[Processor],
topic_names: Union[str, List[str]], bootstrap_servers: str, group_id: str):
super().__init__()
self.processors = processors
if isinstance(topic_names, str):
topic_names = [topic_names]
self.topic_names = topic_names
self.bootstrap_servers = bootstrap_servers
self.group_id = group_id
self.consumer = None
self.starts.extend(self.processors)
def create_consumer(self):
return AIOKafkaConsumer(
*self.topic_names,
auto_offset_reset='earliest',
loop=asyncio.get_event_loop(),
bootstrap_servers=self.bootstrap_servers,
group_id=self.group_id,
enable_auto_commit=False,
)
def preprocess(self, msg):
return msg
async def start(self):
logging.getLogger('statbox').info({
'action': 'started',
'group_id': self.group_id,
'topic_names': self.topic_names,
})
self.consumer = self.create_consumer()
await self.consumer.start()
try:
async for msg in self.consumer:
preprocessed_msg = self.preprocess(msg)
if preprocessed_msg:
for processor in self.processors:
if not processor.filter(preprocessed_msg):
continue
try:
await processor.process(preprocessed_msg)
except (ConflictError, InterruptProcessing) as e:
logging.getLogger('statbox').info(e)
except Exception as e:
logging.getLogger('error').error(e)
raise
try:
await self.consumer.commit()
except CommitFailedError as e:
logging.getLogger('error').error(e)
except ConsumerStoppedError:
pass
async def stop(self):
if not self.consumer:
return
await self.consumer.stop()
class BasePbConsumer(BaseConsumer):
pb_class = None
def preprocess(self, msg) -> pb_class:
pb = self.pb_class()
pb.ParseFromString(msg.value)
return pb
class BaseJsonConsumer(BaseConsumer):
pb_class = None
def preprocess(self, msg) -> pb_class:
pb = self.pb_class()
message = json.loads(msg.value)
ParseDict(message, pb, ignore_unknown_fields=True)
return pb
class BaseBulkConsumer(BaseConsumer):
bulk_size = 20
timeout = 1
async def start(self):
logging.getLogger('statbox').info({
'action': 'started',
'group_id': self.group_id,
'topic_names': self.topic_names,
})
self.consumer = self.create_consumer()
await self.consumer.start()
while self.started:
try:
result = await self.consumer.getmany(timeout_ms=self.timeout * 1000, max_records=self.bulk_size)
except ConsumerStoppedError:
break
collector = []
for tp, messages in result.items():
if messages:
for message in messages:
preprocessed_msg = self.preprocess(message)
if preprocessed_msg:
collector.append(preprocessed_msg)
for processor in self.processors:
filtered = filter(processor.filter, collector)
try:
await processor.process_bulk(filtered)
except InterruptProcessing as e:
logging.getLogger('statbox').info(e)
except Exception as e:
logging.getLogger('error').error(e)
raise
try:
await self.consumer.commit()
except CommitFailedError as e:
logging.getLogger('error').error(e)
continue

15
nexus/pipe/consumers/cross_references_consumer.py Normal file
View File

@ -0,0 +1,15 @@
from nexus.models.proto.operation_pb2 import \
CrossReferenceOperation as CrossReferenceOperationPb
from .base import (
BaseBulkConsumer,
BasePbConsumer,
)
class CrossReferencesConsumer(BasePbConsumer):
pb_class = CrossReferenceOperationPb
class CrossReferencesBulkConsumer(BaseBulkConsumer, CrossReferencesConsumer):
pass

20
nexus/pipe/consumers/document_operations_consumer.py Normal file
View File

@ -0,0 +1,20 @@
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from .base import (
BaseBulkConsumer,
BaseJsonConsumer,
BasePbConsumer,
)
class DocumentOperationsConsumer(BasePbConsumer):
pb_class = DocumentOperationPb
class DocumentOperationsJsonConsumer(BaseJsonConsumer):
pb_class = DocumentOperationPb
class DocumentOperationsBulkConsumer(BaseBulkConsumer, DocumentOperationsConsumer):
pass

16
nexus/pipe/filters/__init__.py Normal file
View File

@ -0,0 +1,16 @@
from aiokit import AioThing
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
class DocumentOperationFilter(AioThing):
def __init__(self, operation, document):
super().__init__()
self.operation = operation
self.document = document
def filter(self, document_operation_pb: DocumentOperationPb) -> bool:
if document_operation_pb.WhichOneof('operation') != self.operation:
return False
operation = getattr(document_operation_pb, document_operation_pb.WhichOneof('operation'))
return operation.typed_document.HasField(self.document)
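
For illustration, a small sketch of how `DocumentOperationFilter` behaves on a `DocumentOperation` (the DOI is a placeholder):

```python
from nexus.models.proto.operation_pb2 import (
    DocumentOperation,
    UpdateDocument,
)
from nexus.models.proto.scimag_pb2 import Scimag
from nexus.models.proto.typed_document_pb2 import TypedDocument
from nexus.pipe.filters import DocumentOperationFilter

# Accept only update_document operations carrying a scimag document
f = DocumentOperationFilter(operation='update_document', document='scimag')

op = DocumentOperation(
    update_document=UpdateDocument(
        typed_document=TypedDocument(scimag=Scimag(doi='10.1000/xyz123')),
    ),
)
assert f.filter(op) is True
```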

65
nexus/pipe/main.py Normal file
View File

@ -0,0 +1,65 @@
import logging
import ssl
from functools import partial
from aiokit import MultiprocessAsyncExecutor
from izihawa_utils.env import node_name
from izihawa_utils.importlib import (
import_object,
instantiate_object,
)
from library.logging import configure_logging
from nexus.pipe.configs import config
def create_aiothing(consumer_cls, topic_names, group_id, processors, shard):
processors = [instantiate_object(processor) for processor in processors]
return consumer_cls(
topic_names=topic_names,
processors=processors,
bootstrap_servers=config['pipe']['brokers'],
group_id=group_id,
)
# OpenSSL issue: https://github.com/psf/requests/issues/4775
def set_ssl_hack():
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
ssl_context.set_ciphers('HIGH:!DH:!aNULL')
ssl_context.set_ciphers('DEFAULT@SECLEVEL=1')
def main():
configure_logging(config)
set_ssl_hack()
logger = logging.getLogger('statbox')
logger.info({
'action': 'started',
'mode': 'startup',
})
create_aiothings = []
for instance_config in config['pipe']['schema']:
node_names = instance_config.get('node_names', [])
if node_names and node_name not in node_names:
continue
for consumer_config in instance_config['consumers']:
consumer_cls = import_object(consumer_config['class'])
for topic_config in consumer_config['topics']:
for _ in range(topic_config['workers']):
create_aiothings.append(partial(
create_aiothing,
consumer_cls,
topic_config['name'],
instance_config['group_id'],
instance_config['processors'],
))
executor = MultiprocessAsyncExecutor(create_aiothings=create_aiothings)
executor.start()
executor.join()
if __name__ == '__main__':
main()

7
nexus/pipe/processors/__init__.py Normal file
View File

@ -0,0 +1,7 @@
from .base import ActionProcessor
from .cross_references_processor import CrossReferencesProcessor
__all__ = [
'ActionProcessor',
'CrossReferencesProcessor',
]

41
nexus/pipe/processors/base.py Normal file
View File

@ -0,0 +1,41 @@
from __future__ import annotations
import asyncio.exceptions
from typing import Iterable
from aiokit import AioThing
from izihawa_utils.importlib import instantiate_object
from tenacity import (
retry,
retry_if_exception_type,
wait_fixed,
)
class Processor(AioThing):
def filter(self, message) -> bool:
return True
async def process(self, message):
return message
async def process_bulk(self, messages: Iterable):
for message in messages:
await self.process(message)
class ActionProcessor(Processor):
def __init__(self, actions, filter):
super().__init__()
self.actions = [instantiate_object(action) for action in actions]
self.filter_object = instantiate_object(filter)
self.waits.append(self.filter_object)
self.waits.extend(self.actions)
def filter(self, message) -> bool:
return self.filter_object.filter(message)
@retry(retry=retry_if_exception_type(asyncio.exceptions.TimeoutError), wait=wait_fixed(5))
async def process(self, message):
for action in self.actions:
message = await action.do(message)
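
`ActionProcessor` chains actions, each exposing an async `do(message)` that returns the (possibly transformed) message for the next action in the list. A minimal custom action might look like the sketch below; the class name is illustrative and not part of the repository:

```python
from aiokit import AioThing


class PrintDoiAction(AioThing):
    # Hypothetical pass-through action: prints the DOI of scimag
    # update operations and returns the message unchanged
    async def do(self, document_operation_pb):
        update = document_operation_pb.update_document
        if update.typed_document.HasField('scimag'):
            print(update.typed_document.scimag.doi)
        return document_operation_pb
```

Such a class can then be referenced from `configs/base.yaml` by its import path in the `actions` list, alongside the stock `nexus.actions` classes.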

145
nexus/pipe/processors/cross_references_processor.py Normal file
View File

@ -0,0 +1,145 @@
import asyncio
import logging
import time
from typing import Iterable
import aiopg
from aiokafka import AIOKafkaProducer
from izihawa_utils.exceptions import NeedRetryError
from library.aiopostgres.pool_holder import AioPostgresPoolHolder
from nexus.actions.common import canonize_doi
from nexus.models.proto.operation_pb2 import \
CrossReferenceOperation as CrossReferenceOperationPb
from nexus.models.proto.operation_pb2 import \
DocumentOperation as DocumentOperationPb
from nexus.models.proto.operation_pb2 import UpdateDocument as UpdateDocumentPb
from nexus.models.proto.scimag_pb2 import Scimag as ScimagPb
from nexus.models.proto.typed_document_pb2 import \
TypedDocument as TypedDocumentPb
from pypika import (
PostgreSQLQuery,
Table,
)
from tenacity import (
retry,
retry_if_exception_type,
wait_fixed,
)
from .base import Processor
class CrossReferencesProcessor(Processor):
scimag_table = Table('scimag')
cross_references_table = Table('cross_references')
topic = 'cross_references'
def __init__(self, brokers, database):
super().__init__()
self.pool_holder = AioPostgresPoolHolder(
fn=aiopg.create_pool,
dsn=f'dbname={database["database"]} '
f'user={database["username"]} '
f'password={database["password"]} '
f'host={database["host"]}',
timeout=30,
pool_recycle=60,
maxsize=4,
)
self.brokers = brokers
self.producer = None
self.waits.append(self.pool_holder)
async def start(self):
self.producer = self.get_producer()
await self.producer.start()
    async def stop(self):
        if not self.producer:
            return
        await self.producer.stop()
        self.producer = None
def get_producer(self):
return AIOKafkaProducer(
loop=asyncio.get_event_loop(),
bootstrap_servers=self.brokers,
)
@retry(retry=retry_if_exception_type(NeedRetryError), wait=wait_fixed(15))
async def process_bulk(self, messages: Iterable[CrossReferenceOperationPb]):
need_delay = False
for message in messages:
if message.retry_count > 1:
logging.getLogger('error').warning({
'status': 'error',
'error': 'not_found',
'source': message.source,
'target': message.target,
})
continue
now = time.time()
if now - message.last_retry_unixtime < 60:
need_delay = True
await self.producer.send_and_wait(
'cross_references',
message.SerializeToString(),
)
continue
source = canonize_doi(message.source)
target = canonize_doi(message.target)
target_row = await self.pool_holder.execute(
PostgreSQLQuery
.from_('scimag')
.select('id')
.where(self.scimag_table.doi == target)
.get_sql(),
fetch=True,
)
if not target_row:
if message.retry_count == 0:
document_operation = DocumentOperationPb(
update_document=UpdateDocumentPb(
commit=True,
reindex=True,
should_fill_from_external_source=True,
typed_document=TypedDocumentPb(scimag=ScimagPb(doi=target)),
),
)
await self.producer.send_and_wait(
'operations_binary_hp',
document_operation.SerializeToString(),
)
new_message = CrossReferenceOperationPb()
new_message.CopyFrom(message)
new_message.retry_count += 1
new_message.last_retry_unixtime = int(time.time())
await self.producer.send_and_wait(
self.topic,
new_message.SerializeToString(),
)
continue
target_id = target_row[0][0]
source_subquery = (
PostgreSQLQuery
.from_('scimag')
.select('id')
.where(self.scimag_table.doi == source)
)
await self.pool_holder.execute(
PostgreSQLQuery
.into('cross_references')
.columns(
'source_id',
'target_id',
)
.insert(source_subquery, target_id)
.on_conflict(self.cross_references_table.source_id, self.cross_references_table.target_id)
.do_nothing()
.get_sql()
)
if need_delay:
await asyncio.sleep(1.0)

3
nexus/summa/BUILD.bazel Normal file
View File

@ -0,0 +1,3 @@
package(default_visibility = ["//visibility:public"])

115
nexus/summa/README.md Normal file
View File

@ -0,0 +1,115 @@
# Summa Setup Scripts
## Guide
#### 1. Find data dumps
Current version: `20210103.1`
| File | IPFS |
| --------------------|:------------------------------------------------:|
| `scitech.index.tar` | `QmVaWFRNTHC3ser4ViHybcD7nuhv2CUAorhXs4JbYYHYm7` |
| `scitech.store.tar` | `QmP3p577gRokXXtusRYXXV7MtF3pVmGSdNEUE5TwFzRtAm` |
| `scimag.index.tar` | `<upcoming>` |
| `scimag.store.tar` | `<upcoming>` |
If the files are not available, ask the guys from beyond the blackwall.
#### 2. Deploy data dumps to Summa
```shell script
bazel run -c opt installer -- import-to-summa \
--store-filepath scimag.store.tar \
--index-filepath scimag.index.tar \
--schema-filepath schema/scimag.yaml \
--database-path /tmp/summa
bazel run -c opt installer -- import-to-summa \
--store-filepath scitech.store.tar \
--index-filepath scitech.index.tar \
--schema-filepath schema/scitech.yaml \
--database-path /tmp/summa
```
#### 3. Launch Summa
```shell script
docker run -e ENV_TYPE=production \
-v /tmp/summa:/summa -v $(realpath configs/config.yaml):/summa/config.yaml \
-p 50000:80 izihawa/summa:latest -c /summa/config.yaml
```
#### 4. Use it
```shell script
curl "localhost:50000/v1/scitech/search/?query=covid&page_size=2" | python3 -m json.tool
```
```json
{
"has_next": true,
"scored_documents": [
{
"schema": "scitech",
"document": {
"authors": [
"National committee for Management of COVID-19 Cases (Dubai Health Authority)"
],
"cu_suf": "g",
"description": "Objectives\r\nThe objectives of this document are:\r\n\u2022 To provide guidance on clinical management of the COVID-19 infection\r\n\u2022 To provide a protocol on the practical steps to deal with COVID-19 cases\r\n\u2022 To detail the measures necessary to protect hospital staff, patients and visitors\r\n\u2022 This guideline is not intended to override the clinical decisions that will be made by clinicians providing individualized patient care.\r\n\u2022 This guideline will be updated as more information becomes available.\r\nIntroduction to Coronaviruses (CoV)\r\n\u2022 Corona virus is a large family of viruses that cause illness in humans and animals\r\n\u2022 In people, CoV can cause illness ranging in severity from the common cold to SARS.\r\n\u2022 SARS COV2 is one of seven types of known human coronaviruses. SARS COV2 like the MERS and SARS coronaviruses, likely evolved from a virus previously found in animals\r\n\u2022 The estimated incubation period is unknown and currently considered to be up to 14 days\r\nCase Definition:\r\nSuspected COVID-19 case is defined as:\r\n1. Please refer to the local health authority websites for updated information on local case definition.\r\nMOHAP, DoH, SEHA and DHA\r\nConfirmed COVID-19 is defined as:\r\nA person with confirmed positive COVID-19 test by a reference laboratory.",
"extension": "pdf",
"filesize": 2240001,
"id": 100126757,
"ipfs_multihashes": [
"bafykbzacebasnsyh4sypqcojwmsd7ujw3ymogwhnx5vhywk7syptxovkyyzvk",
"QmSd3tYXxJnWzm8vxpW1M6uxLhvBSpSLQd7cHjdsaoE38D"
],
"issued_at": 1577836800,
"language": "en",
"libgen_id": 2492432,
"md5": "faf8bcab6ce58a59b3ed09f1e1d9270e",
"tags": [
"COVID-19 Treatment"
],
"title": "National Guidelines for Clinical Management and Treatment of COVID-19 (March 19, 2020) Version 1.1"
},
"score": 36.404663
},
{
"schema": "scitech",
"document": {
"authors": [
"Dr. Tinku Joseph, Dr. Mohammed Ashkan"
],
"cu_suf": "g",
"description": "Corona virus comprises of a large family of viruses that are common in human beings as\r\nwell animals (camels, cattle, cats, and bats). There are seven different strains of corona\r\nvirus. [15]\r\n229E (alpha coronavirus)\r\nNL63 (alpha coronavirus)\r\nOC43 (beta coronavirus)\r\nHKU1 (beta coronavirus)\r\nMERS-CoV (the beta coronavirus that causes Middle East Respiratory Syndrome, or\r\nMERS)\r\nSARS-CoV (the beta coronavirus that causes severe acute respiratory syndrome, or\r\nSARS)\r\nSARS-CoV-2 (the novel coronavirus that causes coronavirus disease 2019, or\r\nCOVID-19)\r\nSometimes corona virus from animals infect people and spread further via human to human\r\ntransmission such as with MERS-CoV, SARS-CoV, and now with this COVID 19 (Corona\r\ndisease 2019). The virus that causes COVID-19 is designated severe acute respiratory\r\nsyndrome corona virus 2 (SARS-CoV-2); previously, referred to as 2019-nCoV.\r\nTowards December 2019, this novel corona virus was identified as a cause of upper and\r\nlower respiratory tract infections in Wuhan, a city in the Hubei Province of China. It rapidly\r\nspread, resulting in an epidemic throughout China and then gradually spreading to other\r\nparts of the world in pandemic proportions. It has affected almost every continent in this\r\nworld, except Antarctica. In February 2020, the World Health Organization designated the\r\ndisease COVID-19, which stands for corona virus disease 2019 [1].",
"extension": "pdf",
"filesize": 1512761,
"id": 100110426,
"issued_at": 1577836800,
"language": "en",
"libgen_id": 2494250,
"md5": "23015d4934b216fe797b18b561267fe4",
"pages": 43,
"tags": [
"COVID-19"
],
"title": "International Pulmonologist\u2019s Consensus on COVID-19"
},
"score": 32.969494
}
]
}
```
#### 5. (Optional) Deploy data dumps into your database
There is a function `work` in the [traversing script](installer/scripts/iterate.py)
that you can reimplement to iterate over the whole dataset in parallel and insert it into your
own database, or do whatever else you want with it.
By default, the script just prints each document; see the sketch after the command below.
```shell script
bazel run -c opt installer -- iterate \
--store-filepath scitech.store.tar \
--schema-filepath schema/scitech.yaml
```
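
For example, a drop-in replacement for `work` that appends every document to a JSON Lines file could look like this sketch (assuming documents arrive as plain dicts; the output path is a placeholder):

```python
import json


def work(document):
    # Append each document as a single JSON line
    with open('/tmp/scitech.jsonl', 'a') as output:
        output.write(json.dumps(document, ensure_ascii=False) + '\n')
```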

0
nexus/summa/__init__.py Normal file
View File

14
nexus/summa/configs/config.yaml Normal file
View File

@ -0,0 +1,14 @@
---
http:
bind_addr: 0.0.0.0:80
keep_alive_secs: 75
max_body_size_mb: 32
workers: 4
log_path: /var/log/summa/{{ ENV_TYPE }}
search_engine:
data_path: /summa
default_page_size: 5
timeout_secs: 5
writer_memory_mb: 4096
writer_threads: 4

19
nexus/summa/installer/BUILD.bazel Normal file
View File

@ -0,0 +1,19 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_binary")
py_binary(
name = "installer",
srcs = glob([
"**/*.py",
]),
imports = ["."],
main = "main.py",
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
requirement("psycopg2-binary"),
requirement("fire"),
requirement("tantipy"),
requirement("izihawa_utils"),
],
)

0
nexus/summa/installer/__init__.py Normal file
View File

13
nexus/summa/installer/main.py Normal file
View File

@ -0,0 +1,13 @@
import time
import fire
from nexus.summa.installer.scripts.import_to_summa import import_to_summa
from nexus.summa.installer.scripts.iterate import iterate
if __name__ == '__main__':
start = time.time()
fire.Fire({
'import-to-summa': import_to_summa,
'iterate': iterate,
})
print(f'Elapsed {time.time() - start:.2f} secs')

9
nexus/summa/installer/scripts/common.py Normal file
View File

@ -0,0 +1,9 @@
import os
def resolve_path(filepath):
if os.path.isabs(filepath):
return filepath
cwd = os.environ.get('BUILD_WORKING_DIRECTORY', os.getcwd())
filepath = os.path.join(cwd, filepath)
return filepath

26
nexus/summa/installer/scripts/import_to_summa.py Normal file
View File

@ -0,0 +1,26 @@
import os
import shutil
import tarfile
import yaml
from izihawa_utils.file import mkdir_p
from .common import resolve_path
def import_to_summa(store_filepath, index_filepath, schema_filepath, database_path):
store_filepath = resolve_path(store_filepath)
index_filepath = resolve_path(index_filepath)
schema_filepath = resolve_path(schema_filepath)
database_path = resolve_path(database_path)
mkdir_p(os.path.join(database_path, 'schema'))
mkdir_p(os.path.join(database_path, 'index'))
shutil.copy(schema_filepath, os.path.join(database_path, 'schema', os.path.basename(schema_filepath)))
with open(schema_filepath, 'r') as f:
database_path = os.path.join(database_path, 'index', yaml.safe_load(f)['name'])
with tarfile.open(store_filepath) as f:
f.extractall(database_path)
with tarfile.open(index_filepath) as f:
f.extractall(database_path)

51
nexus/summa/installer/scripts/iterate.py Normal file
View File

@ -0,0 +1,51 @@
import multiprocessing
import tarfile
from functools import partial
import yaml
from izihawa_utils.itertools import ichunks
from tantipy import (
TantivyCoder,
TantivyReader,
)
from .common import resolve_path
def work(document):
    # TODO: Replace the body of this function with whatever you want to do with each document
print(document)
def _do_work(coder, filepath, chunk_size, limit, member):
with tarfile.open(filepath, 'r') as tar_file:
file = tar_file.extractfile(member)
data = file.read()
print(f'Processing segment {member.name}, size: {len(data) / (1024 * 1024):.2f} Mb ...')
tr = TantivyReader(data, coder=coder)
for chunk_num, documents in enumerate(ichunks(tr.documents(), chunk_size)):
for doc_num, document in enumerate(documents):
if chunk_num * chunk_size + doc_num > limit:
print(f'Segment {member.name} early terminated due to limits')
return
work(document)
print(f'Segment {member.name} successfully processed')
def iterate(store_filepath, schema_filepath, processes=8, chunk_size=100, limit=1):
store_filepath = resolve_path(store_filepath)
schema_filepath = resolve_path(schema_filepath)
with open(schema_filepath) as schema_file:
coder = TantivyCoder(yaml.safe_load(schema_file.read()))
with tarfile.open(store_filepath, 'r') as tar_file:
members = []
for member in tar_file.getmembers():
if not member.name.endswith('store'):
continue
members.append(member)
print(f'Total segments: {len(members)}')
pool = multiprocessing.Pool(processes)
pool.map(partial(_do_work, coder, store_filepath, chunk_size, limit), members)

24
nexus/summa/schema/BUILD.bazel Normal file
View File

@ -0,0 +1,24 @@
load("@pip_modules_external//:requirements.bzl", "requirement")
load("@rules_python//python:defs.bzl", "py_library")
exports_files([
"scimag.yaml",
"scitech.yaml",
])
py_library(
name = "schema",
srcs = glob([
"**/*.py",
]),
data = [
"scimag.yaml",
"scitech.yaml",
],
srcs_version = "PY3",
visibility = ["//visibility:public"],
deps = [
requirement("tantipy"),
requirement("pyyaml"),
],
)

7
nexus/summa/schema/__init__.py Normal file
View File

@ -0,0 +1,7 @@
from .scimag import scimag_coder
from .scitech import scitech_coder
coders = {
'scimag': scimag_coder,
'scitech': scitech_coder,
}

5
nexus/summa/schema/scimag.py Normal file
View File

@ -0,0 +1,5 @@
import yaml
from tantipy import TantivyCoder
with open('nexus/summa/schema/scimag.yaml') as file:
scimag_coder = TantivyCoder(yaml.safe_load(file.read()))

122
nexus/summa/schema/scimag.yaml Normal file
View File

@ -0,0 +1,122 @@
---
# yamllint disable rule:key-ordering
default_fields: ["abstract", "authors", "language", "title", "tags", "year"]
enabled: true
key_field: "id"
multi_fields: ["authors", "ipfs_multihashes", "issns", "references", "tags"]
name: scimag
schema:
- name: id
type: i64
options:
fast: single
indexed: true
stored: true
- name: abstract
type: text
options:
indexing:
record: position
tokenizer: default
stored: true
- name: authors
type: text
options:
indexing:
record: position
tokenizer: default
stored: true
- name: doi
type: text
options:
indexing:
record: basic
tokenizer: raw
stored: true
- name: first_page
type: i64
options:
indexed: false
stored: true
- name: container_title
type: text
options:
indexing:
record: position
tokenizer: default
stored: true
- name: issns
type: text
options:
indexing: null
stored: true
- name: issue
type: text
options:
indexing: null
stored: true
- name: issued_at
type: i64
options:
indexed: true
stored: true
- name: language
type: text
options:
indexing:
record: basic
tokenizer: raw
stored: true
- name: last_page
type: i64
options:
indexed: false
stored: true
- name: ref_by_count
type: i64
options:
indexed: false
stored: true
- name: references
type: text
options:
indexing:
record: basic
tokenizer: raw
stored: false
- name: scimag_bulk_id
type: i64
options:
indexed: false
stored: true
- name: tags
type: text
options:
indexing:
record: position
tokenizer: default
stored: true
- name: title
type: text
options:
indexing:
record: position
tokenizer: default
stored: true
- name: updated_at
type: i64
options:
indexed: true
stored: true
- name: volume
type: text
options:
indexing: null
stored: true
- name: year
type: text
options:
indexing:
record: basic
tokenizer: raw
stored: false

5
nexus/summa/schema/scitech.py Normal file
View File

@ -0,0 +1,5 @@
import yaml
from tantipy import TantivyCoder
with open('nexus/summa/schema/scitech.yaml') as file:
scitech_coder = TantivyCoder(yaml.safe_load(file.read()))

Some files were not shown because too many files have changed in this diff