mirror of
https://github.com/nexus-stc/hyperboria
synced 2024-12-30 13:35:49 +01:00
d51e5ab65d
- fix: Translation fixes - fix: Various fixes - feat: PB translations, configuration changes - fix: Bugfixes GitOrigin-RevId: 55f8b148c42a296162fc707c36a5146ca0073b4b
47 lines · 1.5 KiB · Python
import glob
|
|
import multiprocessing
|
|
import os
|
|
from functools import partial
|
|
|
|
import yaml
|
|
from izihawa_utils.itertools import ichunks
|
|
from tantipy import (
|
|
TantivyCoder,
|
|
TantivyReader,
|
|
)
|
|
|
|
from .common import resolve_path
|
|
|
|
|
|
def work(document):
    """Handle one decoded document.

    ToDo: replace this stub with the real per-document processing.
    """
    print(document)
|
|
|
|
|
|
def _do_work(coder, chunk_size, limit, store_filepath):
    """Decode one ``.store`` segment and feed each document to ``work``.

    Args:
        coder: TantivyCoder used by the reader to decode documents.
        chunk_size: number of documents pulled from the reader per chunk.
        limit: if truthy, stop after processing exactly ``limit`` documents.
        store_filepath: path to the ``.store`` segment file to read.
    """
    with open(store_filepath, 'rb') as file:
        data = file.read()
    print(f'Processing segment {store_filepath}, size: {len(data) / (1024 * 1024):.2f} Mb ...')
    tr = TantivyReader(data, coder=coder)
    for chunk_num, documents in enumerate(ichunks(tr.documents(), chunk_size)):
        for doc_num, document in enumerate(documents):
            # `chunk_num * chunk_size + doc_num` is the 0-based global index.
            # Fix off-by-one: the original `> limit` let `limit + 1` documents
            # through; `>=` stops after exactly `limit` documents.
            if limit and chunk_num * chunk_size + doc_num >= limit:
                print(f'Segment {store_filepath} early terminated due to limits')
                return
            work(document)
    print(f'Segment {store_filepath} successfully processed')
|
|
|
|
|
|
def iterate(data_filepath, schema_filepath, processes=8, chunk_size=100, limit=1):
    """Fan out processing of every ``.store`` segment under ``data_filepath``.

    Args:
        data_filepath: directory containing ``*.store`` segment files.
        schema_filepath: YAML file describing the Tantivy schema.
        processes: number of worker processes in the pool.
        chunk_size: documents per chunk, forwarded to ``_do_work``.
        limit: per-segment document limit, forwarded to ``_do_work``.
    """
    data_filepath = resolve_path(data_filepath)
    schema_filepath = resolve_path(schema_filepath)

    with open(schema_filepath) as schema_file:
        coder = TantivyCoder(yaml.safe_load(schema_file.read()))

    store_filepaths = glob.glob(os.path.join(data_filepath, '*.store'))

    print(f'Total segments: {len(store_filepaths)}')
    # Fix resource leak: the original never closed/joined the pool; the
    # context manager tears the workers down even if a worker raises.
    # `map` is synchronous, so all results are in before the pool exits.
    with multiprocessing.Pool(processes) as pool:
        pool.map(partial(_do_work, coder, chunk_size, limit), store_filepaths)
|