# Mirror of https://github.com/nexus-stc/hyperboria
# Synced 2025-01-02 15:05:48 +01:00
# 47 lines, 1.5 KiB, Python
|
import glob
|
||
|
import multiprocessing
|
||
|
import os
|
||
|
from functools import partial
|
||
|
|
||
|
import yaml
|
||
|
from izihawa_utils.itertools import ichunks
|
||
|
from tantipy import (
|
||
|
TantivyCoder,
|
||
|
TantivyReader,
|
||
|
)
|
||
|
|
||
|
from .common import resolve_path
|
||
|
|
||
|
|
||
|
def work(document):
    """Handle a single document read from a store segment.

    Placeholder hook: the default implementation only prints the document
    to stdout.

    Args:
        document: The document object yielded by the segment reader.
    """
    # ToDo: Replace this function to what you want to do with document
    print(document)
|
||
|
|
||
|
|
||
|
def _do_work(coder, chunk_size, limit, store_filepath):
    """Read one ``.store`` segment file and pass its documents to ``work``.

    Args:
        coder: Forwarded to ``TantivyReader`` as its ``coder`` argument.
        chunk_size: Chunk size handed to ``ichunks`` when batching the
            reader's documents.
        limit: Maximum number of documents to process from this segment.
            A falsy value (``0`` or ``None``) disables the limit.
        store_filepath: Path of the segment file to read.
    """
    # The reader is built from the raw bytes, so the whole segment file is
    # loaded into memory before any document is processed.
    with open(store_filepath, 'rb') as file:
        data = file.read()
    print(f'Processing segment {store_filepath}, size: {len(data) / (1024 * 1024):.2f} Mb ...')
    tr = TantivyReader(data, coder=coder)

    # A running counter avoids assuming every chunk holds exactly `chunk_size`
    # documents.
    processed = 0
    for documents in ichunks(tr.documents(), chunk_size):
        for document in documents:
            # Check before working, with `>=`, so that exactly `limit`
            # documents are handled (the previous `>` let one extra through).
            if limit and processed >= limit:
                print(f'Segment {store_filepath} early terminated due to limits')
                return
            work(document)
            processed += 1
    print(f'Segment {store_filepath} successfully processed')
|
||
|
|
||
|
|
||
|
def iterate(data_filepath, schema_filepath, processes=8, chunk_size=100, limit=1):
    """Process every ``*.store`` segment in a directory using a process pool.

    Args:
        data_filepath: Directory searched for ``*.store`` files; passed
            through ``resolve_path`` first.
        schema_filepath: Path to the YAML schema file used to build the
            ``TantivyCoder``; passed through ``resolve_path`` first.
        processes: Number of worker processes in the pool.
        chunk_size: Chunk size forwarded to each ``_do_work`` call.
        limit: Per-segment document limit forwarded to each ``_do_work``
            call; a falsy value disables it.
    """
    data_filepath = resolve_path(data_filepath)
    schema_filepath = resolve_path(schema_filepath)

    with open(schema_filepath) as schema_file:
        coder = TantivyCoder(yaml.safe_load(schema_file.read()))

    store_filepaths = glob.glob(os.path.join(data_filepath, '*.store'))

    print(f'Total segments: {len(store_filepaths)}')
    # Use the pool as a context manager so worker processes are always
    # released, even if a worker raises. `map` blocks until every segment is
    # done, so terminating the pool on exit cannot cut work short.
    with multiprocessing.Pool(processes) as pool:
        pool.map(partial(_do_work, coder, chunk_size, limit), store_filepaths)
|