# Mirror of https://github.com/nexus-stc/hyperboria
# Synced 2024-12-21 09:07:46 +01:00
# Commit 9580401bc9 — Updated tantipy version
# GitOrigin-RevId: 11e96ee1b3d5ffa67c8caf87be9e41e3a297ed70
# 52 lines, 1.7 KiB, Python
import multiprocessing
import tarfile
from functools import partial

import yaml
from izihawa_utils.itertools import ichunks
from tantipy import (
    TantivyCoder,
    TantivyReader,
)

from .common import resolve_path
def work(document):
    """Handle a single decoded document.

    TODO: replace this stub with whatever per-document processing you need;
    for now it just echoes the document to stdout.
    """
    print(document)
def _do_work(coder, filepath, chunk_size, limit, member):
    """Decode and process every document stored in one tar member.

    Runs inside a worker process, so the tar archive is re-opened here
    (open tarfile handles are not picklable across processes).

    Args:
        coder: TantivyCoder used by TantivyReader to decode segment bytes.
        filepath: path to the tar archive containing the segment stores.
        chunk_size: number of documents per chunk while iterating.
        limit: if truthy, stop once the running document index exceeds it
            (original semantics preserved: indices <= limit are processed).
        member: tarfile.TarInfo identifying the segment store to read.
    """
    with tarfile.open(filepath, 'r') as tar_file:
        file = tar_file.extractfile(member)
        # extractfile() returns None for non-regular members (directories,
        # links); the original would have crashed on `.read()`.
        if file is None:
            print(f'Segment {member.name} skipped: not a regular file')
            return
        # Close the extracted stream as soon as the bytes are in memory.
        with file:
            data = file.read()
        print(f'Processing segment {member.name}, size: {len(data) / (1024 * 1024):.2f} Mb ...')
        tr = TantivyReader(data, coder=coder)
        for chunk_num, documents in enumerate(ichunks(tr.documents(), chunk_size)):
            for doc_num, document in enumerate(documents):
                if limit and chunk_num * chunk_size + doc_num > limit:
                    print(f'Segment {member.name} early terminated due to limits')
                    return
                work(document)
        print(f'Segment {member.name} successfully processed')
def iterate(store_filepath, schema_filepath, processes=8, chunk_size=100, limit=1):
    """Fan out processing of every segment store inside a tar archive.

    Loads the Tantivy schema, lists all `*store` members of the archive,
    and maps `_do_work` over them with a process pool.

    Args:
        store_filepath: path to the tar archive with segment stores.
        schema_filepath: path to the YAML schema used to build the coder.
        processes: number of worker processes in the pool.
        chunk_size: documents per chunk, forwarded to `_do_work`.
        limit: per-segment document limit, forwarded to `_do_work`.
    """
    store_filepath = resolve_path(store_filepath)
    schema_filepath = resolve_path(schema_filepath)

    with open(schema_filepath) as schema_file:
        coder = TantivyCoder(yaml.safe_load(schema_file.read()))

    with tarfile.open(store_filepath, 'r') as tar_file:
        # Only members named `...store` hold document stores.
        members = [m for m in tar_file.getmembers() if m.name.endswith('store')]

        print(f'Total segments: {len(members)}')
        # Context manager terminates and reaps the pool even on error;
        # the original never closed or joined it (process leak).
        with multiprocessing.Pool(processes) as pool:
            pool.map(partial(_do_work, coder, store_filepath, chunk_size, limit), members)