hyperboria/nexus/meta_api/mergers/top_docs.py

import heapq
from typing import List

from summa.proto import search_service_pb2


class TopDocsIterator:
    def __init__(self, top_docs_collector: search_service_pb2.TopDocsCollectorOutput):
        self.top_docs_collector = top_docs_collector
        self._current = 0

    def __lt__(self, other: 'TopDocsIterator'):
        self_score = self.current().score
        other_score = other.current().score
        self_score = getattr(self_score, self_score.WhichOneof('score'))
        other_score = getattr(other_score, other_score.WhichOneof('score'))
        return self_score > other_score

    def __iter__(self):
        return self

    def __next__(self) -> search_service_pb2.ScoredDocument:
        if self.has_any():
            item = self.current()
            self._current += 1
            return item
        raise StopIteration

    def current(self):
        return self.top_docs_collector.scored_documents[self._current]

    def has_next(self) -> bool:
        return self.top_docs_collector.has_next

    def has_any(self) -> bool:
        return self._current < len(self.top_docs_collector.scored_documents)


class TopDocsMerger:
    def __init__(self, top_docs_collectors: List[search_service_pb2.TopDocsCollectorOutput]):
        self.top_docs_heap = []
        for top_docs_collector in top_docs_collectors:
            top_docs_iterator = TopDocsIterator(top_docs_collector)
            if top_docs_iterator.has_any():
                self.top_docs_heap.append(top_docs_iterator)
        heapq.heapify(self.top_docs_heap)

    def merge(self) -> search_service_pb2.CollectorOutput:
        scored_documents = []
        has_next = any([top_docs_iterator for top_docs_iterator in self.top_docs_heap])

        position = 0
        while self.top_docs_heap:
            largest_top_docs_iterator = heapq.heappop(self.top_docs_heap)
            largest_item = next(largest_top_docs_iterator)
            largest_item.position = position
            scored_documents.append(largest_item)
            position += 1
            if largest_top_docs_iterator.has_any():
                heapq.heappush(self.top_docs_heap, largest_top_docs_iterator)

        return search_service_pb2.CollectorOutput(
            top_docs=search_service_pb2.TopDocsCollectorOutput(
                has_next=has_next,
                scored_documents=scored_documents,
            )
        )
- [nexus] Update schema - [nexus] Remove outdated protos - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Development - [nexus] Refactor views - [nexus] Update aiosumma - [nexus] Add tags - [nexus] Development - [nexus] Update repository - [nexus] Update repository - [nexus] Update dependencies - [nexus] Update dependencies - [nexus] Fixes for MetaAPI - [nexus] Support for new queries - [nexus] Adopt new versions of search - [nexus] Improving Nexus - [nexus] Various fixes - [nexus] Add profile - [nexus] Fixes for ingestion - [nexus] Refactorings and bugfixes - [idm] Add profile methods - [nexus] Fix stalled nexus-meta bugs - [nexus] Various bugfixes - [nexus] Restore IDM API functionality GitOrigin-RevId: a0842345a6dde5b321279ab5510a50c0def0e71a 2022-09-02 18:44:56 +03:00			`import heapq`
			`from typing import List`

			`from summa.proto import search_service_pb2`


			`class TopDocsIterator:`
			`def __init__(self, top_docs_collector: search_service_pb2.TopDocsCollectorOutput):`
			`self.top_docs_collector = top_docs_collector`
			`self._current = 0`

			`def __lt__(self, other: 'TopDocsIterator'):`
			`self_score = self.current().score`
			`other_score = other.current().score`
			`self_score = getattr(self_score, self_score.WhichOneof('score'))`
			`other_score = getattr(other_score, other_score.WhichOneof('score'))`
			`return self_score > other_score`

			`def __iter__(self):`
			`return self`

			`def __next__(self) -> search_service_pb2.ScoredDocument:`
			`if self.has_any():`
			`item = self.current()`
			`self._current += 1`
			`return item`
			`raise StopIteration`

			`def current(self):`
			`return self.top_docs_collector.scored_documents[self._current]`

			`def has_next(self) -> bool:`
			`return self.top_docs_collector.has_next`

			`def has_any(self) -> bool:`
			`return self._current < len(self.top_docs_collector.scored_documents)`


			`class TopDocsMerger:`
			`def __init__(self, top_docs_collectors: List[search_service_pb2.TopDocsCollectorOutput]):`
			`self.top_docs_heap = []`
			`for top_docs_collector in top_docs_collectors:`
			`top_docs_iterator = TopDocsIterator(top_docs_collector)`
			`if top_docs_iterator.has_any():`
			`self.top_docs_heap.append(top_docs_iterator)`
			`heapq.heapify(self.top_docs_heap)`

			`def merge(self) -> search_service_pb2.CollectorOutput:`
			`scored_documents = []`
			`has_next = any([top_docs_iterator for top_docs_iterator in self.top_docs_heap])`

			`position = 0`
			`while self.top_docs_heap:`
			`largest_top_docs_iterator = heapq.heappop(self.top_docs_heap)`
			`largest_item = next(largest_top_docs_iterator)`
			`largest_item.position = position`
			`scored_documents.append(largest_item)`
			`position += 1`
			`if largest_top_docs_iterator.has_any():`
			`heapq.heappush(self.top_docs_heap, largest_top_docs_iterator)`

			`return search_service_pb2.CollectorOutput(`
			`top_docs=search_service_pb2.TopDocsCollectorOutput(`
			`has_next=has_next,`
			`scored_documents=scored_documents,`
			`)`
			`)`