Reimplement MultiMoreLikeThis using BigCompositeReader
parent 6b8c1025d1
commit dc14192dca

it/cavallium/dbengine/lucene/ArrayIndexComparator.java (new file)
@@ -0,0 +1,25 @@
+package it.cavallium.dbengine.lucene;
+
+import java.util.Comparator;
+import org.apache.lucene.index.IndexReader;
+
+public class ArrayIndexComparator implements Comparator<IndexReader> {
+
+  private final Comparator<Object> comp;
+
+  public ArrayIndexComparator(IndexReader[] indexReaders) {
+    this.comp = Comparator.comparingInt(reader -> {
+      for (int i = 0; i < indexReaders.length; i++) {
+        if (indexReaders[i] == reader) {
+          return i;
+        }
+      }
+      throw new IllegalStateException();
+    });
+  }
+
+  @Override
+  public int compare(IndexReader o1, IndexReader o2) {
+    return comp.compare(o1, o2);
+  }
+}
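
A minimal usage sketch (not part of the commit; r1 and r2 stand for two already-open readers): the comparator orders readers by their position in the array given to its constructor, so sorting with it restores the caller's original shard order.

    IndexReader[] original = {r1, r2};
    IndexReader[] shuffled = {r2, r1};
    Arrays.sort(shuffled, new ArrayIndexComparator(original));
    // shuffled is {r1, r2} again: each reader compares by its index in `original`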

it/cavallium/dbengine/lucene/LuceneUtils.java
@@ -20,6 +20,7 @@ import it.cavallium.dbengine.lucene.analyzer.NCharGramEdgeAnalyzer;
 import it.cavallium.dbengine.lucene.analyzer.TextFieldsAnalyzer;
 import it.cavallium.dbengine.lucene.analyzer.TextFieldsSimilarity;
 import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer;
+import it.cavallium.dbengine.lucene.mlt.BigCompositeReader;
 import it.cavallium.dbengine.lucene.mlt.MultiMoreLikeThis;
 import it.cavallium.dbengine.lucene.searcher.ExponentialPageLimits;
 import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;

@@ -50,6 +51,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BooleanQuery.Builder;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.IndexSearcher;

@@ -557,13 +559,13 @@ public class LuceneUtils {
     }
     MultiMoreLikeThis mlt;
     if (indexSearchers.size() == 1) {
-      mlt = new MultiMoreLikeThis(indexSearchers.get(0).getIndexReader(), null);
+      mlt = new MultiMoreLikeThis(new BigCompositeReader<>(indexSearchers.get(0).getIndexReader(), IndexReader[]::new), null);
     } else {
       IndexReader[] indexReaders = new IndexReader[indexSearchers.size()];
       for (int i = 0, size = indexSearchers.size(); i < size; i++) {
         indexReaders[i] = indexSearchers.get(i).getIndexReader();
       }
-      mlt = new MultiMoreLikeThis(indexReaders, null);
+      mlt = new MultiMoreLikeThis(new BigCompositeReader<>(indexReaders, new ArrayIndexComparator(indexReaders)), null);
     }
     mlt.setAnalyzer(analyzer);
     mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));

@@ -583,7 +585,7 @@ public class LuceneUtils {
     var mltQuery = mlt.like((Map) mltDocumentFields);
     Query luceneQuery;
     if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
-      luceneQuery = new BooleanQuery.Builder()
+      luceneQuery = new Builder()
         .add(mltQuery, Occur.MUST)
         .add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
         .build();
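
For orientation, a sketch (not from the diff) of the two construction paths the hunks above produce; the MoreLikeThis query is now always built against a single BigCompositeReader, whether it wraps one shard or many.

    // one searcher: wrap its reader alone, via the IntFunction constructor
    var single = new BigCompositeReader<>(indexSearchers.get(0).getIndexReader(), IndexReader[]::new);
    // several searchers: wrap all readers, keeping their array order stable
    var multi = new BigCompositeReader<>(indexReaders, new ArrayIndexComparator(indexReaders));
    var mlt = new MultiMoreLikeThis(multi, null); // null similarity, as in the hunk above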

it/cavallium/dbengine/lucene/mlt/BigCompositeReader.java (new file)
@@ -0,0 +1,195 @@
+package it.cavallium.dbengine.lucene.mlt;
+
+import java.io.IOException;
+import java.math.BigInteger;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.IntFunction;
+import java.util.function.ToIntFunction;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DocumentStoredFieldVisitor;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexReaderContext;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.AlreadyClosedException;
+
+public class BigCompositeReader<R extends IndexReader> {
+
+  private static final long ACTUAL_MAX_DOCS = Long.MAX_VALUE - 10;
+  private final R[] subReaders;
+  protected final Comparator<R> subReadersSorter;
+  private final long[] starts;
+  private final long maxDoc;
+  private final AtomicLong numDocs = new AtomicLong(-1);
+  private final List<R> subReadersList;
+
+  public BigCompositeReader(R subReader, IntFunction<R[]> arrayInstantiator) {
+    this(toArray(subReader, arrayInstantiator), null);
+  }
+
+  private static <R extends IndexReader> R[] toArray(R subReader, IntFunction<R[]> arrayInstantiator) {
+    var arr = arrayInstantiator.apply(1);
+    arr[0] = subReader;
+    return arr;
+  }
+
+  public BigCompositeReader(R[] subReaders, Comparator<R> subReadersSorter) {
+    if (subReadersSorter != null) {
+      Arrays.sort(subReaders, subReadersSorter);
+    }
+    this.subReaders = subReaders;
+    this.subReadersSorter = subReadersSorter;
+    this.subReadersList = Collections.unmodifiableList(Arrays.asList(subReaders));
+    this.starts = new long[subReaders.length + 1];
+    BigInteger maxDoc = BigInteger.ZERO;
+    for (int i = 0; i < subReaders.length; i++) {
+      this.starts[i] = maxDoc.longValue();
+      IndexReader r = subReaders[i];
+      maxDoc = maxDoc.add(BigInteger.valueOf(r.maxDoc()));
+    }
+    if (maxDoc.compareTo(BigInteger.ZERO) < 0 || maxDoc.compareTo(BigInteger.valueOf(ACTUAL_MAX_DOCS)) > 0) {
+      throw new IllegalArgumentException("Too many documents: composite IndexReaders cannot exceed "
+          + ACTUAL_MAX_DOCS + " but readers have total maxDoc=" + maxDoc);
+    } else {
+      this.maxDoc = maxDoc.longValueExact();
+      this.starts[subReaders.length] = this.maxDoc;
+    }
+  }
+
+  public static <T extends IndexReader> Collection<String> getIndexedFields(BigCompositeReader<T> readers) {
+    return readers.subReadersList
+        .stream()
+        .map(IndexReader::getContext)
+        .flatMap(l -> l.leaves().stream())
+        .flatMap(l -> StreamSupport
+            .stream(l.reader().getFieldInfos().spliterator(), false)
+            .filter(fi -> fi.getIndexOptions() != IndexOptions.NONE))
+        .map(fi -> fi.name)
+        .collect(Collectors.toSet());
+  }
+
+  protected final List<? extends R> getSequentialSubReaders() {
+    return this.subReadersList;
+  }
+
+  private void ensureOpen() {
+    for (R subReader : subReaders) {
+      if (subReader.getRefCount() <= 0) {
+        throw new AlreadyClosedException("this IndexReader is closed");
+      }
+    }
+  }
+
+  public long getDocCount(String field) throws IOException {
+    this.ensureOpen();
+    long total = 0;
+    for (R reader : this.subReaders) {
+      int sub = reader.getDocCount(field);
+      assert sub >= 0;
+      assert sub <= reader.maxDoc();
+      total += sub;
+    }
+    return total;
+  }
+
+  public long docFreq(Term term) throws IOException {
+    this.ensureOpen();
+    long total = 0;
+    for (int i = 0; i < this.subReaders.length; i++) {
+      int sub = this.subReaders[i].docFreq(term);
+      assert sub >= 0;
+      assert sub <= this.subReaders[i].getDocCount(term.field());
+      total += sub;
+    }
+    return total;
+  }
+
+  public long numDocs() {
+    long numDocs = this.numDocs.getOpaque();
+    if (numDocs == -1L) {
+      numDocs = 0L;
+      for (IndexReader r : this.subReaders) {
+        numDocs += r.numDocs();
+      }
+      assert numDocs >= 0L;
+      this.numDocs.set(numDocs);
+    }
+    return numDocs;
+  }
+
+  public Fields getTermVectors(long docID) throws IOException {
+    this.ensureOpen();
+    int i = this.readerIndex(docID);
+    return this.subReaders[i].getTermVectors(Math.toIntExact(docID - this.starts[i]));
+  }
+
+  protected final int readerIndex(long docID) {
+    if (docID >= 0 && docID < this.maxDoc) {
+      return subIndex(docID, this.starts);
+    } else {
+      throw new IllegalArgumentException("docID must be >= 0 and < maxDoc=" + this.maxDoc + " (got docID=" + docID + ")");
+    }
+  }
+
+  public static int subIndex(long n, long[] docStarts) {
+    int size = docStarts.length;
+    int lo = 0;
+    int hi = size - 1;
+    while (hi >= lo) {
+      int mid = (lo + hi) >>> 1;
+      long midValue = docStarts[mid];
+      if (n < midValue) {
+        hi = mid - 1;
+      } else if (n > midValue) {
+        lo = mid + 1;
+      } else {
+        while (mid + 1 < size && docStarts[mid + 1] == midValue) {
+          mid++;
+        }
+        return mid;
+      }
+    }
+    return hi;
+  }
+
+  protected final long readerBase(int readerIndex) {
+    if (readerIndex >= 0 && readerIndex < this.subReaders.length) {
+      return this.starts[readerIndex];
+    } else {
+      throw new IllegalArgumentException("readerIndex must be >= 0 and < getSequentialSubReaders().size()");
+    }
+  }
+
+  public final void document(long docID, StoredFieldVisitor visitor) throws IOException {
+    this.ensureOpen();
+    int i = this.readerIndex(docID);
+    this.subReaders[i].document(Math.toIntExact(docID - this.starts[i]), visitor);
+  }
+
+  public final Document document(long docID) throws IOException {
+    DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
+    this.document(docID, visitor);
+    return visitor.getDocument();
+  }
+
+  public final Document document(long docID, Set<String> fieldsToLoad) throws IOException {
+    DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor(fieldsToLoad);
+    this.document(docID, visitor);
+    return visitor.getDocument();
+  }
+
+  public long maxDoc() {
+    return this.maxDoc;
+  }
+}
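
A worked example of the doc-ID arithmetic above (numbers are illustrative, not from the commit): with two sub-readers whose maxDoc() are 5 and 7, the constructor computes starts = {0, 5, 12} and maxDoc = 12; subIndex binary-searches for the greatest start not exceeding the global docID, and the per-reader document number is docID - starts[i].

    long[] starts = {0L, 5L, 12L};
    // global docIDs 0..4 live in reader 0, 5..11 in reader 1
    assert BigCompositeReader.subIndex(4L, starts) == 0;
    assert BigCompositeReader.subIndex(5L, starts) == 1;
    assert BigCompositeReader.subIndex(11L, starts) == 1; // local doc = 11 - starts[1] = 6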

it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java
@@ -29,9 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;

@@ -52,7 +50,7 @@ import org.apache.lucene.util.PriorityQueue;
 * Generate "more like this" similarity queries. Based on this mail:
 *
 * <pre><code>
- * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
+ * Lucene does let you access the document frequency of terms, with BigCompositeReader.docFreq().
 * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
 * is usually fast enough. But looking up the docFreq() of every term in the document is
 * probably too slow.

@@ -86,7 +84,7 @@ import org.apache.lucene.util.PriorityQueue;
 * usage is as follows. The bold fragment is specific to this class. <br>
 *
 * <pre class="prettyprint">
- * IndexReader ir = ...
+ * BigCompositeReader ir = ...
 * IndexSearcher is = ...
 *
 * MoreLikeThis mlt = new MoreLikeThis(ir);

@@ -137,7 +135,7 @@ import org.apache.lucene.util.PriorityQueue;
 * <pre>
 * Changes: Mark Harwood 29/02/04
 * Some bugfixing, some refactoring, some optimisation.
- * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
+ * - bugfix: retrieveTerms(long docNum) was not working for indexes without a termvector -added missing code
 * - bugfix: No significant terms being created for fields with a termvector - because
 * was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
 * - refactor: moved common code into isNoiseWord()

@@ -153,7 +151,7 @@ public final class MultiMoreLikeThis {
   *
   * @see #getMaxNumTokensParsed
   */
-  public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+  public static final long DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
 
  /**
   * Ignore terms with less than this frequency in the source doc.

@@ -161,7 +159,7 @@ public final class MultiMoreLikeThis {
   * @see #getMinTermFreq
   * @see #setMinTermFreq
   */
-  public static final int DEFAULT_MIN_TERM_FREQ = 2;
+  public static final long DEFAULT_MIN_TERM_FREQ = 2;
 
  /**
   * Ignore words which do not occur in at least this many docs.

@@ -178,7 +176,7 @@ public final class MultiMoreLikeThis {
   * @see #setMaxDocFreq
   * @see #setMaxDocFreqPct
   */
-  public static final long DEFAULT_MAX_DOC_FREQ = Long.MAX_VALUE;
+  public static final long DEFAULT_MAX_DOC_FREQ = java.lang.Long.MAX_VALUE;
 
  /**
   * Boost terms in query based on score.

@@ -200,7 +198,7 @@ public final class MultiMoreLikeThis {
   * @see #getMinWordLen
   * @see #setMinWordLen
   */
-  public static final int DEFAULT_MIN_WORD_LENGTH = 0;
+  public static final long DEFAULT_MIN_WORD_LENGTH = 0;
 
  /**
   * Ignore words greater than this length or if 0 then this has no effect.

@@ -208,7 +206,7 @@ public final class MultiMoreLikeThis {
   * @see #getMaxWordLen
   * @see #setMaxWordLen
   */
-  public static final int DEFAULT_MAX_WORD_LENGTH = 0;
+  public static final long DEFAULT_MAX_WORD_LENGTH = 0;
 
  /**
   * Default set of stopwords. If null means to allow stop words.

@@ -228,13 +226,13 @@ public final class MultiMoreLikeThis {
   * @see #getMaxQueryTerms
   * @see #setMaxQueryTerms
   */
-  public static final int DEFAULT_MAX_QUERY_TERMS = 25;
+  public static final long DEFAULT_MAX_QUERY_TERMS = 25;
 
  /** Analyzer that will be used to parse the doc. */
  private Analyzer analyzer = null;
 
  /** Ignore words less frequent that this. */
-  private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+  private long minTermFreq = DEFAULT_MIN_TERM_FREQ;
 
  /** Ignore words which do not occur in at least this many docs. */
  private long minDocFreq = DEFAULT_MIN_DOC_FREQ;

@@ -252,27 +250,22 @@ public final class MultiMoreLikeThis {
   * The maximum number of tokens to parse in each example doc field that is not stored with
   * TermVector support
   */
-  private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+  private long maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
 
  /** Ignore words if less than this len. */
-  private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+  private long minWordLen = DEFAULT_MIN_WORD_LENGTH;
 
  /** Ignore words if greater than this len. */
-  private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+  private long maxWordLen = DEFAULT_MAX_WORD_LENGTH;
 
  /** Don't return a query longer than this. */
-  private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+  private long maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
 
  /** For idf() calculations. */
  private TFIDFSimilarity similarity; // = new DefaultSimilarity();
 
-  /** IndexReader to use */
-  private final IndexReader ir;
+  /** BigCompositeReader to use */
+  private final BigCompositeReader<?> ir;
 
-  /**
-   * IndexReader array to use when multi-searchers are used.
-   */
-  private final IndexReader[] irArray;
-
  /** Boost factor to use when boosting the terms */
  private float boostFactor = 1;

@@ -296,28 +289,12 @@ public final class MultiMoreLikeThis {
    this.boostFactor = boostFactor;
  }
 
-  /** Constructor requiring an IndexReader. */
-  public MultiMoreLikeThis(IndexReader ir) {
+  /** Constructor requiring a BigCompositeReader. */
+  public MultiMoreLikeThis(BigCompositeReader<?> ir) {
    this(ir, new ClassicSimilarity());
  }
 
-  public MultiMoreLikeThis(IndexReader ir, TFIDFSimilarity sim) {
-    this(ir, null, sim);
-  }
-
-  public MultiMoreLikeThis(IndexReader[] irArray) {
-    this(irArray, new ClassicSimilarity());
-  }
-
-  public MultiMoreLikeThis(IndexReader[] irArray, TFIDFSimilarity sim) {
-    this(null, irArray, sim);
-  }
-
-  private MultiMoreLikeThis(IndexReader ir, IndexReader[] irArray, TFIDFSimilarity sim) {
-    if ((ir == null) == (irArray == null)) {
-      throw new IllegalArgumentException();
-    }
-    this.irArray = irArray;
+  public MultiMoreLikeThis(BigCompositeReader<?> ir, TFIDFSimilarity sim) {
    this.ir = ir;
    this.similarity = sim;
  }

@@ -342,7 +319,7 @@ public final class MultiMoreLikeThis {
 
  /**
   * Sets the analyzer to use. An analyzer is not required for generating a query with the {@link
-   * #like(int)} method, all other 'like' methods require an analyzer.
+   * #like(long)} method, all other 'like' methods require an analyzer.
   *
   * @param analyzer the analyzer to use to tokenize text.
   */

@@ -356,7 +333,7 @@ public final class MultiMoreLikeThis {
   *
   * @return the frequency below which terms will be ignored in the source doc.
   */
-  public int getMinTermFreq() {
+  public long getMinTermFreq() {
    return minTermFreq;
  }
 

@@ -365,7 +342,7 @@ public final class MultiMoreLikeThis {
   *
   * @param minTermFreq the frequency below which terms will be ignored in the source doc.
   */
-  public void setMinTermFreq(int minTermFreq) {
+  public void setMinTermFreq(long minTermFreq) {
    this.minTermFreq = minTermFreq;
  }
 

@@ -423,17 +400,8 @@ public final class MultiMoreLikeThis {
   * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear in to
   *     be still considered relevant.
   */
-  public void setMaxDocFreqPct(int maxPercentage) {
-    long maxDoc;
-    if (irArray == null) {
-      maxDoc = ir.maxDoc();
-    } else {
-      maxDoc = 0L;
-      for (IndexReader ir : irArray) {
-        maxDoc += ir.maxDoc();
-      }
-    }
-    setMaxDocFreq(Math.toIntExact((long) maxPercentage * maxDoc / 100L));
+  public void setMaxDocFreqPct(long maxPercentage) {
+    setMaxDocFreq((maxPercentage) * ir.maxDoc() / 100L);
  }
 
  /**
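
Worked arithmetic for the rewritten setter (illustrative numbers): with a composite maxDoc() of 1,000,000 and maxPercentage = 75, the call becomes setMaxDocFreq(75 * 1,000,000 / 100) = setMaxDocFreq(750,000); the computation now stays in long instead of narrowing through Math.toIntExact.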

@@ -469,7 +437,7 @@ public final class MultiMoreLikeThis {
 
  /**
   * Sets the field names that will be used when generating the 'More Like This' query. Set this to
-   * null for the field names to be determined at runtime from the IndexReader provided in the
+   * null for the field names to be determined at runtime from the BigCompositeReader provided in the
   * constructor.
   *
   * @param fieldNames the field names that will be used when generating the 'More Like This' query.

@@ -484,7 +452,7 @@ public final class MultiMoreLikeThis {
   *
   * @return the minimum word length below which words will be ignored.
   */
-  public int getMinWordLen() {
+  public long getMinWordLen() {
    return minWordLen;
  }
 

@@ -493,7 +461,7 @@ public final class MultiMoreLikeThis {
   *
   * @param minWordLen the minimum word length below which words will be ignored.
   */
-  public void setMinWordLen(int minWordLen) {
+  public void setMinWordLen(long minWordLen) {
    this.minWordLen = minWordLen;
  }
 

@@ -503,7 +471,7 @@ public final class MultiMoreLikeThis {
   *
   * @return the maximum word length above which words will be ignored.
   */
-  public int getMaxWordLen() {
+  public long getMaxWordLen() {
    return maxWordLen;
  }
 

@@ -512,7 +480,7 @@ public final class MultiMoreLikeThis {
   *
   * @param maxWordLen the maximum word length above which words will be ignored.
   */
-  public void setMaxWordLen(int maxWordLen) {
+  public void setMaxWordLen(long maxWordLen) {
    this.maxWordLen = maxWordLen;
  }
 

@@ -544,7 +512,7 @@ public final class MultiMoreLikeThis {
   *
   * @return the maximum number of query terms that will be included in any generated query.
   */
-  public int getMaxQueryTerms() {
+  public long getMaxQueryTerms() {
    return maxQueryTerms;
  }
 

@@ -554,7 +522,7 @@ public final class MultiMoreLikeThis {
   * @param maxQueryTerms the maximum number of query terms that will be included in any generated
   *     query.
   */
-  public void setMaxQueryTerms(int maxQueryTerms) {
+  public void setMaxQueryTerms(long maxQueryTerms) {
    this.maxQueryTerms = maxQueryTerms;
  }
 

@@ -563,7 +531,7 @@ public final class MultiMoreLikeThis {
   *     TermVector support
   * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
   */
-  public int getMaxNumTokensParsed() {
+  public long getMaxNumTokensParsed() {
    return maxNumTokensParsed;
  }
 

@@ -571,7 +539,7 @@ public final class MultiMoreLikeThis {
   * @param i The maximum number of tokens to parse in each example doc field that is not stored
   *     with TermVector support
   */
-  public void setMaxNumTokensParsed(int i) {
+  public void setMaxNumTokensParsed(long i) {
    maxNumTokensParsed = i;
  }
 

@@ -581,18 +549,11 @@ public final class MultiMoreLikeThis {
   * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for.
   * @return a query that will return docs like the passed lucene document ID.
   */
-  public Query like(int docNum) throws IOException {
+  public Query like(long docNum) throws IOException {
    if (fieldNames == null) {
      // gather list of valid fields from lucene
      Collection<String> fields;
-      if (irArray == null) {
-        fields = FieldInfos.getIndexedFields(ir);
-      } else {
-        fields = new ArrayList<>();
-        for (IndexReader ir : irArray) {
-          fields.addAll(FieldInfos.getIndexedFields(ir));
-        }
-      }
+      fields = BigCompositeReader.getIndexedFields(ir);
      fieldNames = fields.toArray(String[]::new);
    }
 

@@ -606,15 +567,7 @@ public final class MultiMoreLikeThis {
  public Query like(Map<String, Collection<Object>> filteredDocument) throws IOException {
    if (fieldNames == null) {
      // gather list of valid fields from lucene
-      Collection<String> fields;
-      if (irArray == null) {
-        fields = FieldInfos.getIndexedFields(ir);
-      } else {
-        fields = new ArrayList<>();
-        for (IndexReader ir : irArray) {
-          fields.addAll(FieldInfos.getIndexedFields(ir));
-        }
-      }
+      Collection<String> fields = BigCompositeReader.getIndexedFields(ir);
      fieldNames = fields.toArray(String[]::new);
    }
    return createQuery(retrieveTerms(filteredDocument));

@@ -627,7 +580,7 @@ public final class MultiMoreLikeThis {
   * @return a query that will return docs like the passed Readers.
   */
  public Query like(String fieldName, Reader... readers) throws IOException {
-    Map<String, Map<String, Int>> perFieldTermFrequencies = new HashMap<>();
+    Map<String, Map<String, Long>> perFieldTermFrequencies = new HashMap<>();
    for (Reader r : readers) {
      addTermFrequencies(r, perFieldTermFrequencies, fieldName);
    }

@@ -669,48 +622,28 @@ public final class MultiMoreLikeThis {
   *     objects as the values.
   */
  private PriorityQueue<ScoreTerm> createQueue(
-      Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
+      Map<String, Map<String, Long>> perFieldTermFrequencies) throws IOException {
    // have collected all words in doc and their freqs
-    final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
-    FreqQ queue = new FreqQ(limit); // will order words by score
-    for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
-      Map<String, Int> perWordTermFrequencies = entry.getValue();
+    final long limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
+    FreqQ queue = new FreqQ(Math.toIntExact(limit)); // will order words by score
+    for (Map.Entry<String, Map<String, Long>> entry : perFieldTermFrequencies.entrySet()) {
+      Map<String, Long> perWordTermFrequencies = entry.getValue();
      String fieldName = entry.getKey();
 
-      long numDocs;
-      if (irArray == null) {
-        numDocs = ir.getDocCount(fieldName);
-        if (numDocs == -1) {
-          numDocs = ir.numDocs();
-        }
-      } else {
-        numDocs = 0L;
-        for (IndexReader ir : irArray) {
-          long localNumDocs = ir.getDocCount(fieldName);
-          if (localNumDocs == -1) {
-            localNumDocs = ir.numDocs();
-          }
-          numDocs += localNumDocs;
-        }
+      long numDocs = ir.getDocCount(fieldName);
+      if (numDocs == -1) {
+        numDocs = ir.numDocs();
      }
 
-      for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
+      for (Map.Entry<String, Long> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
        String word = tfEntry.getKey();
-        int tf = tfEntry.getValue().x; // term freq in the source doc
+        long tf = tfEntry.getValue().x; // term freq in the source doc
        if (minTermFreq > 0 && tf < minTermFreq) {
          continue; // filter out words that don't occur enough times in the source
        }
 
-        long docFreq;
        var fieldTerm = new Term(fieldName, word);
-        if (irArray == null) {
-          docFreq = ir.docFreq(fieldTerm);
-        } else {
-          docFreq = 0;
-          for (IndexReader ir : irArray) {
-            docFreq += ir.docFreq(fieldTerm);
-          }
-        }
+        long docFreq = ir.docFreq(fieldTerm);
 
        if (minDocFreq > 0L && docFreq < minDocFreq) {
          continue; // filter out words that don't occur in enough docs

@@ -743,10 +676,10 @@ public final class MultiMoreLikeThis {
    return queue;
  }
 
-  private int getTermsCount(Map<String, Map<String, Int>> perFieldTermFrequencies) {
-    int totalTermsCount = 0;
-    Collection<Map<String, Int>> values = perFieldTermFrequencies.values();
-    for (Map<String, Int> perWordTermFrequencies : values) {
+  private long getTermsCount(Map<String, Map<String, Long>> perFieldTermFrequencies) {
+    long totalTermsCount = 0;
+    Collection<Map<String, Long>> values = perFieldTermFrequencies.values();
+    for (Map<String, Long> perWordTermFrequencies : values) {
      totalTermsCount += perWordTermFrequencies.size();
    }
    return totalTermsCount;

@@ -776,20 +709,14 @@ public final class MultiMoreLikeThis {
   *
   * @param docNum the id of the lucene document from which to find terms
   */
-  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
-    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
-    if (irArray == null) {
-      retrieveTermsOfIndexReader(ir, docNum, field2termFreqMap);
-    } else {
-      for (IndexReader ir : irArray) {
-        retrieveTermsOfIndexReader(ir, docNum, field2termFreqMap);
-      }
-    }
+  private PriorityQueue<ScoreTerm> retrieveTerms(long docNum) throws IOException {
+    Map<String, Map<String, Long>> field2termFreqMap = new HashMap<>();
+    retrieveTermsOfIndexReader(ir, docNum, field2termFreqMap);
 
    return createQueue(field2termFreqMap);
  }
 
-  private void retrieveTermsOfIndexReader(IndexReader ir, int docNum, Map<String, Map<String, Int>> field2termFreqMap)
+  private void retrieveTermsOfIndexReader(BigCompositeReader<?> ir, long docNum, Map<String, Map<String, Long>> field2termFreqMap)
      throws IOException {
    for (String fieldName : fieldNames) {
      final Fields vectors = ir.getTermVectors(docNum);

@@ -818,7 +745,7 @@ public final class MultiMoreLikeThis {
 
  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues)
      throws IOException {
-    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    Map<String, Map<String, Long>> field2termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
      Collection<Object> fieldValues = field2fieldValues.get(fieldName);
      if (fieldValues == null) {

@@ -840,9 +767,9 @@ public final class MultiMoreLikeThis {
   * @param vector List of terms and their frequencies for a doc/field
   */
  private void addTermFrequencies(
-      Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName)
+      Map<String, Map<String, Long>> field2termFreqMap, Terms vector, String fieldName)
      throws IOException {
-    Map<String, Int> termFreqMap =
+    Map<String, Long> termFreqMap =
        field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();

@@ -853,12 +780,12 @@ public final class MultiMoreLikeThis {
      if (isNoiseWord(term)) {
        continue;
      }
-      final int freq = (int) termsEnum.totalTermFreq();
+      final long freq = termsEnum.totalTermFreq();
 
      // increment frequency
-      Int cnt = termFreqMap.get(term);
+      Long cnt = termFreqMap.get(term);
      if (cnt == null) {
-        cnt = new Int();
+        cnt = new Long();
        termFreqMap.put(term, cnt);
        cnt.x = freq;
      } else {

@@ -875,16 +802,16 @@ public final class MultiMoreLikeThis {
   * @param fieldName Used by analyzer for any special per-field analysis
   */
  private void addTermFrequencies(
-      Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
+      Reader r, Map<String, Map<String, Long>> perFieldTermFrequencies, String fieldName)
      throws IOException {
    if (analyzer == null) {
      throw new UnsupportedOperationException(
          "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
-    Map<String, Int> termFreqMap =
+    Map<String, Long> termFreqMap =
        perFieldTermFrequencies.computeIfAbsent(fieldName, k -> new HashMap<>());
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
-      int tokenCount = 0;
+      long tokenCount = 0;
      // for every token
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      TermFrequencyAttribute tfAtt = ts.addAttribute(TermFrequencyAttribute.class);

@@ -900,9 +827,9 @@ public final class MultiMoreLikeThis {
      }
 
      // increment frequency
-      Int cnt = termFreqMap.get(word);
+      Long cnt = termFreqMap.get(word);
      if (cnt == null) {
-        termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
+        termFreqMap.put(word, new Long(tfAtt.getTermFrequency()));
      } else {
        cnt.x += tfAtt.getTermFrequency();
      }

@@ -918,7 +845,7 @@ public final class MultiMoreLikeThis {
   * @return true if should be ignored, false if should be used in further analysis
   */
  private boolean isNoiseWord(String term) {
-    int len = term.length();
+    long len = term.length();
    if (minWordLen > 0 && len < minWordLen) {
      return true;
    }

@@ -953,19 +880,19 @@ public final class MultiMoreLikeThis {
   * @see #retrieveInterestingTerms
   */
  private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
-    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    Map<String, Map<String, Long>> field2termFreqMap = new HashMap<>();
    addTermFrequencies(r, field2termFreqMap, fieldName);
    return createQueue(field2termFreqMap);
  }
 
  /** @see #retrieveInterestingTerms(java.io.Reader, String) */
-  public String[] retrieveInterestingTerms(int docNum) throws IOException {
-    ArrayList<String> al = new ArrayList<>(maxQueryTerms);
+  public String[] retrieveInterestingTerms(long docNum) throws IOException {
+    ArrayList<String> al = new ArrayList<>(Math.toIntExact(maxQueryTerms));
    PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
    ScoreTerm scoreTerm;
    // have to be careful, retrieveTerms returns all words but that's probably not useful to our
    // caller...
-    int lim = maxQueryTerms;
+    long lim = maxQueryTerms;
    // we just want to return the top words
    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
      al.add(scoreTerm.word); // the 1st entry is the interesting word

@@ -985,12 +912,12 @@ public final class MultiMoreLikeThis {
   * @see #setMaxQueryTerms
   */
  public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
-    ArrayList<String> al = new ArrayList<>(maxQueryTerms);
+    ArrayList<String> al = new ArrayList<>(Math.toIntExact(maxQueryTerms));
    PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
    ScoreTerm scoreTerm;
    // have to be careful, retrieveTerms returns all words but that's probably not useful to our
    // caller...
-    int lim = maxQueryTerms;
+    long lim = maxQueryTerms;
    // we just want to return the top words
    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
      al.add(scoreTerm.word); // the 1st entry is the interesting word

@@ -1031,14 +958,14 @@ public final class MultiMoreLikeThis {
  }
 
  /** Use for frequencies and to avoid renewing Integers. */
-  private static class Int {
-    int x;
+  private static class Long {
+    long x;
 
-    Int() {
-      this(1);
+    Long() {
+      this(1L);
    }
 
-    Int(int initialValue) {
+    Long(long initialValue) {
      x = initialValue;
    }
  }
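
End-to-end sketch of the reworked API (a hypothetical caller; the analyzer and field names are placeholders, not from the commit):

    var composite = new BigCompositeReader<>(indexReaders, new ArrayIndexComparator(indexReaders));
    var mlt = new MultiMoreLikeThis(composite, new ClassicSimilarity());
    mlt.setAnalyzer(analyzer);
    mlt.setFieldNames(new String[] {"title", "body"}); // placeholder field names
    Query query = mlt.like(123L); // docNum is a long, addressing the composite doc-ID space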