From 5cfb5f49cd1339d5d1dc6ab63acdd7cd4105dde6 Mon Sep 17 00:00:00 2001
From: Andrea Cavalli
Date: Mon, 20 Sep 2021 00:22:22 +0200
Subject: [PATCH] Add custom MoreLikeThis with sharding support

---
 .../cavallium/dbengine/database/LLUtils.java  |   89 +-
 .../lucene/mlt/MultiMoreLikeThis.java         | 1045 +++++++++++++++++
 2 files changed, 1093 insertions(+), 41 deletions(-)
 create mode 100644 src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java

diff --git a/src/main/java/it/cavallium/dbengine/database/LLUtils.java b/src/main/java/it/cavallium/dbengine/database/LLUtils.java
index 200f4e4..1589748 100644
--- a/src/main/java/it/cavallium/dbengine/database/LLUtils.java
+++ b/src/main/java/it/cavallium/dbengine/database/LLUtils.java
@@ -10,13 +10,13 @@ import io.net5.buffer.api.Send;
 import io.net5.util.IllegalReferenceCountException;
 import io.net5.util.internal.PlatformDependent;
 import it.cavallium.dbengine.database.collections.DatabaseStage;
-import it.cavallium.dbengine.database.disk.LLIndexContext;
 import it.cavallium.dbengine.database.disk.LLIndexSearcher;
 import it.cavallium.dbengine.database.disk.LLLocalLuceneIndex;
 import it.cavallium.dbengine.database.disk.MemorySegmentUtils;
 import it.cavallium.dbengine.database.serialization.SerializationException;
 import it.cavallium.dbengine.database.serialization.SerializationFunction;
 import it.cavallium.dbengine.lucene.RandomSortField;
+import it.cavallium.dbengine.lucene.analyzer.WordAnalyzer;
 import it.cavallium.dbengine.lucene.searcher.LocalQueryParams;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
@@ -33,6 +33,7 @@ import java.util.concurrent.Callable;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;
 import java.util.function.ToIntFunction;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FloatPoint;
@@ -41,9 +42,10 @@ import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.queries.mlt.MoreLikeThis;
+import it.cavallium.dbengine.lucene.mlt.MultiMoreLikeThis;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
@@ -51,10 +53,11 @@ import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.SearcherManager;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortedNumericSortField;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.TFIDFSimilarity;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
@@ -397,9 +400,11 @@ public class LLUtils {
     }
 
     public static Mono<LocalQueryParams> getMoreLikeThisQuery(
-            LLIndexSearcher indexSearcher,
+            List<LLIndexSearcher> indexSearchers,
             @Nullable LLSnapshot snapshot,
             LocalQueryParams localQueryParams,
+            Analyzer analyzer,
+            Similarity similarity,
             Flux<Tuple2<String, Set<String>>> mltDocumentFieldsFlux) {
         Query luceneAdditionalQuery;
         try {
@@ -409,59 +414,61 @@ public class LLUtils {
         }
         return mltDocumentFieldsFlux
                 .collectMap(Tuple2::getT1, Tuple2::getT2, HashMap::new)
-                .flatMap(mltDocumentFields -> {
+                .flatMap(mltDocumentFields -> Mono.fromCallable(() -> {
                     mltDocumentFields.entrySet().removeIf(entry -> entry.getValue().isEmpty());
                     if (mltDocumentFields.isEmpty()) {
-                        return Mono.just(new LocalQueryParams(new MatchNoDocsQuery(),
+                        return new LocalQueryParams(new MatchNoDocsQuery(),
                                 localQueryParams.offset(),
                                 localQueryParams.limit(),
                                 localQueryParams.minCompetitiveScore(),
                                 localQueryParams.sort(),
                                 localQueryParams.scoreMode()
-                        ));
+                        );
                     }
-                    new IndexSearcher
-                    return indexSearcher.getIndexSearcher().search(snapshot, indexSearcher -> Mono.fromCallable(() -> {
-                        var mlt = new MoreLikeThis(indexSearcher.getIndexReader());
-                        mlt.setAnalyzer(llLocalLuceneIndex.indexWriter.getAnalyzer());
-                        mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
-                        mlt.setMinTermFreq(1);
-                        mlt.setMinDocFreq(3);
-                        mlt.setMaxDocFreqPct(20);
-                        mlt.setBoost(localQueryParams.scoreMode().needsScores());
-                        mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
-                        var similarity = llLocalLuceneIndex.getSimilarity();
-                        if (similarity instanceof TFIDFSimilarity) {
-                            mlt.setSimilarity((TFIDFSimilarity) similarity);
-                        } else {
-                            LLLocalLuceneIndex.logger.trace(MARKER_ROCKSDB, "Using an unsupported similarity algorithm for MoreLikeThis:"
-                                    + " {}. You must use a similarity instance based on TFIDFSimilarity!", similarity);
+                    MultiMoreLikeThis mlt;
+                    if (indexSearchers.size() == 1) {
+                        mlt = new MultiMoreLikeThis(indexSearchers.get(0).getIndexReader(), null);
+                    } else {
+                        IndexReader[] indexReaders = new IndexReader[indexSearchers.size()];
+                        for (int i = 0, size = indexSearchers.size(); i < size; i++) {
+                            indexReaders[i] = indexSearchers.get(i).getIndexReader();
                         }
+                        mlt = new MultiMoreLikeThis(indexReaders, null);
+                    }
+                    mlt.setAnalyzer(analyzer);
+                    mlt.setFieldNames(mltDocumentFields.keySet().toArray(String[]::new));
+                    mlt.setMinTermFreq(1);
+                    mlt.setMinDocFreq(3);
+                    mlt.setMaxDocFreqPct(20);
+                    mlt.setBoost(localQueryParams.scoreMode().needsScores());
+                    mlt.setStopWords(EnglishItalianStopFilter.getStopWordsString());
+                    if (similarity instanceof TFIDFSimilarity tfidfSimilarity) {
+                        mlt.setSimilarity(tfidfSimilarity);
+                    } else {
+                        mlt.setSimilarity(new ClassicSimilarity());
+                    }
 
-                    // Get the reference docId and apply it to MoreLikeThis, to generate the query
-                    @SuppressWarnings({"unchecked", "rawtypes"})
-                    var mltQuery = mlt.like((Map) mltDocumentFields);
-                    Query luceneQuery;
-                    if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
-                        luceneQuery = new BooleanQuery.Builder()
-                                .add(mltQuery, Occur.MUST)
-                                .add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
-                                .build();
-                    } else {
-                        luceneQuery = mltQuery;
-                    }
+                    // Get the reference docId and apply it to MoreLikeThis, to generate the query
+                    @SuppressWarnings({"unchecked", "rawtypes"})
+                    var mltQuery = mlt.like((Map) mltDocumentFields);
+                    Query luceneQuery;
+                    if (!(luceneAdditionalQuery instanceof MatchAllDocsQuery)) {
+                        luceneQuery = new BooleanQuery.Builder()
+                                .add(mltQuery, Occur.MUST)
+                                .add(new ConstantScoreQuery(luceneAdditionalQuery), Occur.MUST)
+                                .build();
+                    } else {
+                        luceneQuery = mltQuery;
+                    }
 
-                    return luceneQuery;
-                })
-                        .subscribeOn(Schedulers.boundedElastic())
-                        .map(luceneQuery -> new LocalQueryParams(luceneQuery,
+                    return new LocalQueryParams(luceneQuery,
                             localQueryParams.offset(),
                             localQueryParams.limit(),
                             localQueryParams.minCompetitiveScore(),
                             localQueryParams.sort(),
                             localQueryParams.scoreMode()
-                    )));
-                });
+                    );
+                }).subscribeOn(Schedulers.boundedElastic()));
     }
 
     public static record DirectBuffer(@NotNull Send<Buffer> buffer, @NotNull ByteBuffer byteBuffer) {}
diff --git a/src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java b/src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java
new file mode 100644
index 0000000..6892155
--- /dev/null
+++ b/src/main/java/it/cavallium/dbengine/lucene/mlt/MultiMoreLikeThis.java
@@ -0,0 +1,1045 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package it.cavallium.dbengine.lucene.mlt;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.similarities.ClassicSimilarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Generate "more like this" similarity queries. Based on this mail:
+ *
+ * <pre><code>
+ * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
+ * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
+ * is usually fast enough.  But looking up the docFreq() of every term in the document is
+ * probably too slow.
+ *
+ * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
+ * or at all.  Since you're trying to maximize a tf*idf score, you're probably most interested
+ * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
+ * reduce the number of terms under consideration.  Another heuristic is that terms with a
+ * high idf (i.e., a low df) tend to be longer.  So you could threshold the terms by the
+ * number of characters, not selecting anything less than, e.g., six or seven characters.
+ * With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
+ * that do a pretty good job of characterizing a document.
+ *
+ * It all depends on what you're trying to do.  If you're trying to eek out that last percent
+ * of precision and recall regardless of computational difficulty so that you can win a TREC
+ * competition, then the techniques I mention above are useless.  But if you're trying to
+ * provide a "more like this" button on a search results page that does a decent job and has
+ * good performance, such techniques might be useful.
+ *
+ * An efficient, effective "more-like-this" query generator would be a great contribution, if
+ * anyone's interested.  I'd imagine that it would take a Reader or a String (the document's
+ * text), analyzer Analyzer, and return a set of representative terms using heuristics like those
+ * above.  The frequency and length thresholds could be parameters, etc.
+ *
+ * Doug
+ * </code></pre>
+ *
+ * <h3>Initial Usage</h3>
+ *
+ * <p>This class has lots of options to try to make it efficient and flexible. The simplest possible
+ * usage is as follows. The bold fragment is specific to this class. <br>
+ *
+ * <pre class="prettyprint"><code>
+ * IndexReader ir = ...
+ * IndexSearcher is = ...
+ *
+ * <b>MultiMoreLikeThis mlt = new MultiMoreLikeThis(ir);</b>
+ * Reader target = ... // orig source of doc you want to find similarities to
+ * Query query = mlt.like( target);
+ *
+ * Hits hits = is.search(query);
+ * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
+ * // you ignore the doc if it matches your 'target' document, as it should be similar to itself
+ * </code></pre>
+ *
+ * <h3>Thus you:</h3>
+ *
+ * <ol>
+ *   <li>do your normal, Lucene setup for searching,
+ *   <li>create a MultiMoreLikeThis,
+ *   <li>get the text of the doc you want to find similarities to
+ *   <li>then call one of the like() calls to generate a similarity query
+ *   <li>call the searcher to find the similar docs
+ * </ol>
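+ *
+ * <p>When the index is split into shards, the flow is the same, but the readers of all shards are
+ * handed over together so that document and term frequencies are aggregated index-wide. A sketch
+ * (the variable names here are illustrative):
+ *
+ * <pre class="prettyprint"><code>
+ * IndexReader[] shardReaders = ... // one reader per shard
+ * MultiMoreLikeThis mlt = new MultiMoreLikeThis(shardReaders);
+ * Query query = mlt.like(docNum);
+ * </code></pre>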
+ *
+ * <br>
+ *
+ * <h3>More Advanced Usage</h3>
+ *
+ * <p>You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine multiple
+ * fields (e.g. body and title) for similarity.
+ *
+ * <p>Depending on the size of your index and the size and makeup of your documents you may want to
+ * call the other set methods to control how the similarity queries are generated, as in the sketch
+ * after this list:
+ *
+ * <ul>
+ *   <li>{@link #setMinTermFreq setMinTermFreq(...)}
+ *   <li>{@link #setMinDocFreq setMinDocFreq(...)}
+ *   <li>{@link #setMaxDocFreq setMaxDocFreq(...)}
+ *   <li>{@link #setMaxDocFreqPct setMaxDocFreqPct(...)}
+ *   <li>{@link #setMinWordLen setMinWordLen(...)}
+ *   <li>{@link #setMaxWordLen setMaxWordLen(...)}
+ *   <li>{@link #setMaxQueryTerms setMaxQueryTerms(...)}
+ *   <li>{@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
+ *   <li>{@link #setStopWords setStopWords(...)}
+ * </ul>
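+ *
+ * <p>For instance (the values here are illustrative, not recommendations):
+ *
+ * <pre class="prettyprint"><code>
+ * mlt.setMinTermFreq(1);    // keep terms that occur even once in the source doc
+ * mlt.setMinDocFreq(3);     // ...but only if at least 3 indexed docs contain them
+ * mlt.setMaxDocFreqPct(20); // drop terms present in over 20% of all docs (summed across shards)
+ * mlt.setMaxQueryTerms(25); // cap the size of the generated boolean query
+ * </code></pre>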
+ *
+ * + *
+ * <hr>
+ *
+ * <pre>
+ * Changes: Mark Harwood 29/02/04
+ * Some bugfixing, some refactoring, some optimisation.
+ * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector - added missing code
+ * - bugfix: No significant terms being created for fields with a termvector - because it
+ *   was only counting one occurrence per term/field pair in calculations (i.e. not including frequency info from TermVector)
+ * - refactor: moved common code into isNoiseWord()
+ * - optimise: when no termvector support available - used maxNumTokensParsed to limit amount of tokenization
+ * </pre>
+ */
+@SuppressWarnings("unused")
+public final class MultiMoreLikeThis {
+
+  /**
+   * Default maximum number of tokens to parse in each example doc field that is not stored with
+   * TermVector support.
+   *
+   * @see #getMaxNumTokensParsed
+   */
+  public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+
+  /**
+   * Ignore terms with less than this frequency in the source doc.
+   *
+   * @see #getMinTermFreq
+   * @see #setMinTermFreq
+   */
+  public static final int DEFAULT_MIN_TERM_FREQ = 2;
+
+  /**
+   * Ignore words which do not occur in at least this many docs.
+   *
+   * @see #getMinDocFreq
+   * @see #setMinDocFreq
+   */
+  public static final long DEFAULT_MIN_DOC_FREQ = 5;
+
+  /**
+   * Ignore words which occur in more than this many docs.
+   *
+   * @see #getMaxDocFreq
+   * @see #setMaxDocFreq
+   * @see #setMaxDocFreqPct
+   */
+  public static final long DEFAULT_MAX_DOC_FREQ = Long.MAX_VALUE;
+
+  /**
+   * Boost terms in query based on score.
+   *
+   * @see #isBoost
+   * @see #setBoost
+   */
+  public static final boolean DEFAULT_BOOST = false;
+
+  /**
+   * Default field names. Null is used to specify that the field names should be looked up at
+   * runtime from the provided reader.
+   */
+  public static final String[] DEFAULT_FIELD_NAMES = new String[] {"contents"};
+
+  /**
+   * Ignore words less than this length or if 0 then this has no effect.
+   *
+   * @see #getMinWordLen
+   * @see #setMinWordLen
+   */
+  public static final int DEFAULT_MIN_WORD_LENGTH = 0;
+
+  /**
+   * Ignore words greater than this length or if 0 then this has no effect.
+   *
+   * @see #getMaxWordLen
+   * @see #setMaxWordLen
+   */
+  public static final int DEFAULT_MAX_WORD_LENGTH = 0;
+
+  /**
+   * Default set of stopwords. If null means to allow stop words.
+   *
+   * @see #setStopWords
+   * @see #getStopWords
+   */
+  public static final Set<?> DEFAULT_STOP_WORDS = null;
+
+  /** Current set of stop words. */
+  private Set<?> stopWords = DEFAULT_STOP_WORDS;
+
+  /**
+   * Return a Query with no more than this many terms.
+   *
+   * @see IndexSearcher#getMaxClauseCount
+   * @see #getMaxQueryTerms
+   * @see #setMaxQueryTerms
+   */
+  public static final int DEFAULT_MAX_QUERY_TERMS = 25;
+
+  /** Analyzer that will be used to parse the doc. */
+  private Analyzer analyzer = null;
+
+  /** Ignore words less frequent than this. */
+  private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+
+  /** Ignore words which do not occur in at least this many docs. */
+  private long minDocFreq = DEFAULT_MIN_DOC_FREQ;
+
+  /** Ignore words which occur in more than this many docs. */
+  private long maxDocFreq = DEFAULT_MAX_DOC_FREQ;
+
+  /** Should we apply a boost to the Query based on the scores? */
+  private boolean boost = DEFAULT_BOOST;
+
+  /** Field names we'll analyze. */
+  private String[] fieldNames = DEFAULT_FIELD_NAMES;
+
+  /**
+   * The maximum number of tokens to parse in each example doc field that is not stored with
+   * TermVector support
+   */
+  private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+
+  /** Ignore words if less than this len. */
+  private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+
+  /** Ignore words if greater than this len. */
+  private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+
+  /** Don't return a query longer than this. */
+  private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+
+  /** For idf() calculations. */
+  private TFIDFSimilarity similarity; // = new DefaultSimilarity();
+
+  /** IndexReader to use */
+  private final IndexReader ir;
+
+  /**
+   * IndexReader array to use when multi-searchers are used.
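+   * When this array is set, document frequencies and document counts are summed across all of its
+   * readers, so the generated query behaves as if the shards were one logical index.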
+   */
+  private final IndexReader[] irArray;
+
+  /** Boost factor to use when boosting the terms */
+  private float boostFactor = 1;
+
+  /**
+   * Returns the boost factor used when boosting terms
+   *
+   * @return the boost factor used when boosting terms
+   * @see #setBoostFactor(float)
+   */
+  public float getBoostFactor() {
+    return boostFactor;
+  }
+
+  /**
+   * Sets the boost factor to use when boosting terms
+   *
+   * @see #getBoostFactor()
+   */
+  public void setBoostFactor(float boostFactor) {
+    this.boostFactor = boostFactor;
+  }
+
+  /** Constructor requiring an IndexReader. */
+  public MultiMoreLikeThis(IndexReader ir) {
+    this(ir, new ClassicSimilarity());
+  }
+
+  public MultiMoreLikeThis(IndexReader ir, TFIDFSimilarity sim) {
+    this(ir, null, sim);
+  }
+
+  public MultiMoreLikeThis(IndexReader[] irArray) {
+    this(irArray, new ClassicSimilarity());
+  }
+
+  public MultiMoreLikeThis(IndexReader[] irArray, TFIDFSimilarity sim) {
+    this(null, irArray, sim);
+  }
+
+  private MultiMoreLikeThis(IndexReader ir, IndexReader[] irArray, TFIDFSimilarity sim) {
+    if ((ir == null) == (irArray == null)) {
+      throw new IllegalArgumentException();
+    }
+    this.irArray = irArray;
+    this.ir = ir;
+    this.similarity = sim;
+  }
+
+  public TFIDFSimilarity getSimilarity() {
+    return similarity;
+  }
+
+  public void setSimilarity(TFIDFSimilarity similarity) {
+    this.similarity = similarity;
+  }
+
+  /**
+   * Returns an analyzer that will be used to parse source doc with. The default analyzer is not
+   * set.
+   *
+   * @return the analyzer that will be used to parse source doc with.
+   */
+  public Analyzer getAnalyzer() {
+    return analyzer;
+  }
+
+  /**
+   * Sets the analyzer to use. An analyzer is not required for generating a query with the {@link
+   * #like(int)} method, all other 'like' methods require an analyzer.
+   *
+   * @param analyzer the analyzer to use to tokenize text.
+   */
+  public void setAnalyzer(Analyzer analyzer) {
+    this.analyzer = analyzer;
+  }
+
+  /**
+   * Returns the frequency below which terms will be ignored in the source doc. The default
+   * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
+   *
+   * @return the frequency below which terms will be ignored in the source doc.
+   */
+  public int getMinTermFreq() {
+    return minTermFreq;
+  }
+
+  /**
+   * Sets the frequency below which terms will be ignored in the source doc.
+   *
+   * @param minTermFreq the frequency below which terms will be ignored in the source doc.
+   */
+  public void setMinTermFreq(int minTermFreq) {
+    this.minTermFreq = minTermFreq;
+  }
+
+  /**
+   * Returns the frequency at which words will be ignored which do not occur in at least this many
+   * docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
+   *
+   * @return the frequency at which words will be ignored which do not occur in at least this many
+   *     docs.
+   */
+  public long getMinDocFreq() {
+    return minDocFreq;
+  }
+
+  /**
+   * Sets the frequency at which words will be ignored which do not occur in at least this many
+   * docs.
+   *
+   * @param minDocFreq the frequency at which words will be ignored which do not occur in at least
+   *     this many docs.
+   */
+  public void setMinDocFreq(long minDocFreq) {
+    this.minDocFreq = minDocFreq;
+  }
+
+  /**
+   * Returns the maximum frequency in which words may still appear. Words that appear in more than
+   * this many docs will be ignored. The default frequency is {@link #DEFAULT_MAX_DOC_FREQ}.
+   *
+   * @return get the maximum frequency at which words are still allowed, words which occur in more
+   *     docs than this are ignored.
+   */
+  public long getMaxDocFreq() {
+    return maxDocFreq;
+  }
+
+  /**
+   * Set the maximum frequency in which words may still appear. Words that appear in more than this
+   * many docs will be ignored.
+   *
+   * @param maxFreq the maximum count of documents that a term may appear in to be still considered
+   *     relevant
+   */
+  public void setMaxDocFreq(long maxFreq) {
+    this.maxDocFreq = maxFreq;
+  }
+
+  /**
+   * Set the maximum percentage in which words may still appear. Words that appear in more than
+   * this many percent of all docs will be ignored.
+   *
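+   * <p>For example (the numbers are purely illustrative): with two shard readers holding 600 and
+   * 400 documents, {@code setMaxDocFreqPct(20)} resolves against the summed size, i.e. {@code
+   * maxDocFreq = 20 * (600 + 400) / 100 = 200} documents across the whole sharded index.
+   *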
+   * <p>This method calls {@link #setMaxDocFreq(long)} internally (both conditions cannot be used
+   * at the same time).
+   *
+   * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear in to
+   *     be still considered relevant.
+   */
+  public void setMaxDocFreqPct(int maxPercentage) {
+    long maxDoc;
+    if (irArray == null) {
+      maxDoc = ir.maxDoc();
+    } else {
+      maxDoc = 0L;
+      for (IndexReader ir : irArray) {
+        maxDoc += ir.maxDoc();
+      }
+    }
+    setMaxDocFreq(Math.toIntExact((long) maxPercentage * maxDoc / 100L));
+  }
+
+  /**
+   * Returns whether to boost terms in query based on "score" or not. The default is {@link
+   * #DEFAULT_BOOST}.
+   *
+   * @return whether to boost terms in query based on "score" or not.
+   * @see #setBoost
+   */
+  public boolean isBoost() {
+    return boost;
+  }
+
+  /**
+   * Sets whether to boost terms in query based on "score" or not.
+   *
+   * @param boost true to boost terms in query based on "score", false otherwise.
+   * @see #isBoost
+   */
+  public void setBoost(boolean boost) {
+    this.boost = boost;
+  }
+
+  /**
+   * Returns the field names that will be used when generating the 'More Like This' query. The
+   * default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
+   *
+   * @return the field names that will be used when generating the 'More Like This' query.
+   */
+  public String[] getFieldNames() {
+    return fieldNames;
+  }
+
+  /**
+   * Sets the field names that will be used when generating the 'More Like This' query. Set this to
+   * null for the field names to be determined at runtime from the IndexReader provided in the
+   * constructor.
+   *
+   * @param fieldNames the field names that will be used when generating the 'More Like This'
+   *     query.
+   */
+  public void setFieldNames(String[] fieldNames) {
+    this.fieldNames = fieldNames;
+  }
+
+  /**
+   * Returns the minimum word length below which words will be ignored. Set this to 0 for no
+   * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
+   *
+   * @return the minimum word length below which words will be ignored.
+   */
+  public int getMinWordLen() {
+    return minWordLen;
+  }
+
+  /**
+   * Sets the minimum word length below which words will be ignored.
+   *
+   * @param minWordLen the minimum word length below which words will be ignored.
+   */
+  public void setMinWordLen(int minWordLen) {
+    this.minWordLen = minWordLen;
+  }
+
+  /**
+   * Returns the maximum word length above which words will be ignored. Set this to 0 for no
+   * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
+   *
+   * @return the maximum word length above which words will be ignored.
+   */
+  public int getMaxWordLen() {
+    return maxWordLen;
+  }
+
+  /**
+   * Sets the maximum word length above which words will be ignored.
+   *
+   * @param maxWordLen the maximum word length above which words will be ignored.
+   */
+  public void setMaxWordLen(int maxWordLen) {
+    this.maxWordLen = maxWordLen;
+  }
+
+  /**
+   * Set the set of stopwords. Any word in this set is considered "uninteresting" and ignored. Even
+   * if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore
+   * them, as for the purposes of document similarity it seems reasonable to assume that "a stop
+   * word is never interesting".
+   *
+   * @param stopWords set of stopwords, if null it means to allow stop words
+   * @see #getStopWords
+   */
+  public void setStopWords(Set<?> stopWords) {
+    this.stopWords = stopWords;
+  }
+
+  /**
+   * Get the current stop words being used.
+   *
+   * @see #setStopWords
+   */
+  public Set<?> getStopWords() {
+    return stopWords;
+  }
+
+  /**
+   * Returns the maximum number of query terms that will be included in any generated query. The
+   * default is {@link #DEFAULT_MAX_QUERY_TERMS}.
+   *
+   * @return the maximum number of query terms that will be included in any generated query.
+   */
+  public int getMaxQueryTerms() {
+    return maxQueryTerms;
+  }
+
+  /**
+   * Sets the maximum number of query terms that will be included in any generated query.
+   *
+   * @param maxQueryTerms the maximum number of query terms that will be included in any generated
+   *     query.
+   */
+  public void setMaxQueryTerms(int maxQueryTerms) {
+    this.maxQueryTerms = maxQueryTerms;
+  }
+
+  /**
+   * @return The maximum number of tokens to parse in each example doc field that is not stored
+   *     with TermVector support
+   * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
+   */
+  public int getMaxNumTokensParsed() {
+    return maxNumTokensParsed;
+  }
+
+  /**
+   * @param i The maximum number of tokens to parse in each example doc field that is not stored
+   *     with TermVector support
+   */
+  public void setMaxNumTokensParsed(int i) {
+    maxNumTokensParsed = i;
+  }
+
+  /**
+   * Return a query that will return docs like the passed lucene document ID.
+   *
+   * @param docNum the documentID of the lucene doc to generate the 'More Like This' query for.
+   * @return a query that will return docs like the passed lucene document ID.
+   */
+  public Query like(int docNum) throws IOException {
+    if (fieldNames == null) {
+      // gather list of valid fields from lucene
+      Collection<String> fields;
+      if (irArray == null) {
+        fields = FieldInfos.getIndexedFields(ir);
+      } else {
+        fields = new ArrayList<>();
+        for (IndexReader ir : irArray) {
+          fields.addAll(FieldInfos.getIndexedFields(ir));
+        }
+      }
+      fieldNames = fields.toArray(String[]::new);
+    }
+
+    return createQuery(retrieveTerms(docNum));
+  }
+
+  /**
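+   * A sketch of a typical call (the field names and values here are illustrative):
+   *
+   * <pre>{@code
+   * Map<String, Collection<Object>> doc = new HashMap<>();
+   * doc.put("title", new ArrayList<>(List.of("lucene")));
+   * doc.put("body", new ArrayList<>(List.of("fast full-text search engine library")));
+   * Query q = mlt.like(doc);
+   * }</pre>
+   *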
+   * @param filteredDocument Document with field values extracted for selected fields.
+   * @return More Like This query for the passed document.
+   */
+  public Query like(Map<String, Collection<Object>> filteredDocument) throws IOException {
+    if (fieldNames == null) {
+      // gather list of valid fields from lucene
+      Collection<String> fields;
+      if (irArray == null) {
+        fields = FieldInfos.getIndexedFields(ir);
+      } else {
+        fields = new ArrayList<>();
+        for (IndexReader ir : irArray) {
+          fields.addAll(FieldInfos.getIndexedFields(ir));
+        }
+      }
+      fieldNames = fields.toArray(String[]::new);
+    }
+    return createQuery(retrieveTerms(filteredDocument));
+  }
+
+  /**
+   * Return a query that will return docs like the passed Readers. This was added in order to
+   * treat multi-value fields.
+   *
+   * @return a query that will return docs like the passed Readers.
+   */
+  public Query like(String fieldName, Reader... readers) throws IOException {
+    Map<String, Map<String, Int>> perFieldTermFrequencies = new HashMap<>();
+    for (Reader r : readers) {
+      addTermFrequencies(r, perFieldTermFrequencies, fieldName);
+    }
+    return createQuery(createQueue(perFieldTermFrequencies));
+  }
+
+  /** Create the More like query from a PriorityQueue */
+  private Query createQuery(PriorityQueue<ScoreTerm> q) {
+    BooleanQuery.Builder query = new BooleanQuery.Builder();
+    ScoreTerm scoreTerm;
+    float bestScore = -1;
+
+    while ((scoreTerm = q.pop()) != null) {
+      Query tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
+
+      if (boost) {
+        if (bestScore == -1) {
+          bestScore = (scoreTerm.score);
+        }
+        float myScore = (scoreTerm.score);
+        tq = new BoostQuery(tq, boostFactor * myScore / bestScore);
+      }
+
+      try {
+        query.add(tq, BooleanClause.Occur.SHOULD);
+      } catch (
+          @SuppressWarnings("unused")
+          IndexSearcher.TooManyClauses ignore) {
+        break;
+      }
+    }
+    return query.build();
+  }
+
+  /**
+   * Create a PriorityQueue from a word->tf map.
+   *
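+   * <p>Scoring sketch: each surviving term is ranked by {@code score = tf * similarity.idf(docFreq,
+   * numDocs)}; in the sharded case both {@code docFreq} and {@code numDocs} are summed over all
+   * readers before the score is computed.
+   *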
+   * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int
+   *     objects as the values.
+   */
+  private PriorityQueue<ScoreTerm> createQueue(
+      Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
+    // have collected all words in doc and their freqs
+    final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
+    FreqQ queue = new FreqQ(limit); // will order words by score
+    for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
+      Map<String, Int> perWordTermFrequencies = entry.getValue();
+      String fieldName = entry.getKey();
+
+      long numDocs;
+      if (irArray == null) {
+        numDocs = ir.getDocCount(fieldName);
+        if (numDocs == -1) {
+          numDocs = ir.numDocs();
+        }
+      } else {
+        numDocs = 0L;
+        for (IndexReader ir : irArray) {
+          long localNumDocs = ir.getDocCount(fieldName);
+          if (localNumDocs == -1) {
+            localNumDocs = ir.numDocs();
+          }
+          numDocs += localNumDocs;
+        }
+      }
+
+      for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
+        String word = tfEntry.getKey();
+        int tf = tfEntry.getValue().x; // term freq in the source doc
+        if (minTermFreq > 0 && tf < minTermFreq) {
+          continue; // filter out words that don't occur enough times in the source
+        }
+
+        long docFreq;
+        var fieldTerm = new Term(fieldName, word);
+        if (irArray == null) {
+          docFreq = ir.docFreq(fieldTerm);
+        } else {
+          docFreq = 0;
+          for (IndexReader ir : irArray) {
+            docFreq += ir.docFreq(fieldTerm);
+          }
+        }
+
+        if (minDocFreq > 0L && docFreq < minDocFreq) {
+          continue; // filter out words that don't occur in enough docs
+        }
+
+        if (docFreq > maxDocFreq) {
+          continue; // filter out words that occur in too many docs
+        }
+
+        if (docFreq == 0) {
+          continue; // index update problem?
+        }
+
+        float idf = similarity.idf(docFreq, numDocs);
+        float score = tf * idf;
+
+        if (queue.size() < limit) {
+          // there is still space in the queue
+          queue.add(new ScoreTerm(word, fieldName, score));
+        } else {
+          ScoreTerm term = queue.top();
+          // update the smallest in the queue in place and update the queue.
+          if (term.score < score) {
+            term.update(word, fieldName, score);
+            queue.updateTop();
+          }
+        }
+      }
+    }
+    return queue;
+  }
+
+  private int getTermsCount(Map<String, Map<String, Int>> perFieldTermFrequencies) {
+    int totalTermsCount = 0;
+    Collection<Map<String, Int>> values = perFieldTermFrequencies.values();
+    for (Map<String, Int> perWordTermFrequencies : values) {
+      totalTermsCount += perWordTermFrequencies.size();
+    }
+    return totalTermsCount;
+  }
+
+  /** Describe the parameters that control how the "more like this" query is formed. */
+  public String describeParams() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n");
+    sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n");
+    sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n");
+    sb.append("\t").append("fieldNames : ");
+    String delim = "";
+    for (String fieldName : fieldNames) {
+      sb.append(delim).append(fieldName);
+      delim = ", ";
+    }
+    sb.append("\n");
+    sb.append("\t").append("boost : ").append(boost).append("\n");
+    sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n");
+    sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n");
+    return sb.toString();
+  }
+
+  /**
+   * Find words for a more-like-this query former.
+   *
+   * @param docNum the id of the lucene document from which to find terms
+   */
+  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    if (irArray == null) {
+      retrieveTermsOfIndexReader(ir, docNum, field2termFreqMap);
+    } else {
+      for (IndexReader ir : irArray) {
+        retrieveTermsOfIndexReader(ir, docNum, field2termFreqMap);
+      }
+    }
+
+    return createQueue(field2termFreqMap);
+  }
+
+  private void retrieveTermsOfIndexReader(IndexReader ir, int docNum,
+      Map<String, Map<String, Int>> field2termFreqMap) throws IOException {
+    for (String fieldName : fieldNames) {
+      final Fields vectors = ir.getTermVectors(docNum);
+      final Terms vector;
+      if (vectors != null) {
+        vector = vectors.terms(fieldName);
+      } else {
+        vector = null;
+      }
+
+      // field does not store term vector info
+      if (vector == null) {
+        Document d = ir.document(docNum);
+        IndexableField[] fields = d.getFields(fieldName);
+        for (IndexableField field : fields) {
+          final String stringValue = field.stringValue();
+          if (stringValue != null) {
+            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
+          }
+        }
+      } else {
+        addTermFrequencies(field2termFreqMap, vector, fieldName);
+      }
+    }
+  }
+
+  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues)
+      throws IOException {
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    for (String fieldName : fieldNames) {
+      Collection<Object> fieldValues = field2fieldValues.get(fieldName);
+      if (fieldValues == null) {
+        continue;
+      }
+      for (Object fieldValue : fieldValues) {
+        if (fieldValue != null) {
+          addTermFrequencies(
+              new StringReader(String.valueOf(fieldValue)), field2termFreqMap, fieldName);
+        }
+      }
+    }
+    return createQueue(field2termFreqMap);
+  }
+
+  /**
+   * Adds terms and frequencies found in vector into the Map termFreqMap
+   *
+   * @param field2termFreqMap a Map of terms and their frequencies per field
+   * @param vector List of terms and their frequencies for a doc/field
+   */
+  private void addTermFrequencies(
+      Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName)
+      throws IOException {
+    Map<String, Int> termFreqMap =
+        field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());
+    final TermsEnum termsEnum = vector.iterator();
+    final CharsRefBuilder spare = new CharsRefBuilder();
+    BytesRef text;
+    while ((text = termsEnum.next()) != null) {
+      spare.copyUTF8Bytes(text);
+      final String term = spare.toString();
+      if (isNoiseWord(term)) {
+        continue;
+      }
+      final int freq = (int) termsEnum.totalTermFreq();
+
+      // increment frequency
+      Int cnt = termFreqMap.get(term);
+      if (cnt == null) {
+        cnt = new Int();
+        termFreqMap.put(term, cnt);
+        cnt.x = freq;
+      } else {
+        cnt.x += freq;
+      }
+    }
+  }
+
+  /**
+   * Adds term frequencies found by tokenizing text from reader
+   * into the Map words
+   *
+   * @param r a source of text to be tokenized
+   * @param perFieldTermFrequencies a Map of terms and their frequencies per field
+   * @param fieldName Used by analyzer for any special per-field analysis
+   */
+  private void addTermFrequencies(
+      Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
+      throws IOException {
+    if (analyzer == null) {
+      throw new UnsupportedOperationException(
+          "To use MoreLikeThis without term vectors, you must provide an Analyzer");
+    }
+    Map<String, Int> termFreqMap =
+        perFieldTermFrequencies.computeIfAbsent(fieldName, k -> new HashMap<>());
+    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
+      int tokenCount = 0;
+      // for every token
+      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+      TermFrequencyAttribute tfAtt = ts.addAttribute(TermFrequencyAttribute.class);
+      ts.reset();
+      while (ts.incrementToken()) {
+        String word = termAtt.toString();
+        tokenCount++;
+        if (tokenCount > maxNumTokensParsed) {
+          break;
+        }
+        if (isNoiseWord(word)) {
+          continue;
+        }
+
+        // increment frequency
+        Int cnt = termFreqMap.get(word);
+        if (cnt == null) {
+          termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
+        } else {
+          cnt.x += tfAtt.getTermFrequency();
+        }
+      }
+      ts.end();
+    }
+  }
+
+  /**
+   * determines if the passed term is likely to be of interest in "more like" comparisons
+   *
+   * @param term The word being considered
+   * @return true if should be ignored, false if should be used in further analysis
+   */
+  private boolean isNoiseWord(String term) {
+    int len = term.length();
+    if (minWordLen > 0 && len < minWordLen) {
+      return true;
+    }
+    if (maxWordLen > 0 && len > maxWordLen) {
+      return true;
+    }
+    return stopWords != null && stopWords.contains(term);
+  }
+
+  /**
+   * Find words for a more-like-this query former. The result is a priority queue of arrays with
+   * one entry for every word in the document. Each array has 6 elements. The elements are:
+   *
+   * <ol>
+   *   <li>The word (String)
+   *   <li>The top field that this word comes from (String)
+   *   <li>The score for this word (Float)
+   *   <li>The IDF value (Float)
+   *   <li>The frequency of this word in the index (Integer)
+   *   <li>The frequency of this word in the source document (Integer)
+   * </ol>
+   *
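+   * <p>A sketch of the easier, public path over this routine (the reader and field name here are
+   * illustrative):
+   *
+   * <pre>{@code
+   * String[] terms = mlt.retrieveInterestingTerms(new StringReader(text), "body");
+   * }</pre>
+   *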
+   * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of
+   * interest. This method is exposed so that you can identify the "interesting words" in a
+   * document. For an easier method to call see {@link #retrieveInterestingTerms
+   * retrieveInterestingTerms()}.
+   *
+   * @param r the reader that has the content of the document
+   * @param fieldName field passed to the analyzer to use when analyzing the content
+   * @return the most interesting words in the document ordered by score, with the highest scoring,
+   *     or best entry, first
+   * @see #retrieveInterestingTerms
+   */
+  private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    addTermFrequencies(r, field2termFreqMap, fieldName);
+    return createQueue(field2termFreqMap);
+  }
+
+  /** @see #retrieveInterestingTerms(java.io.Reader, String) */
+  public String[] retrieveInterestingTerms(int docNum) throws IOException {
+    ArrayList<String> al = new ArrayList<>(maxQueryTerms);
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
+    ScoreTerm scoreTerm;
+    // have to be careful, retrieveTerms returns all words but that's probably not useful to our
+    // caller...
+    int lim = maxQueryTerms;
+    // we just want to return the top words
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
+    }
+    String[] res = new String[al.size()];
+    return al.toArray(res);
+  }
+
+  /**
+   * Convenience routine to make it easy to return the most interesting words in a document. More
+   * advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
+   *
+   * @param r the source document
+   * @param fieldName field passed to analyzer to use when analyzing the content
+   * @return the most interesting words in the document
+   * @see #retrieveTerms(java.io.Reader, String)
+   * @see #setMaxQueryTerms
+   */
+  public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
+    ArrayList<String> al = new ArrayList<>(maxQueryTerms);
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
+    ScoreTerm scoreTerm;
+    // have to be careful, retrieveTerms returns all words but that's probably not useful to our
+    // caller...
+    int lim = maxQueryTerms;
+    // we just want to return the top words
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
+    }
+    String[] res = new String[al.size()];
+    return al.toArray(res);
+  }
+
+  /** PriorityQueue that orders words by score. */
+  private static class FreqQ extends PriorityQueue<ScoreTerm> {
+    FreqQ(int maxSize) {
+      super(maxSize);
+    }
+
+    @Override
+    protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
+      return a.score < b.score;
+    }
+  }
+
+  private static class ScoreTerm {
+    // only really need 1st 3 entries, other ones are for troubleshooting
+    String word;
+    String topField;
+    float score;
+
+    ScoreTerm(String word, String topField, float score) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+    }
+
+    void update(String word, String topField, float score) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+    }
+  }
+
+  /** Use for frequencies and to avoid renewing Integers. */
+  private static class Int {
+    int x;
+
+    Int() {
+      this(1);
+    }
+
+    Int(int initialValue) {
+      x = initialValue;
+    }
+  }
+}
\ No newline at end of file