From e66bc6ce535a4d5b953809faadcea3a77dd6a65a Mon Sep 17 00:00:00 2001 From: Andrea Cavalli Date: Sun, 28 May 2023 16:44:54 +0200 Subject: [PATCH] Add solr query --- pom.xml | 10 +- src/main/data-generator/lucene-query.yaml | 10 +- .../dbengine/client/query/QueryParser.java | 398 ++++++++++++++++++ .../dbengine/client/query/QueryUtils.java | 7 + 4 files changed, 419 insertions(+), 6 deletions(-) diff --git a/pom.xml b/pom.xml index 6ef1a3e..474bd9f 100644 --- a/pom.xml +++ b/pom.xml @@ -206,11 +206,11 @@ lucene-core ${lucene.version} - - org.apache.lucene - lucene-join - ${lucene.version} - + + org.apache.lucene + lucene-join + ${lucene.version} + org.apache.lucene lucene-analysis-common diff --git a/src/main/data-generator/lucene-query.yaml b/src/main/data-generator/lucene-query.yaml index 83e4c3c..06054e9 100644 --- a/src/main/data-generator/lucene-query.yaml +++ b/src/main/data-generator/lucene-query.yaml @@ -6,7 +6,7 @@ superTypesData: Query: [ BoxedQuery, TermQuery, IntTermQuery, IntNDTermQuery, LongTermQuery, LongNDTermQuery, FloatTermQuery, FloatNDTermQuery, DoubleTermQuery, DoubleNDTermQuery, - PhraseQuery, WildcardQuery, SynonymQuery, FuzzyQuery, MatchAllDocsQuery, MatchNoDocsQuery, + PhraseQuery, SolrTextQuery, WildcardQuery, SynonymQuery, FuzzyQuery, MatchAllDocsQuery, MatchNoDocsQuery, BooleanQuery, SortedNumericDocValuesFieldSlowRangeQuery, SortedDocFieldExistsQuery, ConstantScoreQuery, BoostQuery, IntPointRangeQuery, IntNDPointRangeQuery, LongPointRangeQuery, FloatPointRangeQuery, DoublePointRangeQuery, LongNDPointRangeQuery, FloatNDPointRangeQuery, @@ -136,6 +136,14 @@ baseTypesData: # counted as characters from the beginning of the phrase. phrase: TermPosition[] slop: int + # Query that matches a phrase. (Solr) + SolrTextQuery: + data: + # Field name + field: String + # Text query + phrase: String + slop: int # Advanced query that matches text allowing asterisks in the query WildcardQuery: data: diff --git a/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java b/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java index 36bb21f..2195bb8 100644 --- a/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java +++ b/src/main/java/it/cavallium/dbengine/client/query/QueryParser.java @@ -1,5 +1,8 @@ package it.cavallium.dbengine.client.query; +import com.google.common.xml.XmlEscapers; +import it.cavallium.dbengine.client.query.current.data.BooleanQuery; +import it.cavallium.dbengine.client.query.current.data.BooleanQueryBuilder; import it.cavallium.dbengine.client.query.current.data.BooleanQueryPart; import it.cavallium.dbengine.client.query.current.data.BoostQuery; import it.cavallium.dbengine.client.query.current.data.BoxedQuery; @@ -34,9 +37,12 @@ import it.cavallium.dbengine.client.query.current.data.LongPointRangeQuery; import it.cavallium.dbengine.client.query.current.data.LongPointSetQuery; import it.cavallium.dbengine.client.query.current.data.LongTermQuery; import it.cavallium.dbengine.client.query.current.data.NumericSort; +import it.cavallium.dbengine.client.query.current.data.OccurMust; +import it.cavallium.dbengine.client.query.current.data.OccurShould; import it.cavallium.dbengine.client.query.current.data.PhraseQuery; import it.cavallium.dbengine.client.query.current.data.PointConfig; import it.cavallium.dbengine.client.query.current.data.PointType; +import it.cavallium.dbengine.client.query.current.data.SolrTextQuery; import it.cavallium.dbengine.client.query.current.data.SortedDocFieldExistsQuery; import it.cavallium.dbengine.client.query.current.data.SortedNumericDocValuesFieldSlowRangeQuery; import it.cavallium.dbengine.client.query.current.data.SynonymQuery; @@ -47,9 +53,13 @@ import it.cavallium.dbengine.client.query.current.data.WildcardQuery; import it.cavallium.dbengine.lucene.RandomSortField; import java.text.DecimalFormat; import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; import java.util.Map; import java.util.function.Function; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.document.DoublePoint; @@ -61,6 +71,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig; +import org.apache.lucene.queryparser.xml.builders.UserInputQueryBuilder; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery.Builder; import org.apache.lucene.search.DocValuesFieldExistsQuery; @@ -72,9 +83,13 @@ import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField.Type; import org.apache.lucene.search.SortedNumericSortField; +import org.jetbrains.annotations.Nullable; public class QueryParser { + private static final String[] QUERY_STRING_FIND = {"\\", "\""}; + private static final String[] QUERY_STRING_REPLACE = {"\\\\", "\\\""}; + public static Query toQuery(it.cavallium.dbengine.client.query.current.data.Query query, Analyzer analyzer) { if (query == null) { return null; @@ -350,6 +365,389 @@ public class QueryParser { } } + public static void toQueryXML(StringBuilder out, + it.cavallium.dbengine.client.query.current.data.Query query, + @Nullable Float boost) { + if (query == null) { + return; + } + switch (query.getBaseType$()) { + case StandardQuery -> { + var standardQuery = (it.cavallium.dbengine.client.query.current.data.StandardQuery) query; + + out.append(" 1) { + throw new UnsupportedOperationException("Maximum supported default fields count: 1"); + } + if (boost != null) { + out.append(" boost=\"").append(boost).append("\""); + } + if (standardQuery.defaultFields().size() == 1) { + out + .append(" fieldName=\"") + .append(XmlEscapers.xmlAttributeEscaper().escape(standardQuery.defaultFields().get(0))) + .append("\""); + } + if (!standardQuery.termFields().isEmpty()) { + throw new UnsupportedOperationException("Term fields unsupported"); + } + if (!standardQuery.pointsConfig().isEmpty()) { + throw new UnsupportedOperationException("Points config unsupported"); + } + out.append(">"); + out.append(XmlEscapers.xmlContentEscaper().escape(standardQuery.query())); + out.append("\n"); + } + case BooleanQuery -> { + var booleanQuery = (it.cavallium.dbengine.client.query.current.data.BooleanQuery) query; + + out.append("\n"); + + for (BooleanQueryPart part : booleanQuery.parts()) { + out.append(" "filter"; + case OccurMust -> "must"; + case OccurShould -> "should"; + case OccurMustNot -> "mustNot"; + default -> throw new IllegalStateException("Unexpected value: " + part.occur().getBaseType$()); + }).append("\""); + out.append(">\n"); + toQueryXML(out, part.query(), null); + out.append("\n"); + } + out.append("\n"); + } + case IntPointExactQuery -> { + var intPointExactQuery = (IntPointExactQuery) query; + out.append("\n"); + } + case IntNDPointExactQuery -> { + var intPointExactQuery = (IntPointExactQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case LongPointExactQuery -> { + var longPointExactQuery = (LongPointExactQuery) query; + out.append("\n"); + } + case FloatPointExactQuery -> { + var floatPointExactQuery = (FloatPointExactQuery) query; + out.append("\n"); + } + case DoublePointExactQuery -> { + var doublePointExactQuery = (DoublePointExactQuery) query; + out.append("\n"); + } + case LongNDPointExactQuery -> { + var longndPointExactQuery = (LongNDPointExactQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case FloatNDPointExactQuery -> { + var floatndPointExactQuery = (FloatNDPointExactQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case DoubleNDPointExactQuery -> { + var doublendPointExactQuery = (DoubleNDPointExactQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case IntPointSetQuery -> { + var intPointSetQuery = (IntPointSetQuery) query; + // Polyfill + toQueryXML(out, BooleanQuery.of(intPointSetQuery.values().intStream() + .mapToObj(val -> IntPointExactQuery.of(intPointSetQuery.field(), val)) + .map(q -> BooleanQueryPart.of(q, OccurShould.of())) + .toList(), 1), boost); + } + case LongPointSetQuery -> { + var longPointSetQuery = (LongPointSetQuery) query; + // Polyfill + toQueryXML(out, BooleanQuery.of(longPointSetQuery.values().longStream() + .mapToObj(val -> LongPointExactQuery.of(longPointSetQuery.field(), val)) + .map(q -> BooleanQueryPart.of(q, OccurShould.of())) + .toList(), 1), boost); + } + case FloatPointSetQuery -> { + var floatPointSetQuery = (FloatPointSetQuery) query; + // Polyfill + toQueryXML(out, BooleanQuery.of(floatPointSetQuery.values().stream() + .map(val -> FloatPointExactQuery.of(floatPointSetQuery.field(), val)) + .map(q -> BooleanQueryPart.of(q, OccurShould.of())) + .toList(), 1), boost); + } + case DoublePointSetQuery -> { + var doublePointSetQuery = (DoublePointSetQuery) query; + // Polyfill + toQueryXML(out, BooleanQuery.of(doublePointSetQuery.values().doubleStream() + .mapToObj(val -> DoublePointExactQuery.of(doublePointSetQuery.field(), val)) + .map(q -> BooleanQueryPart.of(q, OccurShould.of())) + .toList(), 1), boost); + } + case TermQuery -> { + var termQuery = (TermQuery) query; + out + .append(""); + out.append(XmlEscapers.xmlContentEscaper().escape(termQuery.term().value())); + out.append("\n"); + } + case IntTermQuery -> { + var intTermQuery = (IntTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case IntNDTermQuery -> { + var intNDTermQuery = (IntNDTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case LongTermQuery -> { + var longTermQuery = (LongTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case LongNDTermQuery -> { + var longNDTermQuery = (LongNDTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case FloatTermQuery -> { + var floatTermQuery = (FloatTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case FloatNDTermQuery -> { + var floatNDTermQuery = (FloatNDTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case DoubleTermQuery -> { + var doubleTermQuery = (DoubleTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case DoubleNDTermQuery -> { + var doubleNDTermQuery = (DoubleNDTermQuery) query; + throw new UnsupportedOperationException("Non-string term fields are not supported"); + } + case FieldExistsQuery -> { + var fieldExistQuery = (FieldExistsQuery) query; + out.append(""); + ensureValidField(fieldExistQuery.field()); + out.append(fieldExistQuery.field()); + out.append(":[* TO *]"); + out.append("\n"); + } + case SolrTextQuery -> { + var solrTextQuery = (SolrTextQuery) query; + out.append(""); + ensureValidField(solrTextQuery.field()); + out.append(solrTextQuery.field()); + out.append(":"); + out.append("\"").append(escapeQueryStringValue(solrTextQuery.phrase())).append("\""); + if (solrTextQuery.slop() > 0) { + out.append("~").append(solrTextQuery.slop()); + } + out.append("\n"); + } + case BoostQuery -> { + var boostQuery = (BoostQuery) query; + toQueryXML(out, boostQuery.query(), boostQuery.scoreBoost()); + } + case ConstantScoreQuery -> { + var constantScoreQuery = (ConstantScoreQuery) query; + out.append("\n"); + toQueryXML(out, query, null); + out.append("\n"); + } + case BoxedQuery -> { + toQueryXML(out, ((BoxedQuery) query).query(), boost); + } + case FuzzyQuery -> { + var fuzzyQuery = (it.cavallium.dbengine.client.query.current.data.FuzzyQuery) query; + new FuzzyQuery(toTerm(fuzzyQuery.term()), + fuzzyQuery.maxEdits(), + fuzzyQuery.prefixLength(), + fuzzyQuery.maxExpansions(), + fuzzyQuery.transpositions() + ); + throw new UnsupportedOperationException("Fuzzy query is not supported, use span queries"); + } + case IntPointRangeQuery -> { + var intPointRangeQuery = (IntPointRangeQuery) query; + out.append("\n"); + } + case IntNDPointRangeQuery -> { + var intndPointRangeQuery = (IntNDPointRangeQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case LongPointRangeQuery -> { + var longPointRangeQuery = (LongPointRangeQuery) query; + out.append("\n"); + } + case FloatPointRangeQuery -> { + var floatPointRangeQuery = (FloatPointRangeQuery) query; + out.append("\n"); + } + case DoublePointRangeQuery -> { + var doublePointRangeQuery = (DoublePointRangeQuery) query; + out.append("\n"); + } + case LongNDPointRangeQuery -> { + var longndPointRangeQuery = (LongNDPointRangeQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case FloatNDPointRangeQuery -> { + var floatndPointRangeQuery = (FloatNDPointRangeQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case DoubleNDPointRangeQuery -> { + var doublendPointRangeQuery = (DoubleNDPointRangeQuery) query; + throw new UnsupportedOperationException("N-dimensional point queries are not supported"); + } + case MatchAllDocsQuery -> { + out.append(""); + out.append("*:*"); + out.append("\n"); + } + case MatchNoDocsQuery -> { + out.append(""); + //todo: check if it's correct + out.append("!*:*"); + out.append("\n"); + } + case PhraseQuery -> { + //todo: check if it's correct + + var phraseQuery = (PhraseQuery) query; + out.append("\n"); + phraseQuery.phrase().stream().sorted(Comparator.comparingInt(TermPosition::position)).forEach(term -> { + out + .append("") + .append(XmlEscapers.xmlContentEscaper().escape(term.term().value())) + .append("\n"); + }); + out.append("\n"); + } + case SortedDocFieldExistsQuery -> { + var sortedDocFieldExistsQuery = (SortedDocFieldExistsQuery) query; + throw new UnsupportedOperationException("Field existence query is not supported"); + } + case SynonymQuery -> { + var synonymQuery = (SynonymQuery) query; + throw new UnsupportedOperationException("Synonym query is not supported"); + } + case SortedNumericDocValuesFieldSlowRangeQuery -> { + throw new UnsupportedOperationException("Slow range query is not supported"); + } + case WildcardQuery -> { + var wildcardQuery = (WildcardQuery) query; + throw new UnsupportedOperationException("Wildcard query is not supported"); + } + default -> throw new IllegalStateException("Unexpected value: " + query.getBaseType$()); + } + } + + private static String escapeQueryStringValue(String text) { + return StringUtils.replaceEach(text, QUERY_STRING_FIND, QUERY_STRING_REPLACE); + } + + private static void ensureValidField(String field) { + field.codePoints().forEach(codePoint -> { + if (!Character.isLetterOrDigit(codePoint) && codePoint != '_') { + throw new UnsupportedOperationException( + "Invalid character \"" + codePoint + "\" in field name \"" + field + "\""); + } + }); + } + private static NumberFormat toNumberFormat(it.cavallium.dbengine.client.query.current.data.NumberFormat numberFormat) { return switch (numberFormat.getBaseType$()) { case NumberFormatDecimal -> new DecimalFormat(); diff --git a/src/main/java/it/cavallium/dbengine/client/query/QueryUtils.java b/src/main/java/it/cavallium/dbengine/client/query/QueryUtils.java index 28294e3..757d612 100644 --- a/src/main/java/it/cavallium/dbengine/client/query/QueryUtils.java +++ b/src/main/java/it/cavallium/dbengine/client/query/QueryUtils.java @@ -38,12 +38,19 @@ public class QueryUtils { return transformQuery(field, luceneQuery); } + /** + * Deprecated: use solr SolrTextQuery + */ + @Deprecated public static Query phraseSearch(TextFieldsAnalyzer preferredAnalyzer, String field, String text, int slop) { var qb = new QueryBuilder(LuceneUtils.getAnalyzer(preferredAnalyzer)); var luceneQuery = qb.createPhraseQuery(field, text, slop); return transformQuery(field, luceneQuery); } + /** + * Deprecated: use solr SolrTextQuery + */ public static Query exactSearch(TextFieldsAnalyzer preferredAnalyzer, String field, String text) { var qb = new QueryBuilder(LuceneUtils.getAnalyzer(preferredAnalyzer)); var luceneQuery = qb.createPhraseQuery(field, text);