From b9c57fb78e8e0d80d786d8e1da433b6c7ebf2f1c Mon Sep 17 00:00:00 2001 From: 6543 <6543@obermui.de> Date: Sat, 23 Mar 2024 16:45:13 +0100 Subject: [PATCH] Determine fuzziness of bleve indexer by keyword length (#29706) also bleve did match on fuzzy search and the other way around. this also fix that bug. --- modules/indexer/code/bleve/bleve.go | 15 +++++++-------- modules/indexer/internal/bleve/query.go | 10 ++-------- modules/indexer/issues/bleve/bleve.go | 25 +++++++++++++------------ tests/integration/repo_search_test.go | 16 +++++++--------- 4 files changed, 29 insertions(+), 37 deletions(-) diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index d7f735e957d..c607d780ef9 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -39,6 +39,8 @@ import ( const ( unicodeNormalizeName = "unicodeNormalize" maxBatchSize = 16 + // fuzzyDenominator determines the levenshtein distance per each character of a keyword + fuzzyDenominator = 4 ) func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { @@ -239,15 +241,12 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int keywordQuery query.Query ) + phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) + phraseQuery.FieldVal = "Content" + phraseQuery.Analyzer = repoIndexerAnalyzer + keywordQuery = phraseQuery if opts.IsKeywordFuzzy { - phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword) - phraseQuery.FieldVal = "Content" - phraseQuery.Analyzer = repoIndexerAnalyzer - keywordQuery = phraseQuery - } else { - prefixQuery := bleve.NewPrefixQuery(opts.Keyword) - prefixQuery.FieldVal = "Content" - keywordQuery = prefixQuery + phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator } if len(opts.RepoIDs) > 0 { diff --git a/modules/indexer/internal/bleve/query.go b/modules/indexer/internal/bleve/query.go index b96875343e5..21422b281c4 100644 --- a/modules/indexer/internal/bleve/query.go +++ b/modules/indexer/internal/bleve/query.go @@ -20,17 +20,11 @@ func NumericEqualityQuery(value int64, field string) *query.NumericRangeQuery { } // MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer -func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQuery { +func MatchPhraseQuery(matchPhrase, field, analyzer string, fuzziness int) *query.MatchPhraseQuery { q := bleve.NewMatchPhraseQuery(matchPhrase) q.FieldVal = field q.Analyzer = analyzer - return q -} - -// PrefixQuery generates a match prefix query for the given prefix and field -func PrefixQuery(matchPrefix, field string) *query.PrefixQuery { - q := bleve.NewPrefixQuery(matchPrefix) - q.FieldVal = field + q.Fuzziness = fuzziness return q } diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go index 927ad58cd4c..1f54be721b3 100644 --- a/modules/indexer/issues/bleve/bleve.go +++ b/modules/indexer/issues/bleve/bleve.go @@ -35,7 +35,11 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { }) } -const maxBatchSize = 16 +const ( + maxBatchSize = 16 + // fuzzyDenominator determines the levenshtein distance per each character of a keyword + fuzzyDenominator = 4 +) // IndexerData an update to the issue indexer type IndexerData internal.IndexerData @@ -156,19 +160,16 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) ( var queries []query.Query if options.Keyword != "" { + fuzziness := 0 if options.IsFuzzyKeyword { - queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ - inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer), - inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer), - inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer), - }...)) - } else { - queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ - inner_bleve.PrefixQuery(options.Keyword, "title"), - inner_bleve.PrefixQuery(options.Keyword, "content"), - inner_bleve.PrefixQuery(options.Keyword, "comments"), - }...)) + fuzziness = len(options.Keyword) / fuzzyDenominator } + + queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{ + inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer, fuzziness), + inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer, fuzziness), + inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer, fuzziness), + }...)) } if len(options.RepoIDs) > 0 || options.AllPublic { diff --git a/tests/integration/repo_search_test.go b/tests/integration/repo_search_test.go index cf199e98c28..56cc45d9010 100644 --- a/tests/integration/repo_search_test.go +++ b/tests/integration/repo_search_test.go @@ -32,7 +32,7 @@ func TestSearchRepo(t *testing.T) { repo, err := repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "repo1") assert.NoError(t, err) - executeIndexer(t, repo, code_indexer.UpdateRepoIndexer) + code_indexer.UpdateRepoIndexer(repo) testSearch(t, "/user2/repo1/search?q=Description&page=1", []string{"README.md"}) @@ -42,12 +42,14 @@ func TestSearchRepo(t *testing.T) { repo, err = repo_model.GetRepositoryByOwnerAndName(db.DefaultContext, "user2", "glob") assert.NoError(t, err) - executeIndexer(t, repo, code_indexer.UpdateRepoIndexer) + code_indexer.UpdateRepoIndexer(repo) testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}) - testSearch(t, "/user2/glob/search?q=file3&page=1", []string{"x/b.txt"}) - testSearch(t, "/user2/glob/search?q=file4&page=1", []string{}) - testSearch(t, "/user2/glob/search?q=file5&page=1", []string{}) + testSearch(t, "/user2/glob/search?q=loren&page=1&t=match", []string{"a.txt"}) + testSearch(t, "/user2/glob/search?q=file3&page=1", []string{"x/b.txt", "a.txt"}) + testSearch(t, "/user2/glob/search?q=file3&page=1&t=match", []string{"x/b.txt", "a.txt"}) + testSearch(t, "/user2/glob/search?q=file4&page=1&t=match", []string{"x/b.txt", "a.txt"}) + testSearch(t, "/user2/glob/search?q=file5&page=1&t=match", []string{"x/b.txt", "a.txt"}) } func testSearch(t *testing.T, url string, expected []string) { @@ -57,7 +59,3 @@ func testSearch(t *testing.T, url string, expected []string) { filenames := resultFilenames(t, NewHTMLParser(t, resp.Body)) assert.EqualValues(t, expected, filenames) } - -func executeIndexer(t *testing.T, repo *repo_model.Repository, op func(*repo_model.Repository)) { - op(repo) -}