mirror of
https://github.com/go-gitea/gitea
synced 2025-01-22 11:57:45 +01:00
900ac62251
This is a large and complex PR, so let me explain in detail its changes. First, I had to create new index mappings for Bleve and ElasticSerach as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out-of-box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also did an overhaul in the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. In matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it). ![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858) Resolves #32096 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
436 lines
13 KiB
Go
436 lines
13 KiB
Go
// Copyright 2020 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package elasticsearch
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"strconv"
|
|
"strings"
|
|
|
|
repo_model "code.gitea.io/gitea/models/repo"
|
|
"code.gitea.io/gitea/modules/analyze"
|
|
"code.gitea.io/gitea/modules/charset"
|
|
"code.gitea.io/gitea/modules/git"
|
|
"code.gitea.io/gitea/modules/gitrepo"
|
|
"code.gitea.io/gitea/modules/indexer/code/internal"
|
|
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
|
|
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
|
|
"code.gitea.io/gitea/modules/json"
|
|
"code.gitea.io/gitea/modules/log"
|
|
"code.gitea.io/gitea/modules/setting"
|
|
"code.gitea.io/gitea/modules/timeutil"
|
|
"code.gitea.io/gitea/modules/typesniffer"
|
|
|
|
"github.com/go-enry/go-enry/v2"
|
|
"github.com/olivere/elastic/v7"
|
|
)
|
|
|
|
const (
|
|
esRepoIndexerLatestVersion = 2
|
|
// multi-match-types, currently only 2 types are used
|
|
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
|
|
esMultiMatchTypeBestFields = "best_fields"
|
|
esMultiMatchTypePhrasePrefix = "phrase_prefix"
|
|
)
|
|
|
|
var _ internal.Indexer = &Indexer{}
|
|
|
|
// Indexer implements Indexer interface
|
|
type Indexer struct {
|
|
inner *inner_elasticsearch.Indexer
|
|
indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
|
|
}
|
|
|
|
// NewIndexer creates a new elasticsearch indexer
|
|
func NewIndexer(url, indexerName string) *Indexer {
|
|
inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
|
|
indexer := &Indexer{
|
|
inner: inner,
|
|
Indexer: inner,
|
|
}
|
|
return indexer
|
|
}
|
|
|
|
const (
|
|
defaultMapping = `{
|
|
"settings": {
|
|
"analysis": {
|
|
"analyzer": {
|
|
"filename_path_analyzer": {
|
|
"tokenizer": "path_tokenizer"
|
|
},
|
|
"reversed_filename_path_analyzer": {
|
|
"tokenizer": "reversed_path_tokenizer"
|
|
}
|
|
},
|
|
"tokenizer": {
|
|
"path_tokenizer": {
|
|
"type": "path_hierarchy",
|
|
"delimiter": "/"
|
|
},
|
|
"reversed_path_tokenizer": {
|
|
"type": "path_hierarchy",
|
|
"delimiter": "/",
|
|
"reverse": true
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"mappings": {
|
|
"properties": {
|
|
"repo_id": {
|
|
"type": "long",
|
|
"index": true
|
|
},
|
|
"filename": {
|
|
"type": "text",
|
|
"term_vector": "with_positions_offsets",
|
|
"index": true,
|
|
"fields": {
|
|
"path": {
|
|
"type": "text",
|
|
"analyzer": "reversed_filename_path_analyzer"
|
|
},
|
|
"path_reversed": {
|
|
"type": "text",
|
|
"analyzer": "filename_path_analyzer"
|
|
}
|
|
}
|
|
},
|
|
"content": {
|
|
"type": "text",
|
|
"term_vector": "with_positions_offsets",
|
|
"index": true
|
|
},
|
|
"commit_id": {
|
|
"type": "keyword",
|
|
"index": true
|
|
},
|
|
"language": {
|
|
"type": "keyword",
|
|
"index": true
|
|
},
|
|
"updated_at": {
|
|
"type": "long",
|
|
"index": true
|
|
}
|
|
}
|
|
}
|
|
}`
|
|
)
|
|
|
|
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
|
|
// Ignore vendored files in code search
|
|
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
|
|
return nil, nil
|
|
}
|
|
|
|
size := update.Size
|
|
var err error
|
|
if !update.Sized {
|
|
var stdout string
|
|
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
|
|
return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
|
|
}
|
|
}
|
|
|
|
if size > setting.Indexer.MaxIndexerFileSize {
|
|
return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
|
|
}
|
|
|
|
if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
_, _, size, err = git.ReadBatchLine(batchReader)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
|
|
if err != nil {
|
|
return nil, err
|
|
} else if !typesniffer.DetectContentType(fileContents).IsText() {
|
|
// FIXME: UTF-16 files will probably fail here
|
|
return nil, nil
|
|
}
|
|
|
|
if _, err = batchReader.Discard(1); err != nil {
|
|
return nil, err
|
|
}
|
|
id := internal.FilenameIndexerID(repo.ID, update.Filename)
|
|
|
|
return []elastic.BulkableRequest{
|
|
elastic.NewBulkIndexRequest().
|
|
Index(b.inner.VersionedIndexName()).
|
|
Id(id).
|
|
Doc(map[string]any{
|
|
"repo_id": repo.ID,
|
|
"filename": update.Filename,
|
|
"content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
|
|
"commit_id": sha,
|
|
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
|
|
"updated_at": timeutil.TimeStampNow(),
|
|
}),
|
|
}, nil
|
|
}
|
|
|
|
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
|
|
id := internal.FilenameIndexerID(repo.ID, filename)
|
|
return elastic.NewBulkDeleteRequest().
|
|
Index(b.inner.VersionedIndexName()).
|
|
Id(id)
|
|
}
|
|
|
|
// Index will save the index data
|
|
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
|
|
reqs := make([]elastic.BulkableRequest, 0)
|
|
if len(changes.Updates) > 0 {
|
|
r, err := gitrepo.OpenRepository(ctx, repo)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer r.Close()
|
|
batch, err := r.NewBatch(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer batch.Close()
|
|
|
|
for _, update := range changes.Updates {
|
|
updateReqs, err := b.addUpdate(ctx, batch.Writer, batch.Reader, sha, update, repo)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if len(updateReqs) > 0 {
|
|
reqs = append(reqs, updateReqs...)
|
|
}
|
|
}
|
|
batch.Close()
|
|
}
|
|
|
|
for _, filename := range changes.RemovedFilenames {
|
|
reqs = append(reqs, b.addDelete(filename, repo))
|
|
}
|
|
|
|
if len(reqs) > 0 {
|
|
esBatchSize := 50
|
|
|
|
for i := 0; i < len(reqs); i += esBatchSize {
|
|
_, err := b.inner.Client.Bulk().
|
|
Index(b.inner.VersionedIndexName()).
|
|
Add(reqs[i:min(i+esBatchSize, len(reqs))]...).
|
|
Do(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Delete entries by repoId
|
|
func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
|
|
if err := b.doDelete(ctx, repoID); err != nil {
|
|
// Maybe there is a conflict during the delete operation, so we should retry after a refresh
|
|
log.Warn("Deletion of entries of repo %v within index %v was erroneus. Trying to refresh index before trying again", repoID, b.inner.VersionedIndexName(), err)
|
|
if err := b.refreshIndex(ctx); err != nil {
|
|
return err
|
|
}
|
|
if err := b.doDelete(ctx, repoID); err != nil {
|
|
log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName())
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (b *Indexer) refreshIndex(ctx context.Context) error {
|
|
if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil {
|
|
log.Error("Error while trying to refresh index %v", b.inner.VersionedIndexName(), err)
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Delete entries by repoId
|
|
func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
|
|
_, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
|
|
Query(elastic.NewTermsQuery("repo_id", repoID)).
|
|
Do(ctx)
|
|
return err
|
|
}
|
|
|
|
// contentMatchIndexPos find words positions for start and the following end on content. It will
|
|
// return the beginning position of the first start and the ending position of the
|
|
// first end following the start string.
|
|
// If not found any of the positions, it will return -1, -1.
|
|
func contentMatchIndexPos(content, start, end string) (int, int) {
|
|
startIdx := strings.Index(content, start)
|
|
if startIdx < 0 {
|
|
return -1, -1
|
|
}
|
|
endIdx := strings.Index(content[startIdx+len(start):], end)
|
|
if endIdx < 0 {
|
|
return -1, -1
|
|
}
|
|
return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // remove the length <em></em> since we give Content the original data
|
|
}
|
|
|
|
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
|
hits := make([]*internal.SearchResult, 0, pageSize)
|
|
for _, hit := range searchResult.Hits.Hits {
|
|
repoID, fileName := internal.ParseIndexerID(hit.Id)
|
|
res := make(map[string]any)
|
|
if err := json.Unmarshal(hit.Source, &res); err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
|
|
// FIXME: There is no way to get the position the keyword on the content currently on the same request.
|
|
// So we get it from content, this may made the query slower. See
|
|
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
|
|
var startIndex, endIndex int
|
|
if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
|
|
startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
|
|
} else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
|
|
// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
|
|
// now we should find the positions. But how to avoid html content which contains the
|
|
// <em> and </em> tags? If elastic search has handled that?
|
|
startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
|
|
if startIndex == -1 {
|
|
panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
|
|
}
|
|
} else {
|
|
panic(fmt.Sprintf("2===%#v", hit.Highlight))
|
|
}
|
|
|
|
language := res["language"].(string)
|
|
|
|
hits = append(hits, &internal.SearchResult{
|
|
RepoID: repoID,
|
|
Filename: fileName,
|
|
CommitID: res["commit_id"].(string),
|
|
Content: res["content"].(string),
|
|
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
|
|
Language: language,
|
|
StartIndex: startIndex,
|
|
EndIndex: endIndex,
|
|
Color: enry.GetColor(language),
|
|
})
|
|
}
|
|
|
|
return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
|
|
}
|
|
|
|
func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
|
|
var searchResultLanguages []*internal.SearchResultLanguages
|
|
agg, found := searchResult.Aggregations.Terms("language")
|
|
if found {
|
|
searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10)
|
|
|
|
for _, bucket := range agg.Buckets {
|
|
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
|
|
Language: bucket.Key.(string),
|
|
Color: enry.GetColor(bucket.Key.(string)),
|
|
Count: int(bucket.DocCount),
|
|
})
|
|
}
|
|
}
|
|
return searchResultLanguages
|
|
}
|
|
|
|
// Search searches for codes and language stats by given conditions.
|
|
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
|
searchType := esMultiMatchTypePhrasePrefix
|
|
if opts.IsKeywordFuzzy {
|
|
searchType = esMultiMatchTypeBestFields
|
|
}
|
|
|
|
kwQuery := elastic.NewBoolQuery().Should(
|
|
elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType),
|
|
elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
|
|
)
|
|
query := elastic.NewBoolQuery()
|
|
query = query.Must(kwQuery)
|
|
if len(opts.RepoIDs) > 0 {
|
|
repoStrs := make([]any, 0, len(opts.RepoIDs))
|
|
for _, repoID := range opts.RepoIDs {
|
|
repoStrs = append(repoStrs, repoID)
|
|
}
|
|
repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
|
|
query = query.Must(repoQuery)
|
|
}
|
|
|
|
var (
|
|
start, pageSize = opts.GetSkipTake()
|
|
kw = "<em>" + opts.Keyword + "</em>"
|
|
aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
|
|
)
|
|
|
|
if len(opts.Language) == 0 {
|
|
searchResult, err := b.inner.Client.Search().
|
|
Index(b.inner.VersionedIndexName()).
|
|
Aggregation("language", aggregation).
|
|
Query(query).
|
|
Highlight(
|
|
elastic.NewHighlight().
|
|
Field("content").
|
|
Field("filename").
|
|
NumOfFragments(0). // return all highting content on fragments
|
|
HighlighterType("fvh"),
|
|
).
|
|
Sort("_score", false).
|
|
Sort("updated_at", true).
|
|
From(start).Size(pageSize).
|
|
Do(ctx)
|
|
if err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
|
|
return convertResult(searchResult, kw, pageSize)
|
|
}
|
|
|
|
langQuery := elastic.NewMatchQuery("language", opts.Language)
|
|
countResult, err := b.inner.Client.Search().
|
|
Index(b.inner.VersionedIndexName()).
|
|
Aggregation("language", aggregation).
|
|
Query(query).
|
|
Size(0). // We only need stats information
|
|
Do(ctx)
|
|
if err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
|
|
query = query.Must(langQuery)
|
|
searchResult, err := b.inner.Client.Search().
|
|
Index(b.inner.VersionedIndexName()).
|
|
Query(query).
|
|
Highlight(
|
|
elastic.NewHighlight().
|
|
Field("content").
|
|
Field("filename").
|
|
NumOfFragments(0). // return all highting content on fragments
|
|
HighlighterType("fvh"),
|
|
).
|
|
Sort("_score", false).
|
|
Sort("updated_at", true).
|
|
From(start).Size(pageSize).
|
|
Do(ctx)
|
|
if err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
|
|
total, hits, _, err := convertResult(searchResult, kw, pageSize)
|
|
|
|
return total, hits, extractAggs(countResult), err
|
|
}
|