mirror of
https://gitcode.com/gitea/gitea.git
synced 2025-06-19 03:08:30 +08:00
Improve issue & code search (#33860)
Each "indexer" should itself declare the "search modes" it supports. In addition, the "fuzzy" search mode needs to be removed from code search.
This commit is contained in:
@ -17,6 +17,7 @@ import (
|
||||
"code.gitea.io/gitea/modules/charset"
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/gitrepo"
|
||||
"code.gitea.io/gitea/modules/indexer"
|
||||
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
|
||||
"code.gitea.io/gitea/modules/indexer/code/internal"
|
||||
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
|
||||
@ -136,6 +137,10 @@ type Indexer struct {
|
||||
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
|
||||
}
|
||||
|
||||
// SupportedSearchModes reports the search modes offered by the bleve code indexer.
// It returns the list produced by indexer.SearchModesExactWords().
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
|
||||
return indexer.SearchModesExactWords()
|
||||
}
|
||||
|
||||
// NewIndexer creates a new bleve local indexer
|
||||
func NewIndexer(indexDir string) *Indexer {
|
||||
inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
|
||||
@ -267,19 +272,18 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
|
||||
pathQuery.FieldVal = "Filename"
|
||||
pathQuery.SetBoost(10)
|
||||
|
||||
keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
|
||||
if isPhrase {
|
||||
q := bleve.NewMatchPhraseQuery(keywordAsPhrase)
|
||||
if opts.SearchMode == indexer.SearchModeExact {
|
||||
q := bleve.NewMatchPhraseQuery(opts.Keyword)
|
||||
q.FieldVal = "Content"
|
||||
if opts.IsKeywordFuzzy {
|
||||
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(keywordAsPhrase)
|
||||
}
|
||||
contentQuery = q
|
||||
} else {
|
||||
} else /* words */ {
|
||||
q := bleve.NewMatchQuery(opts.Keyword)
|
||||
q.FieldVal = "Content"
|
||||
if opts.IsKeywordFuzzy {
|
||||
if opts.SearchMode == indexer.SearchModeFuzzy {
|
||||
// this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
|
||||
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
|
||||
} else {
|
||||
q.Operator = query.MatchQueryOperatorAnd
|
||||
}
|
||||
contentQuery = q
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ import (
|
||||
"code.gitea.io/gitea/modules/charset"
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/gitrepo"
|
||||
"code.gitea.io/gitea/modules/indexer"
|
||||
"code.gitea.io/gitea/modules/indexer/code/internal"
|
||||
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
|
||||
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
|
||||
@ -24,7 +25,6 @@ import (
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/timeutil"
|
||||
"code.gitea.io/gitea/modules/typesniffer"
|
||||
"code.gitea.io/gitea/modules/util"
|
||||
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
"github.com/olivere/elastic/v7"
|
||||
@ -46,6 +46,10 @@ type Indexer struct {
|
||||
indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
|
||||
}
|
||||
|
||||
// SupportedSearchModes reports the search modes offered by the elasticsearch code indexer.
// It returns the list produced by indexer.SearchModesExactWords().
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
|
||||
return indexer.SearchModesExactWords()
|
||||
}
|
||||
|
||||
// NewIndexer creates a new elasticsearch indexer
|
||||
func NewIndexer(url, indexerName string) *Indexer {
|
||||
inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
|
||||
@ -361,15 +365,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
|
||||
// Search searches for codes and language stats by given conditions.
|
||||
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
||||
var contentQuery elastic.Query
|
||||
keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
|
||||
if isPhrase {
|
||||
contentQuery = elastic.NewMatchPhraseQuery("content", keywordAsPhrase)
|
||||
} else {
|
||||
// TODO: this is the old logic, but not really using "fuzziness"
|
||||
// * IsKeywordFuzzy=true: "best_fields"
|
||||
// * IsKeywordFuzzy=false: "phrase_prefix"
|
||||
contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).
|
||||
Type(util.Iif(opts.IsKeywordFuzzy, esMultiMatchTypeBestFields, esMultiMatchTypePhrasePrefix))
|
||||
if opts.SearchMode == indexer.SearchModeExact {
|
||||
contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword)
|
||||
} else /* words */ {
|
||||
contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).Type(esMultiMatchTypeBestFields).Operator("and")
|
||||
}
|
||||
kwQuery := elastic.NewBoolQuery().Should(
|
||||
contentQuery,
|
||||
|
@ -9,6 +9,7 @@ import (
|
||||
"strings"
|
||||
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/indexer"
|
||||
code_indexer "code.gitea.io/gitea/modules/indexer/code"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
)
|
||||
@ -23,11 +24,16 @@ func indexSettingToGitGrepPathspecList() (list []string) {
|
||||
return list
|
||||
}
|
||||
|
||||
func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, isFuzzy bool) (searchResults []*code_indexer.Result, total int, err error) {
|
||||
// TODO: it should also respect ParseKeywordAsPhrase and clarify the "fuzzy" behavior
|
||||
func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults []*code_indexer.Result, total int, err error) {
|
||||
grepMode := git.GrepModeWords
|
||||
if searchMode == indexer.SearchModeExact {
|
||||
grepMode = git.GrepModeExact
|
||||
} else if searchMode == indexer.SearchModeRegexp {
|
||||
grepMode = git.GrepModeRegexp
|
||||
}
|
||||
res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{
|
||||
ContextLineNumber: 1,
|
||||
IsFuzzy: isFuzzy,
|
||||
GrepMode: grepMode,
|
||||
RefName: ref.String(),
|
||||
PathspecList: indexSettingToGitGrepPathspecList(),
|
||||
})
|
||||
|
@ -14,6 +14,7 @@ import (
|
||||
"code.gitea.io/gitea/models/db"
|
||||
repo_model "code.gitea.io/gitea/models/repo"
|
||||
"code.gitea.io/gitea/modules/graceful"
|
||||
"code.gitea.io/gitea/modules/indexer"
|
||||
"code.gitea.io/gitea/modules/indexer/code/bleve"
|
||||
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
|
||||
"code.gitea.io/gitea/modules/indexer/code/internal"
|
||||
@ -302,3 +303,11 @@ func populateRepoIndexer(ctx context.Context) {
|
||||
}
|
||||
log.Info("Done (re)populating the repo indexer with existing repositories")
|
||||
}
|
||||
|
||||
// SupportedSearchModes returns the search modes reported by the indexer currently
// stored in globalIndexer. It returns nil when globalIndexer.Load() yields nil.
func SupportedSearchModes() []indexer.SearchMode {
|
||||
gi := globalIndexer.Load()
|
||||
if gi == nil {
|
||||
return nil
|
||||
}
|
||||
// gi is a pointer to the stored indexer value; dereference it before delegating.
return (*gi).SupportedSearchModes()
|
||||
}
|
||||
|
@ -11,6 +11,7 @@ import (
|
||||
|
||||
"code.gitea.io/gitea/models/db"
|
||||
"code.gitea.io/gitea/models/unittest"
|
||||
indexer_module "code.gitea.io/gitea/modules/indexer"
|
||||
"code.gitea.io/gitea/modules/indexer/code/bleve"
|
||||
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
|
||||
"code.gitea.io/gitea/modules/indexer/code/internal"
|
||||
@ -39,10 +40,11 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
||||
assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer))
|
||||
|
||||
keywords := []struct {
|
||||
RepoIDs []int64
|
||||
Keyword string
|
||||
Langs int
|
||||
Results []codeSearchResult
|
||||
RepoIDs []int64
|
||||
Keyword string
|
||||
Langs int
|
||||
SearchMode indexer_module.SearchModeType
|
||||
Results []codeSearchResult
|
||||
}{
|
||||
// Search for an exact match on the contents of a file
|
||||
// This scenario yields a single result (the file README.md on the repo '1')
|
||||
@ -183,9 +185,10 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
||||
},
|
||||
// Search for matches on the contents of files regardless of case.
|
||||
{
|
||||
RepoIDs: nil,
|
||||
Keyword: "dESCRIPTION",
|
||||
Langs: 1,
|
||||
RepoIDs: nil,
|
||||
Keyword: "dESCRIPTION",
|
||||
Langs: 1,
|
||||
SearchMode: indexer_module.SearchModeFuzzy,
|
||||
Results: []codeSearchResult{
|
||||
{
|
||||
Filename: "README.md",
|
||||
@ -193,7 +196,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for an exact match on the filename within the repo '62' (case insenstive).
|
||||
// Search for an exact match on the filename within the repo '62' (case-insensitive).
|
||||
// This scenario yields a single result (the file avocado.md on the repo '62')
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
@ -206,7 +209,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on the contents of files when the criteria is a expression.
|
||||
// Search for matches on the contents of files when the criteria are an expression.
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "console.log",
|
||||
@ -218,7 +221,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
||||
},
|
||||
},
|
||||
},
|
||||
// Search for matches on the contents of files when the criteria is part of a expression.
|
||||
// Search for matches on the contents of files when the criteria are parts of an expression.
|
||||
{
|
||||
RepoIDs: []int64{62},
|
||||
Keyword: "log",
|
||||
@ -235,16 +238,16 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
||||
for _, kw := range keywords {
|
||||
t.Run(kw.Keyword, func(t *testing.T) {
|
||||
total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{
|
||||
RepoIDs: kw.RepoIDs,
|
||||
Keyword: kw.Keyword,
|
||||
RepoIDs: kw.RepoIDs,
|
||||
Keyword: kw.Keyword,
|
||||
SearchMode: kw.SearchMode,
|
||||
Paginator: &db.ListOptions{
|
||||
Page: 1,
|
||||
PageSize: 10,
|
||||
},
|
||||
IsKeywordFuzzy: true,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
assert.Len(t, langs, kw.Langs)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, langs, kw.Langs)
|
||||
|
||||
hits := make([]codeSearchResult, 0, len(res))
|
||||
|
||||
@ -289,7 +292,7 @@ func TestBleveIndexAndSearch(t *testing.T) {
|
||||
_, err := idx.Init(t.Context())
|
||||
require.NoError(t, err)
|
||||
|
||||
testIndexer("beleve", t, idx)
|
||||
testIndexer("bleve", t, idx)
|
||||
}
|
||||
|
||||
func TestESIndexAndSearch(t *testing.T) {
|
||||
|
@ -9,6 +9,7 @@ import (
|
||||
|
||||
"code.gitea.io/gitea/models/db"
|
||||
repo_model "code.gitea.io/gitea/models/repo"
|
||||
"code.gitea.io/gitea/modules/indexer"
|
||||
"code.gitea.io/gitea/modules/indexer/internal"
|
||||
)
|
||||
|
||||
@ -18,6 +19,7 @@ type Indexer interface {
|
||||
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
|
||||
Delete(ctx context.Context, repoID int64) error
|
||||
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
|
||||
SupportedSearchModes() []indexer.SearchMode
|
||||
}
|
||||
|
||||
type SearchOptions struct {
|
||||
@ -25,7 +27,7 @@ type SearchOptions struct {
|
||||
Keyword string
|
||||
Language string
|
||||
|
||||
IsKeywordFuzzy bool
|
||||
SearchMode indexer.SearchModeType
|
||||
|
||||
db.Paginator
|
||||
}
|
||||
@ -41,6 +43,10 @@ type dummyIndexer struct {
|
||||
internal.Indexer
|
||||
}
|
||||
|
||||
// SupportedSearchModes always returns nil: the dummy indexer offers no search modes.
func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Index ignores all of its arguments and always returns the error "indexer is not ready".
func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error {
|
||||
return fmt.Errorf("indexer is not ready")
|
||||
}
|
||||
|
@ -10,9 +10,7 @@ import (
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
)
|
||||
|
||||
const (
|
||||
filenameMatchNumberOfLines = 7 // Copied from github search
|
||||
)
|
||||
const filenameMatchNumberOfLines = 7 // Copied from GitHub search
|
||||
|
||||
func FilenameIndexerID(repoID int64, filename string) string {
|
||||
return internal.Base36(repoID) + "_" + filename
|
||||
@ -48,11 +46,3 @@ func FilenameMatchIndexPos(content string) (int, int) {
|
||||
}
|
||||
return 0, len(content)
|
||||
}
|
||||
|
||||
// ParseKeywordAsPhrase checks whether keyword is wrapped in double quotes, i.e. it is
// longer than one byte and both starts and ends with `"`.
// If so, it returns the text between the outermost quotes and true.
// Otherwise it returns an empty string and false.
func ParseKeywordAsPhrase(keyword string) (string, bool) {
|
||||
if strings.HasPrefix(keyword, `"`) && strings.HasSuffix(keyword, `"`) && len(keyword) > 1 {
|
||||
// only remove the prefix and suffix quotes, no need to decode the content at the moment
|
||||
return keyword[1 : len(keyword)-1], true
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
@ -1,30 +0,0 @@
|
||||
// Copyright 2025 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package internal
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
// TestParseKeywordAsPhrase runs ParseKeywordAsPhrase over a table of keywords and
// checks both return values (the extracted phrase and the isPhrase flag) for each.
func TestParseKeywordAsPhrase(t *testing.T) {
|
||||
cases := []struct {
|
||||
keyword string
|
||||
phrase string
|
||||
isPhrase bool
|
||||
}{
|
||||
{``, "", false},
|
||||
{`a`, "", false},
|
||||
{`"`, "", false},
|
||||
{`"a`, "", false},
|
||||
{`"a"`, "a", true},
|
||||
// inner quotes and the backslash are expected back unchanged; only the outermost quotes are dropped
{`""\"""`, `"\""`, true},
|
||||
}
|
||||
for _, c := range cases {
|
||||
phrase, isPhrase := ParseKeywordAsPhrase(c.keyword)
|
||||
assert.Equal(t, c.phrase, phrase, "keyword=%q", c.keyword)
|
||||
assert.Equal(t, c.isPhrase, isPhrase, "keyword=%q", c.keyword)
|
||||
}
|
||||
}
|
@ -129,7 +129,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
|
||||
}
|
||||
|
||||
// PerformSearch perform a search on a repository
|
||||
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
|
||||
func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) {
|
||||
if opts == nil || len(opts.Keyword) == 0 {
|
||||
return 0, nil, nil, nil
|
||||
|
Reference in New Issue
Block a user