Gitea source code: the bleve-based code indexer

// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package bleve

import (
	"bufio"
	"context"
	"fmt"
	"io"
	"strconv"
	"strings"
	"time"

	repo_model "code.gitea.io/gitea/models/repo"
	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/charset"
	"code.gitea.io/gitea/modules/git"
	"code.gitea.io/gitea/modules/git/gitcmd"
	"code.gitea.io/gitea/modules/indexer"
	path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
	"code.gitea.io/gitea/modules/indexer/code/internal"
	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
	inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
	"code.gitea.io/gitea/modules/setting"
	"code.gitea.io/gitea/modules/timeutil"
	"code.gitea.io/gitea/modules/typesniffer"
	"code.gitea.io/gitea/modules/util"

	"github.com/blevesearch/bleve/v2"
	analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
	analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/go-enry/go-enry/v2"
)

const (
	unicodeNormalizeName = "unicodeNormalize"
	maxBatchSize         = 16
)

func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
	return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
		"type": unicodenorm.Name,
		"form": unicodenorm.NFC,
	})
}

// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
	RepoID    int64
	CommitID  string
	Content   string
	Filename  string
	Language  string
	UpdatedAt time.Time
}

// Type returns the document type, for bleve's mapping.Classifier interface.
func (d *RepoIndexerData) Type() string {
	return repoIndexerDocType
}

const (
	repoIndexerAnalyzer      = "repoIndexerAnalyzer"
	filenameIndexerAnalyzer  = "filenameIndexerAnalyzer"
	filenameIndexerTokenizer = "filenameIndexerTokenizer"
	repoIndexerDocType       = "repoIndexerDocType"
	repoIndexerLatestVersion = 9
)

// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
func generateBleveIndexMapping() (mapping.IndexMapping, error) {
	docMapping := bleve.NewDocumentMapping()
	numericFieldMapping := bleve.NewNumericFieldMapping()
	numericFieldMapping.IncludeInAll = false
	docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)

	textFieldMapping := bleve.NewTextFieldMapping()
	textFieldMapping.IncludeInAll = false
	docMapping.AddFieldMappingsAt("Content", textFieldMapping)

	fileNamedMapping := bleve.NewTextFieldMapping()
	fileNamedMapping.IncludeInAll = false
	fileNamedMapping.Analyzer = filenameIndexerAnalyzer
	docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)

	termFieldMapping := bleve.NewTextFieldMapping()
	termFieldMapping.IncludeInAll = false
	termFieldMapping.Analyzer = analyzer_keyword.Name
	docMapping.AddFieldMappingsAt("Language", termFieldMapping)
	docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)

	timeFieldMapping := bleve.NewDateTimeFieldMapping()
	timeFieldMapping.IncludeInAll = false
	docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)

	mapping := bleve.NewIndexMapping()
	if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
		return nil, err
	} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
		"type":          analyzer_custom.Name,
		"char_filters":  []string{},
		"tokenizer":     letter.Name,
		"token_filters": []string{unicodeNormalizeName, lowercase.Name},
	}); err != nil {
		return nil, err
	}
	if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
		"type":          analyzer_custom.Name,
		"char_filters":  []string{},
		"tokenizer":     unicode.Name,
		"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
	}); err != nil {
		return nil, err
	}
	mapping.DefaultAnalyzer = repoIndexerAnalyzer
	mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
	mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())

	return mapping, nil
}

var _ internal.Indexer = &Indexer{}

// Indexer represents a bleve indexer implementation
type Indexer struct {
	inner                    *inner_bleve.Indexer
	indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}

func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
	return indexer.SearchModesExactWords()
}

// NewIndexer creates a new bleve local indexer
func NewIndexer(indexDir string) *Indexer {
	inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
	return &Indexer{
		Indexer: inner,
		inner:   inner,
	}
}

func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
	update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
) error {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil
	}

	size := update.Size
	var err error
	if !update.Sized {
		var stdout string
		stdout, _, err = gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(ctx, &gitcmd.RunOpts{Dir: repo.RepoPath()})
		if err != nil {
			return err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}

	if size > setting.Indexer.MaxIndexerFileSize {
		return b.addDelete(update.Filename, repo, batch)
	}

	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return err
	}

	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return err
	}

	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// FIXME: UTF-16 files will probably fail here
		// Even if the file is not recognized as a "text file", we could still put its name into the indexers to make the filename become searchable, while leave the content to empty.
		fileContents = nil
	}

	if _, err = batchReader.Discard(1); err != nil {
		return err
	}
	id := internal.FilenameIndexerID(repo.ID, update.Filename)
	return batch.Index(id, &RepoIndexerData{
		RepoID:    repo.ID,
		CommitID:  commitSha,
		Filename:  update.Filename,
		Content:   string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
		Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
		UpdatedAt: time.Now().UTC(),
	})
}

func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
	id := internal.FilenameIndexerID(repo.ID, filename)
	return batch.Delete(id)
}

// Index indexes the data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
	batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
	if len(changes.Updates) > 0 {
		gitBatch, err := git.NewBatch(ctx, repo.RepoPath())
		if err != nil {
			return err
		}
		defer gitBatch.Close()

		for _, update := range changes.Updates {
			if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil {
				return err
			}
		}
		gitBatch.Close()
	}
	for _, filename := range changes.RemovedFilenames {
		if err := b.addDelete(filename, repo, batch); err != nil {
			return err
		}
	}
	return batch.Flush()
}

// Delete deletes indexes by ids
func (b *Indexer) Delete(_ context.Context, repoID int64) error {
	query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
	searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
	result, err := b.inner.Indexer.Search(searchRequest)
	if err != nil {
		return err
	}
	batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
	for _, hit := range result.Hits {
		if err = batch.Delete(hit.ID); err != nil {
			return err
		}
	}
	return batch.Flush()
}

// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
	var (
		indexerQuery query.Query
		keywordQuery query.Query
		contentQuery query.Query
	)

	pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
	pathQuery.FieldVal = "Filename"
	pathQuery.SetBoost(10)

	searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
	if searchMode == indexer.SearchModeExact {
		// 1.21 used NewPrefixQuery, but it seems not working well, and later releases changed to NewMatchPhraseQuery
		q := bleve.NewMatchPhraseQuery(opts.Keyword)
		q.Analyzer = repoIndexerAnalyzer
		q.FieldVal = "Content"
		contentQuery = q
	} else /* words */ {
		q := bleve.NewMatchQuery(opts.Keyword)
		q.FieldVal = "Content"
		q.Analyzer = repoIndexerAnalyzer
		if searchMode == indexer.SearchModeFuzzy {
			// this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
			q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
		} else {
			q.Operator = query.MatchQueryOperatorAnd
		}
		contentQuery = q
	}
	keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)

	if len(opts.RepoIDs) > 0 {
		repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
		for _, repoID := range opts.RepoIDs {
			repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
		}
		indexerQuery = bleve.NewConjunctionQuery(
			bleve.NewDisjunctionQuery(repoQueries...),
			keywordQuery,
		)
	} else {
		indexerQuery = keywordQuery
	}

	// Save for reuse without language filter
	facetQuery := indexerQuery
	if len(opts.Language) > 0 {
		languageQuery := bleve.NewMatchQuery(opts.Language)
		languageQuery.FieldVal = "Language"
		languageQuery.Analyzer = analyzer_keyword.Name

		indexerQuery = bleve.NewConjunctionQuery(
			indexerQuery,
			languageQuery,
		)
	}

	from, pageSize := opts.GetSkipTake()
	searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
	searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
	searchRequest.IncludeLocations = true

	if len(opts.Language) == 0 {
		searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
	}

	searchRequest.SortBy([]string{"-_score", "UpdatedAt"})

	result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
	if err != nil {
		return 0, nil, nil, err
	}

	total := int64(result.Total)

	searchResults := make([]*internal.SearchResult, len(result.Hits))
	for i, hit := range result.Hits {
		startIndex, endIndex := -1, -1
		for _, locations := range hit.Locations["Content"] {
			location := locations[0]
			locationStart := int(location.Start)
			locationEnd := int(location.End)
			if startIndex < 0 || locationStart < startIndex {
				startIndex = locationStart
			}
			if endIndex < 0 || locationEnd > endIndex {
				endIndex = locationEnd
			}
		}
		if len(hit.Locations["Filename"]) > 0 {
			startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
		}

		language := hit.Fields["Language"].(string)
		var updatedUnix timeutil.TimeStamp
		if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
			updatedUnix = timeutil.TimeStamp(t.Unix())
		}
		searchResults[i] = &internal.SearchResult{
			RepoID:      int64(hit.Fields["RepoID"].(float64)),
			StartIndex:  startIndex,
			EndIndex:    endIndex,
			Filename:    internal.FilenameOfIndexerID(hit.ID),
			Content:     hit.Fields["Content"].(string),
			CommitID:    hit.Fields["CommitID"].(string),
			UpdatedUnix: updatedUnix,
			Language:    language,
			Color:       enry.GetColor(language),
		}
	}

	searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
	if len(opts.Language) > 0 {
		// Use separate query to go get all language counts
		facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
		facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
		facetRequest.IncludeLocations = true
		facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))

		if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
			return 0, nil, nil, err
		}
	}
	languagesFacet := result.Facets["languages"]
	for _, term := range languagesFacet.Terms.Terms() {
		if len(term.Term) == 0 {
			continue
		}
		searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
			Language: term.Term,
			Color:    enry.GetColor(term.Term),
			Count:    term.Count,
		})
	}
	return total, searchResults, searchResultLanguages, nil
}
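
For orientation, here is a minimal sketch of how this indexer is typically driven. The names ctx, indexDir, repo, commitSha and blobSha are placeholders supplied by the surrounding Gitea code; the sketch only exercises the constructor and the Index entry point shown above, and in Gitea the change set comes from the code indexer's own change detection rather than being built by hand.

	// Hypothetical wiring (placeholder names); not the actual Gitea call site.
	bleveIndexer := NewIndexer(indexDir) // indexDir: on-disk location of the bleve index

	changes := &internal.RepoChanges{
		Updates:          []internal.FileUpdate{{Filename: "main.go", BlobSha: blobSha}},
		RemovedFilenames: []string{"removed.go"},
	}
	if err := bleveIndexer.Index(ctx, repo, commitSha, changes); err != nil {
		// handle the indexing error
	}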