Gitea source code — Elasticsearch code-search indexer.
  1. // Copyright 2020 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package elasticsearch
  4. import (
  5. "bufio"
  6. "context"
  7. "fmt"
  8. "io"
  9. "strconv"
  10. "strings"
  11. repo_model "code.gitea.io/gitea/models/repo"
  12. "code.gitea.io/gitea/modules/analyze"
  13. "code.gitea.io/gitea/modules/charset"
  14. "code.gitea.io/gitea/modules/git"
  15. "code.gitea.io/gitea/modules/git/gitcmd"
  16. "code.gitea.io/gitea/modules/indexer"
  17. "code.gitea.io/gitea/modules/indexer/code/internal"
  18. indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
  19. inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
  20. "code.gitea.io/gitea/modules/json"
  21. "code.gitea.io/gitea/modules/log"
  22. "code.gitea.io/gitea/modules/setting"
  23. "code.gitea.io/gitea/modules/timeutil"
  24. "code.gitea.io/gitea/modules/typesniffer"
  25. "code.gitea.io/gitea/modules/util"
  26. "github.com/go-enry/go-enry/v2"
  27. "github.com/olivere/elastic/v7"
  28. )
const (
	// esRepoIndexerLatestVersion is the schema version of the code index;
	// it is passed to inner_elasticsearch.NewIndexer together with defaultMapping,
	// so bump it whenever defaultMapping changes.
	esRepoIndexerLatestVersion = 3

	// multi-match-types, currently only 2 types are used
	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
	esMultiMatchTypeBestFields   = "best_fields"
	esMultiMatchTypePhrasePrefix = "phrase_prefix"
)
// Compile-time check that *Indexer satisfies the code indexer interface.
var _ internal.Indexer = &Indexer{}

// Indexer implements Indexer interface
type Indexer struct {
	// inner is kept for direct access to the Elasticsearch client and
	// the versioned index name.
	inner                    *inner_elasticsearch.Indexer
	indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
}
// SupportedSearchModes returns the search modes this backend supports:
// the exact/words set provided by indexer.SearchModesExactWords.
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
	return indexer.SearchModesExactWords()
}
  45. // NewIndexer creates a new elasticsearch indexer
  46. func NewIndexer(url, indexerName string) *Indexer {
  47. inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
  48. indexer := &Indexer{
  49. inner: inner,
  50. Indexer: inner,
  51. }
  52. return indexer
  53. }
  54. const (
  55. defaultMapping = `{
  56. "settings": {
  57. "analysis": {
  58. "analyzer": {
  59. "content_analyzer": {
  60. "tokenizer": "content_tokenizer",
  61. "filter" : ["lowercase"]
  62. },
  63. "filename_path_analyzer": {
  64. "tokenizer": "path_tokenizer"
  65. },
  66. "reversed_filename_path_analyzer": {
  67. "tokenizer": "reversed_path_tokenizer"
  68. }
  69. },
  70. "tokenizer": {
  71. "content_tokenizer": {
  72. "type": "simple_pattern_split",
  73. "pattern": "[^a-zA-Z0-9]"
  74. },
  75. "path_tokenizer": {
  76. "type": "path_hierarchy",
  77. "delimiter": "/"
  78. },
  79. "reversed_path_tokenizer": {
  80. "type": "path_hierarchy",
  81. "delimiter": "/",
  82. "reverse": true
  83. }
  84. }
  85. }
  86. },
  87. "mappings": {
  88. "properties": {
  89. "repo_id": {
  90. "type": "long",
  91. "index": true
  92. },
  93. "filename": {
  94. "type": "text",
  95. "term_vector": "with_positions_offsets",
  96. "index": true,
  97. "fields": {
  98. "path": {
  99. "type": "text",
  100. "analyzer": "reversed_filename_path_analyzer"
  101. },
  102. "path_reversed": {
  103. "type": "text",
  104. "analyzer": "filename_path_analyzer"
  105. }
  106. }
  107. },
  108. "content": {
  109. "type": "text",
  110. "term_vector": "with_positions_offsets",
  111. "index": true,
  112. "analyzer": "content_analyzer"
  113. },
  114. "commit_id": {
  115. "type": "keyword",
  116. "index": true
  117. },
  118. "language": {
  119. "type": "keyword",
  120. "index": true
  121. },
  122. "updated_at": {
  123. "type": "long",
  124. "index": true
  125. }
  126. }
  127. }
  128. }`
  129. )
// addUpdate builds the bulk request(s) needed to index one changed file.
//
// It skips vendored files, resolves the blob size (asking `git cat-file -s`
// when the update does not carry one), replaces oversized files with a delete
// request, then reads the blob content through the shared cat-file batch pipe
// and returns a single bulk index request for the document.
// A nil slice with a nil error means the file is deliberately not indexed.
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update internal.FileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
	// Ignore vendored files in code search
	if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
		return nil, nil
	}
	size := update.Size
	var err error
	if !update.Sized {
		// The caller did not provide a size; query git for the blob size.
		var stdout string
		stdout, _, err = gitcmd.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(ctx, &gitcmd.RunOpts{Dir: repo.RepoPath()})
		if err != nil {
			return nil, err
		}
		if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
			return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
		}
	}
	// Oversized files are not indexed; emit a delete so any stale entry is removed.
	if size > setting.Indexer.MaxIndexerFileSize {
		return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
	}
	// Request the blob from the batch pipe, then read its header line
	// (which yields the authoritative size used to limit the content read).
	if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
		return nil, err
	}
	_, _, size, err = git.ReadBatchLine(batchReader)
	if err != nil {
		return nil, err
	}
	fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
	if err != nil {
		return nil, err
	} else if !typesniffer.DetectContentType(fileContents).IsText() {
		// Only text files are indexed.
		// FIXME: UTF-16 files will probably fail here
		return nil, nil
	}
	// Consume the single byte terminating the batch object output.
	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}
	id := internal.FilenameIndexerID(repo.ID, update.Filename)
	return []elastic.BulkableRequest{
		elastic.NewBulkIndexRequest().
			Index(b.inner.VersionedIndexName()).
			Id(id).
			Doc(map[string]any{
				"repo_id":  repo.ID,
				"filename": update.Filename,
				// Content is normalized to valid UTF-8, dropping bad sequences.
				"content":    string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
				"commit_id":  sha,
				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
				"updated_at": timeutil.TimeStampNow(),
			}),
	}, nil
}
  182. func (b *Indexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
  183. id := internal.FilenameIndexerID(repo.ID, filename)
  184. return elastic.NewBulkDeleteRequest().
  185. Index(b.inner.VersionedIndexName()).
  186. Id(id)
  187. }
  188. // Index will save the index data
  189. func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
  190. reqs := make([]elastic.BulkableRequest, 0)
  191. if len(changes.Updates) > 0 {
  192. batch, err := git.NewBatch(ctx, repo.RepoPath())
  193. if err != nil {
  194. return err
  195. }
  196. defer batch.Close()
  197. for _, update := range changes.Updates {
  198. updateReqs, err := b.addUpdate(ctx, batch.Writer, batch.Reader, sha, update, repo)
  199. if err != nil {
  200. return err
  201. }
  202. if len(updateReqs) > 0 {
  203. reqs = append(reqs, updateReqs...)
  204. }
  205. }
  206. batch.Close()
  207. }
  208. for _, filename := range changes.RemovedFilenames {
  209. reqs = append(reqs, b.addDelete(filename, repo))
  210. }
  211. if len(reqs) > 0 {
  212. esBatchSize := 50
  213. for i := 0; i < len(reqs); i += esBatchSize {
  214. _, err := b.inner.Client.Bulk().
  215. Index(b.inner.VersionedIndexName()).
  216. Add(reqs[i:min(i+esBatchSize, len(reqs))]...).
  217. Do(ctx)
  218. if err != nil {
  219. return err
  220. }
  221. }
  222. }
  223. return nil
  224. }
  225. // Delete entries by repoId
  226. func (b *Indexer) Delete(ctx context.Context, repoID int64) error {
  227. if err := b.doDelete(ctx, repoID); err != nil {
  228. // Maybe there is a conflict during the delete operation, so we should retry after a refresh
  229. log.Warn("Deletion of entries of repo %v within index %v was erroneous. Trying to refresh index before trying again", repoID, b.inner.VersionedIndexName(), err)
  230. if err := b.refreshIndex(ctx); err != nil {
  231. return err
  232. }
  233. if err := b.doDelete(ctx, repoID); err != nil {
  234. log.Error("Could not delete entries of repo %v within index %v", repoID, b.inner.VersionedIndexName())
  235. return err
  236. }
  237. }
  238. return nil
  239. }
  240. func (b *Indexer) refreshIndex(ctx context.Context) error {
  241. if _, err := b.inner.Client.Refresh(b.inner.VersionedIndexName()).Do(ctx); err != nil {
  242. log.Error("Error while trying to refresh index %v", b.inner.VersionedIndexName(), err)
  243. return err
  244. }
  245. return nil
  246. }
  247. // Delete entries by repoId
  248. func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
  249. _, err := b.inner.Client.DeleteByQuery(b.inner.VersionedIndexName()).
  250. Query(elastic.NewTermsQuery("repo_id", repoID)).
  251. Do(ctx)
  252. return err
  253. }
  254. // contentMatchIndexPos find words positions for start and the following end on content. It will
  255. // return the beginning position of the first start and the ending position of the
  256. // first end following the start string.
  257. // If not found any of the positions, it will return -1, -1.
  258. func contentMatchIndexPos(content, start, end string) (int, int) {
  259. startIdx := strings.Index(content, start)
  260. if startIdx < 0 {
  261. return -1, -1
  262. }
  263. endIdx := strings.Index(content[startIdx+len(start):], end)
  264. if endIdx < 0 {
  265. return -1, -1
  266. }
  267. return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // remove the length <em></em> since we give Content the original data
  268. }
  269. func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
  270. hits := make([]*internal.SearchResult, 0, pageSize)
  271. for _, hit := range searchResult.Hits.Hits {
  272. repoID, fileName := internal.ParseIndexerID(hit.Id)
  273. res := make(map[string]any)
  274. if err := json.Unmarshal(hit.Source, &res); err != nil {
  275. return 0, nil, nil, err
  276. }
  277. // FIXME: There is no way to get the position the keyword on the content currently on the same request.
  278. // So we get it from content, this may made the query slower. See
  279. // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
  280. var startIndex, endIndex int
  281. if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
  282. startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
  283. } else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
  284. // FIXME: Since the highlighting content will include <em> and </em> for the keywords,
  285. // now we should find the positions. But how to avoid html content which contains the
  286. // <em> and </em> tags? If elastic search has handled that?
  287. startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
  288. if startIndex == -1 {
  289. panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
  290. }
  291. } else {
  292. panic(fmt.Sprintf("2===%#v", hit.Highlight))
  293. }
  294. language := res["language"].(string)
  295. hits = append(hits, &internal.SearchResult{
  296. RepoID: repoID,
  297. Filename: fileName,
  298. CommitID: res["commit_id"].(string),
  299. Content: res["content"].(string),
  300. UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
  301. Language: language,
  302. StartIndex: startIndex,
  303. EndIndex: endIndex,
  304. Color: enry.GetColor(language),
  305. })
  306. }
  307. return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
  308. }
  309. func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLanguages {
  310. var searchResultLanguages []*internal.SearchResultLanguages
  311. agg, found := searchResult.Aggregations.Terms("language")
  312. if found {
  313. searchResultLanguages = make([]*internal.SearchResultLanguages, 0, 10)
  314. for _, bucket := range agg.Buckets {
  315. searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
  316. Language: bucket.Key.(string),
  317. Color: enry.GetColor(bucket.Key.(string)),
  318. Count: int(bucket.DocCount),
  319. })
  320. }
  321. }
  322. return searchResultLanguages
  323. }
  324. // Search searches for codes and language stats by given conditions.
  325. func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
  326. var contentQuery elastic.Query
  327. searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue)
  328. if searchMode == indexer.SearchModeExact {
  329. // 1.21 used NewMultiMatchQuery().Type(esMultiMatchTypePhrasePrefix), but later releases changed to NewMatchPhraseQuery
  330. contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword)
  331. } else /* words */ {
  332. contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).Type(esMultiMatchTypeBestFields).Operator("and")
  333. }
  334. kwQuery := elastic.NewBoolQuery().Should(
  335. contentQuery,
  336. elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
  337. )
  338. query := elastic.NewBoolQuery()
  339. query = query.Must(kwQuery)
  340. if len(opts.RepoIDs) > 0 {
  341. repoStrs := make([]any, 0, len(opts.RepoIDs))
  342. for _, repoID := range opts.RepoIDs {
  343. repoStrs = append(repoStrs, repoID)
  344. }
  345. repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
  346. query = query.Must(repoQuery)
  347. }
  348. var (
  349. start, pageSize = opts.GetSkipTake()
  350. kw = "<em>" + opts.Keyword + "</em>"
  351. aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
  352. )
  353. if len(opts.Language) == 0 {
  354. searchResult, err := b.inner.Client.Search().
  355. Index(b.inner.VersionedIndexName()).
  356. Aggregation("language", aggregation).
  357. Query(query).
  358. Highlight(
  359. elastic.NewHighlight().
  360. Field("content").
  361. Field("filename").
  362. NumOfFragments(0). // return all highting content on fragments
  363. HighlighterType("fvh"),
  364. ).
  365. Sort("_score", false).
  366. Sort("updated_at", true).
  367. From(start).Size(pageSize).
  368. Do(ctx)
  369. if err != nil {
  370. return 0, nil, nil, err
  371. }
  372. return convertResult(searchResult, kw, pageSize)
  373. }
  374. langQuery := elastic.NewMatchQuery("language", opts.Language)
  375. countResult, err := b.inner.Client.Search().
  376. Index(b.inner.VersionedIndexName()).
  377. Aggregation("language", aggregation).
  378. Query(query).
  379. Size(0). // We only need stats information
  380. Do(ctx)
  381. if err != nil {
  382. return 0, nil, nil, err
  383. }
  384. query = query.Must(langQuery)
  385. searchResult, err := b.inner.Client.Search().
  386. Index(b.inner.VersionedIndexName()).
  387. Query(query).
  388. Highlight(
  389. elastic.NewHighlight().
  390. Field("content").
  391. Field("filename").
  392. NumOfFragments(0). // return all highting content on fragments
  393. HighlighterType("fvh"),
  394. ).
  395. Sort("_score", false).
  396. Sort("updated_at", true).
  397. From(start).Size(pageSize).
  398. Do(ctx)
  399. if err != nil {
  400. return 0, nil, nil, err
  401. }
  402. total, hits, _, err := convertResult(searchResult, kw, pageSize)
  403. return total, hits, extractAggs(countResult), err
  404. }