gitea源码

util.go 2.6KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. // Copyright 2023 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package bleve
  4. import (
  5. "errors"
  6. "os"
  7. "unicode"
  8. "code.gitea.io/gitea/modules/log"
  9. "code.gitea.io/gitea/modules/setting"
  10. "code.gitea.io/gitea/modules/util"
  11. "github.com/blevesearch/bleve/v2"
  12. unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
  13. "github.com/blevesearch/bleve/v2/index/upsidedown"
  14. "github.com/ethantkoenig/rupture"
  15. )
  16. const (
  17. maxFuzziness = 2
  18. )
  19. // openIndexer open the index at the specified path, checking for metadata
  20. // updates and bleve version updates. If index needs to be created (or
  21. // re-created), returns (nil, nil)
  22. func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
  23. _, err := os.Stat(path)
  24. if err != nil && os.IsNotExist(err) {
  25. return nil, 0, nil
  26. } else if err != nil {
  27. return nil, 0, err
  28. }
  29. metadata, err := rupture.ReadIndexMetadata(path)
  30. if err != nil {
  31. return nil, 0, err
  32. }
  33. if metadata.Version < latestVersion {
  34. // the indexer is using a previous version, so we should delete it and
  35. // re-populate
  36. return nil, metadata.Version, util.RemoveAll(path)
  37. }
  38. index, err := bleve.Open(path)
  39. if err != nil {
  40. if errors.Is(err, upsidedown.IncompatibleVersion) {
  41. log.Warn("Indexer was built with a previous version of bleve, deleting and rebuilding")
  42. return nil, 0, util.RemoveAll(path)
  43. }
  44. return nil, 0, err
  45. }
  46. return index, 0, nil
  47. }
  48. // GuessFuzzinessByKeyword guesses fuzziness based on the levenshtein distance and determines how many chars
  49. // may be different on two string, and they still be considered equivalent.
  50. // Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
  51. func GuessFuzzinessByKeyword(s string) int {
  52. tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
  53. tokens := tokenizer.Tokenize([]byte(s))
  54. if len(tokens) > 0 {
  55. fuzziness := maxFuzziness
  56. for _, token := range tokens {
  57. fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
  58. }
  59. return fuzziness
  60. }
  61. return 0
  62. }
  63. func guessFuzzinessByKeyword(s string) int {
  64. // according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
  65. // magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
  66. // BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
  67. // Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness
  68. for _, r := range s {
  69. if r >= 128 || !unicode.IsLetter(r) {
  70. return 0
  71. }
  72. }
  73. return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4)
  74. }