gitea源码

escape_stream.go 7.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. // Copyright 2022 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package charset
  4. import (
  5. "fmt"
  6. "regexp"
  7. "strings"
  8. "unicode"
  9. "unicode/utf8"
  10. "code.gitea.io/gitea/modules/translation"
  11. "golang.org/x/net/html"
  12. )
  // defaultWordRegexp splits text into "words"; it follows the default word
  // pattern used by VS Code. It has two alternatives:
  //   - a decimal number: an optional minus sign, optional digits, a dot, one
  //     digit, then any trailing word characters;
  //   - a run of one or more characters, none of which is one of the listed
  //     ASCII punctuation characters, whitespace, or a control character in
  //     the range \x00-\x1f.
  var defaultWordRegexp = regexp.MustCompile(
  	`(-?\d*\.\d\w*)` +
  		`|` +
  		`([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`,
  )
  15. func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
  16. allowedM := make(map[rune]bool, len(allowed))
  17. for _, v := range allowed {
  18. allowedM[v] = true
  19. }
  20. return &escapeStreamer{
  21. escaped: &EscapeStatus{},
  22. PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
  23. locale: locale,
  24. ambiguousTables: AmbiguousTablesForLocale(locale),
  25. allowed: allowedM,
  26. }
  27. }
  28. type escapeStreamer struct {
  29. PassthroughHTMLStreamer
  30. escaped *EscapeStatus
  31. locale translation.Locale
  32. ambiguousTables []*AmbiguousTable
  33. allowed map[rune]bool
  34. }
  35. func (e *escapeStreamer) EscapeStatus() *EscapeStatus {
  36. return e.escaped
  37. }
  38. // Text tells the next streamer there is a text
  39. func (e *escapeStreamer) Text(data string) error {
  40. sb := &strings.Builder{}
  41. var until int
  42. var next int
  43. pos := 0
  44. if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) {
  45. _, _ = sb.WriteString(data[:len(UTF8BOM)])
  46. pos = len(UTF8BOM)
  47. }
  48. dataBytes := []byte(data)
  49. for pos < len(data) {
  50. nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:])
  51. if nextIdxs == nil {
  52. until = len(data)
  53. next = until
  54. } else {
  55. until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
  56. }
  57. // from pos until we know that the runes are not \r\t\n or even ' '
  58. runes := make([]rune, 0, next-until)
  59. positions := make([]int, 0, next-until+1)
  60. for pos < until {
  61. r, sz := utf8.DecodeRune(dataBytes[pos:])
  62. positions = positions[:0]
  63. positions = append(positions, pos, pos+sz)
  64. types, confusables, _ := e.runeTypes(r)
  65. if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil {
  66. return err
  67. }
  68. pos += sz
  69. }
  70. for i := pos; i < next; {
  71. r, sz := utf8.DecodeRune(dataBytes[i:])
  72. runes = append(runes, r)
  73. positions = append(positions, i)
  74. i += sz
  75. }
  76. positions = append(positions, next)
  77. types, confusables, runeCounts := e.runeTypes(runes...)
  78. if runeCounts.needsEscape() {
  79. if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil {
  80. return err
  81. }
  82. } else {
  83. _, _ = sb.Write(dataBytes[pos:next])
  84. }
  85. pos = next
  86. }
  87. if sb.Len() > 0 {
  88. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  89. return err
  90. }
  91. }
  92. return nil
  93. }
  94. func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error {
  95. for i, r := range runes {
  96. switch types[i] {
  97. case brokenRuneType:
  98. if sb.Len() > 0 {
  99. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  100. return err
  101. }
  102. sb.Reset()
  103. }
  104. end := positions[i+1]
  105. start := positions[i]
  106. if err := e.brokenRune(data[start:end]); err != nil {
  107. return err
  108. }
  109. case ambiguousRuneType:
  110. if sb.Len() > 0 {
  111. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  112. return err
  113. }
  114. sb.Reset()
  115. }
  116. if err := e.ambiguousRune(r, confusables[0]); err != nil {
  117. return err
  118. }
  119. confusables = confusables[1:]
  120. case invisibleRuneType:
  121. if sb.Len() > 0 {
  122. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  123. return err
  124. }
  125. sb.Reset()
  126. }
  127. if err := e.invisibleRune(r); err != nil {
  128. return err
  129. }
  130. default:
  131. _, _ = sb.WriteRune(r)
  132. }
  133. }
  134. return nil
  135. }
  136. func (e *escapeStreamer) brokenRune(bs []byte) error {
  137. e.escaped.Escaped = true
  138. e.escaped.HasBadRunes = true
  139. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  140. Key: "class",
  141. Val: "broken-code-point",
  142. }); err != nil {
  143. return err
  144. }
  145. if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil {
  146. return err
  147. }
  148. return e.PassthroughHTMLStreamer.EndTag("span")
  149. }
  150. func (e *escapeStreamer) ambiguousRune(r, c rune) error {
  151. e.escaped.Escaped = true
  152. e.escaped.HasAmbiguous = true
  153. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  154. Key: "class",
  155. Val: "ambiguous-code-point",
  156. }, html.Attribute{
  157. Key: "data-tooltip-content",
  158. Val: e.locale.TrString("repo.ambiguous_character", r, c),
  159. }); err != nil {
  160. return err
  161. }
  162. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  163. Key: "class",
  164. Val: "char",
  165. }); err != nil {
  166. return err
  167. }
  168. if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
  169. return err
  170. }
  171. if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
  172. return err
  173. }
  174. return e.PassthroughHTMLStreamer.EndTag("span")
  175. }
  176. func (e *escapeStreamer) invisibleRune(r rune) error {
  177. e.escaped.Escaped = true
  178. e.escaped.HasInvisible = true
  179. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  180. Key: "class",
  181. Val: "escaped-code-point",
  182. }, html.Attribute{
  183. Key: "data-escaped",
  184. Val: fmt.Sprintf("[U+%04X]", r),
  185. }); err != nil {
  186. return err
  187. }
  188. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  189. Key: "class",
  190. Val: "char",
  191. }); err != nil {
  192. return err
  193. }
  194. if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
  195. return err
  196. }
  197. if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
  198. return err
  199. }
  200. return e.PassthroughHTMLStreamer.EndTag("span")
  201. }
  // runeCountType tallies how many runes of each category a run of text
  // contains.
  type runeCountType struct {
  	numBasicRunes                int
  	numNonConfusingNonBasicRunes int
  	numAmbiguousRunes            int
  	numInvisibleRunes            int
  	numBrokenRunes               int
  }

  // needsEscape reports whether a run with these counts has to be escaped.
  //
  // Any broken rune forces escaping. A run with no basic runes but at least
  // one non-confusing non-basic rune is never escaped. Otherwise the run is
  // escaped when it contains an ambiguous or an invisible rune.
  func (counts runeCountType) needsEscape() bool {
  	switch {
  	case counts.numBrokenRunes > 0:
  		return true
  	case counts.numBasicRunes == 0 && counts.numNonConfusingNonBasicRunes > 0:
  		return false
  	default:
  		return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0
  	}
  }
  // runeType is the category runeTypes assigns to a single rune.
  type runeType int

  const (
  	// basicASCIIRuneType is the zero value, so it is what a freshly
  	// allocated []runeType holds for every rune not given another type.
  	// The name is kept because it documents that default.
  	basicASCIIRuneType runeType = iota
  	// brokenRuneType marks a rune equal to utf8.RuneError.
  	brokenRuneType
  	// nonBasicASCIIRuneType marks a rune above 0x7e or below 0x20 that is
  	// neither invisible nor ambiguous.
  	nonBasicASCIIRuneType
  	// ambiguousRuneType marks a rune that isAmbiguous reports as confusable.
  	ambiguousRuneType
  	// invisibleRuneType marks a rune in InvisibleRanges or a control
  	// character.
  	invisibleRuneType
  )
  227. func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) {
  228. types = make([]runeType, len(runes))
  229. for i, r := range runes {
  230. var confusable rune
  231. switch {
  232. case r == utf8.RuneError:
  233. types[i] = brokenRuneType
  234. runeCounts.numBrokenRunes++
  235. case r == ' ' || r == '\t' || r == '\n':
  236. runeCounts.numBasicRunes++
  237. case e.allowed[r]:
  238. if r > 0x7e || r < 0x20 {
  239. types[i] = nonBasicASCIIRuneType
  240. runeCounts.numNonConfusingNonBasicRunes++
  241. } else {
  242. runeCounts.numBasicRunes++
  243. }
  244. case unicode.Is(InvisibleRanges, r):
  245. types[i] = invisibleRuneType
  246. runeCounts.numInvisibleRunes++
  247. case unicode.IsControl(r):
  248. types[i] = invisibleRuneType
  249. runeCounts.numInvisibleRunes++
  250. case isAmbiguous(r, &confusable, e.ambiguousTables...):
  251. confusables = append(confusables, confusable)
  252. types[i] = ambiguousRuneType
  253. runeCounts.numAmbiguousRunes++
  254. case r > 0x7e || r < 0x20:
  255. types[i] = nonBasicASCIIRuneType
  256. runeCounts.numNonConfusingNonBasicRunes++
  257. default:
  258. runeCounts.numBasicRunes++
  259. }
  260. }
  261. return types, confusables, runeCounts
  262. }