gitea源码

highlightdiff.go 9.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. // Copyright 2022 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package gitdiff
  4. import (
  5. "bytes"
  6. "html/template"
  7. "strings"
  8. "github.com/sergi/go-diff/diffmatchpatch"
  9. )
  10. // token is a html tag or entity, eg: "<span ...>", "</span>", "&lt;"
  11. func extractHTMLToken(s string) (before, token, after string, valid bool) {
  12. for pos1 := 0; pos1 < len(s); pos1++ {
  13. switch s[pos1] {
  14. case '<':
  15. pos2 := strings.IndexByte(s[pos1:], '>')
  16. if pos2 == -1 {
  17. return "", "", s, false
  18. }
  19. return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
  20. case '&':
  21. pos2 := strings.IndexByte(s[pos1:], ';')
  22. if pos2 == -1 {
  23. return "", "", s, false
  24. }
  25. return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
  26. }
  27. }
  28. return "", "", s, true
  29. }
  30. // highlightCodeDiff is used to do diff with highlighted HTML code.
  31. // It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
  32. // The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
  33. // These Unicode placeholders are friendly to the diff.
  34. // Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
  35. // It's guaranteed that the tags in final diff result are paired correctly.
  36. type highlightCodeDiff struct {
  37. placeholderBegin rune
  38. placeholderMaxCount int
  39. placeholderIndex int
  40. placeholderTokenMap map[rune]string
  41. tokenPlaceholderMap map[string]rune
  42. placeholderOverflowCount int
  43. lineWrapperTags []string
  44. }
  45. func newHighlightCodeDiff() *highlightCodeDiff {
  46. return &highlightCodeDiff{
  47. placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
  48. placeholderMaxCount: 64000,
  49. placeholderTokenMap: map[rune]string{},
  50. tokenPlaceholderMap: map[string]rune{},
  51. }
  52. }
  53. // nextPlaceholder returns 0 if no more placeholder can be used
  54. // the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
  55. // so the placeholderMaxCount is impossible to be exhausted in real cases.
  56. func (hcd *highlightCodeDiff) nextPlaceholder() rune {
  57. for hcd.placeholderIndex < hcd.placeholderMaxCount {
  58. r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
  59. hcd.placeholderIndex++
  60. // only use non-existing (not used by code) rune as placeholders
  61. if _, ok := hcd.placeholderTokenMap[r]; !ok {
  62. return r
  63. }
  64. }
  65. return 0 // no more available placeholder
  66. }
  67. func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool {
  68. return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
  69. }
  70. func (hcd *highlightCodeDiff) collectUsedRunes(code template.HTML) {
  71. for _, r := range code {
  72. if hcd.isInPlaceholderRange(r) {
  73. // put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
  74. hcd.placeholderTokenMap[r] = ""
  75. }
  76. }
  77. }
  78. func (hcd *highlightCodeDiff) diffLineWithHighlight(lineType DiffLineType, codeA, codeB template.HTML) template.HTML {
  79. return hcd.diffLineWithHighlightWrapper(nil, lineType, codeA, codeB)
  80. }
  81. func (hcd *highlightCodeDiff) diffLineWithHighlightWrapper(lineWrapperTags []string, lineType DiffLineType, codeA, codeB template.HTML) template.HTML {
  82. hcd.collectUsedRunes(codeA)
  83. hcd.collectUsedRunes(codeB)
  84. convertedCodeA := hcd.convertToPlaceholders(codeA)
  85. convertedCodeB := hcd.convertToPlaceholders(codeB)
  86. dmp := defaultDiffMatchPatch()
  87. diffs := dmp.DiffMain(convertedCodeA, convertedCodeB, true)
  88. diffs = dmp.DiffCleanupSemantic(diffs)
  89. buf := bytes.NewBuffer(nil)
  90. // restore the line wrapper tags <span class="line"> and <span class="cl">, if necessary
  91. for _, tag := range lineWrapperTags {
  92. buf.WriteString(tag)
  93. }
  94. addedCodePrefix := hcd.registerTokenAsPlaceholder(`<span class="added-code">`)
  95. removedCodePrefix := hcd.registerTokenAsPlaceholder(`<span class="removed-code">`)
  96. codeTagSuffix := hcd.registerTokenAsPlaceholder(`</span>`)
  97. if codeTagSuffix != 0 {
  98. for _, diff := range diffs {
  99. switch {
  100. case diff.Type == diffmatchpatch.DiffEqual:
  101. buf.WriteString(diff.Text)
  102. case diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd:
  103. buf.WriteRune(addedCodePrefix)
  104. buf.WriteString(diff.Text)
  105. buf.WriteRune(codeTagSuffix)
  106. case diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel:
  107. buf.WriteRune(removedCodePrefix)
  108. buf.WriteString(diff.Text)
  109. buf.WriteRune(codeTagSuffix)
  110. }
  111. }
  112. } else {
  113. // placeholder map space is exhausted
  114. for _, diff := range diffs {
  115. take := diff.Type == diffmatchpatch.DiffEqual || (diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd) || (diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel)
  116. if take {
  117. buf.WriteString(diff.Text)
  118. }
  119. }
  120. }
  121. for range lineWrapperTags {
  122. buf.WriteString("</span>")
  123. }
  124. return hcd.recoverOneDiff(buf.String())
  125. }
  126. func (hcd *highlightCodeDiff) registerTokenAsPlaceholder(token string) rune {
  127. placeholder, ok := hcd.tokenPlaceholderMap[token]
  128. if !ok {
  129. placeholder = hcd.nextPlaceholder()
  130. if placeholder != 0 {
  131. hcd.tokenPlaceholderMap[token] = placeholder
  132. hcd.placeholderTokenMap[placeholder] = token
  133. }
  134. }
  135. return placeholder
  136. }
  137. // convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
  138. func (hcd *highlightCodeDiff) convertToPlaceholders(htmlContent template.HTML) string {
  139. var tagStack []string
  140. res := strings.Builder{}
  141. firstRunForLineTags := hcd.lineWrapperTags == nil
  142. var beforeToken, token string
  143. var valid bool
  144. htmlCode := string(htmlContent)
  145. // the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
  146. for {
  147. beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
  148. if !valid || token == "" {
  149. break
  150. }
  151. // write the content before the token into result string, and consume the token in the string
  152. res.WriteString(beforeToken)
  153. // the line wrapper tags should be removed before diff
  154. if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
  155. if firstRunForLineTags {
  156. // if this is the first run for converting, save the line wrapper tags for later use, they should be added back
  157. hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
  158. }
  159. htmlCode = strings.TrimSuffix(htmlCode, "</span>")
  160. continue
  161. }
  162. var tokenInMap string
  163. if strings.HasSuffix(token, "</") { // for closing tag
  164. if len(tagStack) == 0 {
  165. break // invalid diff result, no opening tag but see closing tag
  166. }
  167. // make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
  168. // the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
  169. tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
  170. tagStack = tagStack[:len(tagStack)-1]
  171. } else if token[0] == '<' { // for opening tag
  172. tokenInMap = token
  173. tagStack = append(tagStack, token)
  174. } else if token[0] == '&' { // for html entity
  175. tokenInMap = token
  176. } // else: impossible
  177. // remember the placeholder and token in the map
  178. placeholder := hcd.registerTokenAsPlaceholder(tokenInMap)
  179. if placeholder != 0 {
  180. res.WriteRune(placeholder) // use the placeholder to replace the token
  181. } else {
  182. // unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
  183. // usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
  184. hcd.placeholderOverflowCount++
  185. if strings.HasPrefix(token, "&") {
  186. // when the token is a html entity, something must be outputted even if there is no placeholder.
  187. res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully?
  188. res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
  189. }
  190. }
  191. }
  192. // write the remaining string
  193. res.WriteString(htmlCode)
  194. return res.String()
  195. }
  196. func (hcd *highlightCodeDiff) recoverOneDiff(str string) template.HTML {
  197. sb := strings.Builder{}
  198. var tagStack []string
  199. for _, r := range str {
  200. token, ok := hcd.placeholderTokenMap[r]
  201. if !ok || token == "" {
  202. sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
  203. continue
  204. }
  205. var tokenToRecover string
  206. if strings.HasPrefix(token, "</") { // for closing tag
  207. // only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
  208. tokenToRecover = token[:strings.IndexByte(token, '>')+1]
  209. if len(tagStack) == 0 {
  210. continue // if no opening tag in stack yet, skip the closing tag
  211. }
  212. tagStack = tagStack[:len(tagStack)-1]
  213. } else if token[0] == '<' { // for opening tag
  214. tokenToRecover = token
  215. tagStack = append(tagStack, token)
  216. } else if token[0] == '&' { // for html entity
  217. tokenToRecover = token
  218. } // else: impossible
  219. sb.WriteString(tokenToRecover)
  220. }
  221. if len(tagStack) > 0 {
  222. // close all opening tags
  223. for i := len(tagStack) - 1; i >= 0; i-- {
  224. tagToClose := tagStack[i]
  225. // get the closing tag "</span>" from "<span class=...>" or "<span>"
  226. pos := strings.IndexAny(tagToClose, " >")
  227. if pos != -1 {
  228. sb.WriteString("</" + tagToClose[1:pos] + ">")
  229. } // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
  230. }
  231. }
  232. return template.HTML(sb.String())
  233. }