gitea源码

typesniffer.go 4.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. // Copyright 2021 The Gitea Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package typesniffer
  4. import (
  5. "bytes"
  6. "encoding/binary"
  7. "net/http"
  8. "regexp"
  9. "slices"
  10. "strings"
  11. "sync"
  12. )
  13. const SniffContentSize = 1024
  14. const (
  15. MimeTypeImageSvg = "image/svg+xml"
  16. MimeTypeImageAvif = "image/avif"
  17. MimeTypeApplicationOctetStream = "application/octet-stream"
  18. )
  19. var globalVars = sync.OnceValue(func() (ret struct {
  20. svgComment, svgTagRegex, svgTagInXMLRegex *regexp.Regexp
  21. },
  22. ) {
  23. ret.svgComment = regexp.MustCompile(`(?s)<!--.*?-->`)
  24. ret.svgTagRegex = regexp.MustCompile(`(?si)\A\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
  25. ret.svgTagInXMLRegex = regexp.MustCompile(`(?si)\A<\?xml\b.*?\?>\s*(?:(<!DOCTYPE\s+svg([\s:]+.*?>|>))\s*)*<svg\b`)
  26. return ret
  27. })
  28. // SniffedType contains information about a blob's type.
  29. type SniffedType struct {
  30. contentType string
  31. }
  32. // IsText detects if the content format is text family, including text/plain, text/html, text/css, etc.
  33. func (ct SniffedType) IsText() bool {
  34. return strings.Contains(ct.contentType, "text/")
  35. }
  36. func (ct SniffedType) IsTextPlain() bool {
  37. return strings.Contains(ct.contentType, "text/plain")
  38. }
  39. // IsImage detects if data is an image format
  40. func (ct SniffedType) IsImage() bool {
  41. return strings.Contains(ct.contentType, "image/")
  42. }
  43. // IsSvgImage detects if data is an SVG image format
  44. func (ct SniffedType) IsSvgImage() bool {
  45. return strings.Contains(ct.contentType, MimeTypeImageSvg)
  46. }
  47. // IsPDF detects if data is a PDF format
  48. func (ct SniffedType) IsPDF() bool {
  49. return strings.Contains(ct.contentType, "application/pdf")
  50. }
  51. // IsVideo detects if data is a video format
  52. func (ct SniffedType) IsVideo() bool {
  53. return strings.Contains(ct.contentType, "video/")
  54. }
  55. // IsAudio detects if data is a video format
  56. func (ct SniffedType) IsAudio() bool {
  57. return strings.Contains(ct.contentType, "audio/")
  58. }
  59. // IsRepresentableAsText returns true if file content can be represented as
  60. // plain text or is empty.
  61. func (ct SniffedType) IsRepresentableAsText() bool {
  62. return ct.IsText() || ct.IsSvgImage()
  63. }
  64. // IsBrowsableBinaryType returns whether a non-text type can be displayed in a browser
  65. func (ct SniffedType) IsBrowsableBinaryType() bool {
  66. return ct.IsImage() || ct.IsSvgImage() || ct.IsPDF() || ct.IsVideo() || ct.IsAudio()
  67. }
  68. // GetMimeType returns the mime type
  69. func (ct SniffedType) GetMimeType() string {
  70. return strings.SplitN(ct.contentType, ";", 2)[0]
  71. }
  72. // https://en.wikipedia.org/wiki/ISO_base_media_file_format#File_type_box
  73. func detectFileTypeBox(data []byte) (brands []string, found bool) {
  74. if len(data) < 12 {
  75. return nil, false
  76. }
  77. boxSize := int(binary.BigEndian.Uint32(data[:4]))
  78. if boxSize < 12 || boxSize > len(data) {
  79. return nil, false
  80. }
  81. tag := string(data[4:8])
  82. if tag != "ftyp" {
  83. return nil, false
  84. }
  85. brands = append(brands, string(data[8:12]))
  86. for i := 16; i+4 <= boxSize; i += 4 {
  87. brands = append(brands, string(data[i:i+4]))
  88. }
  89. return brands, true
  90. }
  91. // DetectContentType extends http.DetectContentType with more content types. Defaults to text/plain if input is empty.
  92. func DetectContentType(data []byte) SniffedType {
  93. if len(data) == 0 {
  94. return SniffedType{"text/plain"}
  95. }
  96. ct := http.DetectContentType(data)
  97. if len(data) > SniffContentSize {
  98. data = data[:SniffContentSize]
  99. }
  100. vars := globalVars()
  101. // SVG is unsupported by http.DetectContentType, https://github.com/golang/go/issues/15888
  102. detectByHTML := strings.Contains(ct, "text/plain") || strings.Contains(ct, "text/html")
  103. detectByXML := strings.Contains(ct, "text/xml")
  104. if detectByHTML || detectByXML {
  105. dataProcessed := vars.svgComment.ReplaceAll(data, nil)
  106. dataProcessed = bytes.TrimSpace(dataProcessed)
  107. if detectByHTML && vars.svgTagRegex.Match(dataProcessed) ||
  108. detectByXML && vars.svgTagInXMLRegex.Match(dataProcessed) {
  109. ct = MimeTypeImageSvg
  110. }
  111. }
  112. if strings.HasPrefix(ct, "audio/") && bytes.HasPrefix(data, []byte("ID3")) {
  113. // The MP3 detection is quite inaccurate, any content with "ID3" prefix will result in "audio/mpeg".
  114. // So remove the "ID3" prefix and detect again, then if the result is "text", it must be text content.
  115. // This works especially because audio files contain many unprintable/invalid characters like `0x00`
  116. ct2 := http.DetectContentType(data[3:])
  117. if strings.HasPrefix(ct2, "text/") {
  118. ct = ct2
  119. }
  120. }
  121. fileTypeBrands, found := detectFileTypeBox(data)
  122. if found && slices.Contains(fileTypeBrands, "avif") {
  123. ct = MimeTypeImageAvif
  124. }
  125. if ct == "application/ogg" {
  126. dataHead := data
  127. if len(dataHead) > 256 {
  128. dataHead = dataHead[:256] // only need to do a quick check for the file header
  129. }
  130. if bytes.Contains(dataHead, []byte("theora")) || bytes.Contains(dataHead, []byte("dirac")) {
  131. ct = "video/ogg" // ogg is only used for some video formats, and it's not popular
  132. } else {
  133. ct = "audio/ogg" // for most cases, it is used as an audio container
  134. }
  135. }
  136. return SniffedType{ct}
  137. }