本站源代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

253 lines
11KB

  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package charset
  5. import (
  6. "testing"
  7. "code.gitea.io/gitea/modules/setting"
  8. "github.com/stretchr/testify/assert"
  9. )
  10. func TestRemoveBOMIfPresent(t *testing.T) {
  11. res := RemoveBOMIfPresent([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  12. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  13. res = RemoveBOMIfPresent([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  14. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  15. }
  16. func TestToUTF8WithErr(t *testing.T) {
  17. var res string
  18. var err error
  19. // Note: golang compiler seems so behave differently depending on the current
  20. // locale, so some conversions might behave differently. For that reason, we don't
  21. // depend on particular conversions but in expected behaviors.
  22. res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43})
  23. assert.NoError(t, err)
  24. assert.Equal(t, "ABC", res)
  25. // "áéíóú"
  26. res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  27. assert.NoError(t, err)
  28. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  29. // "áéíóú"
  30. res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
  31. 0xc3, 0xba})
  32. assert.NoError(t, err)
  33. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  34. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  35. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  36. assert.NoError(t, err)
  37. stringMustStartWith(t, "Hola,", res)
  38. stringMustEndWith(t, "AAA.", res)
  39. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  40. 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  41. assert.NoError(t, err)
  42. stringMustStartWith(t, "Hola,", res)
  43. stringMustEndWith(t, "AAA.", res)
  44. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  45. 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  46. assert.NoError(t, err)
  47. stringMustStartWith(t, "Hola,", res)
  48. stringMustEndWith(t, "AAA.", res)
  49. // Japanese (Shift-JIS)
  50. // 日属秘ぞしちゅ。
  51. res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
  52. 0xBF, 0x82, 0xE3, 0x81, 0x42})
  53. assert.NoError(t, err)
  54. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  55. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
  56. []byte(res))
  57. res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00})
  58. assert.NoError(t, err)
  59. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
  60. }
  61. func TestToUTF8WithFallback(t *testing.T) {
  62. // "ABC"
  63. res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43})
  64. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  65. // "áéíóú"
  66. res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  67. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  68. // UTF8 BOM + "áéíóú"
  69. res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  70. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  71. // "Hola, así cómo ños"
  72. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  73. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  74. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63,
  75. 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
  76. // "Hola, así cómo "
  77. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  78. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  79. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  80. assert.Equal(t, minmatch, res[0:len(minmatch)])
  81. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  82. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  83. assert.Equal(t, minmatch, res[0:len(minmatch)])
  84. // Japanese (Shift-JIS)
  85. // "日属秘ぞしちゅ。"
  86. res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  87. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  88. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  89. res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00})
  90. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  91. }
  92. func TestToUTF8(t *testing.T) {
  93. // Note: golang compiler seems so behave differently depending on the current
  94. // locale, so some conversions might behave differently. For that reason, we don't
  95. // depend on particular conversions but in expected behaviors.
  96. res := ToUTF8(string([]byte{0x41, 0x42, 0x43}))
  97. assert.Equal(t, "ABC", res)
  98. // "áéíóú"
  99. res = ToUTF8(string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}))
  100. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  101. // BOM + "áéíóú"
  102. res = ToUTF8(string([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
  103. 0xc3, 0xba}))
  104. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  105. // Latin1
  106. // Hola, así cómo ños
  107. res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  108. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}))
  109. assert.Equal(t, []byte{0x48, 0x6f, 0x6c, 0x61, 0x2c, 0x20, 0x61, 0x73, 0xc3, 0xad, 0x20, 0x63,
  110. 0xc3, 0xb3, 0x6d, 0x6f, 0x20, 0xc3, 0xb1, 0x6f, 0x73}, []byte(res))
  111. // Latin1
  112. // Hola, así cómo \x07ños
  113. res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  114. 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}))
  115. // Hola,
  116. bytesMustStartWith(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C}, []byte(res))
  117. // This test FAILS
  118. // res = ToUTF8("Hola, así cómo \x81ños")
  119. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  120. // assert.Regexp(t, "^Hola, así cómo", res)
  121. // Japanese (Shift-JIS)
  122. // 日属秘ぞしちゅ。
  123. res = ToUTF8(string([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
  124. 0xBF, 0x82, 0xE3, 0x81, 0x42}))
  125. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  126. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
  127. []byte(res))
  128. res = ToUTF8("\x00\x00\x00\x00")
  129. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
  130. }
  131. func TestToUTF8DropErrors(t *testing.T) {
  132. // "ABC"
  133. res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
  134. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  135. // "áéíóú"
  136. res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  137. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  138. // UTF8 BOM + "áéíóú"
  139. res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  140. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  141. // "Hola, así cómo ños"
  142. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  143. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8])
  144. assert.Equal(t, []byte{0x73}, res[len(res)-1:])
  145. // "Hola, así cómo "
  146. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  147. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  148. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  149. assert.Equal(t, minmatch, res[0:len(minmatch)])
  150. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  151. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  152. assert.Equal(t, minmatch, res[0:len(minmatch)])
  153. // Japanese (Shift-JIS)
  154. // "日属秘ぞしちゅ。"
  155. res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  156. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  157. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  158. res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
  159. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  160. }
  161. func TestDetectEncoding(t *testing.T) {
  162. testSuccess := func(b []byte, expected string) {
  163. encoding, err := DetectEncoding(b)
  164. assert.NoError(t, err)
  165. assert.Equal(t, expected, encoding)
  166. }
  167. // utf-8
  168. b := []byte("just some ascii")
  169. testSuccess(b, "UTF-8")
  170. // utf-8-sig: "hey" (with BOM)
  171. b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}
  172. testSuccess(b, "UTF-8")
  173. // utf-16: "hey<accented G>"
  174. b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}
  175. testSuccess(b, "UTF-16LE")
  176. // iso-8859-1: d<accented e>cor<newline>
  177. b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
  178. encoding, err := DetectEncoding(b)
  179. assert.NoError(t, err)
  180. // due to a race condition in `chardet` library, it could either detect
  181. // "ISO-8859-1" or "IS0-8859-2" here. Technically either is correct, so
  182. // we accept either.
  183. assert.Contains(t, encoding, "ISO-8859")
  184. setting.Repository.AnsiCharset = "placeholder"
  185. testSuccess(b, "placeholder")
  186. // invalid bytes
  187. b = []byte{0xfa}
  188. _, err = DetectEncoding(b)
  189. assert.Error(t, err)
  190. }
  // stringMustStartWith marks the test as failed (without aborting it) unless
  // value begins with expected.
  //
  // A value shorter than expected is reported as an ordinary failure; slicing
  // it directly would panic with an out-of-range index and take down the whole
  // test run.
  func stringMustStartWith(t *testing.T, expected string, value string) {
  	// Attribute the failure to the calling test line, not to this helper.
  	t.Helper()
  	if len(value) < len(expected) || value[:len(expected)] != expected {
  		t.Errorf("expected %q to start with %q", value, expected)
  	}
  }
  // stringMustEndWith marks the test as failed (without aborting it) unless
  // value ends with expected.
  //
  // A value shorter than expected is reported as an ordinary failure; slicing
  // it directly would compute a negative start index and panic.
  func stringMustEndWith(t *testing.T, expected string, value string) {
  	// Attribute the failure to the calling test line, not to this helper.
  	t.Helper()
  	if len(value) < len(expected) || value[len(value)-len(expected):] != expected {
  		t.Errorf("expected %q to end with %q", value, expected)
  	}
  }
  // bytesMustStartWith marks the test as failed (without aborting it) unless
  // the first len(expected) bytes of value equal expected.
  //
  // A value shorter than expected is reported as an ordinary failure; slicing
  // it directly can panic when expected is longer than the slice's capacity.
  func bytesMustStartWith(t *testing.T, expected []byte, value []byte) {
  	// Attribute the failure to the calling test line, not to this helper.
  	t.Helper()
  	if len(value) < len(expected) || string(value[:len(expected)]) != string(expected) {
  		t.Errorf("expected [% x] to start with [% x]", value, expected)
  	}
  }
上海开阖软件有限公司 沪ICP备12045867号-1