本站源代码
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

337 lines
9.3KB

  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package bidirule implements the Bidi Rule defined by RFC 5893.
  5. //
  6. // This package is under development. The API may change without notice and
  7. // without preserving backward compatibility.
  8. package bidirule
  9. import (
  10. "errors"
  11. "unicode/utf8"
  12. "golang.org/x/text/transform"
  13. "golang.org/x/text/unicode/bidi"
  14. )
  15. // This file contains an implementation of RFC 5893: Right-to-Left Scripts for
  16. // Internationalized Domain Names for Applications (IDNA)
  17. //
  18. // A label is an individual component of a domain name. Labels are usually
  19. // shown separated by dots; for example, the domain name "www.example.com" is
  20. // composed of three labels: "www", "example", and "com".
  21. //
  22. // An RTL label is a label that contains at least one character of class R, AL,
  23. // or AN. An LTR label is any label that is not an RTL label.
  24. //
  25. // A "Bidi domain name" is a domain name that contains at least one RTL label.
  26. //
  27. // The following guarantees can be made based on the above:
  28. //
  29. // o In a domain name consisting of only labels that satisfy the rule,
  30. // the requirements of Section 3 are satisfied. Note that even LTR
  31. // labels and pure ASCII labels have to be tested.
  32. //
  33. // o In a domain name consisting of only LDH labels (as defined in the
  34. // Definitions document [RFC5890]) and labels that satisfy the rule,
  35. // the requirements of Section 3 are satisfied as long as a label
  36. // that starts with an ASCII digit does not come after a
  37. // right-to-left label.
  38. //
  39. // No guarantee is given for other combinations.
  // ErrInvalid is the sentinel error reported when a label does not satisfy the
  // Bidi Rule.
  var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
  // ruleState is a state of the state machine that checks a label against the
  // Bidi Rule.
  type ruleState uint8

  // States of the Bidi Rule state machine. ruleInitial is the zero value, so a
  // zero Transformer starts there.
  const (
  	ruleInitial  = ruleState(iota) // no character has been processed yet
  	ruleLTR                        // LTR label; the last character seen may not end the label
  	ruleLTRFinal                   // LTR label; the last character seen may end the label
  	ruleRTL                        // RTL label; the last character seen may not end the label
  	ruleRTLFinal                   // RTL label; the last character seen may end the label
  	ruleInvalid                    // no transition matched, or both EN and AN were seen
  )
  51. type ruleTransition struct {
  52. next ruleState
  53. mask uint16
  54. }
  55. var transitions = [...][2]ruleTransition{
  56. // [2.1] The first character must be a character with Bidi property L, R, or
  57. // AL. If it has the R or AL property, it is an RTL label; if it has the L
  58. // property, it is an LTR label.
  59. ruleInitial: {
  60. {ruleLTRFinal, 1 << bidi.L},
  61. {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL},
  62. },
  63. ruleRTL: {
  64. // [2.3] In an RTL label, the end of the label must be a character with
  65. // Bidi property R, AL, EN, or AN, followed by zero or more characters
  66. // with Bidi property NSM.
  67. {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN},
  68. // [2.2] In an RTL label, only characters with the Bidi properties R,
  69. // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
  70. // We exclude the entries from [2.3]
  71. {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
  72. },
  73. ruleRTLFinal: {
  74. // [2.3] In an RTL label, the end of the label must be a character with
  75. // Bidi property R, AL, EN, or AN, followed by zero or more characters
  76. // with Bidi property NSM.
  77. {ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM},
  78. // [2.2] In an RTL label, only characters with the Bidi properties R,
  79. // AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
  80. // We exclude the entries from [2.3] and NSM.
  81. {ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
  82. },
  83. ruleLTR: {
  84. // [2.6] In an LTR label, the end of the label must be a character with
  85. // Bidi property L or EN, followed by zero or more characters with Bidi
  86. // property NSM.
  87. {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN},
  88. // [2.5] In an LTR label, only characters with the Bidi properties L,
  89. // EN, ES, CS, ET, ON, BN, or NSM are allowed.
  90. // We exclude the entries from [2.6].
  91. {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM},
  92. },
  93. ruleLTRFinal: {
  94. // [2.6] In an LTR label, the end of the label must be a character with
  95. // Bidi property L or EN, followed by zero or more characters with Bidi
  96. // property NSM.
  97. {ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM},
  98. // [2.5] In an LTR label, only characters with the Bidi properties L,
  99. // EN, ES, CS, ET, ON, BN, or NSM are allowed.
  100. // We exclude the entries from [2.6].
  101. {ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN},
  102. },
  103. ruleInvalid: {
  104. {ruleInvalid, 0},
  105. {ruleInvalid, 0},
  106. },
  107. }
  108. // [2.4] In an RTL label, if an EN is present, no AN may be present, and
  109. // vice versa.
  110. const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN)
  111. // From RFC 5893
  112. // An RTL label is a label that contains at least one character of type
  113. // R, AL, or AN.
  114. //
  115. // An LTR label is any label that is not an RTL label.
  116. // Direction reports the direction of the given label as defined by RFC 5893.
  117. // The Bidi Rule does not have to be applied to labels of the category
  118. // LeftToRight.
  119. func Direction(b []byte) bidi.Direction {
  120. for i := 0; i < len(b); {
  121. e, sz := bidi.Lookup(b[i:])
  122. if sz == 0 {
  123. i++
  124. }
  125. c := e.Class()
  126. if c == bidi.R || c == bidi.AL || c == bidi.AN {
  127. return bidi.RightToLeft
  128. }
  129. i += sz
  130. }
  131. return bidi.LeftToRight
  132. }
  133. // DirectionString reports the direction of the given label as defined by RFC
  134. // 5893. The Bidi Rule does not have to be applied to labels of the category
  135. // LeftToRight.
  136. func DirectionString(s string) bidi.Direction {
  137. for i := 0; i < len(s); {
  138. e, sz := bidi.LookupString(s[i:])
  139. if sz == 0 {
  140. i++
  141. continue
  142. }
  143. c := e.Class()
  144. if c == bidi.R || c == bidi.AL || c == bidi.AN {
  145. return bidi.RightToLeft
  146. }
  147. i += sz
  148. }
  149. return bidi.LeftToRight
  150. }
  151. // Valid reports whether b conforms to the BiDi rule.
  152. func Valid(b []byte) bool {
  153. var t Transformer
  154. if n, ok := t.advance(b); !ok || n < len(b) {
  155. return false
  156. }
  157. return t.isFinal()
  158. }
  159. // ValidString reports whether s conforms to the BiDi rule.
  160. func ValidString(s string) bool {
  161. var t Transformer
  162. if n, ok := t.advanceString(s); !ok || n < len(s) {
  163. return false
  164. }
  165. return t.isFinal()
  166. }
  167. // New returns a Transformer that verifies that input adheres to the Bidi Rule.
  168. func New() *Transformer {
  169. return &Transformer{}
  170. }
  171. // Transformer implements transform.Transform.
  172. type Transformer struct {
  173. state ruleState
  174. hasRTL bool
  175. seen uint16
  176. }
  177. // A rule can only be violated for "Bidi Domain names", meaning if one of the
  178. // following categories has been observed.
  179. func (t *Transformer) isRTL() bool {
  180. const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN
  181. return t.seen&isRTL != 0
  182. }
  183. // Reset implements transform.Transformer.
  184. func (t *Transformer) Reset() { *t = Transformer{} }
  185. // Transform implements transform.Transformer. This Transformer has state and
  186. // needs to be reset between uses.
  187. func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  188. if len(dst) < len(src) {
  189. src = src[:len(dst)]
  190. atEOF = false
  191. err = transform.ErrShortDst
  192. }
  193. n, err1 := t.Span(src, atEOF)
  194. copy(dst, src[:n])
  195. if err == nil || err1 != nil && err1 != transform.ErrShortSrc {
  196. err = err1
  197. }
  198. return n, n, err
  199. }
  200. // Span returns the first n bytes of src that conform to the Bidi rule.
  201. func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) {
  202. if t.state == ruleInvalid && t.isRTL() {
  203. return 0, ErrInvalid
  204. }
  205. n, ok := t.advance(src)
  206. switch {
  207. case !ok:
  208. err = ErrInvalid
  209. case n < len(src):
  210. if !atEOF {
  211. err = transform.ErrShortSrc
  212. break
  213. }
  214. err = ErrInvalid
  215. case !t.isFinal():
  216. err = ErrInvalid
  217. }
  218. return n, err
  219. }
  220. // Precomputing the ASCII values decreases running time for the ASCII fast path
  221. // by about 30%.
  222. var asciiTable [128]bidi.Properties
  223. func init() {
  224. for i := range asciiTable {
  225. p, _ := bidi.LookupRune(rune(i))
  226. asciiTable[i] = p
  227. }
  228. }
  229. func (t *Transformer) advance(s []byte) (n int, ok bool) {
  230. var e bidi.Properties
  231. var sz int
  232. for n < len(s) {
  233. if s[n] < utf8.RuneSelf {
  234. e, sz = asciiTable[s[n]], 1
  235. } else {
  236. e, sz = bidi.Lookup(s[n:])
  237. if sz <= 1 {
  238. if sz == 1 {
  239. // We always consider invalid UTF-8 to be invalid, even if
  240. // the string has not yet been determined to be RTL.
  241. // TODO: is this correct?
  242. return n, false
  243. }
  244. return n, true // incomplete UTF-8 encoding
  245. }
  246. }
  247. // TODO: using CompactClass would result in noticeable speedup.
  248. // See unicode/bidi/prop.go:Properties.CompactClass.
  249. c := uint16(1 << e.Class())
  250. t.seen |= c
  251. if t.seen&exclusiveRTL == exclusiveRTL {
  252. t.state = ruleInvalid
  253. return n, false
  254. }
  255. switch tr := transitions[t.state]; {
  256. case tr[0].mask&c != 0:
  257. t.state = tr[0].next
  258. case tr[1].mask&c != 0:
  259. t.state = tr[1].next
  260. default:
  261. t.state = ruleInvalid
  262. if t.isRTL() {
  263. return n, false
  264. }
  265. }
  266. n += sz
  267. }
  268. return n, true
  269. }
  270. func (t *Transformer) advanceString(s string) (n int, ok bool) {
  271. var e bidi.Properties
  272. var sz int
  273. for n < len(s) {
  274. if s[n] < utf8.RuneSelf {
  275. e, sz = asciiTable[s[n]], 1
  276. } else {
  277. e, sz = bidi.LookupString(s[n:])
  278. if sz <= 1 {
  279. if sz == 1 {
  280. return n, false // invalid UTF-8
  281. }
  282. return n, true // incomplete UTF-8 encoding
  283. }
  284. }
  285. // TODO: using CompactClass results in noticeable speedup.
  286. // See unicode/bidi/prop.go:Properties.CompactClass.
  287. c := uint16(1 << e.Class())
  288. t.seen |= c
  289. if t.seen&exclusiveRTL == exclusiveRTL {
  290. t.state = ruleInvalid
  291. return n, false
  292. }
  293. switch tr := transitions[t.state]; {
  294. case tr[0].mask&c != 0:
  295. t.state = tr[0].next
  296. case tr[1].mask&c != 0:
  297. t.state = tr[1].next
  298. default:
  299. t.state = ruleInvalid
  300. if t.isRTL() {
  301. return n, false
  302. }
  303. }
  304. n += sz
  305. }
  306. return n, true
  307. }
上海开阖软件有限公司 沪ICP备12045867号-1