本站源代码
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

336 lines
11KB

  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package bidi
  5. import (
  6. "container/list"
  7. "fmt"
  8. "sort"
  9. )
  10. // This file contains a port of the reference implementation of the
  11. // Bidi Parentheses Algorithm:
  12. // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
  13. //
  14. // The implementation in this file covers definitions BD14-BD16 and rule N0
  15. // of UAX#9.
  16. //
  17. // Some preprocessing is done for each rune before data is passed to this
  18. // algorithm:
  19. // - opening and closing brackets are identified
  20. // - a bracket pair type, like '(' and ')' is assigned a unique identifier that
  21. // is identical for the opening and closing bracket. It is left to do these
  22. // mappings.
  23. // - The BPA algorithm requires that bracket characters that are canonical
  24. // equivalents of each other be able to be substituted for each other.
  25. // It is the responsibility of the caller to do this canonicalization.
  26. //
  27. // In implementing BD16, this implementation departs slightly from the "logical"
  28. // algorithm defined in UAX#9. In particular, the stack referenced there
  29. // supports operations that go beyond a "basic" stack. An equivalent
  30. // implementation based on a linked list is used here.
  31. // Bidi_Paired_Bracket_Type
  32. // BD14. An opening paired bracket is a character whose
  33. // Bidi_Paired_Bracket_Type property value is Open.
  34. //
  35. // BD15. A closing paired bracket is a character whose
  36. // Bidi_Paired_Bracket_Type property value is Close.
  37. type bracketType byte
  38. const (
  39. bpNone bracketType = iota
  40. bpOpen
  41. bpClose
  42. )
  43. // bracketPair holds a pair of index values for opening and closing bracket
  44. // location of a bracket pair.
  45. type bracketPair struct {
  46. opener int
  47. closer int
  48. }
  49. func (b *bracketPair) String() string {
  50. return fmt.Sprintf("(%v, %v)", b.opener, b.closer)
  51. }
  52. // bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
  53. type bracketPairs []bracketPair
  54. func (b bracketPairs) Len() int { return len(b) }
  55. func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
  56. func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener }
  57. // resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
  58. //
  59. // For each rune, it takes the indexes into the original string, the class the
  60. // bracket type (in pairTypes) and the bracket identifier (pairValues). It also
  61. // takes the direction type for the start-of-sentence and the embedding level.
  62. //
  63. // The identifiers for bracket types are the rune of the canonicalized opening
  64. // bracket for brackets (open or close) or 0 for runes that are not brackets.
  65. func resolvePairedBrackets(s *isolatingRunSequence) {
  66. p := bracketPairer{
  67. sos: s.sos,
  68. openers: list.New(),
  69. codesIsolatedRun: s.types,
  70. indexes: s.indexes,
  71. }
  72. dirEmbed := L
  73. if s.level&1 != 0 {
  74. dirEmbed = R
  75. }
  76. p.locateBrackets(s.p.pairTypes, s.p.pairValues)
  77. p.resolveBrackets(dirEmbed, s.p.initialTypes)
  78. }
  79. type bracketPairer struct {
  80. sos Class // direction corresponding to start of sequence
  81. // The following is a restatement of BD 16 using non-algorithmic language.
  82. //
  83. // A bracket pair is a pair of characters consisting of an opening
  84. // paired bracket and a closing paired bracket such that the
  85. // Bidi_Paired_Bracket property value of the former equals the latter,
  86. // subject to the following constraints.
  87. // - both characters of a pair occur in the same isolating run sequence
  88. // - the closing character of a pair follows the opening character
  89. // - any bracket character can belong at most to one pair, the earliest possible one
  90. // - any bracket character not part of a pair is treated like an ordinary character
  91. // - pairs may nest properly, but their spans may not overlap otherwise
  92. // Bracket characters with canonical decompositions are supposed to be
  93. // treated as if they had been normalized, to allow normalized and non-
  94. // normalized text to give the same result. In this implementation that step
  95. // is pushed out to the caller. The caller has to ensure that the pairValue
  96. // slices contain the rune of the opening bracket after normalization for
  97. // any opening or closing bracket.
  98. openers *list.List // list of positions for opening brackets
  99. // bracket pair positions sorted by location of opening bracket
  100. pairPositions bracketPairs
  101. codesIsolatedRun []Class // directional bidi codes for an isolated run
  102. indexes []int // array of index values into the original string
  103. }
  104. // matchOpener reports whether characters at given positions form a matching
  105. // bracket pair.
  106. func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool {
  107. return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]]
  108. }
  109. const maxPairingDepth = 63
  110. // locateBrackets locates matching bracket pairs according to BD16.
  111. //
  112. // This implementation uses a linked list instead of a stack, because, while
  113. // elements are added at the front (like a push) they are not generally removed
  114. // in atomic 'pop' operations, reducing the benefit of the stack archetype.
  115. func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) {
  116. // traverse the run
  117. // do that explicitly (not in a for-each) so we can record position
  118. for i, index := range p.indexes {
  119. // look at the bracket type for each character
  120. if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON {
  121. // continue scanning
  122. continue
  123. }
  124. switch pairTypes[index] {
  125. case bpOpen:
  126. // check if maximum pairing depth reached
  127. if p.openers.Len() == maxPairingDepth {
  128. p.openers.Init()
  129. return
  130. }
  131. // remember opener location, most recent first
  132. p.openers.PushFront(i)
  133. case bpClose:
  134. // see if there is a match
  135. count := 0
  136. for elem := p.openers.Front(); elem != nil; elem = elem.Next() {
  137. count++
  138. opener := elem.Value.(int)
  139. if p.matchOpener(pairValues, opener, i) {
  140. // if the opener matches, add nested pair to the ordered list
  141. p.pairPositions = append(p.pairPositions, bracketPair{opener, i})
  142. // remove up to and including matched opener
  143. for ; count > 0; count-- {
  144. p.openers.Remove(p.openers.Front())
  145. }
  146. break
  147. }
  148. }
  149. sort.Sort(p.pairPositions)
  150. // if we get here, the closing bracket matched no openers
  151. // and gets ignored
  152. }
  153. }
  154. }
  155. // Bracket pairs within an isolating run sequence are processed as units so
  156. // that both the opening and the closing paired bracket in a pair resolve to
  157. // the same direction.
  158. //
  159. // N0. Process bracket pairs in an isolating run sequence sequentially in
  160. // the logical order of the text positions of the opening paired brackets
  161. // using the logic given below. Within this scope, bidirectional types EN
  162. // and AN are treated as R.
  163. //
  164. // Identify the bracket pairs in the current isolating run sequence
  165. // according to BD16. For each bracket-pair element in the list of pairs of
  166. // text positions:
  167. //
  168. // a Inspect the bidirectional types of the characters enclosed within the
  169. // bracket pair.
  170. //
  171. // b If any strong type (either L or R) matching the embedding direction is
  172. // found, set the type for both brackets in the pair to match the embedding
  173. // direction.
  174. //
  175. // o [ e ] o -> o e e e o
  176. //
  177. // o [ o e ] -> o e o e e
  178. //
  179. // o [ NI e ] -> o e NI e e
  180. //
  181. // c Otherwise, if a strong type (opposite the embedding direction) is
  182. // found, test for adjacent strong types as follows: 1 First, check
  183. // backwards before the opening paired bracket until the first strong type
  184. // (L, R, or sos) is found. If that first preceding strong type is opposite
  185. // the embedding direction, then set the type for both brackets in the pair
  186. // to that type. 2 Otherwise, set the type for both brackets in the pair to
  187. // the embedding direction.
  188. //
  189. // o [ o ] e -> o o o o e
  190. //
  191. // o [ o NI ] o -> o o o NI o o
  192. //
  193. // e [ o ] o -> e e o e o
  194. //
  195. // e [ o ] e -> e e o e e
  196. //
  197. // e ( o [ o ] NI ) e -> e e o o o o NI e e
  198. //
  199. // d Otherwise, do not set the type for the current bracket pair. Note that
  200. // if the enclosed text contains no strong types the paired brackets will
  201. // both resolve to the same level when resolved individually using rules N1
  202. // and N2.
  203. //
  204. // e ( NI ) o -> e ( NI ) o
  205. // getStrongTypeN0 maps character's directional code to strong type as required
  206. // by rule N0.
  207. //
  208. // TODO: have separate type for "strong" directionality.
  209. func (p *bracketPairer) getStrongTypeN0(index int) Class {
  210. switch p.codesIsolatedRun[index] {
  211. // in the scope of N0, number types are treated as R
  212. case EN, AN, AL, R:
  213. return R
  214. case L:
  215. return L
  216. default:
  217. return ON
  218. }
  219. }
  220. // classifyPairContent reports the strong types contained inside a Bracket Pair,
  221. // assuming the given embedding direction.
  222. //
  223. // It returns ON if no strong type is found. If a single strong type is found,
  224. // it returns this type. Otherwise it returns the embedding direction.
  225. //
  226. // TODO: use separate type for "strong" directionality.
  227. func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {
  228. dirOpposite := ON
  229. for i := loc.opener + 1; i < loc.closer; i++ {
  230. dir := p.getStrongTypeN0(i)
  231. if dir == ON {
  232. continue
  233. }
  234. if dir == dirEmbed {
  235. return dir // type matching embedding direction found
  236. }
  237. dirOpposite = dir
  238. }
  239. // return ON if no strong type found, or class opposite to dirEmbed
  240. return dirOpposite
  241. }
  242. // classBeforePair determines which strong types are present before a Bracket
  243. // Pair. Return R or L if strong type found, otherwise ON.
  244. func (p *bracketPairer) classBeforePair(loc bracketPair) Class {
  245. for i := loc.opener - 1; i >= 0; i-- {
  246. if dir := p.getStrongTypeN0(i); dir != ON {
  247. return dir
  248. }
  249. }
  250. // no strong types found, return sos
  251. return p.sos
  252. }
  253. // assignBracketType implements rule N0 for a single bracket pair.
  254. func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) {
  255. // rule "N0, a", inspect contents of pair
  256. dirPair := p.classifyPairContent(loc, dirEmbed)
  257. // dirPair is now L, R, or N (no strong type found)
  258. // the following logical tests are performed out of order compared to
  259. // the statement of the rules but yield the same results
  260. if dirPair == ON {
  261. return // case "d" - nothing to do
  262. }
  263. if dirPair != dirEmbed {
  264. // case "c": strong type found, opposite - check before (c.1)
  265. dirPair = p.classBeforePair(loc)
  266. if dirPair == dirEmbed || dirPair == ON {
  267. // no strong opposite type found before - use embedding (c.2)
  268. dirPair = dirEmbed
  269. }
  270. }
  271. // else: case "b", strong type found matching embedding,
  272. // no explicit action needed, as dirPair is already set to embedding
  273. // direction
  274. // set the bracket types to the type found
  275. p.setBracketsToType(loc, dirPair, initialTypes)
  276. }
  277. func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) {
  278. p.codesIsolatedRun[loc.opener] = dirPair
  279. p.codesIsolatedRun[loc.closer] = dirPair
  280. for i := loc.opener + 1; i < loc.closer; i++ {
  281. index := p.indexes[i]
  282. if initialTypes[index] != NSM {
  283. break
  284. }
  285. p.codesIsolatedRun[i] = dirPair
  286. }
  287. for i := loc.closer + 1; i < len(p.indexes); i++ {
  288. index := p.indexes[i]
  289. if initialTypes[index] != NSM {
  290. break
  291. }
  292. p.codesIsolatedRun[i] = dirPair
  293. }
  294. }
  295. // resolveBrackets implements rule N0 for a list of pairs.
  296. func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) {
  297. for _, loc := range p.pairPositions {
  298. p.assignBracketType(loc, dirEmbed, initialTypes)
  299. }
  300. }
上海开阖软件有限公司 沪ICP备12045867号-1