本站源代码
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

336 lines
7.2KB

  1. // Copyright (c) 2017 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package regexp
  15. import (
  16. "regexp/syntax"
  17. "unicode"
  18. unicode_utf8 "unicode/utf8"
  19. "github.com/couchbase/vellum/utf8"
  20. )
  21. type compiler struct {
  22. sizeLimit uint
  23. insts prog
  24. instsPool []inst
  25. sequences utf8.Sequences
  26. rangeStack utf8.RangeStack
  27. startBytes []byte
  28. endBytes []byte
  29. }
  30. func newCompiler(sizeLimit uint) *compiler {
  31. return &compiler{
  32. sizeLimit: sizeLimit,
  33. startBytes: make([]byte, unicode_utf8.UTFMax),
  34. endBytes: make([]byte, unicode_utf8.UTFMax),
  35. }
  36. }
  37. func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
  38. err := c.c(ast)
  39. if err != nil {
  40. return nil, err
  41. }
  42. inst := c.allocInst()
  43. inst.op = OpMatch
  44. c.insts = append(c.insts, inst)
  45. return c.insts, nil
  46. }
  47. func (c *compiler) c(ast *syntax.Regexp) (err error) {
  48. if ast.Flags&syntax.NonGreedy > 1 {
  49. return ErrNoLazy
  50. }
  51. switch ast.Op {
  52. case syntax.OpEndLine, syntax.OpBeginLine,
  53. syntax.OpBeginText, syntax.OpEndText:
  54. return ErrNoEmpty
  55. case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
  56. return ErrNoWordBoundary
  57. case syntax.OpEmptyMatch:
  58. return nil
  59. case syntax.OpLiteral:
  60. for _, r := range ast.Rune {
  61. if ast.Flags&syntax.FoldCase > 0 {
  62. next := syntax.Regexp{
  63. Op: syntax.OpCharClass,
  64. Flags: ast.Flags & syntax.FoldCase,
  65. Rune0: [2]rune{r, r},
  66. }
  67. next.Rune = next.Rune0[0:2]
  68. return c.c(&next)
  69. }
  70. c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
  71. r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
  72. if err != nil {
  73. return err
  74. }
  75. for _, seq := range c.sequences {
  76. c.compileUtf8Ranges(seq)
  77. }
  78. }
  79. case syntax.OpAnyChar:
  80. next := syntax.Regexp{
  81. Op: syntax.OpCharClass,
  82. Flags: ast.Flags & syntax.FoldCase,
  83. Rune0: [2]rune{0, unicode.MaxRune},
  84. }
  85. next.Rune = next.Rune0[:2]
  86. return c.c(&next)
  87. case syntax.OpAnyCharNotNL:
  88. next := syntax.Regexp{
  89. Op: syntax.OpCharClass,
  90. Flags: ast.Flags & syntax.FoldCase,
  91. Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune},
  92. }
  93. return c.c(&next)
  94. case syntax.OpCharClass:
  95. return c.compileClass(ast)
  96. case syntax.OpCapture:
  97. return c.c(ast.Sub[0])
  98. case syntax.OpConcat:
  99. for _, sub := range ast.Sub {
  100. err := c.c(sub)
  101. if err != nil {
  102. return err
  103. }
  104. }
  105. return nil
  106. case syntax.OpAlternate:
  107. if len(ast.Sub) == 0 {
  108. return nil
  109. }
  110. jmpsToEnd := make([]uint, 0, len(ast.Sub)-1)
  111. // does not handle last entry
  112. for i := 0; i < len(ast.Sub)-1; i++ {
  113. sub := ast.Sub[i]
  114. split := c.emptySplit()
  115. j1 := c.top()
  116. err := c.c(sub)
  117. if err != nil {
  118. return err
  119. }
  120. jmpsToEnd = append(jmpsToEnd, c.emptyJump())
  121. j2 := c.top()
  122. c.setSplit(split, j1, j2)
  123. }
  124. // handle last entry
  125. err := c.c(ast.Sub[len(ast.Sub)-1])
  126. if err != nil {
  127. return err
  128. }
  129. end := uint(len(c.insts))
  130. for _, jmpToEnd := range jmpsToEnd {
  131. c.setJump(jmpToEnd, end)
  132. }
  133. case syntax.OpQuest:
  134. split := c.emptySplit()
  135. j1 := c.top()
  136. err := c.c(ast.Sub[0])
  137. if err != nil {
  138. return err
  139. }
  140. j2 := c.top()
  141. c.setSplit(split, j1, j2)
  142. case syntax.OpStar:
  143. j1 := c.top()
  144. split := c.emptySplit()
  145. j2 := c.top()
  146. err := c.c(ast.Sub[0])
  147. if err != nil {
  148. return err
  149. }
  150. jmp := c.emptyJump()
  151. j3 := uint(len(c.insts))
  152. c.setJump(jmp, j1)
  153. c.setSplit(split, j2, j3)
  154. case syntax.OpPlus:
  155. j1 := c.top()
  156. err := c.c(ast.Sub[0])
  157. if err != nil {
  158. return err
  159. }
  160. split := c.emptySplit()
  161. j2 := c.top()
  162. c.setSplit(split, j1, j2)
  163. case syntax.OpRepeat:
  164. if ast.Max == -1 {
  165. for i := 0; i < ast.Min; i++ {
  166. err := c.c(ast.Sub[0])
  167. if err != nil {
  168. return err
  169. }
  170. }
  171. next := syntax.Regexp{
  172. Op: syntax.OpStar,
  173. Flags: ast.Flags,
  174. Sub: ast.Sub,
  175. Sub0: ast.Sub0,
  176. Rune: ast.Rune,
  177. Rune0: ast.Rune0,
  178. }
  179. return c.c(&next)
  180. }
  181. for i := 0; i < ast.Min; i++ {
  182. err := c.c(ast.Sub[0])
  183. if err != nil {
  184. return err
  185. }
  186. }
  187. splits := make([]uint, 0, ast.Max-ast.Min)
  188. starts := make([]uint, 0, ast.Max-ast.Min)
  189. for i := ast.Min; i < ast.Max; i++ {
  190. splits = append(splits, c.emptySplit())
  191. starts = append(starts, uint(len(c.insts)))
  192. err := c.c(ast.Sub[0])
  193. if err != nil {
  194. return err
  195. }
  196. }
  197. end := uint(len(c.insts))
  198. for i := 0; i < len(splits); i++ {
  199. c.setSplit(splits[i], starts[i], end)
  200. }
  201. }
  202. return c.checkSize()
  203. }
  204. func (c *compiler) checkSize() error {
  205. if uint(len(c.insts)*instSize) > c.sizeLimit {
  206. return ErrCompiledTooBig
  207. }
  208. return nil
  209. }
  210. func (c *compiler) compileClass(ast *syntax.Regexp) error {
  211. if len(ast.Rune) == 0 {
  212. return nil
  213. }
  214. jmps := make([]uint, 0, len(ast.Rune)-2)
  215. // does not do last pair
  216. for i := 0; i < len(ast.Rune)-2; i += 2 {
  217. rstart := ast.Rune[i]
  218. rend := ast.Rune[i+1]
  219. split := c.emptySplit()
  220. j1 := c.top()
  221. err := c.compileClassRange(rstart, rend)
  222. if err != nil {
  223. return err
  224. }
  225. jmps = append(jmps, c.emptyJump())
  226. j2 := c.top()
  227. c.setSplit(split, j1, j2)
  228. }
  229. // handle last pair
  230. rstart := ast.Rune[len(ast.Rune)-2]
  231. rend := ast.Rune[len(ast.Rune)-1]
  232. err := c.compileClassRange(rstart, rend)
  233. if err != nil {
  234. return err
  235. }
  236. end := c.top()
  237. for _, jmp := range jmps {
  238. c.setJump(jmp, end)
  239. }
  240. return nil
  241. }
  242. func (c *compiler) compileClassRange(startR, endR rune) (err error) {
  243. c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
  244. startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
  245. if err != nil {
  246. return err
  247. }
  248. jmps := make([]uint, 0, len(c.sequences)-1)
  249. // does not do last entry
  250. for i := 0; i < len(c.sequences)-1; i++ {
  251. seq := c.sequences[i]
  252. split := c.emptySplit()
  253. j1 := c.top()
  254. c.compileUtf8Ranges(seq)
  255. jmps = append(jmps, c.emptyJump())
  256. j2 := c.top()
  257. c.setSplit(split, j1, j2)
  258. }
  259. // handle last entry
  260. c.compileUtf8Ranges(c.sequences[len(c.sequences)-1])
  261. end := c.top()
  262. for _, jmp := range jmps {
  263. c.setJump(jmp, end)
  264. }
  265. return nil
  266. }
  267. func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
  268. for _, r := range seq {
  269. inst := c.allocInst()
  270. inst.op = OpRange
  271. inst.rangeStart = r.Start
  272. inst.rangeEnd = r.End
  273. c.insts = append(c.insts, inst)
  274. }
  275. }
  276. func (c *compiler) emptySplit() uint {
  277. inst := c.allocInst()
  278. inst.op = OpSplit
  279. c.insts = append(c.insts, inst)
  280. return c.top() - 1
  281. }
  282. func (c *compiler) emptyJump() uint {
  283. inst := c.allocInst()
  284. inst.op = OpJmp
  285. c.insts = append(c.insts, inst)
  286. return c.top() - 1
  287. }
  288. func (c *compiler) setSplit(i, pc1, pc2 uint) {
  289. split := c.insts[i]
  290. split.splitA = pc1
  291. split.splitB = pc2
  292. }
  293. func (c *compiler) setJump(i, pc uint) {
  294. jmp := c.insts[i]
  295. jmp.to = pc
  296. }
  297. func (c *compiler) top() uint {
  298. return uint(len(c.insts))
  299. }
  300. func (c *compiler) allocInst() *inst {
  301. if len(c.instsPool) <= 0 {
  302. c.instsPool = make([]inst, 16)
  303. }
  304. inst := &c.instsPool[0]
  305. c.instsPool = c.instsPool[1:]
  306. return inst
  307. }
上海开阖软件有限公司 沪ICP备12045867号-1