|
- // Copyright (c) 2017 Couchbase, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
-
- package regexp
-
- import (
- "regexp/syntax"
- "unicode"
-
- unicode_utf8 "unicode/utf8"
-
- "github.com/couchbase/vellum/utf8"
- )
-
- type compiler struct {
- sizeLimit uint
- insts prog
- instsPool []inst
-
- sequences utf8.Sequences
- rangeStack utf8.RangeStack
- startBytes []byte
- endBytes []byte
- }
-
- func newCompiler(sizeLimit uint) *compiler {
- return &compiler{
- sizeLimit: sizeLimit,
- startBytes: make([]byte, unicode_utf8.UTFMax),
- endBytes: make([]byte, unicode_utf8.UTFMax),
- }
- }
-
- func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
- err := c.c(ast)
- if err != nil {
- return nil, err
- }
- inst := c.allocInst()
- inst.op = OpMatch
- c.insts = append(c.insts, inst)
- return c.insts, nil
- }
-
- func (c *compiler) c(ast *syntax.Regexp) (err error) {
- if ast.Flags&syntax.NonGreedy > 1 {
- return ErrNoLazy
- }
-
- switch ast.Op {
- case syntax.OpEndLine, syntax.OpBeginLine,
- syntax.OpBeginText, syntax.OpEndText:
- return ErrNoEmpty
- case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
- return ErrNoWordBoundary
- case syntax.OpEmptyMatch:
- return nil
- case syntax.OpLiteral:
- for _, r := range ast.Rune {
- if ast.Flags&syntax.FoldCase > 0 {
- next := syntax.Regexp{
- Op: syntax.OpCharClass,
- Flags: ast.Flags & syntax.FoldCase,
- Rune0: [2]rune{r, r},
- }
- next.Rune = next.Rune0[0:2]
- return c.c(&next)
- }
- c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
- r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
- if err != nil {
- return err
- }
- for _, seq := range c.sequences {
- c.compileUtf8Ranges(seq)
- }
- }
- case syntax.OpAnyChar:
- next := syntax.Regexp{
- Op: syntax.OpCharClass,
- Flags: ast.Flags & syntax.FoldCase,
- Rune0: [2]rune{0, unicode.MaxRune},
- }
- next.Rune = next.Rune0[:2]
- return c.c(&next)
- case syntax.OpAnyCharNotNL:
- next := syntax.Regexp{
- Op: syntax.OpCharClass,
- Flags: ast.Flags & syntax.FoldCase,
- Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune},
- }
- return c.c(&next)
- case syntax.OpCharClass:
- return c.compileClass(ast)
- case syntax.OpCapture:
- return c.c(ast.Sub[0])
- case syntax.OpConcat:
- for _, sub := range ast.Sub {
- err := c.c(sub)
- if err != nil {
- return err
- }
- }
- return nil
- case syntax.OpAlternate:
- if len(ast.Sub) == 0 {
- return nil
- }
- jmpsToEnd := make([]uint, 0, len(ast.Sub)-1)
- // does not handle last entry
- for i := 0; i < len(ast.Sub)-1; i++ {
- sub := ast.Sub[i]
- split := c.emptySplit()
- j1 := c.top()
- err := c.c(sub)
- if err != nil {
- return err
- }
- jmpsToEnd = append(jmpsToEnd, c.emptyJump())
- j2 := c.top()
- c.setSplit(split, j1, j2)
- }
- // handle last entry
- err := c.c(ast.Sub[len(ast.Sub)-1])
- if err != nil {
- return err
- }
- end := uint(len(c.insts))
- for _, jmpToEnd := range jmpsToEnd {
- c.setJump(jmpToEnd, end)
- }
- case syntax.OpQuest:
- split := c.emptySplit()
- j1 := c.top()
- err := c.c(ast.Sub[0])
- if err != nil {
- return err
- }
- j2 := c.top()
- c.setSplit(split, j1, j2)
-
- case syntax.OpStar:
- j1 := c.top()
- split := c.emptySplit()
- j2 := c.top()
- err := c.c(ast.Sub[0])
- if err != nil {
- return err
- }
- jmp := c.emptyJump()
- j3 := uint(len(c.insts))
-
- c.setJump(jmp, j1)
- c.setSplit(split, j2, j3)
-
- case syntax.OpPlus:
- j1 := c.top()
- err := c.c(ast.Sub[0])
- if err != nil {
- return err
- }
- split := c.emptySplit()
- j2 := c.top()
- c.setSplit(split, j1, j2)
-
- case syntax.OpRepeat:
- if ast.Max == -1 {
- for i := 0; i < ast.Min; i++ {
- err := c.c(ast.Sub[0])
- if err != nil {
- return err
- }
- }
- next := syntax.Regexp{
- Op: syntax.OpStar,
- Flags: ast.Flags,
- Sub: ast.Sub,
- Sub0: ast.Sub0,
- Rune: ast.Rune,
- Rune0: ast.Rune0,
- }
- return c.c(&next)
- }
- for i := 0; i < ast.Min; i++ {
- err := c.c(ast.Sub[0])
- if err != nil {
- return err
- }
- }
- splits := make([]uint, 0, ast.Max-ast.Min)
- starts := make([]uint, 0, ast.Max-ast.Min)
- for i := ast.Min; i < ast.Max; i++ {
- splits = append(splits, c.emptySplit())
- starts = append(starts, uint(len(c.insts)))
- err := c.c(ast.Sub[0])
- if err != nil {
- return err
- }
- }
- end := uint(len(c.insts))
- for i := 0; i < len(splits); i++ {
- c.setSplit(splits[i], starts[i], end)
- }
-
- }
-
- return c.checkSize()
- }
-
- func (c *compiler) checkSize() error {
- if uint(len(c.insts)*instSize) > c.sizeLimit {
- return ErrCompiledTooBig
- }
- return nil
- }
-
- func (c *compiler) compileClass(ast *syntax.Regexp) error {
- if len(ast.Rune) == 0 {
- return nil
- }
- jmps := make([]uint, 0, len(ast.Rune)-2)
- // does not do last pair
- for i := 0; i < len(ast.Rune)-2; i += 2 {
- rstart := ast.Rune[i]
- rend := ast.Rune[i+1]
-
- split := c.emptySplit()
- j1 := c.top()
- err := c.compileClassRange(rstart, rend)
- if err != nil {
- return err
- }
- jmps = append(jmps, c.emptyJump())
- j2 := c.top()
- c.setSplit(split, j1, j2)
- }
- // handle last pair
- rstart := ast.Rune[len(ast.Rune)-2]
- rend := ast.Rune[len(ast.Rune)-1]
- err := c.compileClassRange(rstart, rend)
- if err != nil {
- return err
- }
- end := c.top()
- for _, jmp := range jmps {
- c.setJump(jmp, end)
- }
- return nil
- }
-
- func (c *compiler) compileClassRange(startR, endR rune) (err error) {
- c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
- startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
- if err != nil {
- return err
- }
- jmps := make([]uint, 0, len(c.sequences)-1)
- // does not do last entry
- for i := 0; i < len(c.sequences)-1; i++ {
- seq := c.sequences[i]
- split := c.emptySplit()
- j1 := c.top()
- c.compileUtf8Ranges(seq)
- jmps = append(jmps, c.emptyJump())
- j2 := c.top()
- c.setSplit(split, j1, j2)
- }
- // handle last entry
- c.compileUtf8Ranges(c.sequences[len(c.sequences)-1])
- end := c.top()
- for _, jmp := range jmps {
- c.setJump(jmp, end)
- }
-
- return nil
- }
-
- func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
- for _, r := range seq {
- inst := c.allocInst()
- inst.op = OpRange
- inst.rangeStart = r.Start
- inst.rangeEnd = r.End
- c.insts = append(c.insts, inst)
- }
- }
-
- func (c *compiler) emptySplit() uint {
- inst := c.allocInst()
- inst.op = OpSplit
- c.insts = append(c.insts, inst)
- return c.top() - 1
- }
-
- func (c *compiler) emptyJump() uint {
- inst := c.allocInst()
- inst.op = OpJmp
- c.insts = append(c.insts, inst)
- return c.top() - 1
- }
-
- func (c *compiler) setSplit(i, pc1, pc2 uint) {
- split := c.insts[i]
- split.splitA = pc1
- split.splitB = pc2
- }
-
- func (c *compiler) setJump(i, pc uint) {
- jmp := c.insts[i]
- jmp.to = pc
- }
-
- func (c *compiler) top() uint {
- return uint(len(c.insts))
- }
-
- func (c *compiler) allocInst() *inst {
- if len(c.instsPool) <= 0 {
- c.instsPool = make([]inst, 16)
- }
- inst := &c.instsPool[0]
- c.instsPool = c.instsPool[1:]
- return inst
- }
|