|
- // Copyright (c) 2018 Couchbase, Inc.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
-
- package levenshtein2
-
- import (
- "crypto/md5"
- "encoding/json"
- "fmt"
- "math"
- )
-
// ParametricState is a state of the parametric automaton, described by a
// shape identifier and an offset. shapeID 0 denotes the dead end (see
// isDeadEnd); offset is subtracted from the query length when distances
// are looked up.
type ParametricState struct {
	shapeID, offset uint32
}
-
- func newParametricState() ParametricState {
- return ParametricState{}
- }
-
- func (ps *ParametricState) isDeadEnd() bool {
- return ps.shapeID == 0
- }
-
// Transition describes a move of the parametric automaton: the shape that
// is reached (destShapeID) and the amount added to the source state's
// offset (deltaOffset).
type Transition struct {
	destShapeID, deltaOffset uint32
}
-
- func (t *Transition) apply(state ParametricState) ParametricState {
- ps := ParametricState{
- shapeID: t.destShapeID}
- // don't need any offset if we are in the dead state,
- // this ensures we have only one dead state.
- if t.destShapeID != 0 {
- ps.offset = state.offset + t.deltaOffset
- }
-
- return ps
- }
-
- type ParametricStateIndex struct {
- stateIndex []uint32
- stateQueue []ParametricState
- numOffsets uint32
- }
-
- func newParametricStateIndex(queryLen,
- numParamState uint32) ParametricStateIndex {
- numOffsets := queryLen + 1
- if numParamState == 0 {
- numParamState = numOffsets
- }
- maxNumStates := numParamState * numOffsets
- psi := ParametricStateIndex{
- stateIndex: make([]uint32, maxNumStates),
- stateQueue: make([]ParametricState, 0, 150),
- numOffsets: numOffsets,
- }
-
- for i := uint32(0); i < maxNumStates; i++ {
- psi.stateIndex[i] = math.MaxUint32
- }
- return psi
- }
-
- func (psi *ParametricStateIndex) numStates() int {
- return len(psi.stateQueue)
- }
-
- func (psi *ParametricStateIndex) maxNumStates() int {
- return len(psi.stateIndex)
- }
-
- func (psi *ParametricStateIndex) get(stateID uint32) ParametricState {
- return psi.stateQueue[stateID]
- }
-
- func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 {
- bucket := ps.shapeID*psi.numOffsets + ps.offset
- if bucket < uint32(len(psi.stateIndex)) &&
- psi.stateIndex[bucket] != math.MaxUint32 {
- return psi.stateIndex[bucket]
- }
- nState := uint32(len(psi.stateQueue))
- psi.stateQueue = append(psi.stateQueue, ps)
-
- psi.stateIndex[bucket] = nState
- return nState
- }
-
- type ParametricDFA struct {
- distance []uint8
- transitions []Transition
- maxDistance uint8
- transitionStride uint32
- diameter uint32
- }
-
- func (pdfa *ParametricDFA) initialState() ParametricState {
- return ParametricState{shapeID: 1}
- }
-
- // Returns true iff whatever characters come afterward,
- // we will never reach a shorter distance
- func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool {
- if state.isDeadEnd() {
- return true
- }
-
- remOffset := queryLen - state.offset
- if remOffset < pdfa.diameter {
- stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:]
- prefixDistance := stateDistances[remOffset]
- if prefixDistance > pdfa.maxDistance {
- return false
- }
-
- for _, d := range stateDistances {
- if d < prefixDistance {
- return false
- }
- }
- return true
- }
- return false
- }
-
- func (pdfa *ParametricDFA) numStates() int {
- return len(pdfa.transitions) / int(pdfa.transitionStride)
- }
-
// min returns the smaller of x and y.
func min(x, y uint32) uint32 {
	if y <= x {
		return y
	}
	return x
}
-
- func (pdfa *ParametricDFA) transition(state ParametricState,
- chi uint32) Transition {
- return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi]
- }
-
- func (pdfa *ParametricDFA) getDistance(state ParametricState,
- qLen uint32) Distance {
- remainingOffset := qLen - state.offset
- if state.isDeadEnd() || remainingOffset >= pdfa.diameter {
- return Atleast{d: pdfa.maxDistance + 1}
- }
- dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)]
- if dist > pdfa.maxDistance {
- return Atleast{d: dist}
- }
- return Exact{d: dist}
- }
-
- func (pdfa *ParametricDFA) computeDistance(left, right string) Distance {
- state := pdfa.initialState()
- leftChars := []rune(left)
- for _, chr := range []rune(right) {
- start := state.offset
- stop := min(start+pdfa.diameter, uint32(len(leftChars)))
- chi := characteristicVector(leftChars[start:stop], chr)
- transition := pdfa.transition(state, uint32(chi))
- state = transition.apply(state)
- if state.isDeadEnd() {
- return Atleast{d: pdfa.maxDistance + 1}
- }
- }
- return pdfa.getDistance(state, uint32(len(left)))
- }
-
- func (pdfa *ParametricDFA) buildDfa(query string, distance uint8,
- prefix bool) (*DFA, error) {
- qLen := uint32(len([]rune(query)))
- alphabet := queryChars(query)
-
- psi := newParametricStateIndex(qLen, uint32(pdfa.numStates()))
- maxNumStates := psi.maxNumStates()
- deadEndStateID := psi.getOrAllocate(newParametricState())
- if deadEndStateID != 0 {
- return nil, fmt.Errorf("Invalid dead end state")
- }
-
- initialStateID := psi.getOrAllocate(pdfa.initialState())
- dfaBuilder := withMaxStates(uint32(maxNumStates))
- mask := uint32((1 << pdfa.diameter) - 1)
-
- var stateID int
- for stateID = 0; stateID < StateLimit; stateID++ {
- if stateID == psi.numStates() {
- break
- }
- state := psi.get(uint32(stateID))
- if prefix && pdfa.isPrefixSink(state, qLen) {
- distance := pdfa.getDistance(state, qLen)
- dfaBuilder.addState(uint32(stateID), uint32(stateID), distance)
- } else {
- transition := pdfa.transition(state, 0)
- defSuccessor := transition.apply(state)
- defSuccessorID := psi.getOrAllocate(defSuccessor)
- distance := pdfa.getDistance(state, qLen)
- stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance)
-
- if err != nil {
- return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err)
- }
-
- alphabet.resetNext()
- chr, cv, err := alphabet.next()
- for err == nil {
- chi := cv.shiftAndMask(state.offset, mask)
-
- transition := pdfa.transition(state, chi)
-
- destState := transition.apply(state)
-
- destStateID := psi.getOrAllocate(destState)
-
- stateBuilder.addTransition(chr, destStateID)
-
- chr, cv, err = alphabet.next()
- }
- }
- }
-
- if stateID == StateLimit {
- return nil, ErrTooManyStates
- }
-
- dfaBuilder.setInitialState(initialStateID)
- return dfaBuilder.build(distance), nil
- }
-
- func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) {
- lookUp := newHash()
- lookUp.getOrAllocate(*newMultiState())
- initialState := nfa.initialStates()
- lookUp.getOrAllocate(*initialState)
-
- maxDistance := nfa.maxDistance()
- msDiameter := nfa.msDiameter()
-
- numChi := 1 << msDiameter
- chiValues := make([]uint64, numChi)
- for i := 0; i < numChi; i++ {
- chiValues[i] = uint64(i)
- }
-
- transitions := make([]Transition, 0, numChi*int(msDiameter))
- var stateID int
- for stateID = 0; stateID < StateLimit; stateID++ {
- if stateID == len(lookUp.items) {
- break
- }
-
- for _, chi := range chiValues {
- destMs := newMultiState()
-
- ms := lookUp.getFromID(stateID)
-
- nfa.transition(ms, destMs, chi)
-
- translation := destMs.normalize()
-
- destID := lookUp.getOrAllocate(*destMs)
-
- transitions = append(transitions, Transition{
- destShapeID: uint32(destID),
- deltaOffset: translation,
- })
- }
- }
-
- if stateID == StateLimit {
- return nil, ErrTooManyStates
- }
-
- ns := len(lookUp.items)
- diameter := int(msDiameter)
-
- distances := make([]uint8, 0, diameter*ns)
- for stateID := 0; stateID < ns; stateID++ {
- ms := lookUp.getFromID(stateID)
- for offset := 0; offset < diameter; offset++ {
- dist := nfa.multistateDistance(ms, uint32(offset))
- distances = append(distances, dist.distance())
- }
- }
-
- return &ParametricDFA{
- diameter: uint32(msDiameter),
- transitions: transitions,
- maxDistance: maxDistance,
- transitionStride: uint32(numChi),
- distance: distances,
- }, nil
- }
-
- type hash struct {
- index map[[16]byte]int
- items []MultiState
- }
-
- func newHash() *hash {
- return &hash{
- index: make(map[[16]byte]int, 100),
- items: make([]MultiState, 0, 100),
- }
- }
-
- func (h *hash) getOrAllocate(m MultiState) int {
- size := len(h.items)
- var exists bool
- var pos int
- md5 := getHash(&m)
- if pos, exists = h.index[md5]; !exists {
- h.index[md5] = size
- pos = size
- h.items = append(h.items, m)
- }
- return pos
- }
-
- func (h *hash) getFromID(id int) *MultiState {
- return &h.items[id]
- }
-
- func getHash(ms *MultiState) [16]byte {
- msBytes := []byte{}
- for _, state := range ms.states {
- jsonBytes, _ := json.Marshal(&state)
- msBytes = append(msBytes, jsonBytes...)
- }
- return md5.Sum(msBytes)
- }
|