251 lines
6.4 KiB
Go
Raw Normal View History

// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levenshtein2
import (
"fmt"
"math"
)
// SinkState is the state id 0, typed as uint32.
// NOTE(review): presumably this is the dead state from which no match can be
// reached (CanMatch reports false for state 0) — confirm against callers.
const SinkState uint32 = 0
// DFA is a table-driven automaton over bytes: every state has one outgoing
// transition per byte value and an associated Distance.
type DFA struct {
// transitions[s][b] is the state reached from state s after consuming byte b.
transitions [][256]uint32
// distances[s] is the Distance reported for state s.
distances []Distance
// initState is the state from which evaluation starts.
initState int
// ed is the value handed to Utf8DFABuilder.build.
// NOTE(review): presumably the maximum edit distance the automaton was
// built for — confirm.
ed uint8
}
// initialState returns the id of the state from which evaluation starts.
func (d *DFA) initialState() int {
	start := d.initState
	return start
}
// distance returns the Levenshtein distance recorded for the state stateId.
// It panics if stateId is not a valid index into the distance table.
func (d *DFA) distance(stateId int) Distance {
	table := d.distances
	return table[stateId]
}
// numStates returns how many states the `DFA` has, i.e. the number of rows in
// its transition table.
func (d *DFA) numStates() int {
	count := len(d.transitions)
	return count
}
// transition returns the state reached from fromState after consuming the
// byte b. It panics if fromState is not a valid state index.
func (d *DFA) transition(fromState int, b uint8) int {
	row := &d.transitions[fromState]
	return int(row[b])
}
// eval runs the DFA over bytes, starting from the initial state, and returns
// the distance associated with the state reached after the last byte. For an
// empty input this is the distance of the initial state.
func (d *DFA) eval(bytes []uint8) Distance {
	cur := d.initialState()
	for i := 0; i < len(bytes); i++ {
		cur = d.transition(cur, bytes[i])
	}
	return d.distance(cur)
}
// Start returns the id of the DFA's initial state.
func (d *DFA) Start() int {
	return d.initialState()
}
// IsMatch reports whether the distance recorded for state is of type Exact.
func (d *DFA) IsMatch(state int) bool {
	switch d.distance(state).(type) {
	case Exact:
		return true
	default:
		return false
	}
}
// CanMatch reports whether state is a valid state id other than 0, i.e. it
// lies strictly between 0 and the number of states.
func (d *DFA) CanMatch(state int) bool {
	if state <= 0 {
		return false
	}
	return state < d.numStates()
}
// Accept returns the state reached from state after consuming the byte b.
func (d *DFA) Accept(state int, b byte) int {
	next := d.transition(state, b)
	return next
}
// WillAlwaysMatch reports whether every continuation from the given state is
// guaranteed to end in a matching state. This implementation performs no such
// analysis and answers false for every state.
func (d *DFA) WillAlwaysMatch(state int) bool {
	return false
}
// fill overwrites every element of dest with val. A nil or empty slice is a
// no-op.
func fill(dest []uint32, val uint32) {
	for i := 0; i < len(dest); i++ {
		dest[i] = val
	}
}
// fillTransitions points all 256 byte transitions of the row dest at val.
func fillTransitions(dest *[256]uint32, val uint32) {
	for i := 0; i < len(dest); i++ {
		dest[i] = val
	}
}
// Utf8DFAStateBuilder is the handle returned by Utf8DFABuilder.addState; it is
// used to add explicit, per-character transitions out of a single state.
type Utf8DFAStateBuilder struct {
// dfaBuilder is the builder whose tables this handle mutates.
dfaBuilder *Utf8DFABuilder
// stateID is the allocated id of the state being described.
stateID uint32
// defaultSuccessor[k] is the allocated state from which consuming k more
// bytes (whatever they are) reaches the default successor;
// defaultSuccessor[0] is the default successor itself.
defaultSuccessor []uint32
}
// addTransitionID records, directly in the builder's transition table, that
// the allocated state fromStateID moves to toStateID on byte b.
func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8, toStateID uint32) {
	row := &sb.dfaBuilder.transitions[fromStateID]
	row[b] = toStateID
}
// addTransition makes the state described by sb move to the state toStateID
// when the character in is read, by threading the character's UTF-8 bytes
// through the byte-level transition table.
//
// Every byte but the last leads to an intermediate state. If the current
// transition on that byte still points at the shared default-successor chain,
// a private intermediate state is allocated first (initialised to continue
// along the default chain), so that the default path taken by other
// characters sharing the same leading bytes is left intact. The last byte
// points at the allocated id of original(toStateID).
func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) {
	encoded := []byte(string(in))
	last := len(encoded) - 1
	builder := sb.dfaBuilder

	cur := sb.stateID
	for pos := 0; pos < last; pos++ {
		lead := encoded[pos]
		// Number of bytes still to be consumed after this one.
		remaining := last - pos
		sharedDefault := sb.defaultSuccessor[remaining]
		next := builder.transitions[cur][lead]
		if next == sharedDefault {
			// Still on the shared default chain: fork a private state.
			next = builder.allocate()
			fillTransitions(&builder.transitions[next], sb.defaultSuccessor[remaining-1])
		}
		sb.addTransitionID(cur, lead, next)
		cur = next
	}

	target := builder.getOrAllocate(original(toStateID))
	sb.addTransitionID(cur, encoded[last], target)
}
// Utf8StateId is the sparse identifier of a state of the byte-level automaton,
// as produced by original and predecessor. Utf8DFABuilder.getOrAllocate maps
// it to the dense id used to index the transition and distance tables.
type Utf8StateId uint32
// original returns the Utf8StateId standing for the character-level state
// stateId itself, i.e. its predecessor at zero steps (stateId*4).
func original(stateId uint32) Utf8StateId {
	return Utf8StateId(stateId * 4)
}
// predecessor returns the Utf8StateId of the state lying numSteps bytes before
// the character-level state stateId. Each character-level state owns a group
// of four consecutive ids (stateId*4 .. stateId*4+3); numSteps selects the
// slot within that group.
func predecessor(stateId uint32, numSteps uint8) Utf8StateId {
	base := stateId << 2
	return Utf8StateId(base + uint32(numSteps))
}
// Utf8DFABuilder makes it possible to define a DFA whose transitions are
// described in terms of unicode characters, and to build from it a `DFA`
// that operates on UTF-8 encoded bytes.
type Utf8DFABuilder struct {
// index maps a Utf8StateId to the dense id handed out by allocate;
// math.MaxUint32 marks an id that has not been allocated yet.
index []uint32
// distances[s] is the Distance of the allocated state s.
distances []Distance
// transitions[s][b] is the successor of the allocated state s on byte b.
transitions [][256]uint32
// initialState is the allocated id recorded by setInitialState.
initialState uint32
// numStates counts the states allocated so far.
numStates uint32
// maxNumStates is the bound addState checks incoming state ids against.
maxNumStates uint32
}
// withMaxStates returns an empty builder sized for maxStates character-level
// states: the id index holds maxStates*2+100 slots, all marked unallocated
// (math.MaxUint32), and the distance and transition tables are empty with
// capacity maxStates.
func withMaxStates(maxStates uint32) *Utf8DFABuilder {
	index := make([]uint32, maxStates*2+100)
	for i := 0; i < len(index); i++ {
		index[i] = math.MaxUint32
	}
	return &Utf8DFABuilder{
		index:        index,
		distances:    make([]Distance, 0, maxStates),
		transitions:  make([][256]uint32, 0, maxStates),
		maxNumStates: maxStates,
	}
}
// allocate appends a fresh state to the builder and returns its dense id.
// The new state starts with distance Atleast{d: 255} and with every byte
// transition set to 0.
func (dfab *Utf8DFABuilder) allocate() uint32 {
	id := dfab.numStates
	dfab.transitions = append(dfab.transitions, [256]uint32{})
	dfab.distances = append(dfab.distances, Atleast{d: 255})
	dfab.numStates = id + 1
	return id
}
// getOrAllocate returns the dense id associated with state, allocating a new
// state through allocate the first time a given Utf8StateId is seen.
//
// The id index grows on demand. Slots added by a growth are explicitly marked
// unallocated (math.MaxUint32): a freshly made slice is zero-filled, and a
// zero slot would otherwise be read as "already mapped to dense state 0".
// The new length is always greater than state, so the lookup that follows is
// in range even when the index was empty.
func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 {
	if int(state) >= len(dfab.index) {
		grown := make([]uint32, int(state)*2+1)
		kept := copy(grown, dfab.index)
		for i := kept; i < len(grown); i++ {
			grown[i] = math.MaxUint32
		}
		dfab.index = grown
	}
	if id := dfab.index[state]; id != math.MaxUint32 {
		return id
	}
	id := dfab.allocate()
	dfab.index[state] = id
	return id
}
// setInitialState marks the character-level state iState as the start state,
// allocating its dense id if it has not been seen before.
func (dfab *Utf8DFABuilder) setInitialState(iState uint32) {
	dfab.initialState = dfab.getOrAllocate(original(iState))
}
// build returns a DFA backed by the builder's current tables and initial
// state, tagged with ed. The tables are shared with the builder, not copied.
func (dfab *Utf8DFABuilder) build(ed uint8) *DFA {
	dfa := new(DFA)
	dfa.ed = ed
	dfa.initState = int(dfab.initialState)
	dfa.distances = dfab.distances
	dfa.transitions = dfab.transitions
	return dfa
}
// addState registers the character-level state `state` with the given
// distance and wires up its default behaviour: reading any one UTF-8 encoded
// character leads to the state default_suc_orig. It returns a
// Utf8DFAStateBuilder through which character-specific transitions can then
// be added, or an error when state exceeds the builder's maxNumStates.
//
// NOTE(review): the bound check accepts state == maxNumStates; confirm
// whether ids are meant to be strictly below maxNumStates.
func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32,
	distance Distance) (*Utf8DFAStateBuilder, error) {
	if state > dfab.maxNumStates {
		return nil, fmt.Errorf("State id is larger than maxNumStates")
	}

	stateID := dfab.getOrAllocate(original(state))
	dfab.distances[stateID] = distance

	// chain[k] is the state from which accepting k bytes (whatever the bytes
	// are) leads to `default_suc_orig`; chain[0] is that state itself.
	chain := make([]uint32, 4)
	chain[0] = dfab.getOrAllocate(original(default_suc_orig))
	for k := 1; k < len(chain); k++ {
		chain[k] = dfab.getOrAllocate(predecessor(default_suc_orig, uint8(k)))
		fillTransitions(&dfab.transitions[chain[k]], chain[k-1])
	}

	// Dispatch on the first byte: bytes in [bounds[k], bounds[k+1]) go to
	// chain[k]. Bytes 192-223, 224-239 and 240-255 are treated as the lead
	// bytes of 2-, 3- and 4-byte encoded chars; everything below 192 goes
	// straight to the default successor.
	bounds := [...]int{0, 192, 224, 240, 256}
	row := &dfab.transitions[stateID]
	for k := range chain {
		fill(row[bounds[k]:bounds[k+1]], chain[k])
	}

	sb := &Utf8DFAStateBuilder{
		dfaBuilder:       dfab,
		stateID:          stateID,
		defaultSuccessor: chain,
	}
	return sb, nil
}