Skip to content

Commit 9e80067

Browse files
Add trie for patterns (#3076)
## Why
A prefix tree data structure for patterns allows us to quickly look up which pattern, if any, matches a given path. This will be useful for required and enum field validation. Downstream validation PR: #3044

## Tests
Unit tests.
1 parent 862faa8 commit 9e80067

File tree

2 files changed

+367
-0
lines changed

2 files changed

+367
-0
lines changed

libs/dyn/pattern_trie.go

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
package dyn
2+
3+
import (
4+
"fmt"
5+
)
6+
7+
// TrieNode is a node in a prefix tree (trie) of [Pattern]s. The zero value is
// an empty trie: add patterns with [TrieNode.Insert] and then call
// [TrieNode.SearchPath] to check whether a given [Path] matches any of them.
// Both exact matches and wildcard matches are supported.
//
// Each node has up to three kinds of outgoing edges, one per kind of pattern
// component:
//  1. AnyKey: the "*" wildcard ([AnyKey]), which matches any map key.
//  2. AnyIndex: the "[*]" wildcard ([AnyIndex]), which matches any array index.
//  3. PathKey: any number of static map keys ([Key]), each leading to its own
//     child node.
//
// Note: AnyKey and PathKey may both be set on the same node. For example,
// adding both "foo.*.bar" and "foo.bar" to one trie is valid.
//
// Note: a node carrying both a key edge (PathKey or AnyKey) and an index edge
// (AnyIndex) is not supported by [TrieNode.SearchPath]. No validation is
// performed for this case because it is not expected to arise in practice,
// where a field is either a map or an array, but not both.
type TrieNode struct {
	// AnyKey is the child reached through an [AnyKey] component.
	// It is nil if no inserted pattern has "*" at this position.
	AnyKey *TrieNode

	// AnyIndex is the child reached through an [AnyIndex] component.
	// It is nil if no inserted pattern has "[*]" at this position.
	AnyIndex *TrieNode

	// PathKey holds the children reached through static [Key] components,
	// indexed by the key string. It is nil until the first key is inserted.
	PathKey map[string]*TrieNode

	// IsEnd reports whether a pattern ends at this node. If it is true, the
	// components on the way from the root to this node form a complete
	// pattern that was inserted into the trie.
	IsEnd bool
}
43+
44+
// Insert adds a pattern to the trie.
45+
func (t *TrieNode) Insert(pattern Pattern) error {
46+
// Empty pattern represents the root.
47+
if len(pattern) == 0 {
48+
t.IsEnd = true
49+
return nil
50+
}
51+
52+
current := t
53+
for i, component := range pattern {
54+
// Create next node based on component type
55+
var next *TrieNode
56+
switch c := component.(type) {
57+
case anyKeyComponent:
58+
if current.AnyKey == nil {
59+
current.AnyKey = &TrieNode{}
60+
}
61+
next = current.AnyKey
62+
63+
case anyIndexComponent:
64+
if current.AnyIndex == nil {
65+
current.AnyIndex = &TrieNode{}
66+
}
67+
next = current.AnyIndex
68+
69+
case pathComponent:
70+
if key := c.Key(); key != "" {
71+
if current.PathKey == nil {
72+
current.PathKey = make(map[string]*TrieNode)
73+
}
74+
if _, exists := current.PathKey[key]; !exists {
75+
current.PathKey[key] = &TrieNode{}
76+
}
77+
next = current.PathKey[key]
78+
} else {
79+
return fmt.Errorf("fixed index patterns are not supported: %#v", pattern)
80+
}
81+
}
82+
83+
if next == nil {
84+
return fmt.Errorf("invalid component type: %T", component)
85+
}
86+
87+
// Mark as end of pattern if this is the last component.
88+
if i == len(pattern)-1 {
89+
next.IsEnd = true
90+
}
91+
92+
// Move to next node
93+
current = next
94+
}
95+
96+
return nil
97+
}
98+
99+
// SearchPath checks if the given path matches any pattern in the trie.
100+
// A path matches if it exactly matches a pattern or if it matches a pattern with wildcards.
101+
func (t *TrieNode) SearchPath(path Path) (Pattern, bool) {
102+
// We pre-allocate the prefix array that is used to track the current
103+
// prefix accumulated while walking the prefix tree. Pre-allocating
104+
// ensures that we do not allocate memory on every recursive call.
105+
prefix := make(Pattern, len(path))
106+
pattern, ok := t.searchPathRecursive(t, path, prefix, 0)
107+
return pattern, ok
108+
}
109+
110+
// searchPathRecursive is a helper function that recursively checks if a path matches any pattern.
111+
// Arguments:
112+
// - node: the current node in the trie.
113+
// - path: the path to check.
114+
// - prefix: the prefix accumulated while walking the prefix tree.
115+
// - index: the current index in the path / prefix
116+
//
117+
// Note we always expect the path and prefix to be the same length because wildcards like * and [*]
118+
// only match a single path component.
119+
func (t *TrieNode) searchPathRecursive(node *TrieNode, path Path, prefix Pattern, index int) (Pattern, bool) {
120+
if node == nil {
121+
return nil, false
122+
}
123+
124+
// Zero case, when the query path is the root node. We return nil here to match
125+
// the semantics of [MustPatternFromString] which returns nil for the empty string.
126+
//
127+
// We cannot return a Pattern{} object here because then MustPatternFromString(""), which
128+
// returns nil will not be equal to the Pattern{} object returned by this function. An equality
129+
// is useful because users of this function can use it to check whether the root / empty pattern
130+
// had been inserted into the trie.
131+
if len(path) == 0 {
132+
return nil, node.IsEnd
133+
}
134+
135+
// If we've reached the end of the path, check if this node is a valid end of a pattern.
136+
isLast := index == len(path)
137+
if isLast {
138+
return prefix, node.IsEnd
139+
}
140+
141+
currentComponent := path[index]
142+
143+
// First check if the key wildcard is set for the current index.
144+
if currentComponent.isKey() && node.AnyKey != nil {
145+
prefix[index] = AnyKey()
146+
pattern, ok := t.searchPathRecursive(node.AnyKey, path, prefix, index+1)
147+
if ok {
148+
return pattern, true
149+
}
150+
}
151+
152+
// If no key wildcard is set, check if the key is an exact match.
153+
if currentComponent.isKey() {
154+
child, exists := node.PathKey[currentComponent.Key()]
155+
if !exists {
156+
return nil, false
157+
}
158+
prefix[index] = currentComponent
159+
return t.searchPathRecursive(child, path, prefix, index+1)
160+
}
161+
162+
if currentComponent.isIndex() && node.AnyIndex != nil {
163+
prefix[index] = AnyIndex()
164+
pattern, ok := t.searchPathRecursive(node.AnyIndex, path, prefix, index+1)
165+
if ok {
166+
return pattern, true
167+
}
168+
}
169+
170+
// If we've reached this point, the path does not match any patterns in the trie.
171+
return nil, false
172+
}

libs/dyn/pattern_trie_test.go

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
package dyn_test
2+
3+
import (
4+
"testing"
5+
6+
"github.com/databricks/cli/libs/dyn"
7+
assert "github.com/databricks/cli/libs/dyn/dynassert"
8+
)
9+
10+
func TestPatternTrie_SearchPath(t *testing.T) {
11+
tests := []struct {
12+
name string
13+
pattern string
14+
mustMatch []string
15+
mustNotMatch []string
16+
}{
17+
{
18+
name: "empty pattern",
19+
pattern: "",
20+
mustMatch: []string{""},
21+
mustNotMatch: []string{"foo"},
22+
},
23+
{
24+
name: "simple key pattern",
25+
pattern: "foo",
26+
mustMatch: []string{"foo"},
27+
mustNotMatch: []string{"foo.bar", "foo[0]", "bar"},
28+
},
29+
30+
{
31+
name: "nested key pattern",
32+
pattern: "foo.bar",
33+
mustMatch: []string{"foo.bar"},
34+
mustNotMatch: []string{"foo", "foo[0]", "bar.foo", "foo.baz"},
35+
},
36+
{
37+
name: "root wildcard",
38+
pattern: "*",
39+
mustMatch: []string{"foo", "bar"},
40+
mustNotMatch: []string{"", "bar.foo", "foo.baz"},
41+
},
42+
{
43+
name: "wildcard * after foo",
44+
pattern: "foo.*",
45+
mustMatch: []string{"foo.bar", "foo.baz"},
46+
mustNotMatch: []string{"foo", "bar", "foo.bar.baz"},
47+
},
48+
{
49+
name: "wildcard [*] after foo",
50+
pattern: "foo[*]",
51+
mustMatch: []string{"foo[0]", "foo[1]", "foo[2025]"},
52+
mustNotMatch: []string{"foo", "bar", "foo[0].bar"},
53+
},
54+
{
55+
name: "key after * wildcard",
56+
pattern: "foo.*.bar",
57+
mustMatch: []string{"foo.abc.bar", "foo.def.bar"},
58+
mustNotMatch: []string{"foo", "bar", "foo.bar.baz"},
59+
},
60+
{
61+
name: "key after [*] wildcard",
62+
pattern: "foo[*].bar",
63+
mustMatch: []string{"foo[0].bar", "foo[1].bar", "foo[2025].bar"},
64+
mustNotMatch: []string{"foo", "bar", "foo[0].baz"},
65+
},
66+
{
67+
name: "multiple * wildcards",
68+
pattern: "*.*.*",
69+
mustMatch: []string{"foo.bar.baz", "foo.bar.qux"},
70+
mustNotMatch: []string{"foo", "bar", "foo.bar", "foo.bar.baz.qux"},
71+
},
72+
{
73+
name: "multiple [*] wildcards",
74+
pattern: "foo[*][*]",
75+
mustMatch: []string{"foo[0][0]", "foo[1][1]", "foo[2025][2025]"},
76+
mustNotMatch: []string{"foo", "bar", "foo[0][0][0]"},
77+
},
78+
{
79+
name: "[*] after * wildcard",
80+
pattern: "*[*]",
81+
mustMatch: []string{"foo[0]", "foo[1]", "foo[2025]"},
82+
mustNotMatch: []string{"foo", "bar", "foo[0].bar", "[0].foo"},
83+
},
84+
}
85+
for _, tt := range tests {
86+
t.Run(tt.name, func(t *testing.T) {
87+
trie := &dyn.TrieNode{}
88+
pattern := dyn.MustPatternFromString(tt.pattern)
89+
90+
// None of the expected paths should match yet.
91+
for _, path := range tt.mustMatch {
92+
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
93+
assert.False(t, ok)
94+
}
95+
for _, path := range tt.mustNotMatch {
96+
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
97+
assert.False(t, ok)
98+
}
99+
100+
err := trie.Insert(pattern)
101+
assert.NoError(t, err)
102+
103+
// Now all the expected paths should match.
104+
for _, path := range tt.mustMatch {
105+
pattern, ok := trie.SearchPath(dyn.MustPathFromString(path))
106+
assert.True(t, ok)
107+
assert.Equal(t, dyn.MustPatternFromString(tt.pattern), pattern)
108+
}
109+
for _, path := range tt.mustNotMatch {
110+
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
111+
assert.False(t, ok)
112+
}
113+
})
114+
}
115+
}
116+
117+
func TestPatternTrie_MultiplePatterns(t *testing.T) {
118+
trie := &dyn.TrieNode{}
119+
120+
patterns := []string{
121+
"foo.bar",
122+
"foo.*.baz",
123+
"def[*]",
124+
}
125+
126+
mustMatch := map[string]string{
127+
"foo.bar": "foo.bar",
128+
"foo.abc.baz": "foo.*.baz",
129+
"foo.def.baz": "foo.*.baz",
130+
"def[0]": "def[*]",
131+
"def[1]": "def[*]",
132+
}
133+
134+
mustNotMatch := []string{
135+
"foo",
136+
"abc[0]",
137+
"abc[1]",
138+
"def[2].x",
139+
"foo.y",
140+
"foo.bar.baz.qux",
141+
}
142+
143+
for _, pattern := range patterns {
144+
err := trie.Insert(dyn.MustPatternFromString(pattern))
145+
assert.NoError(t, err)
146+
}
147+
148+
for path, expectedPattern := range mustMatch {
149+
pattern, ok := trie.SearchPath(dyn.MustPathFromString(path))
150+
assert.True(t, ok)
151+
assert.Equal(t, dyn.MustPatternFromString(expectedPattern), pattern)
152+
}
153+
154+
for _, path := range mustNotMatch {
155+
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
156+
assert.False(t, ok)
157+
}
158+
}
159+
160+
func TestPatternTrie_OverlappingPatterns(t *testing.T) {
161+
trie := &dyn.TrieNode{}
162+
163+
// Insert overlapping patterns
164+
patterns := []string{
165+
"foo.bar",
166+
"foo.*",
167+
"*.bar",
168+
"*.*",
169+
}
170+
171+
for _, pattern := range patterns {
172+
err := trie.Insert(dyn.MustPatternFromString(pattern))
173+
assert.NoError(t, err)
174+
}
175+
176+
for _, path := range []string{
177+
"foo.bar",
178+
"foo.baz",
179+
"baz.bar",
180+
"baz.qux",
181+
} {
182+
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
183+
assert.True(t, ok)
184+
}
185+
}
186+
187+
func TestPatternTrie_FixedIndexPatterns(t *testing.T) {
188+
trie := &dyn.TrieNode{}
189+
190+
err := trie.Insert(dyn.MustPatternFromString("foo[0]"))
191+
assert.EqualError(t, err, "fixed index patterns are not supported: dyn.Pattern{dyn.pathComponent{key:\"foo\", index:0}, dyn.pathComponent{key:\"\", index:0}}")
192+
193+
err = trie.Insert(dyn.MustPatternFromString("foo[2]"))
194+
assert.EqualError(t, err, "fixed index patterns are not supported: dyn.Pattern{dyn.pathComponent{key:\"foo\", index:0}, dyn.pathComponent{key:\"\", index:2}}")
195+
}

0 commit comments

Comments
 (0)