diff --git a/README.md b/README.md index 0a269d8..714459b 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,22 @@
-
-
+
+
-
-
+
+
-
-
+
+
-
+
@@ -26,12 +26,12 @@
strutil provides a collection of string metrics for calculating string similarity as well as
other string utility functions.
-Full documentation can be found at https://pkg.go.dev/github.com/adrg/strutil.
+Full documentation can be found at https://pkg.go.dev/github.com/dorzzz/strutil.
## Installation
```
-go get github.com/adrg/strutil
+go get github.com/dorzzz/strutil
```
## String metrics
@@ -60,7 +60,7 @@ func Similarity(a, b string, metric StringMetric) float64 {
```
All defined string metrics can be found in the
-[metrics](https://pkg.go.dev/github.com/adrg/strutil/metrics) package.
+[metrics](https://pkg.go.dev/github.com/dorzzz/strutil/metrics) package.
#### Hamming
@@ -77,7 +77,7 @@ fmt.Printf("%d\n", ham.Distance("one", "once")) // Output: 2
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Hamming).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Hamming).
#### Levenshtein
@@ -106,7 +106,7 @@ fmt.Printf("%d\n", lev.Distance("graph", "giraffe")) // Output: 4
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Levenshtein).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Levenshtein).
#### Jaro
@@ -116,7 +116,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.78
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaro).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Jaro).
#### Jaro-Winkler
@@ -126,7 +126,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.80
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#JaroWinkler).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#JaroWinkler).
#### Smith-Waterman-Gotoh
@@ -152,7 +152,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.96
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SmithWatermanGotoh).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#SmithWatermanGotoh).
#### Sorensen-Dice
@@ -174,7 +174,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.53
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SorensenDice).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#SorensenDice).
#### Jaccard
@@ -214,7 +214,7 @@ where SD is the Sorensen-Dice coefficient and J is the Jaccard index.
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaccard).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Jaccard).
#### Overlap Coefficient
@@ -236,7 +236,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.57
```
More information and additional examples can be found on
-[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#OverlapCoefficient).
+[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#OverlapCoefficient).
## References
diff --git a/example_test.go b/example_test.go
index 3da3d77..22858d8 100644
--- a/example_test.go
+++ b/example_test.go
@@ -3,8 +3,8 @@ package strutil_test
import (
"fmt"
- "github.com/adrg/strutil"
- "github.com/adrg/strutil/metrics"
+ "github.com/dorzzz/strutil"
+ "github.com/dorzzz/strutil/metrics"
)
func ExampleSimilarity() {
diff --git a/go.mod b/go.mod
index 9a61806..b5b4608 100644
--- a/go.mod
+++ b/go.mod
@@ -1,4 +1,4 @@
-module github.com/adrg/strutil
+module github.com/dorzzz/strutil
go 1.19
diff --git a/internal/mathutil/mathutil_test.go b/internal/mathutil/mathutil_test.go
index 32a1475..94c4fe1 100644
--- a/internal/mathutil/mathutil_test.go
+++ b/internal/mathutil/mathutil_test.go
@@ -3,7 +3,7 @@ package mathutil_test
import (
"testing"
- "github.com/adrg/strutil/internal/mathutil"
+ "github.com/dorzzz/strutil/internal/mathutil"
"github.com/stretchr/testify/require"
)
diff --git a/internal/ngram/ngram.go b/internal/ngram/ngram.go
index 10e9041..5fc31be 100644
--- a/internal/ngram/ngram.go
+++ b/internal/ngram/ngram.go
@@ -1,6 +1,6 @@
package ngram
-import "github.com/adrg/strutil/internal/mathutil"
+import "github.com/dorzzz/strutil/internal/mathutil"
// Count returns the n-gram count of the specified size for the
// provided term. An n-gram size of 1 is used if the provided size is
diff --git a/internal/ngram/ngram_test.go b/internal/ngram/ngram_test.go
index c9e0332..3a1600b 100644
--- a/internal/ngram/ngram_test.go
+++ b/internal/ngram/ngram_test.go
@@ -3,7 +3,7 @@ package ngram_test
import (
"testing"
- "github.com/adrg/strutil/internal/ngram"
+ "github.com/dorzzz/strutil/internal/ngram"
"github.com/stretchr/testify/require"
)
diff --git a/internal/stringutil/stringutil_test.go b/internal/stringutil/stringutil_test.go
index b04dcd2..5175744 100644
--- a/internal/stringutil/stringutil_test.go
+++ b/internal/stringutil/stringutil_test.go
@@ -3,7 +3,7 @@ package stringutil_test
import (
"testing"
- "github.com/adrg/strutil/internal/stringutil"
+ "github.com/dorzzz/strutil/internal/stringutil"
"github.com/stretchr/testify/require"
)
diff --git a/metrics/examples_test.go b/metrics/examples_test.go
index 9496e28..16f78e8 100644
--- a/metrics/examples_test.go
+++ b/metrics/examples_test.go
@@ -3,7 +3,7 @@ package metrics_test
import (
"fmt"
- "github.com/adrg/strutil/metrics"
+ "github.com/dorzzz/strutil/metrics"
)
func ExampleHamming() {
diff --git a/metrics/jaccard.go b/metrics/jaccard.go
index cf3215a..d4eb795 100644
--- a/metrics/jaccard.go
+++ b/metrics/jaccard.go
@@ -3,7 +3,7 @@ package metrics
import (
"strings"
- "github.com/adrg/strutil/internal/ngram"
+ "github.com/dorzzz/strutil/internal/ngram"
)
// Jaccard represents the Jaccard index for measuring the similarity
diff --git a/metrics/jaro.go b/metrics/jaro.go
index 7b394b2..07e4602 100644
--- a/metrics/jaro.go
+++ b/metrics/jaro.go
@@ -4,7 +4,7 @@ import (
"strings"
"unicode/utf8"
- "github.com/adrg/strutil/internal/mathutil"
+ "github.com/dorzzz/strutil/internal/mathutil"
)
// Jaro represents the Jaro metric for measuring the similarity
@@ -12,31 +12,36 @@ import (
// For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
type Jaro struct {
// CaseSensitive specifies if the string comparison is case sensitive.
- CaseSensitive bool
+ CaseSensitive bool
+ UseStandardWindow int // 1 selects the Apache Commons-style matching in Compare; any other value keeps the original strutil algorithm.
}
// NewJaro returns a new Jaro string metric.
//
// Default options:
// CaseSensitive: true
+// UseStandardWindow: 0 (original strutil algorithm; set to 1 for the Apache Commons variant)
func NewJaro() *Jaro {
return &Jaro{
- CaseSensitive: true,
+ CaseSensitive: true,
+ UseStandardWindow: 0,
}
}
// Compare returns the Jaro similarity of a and b. The returned similarity is
// a number between 0 and 1. Larger similarity numbers indicate closer matches.
func (m *Jaro) Compare(a, b string) float64 {
- // Check if both terms are empty.
+ // Use rune counts (UTF-8 code points) for lengths.
lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b)
+
+ // Check if both terms are empty.
if lenA == 0 && lenB == 0 {
- return 1
+ return 1.0
}
// Check if one of the terms is empty.
if lenA == 0 || lenB == 0 {
- return 0
+ return 0.0
}
// Lower terms if case insensitive comparison is specified.
@@ -45,7 +50,28 @@ func (m *Jaro) Compare(a, b string) float64 {
b = strings.ToLower(b)
}
- // Get matching runes.
+ // UseStandardWindow == 1 selects the Apache Commons variant; any other value falls through to the original algorithm.
+ if m.UseStandardWindow == 1 {
+ // Apache Commons implementation
+ if a == b {
+ return 1.0
+ }
+
+ ra := []rune(a)
+ rb := []rune(b)
+
+ matches, halfTranspositions := jaroMatches(ra, rb, m.UseStandardWindow)
+ if matches == 0 {
+ return 0.0
+ }
+
+ mFloat := float64(matches)
+ return (mFloat/float64(lenA) +
+ mFloat/float64(lenB) +
+ (mFloat-float64(halfTranspositions)/2.0)/mFloat) / 3.0
+ }
+
+ // Original strutil implementation (default)
halfLen := mathutil.Max(0, mathutil.Max(lenA, lenB)/2)
mrA := matchingRunes(a, b, halfLen)
mrB := matchingRunes(b, a, halfLen)
@@ -55,12 +81,78 @@ func (m *Jaro) Compare(a, b string) float64 {
return 0.0
}
- // Return similarity.
return (float64(fmLen)/float64(lenA) +
float64(smLen)/float64(lenB) +
float64(fmLen-transpositions(mrA, mrB)/2)/float64(fmLen)) / 3.0
}
+// jaroMatches returns the number of matching runes between first and second and the number of matched positions whose runes differ (half-transpositions).
+// It mirrors Apache's JaroWinklerSimilarity.matches(...) logic, but operates on rune slices instead of Java chars.
+func jaroMatches(first, second []rune, useStandardWindow int) (matches int, halfTranspositions int) {
+ var maxRunes, minRunes []rune
+ if len(first) > len(second) {
+ maxRunes = first
+ minRunes = second
+ } else {
+ maxRunes = second
+ minRunes = first
+ }
+
+ // Search window: max(len(maxRunes)/2 - useStandardWindow, 0); equals Apache's Math.max(max.length()/2 - 1, 0) when useStandardWindow is 1.
+ rng := maxInt(len(maxRunes)/2-useStandardWindow, 0)
+
+ matchIndexes := make([]int, len(minRunes))
+ for i := range matchIndexes {
+ matchIndexes[i] = -1
+ }
+ matchFlags := make([]bool, len(maxRunes))
+
+ // For each rune of the shorter input, claim the first unused equal rune of the longer input inside the window.
+ for mi, c1 := range minRunes {
+ start := maxInt(mi-rng, 0)
+ end := minInt(mi+rng+1, len(maxRunes))
+ for xi := start; xi < end; xi++ {
+ if !matchFlags[xi] && c1 == maxRunes[xi] {
+ matchIndexes[mi] = xi
+ matchFlags[xi] = true
+ matches++
+ break
+ }
+ }
+ }
+
+ // Collect the matched runes of each input in their original order (ms1 from minRunes, ms2 from maxRunes).
+ ms1 := make([]rune, matches)
+ ms2 := make([]rune, matches)
+
+ si := 0
+ for i := 0; i < len(minRunes); i++ {
+ if matchIndexes[i] != -1 {
+ ms1[si] = minRunes[i]
+ si++
+ }
+ }
+
+ si = 0
+ for i := 0; i < len(maxRunes); i++ {
+ if matchFlags[i] {
+ ms2[si] = maxRunes[i]
+ si++
+ }
+ }
+
+ // Count positions where the two matched sequences disagree; each such position is one half-transposition.
+ for i := 0; i < len(ms1); i++ {
+ if ms1[i] != ms2[i] {
+ halfTranspositions++
+ }
+ }
+
+ return matches, halfTranspositions
+}
+
+// matchingRunes returns the matching runes between a and b within the specified limit.
+// This is the original strutil implementation.
func matchingRunes(a, b string, limit int) []rune {
var (
runesA = []rune(a)
@@ -83,6 +175,8 @@ func matchingRunes(a, b string, limit int) []rune {
return runesCommon
}
+// transpositions counts the number of transpositions between two rune slices.
+// This is the original strutil implementation.
func transpositions(a, b []rune) int {
var count int
@@ -95,3 +189,18 @@ func transpositions(a, b []rune) int {
return count
}
+
+// minInt and maxInt are local int helpers returning the smaller and the larger of a and b, respectively.
+func minInt(a, b int) int {
+ if a < b {
+ return a
+ }
+ return b
+}
+
+func maxInt(a, b int) int {
+ if a > b {
+ return a
+ }
+ return b
+}
diff --git a/metrics/jaro_winkler.go b/metrics/jaro_winkler.go
index e89f00d..cdc2ae4 100644
--- a/metrics/jaro_winkler.go
+++ b/metrics/jaro_winkler.go
@@ -4,7 +4,7 @@ import (
"strings"
"unicode/utf8"
- "github.com/adrg/strutil/internal/stringutil"
+ "github.com/dorzzz/strutil/internal/stringutil"
)
// JaroWinkler represents the Jaro-Winkler metric for measuring the similarity
@@ -12,16 +12,22 @@ import (
// For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
type JaroWinkler struct {
// CaseSensitive specifies if the string comparison is case sensitive.
- CaseSensitive bool
+ CaseSensitive bool
+ Threshold float64 // Jaro similarities below Threshold are returned as-is, without the Winkler prefix bonus.
+ UseStandardWindow int // Copied to the UseStandardWindow field of the Jaro metric used by Compare.
}
// NewJaroWinkler returns a new Jaro-Winkler string metric.
//
// Default options:
// CaseSensitive: true
+// Threshold: 0 (always applies Winkler bonus)
+// UseStandardWindow: 0 (original strutil Jaro algorithm; set to 1 for the Apache Commons variant)
func NewJaroWinkler() *JaroWinkler {
return &JaroWinkler{
- CaseSensitive: true,
+ CaseSensitive: true,
+ Threshold: 0,
+ UseStandardWindow: 0,
}
}
@@ -43,8 +49,14 @@ func (m *JaroWinkler) Compare(a, b string) float64 {
jaro := NewJaro()
jaro.CaseSensitive = m.CaseSensitive
+ jaro.UseStandardWindow = m.UseStandardWindow
// Return similarity.
similarity := jaro.Compare(a, b)
+ if similarity < m.Threshold {
+ return similarity
+ }
+
+ // Apply Winkler bonus.
return similarity + (0.1 * float64(lenPrefix) * (1.0 - similarity))
}
diff --git a/metrics/levenshtein.go b/metrics/levenshtein.go
index e0e9e9c..3a19432 100644
--- a/metrics/levenshtein.go
+++ b/metrics/levenshtein.go
@@ -3,7 +3,7 @@ package metrics
import (
"strings"
- "github.com/adrg/strutil/internal/mathutil"
+ "github.com/dorzzz/strutil/internal/mathutil"
)
// Levenshtein represents the Levenshtein metric for measuring the similarity
diff --git a/metrics/metrics_test.go b/metrics/metrics_test.go
index f2bf94e..b6502a1 100644
--- a/metrics/metrics_test.go
+++ b/metrics/metrics_test.go
@@ -4,7 +4,7 @@ import (
"fmt"
"testing"
- "github.com/adrg/strutil/metrics"
+ "github.com/dorzzz/strutil/metrics"
"github.com/stretchr/testify/require"
)
diff --git a/metrics/overlap_coefficient.go b/metrics/overlap_coefficient.go
index caeb350..2095c0a 100644
--- a/metrics/overlap_coefficient.go
+++ b/metrics/overlap_coefficient.go
@@ -3,8 +3,8 @@ package metrics
import (
"strings"
- "github.com/adrg/strutil/internal/mathutil"
- "github.com/adrg/strutil/internal/ngram"
+ "github.com/dorzzz/strutil/internal/mathutil"
+ "github.com/dorzzz/strutil/internal/ngram"
)
// OverlapCoefficient represents the overlap coefficient for measuring the
diff --git a/metrics/smith_waterman_gotoh.go b/metrics/smith_waterman_gotoh.go
index ab6b49e..0f4089a 100644
--- a/metrics/smith_waterman_gotoh.go
+++ b/metrics/smith_waterman_gotoh.go
@@ -3,7 +3,7 @@ package metrics
import (
"strings"
- "github.com/adrg/strutil/internal/mathutil"
+ "github.com/dorzzz/strutil/internal/mathutil"
)
// SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring
diff --git a/metrics/sorensen_dice.go b/metrics/sorensen_dice.go
index c48b751..506a4e9 100644
--- a/metrics/sorensen_dice.go
+++ b/metrics/sorensen_dice.go
@@ -3,7 +3,7 @@ package metrics
import (
"strings"
- "github.com/adrg/strutil/internal/ngram"
+ "github.com/dorzzz/strutil/internal/ngram"
)
// SorensenDice represents the Sorensen-Dice metric for measuring the
diff --git a/strutil.go b/strutil.go
index 8e69d5c..155586e 100644
--- a/strutil.go
+++ b/strutil.go
@@ -1,7 +1,7 @@
/*
Package strutil provides string metrics for calculating string similarity as
well as other string utility functions. Documentation for all the metrics can
-be found at https://pkg.go.dev/github.com/adrg/strutil/metrics.
+be found at https://pkg.go.dev/github.com/dorzzz/strutil/metrics.
Included string metrics:
- Hamming
@@ -17,8 +17,8 @@ Included string metrics:
package strutil
import (
- "github.com/adrg/strutil/internal/ngram"
- "github.com/adrg/strutil/internal/stringutil"
+ "github.com/dorzzz/strutil/internal/ngram"
+ "github.com/dorzzz/strutil/internal/stringutil"
)
// StringMetric represents a metric for measuring the similarity between
@@ -32,7 +32,7 @@ import (
// - Jaccard
// - Overlap coefficient
//
-// For more information see https://pkg.go.dev/github.com/adrg/strutil/metrics.
+// For more information see https://pkg.go.dev/github.com/dorzzz/strutil/metrics.
type StringMetric interface {
Compare(a, b string) float64
}