From 2592f2b378d02b8d252ffc1a88f119447cd4f4a5 Mon Sep 17 00:00:00 2001 From: Dor Adiv Date: Mon, 17 Nov 2025 14:46:35 +0200 Subject: [PATCH 1/3] Change module path from 'adrg' to 'dorzzz' Changed URL for fork --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 9a61806..b5b4608 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module github.com/adrg/strutil +module github.com/dorzzz/strutil go 1.19 From 329b7f8a5c85d5e5d4f291915b9bf392d26e0bc9 Mon Sep 17 00:00:00 2001 From: Dor Adiv Date: Mon, 17 Nov 2025 15:03:55 +0200 Subject: [PATCH 2/3] Fixed import URL --- README.md | 36 +++++++++++++------------- example_test.go | 4 +-- internal/mathutil/mathutil_test.go | 2 +- internal/ngram/ngram.go | 2 +- internal/ngram/ngram_test.go | 2 +- internal/stringutil/stringutil_test.go | 2 +- metrics/examples_test.go | 2 +- metrics/jaccard.go | 2 +- metrics/jaro.go | 2 +- metrics/jaro_winkler.go | 2 +- metrics/levenshtein.go | 2 +- metrics/metrics_test.go | 2 +- metrics/overlap_coefficient.go | 4 +-- metrics/smith_waterman_gotoh.go | 2 +- metrics/sorensen_dice.go | 2 +- strutil.go | 8 +++--- 16 files changed, 38 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 0a269d8..714459b 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,22 @@

strutil

- - Tests status + + Tests status Code coverage - - pkg.go.dev documentation + + pkg.go.dev documentation MIT license - - Go report card + + Go report card - + GitHub issues @@ -26,12 +26,12 @@ strutil provides a collection of string metrics for calculating string similarity as well as other string utility functions. -Full documentation can be found at https://pkg.go.dev/github.com/adrg/strutil. +Full documentation can be found at https://pkg.go.dev/github.com/dorzzz/strutil. ## Installation ``` -go get github.com/adrg/strutil +go get github.com/dorzzz/strutil ``` ## String metrics @@ -60,7 +60,7 @@ func Similarity(a, b string, metric StringMetric) float64 { ``` All defined string metrics can be found in the -[metrics](https://pkg.go.dev/github.com/adrg/strutil/metrics) package. +[metrics](https://pkg.go.dev/github.com/dorzzz/strutil/metrics) package. #### Hamming @@ -77,7 +77,7 @@ fmt.Printf("%d\n", ham.Distance("one", "once")) // Output: 2 ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Hamming). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Hamming). #### Levenshtein @@ -106,7 +106,7 @@ fmt.Printf("%d\n", lev.Distance("graph", "giraffe")) // Output: 4 ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Levenshtein). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Levenshtein). #### Jaro @@ -116,7 +116,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.78 ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaro). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Jaro). #### Jaro-Winkler @@ -126,7 +126,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.80 ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#JaroWinkler). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#JaroWinkler). #### Smith-Waterman-Gotoh @@ -152,7 +152,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.96 ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SmithWatermanGotoh). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#SmithWatermanGotoh). #### Sorensen-Dice @@ -174,7 +174,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.53 ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SorensenDice). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#SorensenDice). #### Jaccard @@ -214,7 +214,7 @@ where SD is the Sorensen-Dice coefficient and J is the Jaccard index. ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaccard). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#Jaccard). #### Overlap Coefficient @@ -236,7 +236,7 @@ fmt.Printf("%.2f\n", similarity) // Output: 0.57 ``` More information and additional examples can be found on -[pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#OverlapCoefficient). +[pkg.go.dev](https://pkg.go.dev/github.com/dorzzz/strutil/metrics#OverlapCoefficient). ## References diff --git a/example_test.go b/example_test.go index 3da3d77..22858d8 100644 --- a/example_test.go +++ b/example_test.go @@ -3,8 +3,8 @@ package strutil_test import ( "fmt" - "github.com/adrg/strutil" - "github.com/adrg/strutil/metrics" + "github.com/dorzzz/strutil" + "github.com/dorzzz/strutil/metrics" ) func ExampleSimilarity() { diff --git a/internal/mathutil/mathutil_test.go b/internal/mathutil/mathutil_test.go index 32a1475..94c4fe1 100644 --- a/internal/mathutil/mathutil_test.go +++ b/internal/mathutil/mathutil_test.go @@ -3,7 +3,7 @@ package mathutil_test import ( "testing" - "github.com/adrg/strutil/internal/mathutil" + "github.com/dorzzz/strutil/internal/mathutil" "github.com/stretchr/testify/require" ) diff --git a/internal/ngram/ngram.go b/internal/ngram/ngram.go index 10e9041..5fc31be 100644 --- a/internal/ngram/ngram.go +++ b/internal/ngram/ngram.go @@ -1,6 +1,6 @@ package ngram -import "github.com/adrg/strutil/internal/mathutil" +import "github.com/dorzzz/strutil/internal/mathutil" // Count returns the n-gram count of the specified size for the // provided term. An n-gram size of 1 is used if the provided size is diff --git a/internal/ngram/ngram_test.go b/internal/ngram/ngram_test.go index c9e0332..3a1600b 100644 --- a/internal/ngram/ngram_test.go +++ b/internal/ngram/ngram_test.go @@ -3,7 +3,7 @@ package ngram_test import ( "testing" - "github.com/adrg/strutil/internal/ngram" + "github.com/dorzzz/strutil/internal/ngram" "github.com/stretchr/testify/require" ) diff --git a/internal/stringutil/stringutil_test.go b/internal/stringutil/stringutil_test.go index b04dcd2..5175744 100644 --- a/internal/stringutil/stringutil_test.go +++ b/internal/stringutil/stringutil_test.go @@ -3,7 +3,7 @@ package stringutil_test import ( "testing" - "github.com/adrg/strutil/internal/stringutil" + "github.com/dorzzz/strutil/internal/stringutil" "github.com/stretchr/testify/require" ) diff --git a/metrics/examples_test.go b/metrics/examples_test.go index 9496e28..16f78e8 100644 --- a/metrics/examples_test.go +++ b/metrics/examples_test.go @@ -3,7 +3,7 @@ package metrics_test import ( "fmt" - "github.com/adrg/strutil/metrics" + "github.com/dorzzz/strutil/metrics" ) func ExampleHamming() { diff --git a/metrics/jaccard.go b/metrics/jaccard.go index cf3215a..d4eb795 100644 --- a/metrics/jaccard.go +++ b/metrics/jaccard.go @@ -3,7 +3,7 @@ package metrics import ( "strings" - "github.com/adrg/strutil/internal/ngram" + "github.com/dorzzz/strutil/internal/ngram" ) // Jaccard represents the Jaccard index for measuring the similarity diff --git a/metrics/jaro.go b/metrics/jaro.go index 7b394b2..1cf1e7d 100644 --- a/metrics/jaro.go +++ b/metrics/jaro.go @@ -4,7 +4,7 @@ import ( "strings" "unicode/utf8" - "github.com/adrg/strutil/internal/mathutil" + "github.com/dorzzz/strutil/internal/mathutil" ) // Jaro represents the Jaro metric for measuring the similarity diff --git a/metrics/jaro_winkler.go b/metrics/jaro_winkler.go index e89f00d..d2d4a65 100644 --- a/metrics/jaro_winkler.go +++ b/metrics/jaro_winkler.go @@ -4,7 +4,7 @@ import ( "strings" "unicode/utf8" - "github.com/adrg/strutil/internal/stringutil" + "github.com/dorzzz/strutil/internal/stringutil" ) // JaroWinkler represents the Jaro-Winkler metric for measuring the similarity diff --git a/metrics/levenshtein.go b/metrics/levenshtein.go index e0e9e9c..3a19432 100644 --- a/metrics/levenshtein.go +++ b/metrics/levenshtein.go @@ -3,7 +3,7 @@ package metrics import ( "strings" - "github.com/adrg/strutil/internal/mathutil" + "github.com/dorzzz/strutil/internal/mathutil" ) // Levenshtein represents the Levenshtein metric for measuring the similarity diff --git a/metrics/metrics_test.go b/metrics/metrics_test.go index f2bf94e..b6502a1 100644 --- a/metrics/metrics_test.go +++ b/metrics/metrics_test.go @@ -4,7 +4,7 @@ import ( "fmt" "testing" - "github.com/adrg/strutil/metrics" + "github.com/dorzzz/strutil/metrics" "github.com/stretchr/testify/require" ) diff --git a/metrics/overlap_coefficient.go b/metrics/overlap_coefficient.go index caeb350..2095c0a 100644 --- a/metrics/overlap_coefficient.go +++ b/metrics/overlap_coefficient.go @@ -3,8 +3,8 @@ package metrics import ( "strings" - "github.com/adrg/strutil/internal/mathutil" - "github.com/adrg/strutil/internal/ngram" + "github.com/dorzzz/strutil/internal/mathutil" + "github.com/dorzzz/strutil/internal/ngram" ) // OverlapCoefficient represents the overlap coefficient for measuring the diff --git a/metrics/smith_waterman_gotoh.go b/metrics/smith_waterman_gotoh.go index ab6b49e..0f4089a 100644 --- a/metrics/smith_waterman_gotoh.go +++ b/metrics/smith_waterman_gotoh.go @@ -3,7 +3,7 @@ package metrics import ( "strings" - "github.com/adrg/strutil/internal/mathutil" + "github.com/dorzzz/strutil/internal/mathutil" ) // SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring diff --git a/metrics/sorensen_dice.go b/metrics/sorensen_dice.go index c48b751..506a4e9 100644 --- a/metrics/sorensen_dice.go +++ b/metrics/sorensen_dice.go @@ -3,7 +3,7 @@ package metrics import ( "strings" - "github.com/adrg/strutil/internal/ngram" + "github.com/dorzzz/strutil/internal/ngram" ) // SorensenDice represents the Sorensen-Dice metric for measuring the diff --git a/strutil.go b/strutil.go index 8e69d5c..155586e 100644 --- a/strutil.go +++ b/strutil.go @@ -1,7 +1,7 @@ /* Package strutil provides string metrics for calculating string similarity as well as other string utility functions. Documentation for all the metrics can -be found at https://pkg.go.dev/github.com/adrg/strutil/metrics. +be found at https://pkg.go.dev/github.com/dorzzz/strutil/metrics. Included string metrics: - Hamming @@ -17,8 +17,8 @@ Included string metrics: package strutil import ( - "github.com/adrg/strutil/internal/ngram" - "github.com/adrg/strutil/internal/stringutil" + "github.com/dorzzz/strutil/internal/ngram" + "github.com/dorzzz/strutil/internal/stringutil" ) // StringMetric represents a metric for measuring the similarity between @@ -32,7 +32,7 @@ import ( // - Jaccard // - Overlap coefficient // -// For more information see https://pkg.go.dev/github.com/adrg/strutil/metrics. +// For more information see https://pkg.go.dev/github.com/dorzzz/strutil/metrics. type StringMetric interface { Compare(a, b string) float64 } From 08e927c4dd92afa10ef1963d7ac78bedf3fd1b0a Mon Sep 17 00:00:00 2001 From: Dor Adiv Date: Mon, 17 Nov 2025 17:56:01 +0200 Subject: [PATCH 3/3] feat: make Jaro and Jaro-Winkler implementations more canonical and formula-accurate --- metrics/jaro.go | 123 +++++++++++++++++++++++++++++++++++++--- metrics/jaro_winkler.go | 16 +++++- 2 files changed, 130 insertions(+), 9 deletions(-) diff --git a/metrics/jaro.go b/metrics/jaro.go index 1cf1e7d..07e4602 100644 --- a/metrics/jaro.go +++ b/metrics/jaro.go @@ -12,31 +12,36 @@ import ( // For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance. type Jaro struct { // CaseSensitive specifies if the string comparison is case sensitive. - CaseSensitive bool + CaseSensitive bool + UseStandardWindow int } // NewJaro returns a new Jaro string metric. // // Default options: // CaseSensitive: true +// UseStandardWindow: 0 (uses original strutil algorithm) func NewJaro() *Jaro { return &Jaro{ - CaseSensitive: true, + CaseSensitive: true, + UseStandardWindow: 0, } } // Compare returns the Jaro similarity of a and b. The returned similarity is // a number between 0 and 1. Larger similarity numbers indicate closer matches. func (m *Jaro) Compare(a, b string) float64 { - // Check if both terms are empty. + // Use rune counts (UTF-8 code points) for lengths. lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b) + + // Check if both terms are empty. if lenA == 0 && lenB == 0 { - return 1 + return 1.0 } // Check if one of the terms is empty. if lenA == 0 || lenB == 0 { - return 0 + return 0.0 } // Lower terms if case insensitive comparison is specified. @@ -45,7 +50,28 @@ func (m *Jaro) Compare(a, b string) float64 { b = strings.ToLower(b) } - // Get matching runes. + // Choose algorithm based on UseStandardWindow + if m.UseStandardWindow == 1 { + // Apache Commons implementation + if a == b { + return 1.0 + } + + ra := []rune(a) + rb := []rune(b) + + matches, halfTranspositions := jaroMatches(ra, rb, m.UseStandardWindow) + if matches == 0 { + return 0.0 + } + + mFloat := float64(matches) + return (mFloat/float64(lenA) + + mFloat/float64(lenB) + + (mFloat-float64(halfTranspositions)/2.0)/mFloat) / 3.0 + } + + // Original strutil implementation (default) halfLen := mathutil.Max(0, mathutil.Max(lenA, lenB)/2) mrA := matchingRunes(a, b, halfLen) mrB := matchingRunes(b, a, halfLen) @@ -55,12 +81,78 @@ func (m *Jaro) Compare(a, b string) float64 { return 0.0 } - // Return similarity. return (float64(fmLen)/float64(lenA) + float64(smLen)/float64(lenB) + float64(fmLen-transpositions(mrA, mrB)/2)/float64(fmLen)) / 3.0 } +// jaroMatches mirrors Apache's JaroWinklerSimilarity.matches(...) logic, +// but operating on rune slices instead of Java chars. +func jaroMatches(first, second []rune, useStandardWindow int) (matches int, halfTranspositions int) { + var maxRunes, minRunes []rune + if len(first) > len(second) { + maxRunes = first + minRunes = second + } else { + maxRunes = second + minRunes = first + } + + // range = Math.max(max.length()/2 - 1, 0) + rng := maxInt(len(maxRunes)/2-useStandardWindow, 0) + + matchIndexes := make([]int, len(minRunes)) + for i := range matchIndexes { + matchIndexes[i] = -1 + } + matchFlags := make([]bool, len(maxRunes)) + + // Find matches + for mi, c1 := range minRunes { + start := maxInt(mi-rng, 0) + end := minInt(mi+rng+1, len(maxRunes)) + for xi := start; xi < end; xi++ { + if !matchFlags[xi] && c1 == maxRunes[xi] { + matchIndexes[mi] = xi + matchFlags[xi] = true + matches++ + break + } + } + } + + // Build the two matched sequences ms1, ms2 + ms1 := make([]rune, matches) + ms2 := make([]rune, matches) + + si := 0 + for i := 0; i < len(minRunes); i++ { + if matchIndexes[i] != -1 { + ms1[si] = minRunes[i] + si++ + } + } + + si = 0 + for i := 0; i < len(maxRunes); i++ { + if matchFlags[i] { + ms2[si] = maxRunes[i] + si++ + } + } + + // Count half-transpositions + for i := 0; i < len(ms1); i++ { + if ms1[i] != ms2[i] { + halfTranspositions++ + } + } + + return matches, halfTranspositions +} + +// matchingRunes returns the matching runes between a and b within the specified limit. +// This is the original strutil implementation. func matchingRunes(a, b string, limit int) []rune { var ( runesA = []rune(a) @@ -83,6 +175,8 @@ func matchingRunes(a, b string, limit int) []rune { return runesCommon } +// transpositions counts the number of transpositions between two rune slices. +// This is the original strutil implementation. func transpositions(a, b []rune) int { var count int @@ -95,3 +189,18 @@ func transpositions(a, b []rune) int { return count } + +// local int helpers +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/metrics/jaro_winkler.go b/metrics/jaro_winkler.go index d2d4a65..cdc2ae4 100644 --- a/metrics/jaro_winkler.go +++ b/metrics/jaro_winkler.go @@ -12,16 +12,22 @@ import ( // For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance. type JaroWinkler struct { // CaseSensitive specifies if the string comparison is case sensitive. - CaseSensitive bool + CaseSensitive bool + Threshold float64 + UseStandardWindow int } // NewJaroWinkler returns a new Jaro-Winkler string metric. // // Default options: // CaseSensitive: true +// Threshold: 0 (always applies Winkler bonus) +// UseStandardWindow: 0 (uses original strutil algorithm) func NewJaroWinkler() *JaroWinkler { return &JaroWinkler{ - CaseSensitive: true, + CaseSensitive: true, + Threshold: 0, + UseStandardWindow: 0, } } @@ -43,8 +49,14 @@ func (m *JaroWinkler) Compare(a, b string) float64 { jaro := NewJaro() jaro.CaseSensitive = m.CaseSensitive + jaro.UseStandardWindow = m.UseStandardWindow // Return similarity. similarity := jaro.Compare(a, b) + if similarity < m.Threshold { + return similarity + } + + // Apply Winkler bonus. return similarity + (0.1 * float64(lenPrefix) * (1.0 - similarity)) }