Skip to content

Commit b7cbfe4

Browse files
committed
Replace DiceCoefficient implementation
Removes input string padding, as it introduces a bias toward matches on the first and last characters. As a consequence, comparing with a one-letter string will now return a score of 0, because such a string has zero bigrams. Fixes #8
1 parent 6399781 commit b7cbfe4

File tree

3 files changed

+76
-21
lines changed

3 files changed

+76
-21
lines changed

.gitignore

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,6 @@ pip-log.txt
161161

162162
# Mac crap
163163
.DS_Store
164-
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v15/Server/sqlite3
165-
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/DesignTimeBuild
166-
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/004.testlog
167-
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/testlog.manifest
164+
.vs/
165+
.vscode/
166+
.idea/

src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs

Lines changed: 52 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,15 @@ public static class DiceCoefficientExtensions
2020
/// <returns></returns>
2121
public static double DiceCoefficient(this string input, string comparedTo)
2222
{
23-
var ngrams = input.ToBiGrams();
24-
var compareToNgrams = comparedTo.ToBiGrams();
25-
return ngrams.DiceCoefficient(compareToNgrams);
23+
if (input == comparedTo)
24+
return 1.0d;
25+
26+
if (input.Length < 2 || comparedTo.Length < 2)
27+
return 0.0d;
28+
29+
var biGrams = input.ToBiGrams(false);
30+
var compareToBiGrams = comparedTo.ToBiGrams(false);
31+
return DiceCoefficient(biGrams, compareToBiGrams);
2632
}
2733

2834
/// <summary>
@@ -33,41 +39,71 @@ public static double DiceCoefficient(this string input, string comparedTo)
3339
/// <returns></returns>
3440
public static double DiceCoefficient(this string[] nGrams, string[] compareToNGrams)
3541
{
36-
int matches = nGrams.Intersect(compareToNGrams).Count();
37-
if (matches == 0) return 0.0d;
42+
var nGramMap = new Dictionary<string, int>(nGrams.Length);
43+
var compareToNGramMap = new Dictionary<string, int>(compareToNGrams.Length);
44+
var nGramSet = new HashSet<string>();
45+
var compareToNGramSet = new HashSet<string>();
46+
foreach (var nGram in nGrams)
47+
{
48+
if (nGramSet.Add(nGram))
49+
nGramMap[nGram] = 1;
50+
else
51+
nGramMap[nGram]++;
52+
}
53+
foreach (var nGram in compareToNGrams)
54+
{
55+
if (compareToNGramSet.Add(nGram))
56+
compareToNGramMap[nGram] = 1;
57+
else
58+
compareToNGramMap[nGram]++;
59+
}
60+
nGramSet.IntersectWith(compareToNGramSet);
61+
var matches = 0;
62+
foreach (var nGram in nGramSet)
63+
matches += Math.Min(nGramMap[nGram], compareToNGramMap[nGram]);
64+
if (matches == 0)
65+
return 0.0d;
66+
3867
double totalBigrams = nGrams.Length + compareToNGrams.Length;
3968
return (2 * matches) / totalBigrams;
4069
}
4170

42-
public static string[] ToBiGrams(this string input)
71+
public static string[] ToBiGrams(this string input, bool usePadding = true)
4372
{
44-
// nLength == 2
45-
// from Jackson, return %j ja ac ck ks so on n#
46-
// from Main, return #m ma ai in n#
47-
input = SinglePercent + input + SinglePound;
73+
if (usePadding)
74+
{
75+
// nLength == 2
76+
// from Jackson, return %j ja ac ck ks so on n#
77+
// from Main, return %m ma ai in n#
78+
input = SinglePercent + input + SinglePound;
79+
}
80+
if (input.Length < 2)
81+
return new string[0];
82+
4883
return ToNGrams(input, 2);
4984
}
5085

5186
public static string[] ToTriGrams(this string input)
5287
{
5388
// nLength == 3
54-
// from Jackson, return %%j %ja jac ack cks kso son on# n##
55-
// from Main, return ##m #ma mai ain in# n##
56-
input = DoublePercent + input + DoublePount;
89+
// from Jackson, return &&j &ja jac ack cks kso son on# n##
90+
// from Main, return &&m &ma mai ain in# n##
91+
input = DoubleAmpersand + input + DoublePound;
5792
return ToNGrams(input, 3);
5893
}
5994

6095
private static string[] ToNGrams(string input, int nLength)
6196
{
6297
int itemsCount = input.Length - 1;
6398
string[] ngrams = new string[input.Length - 1];
64-
for (int i = 0; i < itemsCount; i++) ngrams[i] = input.Substring(i, nLength);
99+
for (int i = 0; i < itemsCount; i++)
100+
ngrams[i] = input.Substring(i, nLength);
65101
return ngrams;
66102
}
67103

68104
private const string SinglePercent = "%";
69105
private const string SinglePound = "#";
70-
private const string DoublePercent = "&&";
71-
private const string DoublePount = "##";
106+
private const string DoubleAmpersand = "&&";
107+
private const string DoublePound = "##";
72108
}
73109
}

src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ public void FuzzyMatchTests(string input, string match)
5050
}
5151

5252
[Theory]
53+
[InlineData("wwww", "w")]
5354
[InlineData("test", "w")]
5455
[InlineData("test", "W")]
5556
[InlineData("test", "w ")]
@@ -78,11 +79,30 @@ public void FuzzyMatchTests(string input, string match)
7879
[InlineData("2130 South Fort Union Blvd.", "Rural Route 2 Box 29")]
7980
[InlineData("2130 South Fort Union Blvd.", "PO Box 3487")]
8081
[InlineData("2130 South Fort Union Blvd.", "3 Harvard Square")]
82+
[InlineData("eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee", "ee")]
83+
[InlineData("aaaaaaaaa", "aaaaaaaaa")]
8184
public void DiceCoefficientTests(string input, string match)
8285
{
8386
var result = input.DiceCoefficient(match);
87+
var reversedResult = match.DiceCoefficient(input);
88+
var inputBiGrams = input.ToBiGrams(false);
89+
var matchBiGrams = match.ToBiGrams(false);
90+
var biGramResult = inputBiGrams.DiceCoefficient(matchBiGrams);
91+
var reversedBiGramResult = matchBiGrams.DiceCoefficient(inputBiGrams);
92+
output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result} (reversed was {reversedResult}), biGramResult was {biGramResult} (reversed was {reversedBiGramResult}).");
93+
94+
Assert.True(Math.Abs(result - reversedResult) < double.Epsilon);
95+
Assert.True(Math.Abs(biGramResult - reversedBiGramResult) < double.Epsilon);
96+
Assert.True(Math.Abs(result - biGramResult) < double.Epsilon);
8497
Assert.True(result >= 0.0);
85-
output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result}.");
98+
Assert.True(result <= 1.0);
99+
Assert.True(biGramResult >= 0.0);
100+
Assert.True(biGramResult <= 1.0);
101+
if (input == match)
102+
{
103+
Assert.True(Math.Abs(result - 1.0) < double.Epsilon);
104+
Assert.True(Math.Abs(biGramResult - 1.0) < double.Epsilon);
105+
}
86106
}
87107

88108
[Theory]

0 commit comments

Comments
 (0)