@@ -20,9 +20,15 @@ public static class DiceCoefficientExtensions
2020 /// <returns></returns>
public static double DiceCoefficient(this string input, string comparedTo)
{
    // Similarity of two strings, computed from their unpadded bigrams.
    // Returns 1.0 when the strings are equal (two nulls included), 0.0 when
    // exactly one of them is null or either one is too short to hold a bigram,
    // and otherwise the result of the n-gram array overload.
    if (input == comparedTo)
        return 1.0d;

    // Past the equality test at most one side can be null. Treat it as having
    // nothing in common with the other side instead of dereferencing it below.
    if (input == null || comparedTo == null)
        return 0.0d;

    // A string of fewer than two characters has no bigrams to compare.
    if (input.Length < 2 || comparedTo.Length < 2)
        return 0.0d;

    var biGrams = input.ToBiGrams(false);
    var compareToBiGrams = comparedTo.ToBiGrams(false);
    return DiceCoefficient(biGrams, compareToBiGrams);
}
2733
2834 /// <summary>
@@ -33,41 +39,71 @@ public static double DiceCoefficient(this string input, string comparedTo)
3339 /// <returns></returns>
public static double DiceCoefficient(this string[] nGrams, string[] compareToNGrams)
{
    // Dice coefficient of two n-gram lists:
    //     2 * matches / (nGrams.Length + compareToNGrams.Length)
    // where an n-gram that occurs several times is matched as often as it
    // occurs on BOTH sides (the smaller of the two occurrence counts).
    // Returns 0.0 when the lists share no n-gram, which also covers empty lists.

    // How many occurrences of each n-gram from the first list are still unmatched.
    var unmatched = new Dictionary<string, int>(nGrams.Length);
    foreach (var nGram in nGrams)
    {
        int seen;
        unmatched.TryGetValue(nGram, out seen); // seen stays 0 for a new key
        unmatched[nGram] = seen + 1;
    }

    // Each n-gram of the second list consumes one unmatched occurrence, so the
    // total per distinct n-gram ends up as min(count in first, count in second).
    var matches = 0;
    foreach (var nGram in compareToNGrams)
    {
        int left;
        if (unmatched.TryGetValue(nGram, out left) && left > 0)
        {
            unmatched[nGram] = left - 1;
            matches++;
        }
    }

    if (matches == 0)
        return 0.0d;

    // The double-typed total makes the division below floating-point.
    double totalNGrams = nGrams.Length + compareToNGrams.Length;
    return (2 * matches) / totalNGrams;
}
4170
public static string[] ToBiGrams(this string input, bool usePadding = true)
{
    // Cuts the text into overlapping two-character pieces.
    // With padding the text is first wrapped as "%" + text + "#":
    //     "main" -> %m ma ai in n#
    // Without padding only the text itself is used:
    //     "main" -> ma ai in
    string source = usePadding ? SinglePercent + input + SinglePound : input;

    // Fewer than two characters cannot form a single bigram.
    return source.Length < 2 ? new string[0] : ToNGrams(source, 2);
}
5085
public static string[] ToTriGrams(this string input)
{
    // Wraps the text as "&&" + text + "##" and passes it to ToNGrams with a
    // piece length of 3.
    // Intended pieces for "main": &&m &ma mai ain in# n##
    string padded = DoubleAmpersand + input + DoublePound;
    return ToNGrams(padded, 3);
}
5994
private static string[] ToNGrams(string input, int nLength)
{
    // Returns every run of nLength consecutive characters of input, in order
    // of starting position. Returns an empty array when input is shorter than
    // nLength.

    // A text of length L holds L - nLength + 1 full windows. Using L - 1 is
    // only right for nLength == 2 and makes the last Substring call run past
    // the end of the text for any longer window.
    int itemsCount = input.Length - nLength + 1;
    if (itemsCount <= 0)
        return new string[0];

    string[] ngrams = new string[itemsCount];
    for (int i = 0; i < itemsCount; i++)
        ngrams[i] = input.Substring(i, nLength);
    return ngrams;
}
67103
// Padding markers put around the text before it is cut into n-grams:
// bigrams use "%" in front and "#" behind, trigrams use "&&" and "##".
private const string SinglePercent = "%", SinglePound = "#";
private const string DoubleAmpersand = "&&", DoublePound = "##";
72108 }
73109}
0 commit comments