Skip to content

Commit feb3bee

Browse files
committedJun 27, 2015
Add a bunch of optimised fuzzy string matching algorithms.
A new QgsStringUtils class has been added containing some common fuzzy matching algorithms, including Levenshtein edit distance and Soundex. These can be used for finding "similar" strings in a table. Expression functions for these algorithms have also been added to a new "Fuzzy Matching" group.
1 parent 79305b2 commit feb3bee

File tree

13 files changed

+661
-2
lines changed

13 files changed

+661
-2
lines changed
 

‎python/core/core.sip

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
%Include qgssnappingutils.sip
105105
%Include qgsspatialindex.sip
106106
%Include qgsstatisticalsummary.sip
107+
%Include qgsstringutils.sip
107108
%Include qgstolerance.sip
108109
%Include qgsvectordataprovider.sip
109110
%Include qgsvectorfilewriter.sip

‎python/core/qgsstringutils.sip

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/** \ingroup core
2+
* \class QgsStringUtils
3+
* \brief Utility functions for working with strings.
4+
* \note Added in version 2.11
5+
*/
6+
7+
class QgsStringUtils
8+
{
9+
%TypeHeaderCode
10+
#include <qgsstringutils.h>
11+
%End
12+
13+
public:
14+
/** Returns the Levenshtein edit distance between two strings. This equates to the minimum
15+
* number of character edits (insertions, deletions or substitutions) required to change
16+
* one string to another.
17+
* @param string1 first string
18+
* @param string2 second string
19+
* @param caseSensitive set to true for case sensitive comparison
20+
* @returns edit distance. Lower distances indicate more similiar strings.
21+
*/
22+
static int levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive = false );
23+
24+
/** Returns the longest common substring between two strings. This substring is the longest
25+
* string that is a substring of the two input strings. Eg, the longest common substring
26+
* of "ABABC" and "BABCA" is "ABC".
27+
* @param string1 first string
28+
* @param string2 second string
29+
* @param caseSensitive set to true for case sensitive comparison
30+
* @returns longest common substring
31+
*/
32+
static QString longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive = false );
33+
34+
/** Returns the Hamming distance between two strings. This equates to the number of characters at
35+
* corresponding positions within the input strings where the characters are different. The input
36+
* strings must be the same length.
37+
* @param string1 first string
38+
* @param string2 second string
39+
* @param caseSensitive set to true for case sensitive comparison
40+
* @returns Hamming distance between strings, or -1 if strings are different lengths.
41+
*/
42+
static int hammingDistance( const QString &string1, const QString &string2, bool caseSensitive = false );
43+
44+
/** Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
45+
* so strings with similar sounds should be represented by the same Soundex code.
46+
* @param string input string
47+
* @returns 4 letter Soundex code
48+
*/
49+
static QString soundex( const QString &string );
50+
51+
};
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<h3>hamming_distance function</h3>
2+
Returns the Hamming distance between two strings. This equates to the number of characters at
3+
corresponding positions within the input strings where the characters are different. The input
4+
strings must be the same length, and the comparison is case-sensitive.
5+
6+
<h4>Syntax</h4>
7+
<pre>hamming_distance(string1,string2)</pre>
8+
9+
<h4>Arguments</h4>
10+
string1 &rarr; a string<br />
11+
string2 &rarr; a string<br />
12+
13+
<h4>Example</h4>
14+
<pre> hamming_distance('abc','xec') &rarr; 2</pre><br />
15+
<pre> hamming_distance('abc','ABc') &rarr; 2</pre><br />
16+
<pre> hamming_distance(upper('abc'),upper('ABC')) &rarr; 0</pre>
17+

‎resources/function_help/levenshtein

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<h3>levenshtein function</h3>
2+
Returns the Levenshtein edit distance between two strings. This equates to the minimum
3+
number of character edits (insertions, deletions or substitutions) required to change
4+
one string to another.<br />
5+
The Levenshtein distance is a measure of the similarity between two strings. Smaller
6+
distances mean the strings are more similar, and larger distances indicate more
7+
different strings. The distance is case sensitive.
8+
9+
<h4>Syntax</h4>
10+
<pre>levenshtein(string1,string2)</pre>
11+
12+
<h4>Arguments</h4>
13+
string1 &rarr; a string<br />
14+
string2 &rarr; a string<br />
15+
16+
<h4>Example</h4>
17+
<pre> levenshtein('kittens','mitten') &rarr; 2</pre><br />
18+
<pre> levenshtein('Kitten','kitten') &rarr; 1</pre><br />
19+
<pre> levenshtein(upper('Kitten'),upper('kitten')) &rarr; 0</pre>
20+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<h3>longest_common_substring function</h3>
2+
Returns the longest common substring between two strings. This substring is the longest
3+
string that is a substring of the two input strings. Eg, the longest common substring
4+
of "ABABC" and "BABCA" is "ABC". The substring is case sensitive.
5+
6+
<h4>Syntax</h4>
7+
<pre>longest_common_substring(string1,string2)</pre>
8+
9+
<h4>Arguments</h4>
10+
string1 &rarr; a string<br />
11+
string2 &rarr; a string<br />
12+
13+
<h4>Example</h4>
14+
<pre> longest_common_substring('ABABC','BABCA') &rarr; 'ABC'</pre><br />
15+
<pre> longest_common_substring('abcDeF','abcdef') &rarr; 'abc'</pre><br />
16+
<pre> longest_common_substring(upper('abcDeF'),upper('abcdex')) &rarr; 'ABCDE'</pre>
17+

‎resources/function_help/soundex

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<h3>soundex function</h3>
2+
Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
3+
so strings with similar sounds should be represented by the same Soundex code.
4+
5+
<h4>Syntax</h4>
6+
<pre>soundex(string)</pre>
7+
8+
<h4>Arguments</h4>
9+
string &rarr; a string
10+
11+
<h4>Example</h4>
12+
<pre> soundex('robert') &rarr; 'R163'</pre><br />
13+
<pre> soundex('rupert') &rarr; 'R163'</pre><br />
14+
<pre> soundex('rubin') &rarr; 'R150'</pre>
15+

‎src/core/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ SET(QGIS_CORE_SRCS
169169
qgssnappingutils.cpp
170170
qgsspatialindex.cpp
171171
qgsstatisticalsummary.cpp
172+
qgsstringutils.cpp
172173
qgstransaction.cpp
173174
qgstolerance.cpp
174175
qgsvectordataprovider.cpp
@@ -590,6 +591,7 @@ SET(QGIS_CORE_HDRS
590591
qgssnappingutils.h
591592
qgsspatialindex.h
592593
qgsstatisticalsummary.h
594+
qgsstringutils.h
593595
qgstolerance.h
594596
qgstransaction.h
595597
qgsvectordataprovider.h

‎src/core/qgsexpression.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "qgssymbollayerv2utils.h"
3636
#include "qgsvectorcolorrampv2.h"
3737
#include "qgsstylev2.h"
38+
#include "qgsstringutils.h"
3839

3940
// from parser
4041
extern QgsExpression::Node* parseExpression( const QString& str, QString& parserErrorMsg );
@@ -657,6 +658,34 @@ static QVariant fcnTrim( const QVariantList& values, const QgsFeature*, QgsExpre
657658
return QVariant( str.trimmed() );
658659
}
659660

661+
static QVariant fcnLevenshtein( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
662+
{
663+
QString string1 = getStringValue( values.at( 0 ), parent );
664+
QString string2 = getStringValue( values.at( 1 ), parent );
665+
return QVariant( QgsStringUtils::levenshteinDistance( string1, string2, true) );
666+
}
667+
668+
static QVariant fcnLCS( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
669+
{
670+
QString string1 = getStringValue( values.at( 0 ), parent );
671+
QString string2 = getStringValue( values.at( 1 ), parent );
672+
return QVariant( QgsStringUtils::longestCommonSubstring( string1, string2, true ) );
673+
}
674+
675+
static QVariant fcnHamming( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
676+
{
677+
QString string1 = getStringValue( values.at( 0 ), parent );
678+
QString string2 = getStringValue( values.at( 1 ), parent );
679+
int dist = QgsStringUtils::hammingDistance( string1, string2 );
680+
return ( dist < 0 ? QVariant() : QVariant( QgsStringUtils::hammingDistance( string1, string2, true ) ) );
681+
}
682+
683+
static QVariant fcnSoundex( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
684+
{
685+
QString string = getStringValue( values.at( 0 ), parent );
686+
return QVariant( QgsStringUtils::soundex( string ) );
687+
}
688+
660689
static QVariant fcnWordwrap( const QVariantList& values, const QgsFeature*, QgsExpression* parent )
661690
{
662691
if ( values.length() == 2 || values.length() == 3 )
@@ -1697,7 +1726,8 @@ const QStringList& QgsExpression::BuiltinFunctions()
16971726
<< "distance" << "intersection" << "sym_difference" << "combine"
16981727
<< "union" << "geom_to_wkt" << "geomToWKT" << "geometry"
16991728
<< "transform" << "get_feature" << "getFeature"
1700-
<< "attribute"
1729+
<< "attribute" << "levenshtein" << "longest_common_substring" << "hamming_distance"
1730+
<< "soundex"
17011731
<< "$rownum" << "$id" << "$scale" << "_specialcol_";
17021732
}
17031733
return gmBuiltinFunctions;
@@ -1757,6 +1787,10 @@ const QList<QgsExpression::Function*>& QgsExpression::Functions()
17571787
<< new StaticFunction( "upper", 1, fcnUpper, "String" )
17581788
<< new StaticFunction( "title", 1, fcnTitle, "String" )
17591789
<< new StaticFunction( "trim", 1, fcnTrim, "String" )
1790+
<< new StaticFunction( "levenshtein", 2, fcnLevenshtein, "Fuzzy Matching" )
1791+
<< new StaticFunction( "longest_common_substring", 2, fcnLCS, "Fuzzy Matching" )
1792+
<< new StaticFunction( "hamming_distance", 2, fcnHamming, "Fuzzy Matching" )
1793+
<< new StaticFunction( "soundex", 1, fcnSoundex, "Fuzzy Matching" )
17601794
<< new StaticFunction( "wordwrap", -1, fcnWordwrap, "String" )
17611795
<< new StaticFunction( "length", 1, fcnLength, "String" )
17621796
<< new StaticFunction( "replace", 3, fcnReplace, "String" )

‎src/core/qgsstringutils.cpp

Lines changed: 296 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,296 @@
1+
/***************************************************************************
2+
qgsstringutils.cpp
3+
------------------
4+
begin : June 2015
5+
copyright : (C) 2015 by Nyall Dawson
6+
email : nyall dot dawson at gmail dot com
7+
***************************************************************************
8+
* *
9+
* This program is free software; you can redistribute it and/or modify *
10+
* it under the terms of the GNU General Public License as published by *
11+
* the Free Software Foundation; either version 2 of the License, or *
12+
* (at your option) any later version. *
13+
* *
14+
***************************************************************************/
15+
16+
#include "qgsstringutils.h"
17+
#include <QVector>
18+
19+
int QgsStringUtils::levenshteinDistance( const QString& string1, const QString& string2, bool caseSensitive )
20+
{
21+
int length1 = string1.length();
22+
int length2 = string2.length();
23+
24+
//empty strings? solution is trivial...
25+
if ( string1.isEmpty() )
26+
{
27+
return length2;
28+
}
29+
else if ( string2.isEmpty() )
30+
{
31+
return length1;
32+
}
33+
34+
//handle case sensitive flag (or not)
35+
QString s1( caseSensitive ? string1 : string1.toLower() );
36+
QString s2( caseSensitive ? string2 : string2.toLower() );
37+
38+
const QChar* s1Char = s1.constData();
39+
const QChar* s2Char = s2.constData();
40+
41+
//strip out any common prefix
42+
int commonPrefixLen = 0;
43+
while ( length1 > 0 && length2 > 0 && *s1Char == *s2Char )
44+
{
45+
commonPrefixLen++;
46+
length1--;
47+
length2--;
48+
s1Char++;
49+
s2Char++;
50+
}
51+
52+
//strip out any common suffix
53+
while ( length1 > 0 && length2 > 0 && s1.at( commonPrefixLen + length1 - 1 ) == s2.at( commonPrefixLen + length2 - 1 ) )
54+
{
55+
length1--;
56+
length2--;
57+
}
58+
59+
//fully checked either string? if so, the answer is easy...
60+
if ( length1 == 0 )
61+
{
62+
return length2;
63+
}
64+
else if ( length2 == 0 )
65+
{
66+
return length1;
67+
}
68+
69+
//ensure the inner loop is longer
70+
if ( length1 > length2 )
71+
{
72+
qSwap( s1, s2 );
73+
qSwap( length1, length2 );
74+
}
75+
76+
//levenshtein algorithm begins here
77+
QVector< int > col;
78+
col.fill( 0, length2 + 1 );
79+
QVector< int > prevCol;
80+
prevCol.reserve( length2 + 1 );
81+
for ( int i = 0; i < length2 + 1; ++i )
82+
{
83+
prevCol << i;
84+
}
85+
const QChar* s2start = s2Char;
86+
for ( int i = 0; i < length1; ++i )
87+
{
88+
col[0] = i + 1;
89+
s2Char = s2start;
90+
for ( int j = 0; j < length2; ++j )
91+
{
92+
col[j + 1] = qMin( qMin( 1 + col[j], 1 + prevCol[1 + j] ), prevCol[j] + (( *s1Char == *s2Char ) ? 0 : 1 ) );
93+
s2Char++;
94+
}
95+
col.swap( prevCol );
96+
s1Char++;
97+
}
98+
return prevCol[length2];
99+
}
100+
101+
QString QgsStringUtils::longestCommonSubstring( const QString& string1, const QString& string2, bool caseSensitive )
102+
{
103+
if ( string1.isEmpty() || string2.isEmpty() )
104+
{
105+
//empty strings, solution is trivial...
106+
return QString();
107+
}
108+
109+
//handle case sensitive flag (or not)
110+
QString s1( caseSensitive ? string1 : string1.toLower() );
111+
QString s2( caseSensitive ? string2 : string2.toLower() );
112+
113+
if ( s1 == s2 )
114+
{
115+
//another trivial case, identical strings
116+
return s1;
117+
}
118+
119+
int* currentScores = new int [ s2.length()];
120+
int* previousScores = new int [ s2.length()];
121+
int maxCommonLength = 0;
122+
int lastMaxBeginIndex = 0;
123+
124+
const QChar* s1Char = s1.constData();
125+
const QChar* s2Char = s2.constData();
126+
const QChar* s2Start = s2Char;
127+
128+
for ( int i = 0; i < s1.length(); ++i )
129+
{
130+
for ( int j = 0; j < s2.length(); ++j )
131+
{
132+
if ( *s1Char != *s2Char )
133+
{
134+
currentScores[j] = 0;
135+
}
136+
else
137+
{
138+
if ( i == 0 || j == 0 )
139+
{
140+
currentScores[j] = 1;
141+
}
142+
else
143+
{
144+
currentScores[j] = 1 + previousScores[j - 1];
145+
}
146+
147+
if ( maxCommonLength < currentScores[j] )
148+
{
149+
maxCommonLength = currentScores[j];
150+
lastMaxBeginIndex = i;
151+
}
152+
}
153+
s2Char++;
154+
}
155+
qSwap( currentScores, previousScores );
156+
s1Char++;
157+
s2Char = s2Start;
158+
}
159+
delete [] currentScores;
160+
delete [] previousScores;
161+
return string1.mid( lastMaxBeginIndex - maxCommonLength + 1, maxCommonLength );
162+
}
163+
164+
int QgsStringUtils::hammingDistance( const QString& string1, const QString& string2, bool caseSensitive )
165+
{
166+
if ( string1.isEmpty() && string2.isEmpty() )
167+
{
168+
//empty strings, solution is trivial...
169+
return 0;
170+
}
171+
172+
if ( string1.length() != string2.length() )
173+
{
174+
//invalid inputs
175+
return -1;
176+
}
177+
178+
//handle case sensitive flag (or not)
179+
QString s1( caseSensitive ? string1 : string1.toLower() );
180+
QString s2( caseSensitive ? string2 : string2.toLower() );
181+
182+
if ( s1 == s2 )
183+
{
184+
//another trivial case, identical strings
185+
return 0;
186+
}
187+
188+
int distance = 0;
189+
const QChar* s1Char = s1.constData();
190+
const QChar* s2Char = s2.constData();
191+
192+
for ( int i = 0; i < string1.length(); ++i )
193+
{
194+
if ( *s1Char != *s2Char )
195+
distance++;
196+
s1Char++;
197+
s2Char++;
198+
}
199+
200+
return distance;
201+
}
202+
203+
QString QgsStringUtils::soundex( const QString& string )
204+
{
205+
if ( string.isEmpty() )
206+
return QString();
207+
208+
QString tmp = string.toUpper();
209+
210+
//strip non character codes, and vowel like characters after the first character
211+
QChar* char1 = tmp.data();
212+
QChar* char2 = tmp.data();
213+
int outLen = 0;
214+
for ( int i = 0; i < tmp.length(); ++i, ++char2 )
215+
{
216+
if (( *char2 ).unicode() >= 0x41 && ( *char2 ).unicode() <= 0x5A && ( i == 0 || (( *char2 ).unicode() != 0x41 && ( *char2 ).unicode() != 0x45
217+
&& ( *char2 ).unicode() != 0x48 && ( *char2 ).unicode() != 0x49
218+
&& ( *char2 ).unicode() != 0x4F && ( *char2 ).unicode() != 0x55
219+
&& ( *char2 ).unicode() != 0x57 && ( *char2 ).unicode() != 0x59 ) ) )
220+
{
221+
*char1 = *char2;
222+
char1++;
223+
outLen++;
224+
}
225+
}
226+
tmp.truncate( outLen );
227+
228+
QChar* tmpChar = tmp.data();
229+
tmpChar++;
230+
for ( int i = 1; i < tmp.length(); ++i, ++tmpChar )
231+
{
232+
switch (( *tmpChar ).unicode() )
233+
{
234+
case 0x42:
235+
case 0x46:
236+
case 0x50:
237+
case 0x56:
238+
tmp.replace( i, 1, QChar( 0x31 ) );
239+
break;
240+
241+
case 0x43:
242+
case 0x47:
243+
case 0x4A:
244+
case 0x4B:
245+
case 0x51:
246+
case 0x53:
247+
case 0x58:
248+
case 0x5A:
249+
tmp.replace( i, 1, QChar( 0x32 ) );
250+
break;
251+
252+
case 0x44:
253+
case 0x54:
254+
tmp.replace( i, 1, QChar( 0x33 ) );
255+
break;
256+
257+
case 0x4C:
258+
tmp.replace( i, 1, QChar( 0x34 ) );
259+
break;
260+
261+
case 0x4D:
262+
case 0x4E:
263+
tmp.replace( i, 1, QChar( 0x35 ) );
264+
break;
265+
266+
case 0x52:
267+
tmp.replace( i, 1, QChar( 0x36 ) );
268+
break;
269+
}
270+
}
271+
272+
//remove adjacent duplicates
273+
char1 = tmp.data();
274+
char2 = tmp.data();
275+
char2++;
276+
outLen = 1;
277+
for ( int i = 1; i < tmp.length(); ++i, ++char2 )
278+
{
279+
if ( *char2 != *char1 )
280+
{
281+
char1++;
282+
*char1 = *char2;
283+
outLen++;
284+
if ( outLen == 4 )
285+
break;
286+
}
287+
}
288+
tmp.truncate( outLen );
289+
if ( tmp.length() < 4 )
290+
{
291+
tmp.append( "000" );
292+
tmp.truncate( 4 );
293+
}
294+
295+
return tmp;
296+
}

‎src/core/qgsstringutils.h

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/***************************************************************************
2+
qgsstringutils.h
3+
----------------
4+
begin : June 2015
5+
copyright : (C) 2015 by Nyall Dawson
6+
email : nyall dot dawson at gmail dot com
7+
***************************************************************************
8+
* *
9+
* This program is free software; you can redistribute it and/or modify *
10+
* it under the terms of the GNU General Public License as published by *
11+
* the Free Software Foundation; either version 2 of the License, or *
12+
* (at your option) any later version. *
13+
* *
14+
***************************************************************************/
15+
16+
#include <QString>
17+
18+
#ifndef QGSSTRINGUTILS_H
19+
#define QGSSTRINGUTILS_H
20+
21+
/** \ingroup core
22+
* \class QgsStringUtils
23+
* \brief Utility functions for working with strings.
24+
* \note Added in version 2.11
25+
*/
26+
27+
class CORE_EXPORT QgsStringUtils
28+
{
29+
public:
30+
31+
/** Returns the Levenshtein edit distance between two strings. This equates to the minimum
32+
* number of character edits (insertions, deletions or substitutions) required to change
33+
* one string to another.
34+
* @param string1 first string
35+
* @param string2 second string
36+
* @param caseSensitive set to true for case sensitive comparison
37+
* @returns edit distance. Lower distances indicate more similiar strings.
38+
*/
39+
static int levenshteinDistance( const QString &string1, const QString &string2, bool caseSensitive = false );
40+
41+
/** Returns the longest common substring between two strings. This substring is the longest
42+
* string that is a substring of the two input strings. Eg, the longest common substring
43+
* of "ABABC" and "BABCA" is "ABC".
44+
* @param string1 first string
45+
* @param string2 second string
46+
* @param caseSensitive set to true for case sensitive comparison
47+
* @returns longest common substring
48+
*/
49+
static QString longestCommonSubstring( const QString &string1, const QString &string2, bool caseSensitive = false );
50+
51+
/** Returns the Hamming distance between two strings. This equates to the number of characters at
52+
* corresponding positions within the input strings where the characters are different. The input
53+
* strings must be the same length.
54+
* @param string1 first string
55+
* @param string2 second string
56+
* @param caseSensitive set to true for case sensitive comparison
57+
* @returns Hamming distance between strings, or -1 if strings are different lengths.
58+
*/
59+
static int hammingDistance( const QString &string1, const QString &string2, bool caseSensitive = false );
60+
61+
/** Returns the Soundex representation of a string. Soundex is a phonetic matching algorithm,
62+
* so strings with similar sounds should be represented by the same Soundex code.
63+
* @param string input string
64+
* @returns 4 letter Soundex code
65+
*/
66+
static QString soundex( const QString &string );
67+
};
68+
69+
#endif //QGSSTRINGUTILS_H

‎tests/src/core/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,6 @@ ADD_QGIS_TEST(histogramtest testqgshistogram.cpp)
162162
ADD_QGIS_TEST(pallabelingtest testqgspallabeling.cpp)
163163
ADD_QGIS_TEST(graduatedsymbolrenderertest testqgsgraduatedsymbolrenderer.cpp)
164164
ADD_QGIS_TEST(fontutils testqgsfontutils.cpp)
165-
165+
ADD_QGIS_TEST(stringutilstest testqgsstringutils.cpp)
166166

167167

‎tests/src/core/testqgsexpression.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,19 @@ class TestQgsExpression: public QObject
390390
QTest::newRow( "concat function single" ) << "concat('a')" << false << QVariant( "a" );
391391
QTest::newRow( "concat function with NULL" ) << "concat(NULL,'a','b')" << false << QVariant( "ab" );
392392

393+
//fuzzy matching
394+
QTest::newRow( "levenshtein" ) << "levenshtein('kitten','sitting')" << false << QVariant( 3 );
395+
QTest::newRow( "levenshtein" ) << "levenshtein('kitten','kiTTen')" << false << QVariant( 2 );
396+
QTest::newRow( "levenshtein" ) << "levenshtein('','')" << false << QVariant( 0 );
397+
QTest::newRow( "longest_common_substring" ) << "longest_common_substring('expression','impression')" << false << QVariant( "pression" );
398+
QTest::newRow( "longest_common_substring" ) << "longest_common_substring('abCdE','abcde')" << false << QVariant( "ab" );
399+
QTest::newRow( "longest_common_substring" ) << "longest_common_substring('','')" << false << QVariant( "" );
400+
QTest::newRow( "hamming_distance" ) << "hamming_distance('abc','xec')" << false << QVariant( 2 );
401+
QTest::newRow( "hamming_distance" ) << "hamming_distance('abc','ABc')" << false << QVariant( 2 );
402+
QTest::newRow( "hamming_distance" ) << "hamming_distance('abcd','xec')" << false << QVariant();
403+
QTest::newRow( "soundex" ) << "soundex('jackson')" << false << QVariant( "J250" );
404+
QTest::newRow( "soundex" ) << "soundex('')" << false << QVariant( "" );
405+
393406
// implicit conversions
394407
QTest::newRow( "implicit int->text" ) << "length(123)" << false << QVariant( 3 );
395408
QTest::newRow( "implicit double->text" ) << "length(1.23)" << false << QVariant( 4 );

‎tests/src/core/testqgsstringutils.cpp

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/***************************************************************************
2+
testqgsstringutils.cpp
3+
----------------------
4+
begin : June 2015
5+
copyright : (C) 2015 by Nyall Dawson
6+
email : nyall dot dawson at gmail dot com
7+
***************************************************************************/
8+
9+
/***************************************************************************
10+
* *
11+
* This program is free software; you can redistribute it and/or modify *
12+
* it under the terms of the GNU General Public License as published by *
13+
* the Free Software Foundation; either version 2 of the License, or *
14+
* (at your option) any later version. *
15+
* *
16+
***************************************************************************/
17+
18+
#include "qgsstringutils.h"
19+
#include <QObject>
20+
#include <QtTest/QtTest>
21+
22+
class TestQgsStringUtils : public QObject
23+
{
24+
Q_OBJECT
25+
26+
private slots:
27+
void initTestCase();// will be called before the first testfunction is executed.
28+
void cleanupTestCase();// will be called after the last testfunction was executed.
29+
void init();// will be called before each testfunction is executed.
30+
void cleanup();// will be called after every testfunction.
31+
32+
void levenshtein();
33+
void longestCommonSubstring();
34+
void hammingDistance();
35+
void soundex();
36+
37+
};
38+
39+
void TestQgsStringUtils::initTestCase()
40+
{
41+
42+
}
43+
44+
void TestQgsStringUtils::cleanupTestCase()
45+
{
46+
47+
}
48+
49+
void TestQgsStringUtils::init()
50+
{
51+
}
52+
53+
void TestQgsStringUtils::cleanup()
54+
{
55+
}
56+
57+
void TestQgsStringUtils::levenshtein()
58+
{
59+
QCOMPARE( QgsStringUtils::levenshteinDistance( QString(), QString() ), 0 );
60+
QCOMPARE( QgsStringUtils::levenshteinDistance( "abc", QString() ), 3 );
61+
QCOMPARE( QgsStringUtils::levenshteinDistance( QString(), "abc" ), 3 );
62+
QCOMPARE( QgsStringUtils::levenshteinDistance( "abc", "abc" ), 0 );
63+
QCOMPARE( QgsStringUtils::levenshteinDistance( "abc", "aBc", true ), 1 );
64+
QCOMPARE( QgsStringUtils::levenshteinDistance( "abc", "xec" ), 2 );
65+
QCOMPARE( QgsStringUtils::levenshteinDistance( "abc", "abd" ), 1 );
66+
QCOMPARE( QgsStringUtils::levenshteinDistance( "abc", "ebg" ), 2 );
67+
QCOMPARE( QgsStringUtils::levenshteinDistance( "kitten", "sitting" ), 3 );
68+
QCOMPARE( QgsStringUtils::levenshteinDistance( "kItten", "sitting" ), 3 );
69+
QCOMPARE( QgsStringUtils::levenshteinDistance( "kitten", "sitTing", true ), 4 );
70+
QCOMPARE( QgsStringUtils::levenshteinDistance( "kitten", "xkitte" ), 2 );
71+
}
72+
73+
void TestQgsStringUtils::longestCommonSubstring()
74+
{
75+
QCOMPARE( QgsStringUtils::longestCommonSubstring( QString(), QString() ), QString() );
76+
QCOMPARE( QgsStringUtils::longestCommonSubstring( "abc", QString() ), QString() );
77+
QCOMPARE( QgsStringUtils::longestCommonSubstring( QString(), "abc" ), QString() );
78+
QCOMPARE( QgsStringUtils::longestCommonSubstring( "abc", "def" ), QString() );
79+
QCOMPARE( QgsStringUtils::longestCommonSubstring( "abc", "abd" ), QString( "ab" ) );
80+
QCOMPARE( QgsStringUtils::longestCommonSubstring( "abc", "xbc" ), QString( "bc" ) );
81+
QCOMPARE( QgsStringUtils::longestCommonSubstring( "abc", "xbd" ), QString( "b" ) );
82+
QCOMPARE( QgsStringUtils::longestCommonSubstring( "longer test", "inger task" ), QString( "nger t" ) );
83+
QCOMPARE( QgsStringUtils::longestCommonSubstring( "lonGer test", "inger task", true ), QString( "er t" ) );
84+
}
85+
86+
void TestQgsStringUtils::hammingDistance()
87+
{
88+
QCOMPARE( QgsStringUtils::hammingDistance( QString(), QString() ), 0 );
89+
QCOMPARE( QgsStringUtils::hammingDistance( "abc", QString() ), -1 );
90+
QCOMPARE( QgsStringUtils::hammingDistance( QString(), "abc" ), -1 );
91+
QCOMPARE( QgsStringUtils::hammingDistance( "abc", "abcd" ), -1 );
92+
QCOMPARE( QgsStringUtils::hammingDistance( "abcd", "abc" ), -1 );
93+
QCOMPARE( QgsStringUtils::hammingDistance( "abc", "abc" ), 0 );
94+
QCOMPARE( QgsStringUtils::hammingDistance( "abc", "aBc", true ), 1 );
95+
QCOMPARE( QgsStringUtils::hammingDistance( "abc", "xec" ), 2 );
96+
QCOMPARE( QgsStringUtils::hammingDistance( "abc", "abd" ), 1 );
97+
QCOMPARE( QgsStringUtils::hammingDistance( "abc", "ebg" ), 2 );
98+
QCOMPARE( QgsStringUtils::hammingDistance( "kitten", "sittin" ), 2 );
99+
QCOMPARE( QgsStringUtils::hammingDistance( "kItten", "sittin" ), 2 );
100+
QCOMPARE( QgsStringUtils::hammingDistance( "kitten", "sitTin", true ), 3 );
101+
QCOMPARE( QgsStringUtils::hammingDistance( "kitten", "xkitte" ), 5 );
102+
}
103+
104+
void TestQgsStringUtils::soundex()
105+
{
106+
QCOMPARE( QgsStringUtils::soundex( QString() ), QString() );
107+
//test data from jellyfish & fuzzycomp python libraries
108+
QCOMPARE( QgsStringUtils::soundex( "Washington" ), QString( "W252" ) );
109+
QCOMPARE( QgsStringUtils::soundex( "Lee" ), QString( "L000" ) );
110+
QCOMPARE( QgsStringUtils::soundex( "Gutierrez" ), QString( "G362" ) );
111+
QCOMPARE( QgsStringUtils::soundex( "Jackson" ), QString( "J250" ) );
112+
QCOMPARE( QgsStringUtils::soundex( "a" ), QString( "A000" ) );
113+
QCOMPARE( QgsStringUtils::soundex( "herman" ), QString( "H650" ) );
114+
QCOMPARE( QgsStringUtils::soundex( "robert" ), QString( "R163" ) );
115+
QCOMPARE( QgsStringUtils::soundex( "RuperT" ), QString( "R163" ) );
116+
QCOMPARE( QgsStringUtils::soundex( "rubin" ), QString( "R150" ) );
117+
QCOMPARE( QgsStringUtils::soundex( "ashcraft" ), QString( "A261" ) );
118+
QCOMPARE( QgsStringUtils::soundex( "ashcroft" ), QString( "A261" ) );
119+
}
120+
121+
122+
QTEST_MAIN( TestQgsStringUtils )
123+
#include "testqgsstringutils.moc"
124+

0 commit comments

Comments
 (0)
Please sign in to comment.