// Origin: patch 1a76de69a77c088089ca5d031d084c29144b93d0 by Matt Hill
// (Sun, 7 Dec 2014), "Added function for efficient levenshtein distance and
// unit test". It touches src/openalpr/utility.cpp, src/openalpr/utility.h and
// src/tests/test_utility.cpp. The patch text had lost its line structure, so
// the three additions are laid out below as plain C++.

#include <algorithm>
#include <string>
#include <vector>

// ---------------------------------------------------------------------------
// src/openalpr/utility.h
// ---------------------------------------------------------------------------
int levenshteinDistance (const std::string &s1, const std::string &s2, int max);

// ---------------------------------------------------------------------------
// src/openalpr/utility.cpp
// ---------------------------------------------------------------------------

/// Computes the Levenshtein (edit) distance between two strings, capped at a
/// cutoff.  See http://en.wikipedia.org/wiki/Levenshtein_distance
///
/// Only cells within `max - 1` of the diagonal are evaluated, and the search
/// stops as soon as a whole column exceeds that bound, so the cost stays low
/// when the cutoff is small.
///
/// @param s1   first string
/// @param s2   second string
/// @param max  cutoff (i.e., max distance) where we stop trying to find the
///             distance
/// @return     min(edit distance, max).  A cutoff of zero or less is returned
///             unchanged.
int levenshteinDistance (const std::string &s1, const std::string &s2, int max)
{
  // Nothing can be measured below a cutoff of 1; hand the cutoff straight back.
  if (max <= 0)
    return max;

  const int len1 = static_cast<int>(s1.length());
  const int len2 = static_cast<int>(s2.length());

  // The distance never exceeds the longer length, so tightening the cutoff to
  // that value cannot change the result.  It also keeps "band + i" below from
  // overflowing when the caller passes a huge cutoff such as INT_MAX.
  const int limit = std::min(max, std::max(len1, len2));

  // Largest distance that is still reported exactly; anything above collapses
  // to band + 1 (== limit).
  const int band = limit - 1;

  // Two rolling columns of the DP matrix (heap-backed; no variable-length
  // array on the stack).
  std::vector<int> prev(len2 + 1);
  std::vector<int> cur(len2 + 1);

  // Column 0: turning the empty prefix of s1 into the first j chars of s2.
  for (int j = 0; j <= len2; j++)
    prev[j] = j;

  for (int i = 1; i <= len1; i++)
  {
    const char c1 = s1[i - 1];

    // First and last row index of this column that lie inside the band.
    const int min_j = (i > band) ? (i - band) : 1;
    const int max_j = std::min(len2, band + i);

    cur[0] = i;

    // Smallest value in this column.  It starts from cell 0 so that an empty
    // s2 is measured correctly instead of tripping the early exit.
    int col_min = cur[0];

    for (int j = 1; j <= len2; j++)
    {
      if (j < min_j || j > max_j)
      {
        // Outside the band: the true value is above the bound, so a
        // placeholder just past it is enough.
        cur[j] = band + 1;
      }
      else if (c1 == s2[j - 1])
      {
        // Characters match: carry the diagonal value over at no cost.
        cur[j] = prev[j - 1];
      }
      else
      {
        // Cheapest of deletion, insertion and substitution.
        const int del = prev[j] + 1;
        const int insert = cur[j - 1] + 1;
        const int substitute = prev[j - 1] + 1;
        cur[j] = std::min(del, std::min(insert, substitute));
      }

      col_min = std::min(col_min, cur[j]);
    }

    // Values never decrease along a path, so once a whole column is above the
    // bound no later column can come back under it.
    if (col_min > band)
      return band + 1;

    prev.swap(cur);
  }

  return std::min(prev[len2], band + 1);
}

// ---------------------------------------------------------------------------
// src/tests/test_utility.cpp
// Compiled only when the Catch test framework has been included.
// ---------------------------------------------------------------------------
#ifdef TEST_CASE
TEST_CASE( "Test Levenshtein Distance", "[levenshtein]" ) {

  // Test the maximum works correctly
  REQUIRE( levenshteinDistance("asdf", "bbbb", 10) == 4 );
  REQUIRE( levenshteinDistance("asdf", "bbbb", 4) == 4 );
  REQUIRE( levenshteinDistance("asdf", "bbbb", 3) == 3 );
  REQUIRE( levenshteinDistance("asdf", "bbbb", 2) == 2 );
  REQUIRE( levenshteinDistance("asdf", "bbbb", 1) == 1 );
  REQUIRE( levenshteinDistance("asdf", "bbbb", 0) == 0 );

  // Test some substitutions
  REQUIRE( levenshteinDistance("P32RX", "PE32RX", 10) == 1 );
  REQUIRE( levenshteinDistance("P32RX", "PE32RX", 2) == 1 );
  REQUIRE( levenshteinDistance("ASDF11", "ASDF1", 10) == 1 );
  REQUIRE( levenshteinDistance("1ASDF1", "ASDF1", 10) == 1 );
  REQUIRE( levenshteinDistance("ASD", "ASDF1", 2) == 2 );
  REQUIRE( levenshteinDistance("11111", "11I11", 2) == 1 );

  REQUIRE( levenshteinDistance("", "AAAA", 2) == 2 );
  REQUIRE( levenshteinDistance("BA", "AAAA", 2) == 2 );

  // An empty second string must be measured, not reported as the cutoff
  REQUIRE( levenshteinDistance("AAAA", "", 10) == 4 );
  REQUIRE( levenshteinDistance("AAAA", "", 2) == 2 );
}
#endif