tlx
Loading...
Searching...
No Matches
levenshtein.hpp
Go to the documentation of this file.
1/*******************************************************************************
2 * tlx/string/levenshtein.hpp
3 *
4 * Part of tlx - http://panthema.net/tlx
5 *
6 * Copyright (C) 2007-2018 Timo Bingmann <tb@panthema.net>
7 *
8 * All rights reserved. Published under the Boost Software License, Version 1.0
9 ******************************************************************************/
10
11#ifndef TLX_STRING_LEVENSHTEIN_HEADER
12#define TLX_STRING_LEVENSHTEIN_HEADER
13
14#include <tlx/simple_vector.hpp>
16
17#include <algorithm>
18#include <cstring>
19#include <string>
20
21namespace tlx {
22
23//! \addtogroup tlx_string
24//! \{
25
26// *** Parameter Struct and Algorithm ***
27
28/*!
29 * Standard parameters to levenshtein distance function. Costs are all 1 and
30 * characters are compared directly.
31 */
33 static const unsigned int cost_insert_delete = 1;
34 static const unsigned int cost_replace = 1;
35
36 static inline bool char_equal(const char& a, const char& b)
37 { return (a == b); }
38};
39
40/*!
41 * Standard parameters to Levenshtein distance function. Costs are all 1 and
42 * characters are compared case-insensitively.
43 */
45 static const unsigned int cost_insert_delete = 1;
46 static const unsigned int cost_replace = 1;
47
48 static inline bool char_equal(const char& a, const char& b)
49 { return to_lower(a) == to_lower(b); }
50};
51
52/*!
53 * Computes the Levenshtein string distance also called edit distance between
54 * two strings. The distance is the minimum number of
55 * replacements/inserts/deletes needed to change one string into the
56 * other. Implemented with time complexity O(|n|+|m|) and memory complexity
57 * O(2*max(|n|,|m|))
58 *
59 * \param a first string
60 * \param a_size size of first string
61 * \param b second string
62 * \param b_size size of second string
63 * \return Levenshtein distance
64 */
65template <typename Param>
66static inline
67size_t levenshtein_algorithm(const char* a, size_t a_size,
68 const char* b, size_t b_size) {
69 // if one of the strings is zero, then all characters of the other must
70 // be inserted.
71 if (a_size == 0) return b_size * Param::cost_insert_delete;
72 if (b_size == 0) return a_size * Param::cost_insert_delete;
73
74 // make "as" the longer string and "bs" the shorter.
75 if (a_size < b_size) {
76 std::swap(a, b);
77 std::swap(a_size, b_size);
78 }
79
80 // only allocate two rows of the needed matrix.
81 simple_vector<size_t> lastrow(a_size + 1);
82 simple_vector<size_t> thisrow(a_size + 1);
83
84 // fill this row with ascending ordinals.
85 for (size_t i = 0; i < a_size + 1; i++) {
86 thisrow[i] = i;
87 }
88
89 // compute distance
90 for (size_t j = 1; j < b_size + 1; j++)
91 {
92 // switch rows
93 std::swap(lastrow, thisrow);
94
95 // compute new row
96 thisrow[0] = j;
97
98 for (size_t i = 1; i < a_size + 1; i++)
99 {
100 // three-way mimimum of
101 thisrow[i] = std::min(
102 std::min(
103 // left plus insert cost
104 thisrow[i - 1] + Param::cost_insert_delete,
105 // top plus delete cost
106 lastrow[i] + Param::cost_insert_delete),
107 // top left plus replacement cost
108 lastrow[i - 1] + (
109 Param::char_equal(a[i - 1], b[j - 1])
110 ? 0 : Param::cost_replace)
111 );
112 }
113 }
114
115 // result is in the last cell of the last computed row
116 return thisrow[a_size];
117}
118
119/*!
120 * Computes the Levenshtein string distance between two strings. The distance
121 * is the minimum number of replacements/inserts/deletes needed to change one
122 * string into the other.
123 *
124 * \param a first string
125 * \param b second string
126 * \return Levenshtein distance
127 */
128static inline size_t levenshtein(const char* a, const char* b) {
129 return levenshtein_algorithm<LevenshteinStandardParameters>(
130 a, std::strlen(a), b, std::strlen(b));
131}
132
133/*!
134 * Computes the Levenshtein string distance between two strings. The distance
135 * is the minimum number of replacements/inserts/deletes needed to change one
136 * string into the other.
137 *
138 * \param a first string
139 * \param b second string
140 * \return Levenshtein distance
141 */
142static inline size_t levenshtein_icase(const char* a, const char* b) {
143 return levenshtein_algorithm<LevenshteinStandardICaseParameters>(
144 a, std::strlen(a), b, std::strlen(b));
145}
146
147/*!
148 * Computes the Levenshtein string distance between two strings. The distance
149 * is the minimum number of replacements/inserts/deletes needed to change one
150 * string into the other.
151 *
152 * \param a first string
153 * \param b second string
154 * \return Levenshtein distance
155 */
156static inline size_t levenshtein(const std::string& a, const std::string& b) {
157 return levenshtein_algorithm<LevenshteinStandardParameters>(
158 a.data(), a.size(), b.data(), b.size());
159}
160
161/*!
162 * Computes the Levenshtein string distance between two strings. The distance
163 * is the minimum number of replacements/inserts/deletes needed to change one
164 * string into the other. Character comparison is done case-insensitively.
165 *
166 * \param a first string
167 * \param b second string
168 * \return Levenshtein distance
169 */
170static inline
171size_t levenshtein_icase(const std::string& a, const std::string& b) {
172 return levenshtein_algorithm<LevenshteinStandardICaseParameters>(
173 a.data(), a.size(), b.data(), b.size());
174}
175
176//! \}
177
178} // namespace tlx
179
180#endif // !TLX_STRING_LEVENSHTEIN_HEADER
181
182/******************************************************************************/
Simpler non-growing vector without initialization.
static size_t levenshtein_icase(const char *a, const char *b)
Computes the Levenshtein string distance between two strings.
char to_lower(char ch)
Transform the given character to lower case without any localization.
Definition to_lower.cpp:17
static size_t levenshtein_algorithm(const char *a, size_t a_size, const char *b, size_t b_size)
Computes the Levenshtein string distance also called edit distance between two strings.
static size_t levenshtein(const char *a, const char *b)
Computes the Levenshtein string distance between two strings.
Standard parameters to Levenshtein distance function.
static bool char_equal(const char &a, const char &b)
static const unsigned int cost_insert_delete
static const unsigned int cost_replace
Standard parameters to levenshtein distance function.
static bool char_equal(const char &a, const char &b)
static const unsigned int cost_insert_delete
static const unsigned int cost_replace