Fallback with Maximal Common Substring if no similarity found
This commit is contained in:
45
README.md
45
README.md
@@ -1,11 +1,12 @@
|
|||||||
# fuzzy-match
|
# fuzzy-match
|
||||||
|
|
||||||
A fast command-line tool for fuzzy string matching using the Damerau-Levenshtein distance algorithm.
|
A fast command-line tool for fuzzy string matching using the Damerau-Levenshtein distance algorithm, with a longest-common-substring fallback when no strong match is found.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- **Damerau-Levenshtein Distance**: Measures similarity between strings accounting for insertions, deletions, substitutions, and transpositions
|
- **Damerau-Levenshtein Distance**: Measures similarity between strings accounting for insertions, deletions, substitutions, and transpositions
|
||||||
- **Normalized Scoring**: Calculates similarity score as `distance / MAX(queryLength, lineLength)` for fair comparison regardless of string lengths
|
- **Normalized Scoring**: Calculates similarity score as `1 - distance / MAX(queryLength, lineLength)` so higher scores are better
|
||||||
|
- **Fallback Matching**: If the best Damerau-Levenshtein similarity is below `0.5`, recalculates every score using the maximal common substring length
|
||||||
- **Sorted Output**: Results are sorted by similarity score (best matches first)
|
- **Sorted Output**: Results are sorted by similarity score (best matches first)
|
||||||
- **Efficient Processing**: Handles large input streams with dynamic memory allocation
|
- **Efficient Processing**: Handles large input streams with dynamic memory allocation
|
||||||
|
|
||||||
@@ -41,14 +42,14 @@ echo -e "apple\napple pie\norange\nbanana\nappl" | fuzzy-match "apple"
|
|||||||
|
|
||||||
### Output Format
|
### Output Format
|
||||||
|
|
||||||
Each line is printed with its similarity score (lower is more similar):
|
Each line is printed with its similarity score (higher is more similar):
|
||||||
|
|
||||||
```
|
```
|
||||||
0.0000 apple
|
1.0000 apple
|
||||||
0.2000 appl
|
0.8000 appl
|
||||||
0.5000 apple pie
|
0.5556 apple pie
|
||||||
0.6667 banana
|
0.1667 banana
|
||||||
1.0000 orange
|
0.1667 orange
|
||||||
```
|
```
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
@@ -56,28 +57,36 @@ Each line is printed with its similarity score (lower is more similar):
|
|||||||
### Basic matching
|
### Basic matching
|
||||||
```bash
|
```bash
|
||||||
$ echo -e "cat\ncar\ndog\nhat" | fuzzy-match "cat"
|
$ echo -e "cat\ncar\ndog\nhat" | fuzzy-match "cat"
|
||||||
0.0000 cat
|
1.0000 cat
|
||||||
0.3333 car
|
0.6667 car
|
||||||
0.6667 hat
|
0.6667 hat
|
||||||
1.0000 dog
|
0.0000 dog
|
||||||
```
|
```
|
||||||
|
|
||||||
### Matching with typos
|
### Matching with typos
|
||||||
```bash
|
```bash
|
||||||
$ echo -e "programming\nprograming\nprogram\nprogamming" | fuzzy-match "programming"
|
$ echo -e "programming\nprograming\nprogram\nprogamming" | fuzzy-match "programming"
|
||||||
0.0000 programming
|
1.0000 programming
|
||||||
0.0909 programing
|
0.9091 programing
|
||||||
0.1818 progamming
|
0.9091 progamming
|
||||||
0.3333 program
|
0.6364 program
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Fallback to maximal common substring
|
||||||
|
If no Damerau-Levenshtein similarity reaches `0.5`, every score is recalculated using the longest common substring length instead.
|
||||||
|
|
||||||
## Algorithm
|
## Algorithm
|
||||||
|
|
||||||
The program implements the **Damerau-Levenshtein distance** algorithm, which measures the minimum number of single-character edits (insertions, deletions, substitutions, and transpositions) needed to transform one string into another.
|
The program first computes a **Damerau-Levenshtein similarity**, based on the minimum number of single-character edits (insertions, deletions, substitutions, and transpositions) needed to transform one string into another.
|
||||||
|
|
||||||
The similarity score is normalized to account for string length differences:
|
The primary similarity score is normalized to account for string length differences:
|
||||||
```
|
```
|
||||||
similarity_score = damerau_levenshtein_distance / MAX(query_length, line_length)
|
similarity_score = 1 - damerau_levenshtein_distance / MAX(query_length, line_length)
|
||||||
|
```
|
||||||
|
|
||||||
|
If the highest primary similarity is below `0.5`, the program recalculates every score using the maximal common substring length instead:
|
||||||
|
```
|
||||||
|
similarity_score = longest_common_substring_length / MIN(query_length, line_length)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|||||||
@@ -6,6 +6,9 @@ typedef struct {
|
|||||||
char *line;
|
char *line;
|
||||||
int distance;
|
int distance;
|
||||||
size_t max_len;
|
size_t max_len;
|
||||||
|
size_t min_len;
|
||||||
|
size_t common_substring_length;
|
||||||
|
double score;
|
||||||
} LineScore;
|
} LineScore;
|
||||||
|
|
||||||
static void freeLineScores(LineScore *lines, const size_t line_count) {
|
static void freeLineScores(LineScore *lines, const size_t line_count) {
|
||||||
@@ -13,9 +16,34 @@ static void freeLineScores(LineScore *lines, const size_t line_count) {
|
|||||||
free(lines);
|
free(lines);
|
||||||
}
|
}
|
||||||
|
|
||||||
static double similarityScore(const int distance, const size_t max_len) {
|
static double damerauLevenshteinSimilarity(const int distance, const size_t max_len) {
|
||||||
if (max_len == 0) return 0.0;
|
if (max_len == 0) return 1.0;
|
||||||
return (double) distance / (double) max_len;
|
|
||||||
|
return 1.0 - (double) distance / (double) max_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether the first `len` bytes of `s1` and `s2` are identical.
 *
 * The comparison is purely byte-wise and does not stop at a NUL terminator,
 * so both pointers must reference at least `len` readable bytes.
 *
 * @param s1  first byte sequence
 * @param s2  second byte sequence
 * @param len number of bytes to compare; 0 always yields true
 * @return true when all `len` bytes match, false otherwise
 */
static bool equals(const char *s1, const char *s2, const size_t len) {
    return memcmp(s1, s2, len) == 0;
}
|
||||||
|
|
||||||
|
/**
 * Returns the length of the longest contiguous byte sequence that occurs in
 * both `s1` and `s2`.
 *
 * Each pair of start offsets is visited once and the match is extended for as
 * long as the bytes agree, instead of re-testing every offset pair for every
 * candidate length.
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return length of the longest common substring; 0 when the strings share
 *         no byte or either one is empty
 */
static size_t maximalCommonSubstringLength(const char *s1, const char *s2) {
    const size_t len1 = strlen(s1);
    const size_t len2 = strlen(s2);
    size_t best = 0;

    // An offset with no more than `best` bytes remaining cannot beat `best`,
    // so both loops stop early once a long match has been found.
    for (size_t i = 0; i + best < len1; ++i) {
        for (size_t j = 0; j + best < len2; ++j) {
            size_t run = 0;
            while (i + run < len1 && j + run < len2 && s1[i + run] == s2[j + run]) {
                ++run;
            }
            if (run > best) {
                best = run;
            }
        }
    }

    return best;
}
|
||||||
|
|
||||||
|
/*
 * Normalizes a common-substring length into a similarity score by dividing it
 * by `min_len`. Returns 0.0 when `min_len` is 0 so the division is never
 * performed with a zero denominator.
 */
static double maximalCommonSubstringSimilarity(const size_t common_substring_length, const size_t min_len) {
|
||||||
|
if (min_len == 0) return 0.0;
|
||||||
|
|
||||||
|
return (double) common_substring_length / (double) min_len;
|
||||||
}
|
}
|
||||||
|
|
||||||
int damerauLevenshteinDistance(const char *s1, const char *s2) {
|
int damerauLevenshteinDistance(const char *s1, const char *s2) {
|
||||||
@@ -54,10 +82,11 @@ int damerauLevenshteinDistance(const char *s1, const char *s2) {
|
|||||||
int comparator(const void *a, const void *b) {
|
int comparator(const void *a, const void *b) {
|
||||||
auto const ls_a = (const LineScore *)a;
|
auto const ls_a = (const LineScore *)a;
|
||||||
auto const ls_b = (const LineScore *)b;
|
auto const ls_b = (const LineScore *)b;
|
||||||
const double score_a = similarityScore(ls_a->distance, ls_a->max_len);
|
|
||||||
const double score_b = similarityScore(ls_b->distance, ls_b->max_len);
|
if (ls_a->score < ls_b->score) return 1;
|
||||||
if (score_a < score_b) return -1;
|
if (ls_a->score > ls_b->score) return -1;
|
||||||
if (score_a > score_b) return 1;
|
if (ls_a->common_substring_length < ls_b->common_substring_length) return 1;
|
||||||
|
if (ls_a->common_substring_length > ls_b->common_substring_length) return -1;
|
||||||
if (ls_a->distance < ls_b->distance) return -1;
|
if (ls_a->distance < ls_b->distance) return -1;
|
||||||
if (ls_a->distance > ls_b->distance) return 1;
|
if (ls_a->distance > ls_b->distance) return 1;
|
||||||
|
|
||||||
@@ -73,6 +102,7 @@ int main(const int argc, char *argv[]) {
|
|||||||
const size_t query_len = strlen(query);
|
const size_t query_len = strlen(query);
|
||||||
size_t line_count = 0;
|
size_t line_count = 0;
|
||||||
size_t capacity = 100;
|
size_t capacity = 100;
|
||||||
|
double max_similarity = 0.0;
|
||||||
LineScore *lines = malloc(capacity * sizeof(*lines));
|
LineScore *lines = malloc(capacity * sizeof(*lines));
|
||||||
if (lines == NULL) {
|
if (lines == NULL) {
|
||||||
fprintf(stderr, "Failed to allocate result buffer\n");
|
fprintf(stderr, "Failed to allocate result buffer\n");
|
||||||
@@ -97,6 +127,8 @@ int main(const int argc, char *argv[]) {
|
|||||||
}
|
}
|
||||||
const int distance = damerauLevenshteinDistance(query, buffer);
|
const int distance = damerauLevenshteinDistance(query, buffer);
|
||||||
const size_t max_len = query_len > len ? query_len : len;
|
const size_t max_len = query_len > len ? query_len : len;
|
||||||
|
const size_t min_len = query_len > len ? len : query_len;
|
||||||
|
const double score = damerauLevenshteinSimilarity(distance, max_len);
|
||||||
lines[line_count].line = malloc(len + 1);
|
lines[line_count].line = malloc(len + 1);
|
||||||
if (lines[line_count].line == NULL) {
|
if (lines[line_count].line == NULL) {
|
||||||
fprintf(stderr, "Failed to allocate line buffer\n");
|
fprintf(stderr, "Failed to allocate line buffer\n");
|
||||||
@@ -106,12 +138,25 @@ int main(const int argc, char *argv[]) {
|
|||||||
strcpy(lines[line_count].line, buffer);
|
strcpy(lines[line_count].line, buffer);
|
||||||
lines[line_count].distance = distance;
|
lines[line_count].distance = distance;
|
||||||
lines[line_count].max_len = max_len;
|
lines[line_count].max_len = max_len;
|
||||||
|
lines[line_count].min_len = min_len;
|
||||||
|
lines[line_count].common_substring_length = 0;
|
||||||
|
lines[line_count].score = score;
|
||||||
|
if (line_count == 0 || score > max_similarity) {
|
||||||
|
max_similarity = score;
|
||||||
|
}
|
||||||
line_count++;
|
line_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (max_similarity < 0.5) {
|
||||||
|
for (size_t i = 0; i < line_count; i++) {
|
||||||
|
lines[i].common_substring_length = maximalCommonSubstringLength(query, lines[i].line);
|
||||||
|
lines[i].score = maximalCommonSubstringSimilarity(lines[i].common_substring_length, lines[i].min_len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
qsort(lines, line_count, sizeof(*lines), comparator);
|
qsort(lines, line_count, sizeof(*lines), comparator);
|
||||||
for (size_t i = 0; i < line_count; i++) {
|
for (size_t i = 0; i < line_count; i++) {
|
||||||
const double similarity = similarityScore(lines[i].distance, lines[i].max_len);
|
printf("%.4f\t%s\n", lines[i].score, lines[i].line);
|
||||||
printf("%.4f\t%s\n", similarity, lines[i].line);
|
|
||||||
}
|
}
|
||||||
freeLineScores(lines, line_count);
|
freeLineScores(lines, line_count);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user