From 88774e7cc27c0a377bb4f53fad4978da9bb32d92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Se=C3=A1n=20Healy?= Date: Tue, 14 Apr 2026 03:59:56 +0100 Subject: [PATCH] Fallback with Maximal Common Substring if no similarity found --- README.md | 45 ++++++++++++++++++++-------------- fuzzy-match.c | 67 ++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index f52b975..a025569 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ # fuzzy-match -A fast command-line tool for fuzzy string matching using the Damerau-Levenshtein distance algorithm. +A fast command-line tool for fuzzy string matching using the Damerau-Levenshtein distance algorithm, with a longest-common-substring fallback when no strong match is found. ## Features - **Damerau-Levenshtein Distance**: Measures similarity between strings accounting for insertions, deletions, substitutions, and transpositions -- **Normalized Scoring**: Calculates similarity score as `distance / MAX(queryLength, lineLength)` for fair comparison regardless of string lengths +- **Normalized Scoring**: Calculates similarity score as `1 - distance / MAX(queryLength, lineLength)` so higher scores are better +- **Fallback Matching**: If the best Damerau-Levenshtein similarity is below `0.5`, recalculates every score using the maximal common substring length - **Sorted Output**: Results are sorted by similarity score (best matches first) - **Efficient Processing**: Handles large input streams with dynamic memory allocation @@ -41,14 +42,14 @@ echo -e "apple\napple pie\norange\nbanana\nappl" | fuzzy-match "apple" ### Output Format -Each line is printed with its similarity score (lower is more similar): +Each line is printed with its similarity score (higher is more similar): ``` -0.0000 apple -0.2000 appl -0.5000 apple pie -0.6667 banana -1.0000 orange +1.0000 apple +0.8000 appl +0.5556 apple pie +0.1667 banana +0.1667 orange ``` ## Examples @@ -56,28 
+57,36 @@ Each line is printed with its similarity score (lower is more similar): ### Basic matching ```bash $ echo -e "cat\ncar\ndog\nhat" | fuzzy-match "cat" -0.0000 cat -0.3333 car +1.0000 cat +0.6667 car 0.6667 hat -1.0000 dog +0.0000 dog ``` ### Matching with typos ```bash $ echo -e "programming\nprograming\nprogram\nprogamming" | fuzzy-match "programming" -0.0000 programming -0.0909 programing -0.1818 progamming -0.3333 program +1.0000 programming +0.9091 programing +0.9091 progamming +0.6364 program ``` +### Fallback to maximal common substring +If no Damerau-Levenshtein similarity reaches `0.5`, every score is recalculated using the longest common substring length instead. + ## Algorithm -The program implements the **Damerau-Levenshtein distance** algorithm, which measures the minimum number of single-character edits (insertions, deletions, substitutions, and transpositions) needed to transform one string into another. +The program first computes a **Damerau-Levenshtein similarity**, based on the minimum number of single-character edits (insertions, deletions, substitutions, and transpositions) needed to transform one string into another. 
-The similarity score is normalized to account for string length differences: +The primary similarity score is normalized to account for string length differences: ``` -similarity_score = damerau_levenshtein_distance / MAX(query_length, line_length) +similarity_score = 1 - damerau_levenshtein_distance / MAX(query_length, line_length) +``` + +If the highest primary similarity is below `0.5`, the program recalculates every score using the maximal common substring length instead: +``` +similarity_score = longest_common_substring_length / MIN(query_length, line_length) ``` ## Installation diff --git a/fuzzy-match.c b/fuzzy-match.c index 0e55c5e..49b861c 100644 --- a/fuzzy-match.c +++ b/fuzzy-match.c @@ -6,6 +6,9 @@ typedef struct { char *line; int distance; size_t max_len; + size_t min_len; + size_t common_substring_length; + double score; } LineScore; static void freeLineScores(LineScore *lines, const size_t line_count) { @@ -13,9 +16,34 @@ static void freeLineScores(LineScore *lines, const size_t line_count) { free(lines); } -static double similarityScore(const int distance, const size_t max_len) { - if (max_len == 0) return 0.0; - return (double) distance / (double) max_len; +static double damerauLevenshteinSimilarity(const int distance, const size_t max_len) { + if (max_len == 0) return 1.0; + + return 1.0 - (double) distance / (double) max_len; +} + +static bool equals(const char *s1, const char *s2, const size_t len) { + for (size_t i = 0; i < len; i++) if (s1[i] != s2[i]) return false; + + return true; +} + +static size_t maximalCommonSubstringLength(const char *s1, const char *s2) { + const size_t len1 = strlen(s1); + const size_t len2 = strlen(s2); + const size_t max_len = len1 > len2 ?
len2 : len1; + for (size_t len = max_len; len > 0; len--) + for (size_t i = 0; i <= len1 - len; ++i) + for (size_t j = 0; j <= len2 - len; ++j) + if (equals(s1 + i, s2 + j, len)) return len; + + return 0; +} + +static double maximalCommonSubstringSimilarity(const size_t common_substring_length, const size_t min_len) { + if (min_len == 0) return 0.0; + + return (double) common_substring_length / (double) min_len; } int damerauLevenshteinDistance(const char *s1, const char *s2) { @@ -52,12 +80,13 @@ int damerauLevenshteinDistance(const char *s1, const char *s2) { return result; } int comparator(const void *a, const void *b) { - auto const ls_a = (const LineScore *) a; - auto const ls_b = (const LineScore *) b; - const double score_a = similarityScore(ls_a->distance, ls_a->max_len); - const double score_b = similarityScore(ls_b->distance, ls_b->max_len); - if (score_a < score_b) return -1; - if (score_a > score_b) return 1; + auto const ls_a = (const LineScore *)a; + auto const ls_b = (const LineScore *)b; + + if (ls_a->score < ls_b->score) return 1; + if (ls_a->score > ls_b->score) return -1; + if (ls_a->common_substring_length < ls_b->common_substring_length) return 1; + if (ls_a->common_substring_length > ls_b->common_substring_length) return -1; if (ls_a->distance < ls_b->distance) return -1; if (ls_a->distance > ls_b->distance) return 1; @@ -73,6 +102,7 @@ int main(const int argc, char *argv[]) { const size_t query_len = strlen(query); size_t line_count = 0; size_t capacity = 100; + double max_similarity = 0.0; LineScore *lines = malloc(capacity * sizeof(*lines)); if (lines == NULL) { fprintf(stderr, "Failed to allocate result buffer\n"); @@ -97,6 +127,8 @@ int main(const int argc, char *argv[]) { } const int distance = damerauLevenshteinDistance(query, buffer); const size_t max_len = query_len > len ? query_len : len; + const size_t min_len = query_len > len ? 
len : query_len; + const double score = damerauLevenshteinSimilarity(distance, max_len); lines[line_count].line = malloc(len + 1); if (lines[line_count].line == NULL) { fprintf(stderr, "Failed to allocate line buffer\n"); @@ -106,12 +138,25 @@ int main(const int argc, char *argv[]) { strcpy(lines[line_count].line, buffer); lines[line_count].distance = distance; lines[line_count].max_len = max_len; + lines[line_count].min_len = min_len; + lines[line_count].common_substring_length = 0; + lines[line_count].score = score; + if (line_count == 0 || score > max_similarity) { + max_similarity = score; + } line_count++; } + + if (max_similarity < 0.5) { + for (size_t i = 0; i < line_count; i++) { + lines[i].common_substring_length = maximalCommonSubstringLength(query, lines[i].line); + lines[i].score = maximalCommonSubstringSimilarity(lines[i].common_substring_length, lines[i].min_len); + } + } + qsort(lines, line_count, sizeof(*lines), comparator); for (size_t i = 0; i < line_count; i++) { - const double similarity = similarityScore(lines[i].distance, lines[i].max_len); - printf("%.4f\t%s\n", similarity, lines[i].line); + printf("%.4f\t%s\n", lines[i].score, lines[i].line); } freeLineScores(lines, line_count);