From 88774e7cc27c0a377bb4f53fad4978da9bb32d92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Se=C3=A1n=20Healy?= <sean.healy@proxymighty.com>
Date: Tue, 14 Apr 2026 03:59:56 +0100
Subject: [PATCH] Fall back to maximal common substring when no strong match is found

---
 README.md     | 45 ++++++++++++++++++++--------------
 fuzzy-match.c | 67 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 83 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index f52b975..a025569 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,12 @@
 # fuzzy-match
 
-A fast command-line tool for fuzzy string matching using the Damerau-Levenshtein distance algorithm.
+A fast command-line tool for fuzzy string matching using the Damerau-Levenshtein distance algorithm, with a longest-common-substring fallback when no strong match is found.
 
 ## Features
 
 - **Damerau-Levenshtein Distance**: Measures similarity between strings accounting for insertions, deletions, substitutions, and transpositions
-- **Normalized Scoring**: Calculates similarity score as `distance / MAX(queryLength, lineLength)` for fair comparison regardless of string lengths
+- **Normalized Scoring**: Calculates similarity score as `1 - distance / MAX(queryLength, lineLength)` so higher scores are better
+- **Fallback Matching**: If the best Damerau-Levenshtein similarity is below `0.5`, recalculates every score using the maximal common substring length
 - **Sorted Output**: Results are sorted by similarity score (best matches first)
 - **Efficient Processing**: Handles large input streams with dynamic memory allocation
 
@@ -41,14 +42,14 @@ echo -e "apple\napple pie\norange\nbanana\nappl" | fuzzy-match "apple"
 
 ### Output Format
 
-Each line is printed with its similarity score (lower is more similar):
+Each line is printed with its similarity score (higher is more similar):
 
 ```
-0.0000	apple
-0.2000	appl
-0.5000	apple pie
-0.6667	banana
-1.0000	orange
+1.0000	apple
+0.8000	appl
+0.5556	apple pie
+0.1667	banana
+0.1667	orange
 ```
 
 ## Examples
@@ -56,28 +57,36 @@ Each line is printed with its similarity score (lower is more similar):
 ### Basic matching
 ```bash
 $ echo -e "cat\ncar\ndog\nhat" | fuzzy-match "cat"
-0.0000	cat
-0.3333	car
+1.0000	cat
+0.6667	car
 0.6667	hat
-1.0000	dog
+0.0000	dog
 ```
 
 ### Matching with typos
 ```bash
 $ echo -e "programming\nprograming\nprogram\nprogamming" | fuzzy-match "programming"
-0.0000	programming
-0.0909	programing
-0.1818	progamming
-0.3333	program
+1.0000	programming
+0.9091	programing
+0.9091	progamming
+0.6364	program
 ```
 
+### Fallback to maximal common substring
+If no line reaches a Damerau-Levenshtein similarity of at least `0.5`, every score is recalculated as the longest common substring length divided by the shorter of the query and line lengths.
+
 ## Algorithm
 
-The program implements the **Damerau-Levenshtein distance** algorithm, which measures the minimum number of single-character edits (insertions, deletions, substitutions, and transpositions) needed to transform one string into another.
+The program first computes a **Damerau-Levenshtein similarity**, based on the minimum number of single-character edits (insertions, deletions, substitutions, and transpositions) needed to transform one string into another.
 
-The similarity score is normalized to account for string length differences:
+The primary similarity score is normalized to account for string length differences:
 ```
-similarity_score = damerau_levenshtein_distance / MAX(query_length, line_length)
+similarity_score = 1 - damerau_levenshtein_distance / MAX(query_length, line_length)
+```
+
+If the highest primary similarity is below `0.5`, the program recalculates every score using the maximal common substring length instead:
+```
+similarity_score = longest_common_substring_length / MIN(query_length, line_length)
 ```
 
 ## Installation
diff --git a/fuzzy-match.c b/fuzzy-match.c
index 0e55c5e..49b861c 100644
--- a/fuzzy-match.c
+++ b/fuzzy-match.c
@@ -6,6 +6,9 @@ typedef struct {
     char *line;
     int distance;
     size_t max_len;
+    size_t min_len;
+    size_t common_substring_length;
+    double score;
 } LineScore;
 
 static void freeLineScores(LineScore *lines, const size_t line_count) {
@@ -13,9 +16,34 @@ static void freeLineScores(LineScore *lines, const size_t line_count) {
     free(lines);
 }
 
-static double similarityScore(const int distance, const size_t max_len) {
-    if (max_len == 0) return 0.0;
-    return (double) distance / (double) max_len;
+static double damerauLevenshteinSimilarity(const int distance, const size_t max_len) { // converts an edit distance into a similarity where higher is better
+    if (max_len == 0) return 1.0; // avoid dividing by zero; a zero max_len is scored as a perfect match
+
+    return 1.0 - (double) distance / (double) max_len; // 1.0 when distance is 0, 0.0 when distance equals max_len
+}
+
+static bool equals(const char *s1, const char *s2, const size_t len) { // true iff the first len chars of s1 and s2 match; assumes both hold at least len chars
+    for (size_t i = 0; i < len; i++) if (s1[i] != s2[i]) return false; // stop at the first mismatch
+
+    return true; // also reached when len == 0
+}
+
+static size_t maximalCommonSubstringLength(const char *s1, const char *s2) { // length of the longest contiguous run of chars found in both s1 and s2
+    const size_t len1 = strlen(s1);
+    const size_t len2 = strlen(s2);
+    const size_t max_len = len1 > len2 ? len2 : len1; // the shorter of the two lengths: an upper bound on any common substring
+    for (size_t len = max_len; len > 0; len--) // try candidate lengths longest-first so the first hit is the maximum
+        for (size_t i = 0; i <= len1 - len; ++i) // len <= len1 here, so len1 - len cannot wrap around
+            for (size_t j = 0; j <= len2 - len; ++j) // likewise len <= len2
+                if (equals(s1 + i, s2 + j, len)) return len;
+
+    return 0; // no shared character, or at least one input is empty
+}
+
+static double maximalCommonSubstringSimilarity(const size_t common_substring_length, const size_t min_len) { // fallback score: common substring length as a fraction of min_len
+    if (min_len == 0) return 0.0; // avoid dividing by zero; scored as no match
+
+    return (double) common_substring_length / (double) min_len;
+}
 
 int damerauLevenshteinDistance(const char *s1, const char *s2) {
@@ -52,12 +80,13 @@ int damerauLevenshteinDistance(const char *s1, const char *s2) {
     return result;
 }
 int comparator(const void *a, const void *b) {
-    auto const ls_a = (const LineScore *) a;
-    auto const ls_b = (const LineScore *) b;
-    const double score_a = similarityScore(ls_a->distance, ls_a->max_len);
-    const double score_b = similarityScore(ls_b->distance, ls_b->max_len);
-    if (score_a < score_b) return -1;
-    if (score_a > score_b) return 1;
+    auto const ls_a = (const LineScore *)a;
+    auto const ls_b = (const LineScore *)b;
+
+    if (ls_a->score < ls_b->score) return 1;
+    if (ls_a->score > ls_b->score) return -1;
+    if (ls_a->common_substring_length < ls_b->common_substring_length) return 1;
+    if (ls_a->common_substring_length > ls_b->common_substring_length) return -1;
     if (ls_a->distance < ls_b->distance) return -1;
     if (ls_a->distance > ls_b->distance) return 1;
 
@@ -73,6 +102,7 @@ int main(const int argc, char *argv[]) {
     const size_t query_len = strlen(query);
     size_t line_count = 0;
     size_t capacity = 100;
+    double max_similarity = 0.0;
     LineScore *lines = malloc(capacity * sizeof(*lines));
     if (lines == NULL) {
         fprintf(stderr, "Failed to allocate result buffer\n");
@@ -97,6 +127,8 @@ int main(const int argc, char *argv[]) {
         }
         const int distance = damerauLevenshteinDistance(query, buffer);
         const size_t max_len = query_len > len ? query_len : len;
+        const size_t min_len = query_len > len ? len : query_len;
+        const double score = damerauLevenshteinSimilarity(distance, max_len);
         lines[line_count].line = malloc(len + 1);
         if (lines[line_count].line == NULL) {
             fprintf(stderr, "Failed to allocate line buffer\n");
@@ -106,12 +138,25 @@ int main(const int argc, char *argv[]) {
         strcpy(lines[line_count].line, buffer);
         lines[line_count].distance = distance;
         lines[line_count].max_len = max_len;
+        lines[line_count].min_len = min_len;
+        lines[line_count].common_substring_length = 0;
+        lines[line_count].score = score;
+        if (line_count == 0 || score > max_similarity) {
+            max_similarity = score;
+        }
         line_count++;
     }
+
+    if (max_similarity < 0.5) {
+        for (size_t i = 0; i < line_count; i++) {
+            lines[i].common_substring_length = maximalCommonSubstringLength(query, lines[i].line);
+            lines[i].score = maximalCommonSubstringSimilarity(lines[i].common_substring_length, lines[i].min_len);
+        }
+    }
+
     qsort(lines, line_count, sizeof(*lines), comparator);
     for (size_t i = 0; i < line_count; i++) {
-        const double similarity = similarityScore(lines[i].distance, lines[i].max_len);
-        printf("%.4f\t%s\n", similarity, lines[i].line);
+        printf("%.4f\t%s\n", lines[i].score, lines[i].line);
     }
     freeLineScores(lines, line_count);