Fall back to Maximal Common Substring if no similarity found

This commit is contained in:
2026-04-14 03:59:56 +01:00
parent b19ffff91b
commit 88774e7cc2
2 changed files with 83 additions and 29 deletions

View File

@@ -6,6 +6,9 @@ typedef struct {
char *line;
int distance;
size_t max_len;
size_t min_len;
size_t common_substring_length;
double score;
} LineScore;
static void freeLineScores(LineScore *lines, const size_t line_count) {
@@ -13,9 +16,34 @@ static void freeLineScores(LineScore *lines, const size_t line_count) {
free(lines);
}
static double similarityScore(const int distance, const size_t max_len) {
if (max_len == 0) return 0.0;
return (double) distance / (double) max_len;
/*
 * Turns an edit distance into a similarity value: 1.0 minus the distance
 * divided by max_len.
 *
 * distance: edit distance between the two compared strings.
 * max_len:  normalising length; the visible caller passes the longer of the
 *           two string lengths.
 *
 * Returns 1.0 when max_len is 0 (nothing to normalise by), otherwise
 * 1.0 - distance / max_len.
 */
static double damerauLevenshteinSimilarity(const int distance, const size_t max_len) {
    if (max_len > 0) {
        const double normalized_distance = (double) distance / (double) max_len;
        return 1.0 - normalized_distance;
    }
    return 1.0;
}
/*
 * Reports whether the first `len` characters at s1 and s2 are identical.
 *
 * Characters are compared in order and the walk stops at the first
 * mismatch, so nothing past a differing position is read. A length of 0
 * always compares equal.
 */
static bool equals(const char *s1, const char *s2, const size_t len) {
    size_t remaining = len;
    while (remaining > 0) {
        if (*s1 != *s2) {
            return false;
        }
        s1++;
        s2++;
        remaining--;
    }
    return true;
}
/*
 * Length of the longest run of consecutive positions k at which
 * a[k] == b[k], considering only the first min(len_a, len_b) positions.
 */
static size_t longestAlignedRun(const char *a, const size_t len_a, const char *b, const size_t len_b) {
    const size_t limit = len_a < len_b ? len_a : len_b;
    size_t best = 0;
    size_t run = 0;
    for (size_t k = 0; k < limit; k++) {
        run = (a[k] == b[k]) ? run + 1 : 0;
        if (run > best) {
            best = run;
        }
    }
    return best;
}

/*
 * Returns the length of the longest contiguous substring that occurs in
 * both s1 and s2, or 0 if they share no character (including when either
 * string is empty).
 *
 * s1, s2: NUL-terminated strings; neither is modified.
 *
 * A common substring s1[i..i+L) == s2[j..j+L) is a run of matches along
 * the "diagonal" with fixed offset i - j. Every (i, j) pair lies on exactly
 * one diagonal, so scanning each diagonal once and tracking the longest
 * run of equal characters finds the answer in O(len1 * len2) time without
 * any allocation, instead of re-comparing every candidate length at every
 * pair of offsets.
 */
static size_t maximalCommonSubstringLength(const char *s1, const char *s2) {
    const size_t len1 = strlen(s1);
    const size_t len2 = strlen(s2);
    size_t best = 0;

    /* Diagonals where s1 is shifted forward (i >= j), including the main one. */
    for (size_t start1 = 0; start1 < len1; start1++) {
        const size_t run = longestAlignedRun(s1 + start1, len1 - start1, s2, len2);
        if (run > best) {
            best = run;
        }
    }

    /* Diagonals where s2 is shifted forward (j > i). */
    for (size_t start2 = 1; start2 < len2; start2++) {
        const size_t run = longestAlignedRun(s1, len1, s2 + start2, len2 - start2);
        if (run > best) {
            best = run;
        }
    }

    return best;
}
/*
 * Normalises a common-substring length into a ratio by dividing it by
 * min_len.
 *
 * common_substring_length: length of the shared substring.
 * min_len: normalising length; the visible caller passes the shorter of
 *          the two string lengths.
 *
 * Returns 0.0 when min_len is 0 (avoids dividing by zero), otherwise
 * common_substring_length / min_len.
 */
static double maximalCommonSubstringSimilarity(const size_t common_substring_length, const size_t min_len) {
    return min_len != 0
               ? (double) common_substring_length / (double) min_len
               : 0.0;
}
int damerauLevenshteinDistance(const char *s1, const char *s2) {
@@ -52,12 +80,13 @@ int damerauLevenshteinDistance(const char *s1, const char *s2) {
return result;
}
int comparator(const void *a, const void *b) {
auto const ls_a = (const LineScore *) a;
auto const ls_b = (const LineScore *) b;
const double score_a = similarityScore(ls_a->distance, ls_a->max_len);
const double score_b = similarityScore(ls_b->distance, ls_b->max_len);
if (score_a < score_b) return -1;
if (score_a > score_b) return 1;
auto const ls_a = (const LineScore *)a;
auto const ls_b = (const LineScore *)b;
if (ls_a->score < ls_b->score) return 1;
if (ls_a->score > ls_b->score) return -1;
if (ls_a->common_substring_length < ls_b->common_substring_length) return 1;
if (ls_a->common_substring_length > ls_b->common_substring_length) return -1;
if (ls_a->distance < ls_b->distance) return -1;
if (ls_a->distance > ls_b->distance) return 1;
@@ -73,6 +102,7 @@ int main(const int argc, char *argv[]) {
const size_t query_len = strlen(query);
size_t line_count = 0;
size_t capacity = 100;
double max_similarity = 0.0;
LineScore *lines = malloc(capacity * sizeof(*lines));
if (lines == NULL) {
fprintf(stderr, "Failed to allocate result buffer\n");
@@ -97,6 +127,8 @@ int main(const int argc, char *argv[]) {
}
const int distance = damerauLevenshteinDistance(query, buffer);
const size_t max_len = query_len > len ? query_len : len;
const size_t min_len = query_len > len ? len : query_len;
const double score = damerauLevenshteinSimilarity(distance, max_len);
lines[line_count].line = malloc(len + 1);
if (lines[line_count].line == NULL) {
fprintf(stderr, "Failed to allocate line buffer\n");
@@ -106,12 +138,25 @@ int main(const int argc, char *argv[]) {
strcpy(lines[line_count].line, buffer);
lines[line_count].distance = distance;
lines[line_count].max_len = max_len;
lines[line_count].min_len = min_len;
lines[line_count].common_substring_length = 0;
lines[line_count].score = score;
if (line_count == 0 || score > max_similarity) {
max_similarity = score;
}
line_count++;
}
if (max_similarity < 0.5) {
for (size_t i = 0; i < line_count; i++) {
lines[i].common_substring_length = maximalCommonSubstringLength(query, lines[i].line);
lines[i].score = maximalCommonSubstringSimilarity(lines[i].common_substring_length, lines[i].min_len);
}
}
qsort(lines, line_count, sizeof(*lines), comparator);
for (size_t i = 0; i < line_count; i++) {
const double similarity = similarityScore(lines[i].distance, lines[i].max_len);
printf("%.4f\t%s\n", similarity, lines[i].line);
printf("%.4f\t%s\n", lines[i].score, lines[i].line);
}
freeLineScores(lines, line_count);