From fcc187154468a8b73294b950fcb139edf0a1dda3 Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Sat, 12 Jul 2014 17:45:07 -0700 Subject: New DP algorithm match scoring algorithm --- Makefile | 2 +- match.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- test.rb | 4 ++-- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 974332c..6253311 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CFLAGS+=-Wall -Wextra -g +CFLAGS+=-Wall -Wextra -g -std=c99 all: fzy testscore diff --git a/match.c b/match.c index 8ef15a1..967607b 100644 --- a/match.c +++ b/match.c @@ -1,5 +1,7 @@ #include #include +#include +#include static int is_subset(const char *needle, const char *haystack){ while(*needle){ @@ -11,14 +13,73 @@ static int is_subset(const char *needle, const char *haystack){ return 1; } +/* print one of the internal matrices */ +void mat_print(int *mat, int n, int m){ + int i, j; + for(i = 0; i < n; i++){ + for(j = 0; j < m; j++){ + fprintf(stderr, " %3zd", mat[i*m + j]); + } + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n\n"); +} + +#define max(a, b) (((a) > (b)) ? (a) : (b)) +typedef int score_t; + +double calculate_score(const char *needle, const char *haystack){ + int n = strlen(needle); + int m = strlen(haystack); + + int bow[m]; + int D[n][m], M[n][m]; + bzero(D, sizeof(D)); + bzero(M, sizeof(M)); + + /* + * D[][] Stores the best score for this position ending with a match. + * M[][] Stores the best possible score at this position. + */ + + /* Which positions are beginning of words */ + int at_bow = 1; + for(int i = 0; i < m; i++){ + char ch = haystack[i]; + /* TODO: What about allcaps (ex. README) */ + bow[i] = (at_bow && isalnum(ch)) || isupper(ch); + at_bow = !isalnum(ch); + } + + for(int i = 0; i < n; i++){ + for(int j = 0; j < m; j++){ + int match = tolower(needle[i]) == tolower(haystack[j]); + if(match){ + score_t score = 0; + if(i && j) + score = M[i-1][j-1]; + if(bow[j]) + score += 2; + else if(i && j && D[i-1][j-1]) + score = max(score, 1 + D[i-1][j-1]); + M[i][j] = D[i][j] = score; + } + if(j) + M[i][j] = max(M[i][j], M[i][j-1]); + } + } + + return (float)(M[n-1][m-1]) / (float)(n * 2 + 1); +} + double match(const char *needle, const char *haystack){ if(!*needle){ return 1.0; }else if(!is_subset(needle, haystack)){ - return 0.0; + return -1.0; }else if(!strcasecmp(needle, haystack)){ return 1.0; }else{ - return 0.9; + return calculate_score(needle, haystack); } } diff --git a/test.rb b/test.rb index e1e9f0b..849bde2 100644 --- a/test.rb +++ b/test.rb @@ -8,11 +8,11 @@ describe "score" do end def assert_unmatched(candidate, query) - assert_equal 0, score(candidate, query) + assert_equal -1, score(candidate, query) end def assert_matched(candidate, query) - assert_operator 0, :<, score(candidate, query) + assert_operator 0, :<=, score(candidate, query) end it "scores 1 when the query is empty" do -- cgit v1.2.3