diff options
author | John Hawthorn <john.hawthorn@gmail.com> | 2016-05-21 14:56:03 -0700 |
---|---|---|
committer | John Hawthorn <john.hawthorn@gmail.com> | 2016-05-21 14:56:25 -0700 |
commit | 2b3c3a85ec8aa8b4b94c0e594c32449dd70b4d26 (patch) | |
tree | b72c2815f22fbae416cf10284c5acd273c75af7f /src/match.c | |
parent | 45499644d85a9ba93dd9f1504d1ca7df15b60148 (diff) |
Move sources into src directory
Diffstat (limited to 'src/match.c')
-rw-r--r-- | src/match.c | 183 |
1 files changed, 183 insertions, 0 deletions
diff --git a/src/match.c b/src/match.c new file mode 100644 index 0000000..9602700 --- /dev/null +++ b/src/match.c @@ -0,0 +1,183 @@ +#include <ctype.h> +#include <string.h> +#include <strings.h> +#include <stdio.h> +#include <float.h> +#include <math.h> + +#include "match.h" + +#include "../config.h" + +char *strcasechr(const char *s, char c) { + const char accept[3] = {c, toupper(c), 0}; + return strpbrk(s, accept); +} + +int has_match(const char *needle, const char *haystack) { + while (*needle) { + char nch = *needle++; + + if (!(haystack = strcasechr(haystack, nch))) { + return 0; + } + haystack++; + } + return 1; +} + +#define max(a, b) (((a) > (b)) ? (a) : (b)) + +#ifdef DEBUG_VERBOSE +/* print one of the internal matrices */ +void mat_print(score_t *mat, char name, const char *needle, const char *haystack) { + int n = strlen(needle); + int m = strlen(haystack); + int i, j; + fprintf(stderr, "%c ", name); + for (j = 0; j < m; j++) { + fprintf(stderr, " %c", haystack[j]); + } + fprintf(stderr, "\n"); + for (i = 0; i < n; i++) { + fprintf(stderr, " %c |", needle[i]); + for (j = 0; j < m; j++) { + score_t val = mat[i * m + j]; + if (val == SCORE_MIN) { + fprintf(stderr, " -\u221E"); + } else { + fprintf(stderr, " %.3f", val); + } + } + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n\n"); +} +#endif + +score_t calculate_score(const char *needle, const char *haystack, size_t *positions) { + if (!*haystack || !*needle) + return SCORE_MIN; + + int n = strlen(needle); + int m = strlen(haystack); + + if (m > 1024) { + /* + * Unreasonably large candidate: return no score + * If it is a valid match it will still be returned, it will + * just be ranked below any reasonably sized candidates + */ + return SCORE_MIN; + } + + score_t match_bonus[m]; + score_t D[n][m], M[n][m]; + + /* + * D[][] Stores the best score for this position ending with a match. + * M[][] Stores the best possible score at this position. + */ + + /* Which positions are beginning of words */ + char last_ch = '\0'; + for (int i = 0; i < m; i++) { + char ch = haystack[i]; + + score_t score = 0; + if (isalnum(ch)) { + if (!last_ch || last_ch == '/') { + score = SCORE_MATCH_SLASH; + } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ' || + (last_ch >= '0' && last_ch <= '9')) { + score = SCORE_MATCH_WORD; + } else if (last_ch >= 'a' && last_ch <= 'z' && ch >= 'A' && ch <= 'Z') { + /* CamelCase */ + score = SCORE_MATCH_CAPITAL; + } else if (last_ch == '.') { + score = SCORE_MATCH_DOT; + } + } + + match_bonus[i] = score; + last_ch = ch; + } + + for (int i = 0; i < n; i++) { + score_t prev_score = SCORE_MIN; + score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; + for (int j = 0; j < m; j++) { + score_t score = SCORE_MIN; + if (tolower(needle[i]) == tolower(haystack[j])) { + if (!i) { + score = (j * SCORE_GAP_LEADING) + match_bonus[j]; + } else if (j) { /* i > 0 && j > 0*/ + score = max( + M[i - 1][j - 1] + match_bonus[j], + + /* consecutive match, doesn't stack with match_bonus */ + D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE); + } + } + D[i][j] = score; + M[i][j] = prev_score = max(score, prev_score + gap_score); + } + } + +#ifdef DEBUG_VERBOSE + fprintf(stderr, "\"%s\" =~ \"%s\"\n", needle, haystack); + mat_print(&D[0][0], 'D', needle, haystack); + mat_print(&M[0][0], 'M', needle, haystack); + fprintf(stderr, "\n"); +#endif + + /* backtrace to find the positions of optimal matching */ + if (positions) { + int match_required = 0; + for (int i = n - 1, j = m - 1; i >= 0; i--) { + for (; j >= 0; j--) { + /* + * There may be multiple paths which result in + * the optimal weight. + * + * For simplicity, we will pick the first one + * we encounter, the latest in the candidate + * string. + */ + if (D[i][j] != SCORE_MIN && + (match_required || D[i][j] == M[i][j])) { + /* If this score was determined using + * SCORE_MATCH_CONSECUTIVE, the + * previous character MUST be a match + */ + match_required = + i && j && + M[i][j] == D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE; + positions[i] = j--; + break; + } + } + } + } + + return M[n - 1][m - 1]; +} + +score_t match_positions(const char *needle, const char *haystack, size_t *positions) { + if (!*needle) { + return SCORE_MAX; + } else if (!strcasecmp(needle, haystack)) { + if (positions) { + int n = strlen(needle); + for (int i = 0; i < n; i++) + positions[i] = i; + } + return SCORE_MAX; + } else { + return calculate_score(needle, haystack, positions); + } +} + +score_t match(const char *needle, const char *haystack) { + return match_positions(needle, haystack, NULL); +} |