#include #include #include #include #include #include #include "match.h" #include "config.h" char *strcasechr(const char *s, char c) { const char accept[3] = {c, toupper(c), 0}; return strpbrk(s, accept); } int has_match(const char *needle, const char *haystack) { while (*needle) { char nch = *needle++; if (!(haystack = strcasechr(haystack, nch))) { return 0; } haystack++; } return 1; } #define max(a, b) (((a) > (b)) ? (a) : (b)) #ifdef DEBUG_VERBOSE /* print one of the internal matrices */ void mat_print(score_t *mat, const char *needle, const char *haystack) { int n = strlen(needle); int m = strlen(haystack); int i, j; fprintf(stderr, " "); for (j = 0; j < m; j++) { fprintf(stderr, " %c", haystack[j]); } fprintf(stderr, "\n"); for (i = 0; i < n; i++) { fprintf(stderr, " %c |", needle[i]); for (j = 0; j < m; j++) { score_t val = mat[i * m + j]; if (val == SCORE_MIN) { fprintf(stderr, " -inf"); } else { fprintf(stderr, " % .2f", val); } } fprintf(stderr, "\n"); } fprintf(stderr, "\n\n"); } #endif score_t calculate_score(const char *needle, const char *haystack, size_t *positions) { if (!*haystack || !*needle) return SCORE_MIN; int n = strlen(needle); int m = strlen(haystack); if (m > 1024) { /* * Unreasonably large candidate: return no score * If it is a valid match it will still be returned, it will * just be ranked below any reasonably sized candidates */ return SCORE_MIN; } score_t match_bonus[m]; score_t D[n][m], M[n][m]; /* * D[][] Stores the best score for this position ending with a match. * M[][] Stores the best possible score at this position. */ /* Which positions are beginning of words */ char last_ch = '\0'; for (int i = 0; i < m; i++) { char ch = haystack[i]; score_t score = 0; if (isalnum(ch)) { if (!last_ch || last_ch == '/') { score = SCORE_MATCH_SLASH; } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ' || (last_ch >= '0' && last_ch <= '9')) { score = SCORE_MATCH_WORD; } else if (last_ch >= 'a' && last_ch <= 'z' && ch >= 'A' && ch <= 'Z') { /* CamelCase */ score = SCORE_MATCH_CAPITAL; } else if (last_ch == '.') { score = SCORE_MATCH_DOT; } } match_bonus[i] = score; last_ch = ch; } for (int i = 0; i < n; i++) { score_t prev_score = SCORE_MIN; score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; for (int j = 0; j < m; j++) { score_t score = SCORE_MIN; if (tolower(needle[i]) == tolower(haystack[j])) { if (!i) { score = (j * SCORE_GAP_LEADING) + match_bonus[j]; } else if (j) { score = max( M[i - 1][j - 1] + match_bonus[j], /* consecutive match, doesn't stack with match_bonus */ D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE); } } D[i][j] = score; M[i][j] = prev_score = max(score, prev_score + gap_score); } } #ifdef DEBUG_VERBOSE fprintf(stderr, "\"%s\" =~ \"%s\"\n", needle, haystack); mat_print(&D[0][0], needle, haystack); mat_print(&M[0][0], needle, haystack); fprintf(stderr, "\n"); #endif /* backtrace to find the positions of optimal matching */ if (positions) { int match_required = 0; for (int i = n - 1, j = m - 1; i >= 0; i--) { for (; j >= 0; j--) { /* * There may be multiple paths which result in * the optimal weight. * * For simplicity, we will pick the first one * we encounter, the latest in the candidate * string. */ if (D[i][j] != SCORE_MIN && (match_required || D[i][j] == M[i][j])) { /* If this score was determined using * SCORE_MATCH_CONSECUTIVE, the * previous character MUST be a match */ match_required = i && j && M[i][j] == D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE; positions[i] = j--; break; } } } } return M[n - 1][m - 1]; } score_t match_positions(const char *needle, const char *haystack, size_t *positions) { if (!*needle) { return SCORE_MAX; } else if (!strcasecmp(needle, haystack)) { if (positions) { int n = strlen(needle); for (int i = 0; i < n; i++) positions[i] = i; } return SCORE_MAX; } else { return calculate_score(needle, haystack, positions); } } score_t match(const char *needle, const char *haystack) { return match_positions(needle, haystack, NULL); }