summaryrefslogtreecommitdiff
path: root/match.c
blob: a3eb9c1c3d0abc626977327a913d779e7fefe16a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#include <ctype.h>
#include <string.h>
#include <strings.h>
#include <stdio.h>
#include <float.h>

#include "fzy.h"

int has_match(const char *needle, const char *haystack){
	while(*needle){
		if(!*haystack)
			return 0;
		while(*haystack && tolower(*needle) == tolower(*haystack++))
			needle++;
	}
	return 1;
}

#define max(a, b) (((a) > (b)) ? (a) : (b))
typedef double score_t;
#define SCORE_MAX DBL_MAX
#define SCORE_MIN -DBL_MAX

/* print one of the internal matrices */
void mat_print(score_t *mat, int n, int m){
	int i, j;
	for(i = 0; i < n; i++){
		for(j = 0; j < m; j++){
			score_t val = mat[i*m + j];
			if(val == SCORE_MIN){
				fprintf(stderr, " -inf");
			}else{
				fprintf(stderr, " %.2f", val);
			}
		}
		fprintf(stderr, "\n");
	}
	fprintf(stderr, "\n\n");
}

double calculate_score(const char *needle, const char *haystack, size_t *positions){
	if(!*haystack || !*needle)
		return SCORE_MIN;

	int n = strlen(needle);
	int m = strlen(haystack);

	if(m > 1024){
		/*
		 * Unreasonably large candidate: return no score
		 * If it is a valid match it will still be returned, it will
		 * just be ranked below any reasonably sized candidates
		 */
		return 0;
	}

	int bow[m];
	score_t D[n][m], M[n][m];
	bzero(D, sizeof(D));
	bzero(M, sizeof(M));

	/*
	 * D[][] Stores the best score for this position ending with a match.
	 * M[][] Stores the best possible score at this position.
	 */

	/* Which positions are beginning of words */
	int at_bow = 1;
	char last_ch = '\0';
	for(int i = 0; i < m; i++){
		char ch = haystack[i];
		/* TODO: What about allcaps (ex. README) */
		bow[i] = (at_bow && isalnum(ch)) || (isupper(ch) && !isupper(last_ch));
		at_bow = !isalnum(ch);
		last_ch = ch;
	}

	for(int i = 0; i < n; i++){
		for(int j = 0; j < m; j++){
			D[i][j] = SCORE_MIN;
			int match = tolower(needle[i]) == tolower(haystack[j]);
			if(match){
				score_t score = 0;
				if(i && j)
					score = M[i-1][j-1];
				if(bow[j])
					score += 1.5;
				else if(i && j && D[i-1][j-1])
					score = max(score, 1 + D[i-1][j-1]);
				M[i][j] = D[i][j] = score;
			}
			if(j)
				M[i][j] = max(M[i][j], M[i][j-1] - 0.05);
		}
	}

#if 0
	mat_print(&D[0][0], n, m);
	mat_print(&M[0][0], n, m);
#endif

	/* backtrace to find the positions of optimal matching */
	if(positions){
		for(int i = n-1, j = m-1; i >= 0; i--){
			for(; j >= 0; j--){
				/*
				 * There may be multiple paths which result in
				 * the optimal weight.
				 *
				 * For simplicity, we will pick the first one
				 * we encounter, the latest in the candidate
				 * string.
				 */
				if(D[i][j] == M[i][j]){
					positions[i] = j;
					break;
				}
			}
		}
	}

	return (float)(M[n-1][m-1]) / (float)(n * 2 + 1);
}

double match_positions(const char *needle, const char *haystack, size_t *positions){
	if(!*needle){
		return SCORE_MAX;
	}else if(!has_match(needle, haystack)){
		return SCORE_MIN;
	}else if(!strcasecmp(needle, haystack)){
		if(positions){
			int n = strlen(needle);
			for(int i = 0; i < n; i++)
				positions[i] = i;
		}
		return SCORE_MAX;
	}else{
		return calculate_score(needle, haystack, positions);
	}
}

double match(const char *needle, const char *haystack){
	return match_positions(needle, haystack, NULL);
}