1:
2: #include <stdio.h>
3: #include <string.h>
4: #include <stdlib.h>
5: static char __file__[] = __FILE__; /* for tassert.h */
6: #include "tassert.h"
7:
8: #if __sun&&__SVR4
9: #include <alloca.h>
10: #elif _MSC_VER
11: #include <malloc.h>
12: #endif
13:
14: #include "speller.h"
15:
16: const char idchars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
17:
18: /**************************************************
19: * Looks for correct spelling.
20: * Currently only looks a 'distance' of one from the seed[].
21: * This does an exhaustive search, so can potentially be very slow.
22: * Input:
23: * seed wrongly spelled word
24: * fp search function
25: * fparg argument to search function
26: * charset character set
27: * Returns:
28: * NULL no correct spellings found
29: * void* value returned by fp() for first possible correct spelling
30: */
31:
32: void *spellerY(const char *seed, size_t seedlen, fp_speller_t fp, void *fparg,
33: const char *charset, size_t index)
34: {
35: if (!seedlen)
36: return NULL;
37: assert(seed[seedlen] == 0);
38:
39: char tmp[30];
40: char *buf;
41: if (seedlen <= sizeof(tmp) - 2)
42: buf = tmp;
43: else
44: {
45: buf = (char *)alloca(seedlen + 2); // leave space for extra char
warning C6255: _alloca indicates failure by raising a stack overflow exception. Consider using _malloca instead
46: if (!buf)
47: return NULL; // no matches
48: }
49:
50: memcpy(buf, seed, index);
51:
52: /* Delete at seed[index] */
53: if (index < seedlen)
54: {
55: memcpy(buf + index, seed + index + 1, seedlen - index);
56: assert(buf[seedlen - 1] == 0);
warning C6385: Invalid data: accessing 'buf', the readable size is 'index' bytes, but '1000' bytes might be read: Lines: 35, 37, 39, 40, 41, 45, 46, 50, 53, 55, 56
57: void *p = (*fp)(fparg, buf);
58: if (p)
59: return p;
60: }
61:
62: if (charset && *charset)
63: {
64: /* Substitutions */
65: if (index < seedlen)
66: {
67: memcpy(buf, seed, seedlen + 1);
68: for (const char *s = charset; *s; s++)
69: {
70: buf[index] = *s;
71:
72: //printf("sub buf = '%s'\n", buf);
73: void *p = (*fp)(fparg, buf);
74: if (p)
75: return p;
76: }
77: assert(buf[seedlen] == 0);
78: }
79:
80: /* Insertions */
81: memcpy (buf + index + 1, seed + index, seedlen + 1 - index);
82:
83: for (const char *s = charset; *s; s++)
84: {
85: buf[index] = *s;
86:
87: //printf("ins buf = '%s'\n", buf);
88: void *p = (*fp)(fparg, buf);
89: if (p)
90: return p;
91: }
92: assert(buf[seedlen + 1] == 0);
93: }
94:
95: return NULL; // didn't find any corrections
96: }
97:
98: void *spellerX(const char *seed, size_t seedlen, fp_speller_t fp, void *fparg,
99: const char *charset, int flag)
100: {
101: if (!seedlen)
102: return NULL;
103:
104: char tmp[30];
105: char *buf;
106: if (seedlen <= sizeof(tmp) - 2)
107: buf = tmp;
108: else
109: {
110: buf = (char *)alloca(seedlen + 2); // leave space for extra char
warning C6255: _alloca indicates failure by raising a stack overflow exception. Consider using _malloca instead
111: if (!buf)
112: return NULL; // no matches
113: }
114:
115: /* Deletions */
116: memcpy(buf, seed + 1, seedlen);
117: for (int i = 0; i < seedlen; i++)
warning C4018: '<' : signed/unsigned mismatch
118: {
119: //printf("del buf = '%s'\n", buf);
120: void *p;
121: if (flag)
122: p = spellerY(buf, seedlen - 1, fp, fparg, charset, i);
123: else
124: p = (*fp)(fparg, buf);
125: if (p)
126: return p;
127:
128: buf[i] = seed[i];
129: }
130:
131: /* Transpositions */
132: if (!flag)
133: {
134: memcpy(buf, seed, seedlen + 1);
135: for (int i = 0; i + 1 < seedlen; i++)
warning C4018: '<' : signed/unsigned mismatch
136: {
137: // swap [i] and [i + 1]
138: buf[i] = seed[i + 1];
139: buf[i + 1] = seed[i];
140:
141: //printf("tra buf = '%s'\n", buf);
142: void *p = (*fp)(fparg, buf);
143: if (p)
144: return p;
145:
146: buf[i] = seed[i];
147: }
148: }
149:
150: if (charset && *charset)
151: {
152: /* Substitutions */
153: memcpy(buf, seed, seedlen + 1);
154: for (int i = 0; i < seedlen; i++)
warning C4018: '<' : signed/unsigned mismatch
155: {
156: for (const char *s = charset; *s; s++)
157: {
158: buf[i] = *s;
159:
160: //printf("sub buf = '%s'\n", buf);
161: void *p;
162: if (flag)
163: p = spellerY(buf, seedlen, fp, fparg, charset, i + 1);
164: else
165: p = (*fp)(fparg, buf);
166: if (p)
167: return p;
168: }
169: buf[i] = seed[i];
170: }
171:
172: /* Insertions */
173: memcpy(buf + 1, seed, seedlen + 1);
174: for (int i = 0; i <= seedlen; i++) // yes, do seedlen+1 iterations
warning C4018: '<=' : signed/unsigned mismatch
175: {
176: for (const char *s = charset; *s; s++)
177: {
178: buf[i] = *s;
179:
180: //printf("ins buf = '%s'\n", buf);
181: void *p;
182: if (flag)
183: p = spellerY(buf, seedlen + 1, fp, fparg, charset, i + 1);
184: else
185: p = (*fp)(fparg, buf);
186: if (p)
187: return p;
188: }
189: buf[i] = seed[i]; // going past end of seed[] is ok, as we hit the 0
190: }
191: }
192:
193: return NULL; // didn't find any corrections
194: }
195:
196: void *speller(const char *seed, fp_speller_t fp, void *fparg, const char *charset)
197: {
198: size_t seedlen = strlen(seed);
199: for (int distance = 0; distance < 2; distance++)
200: { void *p = spellerX(seed, seedlen, fp, fparg, charset, distance);
201: if (p)
202: return p;
203: // if (seedlen > 10)
204: // break;
205: }
206: return NULL; // didn't find it
207: }
208:
209:
210: #if UNITTEST
211:
212: #include <stdio.h>
213: #include <string.h>
214: static char __file__[] = __FILE__; /* for tassert.h */
215: #include "tassert.h"
216:
217: void *speller_test(void *fparg, const char *s)
218: {
219: //printf("speller_test(%s, %s)\n", fparg, s);
220: if (strcmp((char *)fparg, s) == 0)
221: return fparg;
222: return NULL;
223: }
224:
225: void unittest_speller()
226: {
227: static const char *cases[][3] =
228: {
229: { "hello", "hell", "y" },
230: { "hello", "hel", "y" },
231: { "hello", "ello", "y" },
232: { "hello", "llo", "y" },
233: { "hello", "hellox", "y" },
234: { "hello", "helloxy", "y" },
235: { "hello", "xhello", "y" },
236: { "hello", "xyhello", "y" },
237: { "hello", "ehllo", "y" },
238: { "hello", "helol", "y" },
239: { "hello", "abcd", "n" },
240: //{ "ehllo", "helol", "y" },
241: { "hello", "helxxlo", "y" },
242: { "hello", "ehlxxlo", "n" },
243: { "hello", "heaao", "y" },
244: { "_123456789_123456789_123456789_123456789", "_123456789_123456789_123456789_12345678", "y" },
245: };
246: //printf("unittest_speller()\n");
247: const void *p = speller("hello", &speller_test, (void *)"hell", idchars);
248: assert(p != NULL);
249: for (int i = 0; i < sizeof(cases)/sizeof(cases[0]); i++)
250: {
251: //printf("case [%d]\n", i);
252: void *p = speller(cases[i][0], &speller_test, (void *)cases[i][1], idchars);
253: if (p)
254: assert(cases[i][2][0] == 'y');
255: else
256: assert(cases[i][2][0] == 'n');
257: }
258: //printf("unittest_speller() success\n");
259: }
260:
261: #endif
262: