-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfuzzyword.c
289 lines (260 loc) · 9.04 KB
/
fuzzyword.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#include <stdlib.h>
#include <stdio.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>
#include <stdint.h>
#include <limits.h>
#include <assert.h>
#include "wqm.h"
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define _(s) (s)
// ----- Thread pool 2
struct tp2_local
{
size_t length;
unsigned long int *array;
};
static void *
tp2_make_local ()
{
return calloc (1, sizeof (struct tp2_local)); // 0-length array.
}
static void
tp2_delete_local (void *local_data)
{
free (((struct tp2_local *) local_data)->array);
free (local_data);
}
struct tp2_global
{
const wchar_t *match;
unsigned long int dmatch;
};
struct tp2_job
{
struct
{
const wchar_t *word;
const wchar_t *realword;
const wchar_t *fuzzyword;
} input;
struct
{
unsigned long int d;
const wchar_t *match;
} result;
};
static void
tp2_job_free (void *arg)
{
struct tp2_global *tp2_global = threadpool_global_data ();
struct tp2_job *tp2_job = arg;
if (tp2_job->result.d <= tp2_global->dmatch) // Aggregation for job results.
{
tp2_global->match = tp2_job->result.match;
tp2_global->dmatch = tp2_job->result.d;
}
free (arg);
}
// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
static unsigned long int
dld (size_t lwa, const wchar_t *wa, size_t lwb, const wchar_t *wb, int transpose)
{
struct tp2_local *d = threadpool_worker_local_data ();
if (d->length < (lwa + 1) * (lwb + 1))
d->array = realloc (d->array, (d->length = (lwa + 1) * (lwb + 1)) * sizeof (*d->array));
for (size_t ia = 0; ia <= lwa; ia++)
d->array[ia * (lwb + 1)] = ia;
for (size_t ib = 1; ib <= lwb; ib++)
d->array[ib] = ib;
static const size_t INSERTION = 2;
static const size_t DELETION = 4;
static const size_t MISMATCH = 5; // A mismatch is less than a deletion followed by an insertion
static const size_t TRANSPOSITION = 1;
for (size_t ia = 1; ia <= lwa; ia++)
for (size_t ib = 1; ib <= lwb; ib++)
{
unsigned int cost = (wa[ia - 1] != wb[ib - 1] ? 1 : 0);
// Levenshtein distance
d->array[ia * (lwb + 1) + ib] =
MIN (MIN (d->array[(ia - 1) * (lwb + 1) + ib] + INSERTION, d->array[ia * (lwb + 1) + (ib - 1)] + DELETION), d->array[(ia - 1) * (lwb + 1) + (ib - 1)] + cost * MISMATCH);
// Damerau–Levenshtein distance (transposition of two adjacent characters)
if (transpose && ia > 1 && ib > 1 && wa[ia - 2] == wb[ib - 1] && wa[ia - 1] == wb[ib - 2])
d->array[ia * (lwb + 1) + ib] = MIN (d->array[ia * (lwb + 1) + ib], d->array[(ia - 2) * (lwb + 1) + (ib - 2)] + cost * TRANSPOSITION);
}
unsigned long int dld = d->array[(lwa + 1) * (lwb + 1) - 1];
return dld;
}
static int
tp2_worker (struct threadpool *threadpool, void *arg)
{
(void) threadpool;
struct tp2_job *tp2 = arg;
tp2->result.d = dld (wcslen (tp2->input.word), tp2->input.word, wcslen (tp2->input.fuzzyword), tp2->input.fuzzyword, 1);
tp2->result.match = tp2->input.realword;
return 0;
}
// ----- Thread pool 1
struct tp1_global
{
const char *listofwords;
};
struct tp1_resource
{
size_t nb_lines;
wchar_t (*lines)[100]; // Pointer to type const wchar_t[100]
wchar_t **colllines;
};
static void *
tp1_res_alloc (void *global_data)
{
static struct tp1_resource res;
res.lines = 0;
#ifdef COLLATE
fprintf (stderr, _("Collating with locale %s\n"), setlocale (LC_COLLATE, 0));
res.colllines = 0;
#endif
const char *listofwords = ((struct tp1_global *) global_data)->listofwords;
fprintf (stderr, _("Reading the french dictionnary of words %s...\n"), listofwords);
res.nb_lines = 0;
FILE *f;
assert ((f = fopen (listofwords, "r,ccs=UTF-8"))); // File is encoded in UTF-8.
for (wchar_t line[100]; fgetws (line, 100, f);) // Reads a string of at most n-1 wide characters, and adds a terminating null wide character
res.nb_lines++;
res.lines = malloc (res.nb_lines * sizeof (*res.lines));
#ifdef COLLATE
res.colllines = malloc (res.nb_lines * sizeof (*res.colllines));
#endif
f = freopen (0, "r,ccs=UTF-8", f); // File is encoded in UTF-8.
res.nb_lines = 0;
for (wchar_t line[100]; fgetws (line, 100, f);) // Reads a string of at most n-1 wide characters, and adds a terminating null wide character
{
if (wcslen (line) && line[wcslen (line) - 1] == L'\n')
line[wcslen (line) - 1] = L'\0';
res.nb_lines++;
wcsncpy (res.lines[res.nb_lines - 1], line, wcslen (line) + 1); // Copies at most n wide characters, including the terminating null wide character.
#ifdef COLLATE
size_t lcollline = wcsxfrm (0, line, 0); // Returns the number of bytes required to store the transformed string, excluding the terminating null byte.
wchar_t *collline = calloc ((lcollline + 1), sizeof (*collline));
wcsxfrm (collline, line, lcollline); // Transforms at most n bytes.
res.colllines[res.nb_lines - 1] = collline; // Copies at most n wide characters, including the terminating null wide character.
#endif
}
fclose (f);
return &res;
}
static void
tp1_res_dealloc (void *p)
{
struct tp1_resource *res = p;
free (res->lines);
#ifdef COLLATE
for (size_t i = 0; i < res->nb_lines; i++)
free (res->colllines[i]);
free (res->colllines);
#endif
}
struct tp1_job
{
struct
{
wchar_t *wa; // fuzzy word
} input;
struct
{
const wchar_t *match_ref; // Best match
} result;
};
static void
tp1_job_free (void *arg)
{
struct tp1_job *ta = arg;
fprintf (stdout, "\"%1$ls\" => \"%2$ls\"\n", ta->input.wa, ta->result.match_ref); // Job post-processing.
free (ta->input.wa);
free (ta);
}
#undef COLLATE
#define COLLATE
static const wchar_t *
get_match (wchar_t *wa, size_t nb_lines, const wchar_t (*const lines)[100], wchar_t *const *const colllines)
{
size_t start = ((size_t) rand ()) % nb_lines;
struct tp2_global tp2_global = {.match = 0,.dmatch = ULONG_MAX };
for (size_t i = 0; !tp2_global.match && i < nb_lines; i++)
if (!wcscmp (wa, lines[(i + start) % nb_lines])) // To avoid false-sharing.
tp2_global.match = lines[(i + start) % nb_lines]; // Perfect match found.
if (!tp2_global.match) // Search for an approximate match.
{
wchar_t *fuzzyword = wa;
#ifdef COLLATE
size_t lcollwa = wcsxfrm (0, wa, 0); // Returns the number of bytes required to store the transformed string, excluding the terminating null byte.
wchar_t *collwa = calloc ((lcollwa + 1), sizeof (*collwa));
wcsxfrm (collwa, wa, lcollwa); // Transforms at most n bytes.
fuzzyword = collwa;
#endif
struct threadpool *tp2 = threadpool_create_and_start (NB_CPU, &tp2_global);
threadpool_set_worker_local_data_manager (tp2, tp2_make_local, tp2_delete_local);
for (size_t i = 0; tp2_global.dmatch && i < nb_lines; i++)
{
const wchar_t *realword = lines[(i + start) % nb_lines]; // To avoid false-sharing.
const wchar_t *word = realword;
#ifdef COLLATE
word = colllines[(i + start) % nb_lines]; // To avoid false-sharing.
#endif
struct tp2_job *job = malloc (sizeof (*job));
*job = (struct tp2_job)
{
{word, realword, fuzzyword},
{0, 0},
};
threadpool_add_task (tp2, tp2_worker, job, tp2_job_free);
} // for (size_t i = 0; dmatch && i < nb_lines; i++)
threadpool_wait_and_destroy (tp2);
#ifdef COLLATE
free (collwa);
#endif
} // if (!match)
return tp2_global.match;
}
static int
tp1_worker (struct threadpool *threadpool, void *arg)
{
(void) threadpool;
struct tp1_job *ta = arg;
struct tp1_resource *tp1_resource = threadpool_global_resource ();
ta->result.match_ref = get_match (ta->input.wa, tp1_resource->nb_lines, tp1_resource->lines, tp1_resource->colllines);
return 0;
}
int
main (int argc, char *argv[])
{
setlocale (LC_ALL, "fr_FR.UTF-8"); // File listofwords is a list of french words.
struct tp1_global tp1_global = { "liste.de.mots.francais.frgut.txt" };
struct threadpool *tp1 = threadpool_create_and_start (SEQUENTIAL, &tp1_global);
threadpool_set_global_resource_manager (tp1, tp1_res_alloc, tp1_res_dealloc);
threadpool_set_monitor (tp1, threadpool_monitor_to_terminal, stderr, 0);
fprintf (stderr, "Searching for matching words...\n");
for (int iarg = 1; iarg < argc; iarg++)
{
const char *a = argv[iarg];
mbstate_t s = { };
size_t lwa = mbsrtowcs (0, &a, 0, &s); // Returns the number of wide characters that make up the converted part of the wide-character string, not including the terminating null wide character.
if (lwa == (size_t) -1)
return EXIT_FAILURE;
wchar_t *wa = calloc ((lwa + 1), sizeof (*wa));
mbsrtowcs (wa, &a, lwa, &s); // At most n wide characters are written.
wa[lwa] = 0;
for (size_t i = 0; i < wcslen (wa) && i < lwa; i++)
wa[i] = (wchar_t) towlower ((wint_t) wa[i]);
struct tp1_job *job = malloc (sizeof (*job));
*job = (struct tp1_job)
{
{wa},
{0}
};
threadpool_add_task (tp1, tp1_worker, job, tp1_job_free);
} // for (int iarg = 1; iarg < argc; iarg++)
threadpool_wait_and_destroy (tp1);
fprintf (stderr, "Done.\n");
}