diff options
Diffstat (limited to 'libraries/sqlite/win32/fts1_porter.c')
-rwxr-xr-x | libraries/sqlite/win32/fts1_porter.c | 643 |
1 files changed, 643 insertions, 0 deletions
diff --git a/libraries/sqlite/win32/fts1_porter.c b/libraries/sqlite/win32/fts1_porter.c new file mode 100755 index 0000000..1d26236 --- /dev/null +++ b/libraries/sqlite/win32/fts1_porter.c | |||
@@ -0,0 +1,643 @@ | |||
1 | /* | ||
2 | ** 2006 September 30 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** Implementation of the full-text-search tokenizer that implements | ||
13 | ** a Porter stemmer. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | ** The code in this file is only compiled if: | ||
18 | ** | ||
19 | ** * The FTS1 module is being built as an extension | ||
20 | ** (in which case SQLITE_CORE is not defined), or | ||
21 | ** | ||
22 | ** * The FTS1 module is being built into the core of | ||
23 | ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). | ||
24 | */ | ||
25 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) | ||
26 | |||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <stdio.h> | ||
31 | #include <string.h> | ||
32 | #include <ctype.h> | ||
33 | |||
34 | #include "fts1_tokenizer.h" | ||
35 | |||
36 | /* | ||
37 | ** Class derived from sqlite3_tokenizer | ||
38 | */ | ||
39 | typedef struct porter_tokenizer { | ||
40 | sqlite3_tokenizer base; /* Base class */ | ||
41 | } porter_tokenizer; | ||
42 | |||
43 | /* | ||
44 | ** Class derived from sqlit3_tokenizer_cursor | ||
45 | */ | ||
46 | typedef struct porter_tokenizer_cursor { | ||
47 | sqlite3_tokenizer_cursor base; | ||
48 | const char *zInput; /* input we are tokenizing */ | ||
49 | int nInput; /* size of the input */ | ||
50 | int iOffset; /* current position in zInput */ | ||
51 | int iToken; /* index of next token to be returned */ | ||
52 | char *zToken; /* storage for current token */ | ||
53 | int nAllocated; /* space allocated to zToken buffer */ | ||
54 | } porter_tokenizer_cursor; | ||
55 | |||
56 | |||
57 | /* Forward declaration */ | ||
58 | static const sqlite3_tokenizer_module porterTokenizerModule; | ||
59 | |||
60 | |||
61 | /* | ||
62 | ** Create a new tokenizer instance. | ||
63 | */ | ||
64 | static int porterCreate( | ||
65 | int argc, const char * const *argv, | ||
66 | sqlite3_tokenizer **ppTokenizer | ||
67 | ){ | ||
68 | porter_tokenizer *t; | ||
69 | t = (porter_tokenizer *) calloc(sizeof(*t), 1); | ||
70 | if( t==NULL ) return SQLITE_NOMEM; | ||
71 | |||
72 | *ppTokenizer = &t->base; | ||
73 | return SQLITE_OK; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | ** Destroy a tokenizer | ||
78 | */ | ||
79 | static int porterDestroy(sqlite3_tokenizer *pTokenizer){ | ||
80 | free(pTokenizer); | ||
81 | return SQLITE_OK; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | ** Prepare to begin tokenizing a particular string. The input | ||
86 | ** string to be tokenized is zInput[0..nInput-1]. A cursor | ||
87 | ** used to incrementally tokenize this string is returned in | ||
88 | ** *ppCursor. | ||
89 | */ | ||
90 | static int porterOpen( | ||
91 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
92 | const char *zInput, int nInput, /* String to be tokenized */ | ||
93 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
94 | ){ | ||
95 | porter_tokenizer_cursor *c; | ||
96 | |||
97 | c = (porter_tokenizer_cursor *) malloc(sizeof(*c)); | ||
98 | if( c==NULL ) return SQLITE_NOMEM; | ||
99 | |||
100 | c->zInput = zInput; | ||
101 | if( zInput==0 ){ | ||
102 | c->nInput = 0; | ||
103 | }else if( nInput<0 ){ | ||
104 | c->nInput = (int)strlen(zInput); | ||
105 | }else{ | ||
106 | c->nInput = nInput; | ||
107 | } | ||
108 | c->iOffset = 0; /* start tokenizing at the beginning */ | ||
109 | c->iToken = 0; | ||
110 | c->zToken = NULL; /* no space allocated, yet. */ | ||
111 | c->nAllocated = 0; | ||
112 | |||
113 | *ppCursor = &c->base; | ||
114 | return SQLITE_OK; | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | ** Close a tokenization cursor previously opened by a call to | ||
119 | ** porterOpen() above. | ||
120 | */ | ||
121 | static int porterClose(sqlite3_tokenizer_cursor *pCursor){ | ||
122 | porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; | ||
123 | free(c->zToken); | ||
124 | free(c); | ||
125 | return SQLITE_OK; | ||
126 | } | ||
127 | /* | ||
128 | ** Vowel or consonant | ||
129 | */ | ||
130 | static const char cType[] = { | ||
131 | 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, | ||
132 | 1, 1, 1, 2, 1 | ||
133 | }; | ||
134 | |||
135 | /* | ||
136 | ** isConsonant() and isVowel() determine if their first character in | ||
137 | ** the string they point to is a consonant or a vowel, according | ||
138 | ** to Porter ruls. | ||
139 | ** | ||
140 | ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'. | ||
141 | ** 'Y' is a consonant unless it follows another consonant, | ||
142 | ** in which case it is a vowel. | ||
143 | ** | ||
144 | ** In these routine, the letters are in reverse order. So the 'y' rule | ||
145 | ** is that 'y' is a consonant unless it is followed by another | ||
146 | ** consonent. | ||
147 | */ | ||
148 | static int isVowel(const char*); | ||
149 | static int isConsonant(const char *z){ | ||
150 | int j; | ||
151 | char x = *z; | ||
152 | if( x==0 ) return 0; | ||
153 | assert( x>='a' && x<='z' ); | ||
154 | j = cType[x-'a']; | ||
155 | if( j<2 ) return j; | ||
156 | return z[1]==0 || isVowel(z + 1); | ||
157 | } | ||
158 | static int isVowel(const char *z){ | ||
159 | int j; | ||
160 | char x = *z; | ||
161 | if( x==0 ) return 0; | ||
162 | assert( x>='a' && x<='z' ); | ||
163 | j = cType[x-'a']; | ||
164 | if( j<2 ) return 1-j; | ||
165 | return isConsonant(z + 1); | ||
166 | } | ||
167 | |||
168 | /* | ||
169 | ** Let any sequence of one or more vowels be represented by V and let | ||
170 | ** C be sequence of one or more consonants. Then every word can be | ||
171 | ** represented as: | ||
172 | ** | ||
173 | ** [C] (VC){m} [V] | ||
174 | ** | ||
175 | ** In prose: A word is an optional consonant followed by zero or | ||
176 | ** vowel-consonant pairs followed by an optional vowel. "m" is the | ||
177 | ** number of vowel consonant pairs. This routine computes the value | ||
178 | ** of m for the first i bytes of a word. | ||
179 | ** | ||
180 | ** Return true if the m-value for z is 1 or more. In other words, | ||
181 | ** return true if z contains at least one vowel that is followed | ||
182 | ** by a consonant. | ||
183 | ** | ||
184 | ** In this routine z[] is in reverse order. So we are really looking | ||
185 | ** for an instance of of a consonant followed by a vowel. | ||
186 | */ | ||
187 | static int m_gt_0(const char *z){ | ||
188 | while( isVowel(z) ){ z++; } | ||
189 | if( *z==0 ) return 0; | ||
190 | while( isConsonant(z) ){ z++; } | ||
191 | return *z!=0; | ||
192 | } | ||
193 | |||
194 | /* Like mgt0 above except we are looking for a value of m which is | ||
195 | ** exactly 1 | ||
196 | */ | ||
197 | static int m_eq_1(const char *z){ | ||
198 | while( isVowel(z) ){ z++; } | ||
199 | if( *z==0 ) return 0; | ||
200 | while( isConsonant(z) ){ z++; } | ||
201 | if( *z==0 ) return 0; | ||
202 | while( isVowel(z) ){ z++; } | ||
203 | if( *z==0 ) return 1; | ||
204 | while( isConsonant(z) ){ z++; } | ||
205 | return *z==0; | ||
206 | } | ||
207 | |||
208 | /* Like mgt0 above except we are looking for a value of m>1 instead | ||
209 | ** or m>0 | ||
210 | */ | ||
211 | static int m_gt_1(const char *z){ | ||
212 | while( isVowel(z) ){ z++; } | ||
213 | if( *z==0 ) return 0; | ||
214 | while( isConsonant(z) ){ z++; } | ||
215 | if( *z==0 ) return 0; | ||
216 | while( isVowel(z) ){ z++; } | ||
217 | if( *z==0 ) return 0; | ||
218 | while( isConsonant(z) ){ z++; } | ||
219 | return *z!=0; | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | ** Return TRUE if there is a vowel anywhere within z[0..n-1] | ||
224 | */ | ||
225 | static int hasVowel(const char *z){ | ||
226 | while( isConsonant(z) ){ z++; } | ||
227 | return *z!=0; | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | ** Return TRUE if the word ends in a double consonant. | ||
232 | ** | ||
233 | ** The text is reversed here. So we are really looking at | ||
234 | ** the first two characters of z[]. | ||
235 | */ | ||
236 | static int doubleConsonant(const char *z){ | ||
237 | return isConsonant(z) && z[0]==z[1] && isConsonant(z+1); | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | ** Return TRUE if the word ends with three letters which | ||
242 | ** are consonant-vowel-consonent and where the final consonant | ||
243 | ** is not 'w', 'x', or 'y'. | ||
244 | ** | ||
245 | ** The word is reversed here. So we are really checking the | ||
246 | ** first three letters and the first one cannot be in [wxy]. | ||
247 | */ | ||
248 | static int star_oh(const char *z){ | ||
249 | return | ||
250 | z[0]!=0 && isConsonant(z) && | ||
251 | z[0]!='w' && z[0]!='x' && z[0]!='y' && | ||
252 | z[1]!=0 && isVowel(z+1) && | ||
253 | z[2]!=0 && isConsonant(z+2); | ||
254 | } | ||
255 | |||
256 | /* | ||
257 | ** If the word ends with zFrom and xCond() is true for the stem | ||
258 | ** of the word that preceeds the zFrom ending, then change the | ||
259 | ** ending to zTo. | ||
260 | ** | ||
261 | ** The input word *pz and zFrom are both in reverse order. zTo | ||
262 | ** is in normal order. | ||
263 | ** | ||
264 | ** Return TRUE if zFrom matches. Return FALSE if zFrom does not | ||
265 | ** match. Not that TRUE is returned even if xCond() fails and | ||
266 | ** no substitution occurs. | ||
267 | */ | ||
268 | static int stem( | ||
269 | char **pz, /* The word being stemmed (Reversed) */ | ||
270 | const char *zFrom, /* If the ending matches this... (Reversed) */ | ||
271 | const char *zTo, /* ... change the ending to this (not reversed) */ | ||
272 | int (*xCond)(const char*) /* Condition that must be true */ | ||
273 | ){ | ||
274 | char *z = *pz; | ||
275 | while( *zFrom && *zFrom==*z ){ z++; zFrom++; } | ||
276 | if( *zFrom!=0 ) return 0; | ||
277 | if( xCond && !xCond(z) ) return 1; | ||
278 | while( *zTo ){ | ||
279 | *(--z) = *(zTo++); | ||
280 | } | ||
281 | *pz = z; | ||
282 | return 1; | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | ** This is the fallback stemmer used when the porter stemmer is | ||
287 | ** inappropriate. The input word is copied into the output with | ||
288 | ** US-ASCII case folding. If the input word is too long (more | ||
289 | ** than 20 bytes if it contains no digits or more than 6 bytes if | ||
290 | ** it contains digits) then word is truncated to 20 or 6 bytes | ||
291 | ** by taking 10 or 3 bytes from the beginning and end. | ||
292 | */ | ||
293 | static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ | ||
294 | int i, mx, j; | ||
295 | int hasDigit = 0; | ||
296 | for(i=0; i<nIn; i++){ | ||
297 | int c = zIn[i]; | ||
298 | if( c>='A' && c<='Z' ){ | ||
299 | zOut[i] = c - 'A' + 'a'; | ||
300 | }else{ | ||
301 | if( c>='0' && c<='9' ) hasDigit = 1; | ||
302 | zOut[i] = c; | ||
303 | } | ||
304 | } | ||
305 | mx = hasDigit ? 3 : 10; | ||
306 | if( nIn>mx*2 ){ | ||
307 | for(j=mx, i=nIn-mx; i<nIn; i++, j++){ | ||
308 | zOut[j] = zOut[i]; | ||
309 | } | ||
310 | i = j; | ||
311 | } | ||
312 | zOut[i] = 0; | ||
313 | *pnOut = i; | ||
314 | } | ||
315 | |||
316 | |||
317 | /* | ||
318 | ** Stem the input word zIn[0..nIn-1]. Store the output in zOut. | ||
319 | ** zOut is at least big enough to hold nIn bytes. Write the actual | ||
320 | ** size of the output word (exclusive of the '\0' terminator) into *pnOut. | ||
321 | ** | ||
322 | ** Any upper-case characters in the US-ASCII character set ([A-Z]) | ||
323 | ** are converted to lower case. Upper-case UTF characters are | ||
324 | ** unchanged. | ||
325 | ** | ||
326 | ** Words that are longer than about 20 bytes are stemmed by retaining | ||
327 | ** a few bytes from the beginning and the end of the word. If the | ||
328 | ** word contains digits, 3 bytes are taken from the beginning and | ||
329 | ** 3 bytes from the end. For long words without digits, 10 bytes | ||
330 | ** are taken from each end. US-ASCII case folding still applies. | ||
331 | ** | ||
332 | ** If the input word contains not digits but does characters not | ||
333 | ** in [a-zA-Z] then no stemming is attempted and this routine just | ||
334 | ** copies the input into the input into the output with US-ASCII | ||
335 | ** case folding. | ||
336 | ** | ||
337 | ** Stemming never increases the length of the word. So there is | ||
338 | ** no chance of overflowing the zOut buffer. | ||
339 | */ | ||
340 | static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ | ||
341 | int i, j, c; | ||
342 | char zReverse[28]; | ||
343 | char *z, *z2; | ||
344 | if( nIn<3 || nIn>=sizeof(zReverse)-7 ){ | ||
345 | /* The word is too big or too small for the porter stemmer. | ||
346 | ** Fallback to the copy stemmer */ | ||
347 | copy_stemmer(zIn, nIn, zOut, pnOut); | ||
348 | return; | ||
349 | } | ||
350 | for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ | ||
351 | c = zIn[i]; | ||
352 | if( c>='A' && c<='Z' ){ | ||
353 | zReverse[j] = c + 'a' - 'A'; | ||
354 | }else if( c>='a' && c<='z' ){ | ||
355 | zReverse[j] = c; | ||
356 | }else{ | ||
357 | /* The use of a character not in [a-zA-Z] means that we fallback | ||
358 | ** to the copy stemmer */ | ||
359 | copy_stemmer(zIn, nIn, zOut, pnOut); | ||
360 | return; | ||
361 | } | ||
362 | } | ||
363 | memset(&zReverse[sizeof(zReverse)-5], 0, 5); | ||
364 | z = &zReverse[j+1]; | ||
365 | |||
366 | |||
367 | /* Step 1a */ | ||
368 | if( z[0]=='s' ){ | ||
369 | if( | ||
370 | !stem(&z, "sess", "ss", 0) && | ||
371 | !stem(&z, "sei", "i", 0) && | ||
372 | !stem(&z, "ss", "ss", 0) | ||
373 | ){ | ||
374 | z++; | ||
375 | } | ||
376 | } | ||
377 | |||
378 | /* Step 1b */ | ||
379 | z2 = z; | ||
380 | if( stem(&z, "dee", "ee", m_gt_0) ){ | ||
381 | /* Do nothing. The work was all in the test */ | ||
382 | }else if( | ||
383 | (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) | ||
384 | && z!=z2 | ||
385 | ){ | ||
386 | if( stem(&z, "ta", "ate", 0) || | ||
387 | stem(&z, "lb", "ble", 0) || | ||
388 | stem(&z, "zi", "ize", 0) ){ | ||
389 | /* Do nothing. The work was all in the test */ | ||
390 | }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){ | ||
391 | z++; | ||
392 | }else if( m_eq_1(z) && star_oh(z) ){ | ||
393 | *(--z) = 'e'; | ||
394 | } | ||
395 | } | ||
396 | |||
397 | /* Step 1c */ | ||
398 | if( z[0]=='y' && hasVowel(z+1) ){ | ||
399 | z[0] = 'i'; | ||
400 | } | ||
401 | |||
402 | /* Step 2 */ | ||
403 | switch( z[1] ){ | ||
404 | case 'a': | ||
405 | stem(&z, "lanoita", "ate", m_gt_0) || | ||
406 | stem(&z, "lanoit", "tion", m_gt_0); | ||
407 | break; | ||
408 | case 'c': | ||
409 | stem(&z, "icne", "ence", m_gt_0) || | ||
410 | stem(&z, "icna", "ance", m_gt_0); | ||
411 | break; | ||
412 | case 'e': | ||
413 | stem(&z, "rezi", "ize", m_gt_0); | ||
414 | break; | ||
415 | case 'g': | ||
416 | stem(&z, "igol", "log", m_gt_0); | ||
417 | break; | ||
418 | case 'l': | ||
419 | stem(&z, "ilb", "ble", m_gt_0) || | ||
420 | stem(&z, "illa", "al", m_gt_0) || | ||
421 | stem(&z, "iltne", "ent", m_gt_0) || | ||
422 | stem(&z, "ile", "e", m_gt_0) || | ||
423 | stem(&z, "ilsuo", "ous", m_gt_0); | ||
424 | break; | ||
425 | case 'o': | ||
426 | stem(&z, "noitazi", "ize", m_gt_0) || | ||
427 | stem(&z, "noita", "ate", m_gt_0) || | ||
428 | stem(&z, "rota", "ate", m_gt_0); | ||
429 | break; | ||
430 | case 's': | ||
431 | stem(&z, "msila", "al", m_gt_0) || | ||
432 | stem(&z, "ssenevi", "ive", m_gt_0) || | ||
433 | stem(&z, "ssenluf", "ful", m_gt_0) || | ||
434 | stem(&z, "ssensuo", "ous", m_gt_0); | ||
435 | break; | ||
436 | case 't': | ||
437 | stem(&z, "itila", "al", m_gt_0) || | ||
438 | stem(&z, "itivi", "ive", m_gt_0) || | ||
439 | stem(&z, "itilib", "ble", m_gt_0); | ||
440 | break; | ||
441 | } | ||
442 | |||
443 | /* Step 3 */ | ||
444 | switch( z[0] ){ | ||
445 | case 'e': | ||
446 | stem(&z, "etaci", "ic", m_gt_0) || | ||
447 | stem(&z, "evita", "", m_gt_0) || | ||
448 | stem(&z, "ezila", "al", m_gt_0); | ||
449 | break; | ||
450 | case 'i': | ||
451 | stem(&z, "itici", "ic", m_gt_0); | ||
452 | break; | ||
453 | case 'l': | ||
454 | stem(&z, "laci", "ic", m_gt_0) || | ||
455 | stem(&z, "luf", "", m_gt_0); | ||
456 | break; | ||
457 | case 's': | ||
458 | stem(&z, "ssen", "", m_gt_0); | ||
459 | break; | ||
460 | } | ||
461 | |||
462 | /* Step 4 */ | ||
463 | switch( z[1] ){ | ||
464 | case 'a': | ||
465 | if( z[0]=='l' && m_gt_1(z+2) ){ | ||
466 | z += 2; | ||
467 | } | ||
468 | break; | ||
469 | case 'c': | ||
470 | if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){ | ||
471 | z += 4; | ||
472 | } | ||
473 | break; | ||
474 | case 'e': | ||
475 | if( z[0]=='r' && m_gt_1(z+2) ){ | ||
476 | z += 2; | ||
477 | } | ||
478 | break; | ||
479 | case 'i': | ||
480 | if( z[0]=='c' && m_gt_1(z+2) ){ | ||
481 | z += 2; | ||
482 | } | ||
483 | break; | ||
484 | case 'l': | ||
485 | if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){ | ||
486 | z += 4; | ||
487 | } | ||
488 | break; | ||
489 | case 'n': | ||
490 | if( z[0]=='t' ){ | ||
491 | if( z[2]=='a' ){ | ||
492 | if( m_gt_1(z+3) ){ | ||
493 | z += 3; | ||
494 | } | ||
495 | }else if( z[2]=='e' ){ | ||
496 | stem(&z, "tneme", "", m_gt_1) || | ||
497 | stem(&z, "tnem", "", m_gt_1) || | ||
498 | stem(&z, "tne", "", m_gt_1); | ||
499 | } | ||
500 | } | ||
501 | break; | ||
502 | case 'o': | ||
503 | if( z[0]=='u' ){ | ||
504 | if( m_gt_1(z+2) ){ | ||
505 | z += 2; | ||
506 | } | ||
507 | }else if( z[3]=='s' || z[3]=='t' ){ | ||
508 | stem(&z, "noi", "", m_gt_1); | ||
509 | } | ||
510 | break; | ||
511 | case 's': | ||
512 | if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ | ||
513 | z += 3; | ||
514 | } | ||
515 | break; | ||
516 | case 't': | ||
517 | stem(&z, "eta", "", m_gt_1) || | ||
518 | stem(&z, "iti", "", m_gt_1); | ||
519 | break; | ||
520 | case 'u': | ||
521 | if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ | ||
522 | z += 3; | ||
523 | } | ||
524 | break; | ||
525 | case 'v': | ||
526 | case 'z': | ||
527 | if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ | ||
528 | z += 3; | ||
529 | } | ||
530 | break; | ||
531 | } | ||
532 | |||
533 | /* Step 5a */ | ||
534 | if( z[0]=='e' ){ | ||
535 | if( m_gt_1(z+1) ){ | ||
536 | z++; | ||
537 | }else if( m_eq_1(z+1) && !star_oh(z+1) ){ | ||
538 | z++; | ||
539 | } | ||
540 | } | ||
541 | |||
542 | /* Step 5b */ | ||
543 | if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ | ||
544 | z++; | ||
545 | } | ||
546 | |||
547 | /* z[] is now the stemmed word in reverse order. Flip it back | ||
548 | ** around into forward order and return. | ||
549 | */ | ||
550 | *pnOut = i = strlen(z); | ||
551 | zOut[i] = 0; | ||
552 | while( *z ){ | ||
553 | zOut[--i] = *(z++); | ||
554 | } | ||
555 | } | ||
556 | |||
557 | /* | ||
558 | ** Characters that can be part of a token. We assume any character | ||
559 | ** whose value is greater than 0x80 (any UTF character) can be | ||
560 | ** part of a token. In other words, delimiters all must have | ||
561 | ** values of 0x7f or lower. | ||
562 | */ | ||
563 | static const char isIdChar[] = { | ||
564 | /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ | ||
565 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ | ||
566 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ | ||
567 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ | ||
568 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ | ||
569 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ | ||
570 | }; | ||
571 | #define idChar(C) (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30])) | ||
572 | #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30])) | ||
573 | |||
574 | /* | ||
575 | ** Extract the next token from a tokenization cursor. The cursor must | ||
576 | ** have been opened by a prior call to porterOpen(). | ||
577 | */ | ||
578 | static int porterNext( | ||
579 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */ | ||
580 | const char **pzToken, /* OUT: *pzToken is the token text */ | ||
581 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
582 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
583 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
584 | int *piPosition /* OUT: Position integer of token */ | ||
585 | ){ | ||
586 | porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; | ||
587 | const char *z = c->zInput; | ||
588 | |||
589 | while( c->iOffset<c->nInput ){ | ||
590 | int iStartOffset, ch; | ||
591 | |||
592 | /* Scan past delimiter characters */ | ||
593 | while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){ | ||
594 | c->iOffset++; | ||
595 | } | ||
596 | |||
597 | /* Count non-delimiter characters. */ | ||
598 | iStartOffset = c->iOffset; | ||
599 | while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ | ||
600 | c->iOffset++; | ||
601 | } | ||
602 | |||
603 | if( c->iOffset>iStartOffset ){ | ||
604 | int n = c->iOffset-iStartOffset; | ||
605 | if( n>c->nAllocated ){ | ||
606 | c->nAllocated = n+20; | ||
607 | c->zToken = realloc(c->zToken, c->nAllocated); | ||
608 | if( c->zToken==NULL ) return SQLITE_NOMEM; | ||
609 | } | ||
610 | porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); | ||
611 | *pzToken = c->zToken; | ||
612 | *piStartOffset = iStartOffset; | ||
613 | *piEndOffset = c->iOffset; | ||
614 | *piPosition = c->iToken++; | ||
615 | return SQLITE_OK; | ||
616 | } | ||
617 | } | ||
618 | return SQLITE_DONE; | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | ** The set of routines that implement the porter-stemmer tokenizer | ||
623 | */ | ||
624 | static const sqlite3_tokenizer_module porterTokenizerModule = { | ||
625 | 0, | ||
626 | porterCreate, | ||
627 | porterDestroy, | ||
628 | porterOpen, | ||
629 | porterClose, | ||
630 | porterNext, | ||
631 | }; | ||
632 | |||
633 | /* | ||
634 | ** Allocate a new porter tokenizer. Return a pointer to the new | ||
635 | ** tokenizer in *ppModule | ||
636 | */ | ||
637 | void sqlite3Fts1PorterTokenizerModule( | ||
638 | sqlite3_tokenizer_module const**ppModule | ||
639 | ){ | ||
640 | *ppModule = &porterTokenizerModule; | ||
641 | } | ||
642 | |||
643 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ | ||