diff options
Diffstat (limited to 'libraries/sqlite/win32/fts1.c')
-rwxr-xr-x | libraries/sqlite/win32/fts1.c | 3344 |
1 files changed, 3344 insertions, 0 deletions
diff --git a/libraries/sqlite/win32/fts1.c b/libraries/sqlite/win32/fts1.c new file mode 100755 index 0000000..5a69965 --- /dev/null +++ b/libraries/sqlite/win32/fts1.c | |||
@@ -0,0 +1,3344 @@ | |||
1 | /* fts1 has a design flaw which can lead to database corruption (see | ||
2 | ** below). It is recommended not to use it any longer, instead use | ||
3 | ** fts3 (or higher). If you believe that your use of fts1 is safe, | ||
4 | ** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS. | ||
5 | */ | ||
6 | #ifndef SQLITE_ENABLE_BROKEN_FTS1 | ||
7 | #error fts1 has a design flaw and has been deprecated. | ||
8 | #endif | ||
9 | /* The flaw is that fts1 uses the content table's unaliased rowid as | ||
10 | ** the unique docid. fts1 embeds the rowid in the index it builds, | ||
11 | ** and expects the rowid to not change. The SQLite VACUUM operation | ||
12 | ** will renumber such rowids, thereby breaking fts1. If you are using | ||
13 | ** fts1 in a system which has disabled VACUUM, then you can continue | ||
14 | ** to use it safely. Note that PRAGMA auto_vacuum does NOT disable | ||
15 | ** VACUUM, though systems using auto_vacuum are unlikely to invoke | ||
16 | ** VACUUM. | ||
17 | ** | ||
18 | ** fts1 should be safe even across VACUUM if you only insert documents | ||
19 | ** and never delete. | ||
20 | */ | ||
21 | |||
22 | /* The author disclaims copyright to this source code. | ||
23 | * | ||
24 | * This is an SQLite module implementing full-text search. | ||
25 | */ | ||
26 | |||
27 | /* | ||
28 | ** The code in this file is only compiled if: | ||
29 | ** | ||
30 | ** * The FTS1 module is being built as an extension | ||
31 | ** (in which case SQLITE_CORE is not defined), or | ||
32 | ** | ||
33 | ** * The FTS1 module is being built into the core of | ||
34 | ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). | ||
35 | */ | ||
36 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) | ||
37 | |||
38 | #if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE) | ||
39 | # define SQLITE_CORE 1 | ||
40 | #endif | ||
41 | |||
42 | #include <assert.h> | ||
43 | #include <stdlib.h> | ||
44 | #include <stdio.h> | ||
45 | #include <string.h> | ||
46 | #include <ctype.h> | ||
47 | |||
48 | #include "fts1.h" | ||
49 | #include "fts1_hash.h" | ||
50 | #include "fts1_tokenizer.h" | ||
51 | #include "sqlite3.h" | ||
52 | #include "sqlite3ext.h" | ||
53 | SQLITE_EXTENSION_INIT1 | ||
54 | |||
55 | |||
56 | #if 0 | ||
57 | # define TRACE(A) printf A; fflush(stdout) | ||
58 | #else | ||
59 | # define TRACE(A) | ||
60 | #endif | ||
61 | |||
62 | /* utility functions */ | ||
63 | |||
64 | typedef struct StringBuffer { | ||
65 | int len; /* length, not including null terminator */ | ||
66 | int alloced; /* Space allocated for s[] */ | ||
67 | char *s; /* Content of the string */ | ||
68 | } StringBuffer; | ||
69 | |||
70 | static void initStringBuffer(StringBuffer *sb){ | ||
71 | sb->len = 0; | ||
72 | sb->alloced = 100; | ||
73 | sb->s = malloc(100); | ||
74 | sb->s[0] = '\0'; | ||
75 | } | ||
76 | |||
77 | static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){ | ||
78 | if( sb->len + nFrom >= sb->alloced ){ | ||
79 | sb->alloced = sb->len + nFrom + 100; | ||
80 | sb->s = realloc(sb->s, sb->alloced+1); | ||
81 | if( sb->s==0 ){ | ||
82 | initStringBuffer(sb); | ||
83 | return; | ||
84 | } | ||
85 | } | ||
86 | memcpy(sb->s + sb->len, zFrom, nFrom); | ||
87 | sb->len += nFrom; | ||
88 | sb->s[sb->len] = 0; | ||
89 | } | ||
90 | static void append(StringBuffer *sb, const char *zFrom){ | ||
91 | nappend(sb, zFrom, strlen(zFrom)); | ||
92 | } | ||
93 | |||
94 | /* We encode variable-length integers in little-endian order using seven bits | ||
95 | * per byte as follows: | ||
96 | ** | ||
97 | ** KEY: | ||
98 | ** A = 0xxxxxxx 7 bits of data and one flag bit | ||
99 | ** B = 1xxxxxxx 7 bits of data and one flag bit | ||
100 | ** | ||
101 | ** 7 bits - A | ||
102 | ** 14 bits - BA | ||
103 | ** 21 bits - BBA | ||
104 | ** and so on. | ||
105 | */ | ||
106 | |||
107 | /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ | ||
108 | #define VARINT_MAX 10 | ||
109 | |||
110 | /* Write a 64-bit variable-length integer to memory starting at p[0]. | ||
111 | * The length of data written will be between 1 and VARINT_MAX bytes. | ||
112 | * The number of bytes written is returned. */ | ||
113 | static int putVarint(char *p, sqlite_int64 v){ | ||
114 | unsigned char *q = (unsigned char *) p; | ||
115 | sqlite_uint64 vu = v; | ||
116 | do{ | ||
117 | *q++ = (unsigned char) ((vu & 0x7f) | 0x80); | ||
118 | vu >>= 7; | ||
119 | }while( vu!=0 ); | ||
120 | q[-1] &= 0x7f; /* turn off high bit in final byte */ | ||
121 | assert( q - (unsigned char *)p <= VARINT_MAX ); | ||
122 | return (int) (q - (unsigned char *)p); | ||
123 | } | ||
124 | |||
125 | /* Read a 64-bit variable-length integer from memory starting at p[0]. | ||
126 | * Return the number of bytes read, or 0 on error. | ||
127 | * The value is stored in *v. */ | ||
128 | static int getVarint(const char *p, sqlite_int64 *v){ | ||
129 | const unsigned char *q = (const unsigned char *) p; | ||
130 | sqlite_uint64 x = 0, y = 1; | ||
131 | while( (*q & 0x80) == 0x80 ){ | ||
132 | x += y * (*q++ & 0x7f); | ||
133 | y <<= 7; | ||
134 | if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ | ||
135 | assert( 0 ); | ||
136 | return 0; | ||
137 | } | ||
138 | } | ||
139 | x += y * (*q++); | ||
140 | *v = (sqlite_int64) x; | ||
141 | return (int) (q - (unsigned char *)p); | ||
142 | } | ||
143 | |||
144 | static int getVarint32(const char *p, int *pi){ | ||
145 | sqlite_int64 i; | ||
146 | int ret = getVarint(p, &i); | ||
147 | *pi = (int) i; | ||
148 | assert( *pi==i ); | ||
149 | return ret; | ||
150 | } | ||
151 | |||
152 | /*** Document lists *** | ||
153 | * | ||
154 | * A document list holds a sorted list of varint-encoded document IDs. | ||
155 | * | ||
156 | * A doclist with type DL_POSITIONS_OFFSETS is stored like this: | ||
157 | * | ||
158 | * array { | ||
159 | * varint docid; | ||
160 | * array { | ||
161 | * varint position; (delta from previous position plus POS_BASE) | ||
162 | * varint startOffset; (delta from previous startOffset) | ||
163 | * varint endOffset; (delta from startOffset) | ||
164 | * } | ||
165 | * } | ||
166 | * | ||
167 | * Here, array { X } means zero or more occurrences of X, adjacent in memory. | ||
168 | * | ||
169 | * A position list may hold positions for text in multiple columns. A position | ||
170 | * POS_COLUMN is followed by a varint containing the index of the column for | ||
171 | * following positions in the list. Any positions appearing before any | ||
172 | * occurrences of POS_COLUMN are for column 0. | ||
173 | * | ||
174 | * A doclist with type DL_POSITIONS is like the above, but holds only docids | ||
175 | * and positions without offset information. | ||
176 | * | ||
177 | * A doclist with type DL_DOCIDS is like the above, but holds only docids | ||
178 | * without positions or offset information. | ||
179 | * | ||
180 | * On disk, every document list has positions and offsets, so we don't bother | ||
181 | * to serialize a doclist's type. | ||
182 | * | ||
183 | * We don't yet delta-encode document IDs; doing so will probably be a | ||
184 | * modest win. | ||
185 | * | ||
186 | * NOTE(shess) I've thought of a slightly (1%) better offset encoding. | ||
187 | * After the first offset, estimate the next offset by using the | ||
188 | * current token position and the previous token position and offset, | ||
189 | * offset to handle some variance. So the estimate would be | ||
190 | * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded | ||
191 | * as normal. Offsets more than 64 chars from the estimate are | ||
192 | * encoded as the delta to the previous start offset + 128. An | ||
193 | * additional tiny increment can be gained by using the end offset of | ||
194 | * the previous token to make the estimate a tiny bit more precise. | ||
195 | */ | ||
196 | |||
197 | /* It is not safe to call isspace(), tolower(), or isalnum() on | ||
198 | ** hi-bit-set characters. This is the same solution used in the | ||
199 | ** tokenizer. | ||
200 | */ | ||
201 | /* TODO(shess) The snippet-generation code should be using the | ||
202 | ** tokenizer-generated tokens rather than doing its own local | ||
203 | ** tokenization. | ||
204 | */ | ||
205 | /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ | ||
206 | static int safe_isspace(char c){ | ||
207 | return (c&0x80)==0 ? isspace(c) : 0; | ||
208 | } | ||
209 | static int safe_tolower(char c){ | ||
210 | return (c&0x80)==0 ? tolower(c) : c; | ||
211 | } | ||
212 | static int safe_isalnum(char c){ | ||
213 | return (c&0x80)==0 ? isalnum(c) : 0; | ||
214 | } | ||
215 | |||
216 | typedef enum DocListType { | ||
217 | DL_DOCIDS, /* docids only */ | ||
218 | DL_POSITIONS, /* docids + positions */ | ||
219 | DL_POSITIONS_OFFSETS /* docids + positions + offsets */ | ||
220 | } DocListType; | ||
221 | |||
222 | /* | ||
223 | ** By default, only positions and not offsets are stored in the doclists. | ||
224 | ** To change this so that offsets are stored too, compile with | ||
225 | ** | ||
226 | ** -DDL_DEFAULT=DL_POSITIONS_OFFSETS | ||
227 | ** | ||
228 | */ | ||
229 | #ifndef DL_DEFAULT | ||
230 | # define DL_DEFAULT DL_POSITIONS | ||
231 | #endif | ||
232 | |||
233 | typedef struct DocList { | ||
234 | char *pData; | ||
235 | int nData; | ||
236 | DocListType iType; | ||
237 | int iLastColumn; /* the last column written */ | ||
238 | int iLastPos; /* the last position written */ | ||
239 | int iLastOffset; /* the last start offset written */ | ||
240 | } DocList; | ||
241 | |||
242 | enum { | ||
243 | POS_END = 0, /* end of this position list */ | ||
244 | POS_COLUMN, /* followed by new column number */ | ||
245 | POS_BASE | ||
246 | }; | ||
247 | |||
248 | /* Initialize a new DocList to hold the given data. */ | ||
249 | static void docListInit(DocList *d, DocListType iType, | ||
250 | const char *pData, int nData){ | ||
251 | d->nData = nData; | ||
252 | if( nData>0 ){ | ||
253 | d->pData = malloc(nData); | ||
254 | memcpy(d->pData, pData, nData); | ||
255 | } else { | ||
256 | d->pData = NULL; | ||
257 | } | ||
258 | d->iType = iType; | ||
259 | d->iLastColumn = 0; | ||
260 | d->iLastPos = d->iLastOffset = 0; | ||
261 | } | ||
262 | |||
263 | /* Create a new dynamically-allocated DocList. */ | ||
264 | static DocList *docListNew(DocListType iType){ | ||
265 | DocList *d = (DocList *) malloc(sizeof(DocList)); | ||
266 | docListInit(d, iType, 0, 0); | ||
267 | return d; | ||
268 | } | ||
269 | |||
270 | static void docListDestroy(DocList *d){ | ||
271 | free(d->pData); | ||
272 | #ifndef NDEBUG | ||
273 | memset(d, 0x55, sizeof(*d)); | ||
274 | #endif | ||
275 | } | ||
276 | |||
277 | static void docListDelete(DocList *d){ | ||
278 | docListDestroy(d); | ||
279 | free(d); | ||
280 | } | ||
281 | |||
282 | static char *docListEnd(DocList *d){ | ||
283 | return d->pData + d->nData; | ||
284 | } | ||
285 | |||
286 | /* Append a varint to a DocList's data. */ | ||
287 | static void appendVarint(DocList *d, sqlite_int64 i){ | ||
288 | char c[VARINT_MAX]; | ||
289 | int n = putVarint(c, i); | ||
290 | d->pData = realloc(d->pData, d->nData + n); | ||
291 | memcpy(d->pData + d->nData, c, n); | ||
292 | d->nData += n; | ||
293 | } | ||
294 | |||
295 | static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ | ||
296 | appendVarint(d, iDocid); | ||
297 | if( d->iType>=DL_POSITIONS ){ | ||
298 | appendVarint(d, POS_END); /* initially empty position list */ | ||
299 | d->iLastColumn = 0; | ||
300 | d->iLastPos = d->iLastOffset = 0; | ||
301 | } | ||
302 | } | ||
303 | |||
304 | /* helper function for docListAddPos and docListAddPosOffset */ | ||
305 | static void addPos(DocList *d, int iColumn, int iPos){ | ||
306 | assert( d->nData>0 ); | ||
307 | --d->nData; /* remove previous terminator */ | ||
308 | if( iColumn!=d->iLastColumn ){ | ||
309 | assert( iColumn>d->iLastColumn ); | ||
310 | appendVarint(d, POS_COLUMN); | ||
311 | appendVarint(d, iColumn); | ||
312 | d->iLastColumn = iColumn; | ||
313 | d->iLastPos = d->iLastOffset = 0; | ||
314 | } | ||
315 | assert( iPos>=d->iLastPos ); | ||
316 | appendVarint(d, iPos-d->iLastPos+POS_BASE); | ||
317 | d->iLastPos = iPos; | ||
318 | } | ||
319 | |||
320 | /* Add a position to the last position list in a doclist. */ | ||
321 | static void docListAddPos(DocList *d, int iColumn, int iPos){ | ||
322 | assert( d->iType==DL_POSITIONS ); | ||
323 | addPos(d, iColumn, iPos); | ||
324 | appendVarint(d, POS_END); /* add new terminator */ | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | ** Add a position and starting and ending offsets to a doclist. | ||
329 | ** | ||
330 | ** If the doclist is setup to handle only positions, then insert | ||
331 | ** the position only and ignore the offsets. | ||
332 | */ | ||
333 | static void docListAddPosOffset( | ||
334 | DocList *d, /* Doclist under construction */ | ||
335 | int iColumn, /* Column the inserted term is part of */ | ||
336 | int iPos, /* Position of the inserted term */ | ||
337 | int iStartOffset, /* Starting offset of inserted term */ | ||
338 | int iEndOffset /* Ending offset of inserted term */ | ||
339 | ){ | ||
340 | assert( d->iType>=DL_POSITIONS ); | ||
341 | addPos(d, iColumn, iPos); | ||
342 | if( d->iType==DL_POSITIONS_OFFSETS ){ | ||
343 | assert( iStartOffset>=d->iLastOffset ); | ||
344 | appendVarint(d, iStartOffset-d->iLastOffset); | ||
345 | d->iLastOffset = iStartOffset; | ||
346 | assert( iEndOffset>=iStartOffset ); | ||
347 | appendVarint(d, iEndOffset-iStartOffset); | ||
348 | } | ||
349 | appendVarint(d, POS_END); /* add new terminator */ | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | ** A DocListReader object is a cursor into a doclist. Initialize | ||
354 | ** the cursor to the beginning of the doclist by calling readerInit(). | ||
355 | ** Then use routines | ||
356 | ** | ||
357 | ** peekDocid() | ||
358 | ** readDocid() | ||
359 | ** readPosition() | ||
360 | ** skipPositionList() | ||
361 | ** and so forth... | ||
362 | ** | ||
363 | ** to read information out of the doclist. When we reach the end | ||
364 | ** of the doclist, atEnd() returns TRUE. | ||
365 | */ | ||
366 | typedef struct DocListReader { | ||
367 | DocList *pDoclist; /* The document list we are stepping through */ | ||
368 | char *p; /* Pointer to next unread byte in the doclist */ | ||
369 | int iLastColumn; | ||
370 | int iLastPos; /* the last position read, or -1 when not in a position list */ | ||
371 | } DocListReader; | ||
372 | |||
373 | /* | ||
374 | ** Initialize the DocListReader r to point to the beginning of pDoclist. | ||
375 | */ | ||
376 | static void readerInit(DocListReader *r, DocList *pDoclist){ | ||
377 | r->pDoclist = pDoclist; | ||
378 | if( pDoclist!=NULL ){ | ||
379 | r->p = pDoclist->pData; | ||
380 | } | ||
381 | r->iLastColumn = -1; | ||
382 | r->iLastPos = -1; | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | ** Return TRUE if we have reached then end of pReader and there is | ||
387 | ** nothing else left to read. | ||
388 | */ | ||
389 | static int atEnd(DocListReader *pReader){ | ||
390 | return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist)); | ||
391 | } | ||
392 | |||
393 | /* Peek at the next docid without advancing the read pointer. | ||
394 | */ | ||
395 | static sqlite_int64 peekDocid(DocListReader *pReader){ | ||
396 | sqlite_int64 ret; | ||
397 | assert( !atEnd(pReader) ); | ||
398 | assert( pReader->iLastPos==-1 ); | ||
399 | getVarint(pReader->p, &ret); | ||
400 | return ret; | ||
401 | } | ||
402 | |||
403 | /* Read the next docid. See also nextDocid(). | ||
404 | */ | ||
405 | static sqlite_int64 readDocid(DocListReader *pReader){ | ||
406 | sqlite_int64 ret; | ||
407 | assert( !atEnd(pReader) ); | ||
408 | assert( pReader->iLastPos==-1 ); | ||
409 | pReader->p += getVarint(pReader->p, &ret); | ||
410 | if( pReader->pDoclist->iType>=DL_POSITIONS ){ | ||
411 | pReader->iLastColumn = 0; | ||
412 | pReader->iLastPos = 0; | ||
413 | } | ||
414 | return ret; | ||
415 | } | ||
416 | |||
417 | /* Read the next position and column index from a position list. | ||
418 | * Returns the position, or -1 at the end of the list. */ | ||
419 | static int readPosition(DocListReader *pReader, int *iColumn){ | ||
420 | int i; | ||
421 | int iType = pReader->pDoclist->iType; | ||
422 | |||
423 | if( pReader->iLastPos==-1 ){ | ||
424 | return -1; | ||
425 | } | ||
426 | assert( !atEnd(pReader) ); | ||
427 | |||
428 | if( iType<DL_POSITIONS ){ | ||
429 | return -1; | ||
430 | } | ||
431 | pReader->p += getVarint32(pReader->p, &i); | ||
432 | if( i==POS_END ){ | ||
433 | pReader->iLastColumn = pReader->iLastPos = -1; | ||
434 | *iColumn = -1; | ||
435 | return -1; | ||
436 | } | ||
437 | if( i==POS_COLUMN ){ | ||
438 | pReader->p += getVarint32(pReader->p, &pReader->iLastColumn); | ||
439 | pReader->iLastPos = 0; | ||
440 | pReader->p += getVarint32(pReader->p, &i); | ||
441 | assert( i>=POS_BASE ); | ||
442 | } | ||
443 | pReader->iLastPos += ((int) i)-POS_BASE; | ||
444 | if( iType>=DL_POSITIONS_OFFSETS ){ | ||
445 | /* Skip over offsets, ignoring them for now. */ | ||
446 | int iStart, iEnd; | ||
447 | pReader->p += getVarint32(pReader->p, &iStart); | ||
448 | pReader->p += getVarint32(pReader->p, &iEnd); | ||
449 | } | ||
450 | *iColumn = pReader->iLastColumn; | ||
451 | return pReader->iLastPos; | ||
452 | } | ||
453 | |||
454 | /* Skip past the end of a position list. */ | ||
455 | static void skipPositionList(DocListReader *pReader){ | ||
456 | DocList *p = pReader->pDoclist; | ||
457 | if( p && p->iType>=DL_POSITIONS ){ | ||
458 | int iColumn; | ||
459 | while( readPosition(pReader, &iColumn)!=-1 ){} | ||
460 | } | ||
461 | } | ||
462 | |||
463 | /* Skip over a docid, including its position list if the doclist has | ||
464 | * positions. */ | ||
465 | static void skipDocument(DocListReader *pReader){ | ||
466 | readDocid(pReader); | ||
467 | skipPositionList(pReader); | ||
468 | } | ||
469 | |||
470 | /* Skip past all docids which are less than [iDocid]. Returns 1 if a docid | ||
471 | * matching [iDocid] was found. */ | ||
472 | static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){ | ||
473 | sqlite_int64 d = 0; | ||
474 | while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){ | ||
475 | skipDocument(pReader); | ||
476 | } | ||
477 | return !atEnd(pReader) && d==iDocid; | ||
478 | } | ||
479 | |||
480 | /* Return the first document in a document list. | ||
481 | */ | ||
482 | static sqlite_int64 firstDocid(DocList *d){ | ||
483 | DocListReader r; | ||
484 | readerInit(&r, d); | ||
485 | return readDocid(&r); | ||
486 | } | ||
487 | |||
488 | #ifdef SQLITE_DEBUG | ||
489 | /* | ||
490 | ** This routine is used for debugging purpose only. | ||
491 | ** | ||
492 | ** Write the content of a doclist to standard output. | ||
493 | */ | ||
494 | static void printDoclist(DocList *p){ | ||
495 | DocListReader r; | ||
496 | const char *zSep = ""; | ||
497 | |||
498 | readerInit(&r, p); | ||
499 | while( !atEnd(&r) ){ | ||
500 | sqlite_int64 docid = readDocid(&r); | ||
501 | if( docid==0 ){ | ||
502 | skipPositionList(&r); | ||
503 | continue; | ||
504 | } | ||
505 | printf("%s%lld", zSep, docid); | ||
506 | zSep = ","; | ||
507 | if( p->iType>=DL_POSITIONS ){ | ||
508 | int iPos, iCol; | ||
509 | const char *zDiv = ""; | ||
510 | printf("("); | ||
511 | while( (iPos = readPosition(&r, &iCol))>=0 ){ | ||
512 | printf("%s%d:%d", zDiv, iCol, iPos); | ||
513 | zDiv = ":"; | ||
514 | } | ||
515 | printf(")"); | ||
516 | } | ||
517 | } | ||
518 | printf("\n"); | ||
519 | fflush(stdout); | ||
520 | } | ||
521 | #endif /* SQLITE_DEBUG */ | ||
522 | |||
523 | /* Trim the given doclist to contain only positions in column | ||
524 | * [iRestrictColumn]. */ | ||
525 | static void docListRestrictColumn(DocList *in, int iRestrictColumn){ | ||
526 | DocListReader r; | ||
527 | DocList out; | ||
528 | |||
529 | assert( in->iType>=DL_POSITIONS ); | ||
530 | readerInit(&r, in); | ||
531 | docListInit(&out, DL_POSITIONS, NULL, 0); | ||
532 | |||
533 | while( !atEnd(&r) ){ | ||
534 | sqlite_int64 iDocid = readDocid(&r); | ||
535 | int iPos, iColumn; | ||
536 | |||
537 | docListAddDocid(&out, iDocid); | ||
538 | while( (iPos = readPosition(&r, &iColumn)) != -1 ){ | ||
539 | if( iColumn==iRestrictColumn ){ | ||
540 | docListAddPos(&out, iColumn, iPos); | ||
541 | } | ||
542 | } | ||
543 | } | ||
544 | |||
545 | docListDestroy(in); | ||
546 | *in = out; | ||
547 | } | ||
548 | |||
549 | /* Trim the given doclist by discarding any docids without any remaining | ||
550 | * positions. */ | ||
551 | static void docListDiscardEmpty(DocList *in) { | ||
552 | DocListReader r; | ||
553 | DocList out; | ||
554 | |||
555 | /* TODO: It would be nice to implement this operation in place; that | ||
556 | * could save a significant amount of memory in queries with long doclists. */ | ||
557 | assert( in->iType>=DL_POSITIONS ); | ||
558 | readerInit(&r, in); | ||
559 | docListInit(&out, DL_POSITIONS, NULL, 0); | ||
560 | |||
561 | while( !atEnd(&r) ){ | ||
562 | sqlite_int64 iDocid = readDocid(&r); | ||
563 | int match = 0; | ||
564 | int iPos, iColumn; | ||
565 | while( (iPos = readPosition(&r, &iColumn)) != -1 ){ | ||
566 | if( !match ){ | ||
567 | docListAddDocid(&out, iDocid); | ||
568 | match = 1; | ||
569 | } | ||
570 | docListAddPos(&out, iColumn, iPos); | ||
571 | } | ||
572 | } | ||
573 | |||
574 | docListDestroy(in); | ||
575 | *in = out; | ||
576 | } | ||
577 | |||
578 | /* Helper function for docListUpdate() and docListAccumulate(). | ||
579 | ** Splices a doclist element into the doclist represented by r, | ||
580 | ** leaving r pointing after the newly spliced element. | ||
581 | */ | ||
582 | static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid, | ||
583 | const char *pSource, int nSource){ | ||
584 | DocList *d = r->pDoclist; | ||
585 | char *pTarget; | ||
586 | int nTarget, found; | ||
587 | |||
588 | found = skipToDocid(r, iDocid); | ||
589 | |||
590 | /* Describe slice in d to place pSource/nSource. */ | ||
591 | pTarget = r->p; | ||
592 | if( found ){ | ||
593 | skipDocument(r); | ||
594 | nTarget = r->p-pTarget; | ||
595 | }else{ | ||
596 | nTarget = 0; | ||
597 | } | ||
598 | |||
599 | /* The sense of the following is that there are three possibilities. | ||
600 | ** If nTarget==nSource, we should not move any memory nor realloc. | ||
601 | ** If nTarget>nSource, trim target and realloc. | ||
602 | ** If nTarget<nSource, realloc then expand target. | ||
603 | */ | ||
604 | if( nTarget>nSource ){ | ||
605 | memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget)); | ||
606 | } | ||
607 | if( nTarget!=nSource ){ | ||
608 | int iDoclist = pTarget-d->pData; | ||
609 | d->pData = realloc(d->pData, d->nData+nSource-nTarget); | ||
610 | pTarget = d->pData+iDoclist; | ||
611 | } | ||
612 | if( nTarget<nSource ){ | ||
613 | memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget)); | ||
614 | } | ||
615 | |||
616 | memcpy(pTarget, pSource, nSource); | ||
617 | d->nData += nSource-nTarget; | ||
618 | r->p = pTarget+nSource; | ||
619 | } | ||
620 | |||
621 | /* Insert/update pUpdate into the doclist. */ | ||
622 | static void docListUpdate(DocList *d, DocList *pUpdate){ | ||
623 | DocListReader reader; | ||
624 | |||
625 | assert( d!=NULL && pUpdate!=NULL ); | ||
626 | assert( d->iType==pUpdate->iType); | ||
627 | |||
628 | readerInit(&reader, d); | ||
629 | docListSpliceElement(&reader, firstDocid(pUpdate), | ||
630 | pUpdate->pData, pUpdate->nData); | ||
631 | } | ||
632 | |||
633 | /* Propagate elements from pUpdate to pAcc, overwriting elements with | ||
634 | ** matching docids. | ||
635 | */ | ||
636 | static void docListAccumulate(DocList *pAcc, DocList *pUpdate){ | ||
637 | DocListReader accReader, updateReader; | ||
638 | |||
639 | /* Handle edge cases where one doclist is empty. */ | ||
640 | assert( pAcc!=NULL ); | ||
641 | if( pUpdate==NULL || pUpdate->nData==0 ) return; | ||
642 | if( pAcc->nData==0 ){ | ||
643 | pAcc->pData = malloc(pUpdate->nData); | ||
644 | memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData); | ||
645 | pAcc->nData = pUpdate->nData; | ||
646 | return; | ||
647 | } | ||
648 | |||
649 | readerInit(&accReader, pAcc); | ||
650 | readerInit(&updateReader, pUpdate); | ||
651 | |||
652 | while( !atEnd(&updateReader) ){ | ||
653 | char *pSource = updateReader.p; | ||
654 | sqlite_int64 iDocid = readDocid(&updateReader); | ||
655 | skipPositionList(&updateReader); | ||
656 | docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource); | ||
657 | } | ||
658 | } | ||
659 | |||
660 | /* | ||
661 | ** Read the next docid off of pIn. Return 0 if we reach the end. | ||
662 | * | ||
663 | * TODO: This assumes that docids are never 0, but they may actually be 0 since | ||
664 | * users can choose docids when inserting into a full-text table. Fix this. | ||
665 | */ | ||
666 | static sqlite_int64 nextDocid(DocListReader *pIn){ | ||
667 | skipPositionList(pIn); | ||
668 | return atEnd(pIn) ? 0 : readDocid(pIn); | ||
669 | } | ||
670 | |||
671 | /* | ||
672 | ** pLeft and pRight are two DocListReaders that are pointing to | ||
673 | ** positions lists of the same document: iDocid. | ||
674 | ** | ||
675 | ** If there are no instances in pLeft or pRight where the position | ||
676 | ** of pLeft is one less than the position of pRight, then this | ||
677 | ** routine adds nothing to pOut. | ||
678 | ** | ||
679 | ** If there are one or more instances where positions from pLeft | ||
680 | ** are exactly one less than positions from pRight, then add a new | ||
681 | ** document record to pOut. If pOut wants to hold positions, then | ||
682 | ** include the positions from pRight that are one more than a | ||
683 | ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1. | ||
684 | ** | ||
685 | ** pLeft and pRight are left pointing at the next document record. | ||
686 | */ | ||
687 | static void mergePosList( | ||
688 | DocListReader *pLeft, /* Left position list */ | ||
689 | DocListReader *pRight, /* Right position list */ | ||
690 | sqlite_int64 iDocid, /* The docid from pLeft and pRight */ | ||
691 | DocList *pOut /* Write the merged document record here */ | ||
692 | ){ | ||
693 | int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol); | ||
694 | int iRightCol, iRightPos = readPosition(pRight, &iRightCol); | ||
695 | int match = 0; | ||
696 | |||
697 | /* Loop until we've reached the end of both position lists. */ | ||
698 | while( iLeftPos!=-1 && iRightPos!=-1 ){ | ||
699 | if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){ | ||
700 | if( !match ){ | ||
701 | docListAddDocid(pOut, iDocid); | ||
702 | match = 1; | ||
703 | } | ||
704 | if( pOut->iType>=DL_POSITIONS ){ | ||
705 | docListAddPos(pOut, iRightCol, iRightPos); | ||
706 | } | ||
707 | iLeftPos = readPosition(pLeft, &iLeftCol); | ||
708 | iRightPos = readPosition(pRight, &iRightCol); | ||
709 | }else if( iRightCol<iLeftCol || | ||
710 | (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){ | ||
711 | iRightPos = readPosition(pRight, &iRightCol); | ||
712 | }else{ | ||
713 | iLeftPos = readPosition(pLeft, &iLeftCol); | ||
714 | } | ||
715 | } | ||
716 | if( iLeftPos>=0 ) skipPositionList(pLeft); | ||
717 | if( iRightPos>=0 ) skipPositionList(pRight); | ||
718 | } | ||
719 | |||
720 | /* We have two doclists: pLeft and pRight. | ||
721 | ** Write the phrase intersection of these two doclists into pOut. | ||
722 | ** | ||
723 | ** A phrase intersection means that two documents only match | ||
724 | ** if pLeft.iPos+1==pRight.iPos. | ||
725 | ** | ||
726 | ** The output pOut may or may not contain positions. If pOut | ||
727 | ** does contain positions, they are the positions of pRight. | ||
728 | */ | ||
729 | static void docListPhraseMerge( | ||
730 | DocList *pLeft, /* Doclist resulting from the words on the left */ | ||
731 | DocList *pRight, /* Doclist for the next word to the right */ | ||
732 | DocList *pOut /* Write the combined doclist here */ | ||
733 | ){ | ||
734 | DocListReader left, right; | ||
735 | sqlite_int64 docidLeft, docidRight; | ||
736 | |||
737 | readerInit(&left, pLeft); | ||
738 | readerInit(&right, pRight); | ||
739 | docidLeft = nextDocid(&left); | ||
740 | docidRight = nextDocid(&right); | ||
741 | |||
742 | while( docidLeft>0 && docidRight>0 ){ | ||
743 | if( docidLeft<docidRight ){ | ||
744 | docidLeft = nextDocid(&left); | ||
745 | }else if( docidRight<docidLeft ){ | ||
746 | docidRight = nextDocid(&right); | ||
747 | }else{ | ||
748 | mergePosList(&left, &right, docidLeft, pOut); | ||
749 | docidLeft = nextDocid(&left); | ||
750 | docidRight = nextDocid(&right); | ||
751 | } | ||
752 | } | ||
753 | } | ||
754 | |||
755 | /* We have two doclists: pLeft and pRight. | ||
756 | ** Write the intersection of these two doclists into pOut. | ||
757 | ** Only docids are matched. Position information is ignored. | ||
758 | ** | ||
759 | ** The output pOut never holds positions. | ||
760 | */ | ||
761 | static void docListAndMerge( | ||
762 | DocList *pLeft, /* Doclist resulting from the words on the left */ | ||
763 | DocList *pRight, /* Doclist for the next word to the right */ | ||
764 | DocList *pOut /* Write the combined doclist here */ | ||
765 | ){ | ||
766 | DocListReader left, right; | ||
767 | sqlite_int64 docidLeft, docidRight; | ||
768 | |||
769 | assert( pOut->iType<DL_POSITIONS ); | ||
770 | |||
771 | readerInit(&left, pLeft); | ||
772 | readerInit(&right, pRight); | ||
773 | docidLeft = nextDocid(&left); | ||
774 | docidRight = nextDocid(&right); | ||
775 | |||
776 | while( docidLeft>0 && docidRight>0 ){ | ||
777 | if( docidLeft<docidRight ){ | ||
778 | docidLeft = nextDocid(&left); | ||
779 | }else if( docidRight<docidLeft ){ | ||
780 | docidRight = nextDocid(&right); | ||
781 | }else{ | ||
782 | docListAddDocid(pOut, docidLeft); | ||
783 | docidLeft = nextDocid(&left); | ||
784 | docidRight = nextDocid(&right); | ||
785 | } | ||
786 | } | ||
787 | } | ||
788 | |||
789 | /* We have two doclists: pLeft and pRight. | ||
790 | ** Write the union of these two doclists into pOut. | ||
791 | ** Only docids are matched. Position information is ignored. | ||
792 | ** | ||
793 | ** The output pOut never holds positions. | ||
794 | */ | ||
795 | static void docListOrMerge( | ||
796 | DocList *pLeft, /* Doclist resulting from the words on the left */ | ||
797 | DocList *pRight, /* Doclist for the next word to the right */ | ||
798 | DocList *pOut /* Write the combined doclist here */ | ||
799 | ){ | ||
800 | DocListReader left, right; | ||
801 | sqlite_int64 docidLeft, docidRight, priorLeft; | ||
802 | |||
803 | readerInit(&left, pLeft); | ||
804 | readerInit(&right, pRight); | ||
805 | docidLeft = nextDocid(&left); | ||
806 | docidRight = nextDocid(&right); | ||
807 | |||
808 | while( docidLeft>0 && docidRight>0 ){ | ||
809 | if( docidLeft<=docidRight ){ | ||
810 | docListAddDocid(pOut, docidLeft); | ||
811 | }else{ | ||
812 | docListAddDocid(pOut, docidRight); | ||
813 | } | ||
814 | priorLeft = docidLeft; | ||
815 | if( docidLeft<=docidRight ){ | ||
816 | docidLeft = nextDocid(&left); | ||
817 | } | ||
818 | if( docidRight>0 && docidRight<=priorLeft ){ | ||
819 | docidRight = nextDocid(&right); | ||
820 | } | ||
821 | } | ||
822 | while( docidLeft>0 ){ | ||
823 | docListAddDocid(pOut, docidLeft); | ||
824 | docidLeft = nextDocid(&left); | ||
825 | } | ||
826 | while( docidRight>0 ){ | ||
827 | docListAddDocid(pOut, docidRight); | ||
828 | docidRight = nextDocid(&right); | ||
829 | } | ||
830 | } | ||
831 | |||
832 | /* We have two doclists: pLeft and pRight. | ||
833 | ** Write into pOut all documents that occur in pLeft but not | ||
834 | ** in pRight. | ||
835 | ** | ||
836 | ** Only docids are matched. Position information is ignored. | ||
837 | ** | ||
838 | ** The output pOut never holds positions. | ||
839 | */ | ||
840 | static void docListExceptMerge( | ||
841 | DocList *pLeft, /* Doclist resulting from the words on the left */ | ||
842 | DocList *pRight, /* Doclist for the next word to the right */ | ||
843 | DocList *pOut /* Write the combined doclist here */ | ||
844 | ){ | ||
845 | DocListReader left, right; | ||
846 | sqlite_int64 docidLeft, docidRight, priorLeft; | ||
847 | |||
848 | readerInit(&left, pLeft); | ||
849 | readerInit(&right, pRight); | ||
850 | docidLeft = nextDocid(&left); | ||
851 | docidRight = nextDocid(&right); | ||
852 | |||
853 | while( docidLeft>0 && docidRight>0 ){ | ||
854 | priorLeft = docidLeft; | ||
855 | if( docidLeft<docidRight ){ | ||
856 | docListAddDocid(pOut, docidLeft); | ||
857 | } | ||
858 | if( docidLeft<=docidRight ){ | ||
859 | docidLeft = nextDocid(&left); | ||
860 | } | ||
861 | if( docidRight>0 && docidRight<=priorLeft ){ | ||
862 | docidRight = nextDocid(&right); | ||
863 | } | ||
864 | } | ||
865 | while( docidLeft>0 ){ | ||
866 | docListAddDocid(pOut, docidLeft); | ||
867 | docidLeft = nextDocid(&left); | ||
868 | } | ||
869 | } | ||
870 | |||
871 | static char *string_dup_n(const char *s, int n){ | ||
872 | char *str = malloc(n + 1); | ||
873 | memcpy(str, s, n); | ||
874 | str[n] = '\0'; | ||
875 | return str; | ||
876 | } | ||
877 | |||
878 | /* Duplicate a string; the caller must free() the returned string. | ||
879 | * (We don't use strdup() since it's not part of the standard C library and | ||
880 | * may not be available everywhere.) */ | ||
881 | static char *string_dup(const char *s){ | ||
882 | return string_dup_n(s, strlen(s)); | ||
883 | } | ||
884 | |||
885 | /* Format a string, replacing each occurrence of the % character with | ||
886 | * zDb.zName. This may be more convenient than sqlite_mprintf() | ||
887 | * when one string is used repeatedly in a format string. | ||
888 | * The caller must free() the returned string. */ | ||
889 | static char *string_format(const char *zFormat, | ||
890 | const char *zDb, const char *zName){ | ||
891 | const char *p; | ||
892 | size_t len = 0; | ||
893 | size_t nDb = strlen(zDb); | ||
894 | size_t nName = strlen(zName); | ||
895 | size_t nFullTableName = nDb+1+nName; | ||
896 | char *result; | ||
897 | char *r; | ||
898 | |||
899 | /* first compute length needed */ | ||
900 | for(p = zFormat ; *p ; ++p){ | ||
901 | len += (*p=='%' ? nFullTableName : 1); | ||
902 | } | ||
903 | len += 1; /* for null terminator */ | ||
904 | |||
905 | r = result = malloc(len); | ||
906 | for(p = zFormat; *p; ++p){ | ||
907 | if( *p=='%' ){ | ||
908 | memcpy(r, zDb, nDb); | ||
909 | r += nDb; | ||
910 | *r++ = '.'; | ||
911 | memcpy(r, zName, nName); | ||
912 | r += nName; | ||
913 | } else { | ||
914 | *r++ = *p; | ||
915 | } | ||
916 | } | ||
917 | *r++ = '\0'; | ||
918 | assert( r == result + len ); | ||
919 | return result; | ||
920 | } | ||
921 | |||
922 | static int sql_exec(sqlite3 *db, const char *zDb, const char *zName, | ||
923 | const char *zFormat){ | ||
924 | char *zCommand = string_format(zFormat, zDb, zName); | ||
925 | int rc; | ||
926 | TRACE(("FTS1 sql: %s\n", zCommand)); | ||
927 | rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); | ||
928 | free(zCommand); | ||
929 | return rc; | ||
930 | } | ||
931 | |||
932 | static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName, | ||
933 | sqlite3_stmt **ppStmt, const char *zFormat){ | ||
934 | char *zCommand = string_format(zFormat, zDb, zName); | ||
935 | int rc; | ||
936 | TRACE(("FTS1 prepare: %s\n", zCommand)); | ||
937 | rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); | ||
938 | free(zCommand); | ||
939 | return rc; | ||
940 | } | ||
941 | |||
942 | /* end utility functions */ | ||
943 | |||
944 | /* Forward reference */ | ||
945 | typedef struct fulltext_vtab fulltext_vtab; | ||
946 | |||
947 | /* A single term in a query is represented by an instances of | ||
948 | ** the following structure. | ||
949 | */ | ||
950 | typedef struct QueryTerm { | ||
951 | short int nPhrase; /* How many following terms are part of the same phrase */ | ||
952 | short int iPhrase; /* This is the i-th term of a phrase. */ | ||
953 | short int iColumn; /* Column of the index that must match this term */ | ||
954 | signed char isOr; /* this term is preceded by "OR" */ | ||
955 | signed char isNot; /* this term is preceded by "-" */ | ||
956 | char *pTerm; /* text of the term. '\000' terminated. malloced */ | ||
957 | int nTerm; /* Number of bytes in pTerm[] */ | ||
958 | } QueryTerm; | ||
959 | |||
960 | |||
961 | /* A query string is parsed into a Query structure. | ||
962 | * | ||
963 | * We could, in theory, allow query strings to be complicated | ||
964 | * nested expressions with precedence determined by parentheses. | ||
965 | * But none of the major search engines do this. (Perhaps the | ||
966 | * feeling is that an parenthesized expression is two complex of | ||
967 | * an idea for the average user to grasp.) Taking our lead from | ||
968 | * the major search engines, we will allow queries to be a list | ||
969 | * of terms (with an implied AND operator) or phrases in double-quotes, | ||
970 | * with a single optional "-" before each non-phrase term to designate | ||
971 | * negation and an optional OR connector. | ||
972 | * | ||
973 | * OR binds more tightly than the implied AND, which is what the | ||
974 | * major search engines seem to do. So, for example: | ||
975 | * | ||
976 | * [one two OR three] ==> one AND (two OR three) | ||
977 | * [one OR two three] ==> (one OR two) AND three | ||
978 | * | ||
979 | * A "-" before a term matches all entries that lack that term. | ||
980 | * The "-" must occur immediately before the term with in intervening | ||
981 | * space. This is how the search engines do it. | ||
982 | * | ||
983 | * A NOT term cannot be the right-hand operand of an OR. If this | ||
984 | * occurs in the query string, the NOT is ignored: | ||
985 | * | ||
986 | * [one OR -two] ==> one OR two | ||
987 | * | ||
988 | */ | ||
989 | typedef struct Query { | ||
990 | fulltext_vtab *pFts; /* The full text index */ | ||
991 | int nTerms; /* Number of terms in the query */ | ||
992 | QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */ | ||
993 | int nextIsOr; /* Set the isOr flag on the next inserted term */ | ||
994 | int nextColumn; /* Next word parsed must be in this column */ | ||
995 | int dfltColumn; /* The default column */ | ||
996 | } Query; | ||
997 | |||
998 | |||
999 | /* | ||
1000 | ** An instance of the following structure keeps track of generated | ||
1001 | ** matching-word offset information and snippets. | ||
1002 | */ | ||
1003 | typedef struct Snippet { | ||
1004 | int nMatch; /* Total number of matches */ | ||
1005 | int nAlloc; /* Space allocated for aMatch[] */ | ||
1006 | struct snippetMatch { /* One entry for each matching term */ | ||
1007 | char snStatus; /* Status flag for use while constructing snippets */ | ||
1008 | short int iCol; /* The column that contains the match */ | ||
1009 | short int iTerm; /* The index in Query.pTerms[] of the matching term */ | ||
1010 | short int nByte; /* Number of bytes in the term */ | ||
1011 | int iStart; /* The offset to the first character of the term */ | ||
1012 | } *aMatch; /* Points to space obtained from malloc */ | ||
1013 | char *zOffset; /* Text rendering of aMatch[] */ | ||
1014 | int nOffset; /* strlen(zOffset) */ | ||
1015 | char *zSnippet; /* Snippet text */ | ||
1016 | int nSnippet; /* strlen(zSnippet) */ | ||
1017 | } Snippet; | ||
1018 | |||
1019 | |||
1020 | typedef enum QueryType { | ||
1021 | QUERY_GENERIC, /* table scan */ | ||
1022 | QUERY_ROWID, /* lookup by rowid */ | ||
1023 | QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/ | ||
1024 | } QueryType; | ||
1025 | |||
1026 | /* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0 | ||
1027 | ** before we start aggregating into larger segments. Lower CHUNK_MAX | ||
1028 | ** means that for a given input we have more individual segments per | ||
1029 | ** term, which means more rows in the table and a bigger index (due to | ||
1030 | ** both more rows and bigger rowids). But it also reduces the average | ||
1031 | ** cost of adding new elements to the segment 0 doclist, and it seems | ||
1032 | ** to reduce the number of pages read and written during inserts. 256 | ||
1033 | ** was chosen by measuring insertion times for a certain input (first | ||
1034 | ** 10k documents of Enron corpus), though including query performance | ||
1035 | ** in the decision may argue for a larger value. | ||
1036 | */ | ||
1037 | #define CHUNK_MAX 256 | ||
1038 | |||
1039 | typedef enum fulltext_statement { | ||
1040 | CONTENT_INSERT_STMT, | ||
1041 | CONTENT_SELECT_STMT, | ||
1042 | CONTENT_UPDATE_STMT, | ||
1043 | CONTENT_DELETE_STMT, | ||
1044 | |||
1045 | TERM_SELECT_STMT, | ||
1046 | TERM_SELECT_ALL_STMT, | ||
1047 | TERM_INSERT_STMT, | ||
1048 | TERM_UPDATE_STMT, | ||
1049 | TERM_DELETE_STMT, | ||
1050 | |||
1051 | MAX_STMT /* Always at end! */ | ||
1052 | } fulltext_statement; | ||
1053 | |||
1054 | /* These must exactly match the enum above. */ | ||
1055 | /* TODO(adam): Is there some risk that a statement (in particular, | ||
1056 | ** pTermSelectStmt) will be used in two cursors at once, e.g. if a | ||
1057 | ** query joins a virtual table to itself? If so perhaps we should | ||
1058 | ** move some of these to the cursor object. | ||
1059 | */ | ||
1060 | static const char *const fulltext_zStatement[MAX_STMT] = { | ||
1061 | /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */ | ||
1062 | /* CONTENT_SELECT */ "select * from %_content where rowid = ?", | ||
1063 | /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */ | ||
1064 | /* CONTENT_DELETE */ "delete from %_content where rowid = ?", | ||
1065 | |||
1066 | /* TERM_SELECT */ | ||
1067 | "select rowid, doclist from %_term where term = ? and segment = ?", | ||
1068 | /* TERM_SELECT_ALL */ | ||
1069 | "select doclist from %_term where term = ? order by segment", | ||
1070 | /* TERM_INSERT */ | ||
1071 | "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)", | ||
1072 | /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", | ||
1073 | /* TERM_DELETE */ "delete from %_term where rowid = ?", | ||
1074 | }; | ||
1075 | |||
1076 | /* | ||
1077 | ** A connection to a fulltext index is an instance of the following | ||
1078 | ** structure. The xCreate and xConnect methods create an instance | ||
1079 | ** of this structure and xDestroy and xDisconnect free that instance. | ||
1080 | ** All other methods receive a pointer to the structure as one of their | ||
1081 | ** arguments. | ||
1082 | */ | ||
1083 | struct fulltext_vtab { | ||
1084 | sqlite3_vtab base; /* Base class used by SQLite core */ | ||
1085 | sqlite3 *db; /* The database connection */ | ||
1086 | const char *zDb; /* logical database name */ | ||
1087 | const char *zName; /* virtual table name */ | ||
1088 | int nColumn; /* number of columns in virtual table */ | ||
1089 | char **azColumn; /* column names. malloced */ | ||
1090 | char **azContentColumn; /* column names in content table; malloced */ | ||
1091 | sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ | ||
1092 | |||
1093 | /* Precompiled statements which we keep as long as the table is | ||
1094 | ** open. | ||
1095 | */ | ||
1096 | sqlite3_stmt *pFulltextStatements[MAX_STMT]; | ||
1097 | }; | ||
1098 | |||
1099 | /* | ||
1100 | ** When the core wants to do a query, it create a cursor using a | ||
1101 | ** call to xOpen. This structure is an instance of a cursor. It | ||
1102 | ** is destroyed by xClose. | ||
1103 | */ | ||
1104 | typedef struct fulltext_cursor { | ||
1105 | sqlite3_vtab_cursor base; /* Base class used by SQLite core */ | ||
1106 | QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */ | ||
1107 | sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */ | ||
1108 | int eof; /* True if at End Of Results */ | ||
1109 | Query q; /* Parsed query string */ | ||
1110 | Snippet snippet; /* Cached snippet for the current row */ | ||
1111 | int iColumn; /* Column being searched */ | ||
1112 | DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */ | ||
1113 | } fulltext_cursor; | ||
1114 | |||
1115 | static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ | ||
1116 | return (fulltext_vtab *) c->base.pVtab; | ||
1117 | } | ||
1118 | |||
1119 | static const sqlite3_module fulltextModule; /* forward declaration */ | ||
1120 | |||
1121 | /* Append a list of strings separated by commas to a StringBuffer. */ | ||
1122 | static void appendList(StringBuffer *sb, int nString, char **azString){ | ||
1123 | int i; | ||
1124 | for(i=0; i<nString; ++i){ | ||
1125 | if( i>0 ) append(sb, ", "); | ||
1126 | append(sb, azString[i]); | ||
1127 | } | ||
1128 | } | ||
1129 | |||
1130 | /* Return a dynamically generated statement of the form | ||
1131 | * insert into %_content (rowid, ...) values (?, ...) | ||
1132 | */ | ||
1133 | static const char *contentInsertStatement(fulltext_vtab *v){ | ||
1134 | StringBuffer sb; | ||
1135 | int i; | ||
1136 | |||
1137 | initStringBuffer(&sb); | ||
1138 | append(&sb, "insert into %_content (rowid, "); | ||
1139 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
1140 | append(&sb, ") values (?"); | ||
1141 | for(i=0; i<v->nColumn; ++i) | ||
1142 | append(&sb, ", ?"); | ||
1143 | append(&sb, ")"); | ||
1144 | return sb.s; | ||
1145 | } | ||
1146 | |||
1147 | /* Return a dynamically generated statement of the form | ||
1148 | * update %_content set [col_0] = ?, [col_1] = ?, ... | ||
1149 | * where rowid = ? | ||
1150 | */ | ||
1151 | static const char *contentUpdateStatement(fulltext_vtab *v){ | ||
1152 | StringBuffer sb; | ||
1153 | int i; | ||
1154 | |||
1155 | initStringBuffer(&sb); | ||
1156 | append(&sb, "update %_content set "); | ||
1157 | for(i=0; i<v->nColumn; ++i) { | ||
1158 | if( i>0 ){ | ||
1159 | append(&sb, ", "); | ||
1160 | } | ||
1161 | append(&sb, v->azContentColumn[i]); | ||
1162 | append(&sb, " = ?"); | ||
1163 | } | ||
1164 | append(&sb, " where rowid = ?"); | ||
1165 | return sb.s; | ||
1166 | } | ||
1167 | |||
1168 | /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. | ||
1169 | ** If the indicated statement has never been prepared, it is prepared | ||
1170 | ** and cached, otherwise the cached version is reset. | ||
1171 | */ | ||
1172 | static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, | ||
1173 | sqlite3_stmt **ppStmt){ | ||
1174 | assert( iStmt<MAX_STMT ); | ||
1175 | if( v->pFulltextStatements[iStmt]==NULL ){ | ||
1176 | const char *zStmt; | ||
1177 | int rc; | ||
1178 | switch( iStmt ){ | ||
1179 | case CONTENT_INSERT_STMT: | ||
1180 | zStmt = contentInsertStatement(v); break; | ||
1181 | case CONTENT_UPDATE_STMT: | ||
1182 | zStmt = contentUpdateStatement(v); break; | ||
1183 | default: | ||
1184 | zStmt = fulltext_zStatement[iStmt]; | ||
1185 | } | ||
1186 | rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt], | ||
1187 | zStmt); | ||
1188 | if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt); | ||
1189 | if( rc!=SQLITE_OK ) return rc; | ||
1190 | } else { | ||
1191 | int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); | ||
1192 | if( rc!=SQLITE_OK ) return rc; | ||
1193 | } | ||
1194 | |||
1195 | *ppStmt = v->pFulltextStatements[iStmt]; | ||
1196 | return SQLITE_OK; | ||
1197 | } | ||
1198 | |||
1199 | /* Step the indicated statement, handling errors SQLITE_BUSY (by | ||
1200 | ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring | ||
1201 | ** bindings to the new statement). | ||
1202 | ** TODO(adam): We should extend this function so that it can work with | ||
1203 | ** statements declared locally, not only globally cached statements. | ||
1204 | */ | ||
1205 | static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, | ||
1206 | sqlite3_stmt **ppStmt){ | ||
1207 | int rc; | ||
1208 | sqlite3_stmt *s = *ppStmt; | ||
1209 | assert( iStmt<MAX_STMT ); | ||
1210 | assert( s==v->pFulltextStatements[iStmt] ); | ||
1211 | |||
1212 | while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ | ||
1213 | if( rc==SQLITE_BUSY ) continue; | ||
1214 | if( rc!=SQLITE_ERROR ) return rc; | ||
1215 | |||
1216 | /* If an SQLITE_SCHEMA error has occured, then finalizing this | ||
1217 | * statement is going to delete the fulltext_vtab structure. If | ||
1218 | * the statement just executed is in the pFulltextStatements[] | ||
1219 | * array, it will be finalized twice. So remove it before | ||
1220 | * calling sqlite3_finalize(). | ||
1221 | */ | ||
1222 | v->pFulltextStatements[iStmt] = NULL; | ||
1223 | rc = sqlite3_finalize(s); | ||
1224 | break; | ||
1225 | } | ||
1226 | return rc; | ||
1227 | |||
1228 | err: | ||
1229 | sqlite3_finalize(s); | ||
1230 | return rc; | ||
1231 | } | ||
1232 | |||
1233 | /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. | ||
1234 | ** Useful for statements like UPDATE, where we expect no results. | ||
1235 | */ | ||
1236 | static int sql_single_step_statement(fulltext_vtab *v, | ||
1237 | fulltext_statement iStmt, | ||
1238 | sqlite3_stmt **ppStmt){ | ||
1239 | int rc = sql_step_statement(v, iStmt, ppStmt); | ||
1240 | return (rc==SQLITE_DONE) ? SQLITE_OK : rc; | ||
1241 | } | ||
1242 | |||
1243 | /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */ | ||
1244 | static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, | ||
1245 | sqlite3_value **pValues){ | ||
1246 | sqlite3_stmt *s; | ||
1247 | int i; | ||
1248 | int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); | ||
1249 | if( rc!=SQLITE_OK ) return rc; | ||
1250 | |||
1251 | rc = sqlite3_bind_value(s, 1, rowid); | ||
1252 | if( rc!=SQLITE_OK ) return rc; | ||
1253 | |||
1254 | for(i=0; i<v->nColumn; ++i){ | ||
1255 | rc = sqlite3_bind_value(s, 2+i, pValues[i]); | ||
1256 | if( rc!=SQLITE_OK ) return rc; | ||
1257 | } | ||
1258 | |||
1259 | return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); | ||
1260 | } | ||
1261 | |||
1262 | /* update %_content set col0 = pValues[0], col1 = pValues[1], ... | ||
1263 | * where rowid = [iRowid] */ | ||
1264 | static int content_update(fulltext_vtab *v, sqlite3_value **pValues, | ||
1265 | sqlite_int64 iRowid){ | ||
1266 | sqlite3_stmt *s; | ||
1267 | int i; | ||
1268 | int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s); | ||
1269 | if( rc!=SQLITE_OK ) return rc; | ||
1270 | |||
1271 | for(i=0; i<v->nColumn; ++i){ | ||
1272 | rc = sqlite3_bind_value(s, 1+i, pValues[i]); | ||
1273 | if( rc!=SQLITE_OK ) return rc; | ||
1274 | } | ||
1275 | |||
1276 | rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid); | ||
1277 | if( rc!=SQLITE_OK ) return rc; | ||
1278 | |||
1279 | return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s); | ||
1280 | } | ||
1281 | |||
1282 | static void freeStringArray(int nString, const char **pString){ | ||
1283 | int i; | ||
1284 | |||
1285 | for (i=0 ; i < nString ; ++i) { | ||
1286 | if( pString[i]!=NULL ) free((void *) pString[i]); | ||
1287 | } | ||
1288 | free((void *) pString); | ||
1289 | } | ||
1290 | |||
1291 | /* select * from %_content where rowid = [iRow] | ||
1292 | * The caller must delete the returned array and all strings in it. | ||
1293 | * null fields will be NULL in the returned array. | ||
1294 | * | ||
1295 | * TODO: Perhaps we should return pointer/length strings here for consistency | ||
1296 | * with other code which uses pointer/length. */ | ||
1297 | static int content_select(fulltext_vtab *v, sqlite_int64 iRow, | ||
1298 | const char ***pValues){ | ||
1299 | sqlite3_stmt *s; | ||
1300 | const char **values; | ||
1301 | int i; | ||
1302 | int rc; | ||
1303 | |||
1304 | *pValues = NULL; | ||
1305 | |||
1306 | rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); | ||
1307 | if( rc!=SQLITE_OK ) return rc; | ||
1308 | |||
1309 | rc = sqlite3_bind_int64(s, 1, iRow); | ||
1310 | if( rc!=SQLITE_OK ) return rc; | ||
1311 | |||
1312 | rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); | ||
1313 | if( rc!=SQLITE_ROW ) return rc; | ||
1314 | |||
1315 | values = (const char **) malloc(v->nColumn * sizeof(const char *)); | ||
1316 | for(i=0; i<v->nColumn; ++i){ | ||
1317 | if( sqlite3_column_type(s, i)==SQLITE_NULL ){ | ||
1318 | values[i] = NULL; | ||
1319 | }else{ | ||
1320 | values[i] = string_dup((char*)sqlite3_column_text(s, i)); | ||
1321 | } | ||
1322 | } | ||
1323 | |||
1324 | /* We expect only one row. We must execute another sqlite3_step() | ||
1325 | * to complete the iteration; otherwise the table will remain locked. */ | ||
1326 | rc = sqlite3_step(s); | ||
1327 | if( rc==SQLITE_DONE ){ | ||
1328 | *pValues = values; | ||
1329 | return SQLITE_OK; | ||
1330 | } | ||
1331 | |||
1332 | freeStringArray(v->nColumn, values); | ||
1333 | return rc; | ||
1334 | } | ||
1335 | |||
1336 | /* delete from %_content where rowid = [iRow ] */ | ||
1337 | static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ | ||
1338 | sqlite3_stmt *s; | ||
1339 | int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); | ||
1340 | if( rc!=SQLITE_OK ) return rc; | ||
1341 | |||
1342 | rc = sqlite3_bind_int64(s, 1, iRow); | ||
1343 | if( rc!=SQLITE_OK ) return rc; | ||
1344 | |||
1345 | return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); | ||
1346 | } | ||
1347 | |||
1348 | /* select rowid, doclist from %_term | ||
1349 | * where term = [pTerm] and segment = [iSegment] | ||
1350 | * If found, returns SQLITE_ROW; the caller must free the | ||
1351 | * returned doclist. If no rows found, returns SQLITE_DONE. */ | ||
1352 | static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm, | ||
1353 | int iSegment, | ||
1354 | sqlite_int64 *rowid, DocList *out){ | ||
1355 | sqlite3_stmt *s; | ||
1356 | int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); | ||
1357 | if( rc!=SQLITE_OK ) return rc; | ||
1358 | |||
1359 | rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC); | ||
1360 | if( rc!=SQLITE_OK ) return rc; | ||
1361 | |||
1362 | rc = sqlite3_bind_int(s, 2, iSegment); | ||
1363 | if( rc!=SQLITE_OK ) return rc; | ||
1364 | |||
1365 | rc = sql_step_statement(v, TERM_SELECT_STMT, &s); | ||
1366 | if( rc!=SQLITE_ROW ) return rc; | ||
1367 | |||
1368 | *rowid = sqlite3_column_int64(s, 0); | ||
1369 | docListInit(out, DL_DEFAULT, | ||
1370 | sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); | ||
1371 | |||
1372 | /* We expect only one row. We must execute another sqlite3_step() | ||
1373 | * to complete the iteration; otherwise the table will remain locked. */ | ||
1374 | rc = sqlite3_step(s); | ||
1375 | return rc==SQLITE_DONE ? SQLITE_ROW : rc; | ||
1376 | } | ||
1377 | |||
1378 | /* Load the segment doclists for term pTerm and merge them in | ||
1379 | ** appropriate order into out. Returns SQLITE_OK if successful. If | ||
1380 | ** there are no segments for pTerm, successfully returns an empty | ||
1381 | ** doclist in out. | ||
1382 | ** | ||
1383 | ** Each document consists of 1 or more "columns". The number of | ||
1384 | ** columns is v->nColumn. If iColumn==v->nColumn, then return | ||
1385 | ** position information about all columns. If iColumn<v->nColumn, | ||
1386 | ** then only return position information about the iColumn-th column | ||
1387 | ** (where the first column is 0). | ||
1388 | */ | ||
1389 | static int term_select_all( | ||
1390 | fulltext_vtab *v, /* The fulltext index we are querying against */ | ||
1391 | int iColumn, /* If <nColumn, only look at the iColumn-th column */ | ||
1392 | const char *pTerm, /* The term whose posting lists we want */ | ||
1393 | int nTerm, /* Number of bytes in pTerm */ | ||
1394 | DocList *out /* Write the resulting doclist here */ | ||
1395 | ){ | ||
1396 | DocList doclist; | ||
1397 | sqlite3_stmt *s; | ||
1398 | int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s); | ||
1399 | if( rc!=SQLITE_OK ) return rc; | ||
1400 | |||
1401 | rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC); | ||
1402 | if( rc!=SQLITE_OK ) return rc; | ||
1403 | |||
1404 | docListInit(&doclist, DL_DEFAULT, 0, 0); | ||
1405 | |||
1406 | /* TODO(shess) Handle schema and busy errors. */ | ||
1407 | while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){ | ||
1408 | DocList old; | ||
1409 | |||
1410 | /* TODO(shess) If we processed doclists from oldest to newest, we | ||
1411 | ** could skip the malloc() involved with the following call. For | ||
1412 | ** now, I'd rather keep this logic similar to index_insert_term(). | ||
1413 | ** We could additionally drop elements when we see deletes, but | ||
1414 | ** that would require a distinct version of docListAccumulate(). | ||
1415 | */ | ||
1416 | docListInit(&old, DL_DEFAULT, | ||
1417 | sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0)); | ||
1418 | |||
1419 | if( iColumn<v->nColumn ){ /* querying a single column */ | ||
1420 | docListRestrictColumn(&old, iColumn); | ||
1421 | } | ||
1422 | |||
1423 | /* doclist contains the newer data, so write it over old. Then | ||
1424 | ** steal accumulated result for doclist. | ||
1425 | */ | ||
1426 | docListAccumulate(&old, &doclist); | ||
1427 | docListDestroy(&doclist); | ||
1428 | doclist = old; | ||
1429 | } | ||
1430 | if( rc!=SQLITE_DONE ){ | ||
1431 | docListDestroy(&doclist); | ||
1432 | return rc; | ||
1433 | } | ||
1434 | |||
1435 | docListDiscardEmpty(&doclist); | ||
1436 | *out = doclist; | ||
1437 | return SQLITE_OK; | ||
1438 | } | ||
1439 | |||
1440 | /* insert into %_term (rowid, term, segment, doclist) | ||
1441 | values ([piRowid], [pTerm], [iSegment], [doclist]) | ||
1442 | ** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid. | ||
1443 | ** | ||
1444 | ** NOTE(shess) piRowid is IN, with values of "space of int64" plus | ||
1445 | ** null, it is not used to pass data back to the caller. | ||
1446 | */ | ||
1447 | static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid, | ||
1448 | const char *pTerm, int nTerm, | ||
1449 | int iSegment, DocList *doclist){ | ||
1450 | sqlite3_stmt *s; | ||
1451 | int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); | ||
1452 | if( rc!=SQLITE_OK ) return rc; | ||
1453 | |||
1454 | if( piRowid==NULL ){ | ||
1455 | rc = sqlite3_bind_null(s, 1); | ||
1456 | }else{ | ||
1457 | rc = sqlite3_bind_int64(s, 1, *piRowid); | ||
1458 | } | ||
1459 | if( rc!=SQLITE_OK ) return rc; | ||
1460 | |||
1461 | rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC); | ||
1462 | if( rc!=SQLITE_OK ) return rc; | ||
1463 | |||
1464 | rc = sqlite3_bind_int(s, 3, iSegment); | ||
1465 | if( rc!=SQLITE_OK ) return rc; | ||
1466 | |||
1467 | rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC); | ||
1468 | if( rc!=SQLITE_OK ) return rc; | ||
1469 | |||
1470 | return sql_single_step_statement(v, TERM_INSERT_STMT, &s); | ||
1471 | } | ||
1472 | |||
1473 | /* update %_term set doclist = [doclist] where rowid = [rowid] */ | ||
1474 | static int term_update(fulltext_vtab *v, sqlite_int64 rowid, | ||
1475 | DocList *doclist){ | ||
1476 | sqlite3_stmt *s; | ||
1477 | int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); | ||
1478 | if( rc!=SQLITE_OK ) return rc; | ||
1479 | |||
1480 | rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC); | ||
1481 | if( rc!=SQLITE_OK ) return rc; | ||
1482 | |||
1483 | rc = sqlite3_bind_int64(s, 2, rowid); | ||
1484 | if( rc!=SQLITE_OK ) return rc; | ||
1485 | |||
1486 | return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); | ||
1487 | } | ||
1488 | |||
1489 | static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ | ||
1490 | sqlite3_stmt *s; | ||
1491 | int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); | ||
1492 | if( rc!=SQLITE_OK ) return rc; | ||
1493 | |||
1494 | rc = sqlite3_bind_int64(s, 1, rowid); | ||
1495 | if( rc!=SQLITE_OK ) return rc; | ||
1496 | |||
1497 | return sql_single_step_statement(v, TERM_DELETE_STMT, &s); | ||
1498 | } | ||
1499 | |||
1500 | /* | ||
1501 | ** Free the memory used to contain a fulltext_vtab structure. | ||
1502 | */ | ||
1503 | static void fulltext_vtab_destroy(fulltext_vtab *v){ | ||
1504 | int iStmt, i; | ||
1505 | |||
1506 | TRACE(("FTS1 Destroy %p\n", v)); | ||
1507 | for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ | ||
1508 | if( v->pFulltextStatements[iStmt]!=NULL ){ | ||
1509 | sqlite3_finalize(v->pFulltextStatements[iStmt]); | ||
1510 | v->pFulltextStatements[iStmt] = NULL; | ||
1511 | } | ||
1512 | } | ||
1513 | |||
1514 | if( v->pTokenizer!=NULL ){ | ||
1515 | v->pTokenizer->pModule->xDestroy(v->pTokenizer); | ||
1516 | v->pTokenizer = NULL; | ||
1517 | } | ||
1518 | |||
1519 | free(v->azColumn); | ||
1520 | for(i = 0; i < v->nColumn; ++i) { | ||
1521 | sqlite3_free(v->azContentColumn[i]); | ||
1522 | } | ||
1523 | free(v->azContentColumn); | ||
1524 | free(v); | ||
1525 | } | ||
1526 | |||
1527 | /* | ||
1528 | ** Token types for parsing the arguments to xConnect or xCreate. | ||
1529 | */ | ||
1530 | #define TOKEN_EOF 0 /* End of file */ | ||
1531 | #define TOKEN_SPACE 1 /* Any kind of whitespace */ | ||
1532 | #define TOKEN_ID 2 /* An identifier */ | ||
1533 | #define TOKEN_STRING 3 /* A string literal */ | ||
1534 | #define TOKEN_PUNCT 4 /* A single punctuation character */ | ||
1535 | |||
1536 | /* | ||
1537 | ** If X is a character that can be used in an identifier then | ||
1538 | ** IdChar(X) will be true. Otherwise it is false. | ||
1539 | ** | ||
1540 | ** For ASCII, any character with the high-order bit set is | ||
1541 | ** allowed in an identifier. For 7-bit characters, | ||
1542 | ** sqlite3IsIdChar[X] must be 1. | ||
1543 | ** | ||
1544 | ** Ticket #1066. the SQL standard does not allow '$' in the | ||
1545 | ** middle of identfiers. But many SQL implementations do. | ||
1546 | ** SQLite will allow '$' in identifiers for compatibility. | ||
1547 | ** But the feature is undocumented. | ||
1548 | */ | ||
1549 | static const char isIdChar[] = { | ||
1550 | /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ | ||
1551 | 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ | ||
1552 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ | ||
1553 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ | ||
1554 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ | ||
1555 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ | ||
1556 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ | ||
1557 | }; | ||
1558 | #define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20])) | ||
1559 | |||
1560 | |||
1561 | /* | ||
1562 | ** Return the length of the token that begins at z[0]. | ||
1563 | ** Store the token type in *tokenType before returning. | ||
1564 | */ | ||
1565 | static int getToken(const char *z, int *tokenType){ | ||
1566 | int i, c; | ||
1567 | switch( *z ){ | ||
1568 | case 0: { | ||
1569 | *tokenType = TOKEN_EOF; | ||
1570 | return 0; | ||
1571 | } | ||
1572 | case ' ': case '\t': case '\n': case '\f': case '\r': { | ||
1573 | for(i=1; safe_isspace(z[i]); i++){} | ||
1574 | *tokenType = TOKEN_SPACE; | ||
1575 | return i; | ||
1576 | } | ||
1577 | case '`': | ||
1578 | case '\'': | ||
1579 | case '"': { | ||
1580 | int delim = z[0]; | ||
1581 | for(i=1; (c=z[i])!=0; i++){ | ||
1582 | if( c==delim ){ | ||
1583 | if( z[i+1]==delim ){ | ||
1584 | i++; | ||
1585 | }else{ | ||
1586 | break; | ||
1587 | } | ||
1588 | } | ||
1589 | } | ||
1590 | *tokenType = TOKEN_STRING; | ||
1591 | return i + (c!=0); | ||
1592 | } | ||
1593 | case '[': { | ||
1594 | for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){} | ||
1595 | *tokenType = TOKEN_ID; | ||
1596 | return i; | ||
1597 | } | ||
1598 | default: { | ||
1599 | if( !IdChar(*z) ){ | ||
1600 | break; | ||
1601 | } | ||
1602 | for(i=1; IdChar(z[i]); i++){} | ||
1603 | *tokenType = TOKEN_ID; | ||
1604 | return i; | ||
1605 | } | ||
1606 | } | ||
1607 | *tokenType = TOKEN_PUNCT; | ||
1608 | return 1; | ||
1609 | } | ||
1610 | |||
1611 | /* | ||
1612 | ** A token extracted from a string is an instance of the following | ||
1613 | ** structure. | ||
1614 | */ | ||
1615 | typedef struct Token { | ||
1616 | const char *z; /* Pointer to token text. Not '\000' terminated */ | ||
1617 | short int n; /* Length of the token text in bytes. */ | ||
1618 | } Token; | ||
1619 | |||
1620 | /* | ||
1621 | ** Given a input string (which is really one of the argv[] parameters | ||
1622 | ** passed into xConnect or xCreate) split the string up into tokens. | ||
1623 | ** Return an array of pointers to '\000' terminated strings, one string | ||
1624 | ** for each non-whitespace token. | ||
1625 | ** | ||
1626 | ** The returned array is terminated by a single NULL pointer. | ||
1627 | ** | ||
1628 | ** Space to hold the returned array is obtained from a single | ||
1629 | ** malloc and should be freed by passing the return value to free(). | ||
1630 | ** The individual strings within the token list are all a part of | ||
1631 | ** the single memory allocation and will all be freed at once. | ||
1632 | */ | ||
1633 | static char **tokenizeString(const char *z, int *pnToken){ | ||
1634 | int nToken = 0; | ||
1635 | Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) ); | ||
1636 | int n = 1; | ||
1637 | int e, i; | ||
1638 | int totalSize = 0; | ||
1639 | char **azToken; | ||
1640 | char *zCopy; | ||
1641 | while( n>0 ){ | ||
1642 | n = getToken(z, &e); | ||
1643 | if( e!=TOKEN_SPACE ){ | ||
1644 | aToken[nToken].z = z; | ||
1645 | aToken[nToken].n = n; | ||
1646 | nToken++; | ||
1647 | totalSize += n+1; | ||
1648 | } | ||
1649 | z += n; | ||
1650 | } | ||
1651 | azToken = (char**)malloc( nToken*sizeof(char*) + totalSize ); | ||
1652 | zCopy = (char*)&azToken[nToken]; | ||
1653 | nToken--; | ||
1654 | for(i=0; i<nToken; i++){ | ||
1655 | azToken[i] = zCopy; | ||
1656 | n = aToken[i].n; | ||
1657 | memcpy(zCopy, aToken[i].z, n); | ||
1658 | zCopy[n] = 0; | ||
1659 | zCopy += n+1; | ||
1660 | } | ||
1661 | azToken[nToken] = 0; | ||
1662 | free(aToken); | ||
1663 | *pnToken = nToken; | ||
1664 | return azToken; | ||
1665 | } | ||
1666 | |||
1667 | /* | ||
1668 | ** Convert an SQL-style quoted string into a normal string by removing | ||
1669 | ** the quote characters. The conversion is done in-place. If the | ||
1670 | ** input does not begin with a quote character, then this routine | ||
1671 | ** is a no-op. | ||
1672 | ** | ||
1673 | ** Examples: | ||
1674 | ** | ||
1675 | ** "abc" becomes abc | ||
1676 | ** 'xyz' becomes xyz | ||
1677 | ** [pqr] becomes pqr | ||
1678 | ** `mno` becomes mno | ||
1679 | */ | ||
1680 | static void dequoteString(char *z){ | ||
1681 | int quote; | ||
1682 | int i, j; | ||
1683 | if( z==0 ) return; | ||
1684 | quote = z[0]; | ||
1685 | switch( quote ){ | ||
1686 | case '\'': break; | ||
1687 | case '"': break; | ||
1688 | case '`': break; /* For MySQL compatibility */ | ||
1689 | case '[': quote = ']'; break; /* For MS SqlServer compatibility */ | ||
1690 | default: return; | ||
1691 | } | ||
1692 | for(i=1, j=0; z[i]; i++){ | ||
1693 | if( z[i]==quote ){ | ||
1694 | if( z[i+1]==quote ){ | ||
1695 | z[j++] = quote; | ||
1696 | i++; | ||
1697 | }else{ | ||
1698 | z[j++] = 0; | ||
1699 | break; | ||
1700 | } | ||
1701 | }else{ | ||
1702 | z[j++] = z[i]; | ||
1703 | } | ||
1704 | } | ||
1705 | } | ||
1706 | |||
1707 | /* | ||
1708 | ** The input azIn is a NULL-terminated list of tokens. Remove the first | ||
1709 | ** token and all punctuation tokens. Remove the quotes from | ||
1710 | ** around string literal tokens. | ||
1711 | ** | ||
1712 | ** Example: | ||
1713 | ** | ||
1714 | ** input: tokenize chinese ( 'simplifed' , 'mixed' ) | ||
1715 | ** output: chinese simplifed mixed | ||
1716 | ** | ||
1717 | ** Another example: | ||
1718 | ** | ||
1719 | ** input: delimiters ( '[' , ']' , '...' ) | ||
1720 | ** output: [ ] ... | ||
1721 | */ | ||
1722 | static void tokenListToIdList(char **azIn){ | ||
1723 | int i, j; | ||
1724 | if( azIn ){ | ||
1725 | for(i=0, j=-1; azIn[i]; i++){ | ||
1726 | if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){ | ||
1727 | dequoteString(azIn[i]); | ||
1728 | if( j>=0 ){ | ||
1729 | azIn[j] = azIn[i]; | ||
1730 | } | ||
1731 | j++; | ||
1732 | } | ||
1733 | } | ||
1734 | azIn[j] = 0; | ||
1735 | } | ||
1736 | } | ||
1737 | |||
1738 | |||
1739 | /* | ||
1740 | ** Find the first alphanumeric token in the string zIn. Null-terminate | ||
1741 | ** this token. Remove any quotation marks. And return a pointer to | ||
1742 | ** the result. | ||
1743 | */ | ||
1744 | static char *firstToken(char *zIn, char **pzTail){ | ||
1745 | int n, ttype; | ||
1746 | while(1){ | ||
1747 | n = getToken(zIn, &ttype); | ||
1748 | if( ttype==TOKEN_SPACE ){ | ||
1749 | zIn += n; | ||
1750 | }else if( ttype==TOKEN_EOF ){ | ||
1751 | *pzTail = zIn; | ||
1752 | return 0; | ||
1753 | }else{ | ||
1754 | zIn[n] = 0; | ||
1755 | *pzTail = &zIn[1]; | ||
1756 | dequoteString(zIn); | ||
1757 | return zIn; | ||
1758 | } | ||
1759 | } | ||
1760 | /*NOTREACHED*/ | ||
1761 | } | ||
1762 | |||
1763 | /* Return true if... | ||
1764 | ** | ||
1765 | ** * s begins with the string t, ignoring case | ||
1766 | ** * s is longer than t | ||
1767 | ** * The first character of s beyond t is not a alphanumeric | ||
1768 | ** | ||
1769 | ** Ignore leading space in *s. | ||
1770 | ** | ||
1771 | ** To put it another way, return true if the first token of | ||
1772 | ** s[] is t[]. | ||
1773 | */ | ||
1774 | static int startsWith(const char *s, const char *t){ | ||
1775 | while( safe_isspace(*s) ){ s++; } | ||
1776 | while( *t ){ | ||
1777 | if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0; | ||
1778 | } | ||
1779 | return *s!='_' && !safe_isalnum(*s); | ||
1780 | } | ||
1781 | |||
1782 | /* | ||
1783 | ** An instance of this structure defines the "spec" of a | ||
1784 | ** full text index. This structure is populated by parseSpec | ||
1785 | ** and use by fulltextConnect and fulltextCreate. | ||
1786 | */ | ||
1787 | typedef struct TableSpec { | ||
1788 | const char *zDb; /* Logical database name */ | ||
1789 | const char *zName; /* Name of the full-text index */ | ||
1790 | int nColumn; /* Number of columns to be indexed */ | ||
1791 | char **azColumn; /* Original names of columns to be indexed */ | ||
1792 | char **azContentColumn; /* Column names for %_content */ | ||
1793 | char **azTokenizer; /* Name of tokenizer and its arguments */ | ||
1794 | } TableSpec; | ||
1795 | |||
1796 | /* | ||
1797 | ** Reclaim all of the memory used by a TableSpec | ||
1798 | */ | ||
1799 | static void clearTableSpec(TableSpec *p) { | ||
1800 | free(p->azColumn); | ||
1801 | free(p->azContentColumn); | ||
1802 | free(p->azTokenizer); | ||
1803 | } | ||
1804 | |||
1805 | /* Parse a CREATE VIRTUAL TABLE statement, which looks like this: | ||
1806 | * | ||
1807 | * CREATE VIRTUAL TABLE email | ||
1808 | * USING fts1(subject, body, tokenize mytokenizer(myarg)) | ||
1809 | * | ||
1810 | * We return parsed information in a TableSpec structure. | ||
1811 | * | ||
1812 | */ | ||
1813 | static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, | ||
1814 | char**pzErr){ | ||
1815 | int i, n; | ||
1816 | char *z, *zDummy; | ||
1817 | char **azArg; | ||
1818 | const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */ | ||
1819 | |||
1820 | assert( argc>=3 ); | ||
1821 | /* Current interface: | ||
1822 | ** argv[0] - module name | ||
1823 | ** argv[1] - database name | ||
1824 | ** argv[2] - table name | ||
1825 | ** argv[3..] - columns, optionally followed by tokenizer specification | ||
1826 | ** and snippet delimiters specification. | ||
1827 | */ | ||
1828 | |||
1829 | /* Make a copy of the complete argv[][] array in a single allocation. | ||
1830 | ** The argv[][] array is read-only and transient. We can write to the | ||
1831 | ** copy in order to modify things and the copy is persistent. | ||
1832 | */ | ||
1833 | memset(pSpec, 0, sizeof(*pSpec)); | ||
1834 | for(i=n=0; i<argc; i++){ | ||
1835 | n += strlen(argv[i]) + 1; | ||
1836 | } | ||
1837 | azArg = malloc( sizeof(char*)*argc + n ); | ||
1838 | if( azArg==0 ){ | ||
1839 | return SQLITE_NOMEM; | ||
1840 | } | ||
1841 | z = (char*)&azArg[argc]; | ||
1842 | for(i=0; i<argc; i++){ | ||
1843 | azArg[i] = z; | ||
1844 | strcpy(z, argv[i]); | ||
1845 | z += strlen(z)+1; | ||
1846 | } | ||
1847 | |||
1848 | /* Identify the column names and the tokenizer and delimiter arguments | ||
1849 | ** in the argv[][] array. | ||
1850 | */ | ||
1851 | pSpec->zDb = azArg[1]; | ||
1852 | pSpec->zName = azArg[2]; | ||
1853 | pSpec->nColumn = 0; | ||
1854 | pSpec->azColumn = azArg; | ||
1855 | zTokenizer = "tokenize simple"; | ||
1856 | for(i=3; i<argc; ++i){ | ||
1857 | if( startsWith(azArg[i],"tokenize") ){ | ||
1858 | zTokenizer = azArg[i]; | ||
1859 | }else{ | ||
1860 | z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy); | ||
1861 | pSpec->nColumn++; | ||
1862 | } | ||
1863 | } | ||
1864 | if( pSpec->nColumn==0 ){ | ||
1865 | azArg[0] = "content"; | ||
1866 | pSpec->nColumn = 1; | ||
1867 | } | ||
1868 | |||
1869 | /* | ||
1870 | ** Construct the list of content column names. | ||
1871 | ** | ||
1872 | ** Each content column name will be of the form cNNAAAA | ||
1873 | ** where NN is the column number and AAAA is the sanitized | ||
1874 | ** column name. "sanitized" means that special characters are | ||
1875 | ** converted to "_". The cNN prefix guarantees that all column | ||
1876 | ** names are unique. | ||
1877 | ** | ||
1878 | ** The AAAA suffix is not strictly necessary. It is included | ||
1879 | ** for the convenience of people who might examine the generated | ||
1880 | ** %_content table and wonder what the columns are used for. | ||
1881 | */ | ||
1882 | pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) ); | ||
1883 | if( pSpec->azContentColumn==0 ){ | ||
1884 | clearTableSpec(pSpec); | ||
1885 | return SQLITE_NOMEM; | ||
1886 | } | ||
1887 | for(i=0; i<pSpec->nColumn; i++){ | ||
1888 | char *p; | ||
1889 | pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); | ||
1890 | for (p = pSpec->azContentColumn[i]; *p ; ++p) { | ||
1891 | if( !safe_isalnum(*p) ) *p = '_'; | ||
1892 | } | ||
1893 | } | ||
1894 | |||
1895 | /* | ||
1896 | ** Parse the tokenizer specification string. | ||
1897 | */ | ||
1898 | pSpec->azTokenizer = tokenizeString(zTokenizer, &n); | ||
1899 | tokenListToIdList(pSpec->azTokenizer); | ||
1900 | |||
1901 | return SQLITE_OK; | ||
1902 | } | ||
1903 | |||
1904 | /* | ||
1905 | ** Generate a CREATE TABLE statement that describes the schema of | ||
1906 | ** the virtual table. Return a pointer to this schema string. | ||
1907 | ** | ||
1908 | ** Space is obtained from sqlite3_mprintf() and should be freed | ||
1909 | ** using sqlite3_free(). | ||
1910 | */ | ||
1911 | static char *fulltextSchema( | ||
1912 | int nColumn, /* Number of columns */ | ||
1913 | const char *const* azColumn, /* List of columns */ | ||
1914 | const char *zTableName /* Name of the table */ | ||
1915 | ){ | ||
1916 | int i; | ||
1917 | char *zSchema, *zNext; | ||
1918 | const char *zSep = "("; | ||
1919 | zSchema = sqlite3_mprintf("CREATE TABLE x"); | ||
1920 | for(i=0; i<nColumn; i++){ | ||
1921 | zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]); | ||
1922 | sqlite3_free(zSchema); | ||
1923 | zSchema = zNext; | ||
1924 | zSep = ","; | ||
1925 | } | ||
1926 | zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName); | ||
1927 | sqlite3_free(zSchema); | ||
1928 | return zNext; | ||
1929 | } | ||
1930 | |||
1931 | /* | ||
1932 | ** Build a new sqlite3_vtab structure that will describe the | ||
1933 | ** fulltext index defined by spec. | ||
1934 | */ | ||
1935 | static int constructVtab( | ||
1936 | sqlite3 *db, /* The SQLite database connection */ | ||
1937 | TableSpec *spec, /* Parsed spec information from parseSpec() */ | ||
1938 | sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */ | ||
1939 | char **pzErr /* Write any error message here */ | ||
1940 | ){ | ||
1941 | int rc; | ||
1942 | int n; | ||
1943 | fulltext_vtab *v = 0; | ||
1944 | const sqlite3_tokenizer_module *m = NULL; | ||
1945 | char *schema; | ||
1946 | |||
1947 | v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); | ||
1948 | if( v==0 ) return SQLITE_NOMEM; | ||
1949 | memset(v, 0, sizeof(*v)); | ||
1950 | /* sqlite will initialize v->base */ | ||
1951 | v->db = db; | ||
1952 | v->zDb = spec->zDb; /* Freed when azColumn is freed */ | ||
1953 | v->zName = spec->zName; /* Freed when azColumn is freed */ | ||
1954 | v->nColumn = spec->nColumn; | ||
1955 | v->azContentColumn = spec->azContentColumn; | ||
1956 | spec->azContentColumn = 0; | ||
1957 | v->azColumn = spec->azColumn; | ||
1958 | spec->azColumn = 0; | ||
1959 | |||
1960 | if( spec->azTokenizer==0 ){ | ||
1961 | return SQLITE_NOMEM; | ||
1962 | } | ||
1963 | /* TODO(shess) For now, add new tokenizers as else if clauses. */ | ||
1964 | if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){ | ||
1965 | sqlite3Fts1SimpleTokenizerModule(&m); | ||
1966 | }else if( startsWith(spec->azTokenizer[0], "porter") ){ | ||
1967 | sqlite3Fts1PorterTokenizerModule(&m); | ||
1968 | }else{ | ||
1969 | *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]); | ||
1970 | rc = SQLITE_ERROR; | ||
1971 | goto err; | ||
1972 | } | ||
1973 | for(n=0; spec->azTokenizer[n]; n++){} | ||
1974 | if( n ){ | ||
1975 | rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1], | ||
1976 | &v->pTokenizer); | ||
1977 | }else{ | ||
1978 | rc = m->xCreate(0, 0, &v->pTokenizer); | ||
1979 | } | ||
1980 | if( rc!=SQLITE_OK ) goto err; | ||
1981 | v->pTokenizer->pModule = m; | ||
1982 | |||
1983 | /* TODO: verify the existence of backing tables foo_content, foo_term */ | ||
1984 | |||
1985 | schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn, | ||
1986 | spec->zName); | ||
1987 | rc = sqlite3_declare_vtab(db, schema); | ||
1988 | sqlite3_free(schema); | ||
1989 | if( rc!=SQLITE_OK ) goto err; | ||
1990 | |||
1991 | memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); | ||
1992 | |||
1993 | *ppVTab = &v->base; | ||
1994 | TRACE(("FTS1 Connect %p\n", v)); | ||
1995 | |||
1996 | return rc; | ||
1997 | |||
1998 | err: | ||
1999 | fulltext_vtab_destroy(v); | ||
2000 | return rc; | ||
2001 | } | ||
2002 | |||
2003 | static int fulltextConnect( | ||
2004 | sqlite3 *db, | ||
2005 | void *pAux, | ||
2006 | int argc, const char *const*argv, | ||
2007 | sqlite3_vtab **ppVTab, | ||
2008 | char **pzErr | ||
2009 | ){ | ||
2010 | TableSpec spec; | ||
2011 | int rc = parseSpec(&spec, argc, argv, pzErr); | ||
2012 | if( rc!=SQLITE_OK ) return rc; | ||
2013 | |||
2014 | rc = constructVtab(db, &spec, ppVTab, pzErr); | ||
2015 | clearTableSpec(&spec); | ||
2016 | return rc; | ||
2017 | } | ||
2018 | |||
2019 | /* The %_content table holds the text of each document, with | ||
2020 | ** the rowid used as the docid. | ||
2021 | ** | ||
2022 | ** The %_term table maps each term to a document list blob | ||
2023 | ** containing elements sorted by ascending docid, each element | ||
2024 | ** encoded as: | ||
2025 | ** | ||
2026 | ** docid varint-encoded | ||
2027 | ** token elements: | ||
2028 | ** position+1 varint-encoded as delta from previous position | ||
2029 | ** start offset varint-encoded as delta from previous start offset | ||
2030 | ** end offset varint-encoded as delta from start offset | ||
2031 | ** | ||
2032 | ** The sentinel position of 0 indicates the end of the token list. | ||
2033 | ** | ||
2034 | ** Additionally, doclist blobs are chunked into multiple segments, | ||
2035 | ** using segment to order the segments. New elements are added to | ||
2036 | ** the segment at segment 0, until it exceeds CHUNK_MAX. Then | ||
2037 | ** segment 0 is deleted, and the doclist is inserted at segment 1. | ||
2038 | ** If there is already a doclist at segment 1, the segment 0 doclist | ||
2039 | ** is merged with it, the segment 1 doclist is deleted, and the | ||
2040 | ** merged doclist is inserted at segment 2, repeating those | ||
2041 | ** operations until an insert succeeds. | ||
2042 | ** | ||
2043 | ** Since this structure doesn't allow us to update elements in place | ||
2044 | ** in case of deletion or update, these are simply written to | ||
2045 | ** segment 0 (with an empty token list in case of deletion), with | ||
2046 | ** docListAccumulate() taking care to retain lower-segment | ||
2047 | ** information in preference to higher-segment information. | ||
2048 | */ | ||
2049 | /* TODO(shess) Provide a VACUUM type operation which both removes | ||
2050 | ** deleted elements which are no longer necessary, and duplicated | ||
2051 | ** elements. I suspect this will probably not be necessary in | ||
2052 | ** practice, though. | ||
2053 | */ | ||
2054 | static int fulltextCreate(sqlite3 *db, void *pAux, | ||
2055 | int argc, const char * const *argv, | ||
2056 | sqlite3_vtab **ppVTab, char **pzErr){ | ||
2057 | int rc; | ||
2058 | TableSpec spec; | ||
2059 | StringBuffer schema; | ||
2060 | TRACE(("FTS1 Create\n")); | ||
2061 | |||
2062 | rc = parseSpec(&spec, argc, argv, pzErr); | ||
2063 | if( rc!=SQLITE_OK ) return rc; | ||
2064 | |||
2065 | initStringBuffer(&schema); | ||
2066 | append(&schema, "CREATE TABLE %_content("); | ||
2067 | appendList(&schema, spec.nColumn, spec.azContentColumn); | ||
2068 | append(&schema, ")"); | ||
2069 | rc = sql_exec(db, spec.zDb, spec.zName, schema.s); | ||
2070 | free(schema.s); | ||
2071 | if( rc!=SQLITE_OK ) goto out; | ||
2072 | |||
2073 | rc = sql_exec(db, spec.zDb, spec.zName, | ||
2074 | "create table %_term(term text, segment integer, doclist blob, " | ||
2075 | "primary key(term, segment));"); | ||
2076 | if( rc!=SQLITE_OK ) goto out; | ||
2077 | |||
2078 | rc = constructVtab(db, &spec, ppVTab, pzErr); | ||
2079 | |||
2080 | out: | ||
2081 | clearTableSpec(&spec); | ||
2082 | return rc; | ||
2083 | } | ||
2084 | |||
2085 | /* Decide how to handle an SQL query. */ | ||
2086 | static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ | ||
2087 | int i; | ||
2088 | TRACE(("FTS1 BestIndex\n")); | ||
2089 | |||
2090 | for(i=0; i<pInfo->nConstraint; ++i){ | ||
2091 | const struct sqlite3_index_constraint *pConstraint; | ||
2092 | pConstraint = &pInfo->aConstraint[i]; | ||
2093 | if( pConstraint->usable ) { | ||
2094 | if( pConstraint->iColumn==-1 && | ||
2095 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){ | ||
2096 | pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */ | ||
2097 | TRACE(("FTS1 QUERY_ROWID\n")); | ||
2098 | } else if( pConstraint->iColumn>=0 && | ||
2099 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){ | ||
2100 | /* full-text search */ | ||
2101 | pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn; | ||
2102 | TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn)); | ||
2103 | } else continue; | ||
2104 | |||
2105 | pInfo->aConstraintUsage[i].argvIndex = 1; | ||
2106 | pInfo->aConstraintUsage[i].omit = 1; | ||
2107 | |||
2108 | /* An arbitrary value for now. | ||
2109 | * TODO: Perhaps rowid matches should be considered cheaper than | ||
2110 | * full-text searches. */ | ||
2111 | pInfo->estimatedCost = 1.0; | ||
2112 | |||
2113 | return SQLITE_OK; | ||
2114 | } | ||
2115 | } | ||
2116 | pInfo->idxNum = QUERY_GENERIC; | ||
2117 | return SQLITE_OK; | ||
2118 | } | ||
2119 | |||
2120 | static int fulltextDisconnect(sqlite3_vtab *pVTab){ | ||
2121 | TRACE(("FTS1 Disconnect %p\n", pVTab)); | ||
2122 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2123 | return SQLITE_OK; | ||
2124 | } | ||
2125 | |||
2126 | static int fulltextDestroy(sqlite3_vtab *pVTab){ | ||
2127 | fulltext_vtab *v = (fulltext_vtab *)pVTab; | ||
2128 | int rc; | ||
2129 | |||
2130 | TRACE(("FTS1 Destroy %p\n", pVTab)); | ||
2131 | rc = sql_exec(v->db, v->zDb, v->zName, | ||
2132 | "drop table if exists %_content;" | ||
2133 | "drop table if exists %_term;" | ||
2134 | ); | ||
2135 | if( rc!=SQLITE_OK ) return rc; | ||
2136 | |||
2137 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2138 | return SQLITE_OK; | ||
2139 | } | ||
2140 | |||
2141 | static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ | ||
2142 | fulltext_cursor *c; | ||
2143 | |||
2144 | c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); | ||
2145 | /* sqlite will initialize c->base */ | ||
2146 | *ppCursor = &c->base; | ||
2147 | TRACE(("FTS1 Open %p: %p\n", pVTab, c)); | ||
2148 | |||
2149 | return SQLITE_OK; | ||
2150 | } | ||
2151 | |||
2152 | |||
2153 | /* Free all of the dynamically allocated memory held by *q | ||
2154 | */ | ||
2155 | static void queryClear(Query *q){ | ||
2156 | int i; | ||
2157 | for(i = 0; i < q->nTerms; ++i){ | ||
2158 | free(q->pTerms[i].pTerm); | ||
2159 | } | ||
2160 | free(q->pTerms); | ||
2161 | memset(q, 0, sizeof(*q)); | ||
2162 | } | ||
2163 | |||
2164 | /* Free all of the dynamically allocated memory held by the | ||
2165 | ** Snippet | ||
2166 | */ | ||
2167 | static void snippetClear(Snippet *p){ | ||
2168 | free(p->aMatch); | ||
2169 | free(p->zOffset); | ||
2170 | free(p->zSnippet); | ||
2171 | memset(p, 0, sizeof(*p)); | ||
2172 | } | ||
2173 | /* | ||
2174 | ** Append a single entry to the p->aMatch[] log. | ||
2175 | */ | ||
2176 | static void snippetAppendMatch( | ||
2177 | Snippet *p, /* Append the entry to this snippet */ | ||
2178 | int iCol, int iTerm, /* The column and query term */ | ||
2179 | int iStart, int nByte /* Offset and size of the match */ | ||
2180 | ){ | ||
2181 | int i; | ||
2182 | struct snippetMatch *pMatch; | ||
2183 | if( p->nMatch+1>=p->nAlloc ){ | ||
2184 | p->nAlloc = p->nAlloc*2 + 10; | ||
2185 | p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); | ||
2186 | if( p->aMatch==0 ){ | ||
2187 | p->nMatch = 0; | ||
2188 | p->nAlloc = 0; | ||
2189 | return; | ||
2190 | } | ||
2191 | } | ||
2192 | i = p->nMatch++; | ||
2193 | pMatch = &p->aMatch[i]; | ||
2194 | pMatch->iCol = iCol; | ||
2195 | pMatch->iTerm = iTerm; | ||
2196 | pMatch->iStart = iStart; | ||
2197 | pMatch->nByte = nByte; | ||
2198 | } | ||
2199 | |||
2200 | /* | ||
2201 | ** Sizing information for the circular buffer used in snippetOffsetsOfColumn() | ||
2202 | */ | ||
2203 | #define FTS1_ROTOR_SZ (32) | ||
2204 | #define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1) | ||
2205 | |||
2206 | /* | ||
2207 | ** Add entries to pSnippet->aMatch[] for every match that occurs against | ||
2208 | ** document zDoc[0..nDoc-1] which is stored in column iColumn. | ||
2209 | */ | ||
2210 | static void snippetOffsetsOfColumn( | ||
2211 | Query *pQuery, | ||
2212 | Snippet *pSnippet, | ||
2213 | int iColumn, | ||
2214 | const char *zDoc, | ||
2215 | int nDoc | ||
2216 | ){ | ||
2217 | const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */ | ||
2218 | sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */ | ||
2219 | sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */ | ||
2220 | fulltext_vtab *pVtab; /* The full text index */ | ||
2221 | int nColumn; /* Number of columns in the index */ | ||
2222 | const QueryTerm *aTerm; /* Query string terms */ | ||
2223 | int nTerm; /* Number of query string terms */ | ||
2224 | int i, j; /* Loop counters */ | ||
2225 | int rc; /* Return code */ | ||
2226 | unsigned int match, prevMatch; /* Phrase search bitmasks */ | ||
2227 | const char *zToken; /* Next token from the tokenizer */ | ||
2228 | int nToken; /* Size of zToken */ | ||
2229 | int iBegin, iEnd, iPos; /* Offsets of beginning and end */ | ||
2230 | |||
2231 | /* The following variables keep a circular buffer of the last | ||
2232 | ** few tokens */ | ||
2233 | unsigned int iRotor = 0; /* Index of current token */ | ||
2234 | int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */ | ||
2235 | int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */ | ||
2236 | |||
2237 | pVtab = pQuery->pFts; | ||
2238 | nColumn = pVtab->nColumn; | ||
2239 | pTokenizer = pVtab->pTokenizer; | ||
2240 | pTModule = pTokenizer->pModule; | ||
2241 | rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor); | ||
2242 | if( rc ) return; | ||
2243 | pTCursor->pTokenizer = pTokenizer; | ||
2244 | aTerm = pQuery->pTerms; | ||
2245 | nTerm = pQuery->nTerms; | ||
2246 | if( nTerm>=FTS1_ROTOR_SZ ){ | ||
2247 | nTerm = FTS1_ROTOR_SZ - 1; | ||
2248 | } | ||
2249 | prevMatch = 0; | ||
2250 | while(1){ | ||
2251 | rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos); | ||
2252 | if( rc ) break; | ||
2253 | iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin; | ||
2254 | iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin; | ||
2255 | match = 0; | ||
2256 | for(i=0; i<nTerm; i++){ | ||
2257 | int iCol; | ||
2258 | iCol = aTerm[i].iColumn; | ||
2259 | if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue; | ||
2260 | if( aTerm[i].nTerm!=nToken ) continue; | ||
2261 | if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue; | ||
2262 | if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue; | ||
2263 | match |= 1<<i; | ||
2264 | if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){ | ||
2265 | for(j=aTerm[i].iPhrase-1; j>=0; j--){ | ||
2266 | int k = (iRotor-j) & FTS1_ROTOR_MASK; | ||
2267 | snippetAppendMatch(pSnippet, iColumn, i-j, | ||
2268 | iRotorBegin[k], iRotorLen[k]); | ||
2269 | } | ||
2270 | } | ||
2271 | } | ||
2272 | prevMatch = match<<1; | ||
2273 | iRotor++; | ||
2274 | } | ||
2275 | pTModule->xClose(pTCursor); | ||
2276 | } | ||
2277 | |||
2278 | |||
2279 | /* | ||
2280 | ** Compute all offsets for the current row of the query. | ||
2281 | ** If the offsets have already been computed, this routine is a no-op. | ||
2282 | */ | ||
2283 | static void snippetAllOffsets(fulltext_cursor *p){ | ||
2284 | int nColumn; | ||
2285 | int iColumn, i; | ||
2286 | int iFirst, iLast; | ||
2287 | fulltext_vtab *pFts; | ||
2288 | |||
2289 | if( p->snippet.nMatch ) return; | ||
2290 | if( p->q.nTerms==0 ) return; | ||
2291 | pFts = p->q.pFts; | ||
2292 | nColumn = pFts->nColumn; | ||
2293 | iColumn = p->iCursorType - QUERY_FULLTEXT; | ||
2294 | if( iColumn<0 || iColumn>=nColumn ){ | ||
2295 | iFirst = 0; | ||
2296 | iLast = nColumn-1; | ||
2297 | }else{ | ||
2298 | iFirst = iColumn; | ||
2299 | iLast = iColumn; | ||
2300 | } | ||
2301 | for(i=iFirst; i<=iLast; i++){ | ||
2302 | const char *zDoc; | ||
2303 | int nDoc; | ||
2304 | zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); | ||
2305 | nDoc = sqlite3_column_bytes(p->pStmt, i+1); | ||
2306 | snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); | ||
2307 | } | ||
2308 | } | ||
2309 | |||
2310 | /* | ||
2311 | ** Convert the information in the aMatch[] array of the snippet | ||
2312 | ** into the string zOffset[0..nOffset-1]. | ||
2313 | */ | ||
2314 | static void snippetOffsetText(Snippet *p){ | ||
2315 | int i; | ||
2316 | int cnt = 0; | ||
2317 | StringBuffer sb; | ||
2318 | char zBuf[200]; | ||
2319 | if( p->zOffset ) return; | ||
2320 | initStringBuffer(&sb); | ||
2321 | for(i=0; i<p->nMatch; i++){ | ||
2322 | struct snippetMatch *pMatch = &p->aMatch[i]; | ||
2323 | zBuf[0] = ' '; | ||
2324 | sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol, | ||
2325 | pMatch->iTerm, pMatch->iStart, pMatch->nByte); | ||
2326 | append(&sb, zBuf); | ||
2327 | cnt++; | ||
2328 | } | ||
2329 | p->zOffset = sb.s; | ||
2330 | p->nOffset = sb.len; | ||
2331 | } | ||
2332 | |||
2333 | /* | ||
2334 | ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set | ||
2335 | ** of matching words some of which might be in zDoc. zDoc is column | ||
2336 | ** number iCol. | ||
2337 | ** | ||
2338 | ** iBreak is suggested spot in zDoc where we could begin or end an | ||
2339 | ** excerpt. Return a value similar to iBreak but possibly adjusted | ||
2340 | ** to be a little left or right so that the break point is better. | ||
2341 | */ | ||
2342 | static int wordBoundary( | ||
2343 | int iBreak, /* The suggested break point */ | ||
2344 | const char *zDoc, /* Document text */ | ||
2345 | int nDoc, /* Number of bytes in zDoc[] */ | ||
2346 | struct snippetMatch *aMatch, /* Matching words */ | ||
2347 | int nMatch, /* Number of entries in aMatch[] */ | ||
2348 | int iCol /* The column number for zDoc[] */ | ||
2349 | ){ | ||
2350 | int i; | ||
2351 | if( iBreak<=10 ){ | ||
2352 | return 0; | ||
2353 | } | ||
2354 | if( iBreak>=nDoc-10 ){ | ||
2355 | return nDoc; | ||
2356 | } | ||
2357 | for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){} | ||
2358 | while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; } | ||
2359 | if( i<nMatch ){ | ||
2360 | if( aMatch[i].iStart<iBreak+10 ){ | ||
2361 | return aMatch[i].iStart; | ||
2362 | } | ||
2363 | if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ | ||
2364 | return aMatch[i-1].iStart; | ||
2365 | } | ||
2366 | } | ||
2367 | for(i=1; i<=10; i++){ | ||
2368 | if( safe_isspace(zDoc[iBreak-i]) ){ | ||
2369 | return iBreak - i + 1; | ||
2370 | } | ||
2371 | if( safe_isspace(zDoc[iBreak+i]) ){ | ||
2372 | return iBreak + i + 1; | ||
2373 | } | ||
2374 | } | ||
2375 | return iBreak; | ||
2376 | } | ||
2377 | |||
2378 | /* | ||
2379 | ** If the StringBuffer does not end in white space, add a single | ||
2380 | ** space character to the end. | ||
2381 | */ | ||
2382 | static void appendWhiteSpace(StringBuffer *p){ | ||
2383 | if( p->len==0 ) return; | ||
2384 | if( safe_isspace(p->s[p->len-1]) ) return; | ||
2385 | append(p, " "); | ||
2386 | } | ||
2387 | |||
2388 | /* | ||
2389 | ** Remove white space from teh end of the StringBuffer | ||
2390 | */ | ||
2391 | static void trimWhiteSpace(StringBuffer *p){ | ||
2392 | while( p->len>0 && safe_isspace(p->s[p->len-1]) ){ | ||
2393 | p->len--; | ||
2394 | } | ||
2395 | } | ||
2396 | |||
2397 | |||
2398 | |||
2399 | /* | ||
2400 | ** Allowed values for Snippet.aMatch[].snStatus | ||
2401 | */ | ||
2402 | #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */ | ||
2403 | #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */ | ||
2404 | |||
2405 | /* | ||
2406 | ** Generate the text of a snippet. | ||
2407 | */ | ||
2408 | static void snippetText( | ||
2409 | fulltext_cursor *pCursor, /* The cursor we need the snippet for */ | ||
2410 | const char *zStartMark, /* Markup to appear before each match */ | ||
2411 | const char *zEndMark, /* Markup to appear after each match */ | ||
2412 | const char *zEllipsis /* Ellipsis mark */ | ||
2413 | ){ | ||
2414 | int i, j; | ||
2415 | struct snippetMatch *aMatch; | ||
2416 | int nMatch; | ||
2417 | int nDesired; | ||
2418 | StringBuffer sb; | ||
2419 | int tailCol; | ||
2420 | int tailOffset; | ||
2421 | int iCol; | ||
2422 | int nDoc; | ||
2423 | const char *zDoc; | ||
2424 | int iStart, iEnd; | ||
2425 | int tailEllipsis = 0; | ||
2426 | int iMatch; | ||
2427 | |||
2428 | |||
2429 | free(pCursor->snippet.zSnippet); | ||
2430 | pCursor->snippet.zSnippet = 0; | ||
2431 | aMatch = pCursor->snippet.aMatch; | ||
2432 | nMatch = pCursor->snippet.nMatch; | ||
2433 | initStringBuffer(&sb); | ||
2434 | |||
2435 | for(i=0; i<nMatch; i++){ | ||
2436 | aMatch[i].snStatus = SNIPPET_IGNORE; | ||
2437 | } | ||
2438 | nDesired = 0; | ||
2439 | for(i=0; i<pCursor->q.nTerms; i++){ | ||
2440 | for(j=0; j<nMatch; j++){ | ||
2441 | if( aMatch[j].iTerm==i ){ | ||
2442 | aMatch[j].snStatus = SNIPPET_DESIRED; | ||
2443 | nDesired++; | ||
2444 | break; | ||
2445 | } | ||
2446 | } | ||
2447 | } | ||
2448 | |||
2449 | iMatch = 0; | ||
2450 | tailCol = -1; | ||
2451 | tailOffset = 0; | ||
2452 | for(i=0; i<nMatch && nDesired>0; i++){ | ||
2453 | if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue; | ||
2454 | nDesired--; | ||
2455 | iCol = aMatch[i].iCol; | ||
2456 | zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1); | ||
2457 | nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1); | ||
2458 | iStart = aMatch[i].iStart - 40; | ||
2459 | iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol); | ||
2460 | if( iStart<=10 ){ | ||
2461 | iStart = 0; | ||
2462 | } | ||
2463 | if( iCol==tailCol && iStart<=tailOffset+20 ){ | ||
2464 | iStart = tailOffset; | ||
2465 | } | ||
2466 | if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){ | ||
2467 | trimWhiteSpace(&sb); | ||
2468 | appendWhiteSpace(&sb); | ||
2469 | append(&sb, zEllipsis); | ||
2470 | appendWhiteSpace(&sb); | ||
2471 | } | ||
2472 | iEnd = aMatch[i].iStart + aMatch[i].nByte + 40; | ||
2473 | iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol); | ||
2474 | if( iEnd>=nDoc-10 ){ | ||
2475 | iEnd = nDoc; | ||
2476 | tailEllipsis = 0; | ||
2477 | }else{ | ||
2478 | tailEllipsis = 1; | ||
2479 | } | ||
2480 | while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; } | ||
2481 | while( iStart<iEnd ){ | ||
2482 | while( iMatch<nMatch && aMatch[iMatch].iStart<iStart | ||
2483 | && aMatch[iMatch].iCol<=iCol ){ | ||
2484 | iMatch++; | ||
2485 | } | ||
2486 | if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd | ||
2487 | && aMatch[iMatch].iCol==iCol ){ | ||
2488 | nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart); | ||
2489 | iStart = aMatch[iMatch].iStart; | ||
2490 | append(&sb, zStartMark); | ||
2491 | nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte); | ||
2492 | append(&sb, zEndMark); | ||
2493 | iStart += aMatch[iMatch].nByte; | ||
2494 | for(j=iMatch+1; j<nMatch; j++){ | ||
2495 | if( aMatch[j].iTerm==aMatch[iMatch].iTerm | ||
2496 | && aMatch[j].snStatus==SNIPPET_DESIRED ){ | ||
2497 | nDesired--; | ||
2498 | aMatch[j].snStatus = SNIPPET_IGNORE; | ||
2499 | } | ||
2500 | } | ||
2501 | }else{ | ||
2502 | nappend(&sb, &zDoc[iStart], iEnd - iStart); | ||
2503 | iStart = iEnd; | ||
2504 | } | ||
2505 | } | ||
2506 | tailCol = iCol; | ||
2507 | tailOffset = iEnd; | ||
2508 | } | ||
2509 | trimWhiteSpace(&sb); | ||
2510 | if( tailEllipsis ){ | ||
2511 | appendWhiteSpace(&sb); | ||
2512 | append(&sb, zEllipsis); | ||
2513 | } | ||
2514 | pCursor->snippet.zSnippet = sb.s; | ||
2515 | pCursor->snippet.nSnippet = sb.len; | ||
2516 | } | ||
2517 | |||
2518 | |||
2519 | /* | ||
2520 | ** Close the cursor. For additional information see the documentation | ||
2521 | ** on the xClose method of the virtual table interface. | ||
2522 | */ | ||
2523 | static int fulltextClose(sqlite3_vtab_cursor *pCursor){ | ||
2524 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2525 | TRACE(("FTS1 Close %p\n", c)); | ||
2526 | sqlite3_finalize(c->pStmt); | ||
2527 | queryClear(&c->q); | ||
2528 | snippetClear(&c->snippet); | ||
2529 | if( c->result.pDoclist!=NULL ){ | ||
2530 | docListDelete(c->result.pDoclist); | ||
2531 | } | ||
2532 | free(c); | ||
2533 | return SQLITE_OK; | ||
2534 | } | ||
2535 | |||
2536 | static int fulltextNext(sqlite3_vtab_cursor *pCursor){ | ||
2537 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2538 | sqlite_int64 iDocid; | ||
2539 | int rc; | ||
2540 | |||
2541 | TRACE(("FTS1 Next %p\n", pCursor)); | ||
2542 | snippetClear(&c->snippet); | ||
2543 | if( c->iCursorType < QUERY_FULLTEXT ){ | ||
2544 | /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | ||
2545 | rc = sqlite3_step(c->pStmt); | ||
2546 | switch( rc ){ | ||
2547 | case SQLITE_ROW: | ||
2548 | c->eof = 0; | ||
2549 | return SQLITE_OK; | ||
2550 | case SQLITE_DONE: | ||
2551 | c->eof = 1; | ||
2552 | return SQLITE_OK; | ||
2553 | default: | ||
2554 | c->eof = 1; | ||
2555 | return rc; | ||
2556 | } | ||
2557 | } else { /* full-text query */ | ||
2558 | rc = sqlite3_reset(c->pStmt); | ||
2559 | if( rc!=SQLITE_OK ) return rc; | ||
2560 | |||
2561 | iDocid = nextDocid(&c->result); | ||
2562 | if( iDocid==0 ){ | ||
2563 | c->eof = 1; | ||
2564 | return SQLITE_OK; | ||
2565 | } | ||
2566 | rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); | ||
2567 | if( rc!=SQLITE_OK ) return rc; | ||
2568 | /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | ||
2569 | rc = sqlite3_step(c->pStmt); | ||
2570 | if( rc==SQLITE_ROW ){ /* the case we expect */ | ||
2571 | c->eof = 0; | ||
2572 | return SQLITE_OK; | ||
2573 | } | ||
2574 | /* an error occurred; abort */ | ||
2575 | return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | ||
2576 | } | ||
2577 | } | ||
2578 | |||
2579 | |||
2580 | /* Return a DocList corresponding to the query term *pTerm. If *pTerm | ||
2581 | ** is the first term of a phrase query, go ahead and evaluate the phrase | ||
2582 | ** query and return the doclist for the entire phrase query. | ||
2583 | ** | ||
2584 | ** The result is stored in pTerm->doclist. | ||
2585 | */ | ||
2586 | static int docListOfTerm( | ||
2587 | fulltext_vtab *v, /* The full text index */ | ||
2588 | int iColumn, /* column to restrict to. No restrition if >=nColumn */ | ||
2589 | QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */ | ||
2590 | DocList **ppResult /* Write the result here */ | ||
2591 | ){ | ||
2592 | DocList *pLeft, *pRight, *pNew; | ||
2593 | int i, rc; | ||
2594 | |||
2595 | pLeft = docListNew(DL_POSITIONS); | ||
2596 | rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft); | ||
2597 | if( rc ){ | ||
2598 | docListDelete(pLeft); | ||
2599 | return rc; | ||
2600 | } | ||
2601 | for(i=1; i<=pQTerm->nPhrase; i++){ | ||
2602 | pRight = docListNew(DL_POSITIONS); | ||
2603 | rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight); | ||
2604 | if( rc ){ | ||
2605 | docListDelete(pLeft); | ||
2606 | return rc; | ||
2607 | } | ||
2608 | pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS); | ||
2609 | docListPhraseMerge(pLeft, pRight, pNew); | ||
2610 | docListDelete(pLeft); | ||
2611 | docListDelete(pRight); | ||
2612 | pLeft = pNew; | ||
2613 | } | ||
2614 | *ppResult = pLeft; | ||
2615 | return SQLITE_OK; | ||
2616 | } | ||
2617 | |||
2618 | /* Add a new term pTerm[0..nTerm-1] to the query *q. | ||
2619 | */ | ||
2620 | static void queryAdd(Query *q, const char *pTerm, int nTerm){ | ||
2621 | QueryTerm *t; | ||
2622 | ++q->nTerms; | ||
2623 | q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); | ||
2624 | if( q->pTerms==0 ){ | ||
2625 | q->nTerms = 0; | ||
2626 | return; | ||
2627 | } | ||
2628 | t = &q->pTerms[q->nTerms - 1]; | ||
2629 | memset(t, 0, sizeof(*t)); | ||
2630 | t->pTerm = malloc(nTerm+1); | ||
2631 | memcpy(t->pTerm, pTerm, nTerm); | ||
2632 | t->pTerm[nTerm] = 0; | ||
2633 | t->nTerm = nTerm; | ||
2634 | t->isOr = q->nextIsOr; | ||
2635 | q->nextIsOr = 0; | ||
2636 | t->iColumn = q->nextColumn; | ||
2637 | q->nextColumn = q->dfltColumn; | ||
2638 | } | ||
2639 | |||
2640 | /* | ||
2641 | ** Check to see if the string zToken[0...nToken-1] matches any | ||
2642 | ** column name in the virtual table. If it does, | ||
2643 | ** return the zero-indexed column number. If not, return -1. | ||
2644 | */ | ||
2645 | static int checkColumnSpecifier( | ||
2646 | fulltext_vtab *pVtab, /* The virtual table */ | ||
2647 | const char *zToken, /* Text of the token */ | ||
2648 | int nToken /* Number of characters in the token */ | ||
2649 | ){ | ||
2650 | int i; | ||
2651 | for(i=0; i<pVtab->nColumn; i++){ | ||
2652 | if( memcmp(pVtab->azColumn[i], zToken, nToken)==0 | ||
2653 | && pVtab->azColumn[i][nToken]==0 ){ | ||
2654 | return i; | ||
2655 | } | ||
2656 | } | ||
2657 | return -1; | ||
2658 | } | ||
2659 | |||
2660 | /* | ||
2661 | ** Parse the text at pSegment[0..nSegment-1]. Add additional terms | ||
2662 | ** to the query being assemblied in pQuery. | ||
2663 | ** | ||
2664 | ** inPhrase is true if pSegment[0..nSegement-1] is contained within | ||
2665 | ** double-quotes. If inPhrase is true, then the first term | ||
2666 | ** is marked with the number of terms in the phrase less one and | ||
2667 | ** OR and "-" syntax is ignored. If inPhrase is false, then every | ||
2668 | ** term found is marked with nPhrase=0 and OR and "-" syntax is significant. | ||
2669 | */ | ||
2670 | static int tokenizeSegment( | ||
2671 | sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */ | ||
2672 | const char *pSegment, int nSegment, /* Query expression being parsed */ | ||
2673 | int inPhrase, /* True if within "..." */ | ||
2674 | Query *pQuery /* Append results here */ | ||
2675 | ){ | ||
2676 | const sqlite3_tokenizer_module *pModule = pTokenizer->pModule; | ||
2677 | sqlite3_tokenizer_cursor *pCursor; | ||
2678 | int firstIndex = pQuery->nTerms; | ||
2679 | int iCol; | ||
2680 | int nTerm = 1; | ||
2681 | |||
2682 | int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor); | ||
2683 | if( rc!=SQLITE_OK ) return rc; | ||
2684 | pCursor->pTokenizer = pTokenizer; | ||
2685 | |||
2686 | while( 1 ){ | ||
2687 | const char *pToken; | ||
2688 | int nToken, iBegin, iEnd, iPos; | ||
2689 | |||
2690 | rc = pModule->xNext(pCursor, | ||
2691 | &pToken, &nToken, | ||
2692 | &iBegin, &iEnd, &iPos); | ||
2693 | if( rc!=SQLITE_OK ) break; | ||
2694 | if( !inPhrase && | ||
2695 | pSegment[iEnd]==':' && | ||
2696 | (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){ | ||
2697 | pQuery->nextColumn = iCol; | ||
2698 | continue; | ||
2699 | } | ||
2700 | if( !inPhrase && pQuery->nTerms>0 && nToken==2 | ||
2701 | && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){ | ||
2702 | pQuery->nextIsOr = 1; | ||
2703 | continue; | ||
2704 | } | ||
2705 | queryAdd(pQuery, pToken, nToken); | ||
2706 | if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ | ||
2707 | pQuery->pTerms[pQuery->nTerms-1].isNot = 1; | ||
2708 | } | ||
2709 | pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm; | ||
2710 | if( inPhrase ){ | ||
2711 | nTerm++; | ||
2712 | } | ||
2713 | } | ||
2714 | |||
2715 | if( inPhrase && pQuery->nTerms>firstIndex ){ | ||
2716 | pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1; | ||
2717 | } | ||
2718 | |||
2719 | return pModule->xClose(pCursor); | ||
2720 | } | ||
2721 | |||
2722 | /* Parse a query string, yielding a Query object pQuery. | ||
2723 | ** | ||
2724 | ** The calling function will need to queryClear() to clean up | ||
2725 | ** the dynamically allocated memory held by pQuery. | ||
2726 | */ | ||
2727 | static int parseQuery( | ||
2728 | fulltext_vtab *v, /* The fulltext index */ | ||
2729 | const char *zInput, /* Input text of the query string */ | ||
2730 | int nInput, /* Size of the input text */ | ||
2731 | int dfltColumn, /* Default column of the index to match against */ | ||
2732 | Query *pQuery /* Write the parse results here. */ | ||
2733 | ){ | ||
2734 | int iInput, inPhrase = 0; | ||
2735 | |||
2736 | if( zInput==0 ) nInput = 0; | ||
2737 | if( nInput<0 ) nInput = strlen(zInput); | ||
2738 | pQuery->nTerms = 0; | ||
2739 | pQuery->pTerms = NULL; | ||
2740 | pQuery->nextIsOr = 0; | ||
2741 | pQuery->nextColumn = dfltColumn; | ||
2742 | pQuery->dfltColumn = dfltColumn; | ||
2743 | pQuery->pFts = v; | ||
2744 | |||
2745 | for(iInput=0; iInput<nInput; ++iInput){ | ||
2746 | int i; | ||
2747 | for(i=iInput; i<nInput && zInput[i]!='"'; ++i){} | ||
2748 | if( i>iInput ){ | ||
2749 | tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase, | ||
2750 | pQuery); | ||
2751 | } | ||
2752 | iInput = i; | ||
2753 | if( i<nInput ){ | ||
2754 | assert( zInput[i]=='"' ); | ||
2755 | inPhrase = !inPhrase; | ||
2756 | } | ||
2757 | } | ||
2758 | |||
2759 | if( inPhrase ){ | ||
2760 | /* unmatched quote */ | ||
2761 | queryClear(pQuery); | ||
2762 | return SQLITE_ERROR; | ||
2763 | } | ||
2764 | return SQLITE_OK; | ||
2765 | } | ||
2766 | |||
2767 | /* Perform a full-text query using the search expression in | ||
2768 | ** zInput[0..nInput-1]. Return a list of matching documents | ||
2769 | ** in pResult. | ||
2770 | ** | ||
2771 | ** Queries must match column iColumn. Or if iColumn>=nColumn | ||
2772 | ** they are allowed to match against any column. | ||
2773 | */ | ||
2774 | static int fulltextQuery( | ||
2775 | fulltext_vtab *v, /* The full text index */ | ||
2776 | int iColumn, /* Match against this column by default */ | ||
2777 | const char *zInput, /* The query string */ | ||
2778 | int nInput, /* Number of bytes in zInput[] */ | ||
2779 | DocList **pResult, /* Write the result doclist here */ | ||
2780 | Query *pQuery /* Put parsed query string here */ | ||
2781 | ){ | ||
2782 | int i, iNext, rc; | ||
2783 | DocList *pLeft = NULL; | ||
2784 | DocList *pRight, *pNew, *pOr; | ||
2785 | int nNot = 0; | ||
2786 | QueryTerm *aTerm; | ||
2787 | |||
2788 | rc = parseQuery(v, zInput, nInput, iColumn, pQuery); | ||
2789 | if( rc!=SQLITE_OK ) return rc; | ||
2790 | |||
2791 | /* Merge AND terms. */ | ||
2792 | aTerm = pQuery->pTerms; | ||
2793 | for(i = 0; i<pQuery->nTerms; i=iNext){ | ||
2794 | if( aTerm[i].isNot ){ | ||
2795 | /* Handle all NOT terms in a separate pass */ | ||
2796 | nNot++; | ||
2797 | iNext = i + aTerm[i].nPhrase+1; | ||
2798 | continue; | ||
2799 | } | ||
2800 | iNext = i + aTerm[i].nPhrase + 1; | ||
2801 | rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight); | ||
2802 | if( rc ){ | ||
2803 | queryClear(pQuery); | ||
2804 | return rc; | ||
2805 | } | ||
2806 | while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){ | ||
2807 | rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr); | ||
2808 | iNext += aTerm[iNext].nPhrase + 1; | ||
2809 | if( rc ){ | ||
2810 | queryClear(pQuery); | ||
2811 | return rc; | ||
2812 | } | ||
2813 | pNew = docListNew(DL_DOCIDS); | ||
2814 | docListOrMerge(pRight, pOr, pNew); | ||
2815 | docListDelete(pRight); | ||
2816 | docListDelete(pOr); | ||
2817 | pRight = pNew; | ||
2818 | } | ||
2819 | if( pLeft==0 ){ | ||
2820 | pLeft = pRight; | ||
2821 | }else{ | ||
2822 | pNew = docListNew(DL_DOCIDS); | ||
2823 | docListAndMerge(pLeft, pRight, pNew); | ||
2824 | docListDelete(pRight); | ||
2825 | docListDelete(pLeft); | ||
2826 | pLeft = pNew; | ||
2827 | } | ||
2828 | } | ||
2829 | |||
2830 | if( nNot && pLeft==0 ){ | ||
2831 | /* We do not yet know how to handle a query of only NOT terms */ | ||
2832 | return SQLITE_ERROR; | ||
2833 | } | ||
2834 | |||
2835 | /* Do the EXCEPT terms */ | ||
2836 | for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){ | ||
2837 | if( !aTerm[i].isNot ) continue; | ||
2838 | rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight); | ||
2839 | if( rc ){ | ||
2840 | queryClear(pQuery); | ||
2841 | docListDelete(pLeft); | ||
2842 | return rc; | ||
2843 | } | ||
2844 | pNew = docListNew(DL_DOCIDS); | ||
2845 | docListExceptMerge(pLeft, pRight, pNew); | ||
2846 | docListDelete(pRight); | ||
2847 | docListDelete(pLeft); | ||
2848 | pLeft = pNew; | ||
2849 | } | ||
2850 | |||
2851 | *pResult = pLeft; | ||
2852 | return rc; | ||
2853 | } | ||
2854 | |||
2855 | /* | ||
2856 | ** This is the xFilter interface for the virtual table. See | ||
2857 | ** the virtual table xFilter method documentation for additional | ||
2858 | ** information. | ||
2859 | ** | ||
2860 | ** If idxNum==QUERY_GENERIC then do a full table scan against | ||
2861 | ** the %_content table. | ||
2862 | ** | ||
2863 | ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry | ||
2864 | ** in the %_content table. | ||
2865 | ** | ||
2866 | ** If idxNum>=QUERY_FULLTEXT then use the full text index. The | ||
2867 | ** column on the left-hand side of the MATCH operator is column | ||
2868 | ** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand | ||
2869 | ** side of the MATCH operator. | ||
2870 | */ | ||
2871 | /* TODO(shess) Upgrade the cursor initialization and destruction to | ||
2872 | ** account for fulltextFilter() being called multiple times on the | ||
2873 | ** same cursor. The current solution is very fragile. Apply fix to | ||
2874 | ** fts2 as appropriate. | ||
2875 | */ | ||
2876 | static int fulltextFilter( | ||
2877 | sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */ | ||
2878 | int idxNum, const char *idxStr, /* Which indexing scheme to use */ | ||
2879 | int argc, sqlite3_value **argv /* Arguments for the indexing scheme */ | ||
2880 | ){ | ||
2881 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2882 | fulltext_vtab *v = cursor_vtab(c); | ||
2883 | int rc; | ||
2884 | char *zSql; | ||
2885 | |||
2886 | TRACE(("FTS1 Filter %p\n",pCursor)); | ||
2887 | |||
2888 | zSql = sqlite3_mprintf("select rowid, * from %%_content %s", | ||
2889 | idxNum==QUERY_GENERIC ? "" : "where rowid=?"); | ||
2890 | sqlite3_finalize(c->pStmt); | ||
2891 | rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql); | ||
2892 | sqlite3_free(zSql); | ||
2893 | if( rc!=SQLITE_OK ) return rc; | ||
2894 | |||
2895 | c->iCursorType = idxNum; | ||
2896 | switch( idxNum ){ | ||
2897 | case QUERY_GENERIC: | ||
2898 | break; | ||
2899 | |||
2900 | case QUERY_ROWID: | ||
2901 | rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0])); | ||
2902 | if( rc!=SQLITE_OK ) return rc; | ||
2903 | break; | ||
2904 | |||
2905 | default: /* full-text search */ | ||
2906 | { | ||
2907 | const char *zQuery = (const char *)sqlite3_value_text(argv[0]); | ||
2908 | DocList *pResult; | ||
2909 | assert( idxNum<=QUERY_FULLTEXT+v->nColumn); | ||
2910 | assert( argc==1 ); | ||
2911 | queryClear(&c->q); | ||
2912 | rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q); | ||
2913 | if( rc!=SQLITE_OK ) return rc; | ||
2914 | if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist); | ||
2915 | readerInit(&c->result, pResult); | ||
2916 | break; | ||
2917 | } | ||
2918 | } | ||
2919 | |||
2920 | return fulltextNext(pCursor); | ||
2921 | } | ||
2922 | |||
2923 | /* This is the xEof method of the virtual table. The SQLite core | ||
2924 | ** calls this routine to find out if it has reached the end of | ||
2925 | ** a query's results set. | ||
2926 | */ | ||
2927 | static int fulltextEof(sqlite3_vtab_cursor *pCursor){ | ||
2928 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2929 | return c->eof; | ||
2930 | } | ||
2931 | |||
2932 | /* This is the xColumn method of the virtual table. The SQLite | ||
2933 | ** core calls this method during a query when it needs the value | ||
2934 | ** of a column from the virtual table. This method needs to use | ||
2935 | ** one of the sqlite3_result_*() routines to store the requested | ||
2936 | ** value back in the pContext. | ||
2937 | */ | ||
2938 | static int fulltextColumn(sqlite3_vtab_cursor *pCursor, | ||
2939 | sqlite3_context *pContext, int idxCol){ | ||
2940 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2941 | fulltext_vtab *v = cursor_vtab(c); | ||
2942 | |||
2943 | if( idxCol<v->nColumn ){ | ||
2944 | sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1); | ||
2945 | sqlite3_result_value(pContext, pVal); | ||
2946 | }else if( idxCol==v->nColumn ){ | ||
2947 | /* The extra column whose name is the same as the table. | ||
2948 | ** Return a blob which is a pointer to the cursor | ||
2949 | */ | ||
2950 | sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT); | ||
2951 | } | ||
2952 | return SQLITE_OK; | ||
2953 | } | ||
2954 | |||
2955 | /* This is the xRowid method. The SQLite core calls this routine to | ||
2956 | ** retrive the rowid for the current row of the result set. The | ||
2957 | ** rowid should be written to *pRowid. | ||
2958 | */ | ||
2959 | static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ | ||
2960 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2961 | |||
2962 | *pRowid = sqlite3_column_int64(c->pStmt, 0); | ||
2963 | return SQLITE_OK; | ||
2964 | } | ||
2965 | |||
2966 | /* Add all terms in [zText] to the given hash table. If [iColumn] > 0, | ||
2967 | * we also store positions and offsets in the hash table using the given | ||
2968 | * column number. */ | ||
2969 | static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid, | ||
2970 | const char *zText, int iColumn){ | ||
2971 | sqlite3_tokenizer *pTokenizer = v->pTokenizer; | ||
2972 | sqlite3_tokenizer_cursor *pCursor; | ||
2973 | const char *pToken; | ||
2974 | int nTokenBytes; | ||
2975 | int iStartOffset, iEndOffset, iPosition; | ||
2976 | int rc; | ||
2977 | |||
2978 | rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); | ||
2979 | if( rc!=SQLITE_OK ) return rc; | ||
2980 | |||
2981 | pCursor->pTokenizer = pTokenizer; | ||
2982 | while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, | ||
2983 | &pToken, &nTokenBytes, | ||
2984 | &iStartOffset, &iEndOffset, | ||
2985 | &iPosition) ){ | ||
2986 | DocList *p; | ||
2987 | |||
2988 | /* Positions can't be negative; we use -1 as a terminator internally. */ | ||
2989 | if( iPosition<0 ){ | ||
2990 | pTokenizer->pModule->xClose(pCursor); | ||
2991 | return SQLITE_ERROR; | ||
2992 | } | ||
2993 | |||
2994 | p = fts1HashFind(terms, pToken, nTokenBytes); | ||
2995 | if( p==NULL ){ | ||
2996 | p = docListNew(DL_DEFAULT); | ||
2997 | docListAddDocid(p, iDocid); | ||
2998 | fts1HashInsert(terms, pToken, nTokenBytes, p); | ||
2999 | } | ||
3000 | if( iColumn>=0 ){ | ||
3001 | docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset); | ||
3002 | } | ||
3003 | } | ||
3004 | |||
3005 | /* TODO(shess) Check return? Should this be able to cause errors at | ||
3006 | ** this point? Actually, same question about sqlite3_finalize(), | ||
3007 | ** though one could argue that failure there means that the data is | ||
3008 | ** not durable. *ponder* | ||
3009 | */ | ||
3010 | pTokenizer->pModule->xClose(pCursor); | ||
3011 | return rc; | ||
3012 | } | ||
3013 | |||
3014 | /* Update the %_terms table to map the term [pTerm] to the given rowid. */ | ||
3015 | static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm, | ||
3016 | DocList *d){ | ||
3017 | sqlite_int64 iIndexRow; | ||
3018 | DocList doclist; | ||
3019 | int iSegment = 0, rc; | ||
3020 | |||
3021 | rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist); | ||
3022 | if( rc==SQLITE_DONE ){ | ||
3023 | docListInit(&doclist, DL_DEFAULT, 0, 0); | ||
3024 | docListUpdate(&doclist, d); | ||
3025 | /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */ | ||
3026 | rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist); | ||
3027 | goto err; | ||
3028 | } | ||
3029 | if( rc!=SQLITE_ROW ) return SQLITE_ERROR; | ||
3030 | |||
3031 | docListUpdate(&doclist, d); | ||
3032 | if( doclist.nData<=CHUNK_MAX ){ | ||
3033 | rc = term_update(v, iIndexRow, &doclist); | ||
3034 | goto err; | ||
3035 | } | ||
3036 | |||
3037 | /* Doclist doesn't fit, delete what's there, and accumulate | ||
3038 | ** forward. | ||
3039 | */ | ||
3040 | rc = term_delete(v, iIndexRow); | ||
3041 | if( rc!=SQLITE_OK ) goto err; | ||
3042 | |||
3043 | /* Try to insert the doclist into a higher segment bucket. On | ||
3044 | ** failure, accumulate existing doclist with the doclist from that | ||
3045 | ** bucket, and put results in the next bucket. | ||
3046 | */ | ||
3047 | iSegment++; | ||
3048 | while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment, | ||
3049 | &doclist))!=SQLITE_OK ){ | ||
3050 | sqlite_int64 iSegmentRow; | ||
3051 | DocList old; | ||
3052 | int rc2; | ||
3053 | |||
3054 | /* Retain old error in case the term_insert() error was really an | ||
3055 | ** error rather than a bounced insert. | ||
3056 | */ | ||
3057 | rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old); | ||
3058 | if( rc2!=SQLITE_ROW ) goto err; | ||
3059 | |||
3060 | rc = term_delete(v, iSegmentRow); | ||
3061 | if( rc!=SQLITE_OK ) goto err; | ||
3062 | |||
3063 | /* Reusing lowest-number deleted row keeps the index smaller. */ | ||
3064 | if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow; | ||
3065 | |||
3066 | /* doclist contains the newer data, so accumulate it over old. | ||
3067 | ** Then steal accumulated data for doclist. | ||
3068 | */ | ||
3069 | docListAccumulate(&old, &doclist); | ||
3070 | docListDestroy(&doclist); | ||
3071 | doclist = old; | ||
3072 | |||
3073 | iSegment++; | ||
3074 | } | ||
3075 | |||
3076 | err: | ||
3077 | docListDestroy(&doclist); | ||
3078 | return rc; | ||
3079 | } | ||
3080 | |||
3081 | /* Add doclists for all terms in [pValues] to the hash table [terms]. */ | ||
3082 | static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid, | ||
3083 | sqlite3_value **pValues){ | ||
3084 | int i; | ||
3085 | for(i = 0; i < v->nColumn ; ++i){ | ||
3086 | char *zText = (char*)sqlite3_value_text(pValues[i]); | ||
3087 | int rc = buildTerms(v, terms, iRowid, zText, i); | ||
3088 | if( rc!=SQLITE_OK ) return rc; | ||
3089 | } | ||
3090 | return SQLITE_OK; | ||
3091 | } | ||
3092 | |||
3093 | /* Add empty doclists for all terms in the given row's content to the hash | ||
3094 | * table [pTerms]. */ | ||
3095 | static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){ | ||
3096 | const char **pValues; | ||
3097 | int i; | ||
3098 | |||
3099 | int rc = content_select(v, iRowid, &pValues); | ||
3100 | if( rc!=SQLITE_OK ) return rc; | ||
3101 | |||
3102 | for(i = 0 ; i < v->nColumn; ++i) { | ||
3103 | rc = buildTerms(v, pTerms, iRowid, pValues[i], -1); | ||
3104 | if( rc!=SQLITE_OK ) break; | ||
3105 | } | ||
3106 | |||
3107 | freeStringArray(v->nColumn, pValues); | ||
3108 | return SQLITE_OK; | ||
3109 | } | ||
3110 | |||
3111 | /* Insert a row into the %_content table; set *piRowid to be the ID of the | ||
3112 | * new row. Fill [pTerms] with new doclists for the %_term table. */ | ||
3113 | static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid, | ||
3114 | sqlite3_value **pValues, | ||
3115 | sqlite_int64 *piRowid, fts1Hash *pTerms){ | ||
3116 | int rc; | ||
3117 | |||
3118 | rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */ | ||
3119 | if( rc!=SQLITE_OK ) return rc; | ||
3120 | *piRowid = sqlite3_last_insert_rowid(v->db); | ||
3121 | return insertTerms(v, pTerms, *piRowid, pValues); | ||
3122 | } | ||
3123 | |||
3124 | /* Delete a row from the %_content table; fill [pTerms] with empty doclists | ||
3125 | * to be written to the %_term table. */ | ||
3126 | static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){ | ||
3127 | int rc = deleteTerms(v, pTerms, iRow); | ||
3128 | if( rc!=SQLITE_OK ) return rc; | ||
3129 | return content_delete(v, iRow); /* execute an SQL DELETE */ | ||
3130 | } | ||
3131 | |||
3132 | /* Update a row in the %_content table; fill [pTerms] with new doclists for the | ||
3133 | * %_term table. */ | ||
3134 | static int index_update(fulltext_vtab *v, sqlite_int64 iRow, | ||
3135 | sqlite3_value **pValues, fts1Hash *pTerms){ | ||
3136 | /* Generate an empty doclist for each term that previously appeared in this | ||
3137 | * row. */ | ||
3138 | int rc = deleteTerms(v, pTerms, iRow); | ||
3139 | if( rc!=SQLITE_OK ) return rc; | ||
3140 | |||
3141 | rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */ | ||
3142 | if( rc!=SQLITE_OK ) return rc; | ||
3143 | |||
3144 | /* Now add positions for terms which appear in the updated row. */ | ||
3145 | return insertTerms(v, pTerms, iRow, pValues); | ||
3146 | } | ||
3147 | |||
3148 | /* This function implements the xUpdate callback; it's the top-level entry | ||
3149 | * point for inserting, deleting or updating a row in a full-text table. */ | ||
3150 | static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, | ||
3151 | sqlite_int64 *pRowid){ | ||
3152 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
3153 | fts1Hash terms; /* maps term string -> PosList */ | ||
3154 | int rc; | ||
3155 | fts1HashElem *e; | ||
3156 | |||
3157 | TRACE(("FTS1 Update %p\n", pVtab)); | ||
3158 | |||
3159 | fts1HashInit(&terms, FTS1_HASH_STRING, 1); | ||
3160 | |||
3161 | if( nArg<2 ){ | ||
3162 | rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms); | ||
3163 | } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ | ||
3164 | /* An update: | ||
3165 | * ppArg[0] = old rowid | ||
3166 | * ppArg[1] = new rowid | ||
3167 | * ppArg[2..2+v->nColumn-1] = values | ||
3168 | * ppArg[2+v->nColumn] = value for magic column (we ignore this) | ||
3169 | */ | ||
3170 | sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]); | ||
3171 | if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER || | ||
3172 | sqlite3_value_int64(ppArg[1]) != rowid ){ | ||
3173 | rc = SQLITE_ERROR; /* we don't allow changing the rowid */ | ||
3174 | } else { | ||
3175 | assert( nArg==2+v->nColumn+1); | ||
3176 | rc = index_update(v, rowid, &ppArg[2], &terms); | ||
3177 | } | ||
3178 | } else { | ||
3179 | /* An insert: | ||
3180 | * ppArg[1] = requested rowid | ||
3181 | * ppArg[2..2+v->nColumn-1] = values | ||
3182 | * ppArg[2+v->nColumn] = value for magic column (we ignore this) | ||
3183 | */ | ||
3184 | assert( nArg==2+v->nColumn+1); | ||
3185 | rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms); | ||
3186 | } | ||
3187 | |||
3188 | if( rc==SQLITE_OK ){ | ||
3189 | /* Write updated doclists to disk. */ | ||
3190 | for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ | ||
3191 | DocList *p = fts1HashData(e); | ||
3192 | rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p); | ||
3193 | if( rc!=SQLITE_OK ) break; | ||
3194 | } | ||
3195 | } | ||
3196 | |||
3197 | /* clean up */ | ||
3198 | for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ | ||
3199 | DocList *p = fts1HashData(e); | ||
3200 | docListDelete(p); | ||
3201 | } | ||
3202 | fts1HashClear(&terms); | ||
3203 | |||
3204 | return rc; | ||
3205 | } | ||
3206 | |||
3207 | /* | ||
3208 | ** Implementation of the snippet() function for FTS1 | ||
3209 | */ | ||
3210 | static void snippetFunc( | ||
3211 | sqlite3_context *pContext, | ||
3212 | int argc, | ||
3213 | sqlite3_value **argv | ||
3214 | ){ | ||
3215 | fulltext_cursor *pCursor; | ||
3216 | if( argc<1 ) return; | ||
3217 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
3218 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
3219 | sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1); | ||
3220 | }else{ | ||
3221 | const char *zStart = "<b>"; | ||
3222 | const char *zEnd = "</b>"; | ||
3223 | const char *zEllipsis = "<b>...</b>"; | ||
3224 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
3225 | if( argc>=2 ){ | ||
3226 | zStart = (const char*)sqlite3_value_text(argv[1]); | ||
3227 | if( argc>=3 ){ | ||
3228 | zEnd = (const char*)sqlite3_value_text(argv[2]); | ||
3229 | if( argc>=4 ){ | ||
3230 | zEllipsis = (const char*)sqlite3_value_text(argv[3]); | ||
3231 | } | ||
3232 | } | ||
3233 | } | ||
3234 | snippetAllOffsets(pCursor); | ||
3235 | snippetText(pCursor, zStart, zEnd, zEllipsis); | ||
3236 | sqlite3_result_text(pContext, pCursor->snippet.zSnippet, | ||
3237 | pCursor->snippet.nSnippet, SQLITE_STATIC); | ||
3238 | } | ||
3239 | } | ||
3240 | |||
3241 | /* | ||
3242 | ** Implementation of the offsets() function for FTS1 | ||
3243 | */ | ||
3244 | static void snippetOffsetsFunc( | ||
3245 | sqlite3_context *pContext, | ||
3246 | int argc, | ||
3247 | sqlite3_value **argv | ||
3248 | ){ | ||
3249 | fulltext_cursor *pCursor; | ||
3250 | if( argc<1 ) return; | ||
3251 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
3252 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
3253 | sqlite3_result_error(pContext, "illegal first argument to offsets",-1); | ||
3254 | }else{ | ||
3255 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
3256 | snippetAllOffsets(pCursor); | ||
3257 | snippetOffsetText(&pCursor->snippet); | ||
3258 | sqlite3_result_text(pContext, | ||
3259 | pCursor->snippet.zOffset, pCursor->snippet.nOffset, | ||
3260 | SQLITE_STATIC); | ||
3261 | } | ||
3262 | } | ||
3263 | |||
3264 | /* | ||
3265 | ** This routine implements the xFindFunction method for the FTS1 | ||
3266 | ** virtual table. | ||
3267 | */ | ||
3268 | static int fulltextFindFunction( | ||
3269 | sqlite3_vtab *pVtab, | ||
3270 | int nArg, | ||
3271 | const char *zName, | ||
3272 | void (**pxFunc)(sqlite3_context*,int,sqlite3_value**), | ||
3273 | void **ppArg | ||
3274 | ){ | ||
3275 | if( strcmp(zName,"snippet")==0 ){ | ||
3276 | *pxFunc = snippetFunc; | ||
3277 | return 1; | ||
3278 | }else if( strcmp(zName,"offsets")==0 ){ | ||
3279 | *pxFunc = snippetOffsetsFunc; | ||
3280 | return 1; | ||
3281 | } | ||
3282 | return 0; | ||
3283 | } | ||
3284 | |||
3285 | /* | ||
3286 | ** Rename an fts1 table. | ||
3287 | */ | ||
3288 | static int fulltextRename( | ||
3289 | sqlite3_vtab *pVtab, | ||
3290 | const char *zName | ||
3291 | ){ | ||
3292 | fulltext_vtab *p = (fulltext_vtab *)pVtab; | ||
3293 | int rc = SQLITE_NOMEM; | ||
3294 | char *zSql = sqlite3_mprintf( | ||
3295 | "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';" | ||
3296 | "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';" | ||
3297 | , p->zDb, p->zName, zName | ||
3298 | , p->zDb, p->zName, zName | ||
3299 | ); | ||
3300 | if( zSql ){ | ||
3301 | rc = sqlite3_exec(p->db, zSql, 0, 0, 0); | ||
3302 | sqlite3_free(zSql); | ||
3303 | } | ||
3304 | return rc; | ||
3305 | } | ||
3306 | |||
3307 | static const sqlite3_module fulltextModule = { | ||
3308 | /* iVersion */ 0, | ||
3309 | /* xCreate */ fulltextCreate, | ||
3310 | /* xConnect */ fulltextConnect, | ||
3311 | /* xBestIndex */ fulltextBestIndex, | ||
3312 | /* xDisconnect */ fulltextDisconnect, | ||
3313 | /* xDestroy */ fulltextDestroy, | ||
3314 | /* xOpen */ fulltextOpen, | ||
3315 | /* xClose */ fulltextClose, | ||
3316 | /* xFilter */ fulltextFilter, | ||
3317 | /* xNext */ fulltextNext, | ||
3318 | /* xEof */ fulltextEof, | ||
3319 | /* xColumn */ fulltextColumn, | ||
3320 | /* xRowid */ fulltextRowid, | ||
3321 | /* xUpdate */ fulltextUpdate, | ||
3322 | /* xBegin */ 0, | ||
3323 | /* xSync */ 0, | ||
3324 | /* xCommit */ 0, | ||
3325 | /* xRollback */ 0, | ||
3326 | /* xFindFunction */ fulltextFindFunction, | ||
3327 | /* xRename */ fulltextRename, | ||
3328 | }; | ||
3329 | |||
3330 | int sqlite3Fts1Init(sqlite3 *db){ | ||
3331 | sqlite3_overload_function(db, "snippet", -1); | ||
3332 | sqlite3_overload_function(db, "offsets", -1); | ||
3333 | return sqlite3_create_module(db, "fts1", &fulltextModule, 0); | ||
3334 | } | ||
3335 | |||
3336 | #if !SQLITE_CORE | ||
3337 | int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, | ||
3338 | const sqlite3_api_routines *pApi){ | ||
3339 | SQLITE_EXTENSION_INIT2(pApi) | ||
3340 | return sqlite3Fts1Init(db); | ||
3341 | } | ||
3342 | #endif | ||
3343 | |||
3344 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ | ||