diff options
author | dan miller | 2007-10-21 08:36:32 +0000 |
---|---|---|
committer | dan miller | 2007-10-21 08:36:32 +0000 |
commit | 2f8d7092bc2c9609fa98d6888106b96f38b22828 (patch) | |
tree | da6c37579258cc965b52a75aee6135fe44237698 /libraries/sqlite/unix/sqlite-3.5.1/ext | |
parent | * Committing new PolicyManager based on an ACL system. (diff) | |
download | opensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.zip opensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.tar.gz opensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.tar.bz2 opensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.tar.xz |
libraries moved to opensim-libs, a new repository
Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext')
35 files changed, 0 insertions, 22182 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/README.txt deleted file mode 100644 index 009495f..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/README.txt +++ /dev/null | |||
@@ -1,2 +0,0 @@ | |||
1 | Version loadable extensions to SQLite are found in subfolders | ||
2 | of this folder. | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt deleted file mode 100644 index 292b7da..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt +++ /dev/null | |||
@@ -1,2 +0,0 @@ | |||
1 | This folder contains source code to the first full-text search | ||
2 | extension for SQLite. | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c deleted file mode 100644 index 5a69965..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c +++ /dev/null | |||
@@ -1,3344 +0,0 @@ | |||
1 | /* fts1 has a design flaw which can lead to database corruption (see | ||
2 | ** below). It is recommended not to use it any longer, instead use | ||
3 | ** fts3 (or higher). If you believe that your use of fts1 is safe, | ||
4 | ** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS. | ||
5 | */ | ||
6 | #ifndef SQLITE_ENABLE_BROKEN_FTS1 | ||
7 | #error fts1 has a design flaw and has been deprecated. | ||
8 | #endif | ||
9 | /* The flaw is that fts1 uses the content table's unaliased rowid as | ||
10 | ** the unique docid. fts1 embeds the rowid in the index it builds, | ||
11 | ** and expects the rowid to not change. The SQLite VACUUM operation | ||
12 | ** will renumber such rowids, thereby breaking fts1. If you are using | ||
13 | ** fts1 in a system which has disabled VACUUM, then you can continue | ||
14 | ** to use it safely. Note that PRAGMA auto_vacuum does NOT disable | ||
15 | ** VACUUM, though systems using auto_vacuum are unlikely to invoke | ||
16 | ** VACUUM. | ||
17 | ** | ||
18 | ** fts1 should be safe even across VACUUM if you only insert documents | ||
19 | ** and never delete. | ||
20 | */ | ||
21 | |||
22 | /* The author disclaims copyright to this source code. | ||
23 | * | ||
24 | * This is an SQLite module implementing full-text search. | ||
25 | */ | ||
26 | |||
27 | /* | ||
28 | ** The code in this file is only compiled if: | ||
29 | ** | ||
30 | ** * The FTS1 module is being built as an extension | ||
31 | ** (in which case SQLITE_CORE is not defined), or | ||
32 | ** | ||
33 | ** * The FTS1 module is being built into the core of | ||
34 | ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). | ||
35 | */ | ||
36 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) | ||
37 | |||
38 | #if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE) | ||
39 | # define SQLITE_CORE 1 | ||
40 | #endif | ||
41 | |||
42 | #include <assert.h> | ||
43 | #include <stdlib.h> | ||
44 | #include <stdio.h> | ||
45 | #include <string.h> | ||
46 | #include <ctype.h> | ||
47 | |||
48 | #include "fts1.h" | ||
49 | #include "fts1_hash.h" | ||
50 | #include "fts1_tokenizer.h" | ||
51 | #include "sqlite3.h" | ||
52 | #include "sqlite3ext.h" | ||
53 | SQLITE_EXTENSION_INIT1 | ||
54 | |||
55 | |||
56 | #if 0 | ||
57 | # define TRACE(A) printf A; fflush(stdout) | ||
58 | #else | ||
59 | # define TRACE(A) | ||
60 | #endif | ||
61 | |||
62 | /* utility functions */ | ||
63 | |||
/*
** A simple growable, NUL-terminated string buffer.
** Managed by initStringBuffer(), nappend(), and append().
*/
typedef struct StringBuffer {
  int len;      /* length, not including null terminator */
  int alloced;  /* Space allocated for s[] */
  char *s;      /* Content of the string */
} StringBuffer;
69 | |||
70 | static void initStringBuffer(StringBuffer *sb){ | ||
71 | sb->len = 0; | ||
72 | sb->alloced = 100; | ||
73 | sb->s = malloc(100); | ||
74 | sb->s[0] = '\0'; | ||
75 | } | ||
76 | |||
77 | static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){ | ||
78 | if( sb->len + nFrom >= sb->alloced ){ | ||
79 | sb->alloced = sb->len + nFrom + 100; | ||
80 | sb->s = realloc(sb->s, sb->alloced+1); | ||
81 | if( sb->s==0 ){ | ||
82 | initStringBuffer(sb); | ||
83 | return; | ||
84 | } | ||
85 | } | ||
86 | memcpy(sb->s + sb->len, zFrom, nFrom); | ||
87 | sb->len += nFrom; | ||
88 | sb->s[sb->len] = 0; | ||
89 | } | ||
90 | static void append(StringBuffer *sb, const char *zFrom){ | ||
91 | nappend(sb, zFrom, strlen(zFrom)); | ||
92 | } | ||
93 | |||
94 | /* We encode variable-length integers in little-endian order using seven bits | ||
95 | * per byte as follows: | ||
96 | ** | ||
97 | ** KEY: | ||
98 | ** A = 0xxxxxxx 7 bits of data and one flag bit | ||
99 | ** B = 1xxxxxxx 7 bits of data and one flag bit | ||
100 | ** | ||
101 | ** 7 bits - A | ||
102 | ** 14 bits - BA | ||
103 | ** 21 bits - BBA | ||
104 | ** and so on. | ||
105 | */ | ||
106 | |||
107 | /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ | ||
108 | #define VARINT_MAX 10 | ||
109 | |||
110 | /* Write a 64-bit variable-length integer to memory starting at p[0]. | ||
111 | * The length of data written will be between 1 and VARINT_MAX bytes. | ||
112 | * The number of bytes written is returned. */ | ||
113 | static int putVarint(char *p, sqlite_int64 v){ | ||
114 | unsigned char *q = (unsigned char *) p; | ||
115 | sqlite_uint64 vu = v; | ||
116 | do{ | ||
117 | *q++ = (unsigned char) ((vu & 0x7f) | 0x80); | ||
118 | vu >>= 7; | ||
119 | }while( vu!=0 ); | ||
120 | q[-1] &= 0x7f; /* turn off high bit in final byte */ | ||
121 | assert( q - (unsigned char *)p <= VARINT_MAX ); | ||
122 | return (int) (q - (unsigned char *)p); | ||
123 | } | ||
124 | |||
/* Read a 64-bit variable-length integer from memory starting at p[0].
** Return the number of bytes read, or 0 on error.
** The value is stored in *v.
**
** Decoding mirrors putVarint(): each byte carries 7 bits of payload in
** little-endian order, and a set high bit means more bytes follow.
*/
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;  /* y is the place value (128^i) of byte i */
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);  /* the final byte has its high bit clear */
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}
143 | |||
144 | static int getVarint32(const char *p, int *pi){ | ||
145 | sqlite_int64 i; | ||
146 | int ret = getVarint(p, &i); | ||
147 | *pi = (int) i; | ||
148 | assert( *pi==i ); | ||
149 | return ret; | ||
150 | } | ||
151 | |||
152 | /*** Document lists *** | ||
153 | * | ||
154 | * A document list holds a sorted list of varint-encoded document IDs. | ||
155 | * | ||
156 | * A doclist with type DL_POSITIONS_OFFSETS is stored like this: | ||
157 | * | ||
158 | * array { | ||
159 | * varint docid; | ||
160 | * array { | ||
161 | * varint position; (delta from previous position plus POS_BASE) | ||
162 | * varint startOffset; (delta from previous startOffset) | ||
163 | * varint endOffset; (delta from startOffset) | ||
164 | * } | ||
165 | * } | ||
166 | * | ||
167 | * Here, array { X } means zero or more occurrences of X, adjacent in memory. | ||
168 | * | ||
169 | * A position list may hold positions for text in multiple columns. A position | ||
170 | * POS_COLUMN is followed by a varint containing the index of the column for | ||
171 | * following positions in the list. Any positions appearing before any | ||
172 | * occurrences of POS_COLUMN are for column 0. | ||
173 | * | ||
174 | * A doclist with type DL_POSITIONS is like the above, but holds only docids | ||
175 | * and positions without offset information. | ||
176 | * | ||
177 | * A doclist with type DL_DOCIDS is like the above, but holds only docids | ||
178 | * without positions or offset information. | ||
179 | * | ||
180 | * On disk, every document list has positions and offsets, so we don't bother | ||
181 | * to serialize a doclist's type. | ||
182 | * | ||
183 | * We don't yet delta-encode document IDs; doing so will probably be a | ||
184 | * modest win. | ||
185 | * | ||
186 | * NOTE(shess) I've thought of a slightly (1%) better offset encoding. | ||
187 | * After the first offset, estimate the next offset by using the | ||
188 | * current token position and the previous token position and offset, | ||
189 | * offset to handle some variance. So the estimate would be | ||
190 | * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded | ||
191 | * as normal. Offsets more than 64 chars from the estimate are | ||
192 | * encoded as the delta to the previous start offset + 128. An | ||
193 | * additional tiny increment can be gained by using the end offset of | ||
194 | * the previous token to make the estimate a tiny bit more precise. | ||
195 | */ | ||
196 | |||
197 | /* It is not safe to call isspace(), tolower(), or isalnum() on | ||
198 | ** hi-bit-set characters. This is the same solution used in the | ||
199 | ** tokenizer. | ||
200 | */ | ||
201 | /* TODO(shess) The snippet-generation code should be using the | ||
202 | ** tokenizer-generated tokens rather than doing its own local | ||
203 | ** tokenization. | ||
204 | */ | ||
205 | /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ | ||
/* isspace() that is safe on hi-bit-set (non-ASCII) chars: those are
** never considered whitespace. */
static int safe_isspace(char c){
  if( c & 0x80 ) return 0;
  return isspace((unsigned char)c);
}
/* tolower() that is safe on hi-bit-set chars: they pass through
** unchanged. */
static int safe_tolower(char c){
  if( c & 0x80 ) return c;
  return tolower((unsigned char)c);
}
/* isalnum() that is safe on hi-bit-set chars: they are never
** alphanumeric. */
static int safe_isalnum(char c){
  if( c & 0x80 ) return 0;
  return isalnum((unsigned char)c);
}
215 | |||
/*
** The three levels of detail a doclist can carry.  Each level is a
** strict superset of the one before it.
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;
221 | |||
222 | /* | ||
223 | ** By default, only positions and not offsets are stored in the doclists. | ||
224 | ** To change this so that offsets are stored too, compile with | ||
225 | ** | ||
226 | ** -DDL_DEFAULT=DL_POSITIONS_OFFSETS | ||
227 | ** | ||
228 | */ | ||
229 | #ifndef DL_DEFAULT | ||
230 | # define DL_DEFAULT DL_POSITIONS | ||
231 | #endif | ||
232 | |||
/*
** An in-memory doclist: a heap-allocated buffer of varint-encoded
** docids (plus positions/offsets, depending on iType) together with
** the bookkeeping needed to delta-encode the next entry appended.
*/
typedef struct DocList {
  char *pData;        /* Encoded doclist data */
  int nData;          /* Number of bytes of data at pData */
  DocListType iType;  /* Level of detail stored (see DocListType) */
  int iLastColumn;    /* the last column written */
  int iLastPos;       /* the last position written */
  int iLastOffset;    /* the last start offset written */
} DocList;
241 | |||
/*
** Marker values used inside an encoded position list.  Real position
** deltas are stored offset by POS_BASE so that the values 0 and 1
** remain available for the POS_END and POS_COLUMN markers.
*/
enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE            /* first value used for actual position deltas */
};
247 | |||
248 | /* Initialize a new DocList to hold the given data. */ | ||
249 | static void docListInit(DocList *d, DocListType iType, | ||
250 | const char *pData, int nData){ | ||
251 | d->nData = nData; | ||
252 | if( nData>0 ){ | ||
253 | d->pData = malloc(nData); | ||
254 | memcpy(d->pData, pData, nData); | ||
255 | } else { | ||
256 | d->pData = NULL; | ||
257 | } | ||
258 | d->iType = iType; | ||
259 | d->iLastColumn = 0; | ||
260 | d->iLastPos = d->iLastOffset = 0; | ||
261 | } | ||
262 | |||
263 | /* Create a new dynamically-allocated DocList. */ | ||
264 | static DocList *docListNew(DocListType iType){ | ||
265 | DocList *d = (DocList *) malloc(sizeof(DocList)); | ||
266 | docListInit(d, iType, 0, 0); | ||
267 | return d; | ||
268 | } | ||
269 | |||
270 | static void docListDestroy(DocList *d){ | ||
271 | free(d->pData); | ||
272 | #ifndef NDEBUG | ||
273 | memset(d, 0x55, sizeof(*d)); | ||
274 | #endif | ||
275 | } | ||
276 | |||
277 | static void docListDelete(DocList *d){ | ||
278 | docListDestroy(d); | ||
279 | free(d); | ||
280 | } | ||
281 | |||
282 | static char *docListEnd(DocList *d){ | ||
283 | return d->pData + d->nData; | ||
284 | } | ||
285 | |||
286 | /* Append a varint to a DocList's data. */ | ||
287 | static void appendVarint(DocList *d, sqlite_int64 i){ | ||
288 | char c[VARINT_MAX]; | ||
289 | int n = putVarint(c, i); | ||
290 | d->pData = realloc(d->pData, d->nData + n); | ||
291 | memcpy(d->pData + d->nData, c, n); | ||
292 | d->nData += n; | ||
293 | } | ||
294 | |||
295 | static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ | ||
296 | appendVarint(d, iDocid); | ||
297 | if( d->iType>=DL_POSITIONS ){ | ||
298 | appendVarint(d, POS_END); /* initially empty position list */ | ||
299 | d->iLastColumn = 0; | ||
300 | d->iLastPos = d->iLastOffset = 0; | ||
301 | } | ||
302 | } | ||
303 | |||
/* helper function for docListAddPos and docListAddPosOffset.
**
** Appends position (iColumn, iPos) to d.  This strips the trailing
** POS_END terminator before writing; the caller is responsible for
** re-appending a terminator afterwards.
*/
static void addPos(DocList *d, int iColumn, int iPos){
  assert( d->nData>0 );
  --d->nData;  /* remove previous terminator */
  if( iColumn!=d->iLastColumn ){
    /* Columns must be written in ascending order; switching columns
    ** emits a POS_COLUMN marker and restarts position deltas at 0. */
    assert( iColumn>d->iLastColumn );
    appendVarint(d, POS_COLUMN);
    appendVarint(d, iColumn);
    d->iLastColumn = iColumn;
    d->iLastPos = d->iLastOffset = 0;
  }
  assert( iPos>=d->iLastPos );
  /* Positions are delta-encoded, offset by POS_BASE so the encoded
  ** value never collides with the POS_END/POS_COLUMN markers. */
  appendVarint(d, iPos-d->iLastPos+POS_BASE);
  d->iLastPos = iPos;
}
319 | |||
320 | /* Add a position to the last position list in a doclist. */ | ||
321 | static void docListAddPos(DocList *d, int iColumn, int iPos){ | ||
322 | assert( d->iType==DL_POSITIONS ); | ||
323 | addPos(d, iColumn, iPos); | ||
324 | appendVarint(d, POS_END); /* add new terminator */ | ||
325 | } | ||
326 | |||
/*
** Add a position and starting and ending offsets to a doclist.
**
** If the doclist is setup to handle only positions, then insert
** the position only and ignore the offsets.
*/
static void docListAddPosOffset(
  DocList *d,             /* Doclist under construction */
  int iColumn,            /* Column the inserted term is part of */
  int iPos,               /* Position of the inserted term */
  int iStartOffset,       /* Starting offset of inserted term */
  int iEndOffset          /* Ending offset of inserted term */
){
  assert( d->iType>=DL_POSITIONS );
  addPos(d, iColumn, iPos);
  if( d->iType==DL_POSITIONS_OFFSETS ){
    /* Start offsets are delta-encoded against the previous start
    ** offset; end offsets are encoded relative to their own start. */
    assert( iStartOffset>=d->iLastOffset );
    appendVarint(d, iStartOffset-d->iLastOffset);
    d->iLastOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    appendVarint(d, iEndOffset-iStartOffset);
  }
  appendVarint(d, POS_END);  /* add new terminator */
}
351 | |||
352 | /* | ||
353 | ** A DocListReader object is a cursor into a doclist. Initialize | ||
354 | ** the cursor to the beginning of the doclist by calling readerInit(). | ||
355 | ** Then use routines | ||
356 | ** | ||
357 | ** peekDocid() | ||
358 | ** readDocid() | ||
359 | ** readPosition() | ||
360 | ** skipPositionList() | ||
361 | ** and so forth... | ||
362 | ** | ||
363 | ** to read information out of the doclist. When we reach the end | ||
364 | ** of the doclist, atEnd() returns TRUE. | ||
365 | */ | ||
/* Cursor state for iterating over a DocList (see comment above). */
typedef struct DocListReader {
  DocList *pDoclist;  /* The document list we are stepping through */
  char *p;            /* Pointer to next unread byte in the doclist */
  int iLastColumn;    /* Column of the last position read, or -1 */
  int iLastPos;  /* the last position read, or -1 when not in a position list */
} DocListReader;
372 | |||
373 | /* | ||
374 | ** Initialize the DocListReader r to point to the beginning of pDoclist. | ||
375 | */ | ||
376 | static void readerInit(DocListReader *r, DocList *pDoclist){ | ||
377 | r->pDoclist = pDoclist; | ||
378 | if( pDoclist!=NULL ){ | ||
379 | r->p = pDoclist->pData; | ||
380 | } | ||
381 | r->iLastColumn = -1; | ||
382 | r->iLastPos = -1; | ||
383 | } | ||
384 | |||
385 | /* | ||
386 | ** Return TRUE if we have reached then end of pReader and there is | ||
387 | ** nothing else left to read. | ||
388 | */ | ||
389 | static int atEnd(DocListReader *pReader){ | ||
390 | return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist)); | ||
391 | } | ||
392 | |||
393 | /* Peek at the next docid without advancing the read pointer. | ||
394 | */ | ||
395 | static sqlite_int64 peekDocid(DocListReader *pReader){ | ||
396 | sqlite_int64 ret; | ||
397 | assert( !atEnd(pReader) ); | ||
398 | assert( pReader->iLastPos==-1 ); | ||
399 | getVarint(pReader->p, &ret); | ||
400 | return ret; | ||
401 | } | ||
402 | |||
403 | /* Read the next docid. See also nextDocid(). | ||
404 | */ | ||
405 | static sqlite_int64 readDocid(DocListReader *pReader){ | ||
406 | sqlite_int64 ret; | ||
407 | assert( !atEnd(pReader) ); | ||
408 | assert( pReader->iLastPos==-1 ); | ||
409 | pReader->p += getVarint(pReader->p, &ret); | ||
410 | if( pReader->pDoclist->iType>=DL_POSITIONS ){ | ||
411 | pReader->iLastColumn = 0; | ||
412 | pReader->iLastPos = 0; | ||
413 | } | ||
414 | return ret; | ||
415 | } | ||
416 | |||
/* Read the next position and column index from a position list.
** Returns the position, or -1 at the end of the list.
**
** On a successful read, *iColumn receives the column of the position;
** on end-of-list, *iColumn is set to -1.  The reader's delta-decoding
** state (iLastColumn/iLastPos) is updated as a side effect.
*/
static int readPosition(DocListReader *pReader, int *iColumn){
  int i;
  int iType = pReader->pDoclist->iType;

  /* iLastPos==-1 means the reader is not inside a position list. */
  if( pReader->iLastPos==-1 ){
    return -1;
  }
  assert( !atEnd(pReader) );

  if( iType<DL_POSITIONS ){
    return -1;
  }
  pReader->p += getVarint32(pReader->p, &i);
  if( i==POS_END ){
    /* Terminator: leave the position list. */
    pReader->iLastColumn = pReader->iLastPos = -1;
    *iColumn = -1;
    return -1;
  }
  if( i==POS_COLUMN ){
    /* Column marker: next varint is the new column; position deltas
    ** restart from zero within the new column. */
    pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
    pReader->iLastPos = 0;
    pReader->p += getVarint32(pReader->p, &i);
    assert( i>=POS_BASE );
  }
  /* Positions are stored as (delta + POS_BASE). */
  pReader->iLastPos += ((int) i)-POS_BASE;
  if( iType>=DL_POSITIONS_OFFSETS ){
    /* Skip over offsets, ignoring them for now. */
    int iStart, iEnd;
    pReader->p += getVarint32(pReader->p, &iStart);
    pReader->p += getVarint32(pReader->p, &iEnd);
  }
  *iColumn = pReader->iLastColumn;
  return pReader->iLastPos;
}
453 | |||
454 | /* Skip past the end of a position list. */ | ||
455 | static void skipPositionList(DocListReader *pReader){ | ||
456 | DocList *p = pReader->pDoclist; | ||
457 | if( p && p->iType>=DL_POSITIONS ){ | ||
458 | int iColumn; | ||
459 | while( readPosition(pReader, &iColumn)!=-1 ){} | ||
460 | } | ||
461 | } | ||
462 | |||
463 | /* Skip over a docid, including its position list if the doclist has | ||
464 | * positions. */ | ||
465 | static void skipDocument(DocListReader *pReader){ | ||
466 | readDocid(pReader); | ||
467 | skipPositionList(pReader); | ||
468 | } | ||
469 | |||
470 | /* Skip past all docids which are less than [iDocid]. Returns 1 if a docid | ||
471 | * matching [iDocid] was found. */ | ||
472 | static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){ | ||
473 | sqlite_int64 d = 0; | ||
474 | while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){ | ||
475 | skipDocument(pReader); | ||
476 | } | ||
477 | return !atEnd(pReader) && d==iDocid; | ||
478 | } | ||
479 | |||
480 | /* Return the first document in a document list. | ||
481 | */ | ||
482 | static sqlite_int64 firstDocid(DocList *d){ | ||
483 | DocListReader r; | ||
484 | readerInit(&r, d); | ||
485 | return readDocid(&r); | ||
486 | } | ||
487 | |||
#ifdef SQLITE_DEBUG
/*
** This routine is used for debugging purpose only.
**
** Write the content of a doclist to standard output, as a
** comma-separated list of docids, each optionally followed by its
** (column:position ...) list.  Docid 0 entries are skipped.
*/
static void printDoclist(DocList *p){
  DocListReader r;
  const char *zSep = "";

  readerInit(&r, p);
  while( !atEnd(&r) ){
    sqlite_int64 docid = readDocid(&r);
    if( docid==0 ){
      skipPositionList(&r);
      continue;
    }
    printf("%s%lld", zSep, docid);
    zSep =  ",";
    if( p->iType>=DL_POSITIONS ){
      int iPos, iCol;
      const char *zDiv = "";
      printf("(");
      while( (iPos = readPosition(&r, &iCol))>=0 ){
        printf("%s%d:%d", zDiv, iCol, iPos);
        zDiv = ":";
      }
      printf(")");
    }
  }
  printf("\n");
  fflush(stdout);
}
#endif /* SQLITE_DEBUG */
522 | |||
/* Trim the given doclist to contain only positions in column
** [iRestrictColumn].  The input doclist is rewritten in place
** (its old storage is destroyed and replaced).  Docids whose
** position lists become empty are still emitted, with no positions.
*/
static void docListRestrictColumn(DocList *in, int iRestrictColumn){
  DocListReader r;
  DocList out;

  assert( in->iType>=DL_POSITIONS );
  readerInit(&r, in);
  docListInit(&out, DL_POSITIONS, NULL, 0);

  while( !atEnd(&r) ){
    sqlite_int64 iDocid = readDocid(&r);
    int iPos, iColumn;

    docListAddDocid(&out, iDocid);
    while( (iPos = readPosition(&r, &iColumn)) != -1 ){
      /* Keep only positions from the requested column. */
      if( iColumn==iRestrictColumn ){
        docListAddPos(&out, iColumn, iPos);
      }
    }
  }

  docListDestroy(in);
  *in = out;  /* transfer ownership of out's storage to in */
}
548 | |||
/* Trim the given doclist by discarding any docids without any remaining
** positions.  The input is rewritten in place.
*/
static void docListDiscardEmpty(DocList *in) {
  DocListReader r;
  DocList out;

  /* TODO: It would be nice to implement this operation in place; that
   * could save a significant amount of memory in queries with long doclists. */
  assert( in->iType>=DL_POSITIONS );
  readerInit(&r, in);
  docListInit(&out, DL_POSITIONS, NULL, 0);

  while( !atEnd(&r) ){
    sqlite_int64 iDocid = readDocid(&r);
    int match = 0;     /* becomes true once this docid is known non-empty */
    int iPos, iColumn;
    while( (iPos = readPosition(&r, &iColumn)) != -1 ){
      /* Emit the docid lazily, only when its first position appears. */
      if( !match ){
        docListAddDocid(&out, iDocid);
        match = 1;
      }
      docListAddPos(&out, iColumn, iPos);
    }
  }

  docListDestroy(in);
  *in = out;  /* transfer ownership of out's storage to in */
}
577 | |||
/* Helper function for docListUpdate() and docListAccumulate().
** Splices a doclist element into the doclist represented by r,
** leaving r pointing after the newly spliced element.
**
** If iDocid already exists in the doclist, its record (docid plus
** position list) is replaced by pSource/nSource; otherwise the new
** element is inserted at the correct sorted location.
*/
static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
                                 const char *pSource, int nSource){
  DocList *d = r->pDoclist;
  char *pTarget;
  int nTarget, found;

  found = skipToDocid(r, iDocid);

  /* Describe slice in d to place pSource/nSource. */
  pTarget = r->p;
  if( found ){
    /* Replacing: the slice spans the old docid and its positions. */
    skipDocument(r);
    nTarget = r->p-pTarget;
  }else{
    /* Inserting: zero-length slice at the insertion point. */
    nTarget = 0;
  }

  /* The sense of the following is that there are three possibilities.
  ** If nTarget==nSource, we should not move any memory nor realloc.
  ** If nTarget>nSource, trim target and realloc.
  ** If nTarget<nSource, realloc then expand target.
  */
  if( nTarget>nSource ){
    memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
  }
  if( nTarget!=nSource ){
    int iDoclist = pTarget-d->pData;
    /* NOTE(review): realloc result is unchecked; an OOM here would
    ** crash in the memcpy below — consider handling. */
    d->pData = realloc(d->pData, d->nData+nSource-nTarget);
    pTarget = d->pData+iDoclist;  /* re-derive pointer after realloc */
  }
  if( nTarget<nSource ){
    memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
  }

  memcpy(pTarget, pSource, nSource);
  d->nData += nSource-nTarget;
  r->p = pTarget+nSource;
}
620 | |||
621 | /* Insert/update pUpdate into the doclist. */ | ||
622 | static void docListUpdate(DocList *d, DocList *pUpdate){ | ||
623 | DocListReader reader; | ||
624 | |||
625 | assert( d!=NULL && pUpdate!=NULL ); | ||
626 | assert( d->iType==pUpdate->iType); | ||
627 | |||
628 | readerInit(&reader, d); | ||
629 | docListSpliceElement(&reader, firstDocid(pUpdate), | ||
630 | pUpdate->pData, pUpdate->nData); | ||
631 | } | ||
632 | |||
633 | /* Propagate elements from pUpdate to pAcc, overwriting elements with | ||
634 | ** matching docids. | ||
635 | */ | ||
636 | static void docListAccumulate(DocList *pAcc, DocList *pUpdate){ | ||
637 | DocListReader accReader, updateReader; | ||
638 | |||
639 | /* Handle edge cases where one doclist is empty. */ | ||
640 | assert( pAcc!=NULL ); | ||
641 | if( pUpdate==NULL || pUpdate->nData==0 ) return; | ||
642 | if( pAcc->nData==0 ){ | ||
643 | pAcc->pData = malloc(pUpdate->nData); | ||
644 | memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData); | ||
645 | pAcc->nData = pUpdate->nData; | ||
646 | return; | ||
647 | } | ||
648 | |||
649 | readerInit(&accReader, pAcc); | ||
650 | readerInit(&updateReader, pUpdate); | ||
651 | |||
652 | while( !atEnd(&updateReader) ){ | ||
653 | char *pSource = updateReader.p; | ||
654 | sqlite_int64 iDocid = readDocid(&updateReader); | ||
655 | skipPositionList(&updateReader); | ||
656 | docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource); | ||
657 | } | ||
658 | } | ||
659 | |||
660 | /* | ||
661 | ** Read the next docid off of pIn. Return 0 if we reach the end. | ||
662 | * | ||
663 | * TODO: This assumes that docids are never 0, but they may actually be 0 since | ||
664 | * users can choose docids when inserting into a full-text table. Fix this. | ||
665 | */ | ||
666 | static sqlite_int64 nextDocid(DocListReader *pIn){ | ||
667 | skipPositionList(pIn); | ||
668 | return atEnd(pIn) ? 0 : readDocid(pIn); | ||
669 | } | ||
670 | |||
/*
** pLeft and pRight are two DocListReaders that are pointing to
** positions lists of the same document: iDocid.
**
** If there are no instances in pLeft or pRight where the position
** of pLeft is one less than the position of pRight, then this
** routine adds nothing to pOut.
**
** If there are one or more instances where positions from pLeft
** are exactly one less than positions from pRight, then add a new
** document record to pOut.  If pOut wants to hold positions, then
** include the positions from pRight that are one more than a
** position in pLeft.  In other words: pRight.iPos==pLeft.iPos+1.
**
** pLeft and pRight are left pointing at the next document record.
*/
static void mergePosList(
  DocListReader *pLeft,    /* Left position list */
  DocListReader *pRight,   /* Right position list */
  sqlite_int64 iDocid,     /* The docid from pLeft and pRight */
  DocList *pOut            /* Write the merged document record here */
){
  int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
  int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
  int match = 0;           /* true once the docid has been emitted */

  /* Loop until we've reached the end of both position lists. */
  while( iLeftPos!=-1 && iRightPos!=-1 ){
    if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
      /* Adjacent pair found: emit the docid (once) and, if pOut
      ** stores positions, the right-hand position. */
      if( !match ){
        docListAddDocid(pOut, iDocid);
        match = 1;
      }
      if( pOut->iType>=DL_POSITIONS ){
        docListAddPos(pOut, iRightCol, iRightPos);
      }
      iLeftPos = readPosition(pLeft, &iLeftCol);
      iRightPos = readPosition(pRight, &iRightCol);
    }else if( iRightCol<iLeftCol ||
              (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
      /* Right side is behind: advance it. */
      iRightPos = readPosition(pRight, &iRightCol);
    }else{
      /* Left side is behind: advance it. */
      iLeftPos = readPosition(pLeft, &iLeftCol);
    }
  }
  /* Drain whichever position list was not fully consumed so both
  ** readers end up at the next document record. */
  if( iLeftPos>=0 ) skipPositionList(pLeft);
  if( iRightPos>=0 ) skipPositionList(pRight);
}
719 | |||
720 | /* We have two doclists: pLeft and pRight. | ||
721 | ** Write the phrase intersection of these two doclists into pOut. | ||
722 | ** | ||
723 | ** A phrase intersection means that two documents only match | ||
724 | ** if pLeft.iPos+1==pRight.iPos. | ||
725 | ** | ||
726 | ** The output pOut may or may not contain positions. If pOut | ||
727 | ** does contain positions, they are the positions of pRight. | ||
728 | */ | ||
729 | static void docListPhraseMerge( | ||
730 | DocList *pLeft, /* Doclist resulting from the words on the left */ | ||
731 | DocList *pRight, /* Doclist for the next word to the right */ | ||
732 | DocList *pOut /* Write the combined doclist here */ | ||
733 | ){ | ||
734 | DocListReader left, right; | ||
735 | sqlite_int64 docidLeft, docidRight; | ||
736 | |||
737 | readerInit(&left, pLeft); | ||
738 | readerInit(&right, pRight); | ||
739 | docidLeft = nextDocid(&left); | ||
740 | docidRight = nextDocid(&right); | ||
741 | |||
742 | while( docidLeft>0 && docidRight>0 ){ | ||
743 | if( docidLeft<docidRight ){ | ||
744 | docidLeft = nextDocid(&left); | ||
745 | }else if( docidRight<docidLeft ){ | ||
746 | docidRight = nextDocid(&right); | ||
747 | }else{ | ||
748 | mergePosList(&left, &right, docidLeft, pOut); | ||
749 | docidLeft = nextDocid(&left); | ||
750 | docidRight = nextDocid(&right); | ||
751 | } | ||
752 | } | ||
753 | } | ||
754 | |||
755 | /* We have two doclists: pLeft and pRight. | ||
756 | ** Write the intersection of these two doclists into pOut. | ||
757 | ** Only docids are matched. Position information is ignored. | ||
758 | ** | ||
759 | ** The output pOut never holds positions. | ||
760 | */ | ||
761 | static void docListAndMerge( | ||
762 | DocList *pLeft, /* Doclist resulting from the words on the left */ | ||
763 | DocList *pRight, /* Doclist for the next word to the right */ | ||
764 | DocList *pOut /* Write the combined doclist here */ | ||
765 | ){ | ||
766 | DocListReader left, right; | ||
767 | sqlite_int64 docidLeft, docidRight; | ||
768 | |||
769 | assert( pOut->iType<DL_POSITIONS ); | ||
770 | |||
771 | readerInit(&left, pLeft); | ||
772 | readerInit(&right, pRight); | ||
773 | docidLeft = nextDocid(&left); | ||
774 | docidRight = nextDocid(&right); | ||
775 | |||
776 | while( docidLeft>0 && docidRight>0 ){ | ||
777 | if( docidLeft<docidRight ){ | ||
778 | docidLeft = nextDocid(&left); | ||
779 | }else if( docidRight<docidLeft ){ | ||
780 | docidRight = nextDocid(&right); | ||
781 | }else{ | ||
782 | docListAddDocid(pOut, docidLeft); | ||
783 | docidLeft = nextDocid(&left); | ||
784 | docidRight = nextDocid(&right); | ||
785 | } | ||
786 | } | ||
787 | } | ||
788 | |||
/* We have two doclists: pLeft and pRight.
** Write the union of these two doclists into pOut.
** Only docids are matched.  Position information is ignored.
**
** The output pOut never holds positions.
*/
static void docListOrMerge(
  DocList *pLeft,    /* Doclist resulting from the words on the left */
  DocList *pRight,   /* Doclist for the next word to the right */
  DocList *pOut      /* Write the combined doclist here */
){
  DocListReader left, right;
  sqlite_int64 docidLeft, docidRight, priorLeft;

  readerInit(&left, pLeft);
  readerInit(&right, pRight);
  docidLeft = nextDocid(&left);
  docidRight = nextDocid(&right);

  while( docidLeft>0 && docidRight>0 ){
    /* Emit the smaller docid (the left one on a tie). */
    if( docidLeft<=docidRight ){
      docListAddDocid(pOut, docidLeft);
    }else{
      docListAddDocid(pOut, docidRight);
    }
    /* Advance whichever side(s) supplied the emitted docid; when the
    ** docids are equal, both sides advance so no duplicate is emitted
    ** (priorLeft holds the left docid from before its advance). */
    priorLeft = docidLeft;
    if( docidLeft<=docidRight ){
      docidLeft = nextDocid(&left);
    }
    if( docidRight>0 && docidRight<=priorLeft ){
      docidRight = nextDocid(&right);
    }
  }
  /* Copy through whatever remains of the longer input. */
  while( docidLeft>0 ){
    docListAddDocid(pOut, docidLeft);
    docidLeft = nextDocid(&left);
  }
  while( docidRight>0 ){
    docListAddDocid(pOut, docidRight);
    docidRight = nextDocid(&right);
  }
}
831 | |||
/* We have two doclists: pLeft and pRight.
** Write into pOut all documents that occur in pLeft but not
** in pRight.
**
** Only docids are matched.  Position information is ignored.
**
** The output pOut never holds positions.
*/
static void docListExceptMerge(
  DocList *pLeft,    /* Doclist resulting from the words on the left */
  DocList *pRight,   /* Doclist for the next word to the right */
  DocList *pOut      /* Write the combined doclist here */
){
  DocListReader left, right;
  sqlite_int64 docidLeft, docidRight, priorLeft;

  readerInit(&left, pLeft);
  readerInit(&right, pRight);
  docidLeft = nextDocid(&left);
  docidRight = nextDocid(&right);

  /* Ordered merge.  Both inputs are assumed to be in ascending docid
  ** order; a docid<=0 means that reader is exhausted. */
  while( docidLeft>0 && docidRight>0 ){
    /* priorLeft preserves the pre-advance left docid so the
    ** right-hand advance test is unaffected by docidLeft moving. */
    priorLeft = docidLeft;
    /* Keep a left docid only once the right list has moved strictly
    ** past it, proving it does not occur in pRight. */
    if( docidLeft<docidRight ){
      docListAddDocid(pOut, docidLeft);
    }
    if( docidLeft<=docidRight ){
      docidLeft = nextDocid(&left);
    }
    if( docidRight>0 && docidRight<=priorLeft ){
      docidRight = nextDocid(&right);
    }
  }
  /* Once pRight is exhausted, everything remaining in pLeft is kept. */
  while( docidLeft>0 ){
    docListAddDocid(pOut, docidLeft);
    docidLeft = nextDocid(&left);
  }
}
870 | |||
/* Return a malloc()ed, NUL-terminated copy of the first n bytes of s.
** The caller must free() the result.  Returns NULL if the allocation
** fails.
**
** Fix: the original wrote through the malloc() result without
** checking for failure, which is undefined behavior on OOM.
*/
static char *string_dup_n(const char *s, int n){
  char *str = malloc(n + 1);
  if( str==NULL ) return NULL;
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
877 | |||
/* Make a NUL-terminated heap copy of the string s.  The caller owns
 * the returned buffer and must free() it.
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
static char *string_dup(const char *s){
  size_t nByte = strlen(s);
  char *zCopy = malloc(nByte + 1);
  memcpy(zCopy, s, nByte);
  zCopy[nByte] = '\0';
  return zCopy;
}
884 | |||
/* Format a string, replacing each occurrence of the % character with
 * zDb.zName.  This may be more convenient than sqlite_mprintf()
 * when one string is used repeatedly in a format string.
 * The caller must free() the returned string.  Returns NULL if the
 * output buffer cannot be allocated.
 *
 * Fix: the original wrote through an unchecked malloc() result,
 * which is undefined behavior on OOM. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* strlen("zDb.zName") */
  char *result;
  char *r;

  /* First pass: compute the output length, counting each '%' as a
  ** full "zDb.zName" expansion. */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = malloc(len);
  if( result==NULL ) return NULL;

  /* Second pass: emit the expanded string. */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
921 | |||
922 | static int sql_exec(sqlite3 *db, const char *zDb, const char *zName, | ||
923 | const char *zFormat){ | ||
924 | char *zCommand = string_format(zFormat, zDb, zName); | ||
925 | int rc; | ||
926 | TRACE(("FTS1 sql: %s\n", zCommand)); | ||
927 | rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); | ||
928 | free(zCommand); | ||
929 | return rc; | ||
930 | } | ||
931 | |||
932 | static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName, | ||
933 | sqlite3_stmt **ppStmt, const char *zFormat){ | ||
934 | char *zCommand = string_format(zFormat, zDb, zName); | ||
935 | int rc; | ||
936 | TRACE(("FTS1 prepare: %s\n", zCommand)); | ||
937 | rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); | ||
938 | free(zCommand); | ||
939 | return rc; | ||
940 | } | ||
941 | |||
942 | /* end utility functions */ | ||
943 | |||
944 | /* Forward reference */ | ||
945 | typedef struct fulltext_vtab fulltext_vtab; | ||
946 | |||
/* A single term in a query is represented by an instance of
** the following structure.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;
959 | |||
960 | |||
961 | /* A query string is parsed into a Query structure. | ||
962 | * | ||
963 | * We could, in theory, allow query strings to be complicated | ||
964 | * nested expressions with precedence determined by parentheses. | ||
965 | * But none of the major search engines do this. (Perhaps the | ||
 * feeling is that a parenthesized expression is too complex of
967 | * an idea for the average user to grasp.) Taking our lead from | ||
968 | * the major search engines, we will allow queries to be a list | ||
969 | * of terms (with an implied AND operator) or phrases in double-quotes, | ||
970 | * with a single optional "-" before each non-phrase term to designate | ||
971 | * negation and an optional OR connector. | ||
972 | * | ||
973 | * OR binds more tightly than the implied AND, which is what the | ||
974 | * major search engines seem to do. So, for example: | ||
975 | * | ||
976 | * [one two OR three] ==> one AND (two OR three) | ||
977 | * [one OR two three] ==> (one OR two) AND three | ||
978 | * | ||
979 | * A "-" before a term matches all entries that lack that term. | ||
 * The "-" must occur immediately before the term with no intervening
981 | * space. This is how the search engines do it. | ||
982 | * | ||
983 | * A NOT term cannot be the right-hand operand of an OR. If this | ||
984 | * occurs in the query string, the NOT is ignored: | ||
985 | * | ||
986 | * [one OR -two] ==> one OR two | ||
987 | * | ||
988 | */ | ||
/* A parsed query string; see the grammar notes in the comment above. */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;
997 | |||
998 | |||
/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.  One Snippet is
** cached per cursor (see fulltext_cursor.snippet).
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;      /* Status flag for use while constructing snippets */
    short int iCol;     /* The column that contains the match */
    short int iTerm;    /* The index in Query.pTerms[] of the matching term */
    short int nByte;    /* Number of bytes in the term */
    int iStart;         /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;
1018 | |||
1019 | |||
/* The scan strategy for a cursor.  Stored in
** fulltext_cursor.iCursorType as a copy of sqlite3_index_info.idxNum. */
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
1025 | |||
1026 | /* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0 | ||
1027 | ** before we start aggregating into larger segments. Lower CHUNK_MAX | ||
1028 | ** means that for a given input we have more individual segments per | ||
1029 | ** term, which means more rows in the table and a bigger index (due to | ||
1030 | ** both more rows and bigger rowids). But it also reduces the average | ||
1031 | ** cost of adding new elements to the segment 0 doclist, and it seems | ||
1032 | ** to reduce the number of pages read and written during inserts. 256 | ||
1033 | ** was chosen by measuring insertion times for a certain input (first | ||
1034 | ** 10k documents of Enron corpus), though including query performance | ||
1035 | ** in the decision may argue for a larger value. | ||
1036 | */ | ||
1037 | #define CHUNK_MAX 256 | ||
1038 | |||
/* Indexes of the prepared statements cached in
** fulltext_vtab.pFulltextStatements[].  Must be kept in exact
** correspondence with fulltext_zStatement[] below. */
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,

  TERM_SELECT_STMT,
  TERM_SELECT_ALL_STMT,
  TERM_INSERT_STMT,
  TERM_UPDATE_STMT,
  TERM_DELETE_STMT,

  MAX_STMT /* Always at end! */
} fulltext_statement;
1053 | |||
/* These must exactly match the enum above.  The '%' in each statement
** is expanded to "zDb.zName" by string_format() before preparation.
** NULL entries are generated on demand in sql_get_statement(). */
/* TODO(adam): Is there some risk that a statement (in particular,
** pTermSelectStmt) will be used in two cursors at once, e.g.  if a
** query joins a virtual table to itself?  If so perhaps we should
** move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where rowid = ?",

  /* TERM_SELECT */
  "select rowid, doclist from %_term where term = ? and segment = ?",
  /* TERM_SELECT_ALL */
  "select doclist from %_term where term = ? order by segment",
  /* TERM_INSERT */
  "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
  /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
  /* TERM_DELETE */ "delete from %_term where rowid = ?",
};
1075 | |||
/*
** A connection to a fulltext index is an instance of the following
** structure.  The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
** arguments.
*/
struct fulltext_vtab {
  sqlite3_vtab base;               /* Base class used by SQLite core */
  sqlite3 *db;                     /* The database connection */
  const char *zDb;                 /* logical database name */
  const char *zName;               /* virtual table name */
  int nColumn;                     /* number of columns in virtual table */
  char **azColumn;                 /* column names.  malloced */
  char **azContentColumn;          /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open.  Entries are prepared lazily by sql_get_statement() and
  ** finalized in fulltext_vtab_destroy().
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];
};
1098 | |||
/*
** When the core wants to do a query, it create a cursor using a
** call to xOpen.  This structure is an instance of a cursor.  It
** is destroyed by xClose.
*/
typedef struct fulltext_cursor {
  sqlite3_vtab_cursor base;  /* Base class used by SQLite core */
  QueryType iCursorType;     /* Copy of sqlite3_index_info.idxNum */
  sqlite3_stmt *pStmt;       /* Prepared statement in use by the cursor */
  int eof;                   /* True if at End Of Results */
  Query q;                   /* Parsed query string */
  Snippet snippet;           /* Cached snippet for the current row */
  int iColumn;               /* Column being searched */
  DocListReader result;      /* used when iCursorType == QUERY_FULLTEXT */
} fulltext_cursor;
1114 | |||
1115 | static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ | ||
1116 | return (fulltext_vtab *) c->base.pVtab; | ||
1117 | } | ||
1118 | |||
1119 | static const sqlite3_module fulltextModule; /* forward declaration */ | ||
1120 | |||
1121 | /* Append a list of strings separated by commas to a StringBuffer. */ | ||
1122 | static void appendList(StringBuffer *sb, int nString, char **azString){ | ||
1123 | int i; | ||
1124 | for(i=0; i<nString; ++i){ | ||
1125 | if( i>0 ) append(sb, ", "); | ||
1126 | append(sb, azString[i]); | ||
1127 | } | ||
1128 | } | ||
1129 | |||
1130 | /* Return a dynamically generated statement of the form | ||
1131 | * insert into %_content (rowid, ...) values (?, ...) | ||
1132 | */ | ||
1133 | static const char *contentInsertStatement(fulltext_vtab *v){ | ||
1134 | StringBuffer sb; | ||
1135 | int i; | ||
1136 | |||
1137 | initStringBuffer(&sb); | ||
1138 | append(&sb, "insert into %_content (rowid, "); | ||
1139 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
1140 | append(&sb, ") values (?"); | ||
1141 | for(i=0; i<v->nColumn; ++i) | ||
1142 | append(&sb, ", ?"); | ||
1143 | append(&sb, ")"); | ||
1144 | return sb.s; | ||
1145 | } | ||
1146 | |||
1147 | /* Return a dynamically generated statement of the form | ||
1148 | * update %_content set [col_0] = ?, [col_1] = ?, ... | ||
1149 | * where rowid = ? | ||
1150 | */ | ||
1151 | static const char *contentUpdateStatement(fulltext_vtab *v){ | ||
1152 | StringBuffer sb; | ||
1153 | int i; | ||
1154 | |||
1155 | initStringBuffer(&sb); | ||
1156 | append(&sb, "update %_content set "); | ||
1157 | for(i=0; i<v->nColumn; ++i) { | ||
1158 | if( i>0 ){ | ||
1159 | append(&sb, ", "); | ||
1160 | } | ||
1161 | append(&sb, v->azContentColumn[i]); | ||
1162 | append(&sb, " = ?"); | ||
1163 | } | ||
1164 | append(&sb, " where rowid = ?"); | ||
1165 | return sb.s; | ||
1166 | } | ||
1167 | |||
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    /* The INSERT and UPDATE statements depend on the column list and
    ** are generated on the fly; everything else comes from the
    ** static fulltext_zStatement[] table. */
    switch( iStmt ){
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                     zStmt);
    /* Generated statements (those not found in fulltext_zStatement[])
    ** are freed here; the static table entries must not be freed. */
    if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}
1198 | |||
1199 | /* Step the indicated statement, handling errors SQLITE_BUSY (by | ||
1200 | ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring | ||
1201 | ** bindings to the new statement). | ||
1202 | ** TODO(adam): We should extend this function so that it can work with | ||
1203 | ** statements declared locally, not only globally cached statements. | ||
1204 | */ | ||
1205 | static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, | ||
1206 | sqlite3_stmt **ppStmt){ | ||
1207 | int rc; | ||
1208 | sqlite3_stmt *s = *ppStmt; | ||
1209 | assert( iStmt<MAX_STMT ); | ||
1210 | assert( s==v->pFulltextStatements[iStmt] ); | ||
1211 | |||
1212 | while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ | ||
1213 | if( rc==SQLITE_BUSY ) continue; | ||
1214 | if( rc!=SQLITE_ERROR ) return rc; | ||
1215 | |||
1216 | /* If an SQLITE_SCHEMA error has occured, then finalizing this | ||
1217 | * statement is going to delete the fulltext_vtab structure. If | ||
1218 | * the statement just executed is in the pFulltextStatements[] | ||
1219 | * array, it will be finalized twice. So remove it before | ||
1220 | * calling sqlite3_finalize(). | ||
1221 | */ | ||
1222 | v->pFulltextStatements[iStmt] = NULL; | ||
1223 | rc = sqlite3_finalize(s); | ||
1224 | break; | ||
1225 | } | ||
1226 | return rc; | ||
1227 | |||
1228 | err: | ||
1229 | sqlite3_finalize(s); | ||
1230 | return rc; | ||
1231 | } | ||
1232 | |||
1233 | /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. | ||
1234 | ** Useful for statements like UPDATE, where we expect no results. | ||
1235 | */ | ||
1236 | static int sql_single_step_statement(fulltext_vtab *v, | ||
1237 | fulltext_statement iStmt, | ||
1238 | sqlite3_stmt **ppStmt){ | ||
1239 | int rc = sql_step_statement(v, iStmt, ppStmt); | ||
1240 | return (rc==SQLITE_DONE) ? SQLITE_OK : rc; | ||
1241 | } | ||
1242 | |||
/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
                          sqlite3_value **pValues){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* Parameter 1 is the rowid; parameters 2..nColumn+1 are the column
  ** values in declaration order (see contentInsertStatement()). */
  rc = sqlite3_bind_value(s, 1, rowid);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 2+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
}
1261 | |||
/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
 * where rowid = [iRowid] */
static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
                          sqlite_int64 iRowid){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* Parameters 1..nColumn are the new column values... */
  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 1+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* ...and the final parameter is the rowid in the WHERE clause. */
  rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
}
1281 | |||
/* Free the nString strings in pString[] (NULL entries are permitted)
** and then the array itself.  The array and each non-NULL element
** must have come from malloc().
**
** Fix: dropped the redundant NULL guard around free(); free(NULL)
** is defined as a no-op by the C standard.
*/
static void freeStringArray(int nString, const char **pString){
  int i;

  for (i=0 ; i < nString ; ++i) {
    free((void *) pString[i]);
  }
  free((void *) pString);
}
1290 | |||
/* select * from %_content where rowid = [iRow]
 * The caller must delete the returned array and all strings in it
 * (see freeStringArray()).
 * null fields will be NULL in the returned array.
 *
 * TODO: Perhaps we should return pointer/length strings here for consistency
 * with other code which uses pointer/length. */
static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
                          const char ***pValues){
  sqlite3_stmt *s;
  const char **values;
  int i;
  int rc;

  *pValues = NULL;

  rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iRow);
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
  if( rc!=SQLITE_ROW ) return rc;

  /* NOTE(review): the malloc() result is used unchecked here,
  ** matching the OOM policy of the rest of this file. */
  values = (const char **) malloc(v->nColumn * sizeof(const char *));
  for(i=0; i<v->nColumn; ++i){
    if( sqlite3_column_type(s, i)==SQLITE_NULL ){
      values[i] = NULL;
    }else{
      /* Copy the column text now; it is invalidated by the
      ** sqlite3_step() below. */
      values[i] = string_dup((char*)sqlite3_column_text(s, i));
    }
  }

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ){
    *pValues = values;
    return SQLITE_OK;
  }

  freeStringArray(v->nColumn, values);
  return rc;
}
1335 | |||
1336 | /* delete from %_content where rowid = [iRow ] */ | ||
1337 | static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ | ||
1338 | sqlite3_stmt *s; | ||
1339 | int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); | ||
1340 | if( rc!=SQLITE_OK ) return rc; | ||
1341 | |||
1342 | rc = sqlite3_bind_int64(s, 1, iRow); | ||
1343 | if( rc!=SQLITE_OK ) return rc; | ||
1344 | |||
1345 | return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); | ||
1346 | } | ||
1347 | |||
/* select rowid, doclist from %_term
 * where term = [pTerm] and segment = [iSegment]
 * If found, returns SQLITE_ROW; the caller must free the
 * returned doclist.  If no rows found, returns SQLITE_DONE. */
static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
                       int iSegment,
                       sqlite_int64 *rowid, DocList *out){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 2, iSegment);
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
  if( rc!=SQLITE_ROW ) return rc;

  *rowid = sqlite3_column_int64(s, 0);
  /* NOTE(review): the doclist blob is handed to docListInit() and the
  ** statement is stepped again below, which invalidates column blobs;
  ** docListInit() presumably copies the bytes — verify against its
  ** definition. */
  docListInit(out, DL_DEFAULT,
              sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  return rc==SQLITE_DONE ? SQLITE_ROW : rc;
}
1377 | |||
/* Load the segment doclists for term pTerm and merge them in
** appropriate order into out.  Returns SQLITE_OK if successful.  If
** there are no segments for pTerm, successfully returns an empty
** doclist in out.
**
** Each document consists of 1 or more "columns".  The number of
** columns is v->nColumn.  If iColumn==v->nColumn, then return
** position information about all columns.  If iColumn<v->nColumn,
** then only return position information about the iColumn-th column
** (where the first column is 0).
*/
static int term_select_all(
  fulltext_vtab *v,     /* The fulltext index we are querying against */
  int iColumn,          /* If <nColumn, only look at the iColumn-th column */
  const char *pTerm,    /* The term whose posting lists we want */
  int nTerm,            /* Number of bytes in pTerm */
  DocList *out          /* Write the resulting doclist here */
){
  DocList doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  docListInit(&doclist, DL_DEFAULT, 0, 0);

  /* TODO(shess) Handle schema and busy errors. */
  /* Rows arrive in ascending segment order (see TERM_SELECT_ALL in
  ** fulltext_zStatement[]); per the comment below, the accumulated
  ** doclist always holds newer data than the incoming row. */
  while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
    DocList old;

    /* TODO(shess) If we processed doclists from oldest to newest, we
    ** could skip the malloc() involved with the following call.  For
    ** now, I'd rather keep this logic similar to index_insert_term().
    ** We could additionally drop elements when we see deletes, but
    ** that would require a distinct version of docListAccumulate().
    */
    docListInit(&old, DL_DEFAULT,
                sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));

    if( iColumn<v->nColumn ){   /* querying a single column */
      docListRestrictColumn(&old, iColumn);
    }

    /* doclist contains the newer data, so write it over old.  Then
    ** steal accumulated result for doclist.
    */
    docListAccumulate(&old, &doclist);
    docListDestroy(&doclist);
    doclist = old;
  }
  if( rc!=SQLITE_DONE ){
    docListDestroy(&doclist);
    return rc;
  }

  docListDiscardEmpty(&doclist);
  *out = doclist;
  return SQLITE_OK;
}
1439 | |||
/* insert into %_term (rowid, term, segment, doclist)
   values ([piRowid], [pTerm], [iSegment], [doclist])
** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
**
** NOTE(shess) piRowid is IN, with values of "space of int64" plus
** null, it is not used to pass data back to the caller.
*/
static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
                       const char *pTerm, int nTerm,
                       int iSegment, DocList *doclist){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* Binding NULL makes sqlite choose the rowid itself. */
  if( piRowid==NULL ){
    rc = sqlite3_bind_null(s, 1);
  }else{
    rc = sqlite3_bind_int64(s, 1, *piRowid);
  }
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 3, iSegment);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
}
1472 | |||
1473 | /* update %_term set doclist = [doclist] where rowid = [rowid] */ | ||
1474 | static int term_update(fulltext_vtab *v, sqlite_int64 rowid, | ||
1475 | DocList *doclist){ | ||
1476 | sqlite3_stmt *s; | ||
1477 | int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); | ||
1478 | if( rc!=SQLITE_OK ) return rc; | ||
1479 | |||
1480 | rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC); | ||
1481 | if( rc!=SQLITE_OK ) return rc; | ||
1482 | |||
1483 | rc = sqlite3_bind_int64(s, 2, rowid); | ||
1484 | if( rc!=SQLITE_OK ) return rc; | ||
1485 | |||
1486 | return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); | ||
1487 | } | ||
1488 | |||
1489 | static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ | ||
1490 | sqlite3_stmt *s; | ||
1491 | int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); | ||
1492 | if( rc!=SQLITE_OK ) return rc; | ||
1493 | |||
1494 | rc = sqlite3_bind_int64(s, 1, rowid); | ||
1495 | if( rc!=SQLITE_OK ) return rc; | ||
1496 | |||
1497 | return sql_single_step_statement(v, TERM_DELETE_STMT, &s); | ||
1498 | } | ||
1499 | |||
/*
** Free the memory used to contain a fulltext_vtab structure.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
  int iStmt, i;

  TRACE(("FTS1 Destroy %p\n", v));
  /* Finalize any statements cached by sql_get_statement(). */
  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
    if( v->pFulltextStatements[iStmt]!=NULL ){
      sqlite3_finalize(v->pFulltextStatements[iStmt]);
      v->pFulltextStatements[iStmt] = NULL;
    }
  }

  if( v->pTokenizer!=NULL ){
    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
    v->pTokenizer = NULL;
  }

  /* NOTE(review): azContentColumn elements use sqlite3_free() while
  ** the arrays themselves use free(), and azColumn's elements are not
  ** freed individually here — presumably they live inside a single
  ** allocation or are released elsewhere; verify against the
  ** constructor's allocation sites before changing. */
  free(v->azColumn);
  for(i = 0; i < v->nColumn; ++i) {
    sqlite3_free(v->azContentColumn[i]);
  }
  free(v->azContentColumn);
  free(v);
}
1526 | |||
1527 | /* | ||
1528 | ** Token types for parsing the arguments to xConnect or xCreate. | ||
1529 | */ | ||
1530 | #define TOKEN_EOF 0 /* End of file */ | ||
1531 | #define TOKEN_SPACE 1 /* Any kind of whitespace */ | ||
1532 | #define TOKEN_ID 2 /* An identifier */ | ||
1533 | #define TOKEN_STRING 3 /* A string literal */ | ||
1534 | #define TOKEN_PUNCT 4 /* A single punctuation character */ | ||
1535 | |||
/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters,
** sqlite3IsIdChar[X] must be 1.
**
** Ticket #1066.  The SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* NOTE: IdChar() assigns its argument to a variable named "c" that
** must be in scope at the point of use (see getToken()), so the
** argument is evaluated only once. */
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
1559 | |||
1560 | |||
/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
**
** Token types are the TOKEN_* constants defined above.  A returned
** length of 0 (with TOKEN_EOF) marks the end of the input string.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;   /* "c" is also written implicitly by the IdChar() macro */
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      /* Collapse a run of whitespace into one TOKEN_SPACE token */
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      /* Quoted string literal.  A doubled delimiter inside the string
      ** is an escaped delimiter and does not terminate it. */
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;
          }else{
            break;
          }
        }
      }
      *tokenType = TOKEN_STRING;
      /* Count the closing delimiter only if one was actually found */
      return i + (c!=0);
    }
    case '[': {
      /* [identifier] quoting, MS SQL Server style */
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;
      }
      /* Unquoted identifier: consume the run of identifier chars */
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  /* Any other single character is punctuation */
  *tokenType = TOKEN_PUNCT;
  return 1;
}
1610 | |||
/*
** A token extracted from a string is an instance of the following
** structure.  The text is NOT copied: z points into the original
** input buffer, so a Token is only valid while that buffer is.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes. */
} Token;
1619 | |||
1620 | /* | ||
1621 | ** Given a input string (which is really one of the argv[] parameters | ||
1622 | ** passed into xConnect or xCreate) split the string up into tokens. | ||
1623 | ** Return an array of pointers to '\000' terminated strings, one string | ||
1624 | ** for each non-whitespace token. | ||
1625 | ** | ||
1626 | ** The returned array is terminated by a single NULL pointer. | ||
1627 | ** | ||
1628 | ** Space to hold the returned array is obtained from a single | ||
1629 | ** malloc and should be freed by passing the return value to free(). | ||
1630 | ** The individual strings within the token list are all a part of | ||
1631 | ** the single memory allocation and will all be freed at once. | ||
1632 | */ | ||
1633 | static char **tokenizeString(const char *z, int *pnToken){ | ||
1634 | int nToken = 0; | ||
1635 | Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) ); | ||
1636 | int n = 1; | ||
1637 | int e, i; | ||
1638 | int totalSize = 0; | ||
1639 | char **azToken; | ||
1640 | char *zCopy; | ||
1641 | while( n>0 ){ | ||
1642 | n = getToken(z, &e); | ||
1643 | if( e!=TOKEN_SPACE ){ | ||
1644 | aToken[nToken].z = z; | ||
1645 | aToken[nToken].n = n; | ||
1646 | nToken++; | ||
1647 | totalSize += n+1; | ||
1648 | } | ||
1649 | z += n; | ||
1650 | } | ||
1651 | azToken = (char**)malloc( nToken*sizeof(char*) + totalSize ); | ||
1652 | zCopy = (char*)&azToken[nToken]; | ||
1653 | nToken--; | ||
1654 | for(i=0; i<nToken; i++){ | ||
1655 | azToken[i] = zCopy; | ||
1656 | n = aToken[i].n; | ||
1657 | memcpy(zCopy, aToken[i].z, n); | ||
1658 | zCopy[n] = 0; | ||
1659 | zCopy += n+1; | ||
1660 | } | ||
1661 | azToken[nToken] = 0; | ||
1662 | free(aToken); | ||
1663 | *pnToken = nToken; | ||
1664 | return azToken; | ||
1665 | } | ||
1666 | |||
/*
** Strip SQL-style quote characters from the nul-terminated string z,
** writing the result back over the original buffer (in-place).  A
** doubled quote character inside the string collapses to a single
** quote.  If z does not begin with a recognized quote character the
** string is left untouched.
**
** Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
*/
static void dequoteString(char *z){
  int zClose;           /* Expected closing quote character */
  int iIn = 1;          /* Index of next character to read */
  int iOut = 0;         /* Index of next slot to write */
  if( z==0 ) return;
  switch( z[0] ){
    case '\'':
    case '"':
    case '`':                    /* Backtick, for MySQL compatibility */
      zClose = z[0];
      break;
    case '[':                    /* For MS SqlServer compatibility */
      zClose = ']';
      break;
    default:
      return;                    /* Not quoted: nothing to do */
  }
  while( z[iIn] ){
    if( z[iIn]==zClose ){
      if( z[iIn+1]==zClose ){
        z[iOut++] = zClose;      /* Doubled quote: emit one copy */
        iIn++;
      }else{
        z[iOut++] = 0;           /* Closing quote: terminate and stop */
        break;
      }
    }else{
      z[iOut++] = z[iIn];
    }
    iIn++;
  }
}
1706 | |||
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.
**
** Example:
**
**     input:   tokenize chinese ( 'simplifed' , 'mixed' )
**     output:  chinese simplifed mixed
**
** Another example:
**
**     input:   delimiters ( '[' , ']' , '...' )
**     output:  [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    /* j lags i by one kept token so the first kept token is dropped */
    for(i=0, j=-1; azIn[i]; i++){
      /* Keep tokens that start alphanumeric or are longer than one
      ** character; single punctuation characters are discarded. */
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    /* If no token survived the filter, j is still -1; terminate at
    ** slot 0 rather than writing out of bounds at azIn[-1]. */
    azIn[j>=0 ? j : 0] = 0;
  }
}
1737 | |||
1738 | |||
/*
** Find the first alphanumeric token in the string zIn.  Null-terminate
** this token.  Remove any quotation marks.  And return a pointer to
** the result.  Returns NULL (with *pzTail set to the end of input)
** if no token is found.
*/
static char *firstToken(char *zIn, char **pzTail){
  int n, ttype;
  while(1){
    n = getToken(zIn, &ttype);
    if( ttype==TOKEN_SPACE ){
      zIn += n;                   /* skip leading whitespace */
    }else if( ttype==TOKEN_EOF ){
      *pzTail = zIn;
      return 0;                   /* nothing but whitespace */
    }else{
      zIn[n] = 0;                 /* terminate the token in place */
      /* NOTE(review): this looks like it should be &zIn[n+1]; harmless
      ** at present because the only caller (parseSpec) discards the
      ** tail -- verify before reusing this function. */
      *pzTail = &zIn[1];
      dequoteString(zIn);
      return zIn;
    }
  }
  /*NOTREACHED*/
}
1762 | |||
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not a alphanumeric
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  const char *p = s;
  const char *q = t;
  while( safe_isspace(*p) ){ p++; }
  for(; *q; p++, q++){
    if( safe_tolower(*p)!=safe_tolower(*q) ) return 0;
  }
  /* The prefix matched; it is a whole token only if the next
  ** character cannot continue an identifier. */
  return *p!='_' && !safe_isalnum(*p);
}
1781 | |||
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and use by fulltextConnect and fulltextCreate.
**
** zDb, zName and the azColumn[] entries all point into the single
** allocation behind azColumn (see parseSpec), so freeing azColumn
** releases them all.  azContentColumn[] entries are individually
** allocated with sqlite3_mprintf().
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
1795 | |||
1796 | /* | ||
1797 | ** Reclaim all of the memory used by a TableSpec | ||
1798 | */ | ||
1799 | static void clearTableSpec(TableSpec *p) { | ||
1800 | free(p->azColumn); | ||
1801 | free(p->azContentColumn); | ||
1802 | free(p->azTokenizer); | ||
1803 | } | ||
1804 | |||
1805 | /* Parse a CREATE VIRTUAL TABLE statement, which looks like this: | ||
1806 | * | ||
1807 | * CREATE VIRTUAL TABLE email | ||
1808 | * USING fts1(subject, body, tokenize mytokenizer(myarg)) | ||
1809 | * | ||
1810 | * We return parsed information in a TableSpec structure. | ||
1811 | * | ||
1812 | */ | ||
1813 | static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, | ||
1814 | char**pzErr){ | ||
1815 | int i, n; | ||
1816 | char *z, *zDummy; | ||
1817 | char **azArg; | ||
1818 | const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */ | ||
1819 | |||
1820 | assert( argc>=3 ); | ||
1821 | /* Current interface: | ||
1822 | ** argv[0] - module name | ||
1823 | ** argv[1] - database name | ||
1824 | ** argv[2] - table name | ||
1825 | ** argv[3..] - columns, optionally followed by tokenizer specification | ||
1826 | ** and snippet delimiters specification. | ||
1827 | */ | ||
1828 | |||
1829 | /* Make a copy of the complete argv[][] array in a single allocation. | ||
1830 | ** The argv[][] array is read-only and transient. We can write to the | ||
1831 | ** copy in order to modify things and the copy is persistent. | ||
1832 | */ | ||
1833 | memset(pSpec, 0, sizeof(*pSpec)); | ||
1834 | for(i=n=0; i<argc; i++){ | ||
1835 | n += strlen(argv[i]) + 1; | ||
1836 | } | ||
1837 | azArg = malloc( sizeof(char*)*argc + n ); | ||
1838 | if( azArg==0 ){ | ||
1839 | return SQLITE_NOMEM; | ||
1840 | } | ||
1841 | z = (char*)&azArg[argc]; | ||
1842 | for(i=0; i<argc; i++){ | ||
1843 | azArg[i] = z; | ||
1844 | strcpy(z, argv[i]); | ||
1845 | z += strlen(z)+1; | ||
1846 | } | ||
1847 | |||
1848 | /* Identify the column names and the tokenizer and delimiter arguments | ||
1849 | ** in the argv[][] array. | ||
1850 | */ | ||
1851 | pSpec->zDb = azArg[1]; | ||
1852 | pSpec->zName = azArg[2]; | ||
1853 | pSpec->nColumn = 0; | ||
1854 | pSpec->azColumn = azArg; | ||
1855 | zTokenizer = "tokenize simple"; | ||
1856 | for(i=3; i<argc; ++i){ | ||
1857 | if( startsWith(azArg[i],"tokenize") ){ | ||
1858 | zTokenizer = azArg[i]; | ||
1859 | }else{ | ||
1860 | z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy); | ||
1861 | pSpec->nColumn++; | ||
1862 | } | ||
1863 | } | ||
1864 | if( pSpec->nColumn==0 ){ | ||
1865 | azArg[0] = "content"; | ||
1866 | pSpec->nColumn = 1; | ||
1867 | } | ||
1868 | |||
1869 | /* | ||
1870 | ** Construct the list of content column names. | ||
1871 | ** | ||
1872 | ** Each content column name will be of the form cNNAAAA | ||
1873 | ** where NN is the column number and AAAA is the sanitized | ||
1874 | ** column name. "sanitized" means that special characters are | ||
1875 | ** converted to "_". The cNN prefix guarantees that all column | ||
1876 | ** names are unique. | ||
1877 | ** | ||
1878 | ** The AAAA suffix is not strictly necessary. It is included | ||
1879 | ** for the convenience of people who might examine the generated | ||
1880 | ** %_content table and wonder what the columns are used for. | ||
1881 | */ | ||
1882 | pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) ); | ||
1883 | if( pSpec->azContentColumn==0 ){ | ||
1884 | clearTableSpec(pSpec); | ||
1885 | return SQLITE_NOMEM; | ||
1886 | } | ||
1887 | for(i=0; i<pSpec->nColumn; i++){ | ||
1888 | char *p; | ||
1889 | pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); | ||
1890 | for (p = pSpec->azContentColumn[i]; *p ; ++p) { | ||
1891 | if( !safe_isalnum(*p) ) *p = '_'; | ||
1892 | } | ||
1893 | } | ||
1894 | |||
1895 | /* | ||
1896 | ** Parse the tokenizer specification string. | ||
1897 | */ | ||
1898 | pSpec->azTokenizer = tokenizeString(zTokenizer, &n); | ||
1899 | tokenListToIdList(pSpec->azTokenizer); | ||
1900 | |||
1901 | return SQLITE_OK; | ||
1902 | } | ||
1903 | |||
/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table.  Return a pointer to this schema string.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int i;
  char *z;
  const char *zSep = "(";

  /* The %z specifier consumes (and frees) its argument, so the schema
  ** string can be grown without explicit sqlite3_free() calls. */
  z = sqlite3_mprintf("CREATE TABLE x");
  for(i=0; i<nColumn; i++){
    z = sqlite3_mprintf("%z%s%Q", z, zSep, azColumn[i]);
    zSep = ",";
  }
  /* Trailing hidden column named after the table itself */
  return sqlite3_mprintf("%z,%Q)", z, zTableName);
}
1930 | |||
1931 | /* | ||
1932 | ** Build a new sqlite3_vtab structure that will describe the | ||
1933 | ** fulltext index defined by spec. | ||
1934 | */ | ||
1935 | static int constructVtab( | ||
1936 | sqlite3 *db, /* The SQLite database connection */ | ||
1937 | TableSpec *spec, /* Parsed spec information from parseSpec() */ | ||
1938 | sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */ | ||
1939 | char **pzErr /* Write any error message here */ | ||
1940 | ){ | ||
1941 | int rc; | ||
1942 | int n; | ||
1943 | fulltext_vtab *v = 0; | ||
1944 | const sqlite3_tokenizer_module *m = NULL; | ||
1945 | char *schema; | ||
1946 | |||
1947 | v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); | ||
1948 | if( v==0 ) return SQLITE_NOMEM; | ||
1949 | memset(v, 0, sizeof(*v)); | ||
1950 | /* sqlite will initialize v->base */ | ||
1951 | v->db = db; | ||
1952 | v->zDb = spec->zDb; /* Freed when azColumn is freed */ | ||
1953 | v->zName = spec->zName; /* Freed when azColumn is freed */ | ||
1954 | v->nColumn = spec->nColumn; | ||
1955 | v->azContentColumn = spec->azContentColumn; | ||
1956 | spec->azContentColumn = 0; | ||
1957 | v->azColumn = spec->azColumn; | ||
1958 | spec->azColumn = 0; | ||
1959 | |||
1960 | if( spec->azTokenizer==0 ){ | ||
1961 | return SQLITE_NOMEM; | ||
1962 | } | ||
1963 | /* TODO(shess) For now, add new tokenizers as else if clauses. */ | ||
1964 | if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){ | ||
1965 | sqlite3Fts1SimpleTokenizerModule(&m); | ||
1966 | }else if( startsWith(spec->azTokenizer[0], "porter") ){ | ||
1967 | sqlite3Fts1PorterTokenizerModule(&m); | ||
1968 | }else{ | ||
1969 | *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]); | ||
1970 | rc = SQLITE_ERROR; | ||
1971 | goto err; | ||
1972 | } | ||
1973 | for(n=0; spec->azTokenizer[n]; n++){} | ||
1974 | if( n ){ | ||
1975 | rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1], | ||
1976 | &v->pTokenizer); | ||
1977 | }else{ | ||
1978 | rc = m->xCreate(0, 0, &v->pTokenizer); | ||
1979 | } | ||
1980 | if( rc!=SQLITE_OK ) goto err; | ||
1981 | v->pTokenizer->pModule = m; | ||
1982 | |||
1983 | /* TODO: verify the existence of backing tables foo_content, foo_term */ | ||
1984 | |||
1985 | schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn, | ||
1986 | spec->zName); | ||
1987 | rc = sqlite3_declare_vtab(db, schema); | ||
1988 | sqlite3_free(schema); | ||
1989 | if( rc!=SQLITE_OK ) goto err; | ||
1990 | |||
1991 | memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); | ||
1992 | |||
1993 | *ppVTab = &v->base; | ||
1994 | TRACE(("FTS1 Connect %p\n", v)); | ||
1995 | |||
1996 | return rc; | ||
1997 | |||
1998 | err: | ||
1999 | fulltext_vtab_destroy(v); | ||
2000 | return rc; | ||
2001 | } | ||
2002 | |||
2003 | static int fulltextConnect( | ||
2004 | sqlite3 *db, | ||
2005 | void *pAux, | ||
2006 | int argc, const char *const*argv, | ||
2007 | sqlite3_vtab **ppVTab, | ||
2008 | char **pzErr | ||
2009 | ){ | ||
2010 | TableSpec spec; | ||
2011 | int rc = parseSpec(&spec, argc, argv, pzErr); | ||
2012 | if( rc!=SQLITE_OK ) return rc; | ||
2013 | |||
2014 | rc = constructVtab(db, &spec, ppVTab, pzErr); | ||
2015 | clearTableSpec(&spec); | ||
2016 | return rc; | ||
2017 | } | ||
2018 | |||
2019 | /* The %_content table holds the text of each document, with | ||
2020 | ** the rowid used as the docid. | ||
2021 | ** | ||
2022 | ** The %_term table maps each term to a document list blob | ||
2023 | ** containing elements sorted by ascending docid, each element | ||
2024 | ** encoded as: | ||
2025 | ** | ||
2026 | ** docid varint-encoded | ||
2027 | ** token elements: | ||
2028 | ** position+1 varint-encoded as delta from previous position | ||
2029 | ** start offset varint-encoded as delta from previous start offset | ||
2030 | ** end offset varint-encoded as delta from start offset | ||
2031 | ** | ||
2032 | ** The sentinel position of 0 indicates the end of the token list. | ||
2033 | ** | ||
2034 | ** Additionally, doclist blobs are chunked into multiple segments, | ||
2035 | ** using segment to order the segments. New elements are added to | ||
2036 | ** the segment at segment 0, until it exceeds CHUNK_MAX. Then | ||
2037 | ** segment 0 is deleted, and the doclist is inserted at segment 1. | ||
2038 | ** If there is already a doclist at segment 1, the segment 0 doclist | ||
2039 | ** is merged with it, the segment 1 doclist is deleted, and the | ||
2040 | ** merged doclist is inserted at segment 2, repeating those | ||
2041 | ** operations until an insert succeeds. | ||
2042 | ** | ||
2043 | ** Since this structure doesn't allow us to update elements in place | ||
2044 | ** in case of deletion or update, these are simply written to | ||
2045 | ** segment 0 (with an empty token list in case of deletion), with | ||
2046 | ** docListAccumulate() taking care to retain lower-segment | ||
2047 | ** information in preference to higher-segment information. | ||
2048 | */ | ||
2049 | /* TODO(shess) Provide a VACUUM type operation which both removes | ||
2050 | ** deleted elements which are no longer necessary, and duplicated | ||
2051 | ** elements. I suspect this will probably not be necessary in | ||
2052 | ** practice, though. | ||
2053 | */ | ||
/*
** xCreate - create the backing %_content and %_term tables, then
** build the in-memory vtab via constructVtab().  See the block
** comment above for the %_term doclist format.
*/
static int fulltextCreate(sqlite3 *db, void *pAux,
                          int argc, const char * const *argv,
                          sqlite3_vtab **ppVTab, char **pzErr){
  int rc;
  TableSpec spec;
  StringBuffer schema;
  TRACE(("FTS1 Create\n"));

  rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  /* Create the %_content table that stores the document text, one
  ** sanitized cNNAAAA column per indexed column. */
  initStringBuffer(&schema);
  append(&schema, "CREATE TABLE %_content(");
  appendList(&schema, spec.nColumn, spec.azContentColumn);
  append(&schema, ")");
  rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
  free(schema.s);
  if( rc!=SQLITE_OK ) goto out;

  /* Create the %_term table that maps terms to segmented doclist blobs */
  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_term(term text, segment integer, doclist blob, "
                "primary key(term, segment));");
  if( rc!=SQLITE_OK ) goto out;

  rc = constructVtab(db, &spec, ppVTab, pzErr);

out:
  clearTableSpec(&spec);
  return rc;
}
2084 | |||
2085 | /* Decide how to handle an SQL query. */ | ||
2086 | static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ | ||
2087 | int i; | ||
2088 | TRACE(("FTS1 BestIndex\n")); | ||
2089 | |||
2090 | for(i=0; i<pInfo->nConstraint; ++i){ | ||
2091 | const struct sqlite3_index_constraint *pConstraint; | ||
2092 | pConstraint = &pInfo->aConstraint[i]; | ||
2093 | if( pConstraint->usable ) { | ||
2094 | if( pConstraint->iColumn==-1 && | ||
2095 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){ | ||
2096 | pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */ | ||
2097 | TRACE(("FTS1 QUERY_ROWID\n")); | ||
2098 | } else if( pConstraint->iColumn>=0 && | ||
2099 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){ | ||
2100 | /* full-text search */ | ||
2101 | pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn; | ||
2102 | TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn)); | ||
2103 | } else continue; | ||
2104 | |||
2105 | pInfo->aConstraintUsage[i].argvIndex = 1; | ||
2106 | pInfo->aConstraintUsage[i].omit = 1; | ||
2107 | |||
2108 | /* An arbitrary value for now. | ||
2109 | * TODO: Perhaps rowid matches should be considered cheaper than | ||
2110 | * full-text searches. */ | ||
2111 | pInfo->estimatedCost = 1.0; | ||
2112 | |||
2113 | return SQLITE_OK; | ||
2114 | } | ||
2115 | } | ||
2116 | pInfo->idxNum = QUERY_GENERIC; | ||
2117 | return SQLITE_OK; | ||
2118 | } | ||
2119 | |||
2120 | static int fulltextDisconnect(sqlite3_vtab *pVTab){ | ||
2121 | TRACE(("FTS1 Disconnect %p\n", pVTab)); | ||
2122 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2123 | return SQLITE_OK; | ||
2124 | } | ||
2125 | |||
2126 | static int fulltextDestroy(sqlite3_vtab *pVTab){ | ||
2127 | fulltext_vtab *v = (fulltext_vtab *)pVTab; | ||
2128 | int rc; | ||
2129 | |||
2130 | TRACE(("FTS1 Destroy %p\n", pVTab)); | ||
2131 | rc = sql_exec(v->db, v->zDb, v->zName, | ||
2132 | "drop table if exists %_content;" | ||
2133 | "drop table if exists %_term;" | ||
2134 | ); | ||
2135 | if( rc!=SQLITE_OK ) return rc; | ||
2136 | |||
2137 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2138 | return SQLITE_OK; | ||
2139 | } | ||
2140 | |||
2141 | static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ | ||
2142 | fulltext_cursor *c; | ||
2143 | |||
2144 | c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); | ||
2145 | /* sqlite will initialize c->base */ | ||
2146 | *ppCursor = &c->base; | ||
2147 | TRACE(("FTS1 Open %p: %p\n", pVTab, c)); | ||
2148 | |||
2149 | return SQLITE_OK; | ||
2150 | } | ||
2151 | |||
2152 | |||
2153 | /* Free all of the dynamically allocated memory held by *q | ||
2154 | */ | ||
2155 | static void queryClear(Query *q){ | ||
2156 | int i; | ||
2157 | for(i = 0; i < q->nTerms; ++i){ | ||
2158 | free(q->pTerms[i].pTerm); | ||
2159 | } | ||
2160 | free(q->pTerms); | ||
2161 | memset(q, 0, sizeof(*q)); | ||
2162 | } | ||
2163 | |||
2164 | /* Free all of the dynamically allocated memory held by the | ||
2165 | ** Snippet | ||
2166 | */ | ||
2167 | static void snippetClear(Snippet *p){ | ||
2168 | free(p->aMatch); | ||
2169 | free(p->zOffset); | ||
2170 | free(p->zSnippet); | ||
2171 | memset(p, 0, sizeof(*p)); | ||
2172 | } | ||
2173 | /* | ||
2174 | ** Append a single entry to the p->aMatch[] log. | ||
2175 | */ | ||
2176 | static void snippetAppendMatch( | ||
2177 | Snippet *p, /* Append the entry to this snippet */ | ||
2178 | int iCol, int iTerm, /* The column and query term */ | ||
2179 | int iStart, int nByte /* Offset and size of the match */ | ||
2180 | ){ | ||
2181 | int i; | ||
2182 | struct snippetMatch *pMatch; | ||
2183 | if( p->nMatch+1>=p->nAlloc ){ | ||
2184 | p->nAlloc = p->nAlloc*2 + 10; | ||
2185 | p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); | ||
2186 | if( p->aMatch==0 ){ | ||
2187 | p->nMatch = 0; | ||
2188 | p->nAlloc = 0; | ||
2189 | return; | ||
2190 | } | ||
2191 | } | ||
2192 | i = p->nMatch++; | ||
2193 | pMatch = &p->aMatch[i]; | ||
2194 | pMatch->iCol = iCol; | ||
2195 | pMatch->iTerm = iTerm; | ||
2196 | pMatch->iStart = iStart; | ||
2197 | pMatch->nByte = nByte; | ||
2198 | } | ||
2199 | |||
/*
** Sizing information for the circular buffer used in snippetOffsetsOfColumn().
** FTS1_ROTOR_SZ must be a power of two so the mask arithmetic works.
*/
#define FTS1_ROTOR_SZ   (32)
#define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)

/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
**
** The document is re-tokenized with the table's tokenizer.  Each term
** of the query gets one bit in the "match" bitmask; a term that is in
** the middle of a phrase (iPhrase>1) only counts if the previous
** phrase term matched on the previous token (tracked via prevMatch).
** When a complete phrase is seen, one aMatch[] entry is logged for
** each of its words using positions remembered in the rotor buffer.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  const QueryTerm *aTerm;              /* Query string terms */
  int nTerm;                           /* Number of query string terms */
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS1_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS1_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;                     /* tokenizer failure: no offsets */
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  /* Clamp so each term fits in one bit of the 32-bit match mask */
  if( nTerm>=FTS1_ROTOR_SZ ){
    nTerm = FTS1_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;                    /* end of document (or error) */
    /* Remember this token's position in the rotor */
    iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      /* Skip terms constrained to a different column */
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm!=nToken ) continue;
      if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
      /* Mid-phrase term: previous phrase term must have just matched */
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      /* Last term of a phrase (or of the query): log every word of
      ** the phrase using the positions saved in the rotor. */
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS1_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    /* Shift so bit i of prevMatch lines up with term i+1 next round */
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
2277 | |||
2278 | |||
2279 | /* | ||
2280 | ** Compute all offsets for the current row of the query. | ||
2281 | ** If the offsets have already been computed, this routine is a no-op. | ||
2282 | */ | ||
2283 | static void snippetAllOffsets(fulltext_cursor *p){ | ||
2284 | int nColumn; | ||
2285 | int iColumn, i; | ||
2286 | int iFirst, iLast; | ||
2287 | fulltext_vtab *pFts; | ||
2288 | |||
2289 | if( p->snippet.nMatch ) return; | ||
2290 | if( p->q.nTerms==0 ) return; | ||
2291 | pFts = p->q.pFts; | ||
2292 | nColumn = pFts->nColumn; | ||
2293 | iColumn = p->iCursorType - QUERY_FULLTEXT; | ||
2294 | if( iColumn<0 || iColumn>=nColumn ){ | ||
2295 | iFirst = 0; | ||
2296 | iLast = nColumn-1; | ||
2297 | }else{ | ||
2298 | iFirst = iColumn; | ||
2299 | iLast = iColumn; | ||
2300 | } | ||
2301 | for(i=iFirst; i<=iLast; i++){ | ||
2302 | const char *zDoc; | ||
2303 | int nDoc; | ||
2304 | zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); | ||
2305 | nDoc = sqlite3_column_bytes(p->pStmt, i+1); | ||
2306 | snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); | ||
2307 | } | ||
2308 | } | ||
2309 | |||
/*
** Convert the information in the aMatch[] array of the snippet
** into the string zOffset[0..nOffset-1].  The result is a
** space-separated list of "column term start size" integer
** quadruples, one per match.
*/
static void snippetOffsetText(Snippet *p){
  int i;
  int cnt = 0;
  StringBuffer sb;
  char zBuf[200];
  if( p->zOffset ) return;    /* already computed */
  initStringBuffer(&sb);
  for(i=0; i<p->nMatch; i++){
    struct snippetMatch *pMatch = &p->aMatch[i];
    /* zBuf[0] holds the separator space.  For the first entry
    ** (cnt==0) sprintf starts at zBuf[0] and overwrites it, so no
    ** leading space is emitted. */
    zBuf[0] = ' ';
    sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
        pMatch->iTerm, pMatch->iStart, pMatch->nByte);
    append(&sb, zBuf);
    cnt++;
  }
  p->zOffset = sb.s;
  p->nOffset = sb.len;
}
2332 | |||
/*
** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  /* Within 10 bytes of either end: snap to that end.  These guards
  ** also keep the zDoc[iBreak +/- i] probes below inside the buffer. */
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  /* Find the first match in column iCol that ends at or after iBreak */
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    /* Prefer breaking exactly at the start of a nearby match, so a
    ** matched word is never cut in half. */
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  /* Otherwise look up to 10 bytes either side for whitespace and
  ** break just after it. */
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  /* No better spot found: accept the suggestion unchanged */
  return iBreak;
}
2377 | |||
2378 | /* | ||
2379 | ** If the StringBuffer does not end in white space, add a single | ||
2380 | ** space character to the end. | ||
2381 | */ | ||
2382 | static void appendWhiteSpace(StringBuffer *p){ | ||
2383 | if( p->len==0 ) return; | ||
2384 | if( safe_isspace(p->s[p->len-1]) ) return; | ||
2385 | append(p, " "); | ||
2386 | } | ||
2387 | |||
2388 | /* | ||
2389 | ** Remove white space from teh end of the StringBuffer | ||
2390 | */ | ||
2391 | static void trimWhiteSpace(StringBuffer *p){ | ||
2392 | while( p->len>0 && safe_isspace(p->s[p->len-1]) ){ | ||
2393 | p->len--; | ||
2394 | } | ||
2395 | } | ||
2396 | |||
2397 | |||
2398 | |||
/*
** Allowed values for Snippet.aMatch[].snStatus, used by snippetText()
** to decide which matches must appear in the generated excerpt.
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
2404 | |||
/*
** Generate the text of a snippet.
**
** The snippet is assembled from the document columns of the current row
** of pCursor->pStmt using the match positions previously recorded in
** pCursor->snippet.aMatch.  Each matched term is wrapped in
** zStartMark/zEndMark and gaps between extracts are marked with
** zEllipsis.  The result is stored in pCursor->snippet.zSnippet and
** pCursor->snippet.nSnippet (memory owned by the snippet).
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;
  StringBuffer sb;
  int tailCol;
  int tailOffset;
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;
  int tailEllipsis = 0;
  int iMatch;

  /* Discard any snippet left over from a previous call on this cursor. */
  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  /* Mark the first match of each distinct query term DESIRED so the
  ** snippet shows at least one occurrence of every term. */
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;      /* Column of the previous extract; -1 if none yet */
  tailOffset = 0;    /* Byte offset where the previous extract ended */
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    /* Begin ~40 bytes before the match, snapped to a word boundary and
    ** clamped to the start of the document when close to it. */
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    /* If this extract nearly abuts the previous one in the same column,
    ** merge them instead of inserting an ellipsis. */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    /* End ~40 bytes after the match, snapped to a word boundary and
    ** extended to the end of the document when close to it. */
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    /* Copy [iStart,iEnd), wrapping every match inside the range with
    ** zStartMark/zEndMark. */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
          && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* Later DESIRED matches of the same term are now covered by
        ** this extract; demote them to IGNORE. */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  /* Ownership of the StringBuffer's memory passes to the snippet. */
  pCursor->snippet.zSnippet = sb.s;
  pCursor->snippet.nSnippet = sb.len;
}
2517 | |||
2518 | |||
2519 | /* | ||
2520 | ** Close the cursor. For additional information see the documentation | ||
2521 | ** on the xClose method of the virtual table interface. | ||
2522 | */ | ||
2523 | static int fulltextClose(sqlite3_vtab_cursor *pCursor){ | ||
2524 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2525 | TRACE(("FTS1 Close %p\n", c)); | ||
2526 | sqlite3_finalize(c->pStmt); | ||
2527 | queryClear(&c->q); | ||
2528 | snippetClear(&c->snippet); | ||
2529 | if( c->result.pDoclist!=NULL ){ | ||
2530 | docListDelete(c->result.pDoclist); | ||
2531 | } | ||
2532 | free(c); | ||
2533 | return SQLITE_OK; | ||
2534 | } | ||
2535 | |||
/* Advance the cursor to the next row of results (the xNext method).
**
** For table scans and rowid lookups (iCursorType < QUERY_FULLTEXT) the
** prepared statement is simply stepped.  For full-text queries the next
** docid is taken from the result doclist and a fresh rowid lookup is
** run against the %_content table.
*/
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  sqlite_int64 iDocid;
  int rc;

  TRACE(("FTS1 Next %p\n", pCursor));
  /* Any cached snippet refers to the row being left behind. */
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    /* Reset the rowid-lookup statement so it can be re-bound below. */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    /* A docid of 0 marks the end of the result doclist. */
    iDocid = nextDocid(&c->result);
    if( iDocid==0 ){
      c->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){  /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* An error occurred; abort.  A docid from the index with no matching
    ** %_content row (SQLITE_DONE) is mapped to SQLITE_ERROR. */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}
2578 | |||
2579 | |||
/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** On success *ppResult holds the result, which the caller must
** docListDelete().  On error the return code is passed through and no
** doclist is handed back.
*/
static int docListOfTerm(
  fulltext_vtab *v,     /* The full text index */
  int iColumn,          /* column to restrict to.  No restriction if >=nColumn */
  QueryTerm *pQTerm,    /* Term we are looking for, or 1st term of a phrase */
  DocList **ppResult    /* Write the result here */
){
  DocList *pLeft, *pRight, *pNew;
  int i, rc;

  /* Doclist for the first (or only) term, keeping position data so that
  ** phrase adjacency can be checked against subsequent terms. */
  pLeft = docListNew(DL_POSITIONS);
  rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
  if( rc ){
    docListDelete(pLeft);
    return rc;
  }
  /* For a phrase, fold in each subsequent term's doclist, keeping only
  ** docids where the terms are adjacent.  The final merge drops
  ** position data (DL_DOCIDS) since it is no longer needed. */
  for(i=1; i<=pQTerm->nPhrase; i++){
    pRight = docListNew(DL_POSITIONS);
    rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
    if( rc ){
      docListDelete(pLeft);
      return rc;
    }
    pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
    docListPhraseMerge(pLeft, pRight, pNew);
    docListDelete(pLeft);
    docListDelete(pRight);
    pLeft = pNew;
  }
  *ppResult = pLeft;
  return SQLITE_OK;
}
2617 | |||
2618 | /* Add a new term pTerm[0..nTerm-1] to the query *q. | ||
2619 | */ | ||
2620 | static void queryAdd(Query *q, const char *pTerm, int nTerm){ | ||
2621 | QueryTerm *t; | ||
2622 | ++q->nTerms; | ||
2623 | q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); | ||
2624 | if( q->pTerms==0 ){ | ||
2625 | q->nTerms = 0; | ||
2626 | return; | ||
2627 | } | ||
2628 | t = &q->pTerms[q->nTerms - 1]; | ||
2629 | memset(t, 0, sizeof(*t)); | ||
2630 | t->pTerm = malloc(nTerm+1); | ||
2631 | memcpy(t->pTerm, pTerm, nTerm); | ||
2632 | t->pTerm[nTerm] = 0; | ||
2633 | t->nTerm = nTerm; | ||
2634 | t->isOr = q->nextIsOr; | ||
2635 | q->nextIsOr = 0; | ||
2636 | t->iColumn = q->nextColumn; | ||
2637 | q->nextColumn = q->dfltColumn; | ||
2638 | } | ||
2639 | |||
2640 | /* | ||
2641 | ** Check to see if the string zToken[0...nToken-1] matches any | ||
2642 | ** column name in the virtual table. If it does, | ||
2643 | ** return the zero-indexed column number. If not, return -1. | ||
2644 | */ | ||
2645 | static int checkColumnSpecifier( | ||
2646 | fulltext_vtab *pVtab, /* The virtual table */ | ||
2647 | const char *zToken, /* Text of the token */ | ||
2648 | int nToken /* Number of characters in the token */ | ||
2649 | ){ | ||
2650 | int i; | ||
2651 | for(i=0; i<pVtab->nColumn; i++){ | ||
2652 | if( memcmp(pVtab->azColumn[i], zToken, nToken)==0 | ||
2653 | && pVtab->azColumn[i][nToken]==0 ){ | ||
2654 | return i; | ||
2655 | } | ||
2656 | } | ||
2657 | return -1; | ||
2658 | } | ||
2659 | |||
/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assembled in pQuery.
**
** inPhrase is true if pSegment[0..nSegment-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;  /* Index of this segment's first term */
  int iCol;
  int nTerm = 1;                    /* 1-based position within the phrase */

  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    /* A "column:" prefix selects the column for the following term. */
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    /* A bare "OR" token (outside a phrase, not first) flags the next
    ** term as OR-connected rather than being a term itself. */
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    /* A leading "-" negates the term (NOT semantics). */
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  /* Record on the phrase's first term how many follow-on terms it has. */
  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
2721 | |||
/* Parse a query string, yielding a Query object pQuery.
**
** The input is split on double-quote characters; successive segments
** alternate between being outside and inside a phrase, and each segment
** is handed to tokenizeSegment().
**
** The calling function will need to queryClear() to clean up
** the dynamically allocated memory held by pQuery.
*/
static int parseQuery(
  fulltext_vtab *v,        /* The fulltext index */
  const char *zInput,      /* Input text of the query string */
  int nInput,              /* Size of the input text */
  int dfltColumn,          /* Default column of the index to match against */
  Query *pQuery            /* Write the parse results here. */
){
  int iInput, inPhrase = 0;

  if( zInput==0 ) nInput = 0;
  if( nInput<0 ) nInput = strlen(zInput);  /* nInput<0: NUL-terminated */
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->nextColumn = dfltColumn;
  pQuery->dfltColumn = dfltColumn;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    /* Find the end of the current segment: the next '"' or end of input. */
    for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
    if( i>iInput ){
      tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                       pQuery);
    }
    /* Resume after the quote (the ++iInput of the for skips it). */
    iInput = i;
    if( i<nInput ){
      assert( zInput[i]=='"' );
      inPhrase = !inPhrase;   /* each quote toggles phrase mode */
    }
  }

  if( inPhrase ){
    /* unmatched quote */
    queryClear(pQuery);
    return SQLITE_ERROR;
  }
  return SQLITE_OK;
}
2766 | |||
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
**
** Processing runs in three stages: parse the query, AND-merge all
** non-NOT terms (OR-connected neighbors are OR-merged first), then
** subtract the doclists of NOT terms from the result.
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DocList **pResult,     /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DocList *pLeft = NULL;    /* accumulated AND result */
  DocList *pRight, *pNew, *pOr;
  int nNot = 0;             /* number of NOT terms seen */
  QueryTerm *aTerm;

  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Merge AND terms. */
  aTerm = pQuery->pTerms;
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    /* A phrase occupies nPhrase+1 consecutive QueryTerm slots. */
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
    if( rc ){
      queryClear(pQuery);
      return rc;
    }
    /* OR-merge any directly following OR-connected terms into pRight
    ** before ANDing the group with the accumulated result. */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        queryClear(pQuery);
        return rc;
      }
      pNew = docListNew(DL_DOCIDS);
      docListOrMerge(pRight, pOr, pNew);
      docListDelete(pRight);
      docListDelete(pOr);
      pRight = pNew;
    }
    if( pLeft==0 ){
      pLeft = pRight;
    }else{
      pNew = docListNew(DL_DOCIDS);
      docListAndMerge(pLeft, pRight, pNew);
      docListDelete(pRight);
      docListDelete(pLeft);
      pLeft = pNew;
    }
  }

  if( nNot && pLeft==0 ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
    if( rc ){
      queryClear(pQuery);
      docListDelete(pLeft);
      return rc;
    }
    pNew = docListNew(DL_DOCIDS);
    docListExceptMerge(pLeft, pRight, pNew);
    docListDelete(pRight);
    docListDelete(pLeft);
    pLeft = pNew;
  }

  /* Ownership of pLeft passes to the caller. */
  *pResult = pLeft;
  return rc;
}
2854 | |||
/*
** This is the xFilter interface for the virtual table.  See
** the virtual table xFilter method documentation for additional
** information.
**
** If idxNum==QUERY_GENERIC then do a full table scan against
** the %_content table.
**
** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
** in the %_content table.
**
** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
** column on the left-hand side of the MATCH operator is column
** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
** side of the MATCH operator.
*/
/* TODO(shess) Upgrade the cursor initialization and destruction to
** account for fulltextFilter() being called multiple times on the
** same cursor.  The current solution is very fragile.  Apply fix to
** fts2 as appropriate.
*/
static int fulltextFilter(
  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);
  int rc;
  char *zSql;

  TRACE(("FTS1 Filter %p\n",pCursor));

  /* Rowid lookups and full-text queries both fetch rows by rowid; only
  ** the generic scan omits the WHERE clause.  The "%%_content" pattern
  ** is expanded to the real table name by sql_prepare(). */
  zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
                          idxNum==QUERY_GENERIC ? "" : "where rowid=?");
  sqlite3_finalize(c->pStmt);
  rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
  sqlite3_free(zSql);
  if( rc!=SQLITE_OK ) return rc;

  c->iCursorType = idxNum;
  switch( idxNum ){
    case QUERY_GENERIC:
      break;

    case QUERY_ROWID:
      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
      if( rc!=SQLITE_OK ) return rc;
      break;

    default:   /* full-text search */
    {
      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
      DocList *pResult;
      assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
      assert( argc==1 );
      queryClear(&c->q);
      /* Evaluate the whole query up front; fulltextNext() then walks
      ** the resulting doclist one docid at a time. */
      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
      if( rc!=SQLITE_OK ) return rc;
      if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
      readerInit(&c->result, pResult);
      break;
    }
  }

  /* Position the cursor on the first row of results. */
  return fulltextNext(pCursor);
}
2922 | |||
2923 | /* This is the xEof method of the virtual table. The SQLite core | ||
2924 | ** calls this routine to find out if it has reached the end of | ||
2925 | ** a query's results set. | ||
2926 | */ | ||
2927 | static int fulltextEof(sqlite3_vtab_cursor *pCursor){ | ||
2928 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2929 | return c->eof; | ||
2930 | } | ||
2931 | |||
2932 | /* This is the xColumn method of the virtual table. The SQLite | ||
2933 | ** core calls this method during a query when it needs the value | ||
2934 | ** of a column from the virtual table. This method needs to use | ||
2935 | ** one of the sqlite3_result_*() routines to store the requested | ||
2936 | ** value back in the pContext. | ||
2937 | */ | ||
2938 | static int fulltextColumn(sqlite3_vtab_cursor *pCursor, | ||
2939 | sqlite3_context *pContext, int idxCol){ | ||
2940 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2941 | fulltext_vtab *v = cursor_vtab(c); | ||
2942 | |||
2943 | if( idxCol<v->nColumn ){ | ||
2944 | sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1); | ||
2945 | sqlite3_result_value(pContext, pVal); | ||
2946 | }else if( idxCol==v->nColumn ){ | ||
2947 | /* The extra column whose name is the same as the table. | ||
2948 | ** Return a blob which is a pointer to the cursor | ||
2949 | */ | ||
2950 | sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT); | ||
2951 | } | ||
2952 | return SQLITE_OK; | ||
2953 | } | ||
2954 | |||
2955 | /* This is the xRowid method. The SQLite core calls this routine to | ||
2956 | ** retrive the rowid for the current row of the result set. The | ||
2957 | ** rowid should be written to *pRowid. | ||
2958 | */ | ||
2959 | static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ | ||
2960 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
2961 | |||
2962 | *pRowid = sqlite3_column_int64(c->pStmt, 0); | ||
2963 | return SQLITE_OK; | ||
2964 | } | ||
2965 | |||
/* Add all terms in [zText] to the given hash table.  If [iColumn] >= 0,
** we also store positions and offsets in the hash table using the given
** column number. */
static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DocList *p;

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    /* The first occurrence of a term creates its doclist entry for this
    ** docid; later occurrences just add more position data below. */
    p = fts1HashFind(terms, pToken, nTokenBytes);
    if( p==NULL ){
      p = docListNew(DL_DEFAULT);
      docListAddDocid(p, iDocid);
      fts1HashInsert(terms, pToken, nTokenBytes, p);
    }
    if( iColumn>=0 ){
      docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  return rc;
}
3013 | |||
/* Update the %_terms table to map the term [pTerm] to the given rowid.
**
** The term index is organized in segments.  New data is folded into the
** segment-0 row for the term; when that row's doclist exceeds CHUNK_MAX
** it is pushed into progressively higher segments, merging with any
** doclist already stored there.
*/
static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
                             DocList *d){
  sqlite_int64 iIndexRow;
  DocList doclist;
  int iSegment = 0, rc;

  rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
  if( rc==SQLITE_DONE ){
    /* No existing segment-0 row: build a fresh doclist and insert it. */
    docListInit(&doclist, DL_DEFAULT, 0, 0);
    docListUpdate(&doclist, d);
    /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
    rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
    goto err;
  }
  if( rc!=SQLITE_ROW ) return SQLITE_ERROR;

  /* Fold the new data into the existing doclist and update in place if
  ** it still fits within a chunk. */
  docListUpdate(&doclist, d);
  if( doclist.nData<=CHUNK_MAX ){
    rc = term_update(v, iIndexRow, &doclist);
    goto err;
  }

  /* Doclist doesn't fit, delete what's there, and accumulate
  ** forward.
  */
  rc = term_delete(v, iIndexRow);
  if( rc!=SQLITE_OK ) goto err;

  /* Try to insert the doclist into a higher segment bucket.  On
  ** failure, accumulate existing doclist with the doclist from that
  ** bucket, and put results in the next bucket.
  */
  iSegment++;
  while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
                         &doclist))!=SQLITE_OK ){
    sqlite_int64 iSegmentRow;
    DocList old;
    int rc2;

    /* Retain old error in case the term_insert() error was really an
    ** error rather than a bounced insert.
    */
    rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
    if( rc2!=SQLITE_ROW ) goto err;

    rc = term_delete(v, iSegmentRow);
    if( rc!=SQLITE_OK ) goto err;

    /* Reusing lowest-number deleted row keeps the index smaller. */
    if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;

    /* doclist contains the newer data, so accumulate it over old.
    ** Then steal accumulated data for doclist.
    */
    docListAccumulate(&old, &doclist);
    docListDestroy(&doclist);
    doclist = old;

    iSegment++;
  }

 err:
  docListDestroy(&doclist);
  return rc;
}
3080 | |||
3081 | /* Add doclists for all terms in [pValues] to the hash table [terms]. */ | ||
3082 | static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid, | ||
3083 | sqlite3_value **pValues){ | ||
3084 | int i; | ||
3085 | for(i = 0; i < v->nColumn ; ++i){ | ||
3086 | char *zText = (char*)sqlite3_value_text(pValues[i]); | ||
3087 | int rc = buildTerms(v, terms, iRowid, zText, i); | ||
3088 | if( rc!=SQLITE_OK ) return rc; | ||
3089 | } | ||
3090 | return SQLITE_OK; | ||
3091 | } | ||
3092 | |||
3093 | /* Add empty doclists for all terms in the given row's content to the hash | ||
3094 | * table [pTerms]. */ | ||
3095 | static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){ | ||
3096 | const char **pValues; | ||
3097 | int i; | ||
3098 | |||
3099 | int rc = content_select(v, iRowid, &pValues); | ||
3100 | if( rc!=SQLITE_OK ) return rc; | ||
3101 | |||
3102 | for(i = 0 ; i < v->nColumn; ++i) { | ||
3103 | rc = buildTerms(v, pTerms, iRowid, pValues[i], -1); | ||
3104 | if( rc!=SQLITE_OK ) break; | ||
3105 | } | ||
3106 | |||
3107 | freeStringArray(v->nColumn, pValues); | ||
3108 | return SQLITE_OK; | ||
3109 | } | ||
3110 | |||
3111 | /* Insert a row into the %_content table; set *piRowid to be the ID of the | ||
3112 | * new row. Fill [pTerms] with new doclists for the %_term table. */ | ||
3113 | static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid, | ||
3114 | sqlite3_value **pValues, | ||
3115 | sqlite_int64 *piRowid, fts1Hash *pTerms){ | ||
3116 | int rc; | ||
3117 | |||
3118 | rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */ | ||
3119 | if( rc!=SQLITE_OK ) return rc; | ||
3120 | *piRowid = sqlite3_last_insert_rowid(v->db); | ||
3121 | return insertTerms(v, pTerms, *piRowid, pValues); | ||
3122 | } | ||
3123 | |||
3124 | /* Delete a row from the %_content table; fill [pTerms] with empty doclists | ||
3125 | * to be written to the %_term table. */ | ||
3126 | static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){ | ||
3127 | int rc = deleteTerms(v, pTerms, iRow); | ||
3128 | if( rc!=SQLITE_OK ) return rc; | ||
3129 | return content_delete(v, iRow); /* execute an SQL DELETE */ | ||
3130 | } | ||
3131 | |||
3132 | /* Update a row in the %_content table; fill [pTerms] with new doclists for the | ||
3133 | * %_term table. */ | ||
3134 | static int index_update(fulltext_vtab *v, sqlite_int64 iRow, | ||
3135 | sqlite3_value **pValues, fts1Hash *pTerms){ | ||
3136 | /* Generate an empty doclist for each term that previously appeared in this | ||
3137 | * row. */ | ||
3138 | int rc = deleteTerms(v, pTerms, iRow); | ||
3139 | if( rc!=SQLITE_OK ) return rc; | ||
3140 | |||
3141 | rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */ | ||
3142 | if( rc!=SQLITE_OK ) return rc; | ||
3143 | |||
3144 | /* Now add positions for terms which appear in the updated row. */ | ||
3145 | return insertTerms(v, pTerms, iRow, pValues); | ||
3146 | } | ||
3147 | |||
/* This function implements the xUpdate callback; it's the top-level entry
** point for inserting, deleting or updating a row in a full-text table.
**
** The requested operation is encoded in the argument vector:
**   nArg<2                      -> DELETE of rowid ppArg[0]
**   ppArg[0] is not SQL NULL    -> UPDATE of rowid ppArg[0]
**   otherwise                   -> INSERT
** Affected doclists are gathered into a hash table during the content
** change, then written to the %_term table in a second pass.
*/
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                   sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  fts1Hash terms;   /* maps term string -> PosList */
  int rc;
  fts1HashElem *e;

  TRACE(("FTS1 Update %p\n", pVtab));

  fts1HashInit(&terms, FTS1_HASH_STRING, 1);

  if( nArg<2 ){
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
        sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    } else {
      assert( nArg==2+v->nColumn+1);
      rc = index_update(v, rowid, &ppArg[2], &terms);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    assert( nArg==2+v->nColumn+1);
    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
  }

  if( rc==SQLITE_OK ){
    /* Write updated doclists to disk. */
    for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
      DocList *p = fts1HashData(e);
      rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
      if( rc!=SQLITE_OK ) break;
    }
  }

  /* clean up: doclists in the hash are freed even on the error path */
  for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
    DocList *p = fts1HashData(e);
    docListDelete(p);
  }
  fts1HashClear(&terms);

  return rc;
}
3206 | |||
/*
** Implementation of the snippet() function for FTS1
**
** argv[0] must be the magic blob produced by fulltextColumn() (a copy
** of the cursor pointer).  Optional argv[1..3] override the default
** start-mark, end-mark, and ellipsis strings.
*/
static void snippetFunc(
  sqlite3_context *pContext,
  int argc,
  sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
  }else{
    const char *zStart = "<b>";
    const char *zEnd = "</b>";
    const char *zEllipsis = "<b>...</b>";
    /* Recover the cursor pointer smuggled through the blob. */
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    if( argc>=2 ){
      zStart = (const char*)sqlite3_value_text(argv[1]);
      if( argc>=3 ){
        zEnd = (const char*)sqlite3_value_text(argv[2]);
        if( argc>=4 ){
          zEllipsis = (const char*)sqlite3_value_text(argv[3]);
        }
      }
    }
    snippetAllOffsets(pCursor);
    snippetText(pCursor, zStart, zEnd, zEllipsis);
    /* zSnippet stays valid until the cursor moves, so SQLITE_STATIC. */
    sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
                        pCursor->snippet.nSnippet, SQLITE_STATIC);
  }
}
3240 | |||
3241 | /* | ||
3242 | ** Implementation of the offsets() function for FTS1 | ||
3243 | */ | ||
3244 | static void snippetOffsetsFunc( | ||
3245 | sqlite3_context *pContext, | ||
3246 | int argc, | ||
3247 | sqlite3_value **argv | ||
3248 | ){ | ||
3249 | fulltext_cursor *pCursor; | ||
3250 | if( argc<1 ) return; | ||
3251 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
3252 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
3253 | sqlite3_result_error(pContext, "illegal first argument to offsets",-1); | ||
3254 | }else{ | ||
3255 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
3256 | snippetAllOffsets(pCursor); | ||
3257 | snippetOffsetText(&pCursor->snippet); | ||
3258 | sqlite3_result_text(pContext, | ||
3259 | pCursor->snippet.zOffset, pCursor->snippet.nOffset, | ||
3260 | SQLITE_STATIC); | ||
3261 | } | ||
3262 | } | ||
3263 | |||
3264 | /* | ||
3265 | ** This routine implements the xFindFunction method for the FTS1 | ||
3266 | ** virtual table. | ||
3267 | */ | ||
3268 | static int fulltextFindFunction( | ||
3269 | sqlite3_vtab *pVtab, | ||
3270 | int nArg, | ||
3271 | const char *zName, | ||
3272 | void (**pxFunc)(sqlite3_context*,int,sqlite3_value**), | ||
3273 | void **ppArg | ||
3274 | ){ | ||
3275 | if( strcmp(zName,"snippet")==0 ){ | ||
3276 | *pxFunc = snippetFunc; | ||
3277 | return 1; | ||
3278 | }else if( strcmp(zName,"offsets")==0 ){ | ||
3279 | *pxFunc = snippetOffsetsFunc; | ||
3280 | return 1; | ||
3281 | } | ||
3282 | return 0; | ||
3283 | } | ||
3284 | |||
3285 | /* | ||
3286 | ** Rename an fts1 table. | ||
3287 | */ | ||
3288 | static int fulltextRename( | ||
3289 | sqlite3_vtab *pVtab, | ||
3290 | const char *zName | ||
3291 | ){ | ||
3292 | fulltext_vtab *p = (fulltext_vtab *)pVtab; | ||
3293 | int rc = SQLITE_NOMEM; | ||
3294 | char *zSql = sqlite3_mprintf( | ||
3295 | "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';" | ||
3296 | "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';" | ||
3297 | , p->zDb, p->zName, zName | ||
3298 | , p->zDb, p->zName, zName | ||
3299 | ); | ||
3300 | if( zSql ){ | ||
3301 | rc = sqlite3_exec(p->db, zSql, 0, 0, 0); | ||
3302 | sqlite3_free(zSql); | ||
3303 | } | ||
3304 | return rc; | ||
3305 | } | ||
3306 | |||
/*
** The sqlite3_module "vtable" connecting the FTS1 implementation to
** the SQLite core.  The transaction hooks (xBegin/xSync/xCommit/
** xRollback) are all 0: fts1 writes through ordinary SQL statements
** and relies on the enclosing transaction.
*/
static const sqlite3_module fulltextModule = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ 0,
  /* xSync         */ 0,
  /* xCommit       */ 0,
  /* xRollback     */ 0,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename       */ fulltextRename,
};
3329 | |||
/*
** Register the fts1 module with database connection db.  The two
** sqlite3_overload_function() calls ensure that global snippet() and
** offsets() function names exist so that the overloaded versions
** supplied via xFindFunction can be resolved.  Return values of the
** overload calls are deliberately ignored; the module-registration
** result is what the caller needs.
*/
int sqlite3Fts1Init(sqlite3 *db){
  sqlite3_overload_function(db, "snippet", -1);
  sqlite3_overload_function(db, "offsets", -1);
  return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
}
3335 | |||
#if !SQLITE_CORE
/*
** Loadable-extension entry point.  When fts1 is built as a shared
** library (not into the SQLite core), SQLite calls this after
** sqlite3_load_extension(); it wires up the extension API pointers
** and registers the module.
*/
int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
                           const sqlite3_api_routines *pApi){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts1Init(db);
}
#endif
3343 | |||
3344 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h deleted file mode 100644 index d55e689..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h +++ /dev/null | |||
@@ -1,11 +0,0 @@ | |||
1 | #include "sqlite3.h" | ||
2 | |||
3 | #ifdef __cplusplus | ||
4 | extern "C" { | ||
5 | #endif /* __cplusplus */ | ||
6 | |||
7 | int sqlite3Fts1Init(sqlite3 *db); | ||
8 | |||
9 | #ifdef __cplusplus | ||
10 | } /* extern "C" */ | ||
11 | #endif /* __cplusplus */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c deleted file mode 100644 index 463a52b..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c +++ /dev/null | |||
@@ -1,369 +0,0 @@ | |||
1 | /* | ||
2 | ** 2001 September 22 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** This is the implementation of generic hash-tables used in SQLite. | ||
13 | ** We've modified it slightly to serve as a standalone hash table | ||
14 | ** implementation for the full-text indexing module. | ||
15 | */ | ||
16 | #include <assert.h> | ||
17 | #include <stdlib.h> | ||
18 | #include <string.h> | ||
19 | |||
20 | /* | ||
21 | ** The code in this file is only compiled if: | ||
22 | ** | ||
23 | ** * The FTS1 module is being built as an extension | ||
24 | ** (in which case SQLITE_CORE is not defined), or | ||
25 | ** | ||
26 | ** * The FTS1 module is being built into the core of | ||
27 | ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). | ||
28 | */ | ||
29 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) | ||
30 | |||
31 | |||
32 | #include "fts1_hash.h" | ||
33 | |||
/*
** Allocate n bytes of zeroed memory, or return NULL on failure.
** Used as the default xMalloc for fts1 hash tables.
**
** Fix: use calloc() instead of malloc()+memset() -- it zeroes the
** block in one call and is the idiomatic form for zero-initialized
** allocation.
*/
static void *malloc_and_zero(int n){
  return calloc(1, (size_t)n);
}
41 | |||
/* Turn bulk memory into a hash table object by initializing the
** fields of the Hash structure.
**
** "pNew" is a pointer to the hash table that is to be initialized.
** keyClass is one of the constants
** FTS1_HASH_BINARY or FTS1_HASH_STRING.  The value of keyClass
** determines what kind of key the hash table will use.  "copyKey" is
** true if the hash table should make its own private copy of keys and
** false if it should just use the supplied pointer.
**
** The table starts with zero buckets (htsize==0, ht==0); the bucket
** array is lazily created by the first insert (see rehash()).
*/
void sqlite3Fts1HashInit(fts1Hash *pNew, int keyClass, int copyKey){
  assert( pNew!=0 );
  assert( keyClass>=FTS1_HASH_STRING && keyClass<=FTS1_HASH_BINARY );
  pNew->keyClass = keyClass;
  pNew->copyKey = copyKey;
  pNew->first = 0;
  pNew->count = 0;
  pNew->htsize = 0;
  pNew->ht = 0;
  /* Default allocators; zeroing malloc keeps new buckets clean. */
  pNew->xMalloc = malloc_and_zero;
  pNew->xFree = free;
}
64 | |||
/* Remove all entries from a hash table.  Reclaim all memory.
** Call this routine to delete a hash table or to reset a hash table
** to the empty state.
*/
void sqlite3Fts1HashClear(fts1Hash *pH){
  fts1HashElem *elem;         /* For looping over all elements of the table */

  assert( pH!=0 );
  /* Detach the element list first, then free the bucket array, then
  ** walk the detached list freeing each element (and its private key
  ** copy, if copyKey was set). */
  elem = pH->first;
  pH->first = 0;
  if( pH->ht ) pH->xFree(pH->ht);
  pH->ht = 0;
  pH->htsize = 0;
  while( elem ){
    fts1HashElem *next_elem = elem->next;   /* save before freeing elem */
    if( pH->copyKey && elem->pKey ){
      pH->xFree(elem->pKey);
    }
    pH->xFree(elem);
    elem = next_elem;
  }
  pH->count = 0;
}
88 | |||
/*
** Hash and comparison functions when the mode is FTS1_HASH_STRING.
** A non-positive nKey means "use strlen() of the key".
*/
static int strHash(const void *pKey, int nKey){
  const char *p = (const char *)pKey;
  int h = 0;
  int i;
  if( nKey<=0 ) nKey = (int) strlen(p);
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ p[i];
  }
  /* Mask to keep the result non-negative. */
  return h & 0x7fffffff;
}
/* String-key comparator: keys of different length never match (returns
** 1); equal-length keys are compared with strncmp(), 0 meaning equal. */
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1==n2) ? strncmp((const char*)pKey1,(const char*)pKey2,n1) : 1;
}
106 | |||
/*
** Hash and comparison functions when the mode is FTS1_HASH_BINARY.
** nKey is always the exact key length in bytes.
*/
static int binHash(const void *pKey, int nKey){
  const char *p = (const char *)pKey;
  const char *pEnd = p + nKey;
  int h = 0;
  while( p<pEnd ){
    h = (h<<3) ^ h ^ *(p++);
  }
  /* Mask to keep the result non-negative. */
  return h & 0x7fffffff;
}
/* Binary-key comparator: different lengths never match (returns 1);
** equal-length keys are compared bytewise with memcmp(). */
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1==n2) ? memcmp(pKey1,pKey2,n1) : 1;
}
122 | |||
123 | /* | ||
124 | ** Return a pointer to the appropriate hash function given the key class. | ||
125 | ** | ||
126 | ** The C syntax in this function definition may be unfamilar to some | ||
127 | ** programmers, so we provide the following additional explanation: | ||
128 | ** | ||
129 | ** The name of the function is "hashFunction". The function takes a | ||
130 | ** single parameter "keyClass". The return value of hashFunction() | ||
131 | ** is a pointer to another function. Specifically, the return value | ||
132 | ** of hashFunction() is a pointer to a function that takes two parameters | ||
133 | ** with types "const void*" and "int" and returns an "int". | ||
134 | */ | ||
135 | static int (*hashFunction(int keyClass))(const void*,int){ | ||
136 | if( keyClass==FTS1_HASH_STRING ){ | ||
137 | return &strHash; | ||
138 | }else{ | ||
139 | assert( keyClass==FTS1_HASH_BINARY ); | ||
140 | return &binHash; | ||
141 | } | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | ** Return a pointer to the appropriate hash function given the key class. | ||
146 | ** | ||
147 | ** For help in interpreted the obscure C code in the function definition, | ||
148 | ** see the header comment on the previous function. | ||
149 | */ | ||
150 | static int (*compareFunction(int keyClass))(const void*,int,const void*,int){ | ||
151 | if( keyClass==FTS1_HASH_STRING ){ | ||
152 | return &strCompare; | ||
153 | }else{ | ||
154 | assert( keyClass==FTS1_HASH_BINARY ); | ||
155 | return &binCompare; | ||
156 | } | ||
157 | } | ||
158 | |||
/* Link an element into the hash table.
**
** pNew is spliced into two structures at once: the bucket's chain
** (pEntry->chain always points at the most recently inserted element
** of that bucket) and the table-wide doubly-linked list rooted at
** pH->first.  Elements of one bucket are kept adjacent on the global
** list, so pNew is inserted immediately before the bucket's current
** head.
*/
static void insertElement(
  fts1Hash *pH,            /* The complete hash table */
  struct _fts1ht *pEntry,  /* The entry into which pNew is inserted */
  fts1HashElem *pNew       /* The element to be inserted */
){
  fts1HashElem *pHead;     /* First element already in pEntry */
  pHead = pEntry->chain;
  if( pHead ){
    /* Bucket non-empty: insert pNew just before pHead on the global list. */
    pNew->next = pHead;
    pNew->prev = pHead->prev;
    if( pHead->prev ){ pHead->prev->next = pNew; }
    else             { pH->first = pNew; }       /* pHead was global head */
    pHead->prev = pNew;
  }else{
    /* Empty bucket: push pNew onto the front of the global list. */
    pNew->next = pH->first;
    if( pH->first ){ pH->first->prev = pNew; }
    pNew->prev = 0;
    pH->first = pNew;
  }
  pEntry->count++;
  pEntry->chain = pNew;    /* pNew becomes the bucket's head */
}
183 | |||
184 | |||
/* Resize the hash table so that it contains "new_size" buckets.
** "new_size" must be a power of 2.  The hash table might fail
** to resize if the allocation fails, in which case it returns
** silently and the table keeps its previous bucket array.
*/
static void rehash(fts1Hash *pH, int new_size){
  struct _fts1ht *new_ht;          /* The new hash table */
  fts1HashElem *elem, *next_elem;  /* For looping over existing elements */
  int (*xHash)(const void*,int);   /* The hash function */

  assert( (new_size & (new_size-1))==0 );   /* power of 2 */
  /* xMalloc is malloc_and_zero by default, so the new buckets start
  ** with count==0 and chain==0. */
  new_ht = (struct _fts1ht *)pH->xMalloc( new_size*sizeof(struct _fts1ht) );
  if( new_ht==0 ) return;
  if( pH->ht ) pH->xFree(pH->ht);
  pH->ht = new_ht;
  pH->htsize = new_size;
  xHash = hashFunction(pH->keyClass);
  /* Re-distribute every element.  The global list is rebuilt from
  ** scratch (pH->first reset to 0) as insertElement relinks each one. */
  for(elem=pH->first, pH->first=0; elem; elem = next_elem){
    int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
    next_elem = elem->next;   /* save: insertElement overwrites elem->next */
    insertElement(pH, &new_ht[h], elem);
  }
}
207 | |||
208 | /* This function (for internal use only) locates an element in an | ||
209 | ** hash table that matches the given key. The hash for this key has | ||
210 | ** already been computed and is passed as the 4th parameter. | ||
211 | */ | ||
212 | static fts1HashElem *findElementGivenHash( | ||
213 | const fts1Hash *pH, /* The pH to be searched */ | ||
214 | const void *pKey, /* The key we are searching for */ | ||
215 | int nKey, | ||
216 | int h /* The hash for this key. */ | ||
217 | ){ | ||
218 | fts1HashElem *elem; /* Used to loop thru the element list */ | ||
219 | int count; /* Number of elements left to test */ | ||
220 | int (*xCompare)(const void*,int,const void*,int); /* comparison function */ | ||
221 | |||
222 | if( pH->ht ){ | ||
223 | struct _fts1ht *pEntry = &pH->ht[h]; | ||
224 | elem = pEntry->chain; | ||
225 | count = pEntry->count; | ||
226 | xCompare = compareFunction(pH->keyClass); | ||
227 | while( count-- && elem ){ | ||
228 | if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){ | ||
229 | return elem; | ||
230 | } | ||
231 | elem = elem->next; | ||
232 | } | ||
233 | } | ||
234 | return 0; | ||
235 | } | ||
236 | |||
/* Remove a single entry from the hash table given a pointer to that
** element and the bucket index for the element's key.
*/
static void removeElementGivenHash(
  fts1Hash *pH,         /* The pH containing "elem" */
  fts1HashElem* elem,   /* The element to be removed from the pH */
  int h                 /* Bucket index for the element */
){
  struct _fts1ht *pEntry;
  /* Unlink from the table-wide doubly-linked list. */
  if( elem->prev ){
    elem->prev->next = elem->next;
  }else{
    pH->first = elem->next;
  }
  if( elem->next ){
    elem->next->prev = elem->prev;
  }
  /* Fix up the bucket: if elem was the bucket head, the next element
  ** on the global list (if any remain in this bucket) becomes head. */
  pEntry = &pH->ht[h];
  if( pEntry->chain==elem ){
    pEntry->chain = elem->next;
  }
  pEntry->count--;
  if( pEntry->count<=0 ){
    pEntry->chain = 0;
  }
  /* Free the private key copy (if we own it) and the element itself. */
  if( pH->copyKey && elem->pKey ){
    pH->xFree(elem->pKey);
  }
  pH->xFree( elem );
  pH->count--;
  if( pH->count<=0 ){
    /* Last element removed: release the bucket array too, returning
    ** the table to its freshly-initialized empty state. */
    assert( pH->first==0 );
    assert( pH->count==0 );
    fts1HashClear(pH);
  }
}
273 | |||
274 | /* Attempt to locate an element of the hash table pH with a key | ||
275 | ** that matches pKey,nKey. Return the data for this element if it is | ||
276 | ** found, or NULL if there is no match. | ||
277 | */ | ||
278 | void *sqlite3Fts1HashFind(const fts1Hash *pH, const void *pKey, int nKey){ | ||
279 | int h; /* A hash on key */ | ||
280 | fts1HashElem *elem; /* The element that matches key */ | ||
281 | int (*xHash)(const void*,int); /* The hash function */ | ||
282 | |||
283 | if( pH==0 || pH->ht==0 ) return 0; | ||
284 | xHash = hashFunction(pH->keyClass); | ||
285 | assert( xHash!=0 ); | ||
286 | h = (*xHash)(pKey,nKey); | ||
287 | assert( (pH->htsize & (pH->htsize-1))==0 ); | ||
288 | elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1)); | ||
289 | return elem ? elem->data : 0; | ||
290 | } | ||
291 | |||
/* Insert an element into the hash table pH.  The key is pKey,nKey
** and the data is "data".
**
** If no element exists with a matching key, then a new
** element is created.  A copy of the key is made if the copyKey
** flag is set.  NULL is returned.
**
** If another element already exists with the same key, then the
** new data replaces the old data and the old data is returned.
** The key is not copied in this instance.  If a malloc fails, then
** the new data is returned and the hash table is unchanged.
**
** If the "data" parameter to this function is NULL, then the
** element corresponding to "key" is removed from the hash table.
*/
void *sqlite3Fts1HashInsert(
  fts1Hash *pH,        /* The hash table to insert into */
  const void *pKey,    /* The key */
  int nKey,            /* Number of bytes in the key */
  void *data           /* The data */
){
  int hraw;                 /* Raw hash value of the key */
  int h;                    /* the hash of the key modulo hash table size */
  fts1HashElem *elem;       /* Used to loop thru the element list */
  fts1HashElem *new_elem;   /* New element added to the pH */
  int (*xHash)(const void*,int);  /* The hash function */

  assert( pH!=0 );
  xHash = hashFunction(pH->keyClass);
  assert( xHash!=0 );
  hraw = (*xHash)(pKey, nKey);
  assert( (pH->htsize & (pH->htsize-1))==0 );   /* htsize is a power of 2 */
  h = hraw & (pH->htsize-1);
  elem = findElementGivenHash(pH,pKey,nKey,h);
  if( elem ){
    /* Key already present: replace (or, if data==0, delete). */
    void *old_data = elem->data;
    if( data==0 ){
      removeElementGivenHash(pH,elem,h);
    }else{
      elem->data = data;
    }
    return old_data;
  }
  if( data==0 ) return 0;   /* delete of a non-existent key is a no-op */
  new_elem = (fts1HashElem*)pH->xMalloc( sizeof(fts1HashElem) );
  if( new_elem==0 ) return data;   /* OOM: table unchanged, data returned */
  if( pH->copyKey && pKey!=0 ){
    new_elem->pKey = pH->xMalloc( nKey );
    if( new_elem->pKey==0 ){
      pH->xFree(new_elem);
      return data;
    }
    memcpy((void*)new_elem->pKey, pKey, nKey);
  }else{
    new_elem->pKey = (void*)pKey;   /* caller retains ownership of the key */
  }
  new_elem->nKey = nKey;
  pH->count++;
  if( pH->htsize==0 ){
    /* First insert: lazily create the initial 8-bucket array. */
    rehash(pH,8);
    if( pH->htsize==0 ){
      /* rehash failed (OOM): back out and report failure via data. */
      pH->count = 0;
      pH->xFree(new_elem);
      return data;
    }
  }
  if( pH->count > pH->htsize ){
    /* Keep load factor <= 1 by doubling the bucket count. */
    rehash(pH,pH->htsize*2);
  }
  assert( pH->htsize>0 );
  assert( (pH->htsize & (pH->htsize-1))==0 );
  /* Recompute the bucket index: rehash() may have grown the table. */
  h = hraw & (pH->htsize-1);
  insertElement(pH, &pH->ht[h], new_elem);
  new_elem->data = data;
  return 0;
}
368 | |||
369 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h deleted file mode 100644 index c31c430..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h +++ /dev/null | |||
@@ -1,112 +0,0 @@ | |||
/*
** 2001 September 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This is the header file for the generic hash-table implementation
** used in SQLite.  We've modified it slightly to serve as a standalone
** hash table implementation for the full-text indexing module.
**
*/
#ifndef _FTS1_HASH_H_
#define _FTS1_HASH_H_

/* Forward declarations of structures. */
typedef struct fts1Hash fts1Hash;
typedef struct fts1HashElem fts1HashElem;

/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly.  Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct fts1Hash {
  char keyClass;          /* FTS1_HASH_STRING or FTS1_HASH_BINARY */
  char copyKey;           /* True if copy of key made on insert */
  int count;              /* Number of entries in this table */
  fts1HashElem *first;    /* The first element of the array */
  void *(*xMalloc)(int);  /* malloc() function to use */
  void (*xFree)(void *);  /* free() function to use */
  int htsize;             /* Number of buckets in the hash table */
  struct _fts1ht {        /* the hash table */
    int count;               /* Number of entries with this hash */
    fts1HashElem *chain;     /* Pointer to first entry with this hash */
  } *ht;
};

/* Each element in the hash table is an instance of the following
** structure.  All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct fts1HashElem {
  fts1HashElem *next, *prev;  /* Next and previous elements in the table */
  void *data;                 /* Data associated with this element */
  void *pKey; int nKey;       /* Key associated with this element */
};

/*
** There are 2 different modes of operation for a hash table:
**
**   FTS1_HASH_STRING        pKey points to a string that is nKey bytes long
**                           (including the null-terminator, if any).  Case
**                           is respected in comparisons.
**
**   FTS1_HASH_BINARY        pKey points to binary data nKey bytes long.
**                           memcmp() is used to compare keys.
**
** A copy of the key is made if the copyKey parameter to fts1HashInit is 1.
*/
#define FTS1_HASH_STRING    1
#define FTS1_HASH_BINARY    2

/*
** Access routines.  To delete an entry, insert a NULL pointer as the data.
*/
void sqlite3Fts1HashInit(fts1Hash*, int keytype, int copyKey);
void *sqlite3Fts1HashInsert(fts1Hash*, const void *pKey, int nKey, void *pData);
void *sqlite3Fts1HashFind(const fts1Hash*, const void *pKey, int nKey);
void sqlite3Fts1HashClear(fts1Hash*);

/*
** Shorthand for the functions above
*/
#define fts1HashInit   sqlite3Fts1HashInit
#define fts1HashInsert sqlite3Fts1HashInsert
#define fts1HashFind   sqlite3Fts1HashFind
#define fts1HashClear  sqlite3Fts1HashClear

/*
** Macros for looping over all elements of a hash table.  The idiom is
** like this:
**
**   fts1Hash h;
**   fts1HashElem *p;
**   ...
**   for(p=fts1HashFirst(&h); p; p=fts1HashNext(p)){
**     SomeStructure *pData = fts1HashData(p);
**     // do something with pData
**   }
*/
#define fts1HashFirst(H)  ((H)->first)
#define fts1HashNext(E)   ((E)->next)
#define fts1HashData(E)   ((E)->data)
#define fts1HashKey(E)    ((E)->pKey)
#define fts1HashKeysize(E) ((E)->nKey)

/*
** Number of entries in a hash table
*/
#define fts1HashCount(H)  ((H)->count)

#endif /* _FTS1_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c deleted file mode 100644 index 1d26236..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c +++ /dev/null | |||
@@ -1,643 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 September 30 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** Implementation of the full-text-search tokenizer that implements | ||
13 | ** a Porter stemmer. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | ** The code in this file is only compiled if: | ||
18 | ** | ||
19 | ** * The FTS1 module is being built as an extension | ||
20 | ** (in which case SQLITE_CORE is not defined), or | ||
21 | ** | ||
22 | ** * The FTS1 module is being built into the core of | ||
23 | ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). | ||
24 | */ | ||
25 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) | ||
26 | |||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <stdio.h> | ||
31 | #include <string.h> | ||
32 | #include <ctype.h> | ||
33 | |||
34 | #include "fts1_tokenizer.h" | ||
35 | |||
/*
** Class derived from sqlite3_tokenizer.  The base member MUST be the
** first field: porterCreate() hands out &t->base and porterDestroy()
** free()s that same pointer, which is only valid because the struct
** and its base share an address.
*/
typedef struct porter_tokenizer {
  sqlite3_tokenizer base;      /* Base class */
} porter_tokenizer;
42 | |||
/*
** Class derived from sqlite3_tokenizer_cursor.  As with
** porter_tokenizer, the base member must come first so the cursor can
** be passed around as a plain sqlite3_tokenizer_cursor*.
*/
typedef struct porter_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *zInput;          /* input we are tokenizing */
  int nInput;                  /* size of the input */
  int iOffset;                 /* current position in zInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  int nAllocated;              /* space allocated to zToken buffer */
} porter_tokenizer_cursor;
55 | |||
56 | |||
57 | /* Forward declaration */ | ||
58 | static const sqlite3_tokenizer_module porterTokenizerModule; | ||
59 | |||
60 | |||
61 | /* | ||
62 | ** Create a new tokenizer instance. | ||
63 | */ | ||
64 | static int porterCreate( | ||
65 | int argc, const char * const *argv, | ||
66 | sqlite3_tokenizer **ppTokenizer | ||
67 | ){ | ||
68 | porter_tokenizer *t; | ||
69 | t = (porter_tokenizer *) calloc(sizeof(*t), 1); | ||
70 | if( t==NULL ) return SQLITE_NOMEM; | ||
71 | |||
72 | *ppTokenizer = &t->base; | ||
73 | return SQLITE_OK; | ||
74 | } | ||
75 | |||
/*
** Destroy a tokenizer.  The tokenizer was allocated as a single
** calloc() block in porterCreate() with the base as its first member,
** so free()ing the base pointer releases the whole object.
*/
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
  free(pTokenizer);
  return SQLITE_OK;
}
83 | |||
84 | /* | ||
85 | ** Prepare to begin tokenizing a particular string. The input | ||
86 | ** string to be tokenized is zInput[0..nInput-1]. A cursor | ||
87 | ** used to incrementally tokenize this string is returned in | ||
88 | ** *ppCursor. | ||
89 | */ | ||
90 | static int porterOpen( | ||
91 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
92 | const char *zInput, int nInput, /* String to be tokenized */ | ||
93 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
94 | ){ | ||
95 | porter_tokenizer_cursor *c; | ||
96 | |||
97 | c = (porter_tokenizer_cursor *) malloc(sizeof(*c)); | ||
98 | if( c==NULL ) return SQLITE_NOMEM; | ||
99 | |||
100 | c->zInput = zInput; | ||
101 | if( zInput==0 ){ | ||
102 | c->nInput = 0; | ||
103 | }else if( nInput<0 ){ | ||
104 | c->nInput = (int)strlen(zInput); | ||
105 | }else{ | ||
106 | c->nInput = nInput; | ||
107 | } | ||
108 | c->iOffset = 0; /* start tokenizing at the beginning */ | ||
109 | c->iToken = 0; | ||
110 | c->zToken = NULL; /* no space allocated, yet. */ | ||
111 | c->nAllocated = 0; | ||
112 | |||
113 | *ppCursor = &c->base; | ||
114 | return SQLITE_OK; | ||
115 | } | ||
116 | |||
/*
** Close a tokenization cursor previously opened by a call to
** porterOpen() above.  Frees the token buffer (free(NULL) is a no-op
** when no token was ever produced) and then the cursor itself.
*/
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  free(c->zToken);
  free(c);
  return SQLITE_OK;
}
/*
** Letter classification table, indexed by (letter - 'a'):
** 0 = vowel, 1 = consonant, 2 = 'y' (context-dependent).
*/
static const char cType[] = {
  0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
  1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() classify the first character of the
** string they point to according to Porter rules.
**
** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'Y' is a consonant unless it follows another consonant, in which
** case it is a vowel.
**
** In these routines the word is stored in REVERSE order, so the 'y'
** rule becomes: 'y' is a consonant unless it is FOLLOWED (in the
** buffer) by another consonant.  Hence the mutual recursion on z+1.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  int cls;
  char c = z[0];
  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cls = cType[c-'a'];
  if( cls<2 ) return cls;
  /* c is 'y': consonant iff at end of buffer or next char is a vowel. */
  return z[1]==0 || isVowel(z + 1);
}
static int isVowel(const char *z){
  int cls;
  char c = z[0];
  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cls = cType[c-'a'];
  if( cls<2 ) return 1-cls;
  /* c is 'y': vowel iff the next char is a consonant. */
  return isConsonant(z + 1);
}
167 | |||
/*
** Let any sequence of one or more vowels be represented by V and let
** C be a sequence of one or more consonants.  Then every word can be
** represented as:
**
**           [C] (VC){m} [V]
**
** In prose:  A word is an optional consonant followed by zero or more
** vowel-consonant pairs followed by an optional vowel.  "m" is the
** number of vowel-consonant pairs.
**
** Return true if the m-value for z is 1 or more.  In other words,
** return true if z contains at least one vowel that is followed
** by a consonant.
**
** z[] is in reverse order, so we are really looking for a consonant
** followed by a vowel.
*/
static int m_gt_0(const char *z){
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  return *z!=0;
}
193 | |||
/* Like m_gt_0 above except we are looking for a value of m which is
** exactly 1.  Skips the optional leading vowel run, then requires
** exactly one consonant run / vowel run pair with nothing after.
*/
static int m_eq_1(const char *z){
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  if( *z==0 ) return 0;
  for(; isVowel(z); z++){}
  if( *z==0 ) return 1;
  for(; isConsonant(z); z++){}
  return *z==0;
}
207 | |||
/* Like m_gt_0 above except we are looking for a value of m greater
** than 1 instead of greater than 0: i.e. at least two consonant/vowel
** alternations must remain after the optional leading vowel run.
*/
static int m_gt_1(const char *z){
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  if( *z==0 ) return 0;
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  return *z!=0;
}
221 | |||
/*
** Return TRUE if the NUL-terminated (reversed) word z contains at
** least one vowel.
*/
static int hasVowel(const char *z){
  while( isConsonant(z) ){ z++; }
  return *z!=0;
}
229 | |||
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here.  So we are really looking at
** the first two characters of z[].
*/
static int doubleConsonant(const char *z){
  if( !isConsonant(z) ) return 0;
  if( z[1]!=z[0] ) return 0;
  return isConsonant(z+1);
}
239 | |||
/*
** Return TRUE if the word ends with three letters which
** are consonant-vowel-consonant and where the final consonant
** is not 'w', 'x', or 'y'.
**
** The word is reversed here.  So we are really checking the
** first three letters and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
  if( z[0]==0 || !isConsonant(z) ) return 0;
  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
  if( z[1]==0 || !isVowel(z+1) ) return 0;
  return z[2]!=0 && isConsonant(z+2);
}
255 | |||
/*
** If the reversed word *pz begins with zFrom (i.e. the original word
** ends with the reversal of zFrom) and xCond() holds for the stem
** that remains after that ending, rewrite the ending as zTo.
**
** The input word *pz and zFrom are both in reverse order.  zTo
** is in normal order.
**
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
** match.  Note that TRUE is returned even if xCond() fails and
** no substitution occurs.
*/
static int stem(
  char **pz,                  /* The word being stemmed (Reversed) */
  const char *zFrom,          /* If the ending matches this... (Reversed) */
  const char *zTo,            /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*)   /* Condition that must be true, or NULL */
){
  char *z = *pz;
  /* Match the candidate ending character by character */
  for(; *zFrom; z++, zFrom++){
    if( *z!=*zFrom ) return 0;
  }
  if( xCond && !xCond(z) ) return 1;
  /* Write the replacement, growing backward over the old ending */
  while( *zTo ){
    *(--z) = *(zTo++);
  }
  *pz = z;
  return 1;
}
284 | |||
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate.  The input word is copied into the output with
** US-ASCII case folding.  If the input word is too long (more
** than 20 bytes if it contains no digits or more than 6 bytes if
** it contains digits) then word is truncated to 20 or 6 bytes
** by taking 10 or 3 bytes from the beginning and end.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, half;
  int seenDigit = 0;

  /* Case-fold into zOut while scanning for digits */
  for(i=0; i<nIn; i++){
    int c = zIn[i];
    if( c>='A' && c<='Z' ){
      zOut[i] = c - 'A' + 'a';
    }else{
      if( c>='0' && c<='9' ) seenDigit = 1;
      zOut[i] = c;
    }
  }

  /* Abbreviate over-long words: keep "half" bytes from each end */
  half = seenDigit ? 3 : 10;
  if( nIn > half*2 ){
    for(j=half, i=nIn-half; i<nIn; i++, j++){
      zOut[j] = zOut[i];
    }
    i = j;
  }
  zOut[i] = 0;
  *pnOut = i;
}
315 | |||
316 | |||
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
**
** If the input word contains no digits but does contain characters
** not in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
**
** Internally the word is processed in REVERSE order: it is copied
** backwards into zReverse[] and the stem()/m_*() helpers all operate
** on that reversed, NUL-terminated copy.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word, reversed and case-folded, toward the end of
  ** zReverse[].  j ends up just before the first character. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* NUL padding at the end allows the steps below to safely read a
  ** few characters past the word (e.g. z[3] in Step 4). */
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  z = &zReverse[j+1];


  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0)  ||
         stem(&z, "lb", "ble", 0)  ||
         stem(&z, "zi", "ize", 0)  ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2.  Note: all suffix arguments to stem() are reversed. */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4.  Suffix removal here mostly advances z directly instead
  ** of calling stem(), since no replacement text is written. */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
556 | |||
/*
** Characters that can be part of a token.  We assume any character
** whose value is greater than 0x80 (any UTF character) can be
** part of a token.  In other words, delimiters all must have
** values of 0x7f or lower.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* The table is indexed by ch-0x30 and covers ASCII 0x30..0x7f; the
** token characters are [0-9], [A-Z], '_', and [a-z].  NOTE: both
** macros assign to a local variable named "ch" as a side effect, so
** callers must declare an int ch in scope. */
#define idChar(C)  (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
573 | |||
574 | /* | ||
575 | ** Extract the next token from a tokenization cursor. The cursor must | ||
576 | ** have been opened by a prior call to porterOpen(). | ||
577 | */ | ||
578 | static int porterNext( | ||
579 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */ | ||
580 | const char **pzToken, /* OUT: *pzToken is the token text */ | ||
581 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
582 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
583 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
584 | int *piPosition /* OUT: Position integer of token */ | ||
585 | ){ | ||
586 | porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; | ||
587 | const char *z = c->zInput; | ||
588 | |||
589 | while( c->iOffset<c->nInput ){ | ||
590 | int iStartOffset, ch; | ||
591 | |||
592 | /* Scan past delimiter characters */ | ||
593 | while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){ | ||
594 | c->iOffset++; | ||
595 | } | ||
596 | |||
597 | /* Count non-delimiter characters. */ | ||
598 | iStartOffset = c->iOffset; | ||
599 | while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ | ||
600 | c->iOffset++; | ||
601 | } | ||
602 | |||
603 | if( c->iOffset>iStartOffset ){ | ||
604 | int n = c->iOffset-iStartOffset; | ||
605 | if( n>c->nAllocated ){ | ||
606 | c->nAllocated = n+20; | ||
607 | c->zToken = realloc(c->zToken, c->nAllocated); | ||
608 | if( c->zToken==NULL ) return SQLITE_NOMEM; | ||
609 | } | ||
610 | porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); | ||
611 | *pzToken = c->zToken; | ||
612 | *piStartOffset = iStartOffset; | ||
613 | *piEndOffset = c->iOffset; | ||
614 | *piPosition = c->iToken++; | ||
615 | return SQLITE_OK; | ||
616 | } | ||
617 | } | ||
618 | return SQLITE_DONE; | ||
619 | } | ||
620 | |||
/*
** The set of routines that implement the porter-stemmer tokenizer.
** This is the vtable handed back by sqlite3Fts1PorterTokenizerModule();
** iVersion is 0.
*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
  0,
  porterCreate,
  porterDestroy,
  porterOpen,
  porterClose,
  porterNext,
};
632 | |||
/*
** Allocate a new porter tokenizer.  Return a pointer to the new
** tokenizer in *ppModule.  (No allocation actually occurs: the module
** is a static singleton.)
*/
void sqlite3Fts1PorterTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &porterTokenizerModule;
}
642 | |||
643 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h deleted file mode 100644 index a48cb74..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h +++ /dev/null | |||
@@ -1,90 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 July 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. | ||
5 | ** | ||
6 | ************************************************************************* | ||
7 | ** Defines the interface to tokenizers used by fulltext-search. There | ||
8 | ** are three basic components: | ||
9 | ** | ||
10 | ** sqlite3_tokenizer_module is a singleton defining the tokenizer | ||
11 | ** interface functions. This is essentially the class structure for | ||
12 | ** tokenizers. | ||
13 | ** | ||
14 | ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps | ||
15 | ** including customization information defined at creation time. | ||
16 | ** | ||
17 | ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate | ||
18 | ** tokens from a particular input. | ||
19 | */ | ||
#ifndef _FTS1_TOKENIZER_H_
#define _FTS1_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"

/*
** Structures used by the tokenizer interface.
*/
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;

struct sqlite3_tokenizer_module {
  int iVersion;                  /* currently 0 */

  /*
  ** Create and destroy a tokenizer.  argc/argv are passed down from
  ** the fulltext virtual table creation to allow customization.
  */
  int (*xCreate)(int argc, const char *const*argv,
                 sqlite3_tokenizer **ppTokenizer);
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Tokenize a particular input.  Call xOpen() to prepare to
  ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
  ** xClose() to free any internal state.  The pInput passed to
  ** xOpen() must exist until the cursor is closed.  The ppToken
  ** result from xNext() is only valid until the next call to xNext()
  ** or until xClose() is called.
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xOpen)(sqlite3_tokenizer *pTokenizer,
               const char *pInput, int nBytes,
               sqlite3_tokenizer_cursor **ppCursor);
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
  int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
               const char **ppToken, int *pnBytes,
               int *piStartOffset, int *piEndOffset, int *piPosition);
};

/* Base "class" for a tokenizer instance. */
struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

/* Base "class" for a tokenization cursor (one per tokenized input). */
struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

/*
** Get the module for a tokenizer which generates tokens based on a
** set of non-token characters.  The default is to break tokens at any
** non-alnum character, though the set of delimiters can also be
** specified by the first argv argument to xCreate().
*/
/* TODO(shess) This doesn't belong here.  Need some sort of
** registration process.
*/
void sqlite3Fts1SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts1PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);

#endif /* _FTS1_TOKENIZER_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c deleted file mode 100644 index f58fba8..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c +++ /dev/null | |||
@@ -1,221 +0,0 @@ | |||
1 | /* | ||
2 | ** The author disclaims copyright to this source code. | ||
3 | ** | ||
4 | ************************************************************************* | ||
5 | ** Implementation of the "simple" full-text-search tokenizer. | ||
6 | */ | ||
7 | |||
8 | /* | ||
9 | ** The code in this file is only compiled if: | ||
10 | ** | ||
11 | ** * The FTS1 module is being built as an extension | ||
12 | ** (in which case SQLITE_CORE is not defined), or | ||
13 | ** | ||
14 | ** * The FTS1 module is being built into the core of | ||
15 | ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). | ||
16 | */ | ||
17 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) | ||
18 | |||
19 | |||
20 | #include <assert.h> | ||
21 | #include <stdlib.h> | ||
22 | #include <stdio.h> | ||
23 | #include <string.h> | ||
24 | #include <ctype.h> | ||
25 | |||
26 | #include "fts1_tokenizer.h" | ||
27 | |||
/* A "simple" tokenizer instance: splits on a fixed set of ASCII
** delimiter characters (configured in simpleCreate()). */
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;

/* Cursor state for one tokenization pass over a single input. */
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
42 | |||
43 | |||
44 | /* Forward declaration */ | ||
45 | static const sqlite3_tokenizer_module simpleTokenizerModule; | ||
46 | |||
47 | static int isDelim(simple_tokenizer *t, unsigned char c){ | ||
48 | return c<0x80 && t->delim[c]; | ||
49 | } | ||
50 | |||
51 | /* | ||
52 | ** Create a new tokenizer instance. | ||
53 | */ | ||
54 | static int simpleCreate( | ||
55 | int argc, const char * const *argv, | ||
56 | sqlite3_tokenizer **ppTokenizer | ||
57 | ){ | ||
58 | simple_tokenizer *t; | ||
59 | |||
60 | t = (simple_tokenizer *) calloc(sizeof(*t), 1); | ||
61 | if( t==NULL ) return SQLITE_NOMEM; | ||
62 | |||
63 | /* TODO(shess) Delimiters need to remain the same from run to run, | ||
64 | ** else we need to reindex. One solution would be a meta-table to | ||
65 | ** track such information in the database, then we'd only want this | ||
66 | ** information on the initial create. | ||
67 | */ | ||
68 | if( argc>1 ){ | ||
69 | int i, n = strlen(argv[1]); | ||
70 | for(i=0; i<n; i++){ | ||
71 | unsigned char ch = argv[1][i]; | ||
72 | /* We explicitly don't support UTF-8 delimiters for now. */ | ||
73 | if( ch>=0x80 ){ | ||
74 | free(t); | ||
75 | return SQLITE_ERROR; | ||
76 | } | ||
77 | t->delim[ch] = 1; | ||
78 | } | ||
79 | } else { | ||
80 | /* Mark non-alphanumeric ASCII characters as delimiters */ | ||
81 | int i; | ||
82 | for(i=1; i<0x80; i++){ | ||
83 | t->delim[i] = !isalnum(i); | ||
84 | } | ||
85 | } | ||
86 | |||
87 | *ppTokenizer = &t->base; | ||
88 | return SQLITE_OK; | ||
89 | } | ||
90 | |||
/*
** Destroy a tokenizer previously created by simpleCreate().
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  free(pTokenizer);   /* base is the first member, so this frees the whole struct */
  return SQLITE_OK;
}
98 | |||
99 | /* | ||
100 | ** Prepare to begin tokenizing a particular string. The input | ||
101 | ** string to be tokenized is pInput[0..nBytes-1]. A cursor | ||
102 | ** used to incrementally tokenize this string is returned in | ||
103 | ** *ppCursor. | ||
104 | */ | ||
105 | static int simpleOpen( | ||
106 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
107 | const char *pInput, int nBytes, /* String to be tokenized */ | ||
108 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
109 | ){ | ||
110 | simple_tokenizer_cursor *c; | ||
111 | |||
112 | c = (simple_tokenizer_cursor *) malloc(sizeof(*c)); | ||
113 | if( c==NULL ) return SQLITE_NOMEM; | ||
114 | |||
115 | c->pInput = pInput; | ||
116 | if( pInput==0 ){ | ||
117 | c->nBytes = 0; | ||
118 | }else if( nBytes<0 ){ | ||
119 | c->nBytes = (int)strlen(pInput); | ||
120 | }else{ | ||
121 | c->nBytes = nBytes; | ||
122 | } | ||
123 | c->iOffset = 0; /* start tokenizing at the beginning */ | ||
124 | c->iToken = 0; | ||
125 | c->pToken = NULL; /* no space allocated, yet. */ | ||
126 | c->nTokenAllocated = 0; | ||
127 | |||
128 | *ppCursor = &c->base; | ||
129 | return SQLITE_OK; | ||
130 | } | ||
131 | |||
/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.  Releases the token buffer and the cursor.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  free(c->pToken);   /* free(NULL) is a no-op if no token was ever produced */
  free(c);
  return SQLITE_OK;
}
142 | |||
143 | /* | ||
144 | ** Extract the next token from a tokenization cursor. The cursor must | ||
145 | ** have been opened by a prior call to simpleOpen(). | ||
146 | */ | ||
147 | static int simpleNext( | ||
148 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ | ||
149 | const char **ppToken, /* OUT: *ppToken is the token text */ | ||
150 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
151 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
152 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
153 | int *piPosition /* OUT: Position integer of token */ | ||
154 | ){ | ||
155 | simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; | ||
156 | simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; | ||
157 | unsigned char *p = (unsigned char *)c->pInput; | ||
158 | |||
159 | while( c->iOffset<c->nBytes ){ | ||
160 | int iStartOffset; | ||
161 | |||
162 | /* Scan past delimiter characters */ | ||
163 | while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){ | ||
164 | c->iOffset++; | ||
165 | } | ||
166 | |||
167 | /* Count non-delimiter characters. */ | ||
168 | iStartOffset = c->iOffset; | ||
169 | while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){ | ||
170 | c->iOffset++; | ||
171 | } | ||
172 | |||
173 | if( c->iOffset>iStartOffset ){ | ||
174 | int i, n = c->iOffset-iStartOffset; | ||
175 | if( n>c->nTokenAllocated ){ | ||
176 | c->nTokenAllocated = n+20; | ||
177 | c->pToken = realloc(c->pToken, c->nTokenAllocated); | ||
178 | if( c->pToken==NULL ) return SQLITE_NOMEM; | ||
179 | } | ||
180 | for(i=0; i<n; i++){ | ||
181 | /* TODO(shess) This needs expansion to handle UTF-8 | ||
182 | ** case-insensitivity. | ||
183 | */ | ||
184 | unsigned char ch = p[iStartOffset+i]; | ||
185 | c->pToken[i] = ch<0x80 ? tolower(ch) : ch; | ||
186 | } | ||
187 | *ppToken = c->pToken; | ||
188 | *pnBytes = n; | ||
189 | *piStartOffset = iStartOffset; | ||
190 | *piEndOffset = c->iOffset; | ||
191 | *piPosition = c->iToken++; | ||
192 | |||
193 | return SQLITE_OK; | ||
194 | } | ||
195 | } | ||
196 | return SQLITE_DONE; | ||
197 | } | ||
198 | |||
/*
** The set of routines that implement the simple tokenizer.
** This is the vtable handed back by sqlite3Fts1SimpleTokenizerModule();
** iVersion is 0.
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,
  simpleDestroy,
  simpleOpen,
  simpleClose,
  simpleNext,
};
210 | |||
/*
** Allocate a new simple tokenizer.  Return a pointer to the new
** tokenizer in *ppModule.  (No allocation actually occurs: the module
** is a static singleton.)
*/
void sqlite3Fts1SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &simpleTokenizerModule;
}
220 | |||
221 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers deleted file mode 100644 index 6625b31..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | |||
2 | 1. FTS2 Tokenizers | ||
3 | |||
4 | When creating a new full-text table, FTS2 allows the user to select | ||
5 | the text tokenizer implementation to be used when indexing text | ||
6 | by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE | ||
7 | statement: | ||
8 | |||
9 | CREATE VIRTUAL TABLE <table-name> USING fts2( | ||
10 | <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]] | ||
11 | ); | ||
12 | |||
13 | The built-in tokenizers (valid values to pass as <tokenizer name>) are | ||
14 | "simple" and "porter". | ||
15 | |||
16 | <tokenizer-args> should consist of zero or more white-space separated | ||
17 | arguments to pass to the selected tokenizer implementation. The | ||
18 | interpretation of the arguments, if any, depends on the individual | ||
19 | tokenizer. | ||
20 | |||
21 | 2. Custom Tokenizers | ||
22 | |||
23 | FTS2 allows users to provide custom tokenizer implementations. The | ||
24 | interface used to create a new tokenizer is defined and described in | ||
25 | the fts2_tokenizer.h source file. | ||
26 | |||
27 | Registering a new FTS2 tokenizer is similar to registering a new | ||
28 | virtual table module with SQLite. The user passes a pointer to a | ||
29 | structure containing pointers to various callback functions that | ||
30 | make up the implementation of the new tokenizer type. For tokenizers, | ||
31 | the structure (defined in fts2_tokenizer.h) is called | ||
32 | "sqlite3_tokenizer_module". | ||
33 | |||
34 | FTS2 does not expose a C-function that users call to register new | ||
35 | tokenizer types with a database handle. Instead, the pointer must | ||
36 | be encoded as an SQL blob value and passed to FTS2 through the SQL | ||
37 | engine by evaluating a special scalar function, "fts2_tokenizer()". | ||
38 | The fts2_tokenizer() function may be called with one or two arguments, | ||
39 | as follows: | ||
40 | |||
41 | SELECT fts2_tokenizer(<tokenizer-name>); | ||
42 | SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>); | ||
43 | |||
44 | Where <tokenizer-name> is a string identifying the tokenizer and | ||
45 | <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module | ||
46 | structure encoded as an SQL blob. If the second argument is present, | ||
47 | it is registered as tokenizer <tokenizer-name> and a copy of it | ||
48 | returned. If only one argument is passed, a pointer to the tokenizer | ||
49 | implementation currently registered as <tokenizer-name> is returned, | ||
50 | encoded as a blob. Or, if no such tokenizer exists, an SQL exception | ||
51 | (error) is raised. | ||
52 | |||
53 | SECURITY: If the fts2 extension is used in an environment where potentially | ||
54 | malicious users may execute arbitrary SQL (i.e. gears), they should be | ||
55 | prevented from invoking the fts2_tokenizer() function, possibly using the | ||
56 | authorisation callback. | ||
57 | |||
58 | See "Sample code" below for an example of calling the fts2_tokenizer() | ||
59 | function from C code. | ||
60 | |||
61 | 3. ICU Library Tokenizers | ||
62 | |||
63 | If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor | ||
64 | symbol defined, then there exists a built-in tokenizer named "icu" | ||
65 | implemented using the ICU library. The first argument passed to the | ||
66 | xCreate() method (see fts2_tokenizer.h) of this tokenizer may be | ||
67 | an ICU locale identifier. For example "tr_TR" for Turkish as used | ||
68 | in Turkey, or "en_AU" for English as used in Australia. For example: | ||
69 | |||
70 | "CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)" | ||
71 | |||
72 | The ICU tokenizer implementation is very simple. It splits the input | ||
73 | text according to the ICU rules for finding word boundaries and discards | ||
74 | any tokens that consist entirely of white-space. This may be suitable | ||
75 | for some applications in some locales, but not all. If more complex | ||
76 | processing is required, for example to implement stemming or | ||
77 | discard punctuation, this can be done by creating a tokenizer | ||
implementation that uses the ICU tokenizer as part of its implementation.
79 | |||
80 | When using the ICU tokenizer this way, it is safe to overwrite the | ||
81 | contents of the strings returned by the xNext() method (see | ||
82 | fts2_tokenizer.h). | ||
83 | |||
84 | 4. Sample code. | ||
85 | |||
86 | The following two code samples illustrate the way C code should invoke | ||
87 | the fts2_tokenizer() scalar function: | ||
88 | |||
89 | int registerTokenizer( | ||
90 | sqlite3 *db, | ||
91 | char *zName, | ||
92 | const sqlite3_tokenizer_module *p | ||
93 | ){ | ||
94 | int rc; | ||
95 | sqlite3_stmt *pStmt; | ||
96 | const char zSql[] = "SELECT fts2_tokenizer(?, ?)"; | ||
97 | |||
98 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
99 | if( rc!=SQLITE_OK ){ | ||
100 | return rc; | ||
101 | } | ||
102 | |||
103 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
104 | sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); | ||
105 | sqlite3_step(pStmt); | ||
106 | |||
107 | return sqlite3_finalize(pStmt); | ||
108 | } | ||
109 | |||
110 | int queryTokenizer( | ||
111 | sqlite3 *db, | ||
112 | char *zName, | ||
113 | const sqlite3_tokenizer_module **pp | ||
114 | ){ | ||
115 | int rc; | ||
116 | sqlite3_stmt *pStmt; | ||
117 | const char zSql[] = "SELECT fts2_tokenizer(?)"; | ||
118 | |||
119 | *pp = 0; | ||
120 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
121 | if( rc!=SQLITE_OK ){ | ||
122 | return rc; | ||
123 | } | ||
124 | |||
125 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
126 | if( SQLITE_ROW==sqlite3_step(pStmt) ){ | ||
127 | if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ | ||
128 | memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | return sqlite3_finalize(pStmt); | ||
133 | } | ||
134 | |||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt deleted file mode 100644 index 517a2a0..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt +++ /dev/null | |||
@@ -1,4 +0,0 @@ | |||
1 | This folder contains source code to the second full-text search | ||
2 | extension for SQLite. While the API is the same, this version uses a | ||
3 | substantially different storage schema from fts1, so tables will need | ||
4 | to be rebuilt. | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c deleted file mode 100644 index 65ad173..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c +++ /dev/null | |||
@@ -1,5936 +0,0 @@ | |||
1 | /* fts2 has a design flaw which can lead to database corruption (see | ||
2 | ** below). It is recommended not to use it any longer, instead use | ||
3 | ** fts3 (or higher). If you believe that your use of fts2 is safe, | ||
4 | ** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS. | ||
5 | */ | ||
6 | #ifndef SQLITE_ENABLE_BROKEN_FTS2 | ||
7 | #error fts2 has a design flaw and has been deprecated. | ||
8 | #endif | ||
9 | /* The flaw is that fts2 uses the content table's unaliased rowid as | ||
10 | ** the unique docid. fts2 embeds the rowid in the index it builds, | ||
11 | ** and expects the rowid to not change. The SQLite VACUUM operation | ||
12 | ** will renumber such rowids, thereby breaking fts2. If you are using | ||
13 | ** fts2 in a system which has disabled VACUUM, then you can continue | ||
14 | ** to use it safely. Note that PRAGMA auto_vacuum does NOT disable | ||
15 | ** VACUUM, though systems using auto_vacuum are unlikely to invoke | ||
16 | ** VACUUM. | ||
17 | ** | ||
18 | ** Unlike fts1, which is safe across VACUUM if you never delete | ||
19 | ** documents, fts2 has a second exposure to this flaw, in the segments | ||
20 | ** table. So fts2 should be considered unsafe across VACUUM in all | ||
21 | ** cases. | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | ** 2006 Oct 10 | ||
26 | ** | ||
27 | ** The author disclaims copyright to this source code. In place of | ||
28 | ** a legal notice, here is a blessing: | ||
29 | ** | ||
30 | ** May you do good and not evil. | ||
31 | ** May you find forgiveness for yourself and forgive others. | ||
32 | ** May you share freely, never taking more than you give. | ||
33 | ** | ||
34 | ****************************************************************************** | ||
35 | ** | ||
36 | ** This is an SQLite module implementing full-text search. | ||
37 | */ | ||
38 | |||
39 | /* | ||
40 | ** The code in this file is only compiled if: | ||
41 | ** | ||
42 | ** * The FTS2 module is being built as an extension | ||
43 | ** (in which case SQLITE_CORE is not defined), or | ||
44 | ** | ||
45 | ** * The FTS2 module is being built into the core of | ||
46 | ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). | ||
47 | */ | ||
48 | |||
49 | /* TODO(shess) Consider exporting this comment to an HTML file or the | ||
50 | ** wiki. | ||
51 | */ | ||
52 | /* The full-text index is stored in a series of b+tree (-like) | ||
53 | ** structures called segments which map terms to doclists. The | ||
54 | ** structures are like b+trees in layout, but are constructed from the | ||
55 | ** bottom up in optimal fashion and are not updatable. Since trees | ||
56 | ** are built from the bottom up, things will be described from the | ||
57 | ** bottom up. | ||
58 | ** | ||
59 | ** | ||
60 | **** Varints **** | ||
61 | ** The basic unit of encoding is a variable-length integer called a | ||
62 | ** varint. We encode variable-length integers in little-endian order | ||
63 | ** using seven bits per byte as follows: | ||
64 | ** | ||
65 | ** KEY: | ||
66 | ** A = 0xxxxxxx 7 bits of data and one flag bit | ||
67 | ** B = 1xxxxxxx 7 bits of data and one flag bit | ||
68 | ** | ||
69 | ** 7 bits - A | ||
70 | ** 14 bits - BA | ||
71 | ** 21 bits - BBA | ||
72 | ** and so on. | ||
73 | ** | ||
74 | ** This is identical to how sqlite encodes varints (see util.c). | ||
75 | ** | ||
76 | ** | ||
77 | **** Document lists **** | ||
78 | ** A doclist (document list) holds a docid-sorted list of hits for a | ||
79 | ** given term. Doclists hold docids, and can optionally associate | ||
80 | ** token positions and offsets with docids. | ||
81 | ** | ||
82 | ** A DL_POSITIONS_OFFSETS doclist is stored like this: | ||
83 | ** | ||
84 | ** array { | ||
85 | ** varint docid; | ||
86 | ** array { (position list for column 0) | ||
87 | ** varint position; (delta from previous position plus POS_BASE) | ||
88 | ** varint startOffset; (delta from previous startOffset) | ||
89 | ** varint endOffset; (delta from startOffset) | ||
90 | ** } | ||
91 | ** array { | ||
92 | ** varint POS_COLUMN; (marks start of position list for new column) | ||
93 | ** varint column; (index of new column) | ||
94 | ** array { | ||
95 | ** varint position; (delta from previous position plus POS_BASE) | ||
96 | ** varint startOffset;(delta from previous startOffset) | ||
97 | ** varint endOffset; (delta from startOffset) | ||
98 | ** } | ||
99 | ** } | ||
100 | ** varint POS_END; (marks end of positions for this document.) | ||
101 | ** } | ||
102 | ** | ||
103 | ** Here, array { X } means zero or more occurrences of X, adjacent in | ||
104 | ** memory. A "position" is an index of a token in the token stream | ||
105 | ** generated by the tokenizer, while an "offset" is a byte offset, | ||
106 | ** both based at 0. Note that POS_END and POS_COLUMN occur in the | ||
107 | ** same logical place as the position element, and act as sentinels | ||
108 | ** ending a position list array. | ||
109 | ** | ||
110 | ** A DL_POSITIONS doclist omits the startOffset and endOffset | ||
111 | ** information. A DL_DOCIDS doclist omits both the position and | ||
112 | ** offset information, becoming an array of varint-encoded docids. | ||
113 | ** | ||
114 | ** On-disk data is stored as type DL_DEFAULT, so we don't serialize | ||
115 | ** the type. Due to how deletion is implemented in the segmentation | ||
116 | ** system, on-disk doclists MUST store at least positions. | ||
117 | ** | ||
118 | ** | ||
119 | **** Segment leaf nodes **** | ||
120 | ** Segment leaf nodes store terms and doclists, ordered by term. Leaf | ||
121 | ** nodes are written using LeafWriter, and read using LeafReader (to | ||
122 | ** iterate through a single leaf node's data) and LeavesReader (to | ||
123 | ** iterate through a segment's entire leaf layer). Leaf nodes have | ||
124 | ** the format: | ||
125 | ** | ||
126 | ** varint iHeight; (height from leaf level, always 0) | ||
127 | ** varint nTerm; (length of first term) | ||
128 | ** char pTerm[nTerm]; (content of first term) | ||
129 | ** varint nDoclist; (length of term's associated doclist) | ||
130 | ** char pDoclist[nDoclist]; (content of doclist) | ||
131 | ** array { | ||
132 | ** (further terms are delta-encoded) | ||
133 | ** varint nPrefix; (length of prefix shared with previous term) | ||
134 | ** varint nSuffix; (length of unshared suffix) | ||
135 | ** char pTermSuffix[nSuffix];(unshared suffix of next term) | ||
136 | ** varint nDoclist; (length of term's associated doclist) | ||
137 | ** char pDoclist[nDoclist]; (content of doclist) | ||
138 | ** } | ||
139 | ** | ||
140 | ** Here, array { X } means zero or more occurrences of X, adjacent in | ||
141 | ** memory. | ||
142 | ** | ||
143 | ** Leaf nodes are broken into blocks which are stored contiguously in | ||
144 | ** the %_segments table in sorted order. This means that when the end | ||
145 | ** of a node is reached, the next term is in the node with the next | ||
146 | ** greater node id. | ||
147 | ** | ||
148 | ** New data is spilled to a new leaf node when the current node | ||
149 | ** exceeds LEAF_MAX bytes (default 2048). New data which itself is | ||
150 | ** larger than STANDALONE_MIN (default 1024) is placed in a standalone | ||
151 | ** node (a leaf node with a single term and doclist). The goal of | ||
152 | ** these settings is to pack together groups of small doclists while | ||
153 | ** making it efficient to directly access large doclists. The | ||
154 | ** assumption is that large doclists represent terms which are more | ||
155 | ** likely to be query targets. | ||
156 | ** | ||
157 | ** TODO(shess) It may be useful for blocking decisions to be more | ||
158 | ** dynamic. For instance, it may make more sense to have a 2.5k leaf | ||
159 | ** node rather than splitting into 2k and .5k nodes. My intuition is | ||
160 | ** that this might extend through 2x or 4x the pagesize. | ||
161 | ** | ||
162 | ** | ||
163 | **** Segment interior nodes **** | ||
164 | ** Segment interior nodes store blockids for subtree nodes and terms | ||
165 | ** to describe what data is stored by each subtree. Interior | ||
166 | ** nodes are written using InteriorWriter, and read using | ||
167 | ** InteriorReader. InteriorWriters are created as needed when | ||
168 | ** SegmentWriter creates new leaf nodes, or when an interior node | ||
169 | ** itself grows too big and must be split. The format of interior | ||
170 | ** nodes: | ||
171 | ** | ||
172 | ** varint iHeight; (height from leaf level, always >0) | ||
173 | ** varint iBlockid; (block id of node's leftmost subtree) | ||
174 | ** optional { | ||
175 | ** varint nTerm; (length of first term) | ||
176 | ** char pTerm[nTerm]; (content of first term) | ||
177 | ** array { | ||
178 | ** (further terms are delta-encoded) | ||
179 | ** varint nPrefix; (length of shared prefix with previous term) | ||
180 | ** varint nSuffix; (length of unshared suffix) | ||
181 | ** char pTermSuffix[nSuffix]; (unshared suffix of next term) | ||
182 | ** } | ||
183 | ** } | ||
184 | ** | ||
185 | ** Here, optional { X } means an optional element, while array { X } | ||
186 | ** means zero or more occurrences of X, adjacent in memory. | ||
187 | ** | ||
188 | ** An interior node encodes n terms separating n+1 subtrees. The | ||
189 | ** subtree blocks are contiguous, so only the first subtree's blockid | ||
190 | ** is encoded. The subtree at iBlockid will contain all terms less | ||
191 | ** than the first term encoded (or all terms if no term is encoded). | ||
192 | ** Otherwise, for terms greater than or equal to pTerm[i] but less | ||
193 | ** than pTerm[i+1], the subtree for that term will be rooted at | ||
194 | ** iBlockid+i. Interior nodes only store enough term data to | ||
195 | ** distinguish adjacent children (if the rightmost term of the left | ||
196 | ** child is "something", and the leftmost term of the right child is | ||
197 | ** "wicked", only "w" is stored). | ||
198 | ** | ||
199 | ** New data is spilled to a new interior node at the same height when | ||
200 | ** the current node exceeds INTERIOR_MAX bytes (default 2048). | ||
201 | ** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing | ||
202 | ** interior nodes and making the tree too skinny. The interior nodes | ||
203 | ** at a given height are naturally tracked by interior nodes at | ||
204 | ** height+1, and so on. | ||
205 | ** | ||
206 | ** | ||
207 | **** Segment directory **** | ||
208 | ** The segment directory in table %_segdir stores meta-information for | ||
209 | ** merging and deleting segments, and also the root node of the | ||
210 | ** segment's tree. | ||
211 | ** | ||
212 | ** The root node is the top node of the segment's tree after encoding | ||
213 | ** the entire segment, restricted to ROOT_MAX bytes (default 1024). | ||
214 | ** This could be either a leaf node or an interior node. If the top | ||
215 | ** node requires more than ROOT_MAX bytes, it is flushed to %_segments | ||
216 | ** and a new root interior node is generated (which should always fit | ||
217 | ** within ROOT_MAX because it only needs space for 2 varints, the | ||
218 | ** height and the blockid of the previous root). | ||
219 | ** | ||
220 | ** The meta-information in the segment directory is: | ||
221 | ** level - segment level (see below) | ||
222 | ** idx - index within level | ||
223 | ** - (level,idx uniquely identify a segment) | ||
224 | ** start_block - first leaf node | ||
225 | ** leaves_end_block - last leaf node | ||
226 | ** end_block - last block (including interior nodes) | ||
227 | ** root - contents of root node | ||
228 | ** | ||
229 | ** If the root node is a leaf node, then start_block, | ||
230 | ** leaves_end_block, and end_block are all 0. | ||
231 | ** | ||
232 | ** | ||
233 | **** Segment merging **** | ||
234 | ** To amortize update costs, segments are grouped into levels and | ||
235 | ** merged in batches. Each increase in level represents exponentially | ||
236 | ** more documents. | ||
237 | ** | ||
238 | ** New documents (actually, document updates) are tokenized and | ||
239 | ** written individually (using LeafWriter) to a level 0 segment, with | ||
240 | ** incrementing idx. When idx reaches MERGE_COUNT (default 16), all | ||
241 | ** level 0 segments are merged into a single level 1 segment. Level 1 | ||
242 | ** is populated like level 0, and eventually MERGE_COUNT level 1 | ||
243 | ** segments are merged to a single level 2 segment (representing | ||
244 | ** MERGE_COUNT^2 updates), and so on. | ||
245 | ** | ||
246 | ** A segment merge traverses all segments at a given level in | ||
247 | ** parallel, performing a straightforward sorted merge. Since segment | ||
248 | ** leaf nodes are written in to the %_segments table in order, this | ||
249 | ** merge traverses the underlying sqlite disk structures efficiently. | ||
250 | ** After the merge, all segment blocks from the merged level are | ||
251 | ** deleted. | ||
252 | ** | ||
253 | ** MERGE_COUNT controls how often we merge segments. 16 seems to be | ||
254 | ** somewhat of a sweet spot for insertion performance. 32 and 64 show | ||
255 | ** very similar performance numbers to 16 on insertion, though they're | ||
256 | ** a tiny bit slower (perhaps due to more overhead in merge-time | ||
257 | ** sorting). 8 is about 20% slower than 16, 4 about 50% slower than | ||
258 | ** 16, 2 about 66% slower than 16. | ||
259 | ** | ||
260 | ** At query time, high MERGE_COUNT increases the number of segments | ||
261 | ** which need to be scanned and merged. For instance, with 100k docs | ||
262 | ** inserted: | ||
263 | ** | ||
264 | ** MERGE_COUNT segments | ||
265 | ** 16 25 | ||
266 | ** 8 12 | ||
267 | ** 4 10 | ||
268 | ** 2 6 | ||
269 | ** | ||
270 | ** This appears to have only a moderate impact on queries for very | ||
271 | ** frequent terms (which are somewhat dominated by segment merge | ||
272 | ** costs), and infrequent and non-existent terms still seem to be fast | ||
273 | ** even with many segments. | ||
274 | ** | ||
275 | ** TODO(shess) That said, it would be nice to have a better query-side | ||
276 | ** argument for MERGE_COUNT of 16. Also, it's possible/likely that | ||
277 | ** optimizations to things like doclist merging will swing the sweet | ||
278 | ** spot around. | ||
279 | ** | ||
280 | ** | ||
281 | ** | ||
282 | **** Handling of deletions and updates **** | ||
283 | ** Since we're using a segmented structure, with no docid-oriented | ||
284 | ** index into the term index, we clearly cannot simply update the term | ||
285 | ** index when a document is deleted or updated. For deletions, we | ||
286 | ** write an empty doclist (varint(docid) varint(POS_END)), for updates | ||
287 | ** we simply write the new doclist. Segment merges overwrite older | ||
288 | ** data for a particular docid with newer data, so deletes or updates | ||
289 | ** will eventually overtake the earlier data and knock it out. The | ||
290 | ** query logic likewise merges doclists so that newer data knocks out | ||
291 | ** older data. | ||
292 | ** | ||
293 | ** TODO(shess) Provide a VACUUM type operation to clear out all | ||
294 | ** deletions and duplications. This would basically be a forced merge | ||
295 | ** into a single segment. | ||
296 | */ | ||
297 | |||
298 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) | ||
299 | |||
300 | #if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE) | ||
301 | # define SQLITE_CORE 1 | ||
302 | #endif | ||
303 | |||
304 | #include <assert.h> | ||
305 | #include <stdlib.h> | ||
306 | #include <stdio.h> | ||
307 | #include <string.h> | ||
308 | #include <ctype.h> | ||
309 | |||
310 | #include "fts2.h" | ||
311 | #include "fts2_hash.h" | ||
312 | #include "fts2_tokenizer.h" | ||
313 | #include "sqlite3.h" | ||
314 | #include "sqlite3ext.h" | ||
315 | SQLITE_EXTENSION_INIT1 | ||
316 | |||
317 | |||
318 | /* TODO(shess) MAN, this thing needs some refactoring. At minimum, it | ||
319 | ** would be nice to order the file better, perhaps something along the | ||
320 | ** lines of: | ||
321 | ** | ||
322 | ** - utility functions | ||
323 | ** - table setup functions | ||
324 | ** - table update functions | ||
325 | ** - table query functions | ||
326 | ** | ||
327 | ** Put the query functions last because they're likely to reference | ||
328 | ** typedefs or functions from the table update section. | ||
329 | */ | ||
330 | |||
331 | #if 0 | ||
332 | # define TRACE(A) printf A; fflush(stdout) | ||
333 | #else | ||
334 | # define TRACE(A) | ||
335 | #endif | ||
336 | |||
337 | /* It is not safe to call isspace(), tolower(), or isalnum() on | ||
338 | ** hi-bit-set characters. This is the same solution used in the | ||
339 | ** tokenizer. | ||
340 | */ | ||
341 | /* TODO(shess) The snippet-generation code should be using the | ||
342 | ** tokenizer-generated tokens rather than doing its own local | ||
343 | ** tokenization. | ||
344 | */ | ||
345 | /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ | ||
/* isspace() that is safe for hi-bit-set (possibly negative) chars:
** such bytes are never treated as white space.
*/
static int safe_isspace(char c){
  if( c&0x80 ) return 0;
  return isspace(c);
}
/* tolower() that is safe for hi-bit-set chars: they pass through
** unchanged rather than being handed to the ctype routine.
*/
static int safe_tolower(char c){
  if( c&0x80 ) return c;
  return tolower(c);
}
/* isalnum() that is safe for hi-bit-set chars: such bytes are never
** classified as alphanumeric.
*/
static int safe_isalnum(char c){
  if( c&0x80 ) return 0;
  return isalnum(c);
}
355 | |||
/* How much detail a doclist carries for each docid it holds. */
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;
361 | |||
362 | /* | ||
363 | ** By default, only positions and not offsets are stored in the doclists. | ||
364 | ** To change this so that offsets are stored too, compile with | ||
365 | ** | ||
366 | ** -DDL_DEFAULT=DL_POSITIONS_OFFSETS | ||
367 | ** | ||
368 | ** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted | ||
369 | ** into (no deletes or updates). | ||
370 | */ | ||
371 | #ifndef DL_DEFAULT | ||
372 | # define DL_DEFAULT DL_POSITIONS | ||
373 | #endif | ||
374 | |||
/* Sentinel values used inside position lists.  Real positions are
** stored offset by POS_BASE so they can never collide with POS_END
** or POS_COLUMN (see the doclist format comment at the top of file).
*/
enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};
380 | |||
381 | /* MERGE_COUNT controls how often we merge segments (see comment at | ||
382 | ** top of file). | ||
383 | */ | ||
384 | #define MERGE_COUNT 16 | ||
385 | |||
386 | /* utility functions */ | ||
387 | |||
388 | /* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single | ||
389 | ** record to prevent errors of the form: | ||
390 | ** | ||
391 | ** my_function(SomeType *b){ | ||
392 | ** memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b) | ||
393 | ** } | ||
394 | */ | ||
395 | /* TODO(shess) Obvious candidates for a header file. */ | ||
396 | #define CLEAR(b) memset(b, '\0', sizeof(*(b))) | ||
397 | |||
398 | #ifndef NDEBUG | ||
399 | # define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b))) | ||
400 | #else | ||
401 | # define SCRAMBLE(b) | ||
402 | #endif | ||
403 | |||
404 | /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ | ||
405 | #define VARINT_MAX 10 | ||
406 | |||
/* Serialize v at p[0..] as a little-endian base-128 varint: seven
** data bits per byte, low-order group first, high bit set on every
** byte except the last.  Between 1 and VARINT_MAX bytes are written;
** the byte count is returned.
*/
static int putVarint(char *p, sqlite_int64 v){
  unsigned char *pOut = (unsigned char *) p;
  sqlite_uint64 bits = (sqlite_uint64) v;

  while( 1 ){
    unsigned char byte = (unsigned char)(bits & 0x7f);
    bits >>= 7;
    if( bits==0 ){
      *pOut++ = byte;          /* final byte: continuation bit clear */
      break;
    }
    *pOut++ = byte | 0x80;     /* more bytes follow */
  }
  assert( pOut - (unsigned char *)p <= VARINT_MAX );
  return (int) (pOut - (unsigned char *)p);
}
421 | |||
/* Decode a little-endian base-128 varint starting at p[0], storing
** the value in *v.  Returns the number of bytes consumed, or 0 if no
** terminating byte (high bit clear) appears within VARINT_MAX bytes
** (i.e. the data is corrupt).
*/
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *pIn = (const unsigned char *) p;
  sqlite_uint64 result = 0;
  sqlite_uint64 scale = 1;

  while( (*pIn & 0x80)!=0 ){
    result += scale * (*pIn++ & 0x7f);
    scale <<= 7;
    if( pIn - (const unsigned char *)p >= VARINT_MAX ){
      assert( 0 );   /* corrupt varint: no terminator within bounds */
      return 0;
    }
  }
  result += scale * (*pIn++);   /* final byte, continuation bit clear */
  *v = (sqlite_int64) result;
  return (int) (pIn - (const unsigned char *)p);
}
440 | |||
/* Decode a varint from p[] into an int.  The encoded value must fit
** in 32 bits (checked in debug builds only).  Returns the number of
** bytes consumed.
*/
static int getVarint32(const char *p, int *pi){
  sqlite_int64 iFull;
  int nRead = getVarint(p, &iFull);
  *pi = (int) iFull;
  assert( *pi==iFull );   /* truncation would indicate corrupt data */
  return nRead;
}
448 | |||
449 | /*******************************************************************/ | ||
450 | /* DataBuffer is used to collect data into a buffer in piecemeal | ||
451 | ** fashion. It implements the usual distinction between amount of | ||
452 | ** data currently stored (nData) and buffer capacity (nCapacity). | ||
453 | ** | ||
454 | ** dataBufferInit - create a buffer with given initial capacity. | ||
455 | ** dataBufferReset - forget buffer's data, retaining capacity. | ||
456 | ** dataBufferDestroy - free buffer's data. | ||
457 | ** dataBufferExpand - expand capacity without adding data. | ||
458 | ** dataBufferAppend - append data. | ||
459 | ** dataBufferAppend2 - append two pieces of data at once. | ||
460 | ** dataBufferReplace - replace buffer's data. | ||
461 | */ | ||
/* Growable byte buffer; see the API comment above for the operations. */
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer (NULL when nCapacity==0). */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;
467 | |||
/* Set up *pBuffer empty, with an initial capacity of nCapacity bytes
** (no allocation at all when nCapacity is zero).  OOM is not detected
** here; a failed malloc() simply leaves pData NULL.
*/
static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  assert( nCapacity>=0 );
  pBuffer->nData = 0;
  pBuffer->nCapacity = nCapacity;
  if( nCapacity==0 ){
    pBuffer->pData = NULL;
  }else{
    pBuffer->pData = malloc(nCapacity);
  }
}
/* Discard any accumulated data, keeping the allocation for reuse. */
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
/* Release the buffer's allocation.  free(NULL) is a defined no-op in
** C, so the never-allocated (nCapacity==0) case needs no guard.  In
** debug builds the struct is clobbered afterwards so that any
** use-after-destroy is likely to be caught.
*/
static void dataBufferDestroy(DataBuffer *pBuffer){
  free(pBuffer->pData);
  SCRAMBLE(pBuffer);
}
/* Guarantee room for nAddCapacity bytes beyond the current contents.
** Growth is exact rather than geometric (TODO in the upstream code
** notes the underlying malloc may amortize this anyway).
** NOTE(review): a failed realloc() leaves pData NULL and leaks the
** old block — callers here treat OOM as fatal; confirm before reuse.
*/
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  int nNeeded;
  assert( nAddCapacity>0 );
  nNeeded = pBuffer->nData + nAddCapacity;
  if( nNeeded > pBuffer->nCapacity ){
    pBuffer->nCapacity = nNeeded;
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
492 | static void dataBufferAppend(DataBuffer *pBuffer, | ||
493 | const char *pSource, int nSource){ | ||
494 | assert( nSource>0 && pSource!=NULL ); | ||
495 | dataBufferExpand(pBuffer, nSource); | ||
496 | memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource); | ||
497 | pBuffer->nData += nSource; | ||
498 | } | ||
499 | static void dataBufferAppend2(DataBuffer *pBuffer, | ||
500 | const char *pSource1, int nSource1, | ||
501 | const char *pSource2, int nSource2){ | ||
502 | assert( nSource1>0 && pSource1!=NULL ); | ||
503 | assert( nSource2>0 && pSource2!=NULL ); | ||
504 | dataBufferExpand(pBuffer, nSource1+nSource2); | ||
505 | memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1); | ||
506 | memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2); | ||
507 | pBuffer->nData += nSource1+nSource2; | ||
508 | } | ||
509 | static void dataBufferReplace(DataBuffer *pBuffer, | ||
510 | const char *pSource, int nSource){ | ||
511 | dataBufferReset(pBuffer); | ||
512 | dataBufferAppend(pBuffer, pSource, nSource); | ||
513 | } | ||
514 | |||
/* StringBuffer is a null-terminated version of DataBuffer: the
** wrapped buffer always holds at least the '\0' terminator, so
** b.nData is string length plus one.
*/
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;
519 | |||
520 | static void initStringBuffer(StringBuffer *sb){ | ||
521 | dataBufferInit(&sb->b, 100); | ||
522 | dataBufferReplace(&sb->b, "", 1); | ||
523 | } | ||
/* Length of the string, excluding the trailing '\0'. */
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData - 1;
}
/* The string's characters, NUL-terminated; owned by the buffer. */
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
530 | static void stringBufferDestroy(StringBuffer *sb){ | ||
531 | dataBufferDestroy(&sb->b); | ||
532 | } | ||
533 | |||
534 | static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){ | ||
535 | assert( sb->b.nData>0 ); | ||
536 | if( nFrom>0 ){ | ||
537 | sb->b.nData--; | ||
538 | dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1); | ||
539 | } | ||
540 | } | ||
541 | static void append(StringBuffer *sb, const char *zFrom){ | ||
542 | nappend(sb, zFrom, strlen(zFrom)); | ||
543 | } | ||
544 | |||
545 | /* Append a list of strings separated by commas. */ | ||
546 | static void appendList(StringBuffer *sb, int nString, char **azString){ | ||
547 | int i; | ||
548 | for(i=0; i<nString; ++i){ | ||
549 | if( i>0 ) append(sb, ", "); | ||
550 | append(sb, azString[i]); | ||
551 | } | ||
552 | } | ||
553 | |||
554 | static int endsInWhiteSpace(StringBuffer *p){ | ||
555 | return stringBufferLength(p)>0 && | ||
556 | safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]); | ||
557 | } | ||
558 | |||
559 | /* If the StringBuffer ends in something other than white space, add a | ||
560 | ** single space character to the end. | ||
561 | */ | ||
562 | static void appendWhiteSpace(StringBuffer *p){ | ||
563 | if( stringBufferLength(p)==0 ) return; | ||
564 | if( !endsInWhiteSpace(p) ) append(p, " "); | ||
565 | } | ||
566 | |||
567 | /* Remove white space from the end of the StringBuffer */ | ||
568 | static void trimWhiteSpace(StringBuffer *p){ | ||
569 | while( endsInWhiteSpace(p) ){ | ||
570 | p->b.pData[--p->b.nData-1] = '\0'; | ||
571 | } | ||
572 | } | ||
573 | |||
574 | /*******************************************************************/ | ||
575 | /* DLReader is used to read document elements from a doclist. The | ||
576 | ** current docid is cached, so dlrDocid() is fast. DLReader does not | ||
577 | ** own the doclist buffer. | ||
578 | ** | ||
579 | ** dlrAtEnd - true if there's no more data to read. | ||
580 | ** dlrDocid - docid of current document. | ||
581 | ** dlrDocData - doclist data for current document (including docid). | ||
582 | ** dlrDocDataBytes - length of same. | ||
583 | ** dlrAllDataBytes - length of all remaining data. | ||
584 | ** dlrPosData - position data for current document. | ||
585 | ** dlrPosDataLen - length of pos data for current document (incl POS_END). | ||
586 | ** dlrStep - step to current document. | ||
587 | ** dlrInit - initialize for doclist of given type against given data. | ||
588 | ** dlrDestroy - clean up. | ||
589 | ** | ||
590 | ** Expected usage is something like: | ||
591 | ** | ||
592 | ** DLReader reader; | ||
593 | ** dlrInit(&reader, pData, nData); | ||
594 | ** while( !dlrAtEnd(&reader) ){ | ||
595 | ** // calls to dlrDocid() and kin. | ||
596 | ** dlrStep(&reader); | ||
597 | ** } | ||
598 | ** dlrDestroy(&reader); | ||
599 | */ | ||
/* Cursor over a doclist; see the API comment above.  The reader
** borrows, and never owns, the doclist bytes.
*/
typedef struct DLReader {
  DocListType iType;     /* Layout of the doclist: docids / +positions / +offsets. */
  const char *pData;     /* Remaining data, starting at the current element. */
  int nData;             /* Bytes remaining at pData. */

  sqlite_int64 iDocid;   /* Docid of current element (docids are delta-coded). */
  int nElement;          /* Byte length of current element, docid included. */
} DLReader;
608 | |||
/* True once the reader has consumed the entire doclist. */
static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
613 | static sqlite_int64 dlrDocid(DLReader *pReader){ | ||
614 | assert( !dlrAtEnd(pReader) ); | ||
615 | return pReader->iDocid; | ||
616 | } | ||
617 | static const char *dlrDocData(DLReader *pReader){ | ||
618 | assert( !dlrAtEnd(pReader) ); | ||
619 | return pReader->pData; | ||
620 | } | ||
621 | static int dlrDocDataBytes(DLReader *pReader){ | ||
622 | assert( !dlrAtEnd(pReader) ); | ||
623 | return pReader->nElement; | ||
624 | } | ||
625 | static int dlrAllDataBytes(DLReader *pReader){ | ||
626 | assert( !dlrAtEnd(pReader) ); | ||
627 | return pReader->nData; | ||
628 | } | ||
629 | /* TODO(shess) Consider adding a field to track iDocid varint length | ||
630 | ** to make these two functions faster. This might matter (a tiny bit) | ||
631 | ** for queries. | ||
632 | */ | ||
633 | static const char *dlrPosData(DLReader *pReader){ | ||
634 | sqlite_int64 iDummy; | ||
635 | int n = getVarint(pReader->pData, &iDummy); | ||
636 | assert( !dlrAtEnd(pReader) ); | ||
637 | return pReader->pData+n; | ||
638 | } | ||
639 | static int dlrPosDataLen(DLReader *pReader){ | ||
640 | sqlite_int64 iDummy; | ||
641 | int n = getVarint(pReader->pData, &iDummy); | ||
642 | assert( !dlrAtEnd(pReader) ); | ||
643 | return pReader->nElement-n; | ||
644 | } | ||
/* Advance past the current element and decode the next one, if any.
** Decoding updates iDocid (docids are stored as deltas from the
** previous docid) and computes nElement by walking the element's
** position list to its POS_END sentinel.
*/
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      /* Walk the position list to find where this element ends. */
      assert( n<pReader->nData );
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          /* Next varint is the column number; its positions follow. */
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          /* Skip the startOffset and endOffset deltas. */
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
678 | static void dlrInit(DLReader *pReader, DocListType iType, | ||
679 | const char *pData, int nData){ | ||
680 | assert( pData!=NULL && nData!=0 ); | ||
681 | pReader->iType = iType; | ||
682 | pReader->pData = pData; | ||
683 | pReader->nData = nData; | ||
684 | pReader->nElement = 0; | ||
685 | pReader->iDocid = 0; | ||
686 | |||
687 | /* Load the first element's data. There must be a first element. */ | ||
688 | dlrStep(pReader); | ||
689 | } | ||
/* Invalidate a reader.  The underlying doclist is not freed (the
** reader never owned it); in debug builds the struct is clobbered to
** catch use-after-destroy.
*/
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}
693 | |||
#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it's convenient in other assertions for
** DLWriter.  Debug builds only; release builds compile
** ASSERT_VALID_DOCLIST to a no-op.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );   /* guard against pointer wraparound */
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;   /* docids are delta-encoded */
    if( iType>DL_DOCIDS ){
      int iDummy;
      /* Walk the position list to its POS_END sentinel. */
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          /* column-number varint follows the POS_COLUMN marker */
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          /* startOffset and endOffset deltas */
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif
733 | |||
734 | /*******************************************************************/ | ||
735 | /* DLWriter is used to write doclist data to a DataBuffer. DLWriter | ||
736 | ** always appends to the buffer and does not own it. | ||
737 | ** | ||
738 | ** dlwInit - initialize to write a given type doclist to a buffer. | ||
739 | ** dlwDestroy - clear the writer's memory. Does not free buffer. | ||
740 | ** dlwAppend - append raw doclist data to buffer. | ||
741 | ** dlwCopy - copy next doclist from reader to writer. | ||
742 | ** dlwAdd - construct doclist element and append to buffer. | ||
743 | ** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter). | ||
744 | */ | ||
typedef struct DLWriter {
  DocListType iType;        /* doclist type being produced */
  DataBuffer *b;            /* output buffer; borrowed, never freed here */
  sqlite_int64 iPrevDocid;  /* last docid written, for delta-encoding */
#ifndef NDEBUG
  int has_iPrevDocid;       /* true once any docid has been written */
#endif
} DLWriter;
753 | |||
754 | static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){ | ||
755 | pWriter->b = b; | ||
756 | pWriter->iType = iType; | ||
757 | pWriter->iPrevDocid = 0; | ||
758 | #ifndef NDEBUG | ||
759 | pWriter->has_iPrevDocid = 0; | ||
760 | #endif | ||
761 | } | ||
/* Tear down a DLWriter.  The underlying DataBuffer is not freed;
** SCRAMBLE only poisons the writer struct itself.
*/
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
765 | /* iFirstDocid is the first docid in the doclist in pData. It is | ||
766 | ** needed because pData may point within a larger doclist, in which | ||
767 | ** case the first item would be delta-encoded. | ||
768 | ** | ||
769 | ** iLastDocid is the final docid in the doclist in pData. It is | ||
770 | ** needed to create the new iPrevDocid for future delta-encoding. The | ||
771 | ** code could decode the passed doclist to recreate iLastDocid, but | ||
772 | ** the only current user (docListMerge) already has decoded this | ||
773 | ** information. | ||
774 | */ | ||
775 | /* TODO(shess) This has become just a helper for docListMerge. | ||
776 | ** Consider a refactor to make this cleaner. | ||
777 | */ | ||
/* Append the raw doclist pData/nData, re-encoding only its first
** docid as a delta from pWriter->iPrevDocid (see the comment block
** above for why iFirstDocid/iLastDocid are passed in).
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  /* A bare docid can consume all of nData only for DL_DOCIDS lists. */
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
/* Copy the reader's current element into the writer.  First and last
** docid coincide because exactly one element is copied.
*/
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
/* Append a single docid to a DL_DOCIDS doclist, delta-encoded against
** the previous docid.  Docids must be added in ascending order.
*/
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  /* Position data would need PLWriter; this path is docids only. */
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}
829 | |||
830 | /*******************************************************************/ | ||
831 | /* PLReader is used to read data from a document's position list. As | ||
832 | ** the caller steps through the list, data is cached so that varints | ||
833 | ** only need to be decoded once. | ||
834 | ** | ||
835 | ** plrInit, plrDestroy - create/destroy a reader. | ||
836 | ** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors | ||
837 | ** plrAtEnd - at end of stream, only call plrDestroy once true. | ||
838 | ** plrStep - step to the next element. | ||
839 | */ | ||
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;  /* controls whether offsets are decoded */
  int iColumn;        /* the last column read */
  int iPosition;      /* the last position read */
  int iStartOffset;   /* the last start offset read */
  int iEndOffset;     /* the last end offset read */
} PLReader;
854 | |||
/* True once the position list is exhausted (pData NULLed by plrStep). */
static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
/* Column of the current position; only valid before EOF. */
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
/* Token position of the current element; only valid before EOF. */
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
/* Start offset of the current element; only valid before EOF. */
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
/* End offset of the current element; only valid before EOF. */
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
/* Decode the next position-list element into the reader's cached
** fields, or mark EOF by setting pData to NULL.  Positions and start
** offsets are delta-encoded; a POS_COLUMN marker resets both.
*/
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  /* Previous step consumed the final element; signal EOF. */
  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    /* Column change: read new column, deltas restart from zero. */
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  /* Position varints are biased by POS_BASE to stay clear of markers. */
  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    /* End offset is encoded relative to the start offset. */
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}
911 | |||
/* Initialize pReader over the position data of pDLReader's current
** element and decode the first position via plrStep().
*/
static void plrInit(PLReader *pReader, DLReader *pDLReader){
  pReader->pData = dlrPosData(pDLReader);
  pReader->nData = dlrPosDataLen(pDLReader);
  pReader->iType = pDLReader->iType;
  pReader->iColumn = 0;
  pReader->iPosition = 0;
  pReader->iStartOffset = 0;
  pReader->iEndOffset = 0;
  plrStep(pReader);
}
/* Tear down a PLReader; poisons the struct, frees nothing. */
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}
925 | |||
926 | /*******************************************************************/ | ||
927 | /* PLWriter is used in constructing a document's position list. As a | ||
928 | ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op. | ||
929 | ** PLWriter writes to the associated DLWriter's buffer. | ||
930 | ** | ||
931 | ** plwInit - init for writing a document's poslist. | ||
932 | ** plwDestroy - clear a writer. | ||
933 | ** plwAdd - append position and offset information. | ||
934 | ** plwCopy - copy next position's data from reader to writer. | ||
935 | ** plwTerminate - add any necessary doclist terminator. | ||
936 | ** | ||
937 | ** Calling plwAdd() after plwTerminate() may result in a corrupt | ||
938 | ** doclist. | ||
939 | */ | ||
940 | /* TODO(shess) Until we've written the second item, we can cache the | ||
941 | ** first item's information. Then we'd have three states: | ||
942 | ** | ||
943 | ** - initialized with docid, no positions. | ||
944 | ** - docid and one position. | ||
945 | ** - docid and multiple positions. | ||
946 | ** | ||
947 | ** Only the last state needs to actually write to dlw->b, which would | ||
948 | ** be an improvement in the DLCollector case. | ||
949 | */ | ||
typedef struct PLWriter {
  DLWriter *dlw;  /* parent writer; output goes to dlw->b */

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written (-1 after terminate, debug) */
  int iOffset;    /* the last start offset written */
} PLWriter;
957 | |||
958 | /* TODO(shess) In the case where the parent is reading these values | ||
959 | ** from a PLReader, we could optimize to a copy if that PLReader has | ||
960 | ** the same type as pWriter. | ||
961 | */ | ||
/* Append one position (and, for DL_POSITIONS_OFFSETS, its offsets) to
** the parent DLWriter's buffer.  Emits a POS_COLUMN marker when the
** column changes.  No-op for DL_DOCIDS doclists.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    /* Column switch resets position/offset deltas to zero. */
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  /* Bias by POS_BASE so the delta never collides with markers. */
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    /* End offset stored relative to start offset. */
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
/* Copy the reader's current position tuple into the writer. */
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
/* Start a new document in dlw's doclist: writes the delta-encoded
** docid immediately, then prepares the writer for plwAdd() calls.
*/
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  /* Position/offset deltas for the new document start from zero. */
  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
1017 | /* TODO(shess) Should plwDestroy() also terminate the doclist? But | ||
1018 | ** then plwDestroy() would no longer be just a destructor, it would | ||
1019 | ** also be doing work, which isn't consistent with the overall idiom. | ||
1020 | ** Another option would be for plwAdd() to always append any necessary | ||
1021 | ** terminator, so that the output is always correct. But that would | ||
1022 | ** add incremental work to the common case with the only benefit being | ||
1023 | ** API elegance. Punt for now. | ||
1024 | */ | ||
/* Close the current document's position list with POS_END (not needed
** for DL_DOCIDS, which stores no positions).
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
/* Tear down a PLWriter; poisons the struct, frees nothing. */
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}
1039 | |||
1040 | /*******************************************************************/ | ||
1041 | /* DLCollector wraps PLWriter and DLWriter to provide a | ||
1042 | ** dynamically-allocated doclist area to use during tokenization. | ||
1043 | ** | ||
1044 | ** dlcNew - malloc up and initialize a collector. | ||
1045 | ** dlcDelete - destroy a collector and all contained items. | ||
1046 | ** dlcAddPos - append position and offset information. | ||
1047 | ** dlcAddDoclist - add the collected doclist to the given buffer. | ||
1048 | ** dlcNext - terminate the current document and open another. | ||
1049 | */ | ||
typedef struct DLCollector {
  DataBuffer b;   /* owned buffer holding the doclist under construction */
  DLWriter dlw;   /* doclist writer targeting b */
  PLWriter plw;   /* position writer for the current document */
} DLCollector;
1055 | |||
1056 | /* TODO(shess) This could also be done by calling plwTerminate() and | ||
1057 | ** dataBufferAppend(). I tried that, expecting nominal performance | ||
1058 | ** differences, but it seemed to pretty reliably be worth 1% to code | ||
1059 | ** it this way. I suspect it's the incremental malloc overhead (some | ||
1060 | ** percentage of the plwTerminate() calls will cause a realloc), so | ||
1061 | ** this might be worth revisiting if the DataBuffer implementation | ||
1062 | ** changes. | ||
1063 | */ | ||
/* Append the collected doclist to b, adding the trailing POS_END
** terminator in the same call (see the performance note above).
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  if( pCollector->dlw.iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
  }else{
    /* DL_DOCIDS has no position list, hence no terminator. */
    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
  }
}
/* Terminate the current document's position list and begin collecting
** a new document with the given (ascending) docid.
*/
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
/* Forward one position/offset tuple to the current document's PLWriter. */
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}
1082 | |||
1083 | static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){ | ||
1084 | DLCollector *pCollector = malloc(sizeof(DLCollector)); | ||
1085 | dataBufferInit(&pCollector->b, 0); | ||
1086 | dlwInit(&pCollector->dlw, iType, &pCollector->b); | ||
1087 | plwInit(&pCollector->plw, &pCollector->dlw, iDocid); | ||
1088 | return pCollector; | ||
1089 | } | ||
/* Destroy a collector created by dlcNew(), releasing its buffer and
** the collector allocation itself.
*/
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  free(pCollector);
}
1097 | |||
1098 | |||
1099 | /* Copy the doclist data of iType in pData/nData into *out, trimming | ||
1100 | ** unnecessary data as we go. Only columns matching iColumn are | ||
1101 | ** copied, all columns copied if iColumn is -1. Elements with no | ||
1102 | ** matching columns are dropped. The output is an iOutType doclist. | ||
1103 | */ | ||
1104 | /* NOTE(shess) This code is only valid after all doclists are merged. | ||
1105 | ** If this is run before merges, then doclist items which represent | ||
1106 | ** deletion will be trimmed, and will thus not effect a deletion | ||
1107 | ** during the merge. | ||
1108 | */ | ||
/* Re-encode the doclist pData/nData (type iType) into out as an
** iOutType doclist, keeping only positions in iColumn (-1 keeps all).
** Documents left with no positions are dropped entirely.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  /* Trimming can only narrow the encoding, never widen it. */
  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        /* Start the output document lazily, on the first kept position,
        ** so documents with no matching column are never emitted.
        */
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}
1148 | |||
1149 | /* Used by docListMerge() to keep doclists in the ascending order by | ||
1150 | ** docid, then ascending order by age (so the newest comes first). | ||
1151 | */ | ||
typedef struct OrderedDLReader {
  DLReader *pReader;  /* the wrapped reader; not owned */

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;            /* original array index; higher idx == newer reader */
} OrderedDLReader;
1161 | |||
1162 | /* Order eof to end, then by docid asc, idx desc. */ | ||
/* Order eof to end, then by docid asc, idx desc (so on equal docids
** the newest reader sorts first and wins in docListMerge).
*/
static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
  if( dlrAtEnd(r1->pReader) ){
    if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
    return 1;                              /* Only r1 atEnd(). */
  }
  if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */

  if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
  if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;

  /* Descending on idx. */
  return r2->idx-r1->idx;
}
1176 | |||
1177 | /* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that | ||
1178 | ** p[1..n-1] is already sorted. | ||
1179 | */ | ||
1180 | /* TODO(shess) Is this frequent enough to warrant a binary search? | ||
1181 | ** Before implementing that, instrument the code to check. In most | ||
1182 | ** current usage, I expect that p[0] will be less than p[1] a very | ||
1183 | ** high proportion of the time. | ||
1184 | */ | ||
1185 | static void orderedDLReaderReorder(OrderedDLReader *p, int n){ | ||
1186 | while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){ | ||
1187 | OrderedDLReader tmp = p[0]; | ||
1188 | p[0] = p[1]; | ||
1189 | p[1] = tmp; | ||
1190 | n--; | ||
1191 | p++; | ||
1192 | } | ||
1193 | } | ||
1194 | |||
1195 | /* Given an array of doclist readers, merge their doclist elements | ||
1196 | ** into out in sorted order (by docid), dropping elements from older | ||
1197 | ** readers when there is a duplicate docid. pReaders is assumed to be | ||
1198 | ** ordered by age, oldest first. | ||
1199 | */ | ||
1200 | /* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably | ||
1201 | ** be fixed. | ||
1202 | */ | ||
/* Merge nReaders doclists (ordered oldest-first) into out, docid
** ascending; when several readers share a docid, only the newest
** reader's element is kept.  Consecutive elements taken from the same
** reader are coalesced into one dlwAppend() for fewer, larger copies.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;        /* start of pending contiguous run */
  int nStart = 0;                /* length of pending contiguous run */
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    /* Single input: its data is already delta-encoded from docid 0,
    ** so it can be copied verbatim.
    */
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has a
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        /* Flush the previous run before starting a new one. */
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
1274 | |||
1275 | /* Helper function for posListUnion(). Compares the current position | ||
1276 | ** between left and right, returning as standard C idiom of <0 if | ||
1277 | ** left<right, >0 if left>right, and 0 if left==right. "End" always | ||
1278 | ** compares greater. | ||
1279 | */ | ||
/* Compare the current elements of two position readers in (column,
** position, startOffset, endOffset) order; <0/0/>0 as in strcmp().
** "End" compares greater than any element.  DL_DOCIDS lists carry no
** positions, so they always compare equal.
*/
static int posListCmp(PLReader *pLeft, PLReader *pRight){
  assert( pLeft->iType==pRight->iType );
  if( pLeft->iType==DL_DOCIDS ) return 0;

  if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
  if( plrAtEnd(pRight) ) return -1;

  if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
  if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;

  if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
  if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
  /* DL_POSITIONS has no offsets to compare. */
  if( pLeft->iType==DL_POSITIONS ) return 0;

  if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
  if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;

  if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
  if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;

  return 0;
}
1302 | |||
1303 | /* Write the union of position lists in pLeft and pRight to pOut. | ||
1304 | ** "Union" in this case meaning "All unique position tuples". Should | ||
1305 | ** work with any doclist type, though both inputs and the output | ||
1306 | ** should be the same type. | ||
1307 | */ | ||
/* Write the union of position lists in pLeft and pRight to pOut.
** Both readers must be positioned on the same docid.  A classic
** sorted two-way merge: equal tuples are emitted once.
*/
static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pLeft->iType==pRight->iType );
  assert( pLeft->iType==pOut->iType );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);
  plwInit(&writer, pOut, dlrDocid(pLeft));

  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
    int c = posListCmp(&left, &right);
    if( c<0 ){
      plwCopy(&writer, &left);
      plrStep(&left);
    }else if( c>0 ){
      plwCopy(&writer, &right);
      plrStep(&right);
    }else{
      /* Equal tuples: emit once, advance both sides. */
      plwCopy(&writer, &left);
      plrStep(&left);
      plrStep(&right);
    }
  }

  plwTerminate(&writer);
  plwDestroy(&writer);
  plrDestroy(&left);
  plrDestroy(&right);
}
1340 | |||
1341 | /* Write the union of doclists in pLeft and pRight to pOut. For | ||
1342 | ** docids in common between the inputs, the union of the position | ||
1343 | ** lists is written. Inputs and outputs are always type DL_DEFAULT. | ||
1344 | */ | ||
/* Write the union of doclists in pLeft and pRight to pOut.  For
** docids in common between the inputs, the union of the position
** lists is written.  Inputs and outputs are always type DL_DEFAULT.
*/
static void docListUnion(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* Union with an empty side is just the other side, verbatim. */
  if( nLeft==0 ){
    dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
  dlrInit(&right, DL_DEFAULT, pRight, nRight);
  dlwInit(&writer, DL_DEFAULT, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrDocid(&left)>dlrDocid(&right) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else{
      /* Same docid on both sides: merge their position lists. */
      posListUnion(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1390 | |||
1391 | /* pLeft and pRight are DLReaders positioned to the same docid. | ||
1392 | ** | ||
1393 | ** If there are no instances in pLeft or pRight where the position | ||
1394 | ** of pLeft is one less than the position of pRight, then this | ||
1395 | ** routine adds nothing to pOut. | ||
1396 | ** | ||
1397 | ** If there are one or more instances where positions from pLeft | ||
1398 | ** are exactly one less than positions from pRight, then add a new | ||
1399 | ** document record to pOut. If pOut wants to hold positions, then | ||
1400 | ** include the positions from pRight that are one more than a | ||
1401 | ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1. | ||
1402 | */ | ||
/* Phrase-match the position lists of two readers positioned on the
** same docid (see the comment block above).  Emits, for each place
** where right's position is exactly left's position + 1, the right
** position; the output document is only opened if at least one
** match is found.
*/
static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;
  int match = 0;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  /* Offsets are not propagated through phrase merges. */
  assert( pOut->iType!=DL_POSITIONS_OFFSETS );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);

  while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
    if( plrColumn(&left)<plrColumn(&right) ){
      plrStep(&left);
    }else if( plrColumn(&left)>plrColumn(&right) ){
      plrStep(&right);
    }else if( plrPosition(&left)+1<plrPosition(&right) ){
      plrStep(&left);
    }else if( plrPosition(&left)+1>plrPosition(&right) ){
      plrStep(&right);
    }else{
      /* Adjacent positions in the same column: phrase hit. */
      if( !match ){
        plwInit(&writer, pOut, dlrDocid(pLeft));
        match = 1;
      }
      plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
      plrStep(&left);
      plrStep(&right);
    }
  }

  if( match ){
    plwTerminate(&writer);
    plwDestroy(&writer);
  }

  plrDestroy(&left);
  plrDestroy(&right);
}
1443 | |||
1444 | /* We have two doclists with positions: pLeft and pRight. | ||
1445 | ** Write the phrase intersection of these two doclists into pOut. | ||
1446 | ** | ||
1447 | ** A phrase intersection means that two documents only match | ||
1448 | ** if pLeft.iPos+1==pRight.iPos. | ||
1449 | ** | ||
1450 | ** iType controls the type of data written to pOut. If iType is | ||
1451 | ** DL_POSITIONS, the positions are those from pRight. | ||
1452 | */ | ||
/* Intersect two position-bearing doclists as a phrase: a document
** matches only where left's position + 1 equals right's position
** (see the comment block above).  Output type iType keeps right's
** positions when iType is DL_POSITIONS.
*/
static void docListPhraseMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DocListType iType,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* Intersection with an empty side is empty. */
  if( nLeft==0 || nRight==0 ) return;

  assert( iType!=DL_POSITIONS_OFFSETS );

  dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
  dlrInit(&right, DL_POSITIONS, pRight, nRight);
  dlwInit(&writer, iType, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      /* Common docid: let the position merge decide if it matches. */
      posListPhraseMerge(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1486 | |||
1487 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1488 | ** Write the intersection of these two doclists into pOut as a | ||
1489 | ** DL_DOCIDS doclist. | ||
1490 | */ | ||
/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the intersection of these two doclists into pOut as a
** DL_DOCIDS doclist (standard sorted-merge intersection).
*/
static void docListAndMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* Intersection with an empty side is empty. */
  if( nLeft==0 || nRight==0 ) return;

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      /* Docid present in both inputs: keep it. */
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1521 | |||
1522 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1523 | ** Write the union of these two doclists into pOut as a | ||
1524 | ** DL_DOCIDS doclist. | ||
1525 | */ | ||
1526 | static void docListOrMerge( | ||
1527 | const char *pLeft, int nLeft, | ||
1528 | const char *pRight, int nRight, | ||
1529 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1530 | ){ | ||
1531 | DLReader left, right; | ||
1532 | DLWriter writer; | ||
1533 | |||
1534 | if( nLeft==0 ){ | ||
1535 | dataBufferAppend(pOut, pRight, nRight); | ||
1536 | return; | ||
1537 | } | ||
1538 | if( nRight==0 ){ | ||
1539 | dataBufferAppend(pOut, pLeft, nLeft); | ||
1540 | return; | ||
1541 | } | ||
1542 | |||
1543 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); | ||
1544 | dlrInit(&right, DL_DOCIDS, pRight, nRight); | ||
1545 | dlwInit(&writer, DL_DOCIDS, pOut); | ||
1546 | |||
1547 | while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){ | ||
1548 | if( dlrAtEnd(&right) ){ | ||
1549 | dlwAdd(&writer, dlrDocid(&left)); | ||
1550 | dlrStep(&left); | ||
1551 | }else if( dlrAtEnd(&left) ){ | ||
1552 | dlwAdd(&writer, dlrDocid(&right)); | ||
1553 | dlrStep(&right); | ||
1554 | }else if( dlrDocid(&left)<dlrDocid(&right) ){ | ||
1555 | dlwAdd(&writer, dlrDocid(&left)); | ||
1556 | dlrStep(&left); | ||
1557 | }else if( dlrDocid(&right)<dlrDocid(&left) ){ | ||
1558 | dlwAdd(&writer, dlrDocid(&right)); | ||
1559 | dlrStep(&right); | ||
1560 | }else{ | ||
1561 | dlwAdd(&writer, dlrDocid(&left)); | ||
1562 | dlrStep(&left); | ||
1563 | dlrStep(&right); | ||
1564 | } | ||
1565 | } | ||
1566 | |||
1567 | dlrDestroy(&left); | ||
1568 | dlrDestroy(&right); | ||
1569 | dlwDestroy(&writer); | ||
1570 | } | ||
1571 | |||
1572 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1573 | ** Write into pOut as DL_DOCIDS doclist containing all documents that | ||
1574 | ** occur in pLeft but not in pRight. | ||
1575 | */ | ||
1576 | static void docListExceptMerge( | ||
1577 | const char *pLeft, int nLeft, | ||
1578 | const char *pRight, int nRight, | ||
1579 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1580 | ){ | ||
1581 | DLReader left, right; | ||
1582 | DLWriter writer; | ||
1583 | |||
1584 | if( nLeft==0 ) return; | ||
1585 | if( nRight==0 ){ | ||
1586 | dataBufferAppend(pOut, pLeft, nLeft); | ||
1587 | return; | ||
1588 | } | ||
1589 | |||
1590 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); | ||
1591 | dlrInit(&right, DL_DOCIDS, pRight, nRight); | ||
1592 | dlwInit(&writer, DL_DOCIDS, pOut); | ||
1593 | |||
1594 | while( !dlrAtEnd(&left) ){ | ||
1595 | while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){ | ||
1596 | dlrStep(&right); | ||
1597 | } | ||
1598 | if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){ | ||
1599 | dlwAdd(&writer, dlrDocid(&left)); | ||
1600 | } | ||
1601 | dlrStep(&left); | ||
1602 | } | ||
1603 | |||
1604 | dlrDestroy(&left); | ||
1605 | dlrDestroy(&right); | ||
1606 | dlwDestroy(&writer); | ||
1607 | } | ||
1608 | |||
/* Copy the first n bytes of s into a freshly malloced, NUL-terminated
** string.  Returns NULL if the allocation fails; otherwise the caller
** must free() the returned string.
*/
static char *string_dup_n(const char *s, int n){
  char *str = malloc(n + 1);
  if( str==NULL ) return NULL;  /* report OOM instead of memcpy through NULL (UB) */
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
1615 | |||
/* Duplicate a NUL-terminated string; the caller must free() the result.
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
static char *string_dup(const char *s){
  size_t n = strlen(s);
  return string_dup_n(s, n);
}
1622 | |||
/* Format a string, replacing each occurrence of the % character with
 * zDb.zName.  This may be more convenient than sqlite_mprintf()
 * when one string is used repeatedly in a format string.
 * Returns NULL if the allocation fails; otherwise the caller must
 * free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* "db" + "." + "name" */
  char *result;
  char *r;

  /* First pass: compute the length needed. */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = malloc(len);
  if( result==NULL ) return NULL;  /* avoid writing through NULL on OOM */

  /* Second pass: copy, expanding each '%' to "zDb.zName". */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );   /* both passes must agree on the size */
  return result;
}
1659 | |||
1660 | static int sql_exec(sqlite3 *db, const char *zDb, const char *zName, | ||
1661 | const char *zFormat){ | ||
1662 | char *zCommand = string_format(zFormat, zDb, zName); | ||
1663 | int rc; | ||
1664 | TRACE(("FTS2 sql: %s\n", zCommand)); | ||
1665 | rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); | ||
1666 | free(zCommand); | ||
1667 | return rc; | ||
1668 | } | ||
1669 | |||
1670 | static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName, | ||
1671 | sqlite3_stmt **ppStmt, const char *zFormat){ | ||
1672 | char *zCommand = string_format(zFormat, zDb, zName); | ||
1673 | int rc; | ||
1674 | TRACE(("FTS2 prepare: %s\n", zCommand)); | ||
1675 | rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL); | ||
1676 | free(zCommand); | ||
1677 | return rc; | ||
1678 | } | ||
1679 | |||
1680 | /* end utility functions */ | ||
1681 | |||
/* Forward reference: the full structure is defined below, after the
** query-related types that refer to it. */
typedef struct fulltext_vtab fulltext_vtab;
1684 | |||
/* A single term in a query is represented by an instances of
** the following structure.  Phrase queries are represented as runs of
** consecutive QueryTerms: the first carries nPhrase>0 and each member
** records its position within the phrase in iPhrase.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;
1698 | |||
1699 | |||
/* A query string is parsed into a Query structure.
 *
 * We could, in theory, allow query strings to be complicated
 * nested expressions with precedence determined by parentheses.
 * But none of the major search engines do this.  (Perhaps the
 * feeling is that a parenthesized expression is too complex of
 * an idea for the average user to grasp.)  Taking our lead from
 * the major search engines, we will allow queries to be a list
 * of terms (with an implied AND operator) or phrases in double-quotes,
 * with a single optional "-" before each non-phrase term to designate
 * negation and an optional OR connector.
 *
 * OR binds more tightly than the implied AND, which is what the
 * major search engines seem to do.  So, for example:
 *
 *  [one two OR three] ==> one AND (two OR three)
 *  [one OR two three] ==> (one OR two) AND three
 *
 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
 * space.  This is how the search engines do it.
 *
 * A NOT term cannot be the right-hand operand of an OR.  If this
 * occurs in the query string, the NOT is ignored:
 *
 *  [one OR -two] ==> one OR two
 *
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;
1736 | |||
1737 | |||
/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.  aMatch[] is built
** up first; zOffset and zSnippet are text renderings cached for the
** current row.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;       /* Status flag for use while constructing snippets */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    short int nByte;     /* Number of bytes in the term */
    int iStart;          /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;
1757 | |||
1758 | |||
/* The three query strategies chosen by xBestIndex and recorded in
** sqlite3_index_info.idxNum (see fulltext_cursor.iCursorType). */
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
1764 | |||
/* Indices into fulltext_vtab.pFulltextStatements[] for the cached
** prepared statements.  Must stay in sync with fulltext_zStatement[]
** below.
*/
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,

  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,

  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_ALL_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;
1784 | |||
/* These must exactly match the enum above. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g.  if a query joins a virtual table to itself?
** If so perhaps we should move some of these to the cursor object.
*/
/* Entries that are NULL are generated at runtime because their SQL
** depends on the table's column list (see sql_get_statement()). */
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where rowid = ?",

  /* BLOCK_INSERT */ "insert into %_segments values (?)",
  /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
  /* SEGDIR_SELECT_ALL */
  "select root, leaves_end_block from %_segdir order by level desc, idx",
};
1812 | |||
/*
** A connection to a fulltext index is an instance of the following
** structure.  The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
** arguments.
*/
struct fulltext_vtab {
  sqlite3_vtab base;               /* Base class used by SQLite core */
  sqlite3 *db;                     /* The database connection */
  const char *zDb;                 /* logical database name */
  const char *zName;               /* virtual table name */
  int nColumn;                     /* number of columns in virtual table */
  char **azColumn;                 /* column names.  malloced */
  char **azContentColumn;          /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open.  Indexed by the fulltext_statement enum; entries are
  ** prepared lazily by sql_get_statement().
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];

  /* Precompiled statements used for segment merges.  We run a
  ** separate select across the leaf level of each tree being merged.
  */
  sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
  /* The statement used to prepare pLeafSelectStmts. */
#define LEAF_SELECT \
  "select block from %_segments where rowid between ? and ? order by rowid"

  /* These buffer pending index updates during transactions.
  ** nPendingData estimates the memory size of the pending data.  It
  ** doesn't include the hash-bucket overhead, nor any malloc
  ** overhead.  When nPendingData exceeds kPendingThreshold, the
  ** buffer is flushed even before the transaction closes.
  ** pendingTerms stores the data, and is only valid when nPendingData
  ** is >=0 (nPendingData<0 means pendingTerms has not been
  ** initialized).  iPrevDocid is the last docid written, used to make
  ** certain we're inserting in sorted order.
  */
  int nPendingData;
#define kPendingThreshold (1*1024*1024)
  sqlite_int64 iPrevDocid;
  fts2Hash pendingTerms;
};
1858 | |||
/*
** When the core wants to do a query, it create a cursor using a
** call to xOpen.  This structure is an instance of a cursor.  It
** is destroyed by xClose.
*/
typedef struct fulltext_cursor {
  sqlite3_vtab_cursor base;        /* Base class used by SQLite core */
  QueryType iCursorType;           /* Copy of sqlite3_index_info.idxNum */
  sqlite3_stmt *pStmt;             /* Prepared statement in use by the cursor */
  int eof;                         /* True if at End Of Results */
  Query q;                         /* Parsed query string */
  Snippet snippet;                 /* Cached snippet for the current row */
  int iColumn;                     /* Column being searched */
  DataBuffer result;               /* Doclist results from fulltextQuery */
  DLReader reader;                 /* Result reader if result not empty */
} fulltext_cursor;
1875 | |||
1876 | static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ | ||
1877 | return (fulltext_vtab *) c->base.pVtab; | ||
1878 | } | ||
1879 | |||
static const sqlite3_module fts2Module;   /* forward declaration; the method table is defined later in this file */
1881 | |||
1882 | /* Return a dynamically generated statement of the form | ||
1883 | * insert into %_content (rowid, ...) values (?, ...) | ||
1884 | */ | ||
1885 | static const char *contentInsertStatement(fulltext_vtab *v){ | ||
1886 | StringBuffer sb; | ||
1887 | int i; | ||
1888 | |||
1889 | initStringBuffer(&sb); | ||
1890 | append(&sb, "insert into %_content (rowid, "); | ||
1891 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
1892 | append(&sb, ") values (?"); | ||
1893 | for(i=0; i<v->nColumn; ++i) | ||
1894 | append(&sb, ", ?"); | ||
1895 | append(&sb, ")"); | ||
1896 | return stringBufferData(&sb); | ||
1897 | } | ||
1898 | |||
1899 | /* Return a dynamically generated statement of the form | ||
1900 | * update %_content set [col_0] = ?, [col_1] = ?, ... | ||
1901 | * where rowid = ? | ||
1902 | */ | ||
1903 | static const char *contentUpdateStatement(fulltext_vtab *v){ | ||
1904 | StringBuffer sb; | ||
1905 | int i; | ||
1906 | |||
1907 | initStringBuffer(&sb); | ||
1908 | append(&sb, "update %_content set "); | ||
1909 | for(i=0; i<v->nColumn; ++i) { | ||
1910 | if( i>0 ){ | ||
1911 | append(&sb, ", "); | ||
1912 | } | ||
1913 | append(&sb, v->azContentColumn[i]); | ||
1914 | append(&sb, " = ?"); | ||
1915 | } | ||
1916 | append(&sb, " where rowid = ?"); | ||
1917 | return stringBufferData(&sb); | ||
1918 | } | ||
1919 | |||
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
**
** CONTENT_INSERT and CONTENT_UPDATE have their SQL generated at
** runtime (it depends on the table's column list); all other
** statements come from the static fulltext_zStatement[] table.
** Returns an SQLite error code (SQLITE_OK on success).
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    switch( iStmt ){
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                     zStmt);
    /* Only the dynamically generated statements are malloced; the
    ** entries taken from the static table must not be freed. */
    if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}
1950 | |||
1951 | /* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and | ||
1952 | ** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE, | ||
1953 | ** where we expect no results. | ||
1954 | */ | ||
1955 | static int sql_single_step(sqlite3_stmt *s){ | ||
1956 | int rc = sqlite3_step(s); | ||
1957 | return (rc==SQLITE_DONE) ? SQLITE_OK : rc; | ||
1958 | } | ||
1959 | |||
1960 | /* Like sql_get_statement(), but for special replicated LEAF_SELECT | ||
1961 | ** statements. | ||
1962 | */ | ||
1963 | /* TODO(shess) Write version for generic statements and then share | ||
1964 | ** that between the cached-statement functions. | ||
1965 | */ | ||
1966 | static int sql_get_leaf_statement(fulltext_vtab *v, int idx, | ||
1967 | sqlite3_stmt **ppStmt){ | ||
1968 | assert( idx>=0 && idx<MERGE_COUNT ); | ||
1969 | if( v->pLeafSelectStmts[idx]==NULL ){ | ||
1970 | int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx], | ||
1971 | LEAF_SELECT); | ||
1972 | if( rc!=SQLITE_OK ) return rc; | ||
1973 | }else{ | ||
1974 | int rc = sqlite3_reset(v->pLeafSelectStmts[idx]); | ||
1975 | if( rc!=SQLITE_OK ) return rc; | ||
1976 | } | ||
1977 | |||
1978 | *ppStmt = v->pLeafSelectStmts[idx]; | ||
1979 | return SQLITE_OK; | ||
1980 | } | ||
1981 | |||
/* insert into %_content (rowid, ...) values ([rowid], [pValues])
**
** Binds the requested rowid as parameter 1 and the v->nColumn user
** column values as parameters 2..nColumn+1, then runs the cached
** CONTENT_INSERT statement to completion.  Returns an SQLite error
** code (SQLITE_OK on success).
*/
static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
                          sqlite3_value **pValues){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_value(s, 1, rowid);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 2+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  return sql_single_step(s);
}
2000 | |||
/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
 * where rowid = [iRowid]
 *
 * Binds the v->nColumn new column values as parameters 1..nColumn and
 * the target rowid as the final parameter, then runs the cached
 * CONTENT_UPDATE statement.  Returns an SQLite error code. */
static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
                          sqlite_int64 iRowid){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 1+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2020 | |||
/* Free an array of nString malloced strings plus the array itself,
** as returned by content_select().  NULL entries (NULL column
** values) are permitted.
*/
static void freeStringArray(int nString, const char **pString){
  int i;

  /* free(NULL) is a no-op per the C standard, so no per-element
  ** NULL guard is needed. */
  for(i=0; i<nString; ++i){
    free((void *) pString[i]);
  }
  free((void *) pString);
}
2029 | |||
/* select * from %_content where rowid = [iRow]
 * The caller must delete the returned array and all strings in it
 * (see freeStringArray()).  null fields will be NULL in the returned
 * array.
 *
 * Returns SQLITE_OK and sets *pValues on success; on any error
 * *pValues stays NULL and an SQLite error code is returned.
 *
 * NOTE(review): the malloc() below is unchecked; on OOM the writes to
 * values[] would go through NULL.  Consider returning SQLITE_NOMEM.
 *
 * TODO: Perhaps we should return pointer/length strings here for consistency
 * with other code which uses pointer/length. */
static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
                          const char ***pValues){
  sqlite3_stmt *s;
  const char **values;
  int i;
  int rc;

  *pValues = NULL;

  rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iRow);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc!=SQLITE_ROW ) return rc;

  values = (const char **) malloc(v->nColumn * sizeof(const char *));
  for(i=0; i<v->nColumn; ++i){
    if( sqlite3_column_type(s, i)==SQLITE_NULL ){
      values[i] = NULL;
    }else{
      values[i] = string_dup((char*)sqlite3_column_text(s, i));
    }
  }

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ){
    *pValues = values;
    return SQLITE_OK;
  }

  freeStringArray(v->nColumn, values);
  return rc;
}
2074 | |||
2075 | /* delete from %_content where rowid = [iRow ] */ | ||
2076 | static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ | ||
2077 | sqlite3_stmt *s; | ||
2078 | int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); | ||
2079 | if( rc!=SQLITE_OK ) return rc; | ||
2080 | |||
2081 | rc = sqlite3_bind_int64(s, 1, iRow); | ||
2082 | if( rc!=SQLITE_OK ) return rc; | ||
2083 | |||
2084 | return sql_single_step(s); | ||
2085 | } | ||
2086 | |||
/* insert into %_segments values ([pData])
** returns assigned rowid in *piBlockid
**
** pData/nData is bound with SQLITE_STATIC, so the buffer must remain
** valid until the step completes.  SQLITE_ROW from an INSERT is
** unexpected and reported as SQLITE_ERROR.
*/
static int block_insert(fulltext_vtab *v, const char *pData, int nData,
                        sqlite_int64 *piBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  /* The new block's id is the rowid the insert was assigned. */
  *piBlockid = sqlite3_last_insert_rowid(v->db);
  return SQLITE_OK;
}
2106 | |||
/* delete from %_segments
** where rowid between [iStartBlockid] and [iEndBlockid]
**
** Deletes the range of blocks, inclusive, used to delete the blocks
** which form a segment.  Returns an SQLite error code (SQLITE_OK on
** success).
*/
static int block_delete(fulltext_vtab *v,
                        sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 2, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2127 | |||
/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
** at iLevel.  Returns SQLITE_DONE if there are no segments at
** iLevel.  Otherwise returns an error.
**
** Note the unusual convention: SQLITE_ROW/SQLITE_DONE are used as
** success codes here, mirroring sqlite3_step().
*/
static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* Should always get at least one row due to how max() works. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;
  if( rc!=SQLITE_ROW ) return rc;

  /* NULL means that there were no inputs to max(). */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* Drain the statement; a second row here would be unexpected. */
    rc = sqlite3_step(s);
    if( rc==SQLITE_ROW ) return SQLITE_ERROR;
    return rc;
  }

  *pidx = sqlite3_column_int(s, 0);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2161 | |||
/* insert into %_segdir values (
**   [iLevel], [idx],
**   [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
**   [pRootData]
** )
**
** Records a new segment in the segment directory.  pRootData is
** bound SQLITE_STATIC, so the buffer must stay valid until the step
** completes.  Returns an SQLite error code (SQLITE_OK on success).
*/
static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
                      sqlite_int64 iStartBlockid,
                      sqlite_int64 iLeavesEndBlockid,
                      sqlite_int64 iEndBlockid,
                      const char *pRootData, int nRootData){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 2, idx);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 3, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 5, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2197 | |||
/* Queries %_segdir for the block span of the segments in level
** iLevel.  Returns SQLITE_DONE if there are no blocks for iLevel,
** SQLITE_ROW if there are blocks, else an error.
**
** On SQLITE_ROW, *piStartBlockid/*piEndBlockid receive the min/max
** block rowids spanned by the level's segments.
*/
static int segdir_span(fulltext_vtab *v, int iLevel,
                       sqlite_int64 *piStartBlockid,
                       sqlite_int64 *piEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
  if( rc!=SQLITE_ROW ) return rc;

  /* This happens if all segments at this level are entirely inline. */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* We expect only one row.  We must execute another sqlite3_step()
     * to complete the iteration; otherwise the table will remain locked. */
    int rc2 = sqlite3_step(s);
    if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
    return rc2;
  }

  *piStartBlockid = sqlite3_column_int64(s, 0);
  *piEndBlockid = sqlite3_column_int64(s, 1);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2235 | |||
/* Delete the segment blocks and segment directory records for all
** segments at iLevel.
**
** First deletes the level's block span (if any blocks exist), then
** the %_segdir rows themselves.  Returns an SQLite error code
** (SQLITE_OK on success).
*/
static int segdir_delete(fulltext_vtab *v, int iLevel){
  sqlite3_stmt *s;
  sqlite_int64 iStartBlockid, iEndBlockid;
  int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
  /* SQLITE_DONE means no blocks at this level; still delete the dir. */
  if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;

  if( rc==SQLITE_ROW ){
    rc = block_delete(v, iStartBlockid, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* Delete the segment directory itself. */
  rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2259 | |||
/* TODO(shess) clearPendingTerms() is far down the file because
** writeZeroSegment() is far down the file because LeafWriter is far
** down the file.  Consider refactoring the code to move the non-vtab
** code above the vtab code so that we don't need this forward
** reference.
*/
/* Forward reference; needed by fulltext_vtab_destroy() below. */
static int clearPendingTerms(fulltext_vtab *v);
2267 | |||
/*
** Free the memory used to contain a fulltext_vtab structure.
** Finalizes all cached prepared statements, destroys the tokenizer,
** discards any pending index updates, and releases the column-name
** arrays before freeing the structure itself.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
  int iStmt, i;

  TRACE(("FTS2 Destroy %p\n", v));
  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
    if( v->pFulltextStatements[iStmt]!=NULL ){
      sqlite3_finalize(v->pFulltextStatements[iStmt]);
      v->pFulltextStatements[iStmt] = NULL;
    }
  }

  for( i=0; i<MERGE_COUNT; i++ ){
    if( v->pLeafSelectStmts[i]!=NULL ){
      sqlite3_finalize(v->pLeafSelectStmts[i]);
      v->pLeafSelectStmts[i] = NULL;
    }
  }

  if( v->pTokenizer!=NULL ){
    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
    v->pTokenizer = NULL;
  }

  clearPendingTerms(v);

  /* NOTE(review): azColumn is released with free() while the
  ** azContentColumn entries use sqlite3_free() -- presumably matching
  ** how each was allocated in the constructor; verify there. */
  free(v->azColumn);
  for(i = 0; i < v->nColumn; ++i) {
    sqlite3_free(v->azContentColumn[i]);
  }
  free(v->azContentColumn);
  free(v);
}
2303 | |||
/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */

/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters,
** sqlite3IsIdChar[X] must be 1.
**
** Ticket #1066.  the SQL standard does not allow '$' in the
** middle of identfiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
/* Table covers characters 0x20..0x7f; index with (ch - 0x20). */
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* WARNING: this macro assigns to a variable named "c" in the caller's
** scope as a side effect.  Any function using IdChar() must declare a
** local int c (see getToken() below). */
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
2336 | |||
2337 | |||
/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
**
** NOTE: the IdChar() macro assigns to the local variable "c" as a side
** effect, so "c" must remain declared even where it appears unused.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;
  switch( *z ){
    case 0: {
      /* End of input. */
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      /* Collapse a run of whitespace into a single TOKEN_SPACE. */
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      /* Quoted string; a doubled delimiter is an escaped quote. */
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;
          }else{
            break;
          }
        }
      }
      *tokenType = TOKEN_STRING;
      /* Count the closing quote only if one was actually found. */
      return i + (c!=0);
    }
    case '[': {
      /* Bracketed identifier: scan to ']' or end of string.
      ** (c starts as '[', so the c!=']' test passes on entry.) */
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;
      }
      /* Plain identifier: consume all identifier characters. */
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  /* Anything else is a single punctuation character. */
  *tokenType = TOKEN_PUNCT;
  return 1;
}
2387 | |||
/*
** A token extracted from a string is an instance of the following
** structure.  Used by tokenizeString(); z points into the caller's
** string, so a Token owns no storage of its own.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes. */
} Token;
2396 | |||
2397 | /* | ||
2398 | ** Given a input string (which is really one of the argv[] parameters | ||
2399 | ** passed into xConnect or xCreate) split the string up into tokens. | ||
2400 | ** Return an array of pointers to '\000' terminated strings, one string | ||
2401 | ** for each non-whitespace token. | ||
2402 | ** | ||
2403 | ** The returned array is terminated by a single NULL pointer. | ||
2404 | ** | ||
2405 | ** Space to hold the returned array is obtained from a single | ||
2406 | ** malloc and should be freed by passing the return value to free(). | ||
2407 | ** The individual strings within the token list are all a part of | ||
2408 | ** the single memory allocation and will all be freed at once. | ||
2409 | */ | ||
2410 | static char **tokenizeString(const char *z, int *pnToken){ | ||
2411 | int nToken = 0; | ||
2412 | Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) ); | ||
2413 | int n = 1; | ||
2414 | int e, i; | ||
2415 | int totalSize = 0; | ||
2416 | char **azToken; | ||
2417 | char *zCopy; | ||
2418 | while( n>0 ){ | ||
2419 | n = getToken(z, &e); | ||
2420 | if( e!=TOKEN_SPACE ){ | ||
2421 | aToken[nToken].z = z; | ||
2422 | aToken[nToken].n = n; | ||
2423 | nToken++; | ||
2424 | totalSize += n+1; | ||
2425 | } | ||
2426 | z += n; | ||
2427 | } | ||
2428 | azToken = (char**)malloc( nToken*sizeof(char*) + totalSize ); | ||
2429 | zCopy = (char*)&azToken[nToken]; | ||
2430 | nToken--; | ||
2431 | for(i=0; i<nToken; i++){ | ||
2432 | azToken[i] = zCopy; | ||
2433 | n = aToken[i].n; | ||
2434 | memcpy(zCopy, aToken[i].z, n); | ||
2435 | zCopy[n] = 0; | ||
2436 | zCopy += n+1; | ||
2437 | } | ||
2438 | azToken[nToken] = 0; | ||
2439 | free(aToken); | ||
2440 | *pnToken = nToken; | ||
2441 | return azToken; | ||
2442 | } | ||
2443 | |||
/*
** Convert an SQL-style quoted string into a normal string by removing
** the quote characters.  The conversion is done in-place.  If the
** input does not begin with a quote character, then this routine
** is a no-op.
**
** A doubled quote inside the string is an escape for a single quote.
**
** Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
*/
static void dequoteString(char *z){
  int quote;
  int i, j;
  if( z==0 ) return;
  quote = z[0];
  switch( quote ){
    case '\'':  break;
    case '"':   break;
    case '`':   break;                /* For MySQL compatibility */
    case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
    default:    return;
  }
  for(i=1, j=0; z[i]; i++){
    if( z[i]==quote ){
      if( z[i+1]==quote ){
        z[j++] = quote;               /* doubled quote -> literal quote */
        i++;
      }else{
        z[j++] = 0;                   /* closing quote: terminate result */
        break;
      }
    }else{
      z[j++] = z[i];
    }
  }
  /* Bug fix: if the closing quote is missing, the old code left the
  ** shifted string unterminated (trailing garbage from the original
  ** text).  Terminate it here; j<=i so the write is in bounds. */
  if( z[i]==0 ) z[j] = 0;
}
2483 | |||
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.
**
** A token is kept when its first character is alphanumeric or when it
** is longer than one character (so multi-char punctuation survives).
**
** Example:
**
**     input:      tokenize chinese ( 'simplifed' , 'mixed' )
**     output:     chinese simplifed mixed
**
** Another example:
**
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    /* j starts at -1 so the first kept token is counted but not
    ** copied, which is what removes it from the output. */
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    /* Bug fix: when no token at all qualified, j is still -1 and the
    ** old code wrote the terminator to azIn[-1] (out of bounds).
    ** Terminate the (empty) list at the start of the array instead. */
    azIn[j>=0 ? j : 0] = 0;
  }
}
2514 | |||
2515 | |||
/*
** Find the first alphanumeric token in the string zIn.  Null-terminate
** this token.  Remove any quotation marks.  And return a pointer to
** the result.
**
** Returns NULL (with *pzTail left at the end of input) when zIn holds
** nothing but whitespace.
**
** NOTE(review): for a real token *pzTail is set to &zIn[1], which looks
** like it should be &zIn[n+1].  The only caller in this file (parseSpec)
** discards the tail, so this is harmless here -- confirm before reuse.
*/
static char *firstToken(char *zIn, char **pzTail){
  int n, ttype;
  while(1){
    n = getToken(zIn, &ttype);
    if( ttype==TOKEN_SPACE ){
      zIn += n;                 /* skip leading whitespace */
    }else if( ttype==TOKEN_EOF ){
      *pzTail = zIn;
      return 0;
    }else{
      zIn[n] = 0;               /* terminate the token in place */
      *pzTail = &zIn[1];
      dequoteString(zIn);
      return zIn;
    }
  }
  /*NOTREACHED*/
}
2539 | |||
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not a alphanumeric
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  /* Skip over leading whitespace in s. */
  while( safe_isspace(*s) ) s++;

  /* Case-insensitive comparison until t is exhausted. */
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ) return 0;
  }

  /* The prefix only counts if it ends at a token boundary. */
  return *s!='_' && !safe_isalnum(*s);
}
2558 | |||
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and use by fulltextConnect and fulltextCreate.
**
** The string fields point into the azColumn allocation except for
** azContentColumn, whose entries come from sqlite3_mprintf().
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
2572 | |||
2573 | /* | ||
2574 | ** Reclaim all of the memory used by a TableSpec | ||
2575 | */ | ||
2576 | static void clearTableSpec(TableSpec *p) { | ||
2577 | free(p->azColumn); | ||
2578 | free(p->azContentColumn); | ||
2579 | free(p->azTokenizer); | ||
2580 | } | ||
2581 | |||
2582 | /* Parse a CREATE VIRTUAL TABLE statement, which looks like this: | ||
2583 | * | ||
2584 | * CREATE VIRTUAL TABLE email | ||
2585 | * USING fts2(subject, body, tokenize mytokenizer(myarg)) | ||
2586 | * | ||
2587 | * We return parsed information in a TableSpec structure. | ||
2588 | * | ||
2589 | */ | ||
2590 | static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, | ||
2591 | char**pzErr){ | ||
2592 | int i, n; | ||
2593 | char *z, *zDummy; | ||
2594 | char **azArg; | ||
2595 | const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */ | ||
2596 | |||
2597 | assert( argc>=3 ); | ||
2598 | /* Current interface: | ||
2599 | ** argv[0] - module name | ||
2600 | ** argv[1] - database name | ||
2601 | ** argv[2] - table name | ||
2602 | ** argv[3..] - columns, optionally followed by tokenizer specification | ||
2603 | ** and snippet delimiters specification. | ||
2604 | */ | ||
2605 | |||
2606 | /* Make a copy of the complete argv[][] array in a single allocation. | ||
2607 | ** The argv[][] array is read-only and transient. We can write to the | ||
2608 | ** copy in order to modify things and the copy is persistent. | ||
2609 | */ | ||
2610 | CLEAR(pSpec); | ||
2611 | for(i=n=0; i<argc; i++){ | ||
2612 | n += strlen(argv[i]) + 1; | ||
2613 | } | ||
2614 | azArg = malloc( sizeof(char*)*argc + n ); | ||
2615 | if( azArg==0 ){ | ||
2616 | return SQLITE_NOMEM; | ||
2617 | } | ||
2618 | z = (char*)&azArg[argc]; | ||
2619 | for(i=0; i<argc; i++){ | ||
2620 | azArg[i] = z; | ||
2621 | strcpy(z, argv[i]); | ||
2622 | z += strlen(z)+1; | ||
2623 | } | ||
2624 | |||
2625 | /* Identify the column names and the tokenizer and delimiter arguments | ||
2626 | ** in the argv[][] array. | ||
2627 | */ | ||
2628 | pSpec->zDb = azArg[1]; | ||
2629 | pSpec->zName = azArg[2]; | ||
2630 | pSpec->nColumn = 0; | ||
2631 | pSpec->azColumn = azArg; | ||
2632 | zTokenizer = "tokenize simple"; | ||
2633 | for(i=3; i<argc; ++i){ | ||
2634 | if( startsWith(azArg[i],"tokenize") ){ | ||
2635 | zTokenizer = azArg[i]; | ||
2636 | }else{ | ||
2637 | z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy); | ||
2638 | pSpec->nColumn++; | ||
2639 | } | ||
2640 | } | ||
2641 | if( pSpec->nColumn==0 ){ | ||
2642 | azArg[0] = "content"; | ||
2643 | pSpec->nColumn = 1; | ||
2644 | } | ||
2645 | |||
2646 | /* | ||
2647 | ** Construct the list of content column names. | ||
2648 | ** | ||
2649 | ** Each content column name will be of the form cNNAAAA | ||
2650 | ** where NN is the column number and AAAA is the sanitized | ||
2651 | ** column name. "sanitized" means that special characters are | ||
2652 | ** converted to "_". The cNN prefix guarantees that all column | ||
2653 | ** names are unique. | ||
2654 | ** | ||
2655 | ** The AAAA suffix is not strictly necessary. It is included | ||
2656 | ** for the convenience of people who might examine the generated | ||
2657 | ** %_content table and wonder what the columns are used for. | ||
2658 | */ | ||
2659 | pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) ); | ||
2660 | if( pSpec->azContentColumn==0 ){ | ||
2661 | clearTableSpec(pSpec); | ||
2662 | return SQLITE_NOMEM; | ||
2663 | } | ||
2664 | for(i=0; i<pSpec->nColumn; i++){ | ||
2665 | char *p; | ||
2666 | pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); | ||
2667 | for (p = pSpec->azContentColumn[i]; *p ; ++p) { | ||
2668 | if( !safe_isalnum(*p) ) *p = '_'; | ||
2669 | } | ||
2670 | } | ||
2671 | |||
2672 | /* | ||
2673 | ** Parse the tokenizer specification string. | ||
2674 | */ | ||
2675 | pSpec->azTokenizer = tokenizeString(zTokenizer, &n); | ||
2676 | tokenListToIdList(pSpec->azTokenizer); | ||
2677 | |||
2678 | return SQLITE_OK; | ||
2679 | } | ||
2680 | |||
/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table.  Return a pointer to this schema string.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int i;
  char *zCur, *zTmp;
  const char *zLead = "(";      /* "(" before the first column, "," after */

  zCur = sqlite3_mprintf("CREATE TABLE x");
  for(i=0; i<nColumn; i++){
    /* Grow the statement one quoted column name at a time. */
    zTmp = sqlite3_mprintf("%s%s%Q", zCur, zLead, azColumn[i]);
    sqlite3_free(zCur);
    zCur = zTmp;
    zLead = ",";
  }
  /* The hidden trailing column is named after the table itself. */
  zTmp = sqlite3_mprintf("%s,%Q)", zCur, zTableName);
  sqlite3_free(zCur);
  return zTmp;
}
2707 | |||
2708 | /* | ||
2709 | ** Build a new sqlite3_vtab structure that will describe the | ||
2710 | ** fulltext index defined by spec. | ||
2711 | */ | ||
2712 | static int constructVtab( | ||
2713 | sqlite3 *db, /* The SQLite database connection */ | ||
2714 | fts2Hash *pHash, /* Hash table containing tokenizers */ | ||
2715 | TableSpec *spec, /* Parsed spec information from parseSpec() */ | ||
2716 | sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */ | ||
2717 | char **pzErr /* Write any error message here */ | ||
2718 | ){ | ||
2719 | int rc; | ||
2720 | int n; | ||
2721 | fulltext_vtab *v = 0; | ||
2722 | const sqlite3_tokenizer_module *m = NULL; | ||
2723 | char *schema; | ||
2724 | |||
2725 | char const *zTok; /* Name of tokenizer to use for this fts table */ | ||
2726 | int nTok; /* Length of zTok, including nul terminator */ | ||
2727 | |||
2728 | v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); | ||
2729 | if( v==0 ) return SQLITE_NOMEM; | ||
2730 | CLEAR(v); | ||
2731 | /* sqlite will initialize v->base */ | ||
2732 | v->db = db; | ||
2733 | v->zDb = spec->zDb; /* Freed when azColumn is freed */ | ||
2734 | v->zName = spec->zName; /* Freed when azColumn is freed */ | ||
2735 | v->nColumn = spec->nColumn; | ||
2736 | v->azContentColumn = spec->azContentColumn; | ||
2737 | spec->azContentColumn = 0; | ||
2738 | v->azColumn = spec->azColumn; | ||
2739 | spec->azColumn = 0; | ||
2740 | |||
2741 | if( spec->azTokenizer==0 ){ | ||
2742 | return SQLITE_NOMEM; | ||
2743 | } | ||
2744 | |||
2745 | zTok = spec->azTokenizer[0]; | ||
2746 | if( !zTok ){ | ||
2747 | zTok = "simple"; | ||
2748 | } | ||
2749 | nTok = strlen(zTok)+1; | ||
2750 | |||
2751 | m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok); | ||
2752 | if( !m ){ | ||
2753 | *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]); | ||
2754 | rc = SQLITE_ERROR; | ||
2755 | goto err; | ||
2756 | } | ||
2757 | |||
2758 | for(n=0; spec->azTokenizer[n]; n++){} | ||
2759 | if( n ){ | ||
2760 | rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1], | ||
2761 | &v->pTokenizer); | ||
2762 | }else{ | ||
2763 | rc = m->xCreate(0, 0, &v->pTokenizer); | ||
2764 | } | ||
2765 | if( rc!=SQLITE_OK ) goto err; | ||
2766 | v->pTokenizer->pModule = m; | ||
2767 | |||
2768 | /* TODO: verify the existence of backing tables foo_content, foo_term */ | ||
2769 | |||
2770 | schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn, | ||
2771 | spec->zName); | ||
2772 | rc = sqlite3_declare_vtab(db, schema); | ||
2773 | sqlite3_free(schema); | ||
2774 | if( rc!=SQLITE_OK ) goto err; | ||
2775 | |||
2776 | memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); | ||
2777 | |||
2778 | /* Indicate that the buffer is not live. */ | ||
2779 | v->nPendingData = -1; | ||
2780 | |||
2781 | *ppVTab = &v->base; | ||
2782 | TRACE(("FTS2 Connect %p\n", v)); | ||
2783 | |||
2784 | return rc; | ||
2785 | |||
2786 | err: | ||
2787 | fulltext_vtab_destroy(v); | ||
2788 | return rc; | ||
2789 | } | ||
2790 | |||
2791 | static int fulltextConnect( | ||
2792 | sqlite3 *db, | ||
2793 | void *pAux, | ||
2794 | int argc, const char *const*argv, | ||
2795 | sqlite3_vtab **ppVTab, | ||
2796 | char **pzErr | ||
2797 | ){ | ||
2798 | TableSpec spec; | ||
2799 | int rc = parseSpec(&spec, argc, argv, pzErr); | ||
2800 | if( rc!=SQLITE_OK ) return rc; | ||
2801 | |||
2802 | rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr); | ||
2803 | clearTableSpec(&spec); | ||
2804 | return rc; | ||
2805 | } | ||
2806 | |||
2807 | /* The %_content table holds the text of each document, with | ||
2808 | ** the rowid used as the docid. | ||
2809 | */ | ||
2810 | /* TODO(shess) This comment needs elaboration to match the updated | ||
2811 | ** code. Work it into the top-of-file comment at that time. | ||
2812 | */ | ||
2813 | static int fulltextCreate(sqlite3 *db, void *pAux, | ||
2814 | int argc, const char * const *argv, | ||
2815 | sqlite3_vtab **ppVTab, char **pzErr){ | ||
2816 | int rc; | ||
2817 | TableSpec spec; | ||
2818 | StringBuffer schema; | ||
2819 | TRACE(("FTS2 Create\n")); | ||
2820 | |||
2821 | rc = parseSpec(&spec, argc, argv, pzErr); | ||
2822 | if( rc!=SQLITE_OK ) return rc; | ||
2823 | |||
2824 | initStringBuffer(&schema); | ||
2825 | append(&schema, "CREATE TABLE %_content("); | ||
2826 | appendList(&schema, spec.nColumn, spec.azContentColumn); | ||
2827 | append(&schema, ")"); | ||
2828 | rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema)); | ||
2829 | stringBufferDestroy(&schema); | ||
2830 | if( rc!=SQLITE_OK ) goto out; | ||
2831 | |||
2832 | rc = sql_exec(db, spec.zDb, spec.zName, | ||
2833 | "create table %_segments(block blob);"); | ||
2834 | if( rc!=SQLITE_OK ) goto out; | ||
2835 | |||
2836 | rc = sql_exec(db, spec.zDb, spec.zName, | ||
2837 | "create table %_segdir(" | ||
2838 | " level integer," | ||
2839 | " idx integer," | ||
2840 | " start_block integer," | ||
2841 | " leaves_end_block integer," | ||
2842 | " end_block integer," | ||
2843 | " root blob," | ||
2844 | " primary key(level, idx)" | ||
2845 | ");"); | ||
2846 | if( rc!=SQLITE_OK ) goto out; | ||
2847 | |||
2848 | rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr); | ||
2849 | |||
2850 | out: | ||
2851 | clearTableSpec(&spec); | ||
2852 | return rc; | ||
2853 | } | ||
2854 | |||
2855 | /* Decide how to handle an SQL query. */ | ||
2856 | static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ | ||
2857 | int i; | ||
2858 | TRACE(("FTS2 BestIndex\n")); | ||
2859 | |||
2860 | for(i=0; i<pInfo->nConstraint; ++i){ | ||
2861 | const struct sqlite3_index_constraint *pConstraint; | ||
2862 | pConstraint = &pInfo->aConstraint[i]; | ||
2863 | if( pConstraint->usable ) { | ||
2864 | if( pConstraint->iColumn==-1 && | ||
2865 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){ | ||
2866 | pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */ | ||
2867 | TRACE(("FTS2 QUERY_ROWID\n")); | ||
2868 | } else if( pConstraint->iColumn>=0 && | ||
2869 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){ | ||
2870 | /* full-text search */ | ||
2871 | pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn; | ||
2872 | TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn)); | ||
2873 | } else continue; | ||
2874 | |||
2875 | pInfo->aConstraintUsage[i].argvIndex = 1; | ||
2876 | pInfo->aConstraintUsage[i].omit = 1; | ||
2877 | |||
2878 | /* An arbitrary value for now. | ||
2879 | * TODO: Perhaps rowid matches should be considered cheaper than | ||
2880 | * full-text searches. */ | ||
2881 | pInfo->estimatedCost = 1.0; | ||
2882 | |||
2883 | return SQLITE_OK; | ||
2884 | } | ||
2885 | } | ||
2886 | pInfo->idxNum = QUERY_GENERIC; | ||
2887 | return SQLITE_OK; | ||
2888 | } | ||
2889 | |||
2890 | static int fulltextDisconnect(sqlite3_vtab *pVTab){ | ||
2891 | TRACE(("FTS2 Disconnect %p\n", pVTab)); | ||
2892 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2893 | return SQLITE_OK; | ||
2894 | } | ||
2895 | |||
2896 | static int fulltextDestroy(sqlite3_vtab *pVTab){ | ||
2897 | fulltext_vtab *v = (fulltext_vtab *)pVTab; | ||
2898 | int rc; | ||
2899 | |||
2900 | TRACE(("FTS2 Destroy %p\n", pVTab)); | ||
2901 | rc = sql_exec(v->db, v->zDb, v->zName, | ||
2902 | "drop table if exists %_content;" | ||
2903 | "drop table if exists %_segments;" | ||
2904 | "drop table if exists %_segdir;" | ||
2905 | ); | ||
2906 | if( rc!=SQLITE_OK ) return rc; | ||
2907 | |||
2908 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2909 | return SQLITE_OK; | ||
2910 | } | ||
2911 | |||
2912 | static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ | ||
2913 | fulltext_cursor *c; | ||
2914 | |||
2915 | c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); | ||
2916 | /* sqlite will initialize c->base */ | ||
2917 | *ppCursor = &c->base; | ||
2918 | TRACE(("FTS2 Open %p: %p\n", pVTab, c)); | ||
2919 | |||
2920 | return SQLITE_OK; | ||
2921 | } | ||
2922 | |||
2923 | |||
2924 | /* Free all of the dynamically allocated memory held by *q | ||
2925 | */ | ||
2926 | static void queryClear(Query *q){ | ||
2927 | int i; | ||
2928 | for(i = 0; i < q->nTerms; ++i){ | ||
2929 | free(q->pTerms[i].pTerm); | ||
2930 | } | ||
2931 | free(q->pTerms); | ||
2932 | CLEAR(q); | ||
2933 | } | ||
2934 | |||
2935 | /* Free all of the dynamically allocated memory held by the | ||
2936 | ** Snippet | ||
2937 | */ | ||
2938 | static void snippetClear(Snippet *p){ | ||
2939 | free(p->aMatch); | ||
2940 | free(p->zOffset); | ||
2941 | free(p->zSnippet); | ||
2942 | CLEAR(p); | ||
2943 | } | ||
2944 | /* | ||
2945 | ** Append a single entry to the p->aMatch[] log. | ||
2946 | */ | ||
2947 | static void snippetAppendMatch( | ||
2948 | Snippet *p, /* Append the entry to this snippet */ | ||
2949 | int iCol, int iTerm, /* The column and query term */ | ||
2950 | int iStart, int nByte /* Offset and size of the match */ | ||
2951 | ){ | ||
2952 | int i; | ||
2953 | struct snippetMatch *pMatch; | ||
2954 | if( p->nMatch+1>=p->nAlloc ){ | ||
2955 | p->nAlloc = p->nAlloc*2 + 10; | ||
2956 | p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); | ||
2957 | if( p->aMatch==0 ){ | ||
2958 | p->nMatch = 0; | ||
2959 | p->nAlloc = 0; | ||
2960 | return; | ||
2961 | } | ||
2962 | } | ||
2963 | i = p->nMatch++; | ||
2964 | pMatch = &p->aMatch[i]; | ||
2965 | pMatch->iCol = iCol; | ||
2966 | pMatch->iTerm = iTerm; | ||
2967 | pMatch->iStart = iStart; | ||
2968 | pMatch->nByte = nByte; | ||
2969 | } | ||
2970 | |||
/*
** Sizing information for the circular buffer used in snippetOffsetsOfColumn().
** FTS2_ROTOR_SZ must be a power of two so that FTS2_ROTOR_MASK can wrap
** indices with a bitwise AND.
*/
#define FTS2_ROTOR_SZ   (32)
#define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
2976 | |||
/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
**
** The document is re-tokenized with the table's own tokenizer and each
** token is compared against every query term.  Bitmasks (match /
** prevMatch) track multi-word phrases: a non-leading phrase word only
** counts if the preceding word matched on the previous token.  A
** circular buffer of the last FTS2_ROTOR_SZ token offsets lets a
** completed phrase report the offsets of all of its words.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  const QueryTerm *aTerm;              /* Query string terms */
  int nTerm;                           /* Number of query string terms */
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS2_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS2_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;                     /* tokenizer failure: no offsets logged */
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  /* Each term occupies one bit of the unsigned-int masks, so clamp the
  ** number of terms considered. */
  if( nTerm>=FTS2_ROTOR_SZ ){
    nTerm = FTS2_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;
    /* Record this token's offset/length in the circular buffer. */
    iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      /* Skip terms restricted to a different column. */
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      /* A non-prefix term must match the token's full length. */
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      /* Non-leading phrase words need the previous word to have matched
      ** on the previous token (prevMatch is last token's mask << 1). */
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      /* Final word of a phrase: log offsets for the whole phrase using
      ** the circular buffer. */
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS2_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
3050 | |||
3051 | |||
3052 | /* | ||
3053 | ** Compute all offsets for the current row of the query. | ||
3054 | ** If the offsets have already been computed, this routine is a no-op. | ||
3055 | */ | ||
3056 | static void snippetAllOffsets(fulltext_cursor *p){ | ||
3057 | int nColumn; | ||
3058 | int iColumn, i; | ||
3059 | int iFirst, iLast; | ||
3060 | fulltext_vtab *pFts; | ||
3061 | |||
3062 | if( p->snippet.nMatch ) return; | ||
3063 | if( p->q.nTerms==0 ) return; | ||
3064 | pFts = p->q.pFts; | ||
3065 | nColumn = pFts->nColumn; | ||
3066 | iColumn = (p->iCursorType - QUERY_FULLTEXT); | ||
3067 | if( iColumn<0 || iColumn>=nColumn ){ | ||
3068 | iFirst = 0; | ||
3069 | iLast = nColumn-1; | ||
3070 | }else{ | ||
3071 | iFirst = iColumn; | ||
3072 | iLast = iColumn; | ||
3073 | } | ||
3074 | for(i=iFirst; i<=iLast; i++){ | ||
3075 | const char *zDoc; | ||
3076 | int nDoc; | ||
3077 | zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); | ||
3078 | nDoc = sqlite3_column_bytes(p->pStmt, i+1); | ||
3079 | snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); | ||
3080 | } | ||
3081 | } | ||
3082 | |||
3083 | /* | ||
3084 | ** Convert the information in the aMatch[] array of the snippet | ||
3085 | ** into the string zOffset[0..nOffset-1]. | ||
3086 | */ | ||
3087 | static void snippetOffsetText(Snippet *p){ | ||
3088 | int i; | ||
3089 | int cnt = 0; | ||
3090 | StringBuffer sb; | ||
3091 | char zBuf[200]; | ||
3092 | if( p->zOffset ) return; | ||
3093 | initStringBuffer(&sb); | ||
3094 | for(i=0; i<p->nMatch; i++){ | ||
3095 | struct snippetMatch *pMatch = &p->aMatch[i]; | ||
3096 | zBuf[0] = ' '; | ||
3097 | sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol, | ||
3098 | pMatch->iTerm, pMatch->iStart, pMatch->nByte); | ||
3099 | append(&sb, zBuf); | ||
3100 | cnt++; | ||
3101 | } | ||
3102 | p->zOffset = stringBufferData(&sb); | ||
3103 | p->nOffset = stringBufferLength(&sb); | ||
3104 | } | ||
3105 | |||
3106 | /* | ||
3107 | ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set | ||
3108 | ** of matching words some of which might be in zDoc. zDoc is column | ||
3109 | ** number iCol. | ||
3110 | ** | ||
3111 | ** iBreak is suggested spot in zDoc where we could begin or end an | ||
3112 | ** excerpt. Return a value similar to iBreak but possibly adjusted | ||
3113 | ** to be a little left or right so that the break point is better. | ||
3114 | */ | ||
3115 | static int wordBoundary( | ||
3116 | int iBreak, /* The suggested break point */ | ||
3117 | const char *zDoc, /* Document text */ | ||
3118 | int nDoc, /* Number of bytes in zDoc[] */ | ||
3119 | struct snippetMatch *aMatch, /* Matching words */ | ||
3120 | int nMatch, /* Number of entries in aMatch[] */ | ||
3121 | int iCol /* The column number for zDoc[] */ | ||
3122 | ){ | ||
3123 | int i; | ||
3124 | if( iBreak<=10 ){ | ||
3125 | return 0; | ||
3126 | } | ||
3127 | if( iBreak>=nDoc-10 ){ | ||
3128 | return nDoc; | ||
3129 | } | ||
3130 | for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){} | ||
3131 | while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; } | ||
3132 | if( i<nMatch ){ | ||
3133 | if( aMatch[i].iStart<iBreak+10 ){ | ||
3134 | return aMatch[i].iStart; | ||
3135 | } | ||
3136 | if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ | ||
3137 | return aMatch[i-1].iStart; | ||
3138 | } | ||
3139 | } | ||
3140 | for(i=1; i<=10; i++){ | ||
3141 | if( safe_isspace(zDoc[iBreak-i]) ){ | ||
3142 | return iBreak - i + 1; | ||
3143 | } | ||
3144 | if( safe_isspace(zDoc[iBreak+i]) ){ | ||
3145 | return iBreak + i + 1; | ||
3146 | } | ||
3147 | } | ||
3148 | return iBreak; | ||
3149 | } | ||
3150 | |||
3151 | |||
3152 | |||
/*
** Allowed values for Snippet.aMatch[].snStatus, used by snippetText()
** below to decide which matches appear in the generated excerpt.
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
3158 | |||
/*
** Generate the text of a snippet for the current row of pCursor.
**
** The result (text and length) is stored in pCursor->snippet.zSnippet /
** nSnippet, replacing any previous snippet.  Each desired match is
** surrounded by zStartMark/zEndMark, and zEllipsis separates excerpts
** taken from non-adjacent parts of the document.
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;  /* All matches for the current row */
  int nMatch;                   /* Number of entries in aMatch[] */
  int nDesired;                 /* Matches still wanted in the snippet */
  StringBuffer sb;              /* Snippet text is accumulated here */
  int tailCol;                  /* Column of the previous excerpt (-1 if none) */
  int tailOffset;               /* End offset of the previous excerpt */
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;             /* Byte range of the current excerpt */
  int tailEllipsis = 0;         /* True if a trailing ellipsis is needed */
  int iMatch;


  /* Discard any previously generated snippet. */
  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  /* Mark one match per query term as DESIRED; all others are IGNOREd
  ** so that each term is highlighted at most once. */
  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  /* Emit one excerpt (roughly +/- 40 bytes of context) around each
  ** DESIRED match, merging excerpts that overlap or nearly touch. */
  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    /* Fuse with the previous excerpt if it is close by in the same
    ** column. */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    /* A gap between excerpts is represented by an ellipsis. */
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    /* Copy zDoc[iStart..iEnd-1] into the snippet, wrapping every match
    ** that falls inside the range with zStartMark/zEndMark. */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
          && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* Any later match for the same term no longer needs its own
        ** excerpt. */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  /* Ownership of the buffer passes to the cursor's snippet. */
  pCursor->snippet.zSnippet = stringBufferData(&sb);
  pCursor->snippet.nSnippet = stringBufferLength(&sb);
}
3271 | |||
3272 | |||
3273 | /* | ||
3274 | ** Close the cursor. For additional information see the documentation | ||
3275 | ** on the xClose method of the virtual table interface. | ||
3276 | */ | ||
3277 | static int fulltextClose(sqlite3_vtab_cursor *pCursor){ | ||
3278 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3279 | TRACE(("FTS2 Close %p\n", c)); | ||
3280 | sqlite3_finalize(c->pStmt); | ||
3281 | queryClear(&c->q); | ||
3282 | snippetClear(&c->snippet); | ||
3283 | if( c->result.nData!=0 ) dlrDestroy(&c->reader); | ||
3284 | dataBufferDestroy(&c->result); | ||
3285 | free(c); | ||
3286 | return SQLITE_OK; | ||
3287 | } | ||
3288 | |||
/* Advance the cursor to the next row (xNext).  Sets c->eof when the
** result set is exhausted.
**
** For table scans and rowid lookups (iCursorType < QUERY_FULLTEXT) this
** simply steps the prepared statement.  For full-text queries it pulls
** the next docid from the doclist reader, binds it into the lookup
** statement, and steps that statement to fetch the row's content.
*/
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  int rc;

  TRACE(("FTS2 Next %p\n", pCursor));
  /* Any cached snippet belongs to the previous row. */
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        /* Real error: flag eof so xEof stops the scan, and report it. */
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    /* End of the doclist (or an empty result) means end of results. */
    if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
      c->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
    dlrStep(&c->reader);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){   /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* An error occurred; abort.  SQLITE_DONE here means the docid from
    ** the index had no matching %_content row, which is corruption. */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}
3330 | |||
3331 | |||
/* TODO(shess) If we pushed LeafReader to the top of the file, or to
** another file, term_select() could be pushed above
** docListOfTerm().
*/
/* Forward declaration; writes the doclist for pTerm (optionally as a
** prefix match) restricted to iColumn into *out — see uses below. */
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out);
3339 | |||
/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** The resulting DL_DOCIDS doclist is stored in pResult, which is
** overwritten.  On success, ownership of pResult's buffer passes to
** the caller.
*/
static int docListOfTerm(
  fulltext_vtab *v,    /* The full text index */
  int iColumn,         /* column to restrict to.  No restriction if >=nColumn */
  QueryTerm *pQTerm,   /* Term we are looking for, or 1st term of a phrase */
  DataBuffer *pResult  /* Write the result here */
){
  DataBuffer left, right, new;
  int i, rc;

  /* No phrase search if no position info. */
  assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  /* Doclist for the first term.  Position data is only needed when
  ** additional phrase terms must be merged in. */
  dataBufferInit(&left, 0);
  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                  0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
  if( rc ) return rc;
  /* Intersect each following phrase term; stop early if the running
  ** intersection is already empty. */
  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
    dataBufferInit(&right, 0);
    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                    pQTerm[i].isPrefix, DL_POSITIONS, &right);
    if( rc ){
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    /* Keep positions through intermediate merges; the final merge only
    ** needs docids. */
    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                       i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
    dataBufferDestroy(&left);
    dataBufferDestroy(&right);
    left = new;
  }
  *pResult = left;
  return SQLITE_OK;
}
3384 | |||
3385 | /* Add a new term pTerm[0..nTerm-1] to the query *q. | ||
3386 | */ | ||
3387 | static void queryAdd(Query *q, const char *pTerm, int nTerm){ | ||
3388 | QueryTerm *t; | ||
3389 | ++q->nTerms; | ||
3390 | q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); | ||
3391 | if( q->pTerms==0 ){ | ||
3392 | q->nTerms = 0; | ||
3393 | return; | ||
3394 | } | ||
3395 | t = &q->pTerms[q->nTerms - 1]; | ||
3396 | CLEAR(t); | ||
3397 | t->pTerm = malloc(nTerm+1); | ||
3398 | memcpy(t->pTerm, pTerm, nTerm); | ||
3399 | t->pTerm[nTerm] = 0; | ||
3400 | t->nTerm = nTerm; | ||
3401 | t->isOr = q->nextIsOr; | ||
3402 | t->isPrefix = 0; | ||
3403 | q->nextIsOr = 0; | ||
3404 | t->iColumn = q->nextColumn; | ||
3405 | q->nextColumn = q->dfltColumn; | ||
3406 | } | ||
3407 | |||
3408 | /* | ||
3409 | ** Check to see if the string zToken[0...nToken-1] matches any | ||
3410 | ** column name in the virtual table. If it does, | ||
3411 | ** return the zero-indexed column number. If not, return -1. | ||
3412 | */ | ||
3413 | static int checkColumnSpecifier( | ||
3414 | fulltext_vtab *pVtab, /* The virtual table */ | ||
3415 | const char *zToken, /* Text of the token */ | ||
3416 | int nToken /* Number of characters in the token */ | ||
3417 | ){ | ||
3418 | int i; | ||
3419 | for(i=0; i<pVtab->nColumn; i++){ | ||
3420 | if( memcmp(pVtab->azColumn[i], zToken, nToken)==0 | ||
3421 | && pVtab->azColumn[i][nToken]==0 ){ | ||
3422 | return i; | ||
3423 | } | ||
3424 | } | ||
3425 | return -1; | ||
3426 | } | ||
3427 | |||
3428 | /* | ||
3429 | ** Parse the text at pSegment[0..nSegment-1]. Add additional terms | ||
3430 | ** to the query being assemblied in pQuery. | ||
3431 | ** | ||
3432 | ** inPhrase is true if pSegment[0..nSegement-1] is contained within | ||
3433 | ** double-quotes. If inPhrase is true, then the first term | ||
3434 | ** is marked with the number of terms in the phrase less one and | ||
3435 | ** OR and "-" syntax is ignored. If inPhrase is false, then every | ||
3436 | ** term found is marked with nPhrase=0 and OR and "-" syntax is significant. | ||
3437 | */ | ||
3438 | static int tokenizeSegment( | ||
3439 | sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */ | ||
3440 | const char *pSegment, int nSegment, /* Query expression being parsed */ | ||
3441 | int inPhrase, /* True if within "..." */ | ||
3442 | Query *pQuery /* Append results here */ | ||
3443 | ){ | ||
3444 | const sqlite3_tokenizer_module *pModule = pTokenizer->pModule; | ||
3445 | sqlite3_tokenizer_cursor *pCursor; | ||
3446 | int firstIndex = pQuery->nTerms; | ||
3447 | int iCol; | ||
3448 | int nTerm = 1; | ||
3449 | |||
3450 | int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor); | ||
3451 | if( rc!=SQLITE_OK ) return rc; | ||
3452 | pCursor->pTokenizer = pTokenizer; | ||
3453 | |||
3454 | while( 1 ){ | ||
3455 | const char *pToken; | ||
3456 | int nToken, iBegin, iEnd, iPos; | ||
3457 | |||
3458 | rc = pModule->xNext(pCursor, | ||
3459 | &pToken, &nToken, | ||
3460 | &iBegin, &iEnd, &iPos); | ||
3461 | if( rc!=SQLITE_OK ) break; | ||
3462 | if( !inPhrase && | ||
3463 | pSegment[iEnd]==':' && | ||
3464 | (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){ | ||
3465 | pQuery->nextColumn = iCol; | ||
3466 | continue; | ||
3467 | } | ||
3468 | if( !inPhrase && pQuery->nTerms>0 && nToken==2 | ||
3469 | && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){ | ||
3470 | pQuery->nextIsOr = 1; | ||
3471 | continue; | ||
3472 | } | ||
3473 | queryAdd(pQuery, pToken, nToken); | ||
3474 | if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ | ||
3475 | pQuery->pTerms[pQuery->nTerms-1].isNot = 1; | ||
3476 | } | ||
3477 | if( iEnd<nSegment && pSegment[iEnd]=='*' ){ | ||
3478 | pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1; | ||
3479 | } | ||
3480 | pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm; | ||
3481 | if( inPhrase ){ | ||
3482 | nTerm++; | ||
3483 | } | ||
3484 | } | ||
3485 | |||
3486 | if( inPhrase && pQuery->nTerms>firstIndex ){ | ||
3487 | pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1; | ||
3488 | } | ||
3489 | |||
3490 | return pModule->xClose(pCursor); | ||
3491 | } | ||
3492 | |||
3493 | /* Parse a query string, yielding a Query object pQuery. | ||
3494 | ** | ||
3495 | ** The calling function will need to queryClear() to clean up | ||
3496 | ** the dynamically allocated memory held by pQuery. | ||
3497 | */ | ||
3498 | static int parseQuery( | ||
3499 | fulltext_vtab *v, /* The fulltext index */ | ||
3500 | const char *zInput, /* Input text of the query string */ | ||
3501 | int nInput, /* Size of the input text */ | ||
3502 | int dfltColumn, /* Default column of the index to match against */ | ||
3503 | Query *pQuery /* Write the parse results here. */ | ||
3504 | ){ | ||
3505 | int iInput, inPhrase = 0; | ||
3506 | |||
3507 | if( zInput==0 ) nInput = 0; | ||
3508 | if( nInput<0 ) nInput = strlen(zInput); | ||
3509 | pQuery->nTerms = 0; | ||
3510 | pQuery->pTerms = NULL; | ||
3511 | pQuery->nextIsOr = 0; | ||
3512 | pQuery->nextColumn = dfltColumn; | ||
3513 | pQuery->dfltColumn = dfltColumn; | ||
3514 | pQuery->pFts = v; | ||
3515 | |||
3516 | for(iInput=0; iInput<nInput; ++iInput){ | ||
3517 | int i; | ||
3518 | for(i=iInput; i<nInput && zInput[i]!='"'; ++i){} | ||
3519 | if( i>iInput ){ | ||
3520 | tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase, | ||
3521 | pQuery); | ||
3522 | } | ||
3523 | iInput = i; | ||
3524 | if( i<nInput ){ | ||
3525 | assert( zInput[i]=='"' ); | ||
3526 | inPhrase = !inPhrase; | ||
3527 | } | ||
3528 | } | ||
3529 | |||
3530 | if( inPhrase ){ | ||
3531 | /* unmatched quote */ | ||
3532 | queryClear(pQuery); | ||
3533 | return SQLITE_ERROR; | ||
3534 | } | ||
3535 | return SQLITE_OK; | ||
3536 | } | ||
3537 | |||
/* TODO(shess) Refactor the code to remove this forward decl. */
/* Forward declaration; flushes buffered pendingTerms updates before a
** query runs — see fulltextQuery() below. */
static int flushPendingTerms(fulltext_vtab *v);
3540 | |||
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
**
** On success, ownership of pResult's buffer passes to the caller;
** pQuery holds the parsed query and must be released with
** queryClear() by the caller (the cursor does this).
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;   /* running AND / per-term / OR buffers */
  int nNot = 0;                      /* count of NOT terms (handled last) */
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */

  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  /* Note: i==nNot exactly when no non-NOT term has been processed yet,
  ** i.e. `left` is not yet initialized — hence the i!=nNot guards on
  ** the error paths below. */
  aTerm = pQuery->pTerms;
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    /* iNext skips over the remaining terms of this phrase. */
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    /* Fold any following OR terms into `right` before AND-ing. */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){           /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  *pResult = left;
  return rc;
}
3652 | |||
3653 | /* | ||
3654 | ** This is the xFilter interface for the virtual table. See | ||
3655 | ** the virtual table xFilter method documentation for additional | ||
3656 | ** information. | ||
3657 | ** | ||
3658 | ** If idxNum==QUERY_GENERIC then do a full table scan against | ||
3659 | ** the %_content table. | ||
3660 | ** | ||
3661 | ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry | ||
3662 | ** in the %_content table. | ||
3663 | ** | ||
3664 | ** If idxNum>=QUERY_FULLTEXT then use the full text index. The | ||
3665 | ** column on the left-hand side of the MATCH operator is column | ||
3666 | ** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand | ||
3667 | ** side of the MATCH operator. | ||
3668 | */ | ||
3669 | /* TODO(shess) Upgrade the cursor initialization and destruction to | ||
3670 | ** account for fulltextFilter() being called multiple times on the | ||
3671 | ** same cursor. The current solution is very fragile. Apply fix to | ||
3672 | ** fts2 as appropriate. | ||
3673 | */ | ||
3674 | static int fulltextFilter( | ||
3675 | sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */ | ||
3676 | int idxNum, const char *idxStr, /* Which indexing scheme to use */ | ||
3677 | int argc, sqlite3_value **argv /* Arguments for the indexing scheme */ | ||
3678 | ){ | ||
3679 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3680 | fulltext_vtab *v = cursor_vtab(c); | ||
3681 | int rc; | ||
3682 | char *zSql; | ||
3683 | |||
3684 | TRACE(("FTS2 Filter %p\n",pCursor)); | ||
3685 | |||
3686 | zSql = sqlite3_mprintf("select rowid, * from %%_content %s", | ||
3687 | idxNum==QUERY_GENERIC ? "" : "where rowid=?"); | ||
3688 | sqlite3_finalize(c->pStmt); | ||
3689 | rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql); | ||
3690 | sqlite3_free(zSql); | ||
3691 | if( rc!=SQLITE_OK ) return rc; | ||
3692 | |||
3693 | c->iCursorType = idxNum; | ||
3694 | switch( idxNum ){ | ||
3695 | case QUERY_GENERIC: | ||
3696 | break; | ||
3697 | |||
3698 | case QUERY_ROWID: | ||
3699 | rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0])); | ||
3700 | if( rc!=SQLITE_OK ) return rc; | ||
3701 | break; | ||
3702 | |||
3703 | default: /* full-text search */ | ||
3704 | { | ||
3705 | const char *zQuery = (const char *)sqlite3_value_text(argv[0]); | ||
3706 | assert( idxNum<=QUERY_FULLTEXT+v->nColumn); | ||
3707 | assert( argc==1 ); | ||
3708 | queryClear(&c->q); | ||
3709 | if( c->result.nData!=0 ){ | ||
3710 | /* This case happens if the same cursor is used repeatedly. */ | ||
3711 | dlrDestroy(&c->reader); | ||
3712 | dataBufferReset(&c->result); | ||
3713 | }else{ | ||
3714 | dataBufferInit(&c->result, 0); | ||
3715 | } | ||
3716 | rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q); | ||
3717 | if( rc!=SQLITE_OK ) return rc; | ||
3718 | if( c->result.nData!=0 ){ | ||
3719 | dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData); | ||
3720 | } | ||
3721 | break; | ||
3722 | } | ||
3723 | } | ||
3724 | |||
3725 | return fulltextNext(pCursor); | ||
3726 | } | ||
3727 | |||
3728 | /* This is the xEof method of the virtual table. The SQLite core | ||
3729 | ** calls this routine to find out if it has reached the end of | ||
3730 | ** a query's results set. | ||
3731 | */ | ||
3732 | static int fulltextEof(sqlite3_vtab_cursor *pCursor){ | ||
3733 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3734 | return c->eof; | ||
3735 | } | ||
3736 | |||
3737 | /* This is the xColumn method of the virtual table. The SQLite | ||
3738 | ** core calls this method during a query when it needs the value | ||
3739 | ** of a column from the virtual table. This method needs to use | ||
3740 | ** one of the sqlite3_result_*() routines to store the requested | ||
3741 | ** value back in the pContext. | ||
3742 | */ | ||
3743 | static int fulltextColumn(sqlite3_vtab_cursor *pCursor, | ||
3744 | sqlite3_context *pContext, int idxCol){ | ||
3745 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3746 | fulltext_vtab *v = cursor_vtab(c); | ||
3747 | |||
3748 | if( idxCol<v->nColumn ){ | ||
3749 | sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1); | ||
3750 | sqlite3_result_value(pContext, pVal); | ||
3751 | }else if( idxCol==v->nColumn ){ | ||
3752 | /* The extra column whose name is the same as the table. | ||
3753 | ** Return a blob which is a pointer to the cursor | ||
3754 | */ | ||
3755 | sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT); | ||
3756 | } | ||
3757 | return SQLITE_OK; | ||
3758 | } | ||
3759 | |||
3760 | /* This is the xRowid method. The SQLite core calls this routine to | ||
3761 | ** retrive the rowid for the current row of the result set. The | ||
3762 | ** rowid should be written to *pRowid. | ||
3763 | */ | ||
3764 | static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ | ||
3765 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3766 | |||
3767 | *pRowid = sqlite3_column_int64(c->pStmt, 0); | ||
3768 | return SQLITE_OK; | ||
3769 | } | ||
3770 | |||
/* Add all terms in [zText] to pendingTerms table.  If [iColumn] > 0,
** we also store positions and offsets in the hash table using that
** column number.
**
** v->nPendingData is updated to account for the hash-table entry and
** any doclist bytes added, so the caller can decide when to flush.
*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DLCollector *p;
    int nData;                   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    /* Find (or create) the in-memory doclist collector for this term. */
    p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      /* Start a new docid entry unless we're still on the same docid. */
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    /* iColumn<0 signals a delete pass: record the docid only, with no
    ** position data (see deleteTerms()). */
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  return rc;
}
3829 | |||
3830 | /* Add doclists for all terms in [pValues] to pendingTerms table. */ | ||
3831 | static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid, | ||
3832 | sqlite3_value **pValues){ | ||
3833 | int i; | ||
3834 | for(i = 0; i < v->nColumn ; ++i){ | ||
3835 | char *zText = (char*)sqlite3_value_text(pValues[i]); | ||
3836 | int rc = buildTerms(v, iRowid, zText, i); | ||
3837 | if( rc!=SQLITE_OK ) return rc; | ||
3838 | } | ||
3839 | return SQLITE_OK; | ||
3840 | } | ||
3841 | |||
3842 | /* Add empty doclists for all terms in the given row's content to | ||
3843 | ** pendingTerms. | ||
3844 | */ | ||
3845 | static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){ | ||
3846 | const char **pValues; | ||
3847 | int i, rc; | ||
3848 | |||
3849 | /* TODO(shess) Should we allow such tables at all? */ | ||
3850 | if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR; | ||
3851 | |||
3852 | rc = content_select(v, iRowid, &pValues); | ||
3853 | if( rc!=SQLITE_OK ) return rc; | ||
3854 | |||
3855 | for(i = 0 ; i < v->nColumn; ++i) { | ||
3856 | rc = buildTerms(v, iRowid, pValues[i], -1); | ||
3857 | if( rc!=SQLITE_OK ) break; | ||
3858 | } | ||
3859 | |||
3860 | freeStringArray(v->nColumn, pValues); | ||
3861 | return SQLITE_OK; | ||
3862 | } | ||
3863 | |||
/* TODO(shess) Refactor the code to remove this forward decl. */
/* Forward declaration; prepares pendingTerms for updates against
** iDocid — used by the index_* functions below. */
static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
3866 | |||
3867 | /* Insert a row into the %_content table; set *piRowid to be the ID of the | ||
3868 | ** new row. Add doclists for terms to pendingTerms. | ||
3869 | */ | ||
3870 | static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid, | ||
3871 | sqlite3_value **pValues, sqlite_int64 *piRowid){ | ||
3872 | int rc; | ||
3873 | |||
3874 | rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */ | ||
3875 | if( rc!=SQLITE_OK ) return rc; | ||
3876 | |||
3877 | *piRowid = sqlite3_last_insert_rowid(v->db); | ||
3878 | rc = initPendingTerms(v, *piRowid); | ||
3879 | if( rc!=SQLITE_OK ) return rc; | ||
3880 | |||
3881 | return insertTerms(v, *piRowid, pValues); | ||
3882 | } | ||
3883 | |||
3884 | /* Delete a row from the %_content table; add empty doclists for terms | ||
3885 | ** to pendingTerms. | ||
3886 | */ | ||
3887 | static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ | ||
3888 | int rc = initPendingTerms(v, iRow); | ||
3889 | if( rc!=SQLITE_OK ) return rc; | ||
3890 | |||
3891 | rc = deleteTerms(v, iRow); | ||
3892 | if( rc!=SQLITE_OK ) return rc; | ||
3893 | |||
3894 | return content_delete(v, iRow); /* execute an SQL DELETE */ | ||
3895 | } | ||
3896 | |||
3897 | /* Update a row in the %_content table; add delete doclists to | ||
3898 | ** pendingTerms for old terms not in the new data, add insert doclists | ||
3899 | ** to pendingTerms for terms in the new data. | ||
3900 | */ | ||
3901 | static int index_update(fulltext_vtab *v, sqlite_int64 iRow, | ||
3902 | sqlite3_value **pValues){ | ||
3903 | int rc = initPendingTerms(v, iRow); | ||
3904 | if( rc!=SQLITE_OK ) return rc; | ||
3905 | |||
3906 | /* Generate an empty doclist for each term that previously appeared in this | ||
3907 | * row. */ | ||
3908 | rc = deleteTerms(v, iRow); | ||
3909 | if( rc!=SQLITE_OK ) return rc; | ||
3910 | |||
3911 | rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */ | ||
3912 | if( rc!=SQLITE_OK ) return rc; | ||
3913 | |||
3914 | /* Now add positions for terms which appear in the updated row. */ | ||
3915 | return insertTerms(v, iRow, pValues); | ||
3916 | } | ||
3917 | |||
/*******************************************************************/
/* InteriorWriter is used to collect terms and block references into
** interior nodes in %_segments.  See commentary at top of file for
** format.
*/

/* How large interior nodes can grow (bytes of accumulated data). */
#define INTERIOR_MAX 2048

/* Minimum number of terms per interior node (except the root).  This
** prevents large terms from making the tree too skinny - must be >0
** so that the tree always makes progress.  Note that the min tree
** fanout will be INTERIOR_MIN_TERMS+1.
*/
#define INTERIOR_MIN_TERMS 7
#if INTERIOR_MIN_TERMS<1
# error INTERIOR_MIN_TERMS must be greater than 0.
#endif

/* ROOT_MAX controls how much data is stored inline in the segment
** directory.
*/
/* TODO(shess) Push ROOT_MAX down to whoever is writing things.  It's
** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
** can both see it, but if the caller passed it in, we wouldn't even
** need a define.
*/
#define ROOT_MAX 1024
#if ROOT_MAX<VARINT_MAX*2
# error ROOT_MAX must have enough space for a header.
#endif
3949 | |||
/* InteriorBlock stores a linked-list of interior blocks while a lower
** layer is being constructed.
*/
typedef struct InteriorBlock {
  DataBuffer term;             /* Leftmost term in block's subtree. */
  DataBuffer data;             /* Accumulated data for the block. */
  struct InteriorBlock *next;  /* Next block in the list (NULL at end). */
} InteriorBlock;
3958 | |||
3959 | static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock, | ||
3960 | const char *pTerm, int nTerm){ | ||
3961 | InteriorBlock *block = calloc(1, sizeof(InteriorBlock)); | ||
3962 | char c[VARINT_MAX+VARINT_MAX]; | ||
3963 | int n; | ||
3964 | |||
3965 | dataBufferInit(&block->term, 0); | ||
3966 | dataBufferReplace(&block->term, pTerm, nTerm); | ||
3967 | |||
3968 | n = putVarint(c, iHeight); | ||
3969 | n += putVarint(c+n, iChildBlock); | ||
3970 | dataBufferInit(&block->data, INTERIOR_MAX); | ||
3971 | dataBufferReplace(&block->data, c, n); | ||
3972 | |||
3973 | return block; | ||
3974 | } | ||
3975 | |||
#ifndef NDEBUG
/* Verify that the data is readable as an interior node: a height
** varint, a base blockid varint, then zero or more encoded terms
** (first term plain, the rest prefix/suffix delta-encoded). Debug
** builds call this via ASSERT_VALID_INTERIOR_BLOCK().
*/
static void interiorBlockValidate(InteriorBlock *pBlock){
  const char *pData = pBlock->data.pData;
  int nData = pBlock->data.nData;
  int n, iDummy;
  sqlite_int64 iBlockid;

  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with height of node as a varint(n), n>0 */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Must contain iBlockid. */
  n = getVarint(pData, &iBlockid);
  assert( n>0 );
  assert( n<=nData );
  pData += n;
  nData -= n;

  /* Zero or more terms of positive length */
  if( nData!=0 ){
    /* First term is not delta-encoded. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0);
    assert( n+iDummy<=nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Following terms delta-encoded. */
    while( nData!=0 ){
      /* Length of shared prefix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>=0 );
      assert( n<nData );
      pData += n;
      nData -= n;

      /* Length and data of distinct suffix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>0 );
      assert( n+iDummy>0);
      assert( n+iDummy<=nData );
      pData += n+iDummy;
      nData -= n+iDummy;
    }
  }
}
#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
#else
#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
#endif
4039 | |||
/* InteriorWriter accumulates one height of the interior-node tree as
** a chain of InteriorBlocks; when blocks are flushed a parentWriter
** is created lazily to collect the level above.
*/
typedef struct InteriorWriter {
  int iHeight;                   /* from 0 at leaves. */
  InteriorBlock *first, *last;   /* Chain of blocks at this height. */
  struct InteriorWriter *parentWriter;  /* Level above, lazily allocated. */

  DataBuffer term;               /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;  /* for consistency checks. */
#endif
} InteriorWriter;
4051 | |||
4052 | /* Initialize an interior node where pTerm[nTerm] marks the leftmost | ||
4053 | ** term in the tree. iChildBlock is the leftmost child block at the | ||
4054 | ** next level down the tree. | ||
4055 | */ | ||
4056 | static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm, | ||
4057 | sqlite_int64 iChildBlock, | ||
4058 | InteriorWriter *pWriter){ | ||
4059 | InteriorBlock *block; | ||
4060 | assert( iHeight>0 ); | ||
4061 | CLEAR(pWriter); | ||
4062 | |||
4063 | pWriter->iHeight = iHeight; | ||
4064 | pWriter->iOpeningChildBlock = iChildBlock; | ||
4065 | #ifndef NDEBUG | ||
4066 | pWriter->iLastChildBlock = iChildBlock; | ||
4067 | #endif | ||
4068 | block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm); | ||
4069 | pWriter->last = pWriter->first = block; | ||
4070 | ASSERT_VALID_INTERIOR_BLOCK(pWriter->last); | ||
4071 | dataBufferInit(&pWriter->term, 0); | ||
4072 | } | ||
4073 | |||
/* Append the child node rooted at iChildBlock to the interior node,
** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
** Terms must arrive in sorted order and child blockids must be
** consecutive (asserted below).
*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function). That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    n = putVarint(c, nTerm);
  }else{
    /* Delta-encode: count bytes shared with the previous term. */
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    /* The new block absorbs iChildBlock into its header, so the
    ** delta-encoding state is reset rather than updated.
    */
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
4125 | |||
4126 | /* Free the space used by pWriter, including the linked-list of | ||
4127 | ** InteriorBlocks, and parentWriter, if present. | ||
4128 | */ | ||
4129 | static int interiorWriterDestroy(InteriorWriter *pWriter){ | ||
4130 | InteriorBlock *block = pWriter->first; | ||
4131 | |||
4132 | while( block!=NULL ){ | ||
4133 | InteriorBlock *b = block; | ||
4134 | block = block->next; | ||
4135 | dataBufferDestroy(&b->term); | ||
4136 | dataBufferDestroy(&b->data); | ||
4137 | free(b); | ||
4138 | } | ||
4139 | if( pWriter->parentWriter!=NULL ){ | ||
4140 | interiorWriterDestroy(pWriter->parentWriter); | ||
4141 | free(pWriter->parentWriter); | ||
4142 | } | ||
4143 | dataBufferDestroy(&pWriter->term); | ||
4144 | SCRAMBLE(pWriter); | ||
4145 | return SQLITE_OK; | ||
4146 | } | ||
4147 | |||
4148 | /* If pWriter can fit entirely in ROOT_MAX, return it as the root info | ||
4149 | ** directly, leaving *piEndBlockid unchanged. Otherwise, flush | ||
4150 | ** pWriter to %_segments, building a new layer of interior nodes, and | ||
4151 | ** recursively ask for their root into. | ||
4152 | */ | ||
4153 | static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter, | ||
4154 | char **ppRootInfo, int *pnRootInfo, | ||
4155 | sqlite_int64 *piEndBlockid){ | ||
4156 | InteriorBlock *block = pWriter->first; | ||
4157 | sqlite_int64 iBlockid = 0; | ||
4158 | int rc; | ||
4159 | |||
4160 | /* If we can fit the segment inline */ | ||
4161 | if( block==pWriter->last && block->data.nData<ROOT_MAX ){ | ||
4162 | *ppRootInfo = block->data.pData; | ||
4163 | *pnRootInfo = block->data.nData; | ||
4164 | return SQLITE_OK; | ||
4165 | } | ||
4166 | |||
4167 | /* Flush the first block to %_segments, and create a new level of | ||
4168 | ** interior node. | ||
4169 | */ | ||
4170 | ASSERT_VALID_INTERIOR_BLOCK(block); | ||
4171 | rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid); | ||
4172 | if( rc!=SQLITE_OK ) return rc; | ||
4173 | *piEndBlockid = iBlockid; | ||
4174 | |||
4175 | pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter)); | ||
4176 | interiorWriterInit(pWriter->iHeight+1, | ||
4177 | block->term.pData, block->term.nData, | ||
4178 | iBlockid, pWriter->parentWriter); | ||
4179 | |||
4180 | /* Flush additional blocks and append to the higher interior | ||
4181 | ** node. | ||
4182 | */ | ||
4183 | for(block=block->next; block!=NULL; block=block->next){ | ||
4184 | ASSERT_VALID_INTERIOR_BLOCK(block); | ||
4185 | rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid); | ||
4186 | if( rc!=SQLITE_OK ) return rc; | ||
4187 | *piEndBlockid = iBlockid; | ||
4188 | |||
4189 | interiorWriterAppend(pWriter->parentWriter, | ||
4190 | block->term.pData, block->term.nData, iBlockid); | ||
4191 | } | ||
4192 | |||
4193 | /* Parent node gets the chance to be the root. */ | ||
4194 | return interiorWriterRootInfo(v, pWriter->parentWriter, | ||
4195 | ppRootInfo, pnRootInfo, piEndBlockid); | ||
4196 | } | ||
4197 | |||
/****************************************************************/
/* InteriorReader is used to read off the data from an interior node
** (see comment at top of file for the format).
*/
typedef struct InteriorReader {
  const char *pData;      /* Unread remainder of the node's data. */
  int nData;              /* Bytes remaining at pData. */

  DataBuffer term;        /* previous term, for decoding term delta. */

  sqlite_int64 iBlockid;  /* Blockid of the current child. */
} InteriorReader;

/* Release the reader's term buffer and poison the struct. */
static void interiorReaderDestroy(InteriorReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}
4215 | |||
4216 | /* TODO(shess) The assertions are great, but what if we're in NDEBUG | ||
4217 | ** and the blob is empty or otherwise contains suspect data? | ||
4218 | */ | ||
4219 | static void interiorReaderInit(const char *pData, int nData, | ||
4220 | InteriorReader *pReader){ | ||
4221 | int n, nTerm; | ||
4222 | |||
4223 | /* Require at least the leading flag byte */ | ||
4224 | assert( nData>0 ); | ||
4225 | assert( pData[0]!='\0' ); | ||
4226 | |||
4227 | CLEAR(pReader); | ||
4228 | |||
4229 | /* Decode the base blockid, and set the cursor to the first term. */ | ||
4230 | n = getVarint(pData+1, &pReader->iBlockid); | ||
4231 | assert( 1+n<=nData ); | ||
4232 | pReader->pData = pData+1+n; | ||
4233 | pReader->nData = nData-(1+n); | ||
4234 | |||
4235 | /* A single-child interior node (such as when a leaf node was too | ||
4236 | ** large for the segment directory) won't have any terms. | ||
4237 | ** Otherwise, decode the first term. | ||
4238 | */ | ||
4239 | if( pReader->nData==0 ){ | ||
4240 | dataBufferInit(&pReader->term, 0); | ||
4241 | }else{ | ||
4242 | n = getVarint32(pReader->pData, &nTerm); | ||
4243 | dataBufferInit(&pReader->term, nTerm); | ||
4244 | dataBufferReplace(&pReader->term, pReader->pData+n, nTerm); | ||
4245 | assert( n+nTerm<=pReader->nData ); | ||
4246 | pReader->pData += n+nTerm; | ||
4247 | pReader->nData -= n+nTerm; | ||
4248 | } | ||
4249 | } | ||
4250 | |||
/* True once the reader has consumed all terms; the empty term buffer
** is the eof marker (set by interiorReaderStep()).
*/
static int interiorReaderAtEnd(InteriorReader *pReader){
  return pReader->term.nData==0;
}

/* Blockid of the child the cursor currently references. */
static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
  return pReader->iBlockid;
}

/* Length of the current term; only valid before eof. */
static int interiorReaderTermBytes(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.nData;
}
/* Bytes of the current term; only valid before eof. */
static const char *interiorReaderTerm(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.pData;
}
4267 | |||
4268 | /* Step forward to the next term in the node. */ | ||
4269 | static void interiorReaderStep(InteriorReader *pReader){ | ||
4270 | assert( !interiorReaderAtEnd(pReader) ); | ||
4271 | |||
4272 | /* If the last term has been read, signal eof, else construct the | ||
4273 | ** next term. | ||
4274 | */ | ||
4275 | if( pReader->nData==0 ){ | ||
4276 | dataBufferReset(&pReader->term); | ||
4277 | }else{ | ||
4278 | int n, nPrefix, nSuffix; | ||
4279 | |||
4280 | n = getVarint32(pReader->pData, &nPrefix); | ||
4281 | n += getVarint32(pReader->pData+n, &nSuffix); | ||
4282 | |||
4283 | /* Truncate the current term and append suffix data. */ | ||
4284 | pReader->term.nData = nPrefix; | ||
4285 | dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix); | ||
4286 | |||
4287 | assert( n+nSuffix<=pReader->nData ); | ||
4288 | pReader->pData += n+nSuffix; | ||
4289 | pReader->nData -= n+nSuffix; | ||
4290 | } | ||
4291 | pReader->iBlockid++; | ||
4292 | } | ||
4293 | |||
4294 | /* Compare the current term to pTerm[nTerm], returning strcmp-style | ||
4295 | ** results. If isPrefix, equality means equal through nTerm bytes. | ||
4296 | */ | ||
4297 | static int interiorReaderTermCmp(InteriorReader *pReader, | ||
4298 | const char *pTerm, int nTerm, int isPrefix){ | ||
4299 | const char *pReaderTerm = interiorReaderTerm(pReader); | ||
4300 | int nReaderTerm = interiorReaderTermBytes(pReader); | ||
4301 | int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm; | ||
4302 | |||
4303 | if( n==0 ){ | ||
4304 | if( nReaderTerm>0 ) return -1; | ||
4305 | if( nTerm>0 ) return 1; | ||
4306 | return 0; | ||
4307 | } | ||
4308 | |||
4309 | c = memcmp(pReaderTerm, pTerm, n); | ||
4310 | if( c!=0 ) return c; | ||
4311 | if( isPrefix && n==nTerm ) return 0; | ||
4312 | return nReaderTerm - nTerm; | ||
4313 | } | ||
4314 | |||
/****************************************************************/
/* LeafWriter is used to collect terms and associated doclist data
** into leaf blocks in %_segments (see top of file for format info).
** Expected usage is:
**
** LeafWriter writer;
** leafWriterInit(0, 0, &writer);
** while( sorted_terms_left_to_process ){
**   // data is doclist data for that term.
**   rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
**   if( rc!=SQLITE_OK ) goto err;
** }
** rc = leafWriterFinalize(v, &writer);
**err:
** leafWriterDestroy(&writer);
** return rc;
**
** leafWriterStep() may write a collected leaf out to %_segments.
** leafWriterFinalize() finishes writing any buffered data and stores
** a root node in %_segdir. leafWriterDestroy() frees all buffers and
** InteriorWriters allocated as part of writing this segment.
**
** TODO(shess) Document leafWriterStepMerge().
*/

/* Put terms with data this big in their own block. */
#define STANDALONE_MIN 1024

/* Keep leaf blocks below this size. */
#define LEAF_MAX 2048

typedef struct LeafWriter {
  int iLevel;                 /* Segment level in %_segdir. */
  int idx;                    /* Segment index within iLevel. */
  sqlite_int64 iStartBlockid; /* needed to create the root info */
  sqlite_int64 iEndBlockid;   /* when we're done writing. */

  DataBuffer term;            /* previous encoded term */
  DataBuffer data;            /* encoding buffer */

  /* bytes of first term in the current node which distinguishes that
  ** term from the last term of the previous node.
  */
  int nTermDistinct;

  InteriorWriter parentWriter; /* if we overflow */
  int has_parent;              /* true once parentWriter is initialized */
} LeafWriter;
4363 | |||
4364 | static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){ | ||
4365 | CLEAR(pWriter); | ||
4366 | pWriter->iLevel = iLevel; | ||
4367 | pWriter->idx = idx; | ||
4368 | |||
4369 | dataBufferInit(&pWriter->term, 32); | ||
4370 | |||
4371 | /* Start out with a reasonably sized block, though it can grow. */ | ||
4372 | dataBufferInit(&pWriter->data, LEAF_MAX); | ||
4373 | } | ||
4374 | |||
#ifndef NDEBUG
/* Verify that the data is readable as a leaf node: a varint(0) flag,
** a plain leading term with its doclist, then zero or more
** delta-encoded term/doclist pairs. Debug builds call this via
** ASSERT_VALID_LEAF_NODE().
*/
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = getVarint32(pData, &iDummy);
  assert( iDummy==0 );
  assert( n>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Leading term length and data must fit in buffer. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    /* Shared-prefix length for the delta-encoded term. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;
    /* Suffix length and bytes. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Doclist length and data. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif
4442 | |||
/* Flush the current leaf node to %_segments, adding the resulting
** blockid and the starting term to the interior node which will
** contain it. iData/nData select the slice of pWriter->data to
** write.
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  /* The interior node only needs the distinguishing prefix. */
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    /* First leaf flushed for this segment: create the interior layer. */
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes. */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    pWriter->iEndBlockid++;
    /* Leaves are expected to receive consecutive blockids. */
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
4494 | static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){ | ||
4495 | int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData); | ||
4496 | if( rc!=SQLITE_OK ) return rc; | ||
4497 | |||
4498 | /* Re-initialize the output buffer. */ | ||
4499 | dataBufferReset(&pWriter->data); | ||
4500 | |||
4501 | return SQLITE_OK; | ||
4502 | } | ||
4503 | |||
/* Fetch the root info for the segment. If the entire leaf fits
** within ROOT_MAX, then it will be returned directly, otherwise it
** will be flushed and the root info will be returned from the
** interior node. *piEndBlockid is set to the blockid of the last
** interior or leaf node written to disk (0 if none are written at
** all).
*/
static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                              char **ppRootInfo, int *pnRootInfo,
                              sqlite_int64 *piEndBlockid){
  /* we can fit the segment entirely inline */
  if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
    *ppRootInfo = pWriter->data.pData;
    *pnRootInfo = pWriter->data.nData;
    *piEndBlockid = 0;
    return SQLITE_OK;
  }

  /* Flush remaining leaf data. */
  if( pWriter->data.nData>0 ){
    int rc = leafWriterFlush(v, pWriter);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* We must have flushed a leaf at some point. */
  assert( pWriter->has_parent );

  /* Tentatively set the end leaf blockid as the end blockid. If the
  ** interior node can be returned inline, this will be the final
  ** blockid, otherwise it will be overwritten by
  ** interiorWriterRootInfo().
  */
  *piEndBlockid = pWriter->iEndBlockid;

  return interiorWriterRootInfo(v, &pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
4541 | |||
4542 | /* Collect the rootInfo data and store it into the segment directory. | ||
4543 | ** This has the effect of flushing the segment's leaf data to | ||
4544 | ** %_segments, and also flushing any interior nodes to %_segments. | ||
4545 | */ | ||
4546 | static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){ | ||
4547 | sqlite_int64 iEndBlockid; | ||
4548 | char *pRootInfo; | ||
4549 | int rc, nRootInfo; | ||
4550 | |||
4551 | rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid); | ||
4552 | if( rc!=SQLITE_OK ) return rc; | ||
4553 | |||
4554 | /* Don't bother storing an entirely empty segment. */ | ||
4555 | if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK; | ||
4556 | |||
4557 | return segdir_set(v, pWriter->iLevel, pWriter->idx, | ||
4558 | pWriter->iStartBlockid, pWriter->iEndBlockid, | ||
4559 | iEndBlockid, pRootInfo, nRootInfo); | ||
4560 | } | ||
4561 | |||
/* Release all buffers owned by pWriter, including its interior
** writer if one was created by a flush. Writes nothing to disk.
*/
static void leafWriterDestroy(LeafWriter *pWriter){
  if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
  dataBufferDestroy(&pWriter->term);
  dataBufferDestroy(&pWriter->data);
}
4567 | |||
/* Encode a term into the leafWriter, delta-encoding as appropriate.
** Returns the length of the new term which distinguishes it from the
** previous term, which can be used to set nTermDistinct when a node
** boundary is crossed.
*/
static int leafWriterEncodeTerm(LeafWriter *pWriter,
                                const char *pTerm, int nTerm){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  assert( nTerm>0 );
  while( nPrefix<pWriter->term.nData &&
         pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
    nPrefix++;
    /* Failing this implies that the terms weren't in order. */
    assert( nPrefix<nTerm );
  }

  if( pWriter->data.nData==0 ){
    /* Encode the node header and leading term as:
    **  varint(0)
    **  varint(nTerm)
    **  char pTerm[nTerm]
    */
    n = putVarint(c, '\0');
    n += putVarint(c+n, nTerm);
    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  }else{
    /* Delta-encode the term as:
    **  varint(nPrefix)
    **  varint(nSuffix)
    **  char pTermSuffix[nSuffix]
    */
    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
    dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  }
  dataBufferReplace(&pWriter->term, pTerm, nTerm);

  /* nPrefix bytes match the previous term, so nPrefix+1 bytes are
  ** needed to distinguish this term from it.
  */
  return nPrefix+1;
}
4609 | |||
4610 | /* Used to avoid a memmove when a large amount of doclist data is in | ||
4611 | ** the buffer. This constructs a node and term header before | ||
4612 | ** iDoclistData and flushes the resulting complete node using | ||
4613 | ** leafWriterInternalFlush(). | ||
4614 | */ | ||
4615 | static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter, | ||
4616 | const char *pTerm, int nTerm, | ||
4617 | int iDoclistData){ | ||
4618 | char c[VARINT_MAX+VARINT_MAX]; | ||
4619 | int iData, n = putVarint(c, 0); | ||
4620 | n += putVarint(c+n, nTerm); | ||
4621 | |||
4622 | /* There should always be room for the header. Even if pTerm shared | ||
4623 | ** a substantial prefix with the previous term, the entire prefix | ||
4624 | ** could be constructed from earlier data in the doclist, so there | ||
4625 | ** should be room. | ||
4626 | */ | ||
4627 | assert( iDoclistData>=n+nTerm ); | ||
4628 | |||
4629 | iData = iDoclistData-(n+nTerm); | ||
4630 | memcpy(pWriter->data.pData+iData, c, n); | ||
4631 | memcpy(pWriter->data.pData+iData+n, pTerm, nTerm); | ||
4632 | |||
4633 | return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData); | ||
4634 | } | ||
4635 | |||
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments. The doclists of the nReaders readers are merged into
** pWriter's buffer; the buffer is flushed as one or more leaf nodes
** when the term's doclist is standalone-sized or the node overflows
** LEAF_MAX.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = putVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded. Additionally, the space required to
  ** encode this length could be smaller. For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = putVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough to force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths. At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths. At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
4760 | |||
4761 | /* Push pTerm[nTerm] along with the doclist data to the leaf layer of | ||
4762 | ** %_segments. | ||
4763 | */ | ||
4764 | /* TODO(shess) Revise writeZeroSegment() so that doclists are | ||
4765 | ** constructed directly in pWriter->data. | ||
4766 | */ | ||
4767 | static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter, | ||
4768 | const char *pTerm, int nTerm, | ||
4769 | const char *pData, int nData){ | ||
4770 | int rc; | ||
4771 | DLReader reader; | ||
4772 | |||
4773 | dlrInit(&reader, DL_DEFAULT, pData, nData); | ||
4774 | rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1); | ||
4775 | dlrDestroy(&reader); | ||
4776 | |||
4777 | return rc; | ||
4778 | } | ||
4779 | |||
4780 | |||
4781 | /****************************************************************/ | ||
4782 | /* LeafReader is used to iterate over an individual leaf node. */ | ||
/* LeafReader iterates over the (term, doclist) entries of a single
** leaf node, in term order.  The reader is at end once nData has been
** fully consumed.
*/
typedef struct LeafReader {
  DataBuffer term;          /* copy of current term. */

  const char *pData;        /* data for current term. */
  int nData;                /* bytes of leaf data remaining (<=0 at end). */
} LeafReader;
4789 | |||
/* Release the reader's term buffer and poison the structure so any
** use-after-destroy is likely to be noticed in debug builds.
*/
static void leafReaderDestroy(LeafReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}
4794 | |||
4795 | static int leafReaderAtEnd(LeafReader *pReader){ | ||
4796 | return pReader->nData<=0; | ||
4797 | } | ||
4798 | |||
4799 | /* Access the current term. */ | ||
4800 | static int leafReaderTermBytes(LeafReader *pReader){ | ||
4801 | return pReader->term.nData; | ||
4802 | } | ||
4803 | static const char *leafReaderTerm(LeafReader *pReader){ | ||
4804 | assert( pReader->term.nData>0 ); | ||
4805 | return pReader->term.pData; | ||
4806 | } | ||
4807 | |||
4808 | /* Access the doclist data for the current term. */ | ||
4809 | static int leafReaderDataBytes(LeafReader *pReader){ | ||
4810 | int nData; | ||
4811 | assert( pReader->term.nData>0 ); | ||
4812 | getVarint32(pReader->pData, &nData); | ||
4813 | return nData; | ||
4814 | } | ||
4815 | static const char *leafReaderData(LeafReader *pReader){ | ||
4816 | int n, nData; | ||
4817 | assert( pReader->term.nData>0 ); | ||
4818 | n = getVarint32(pReader->pData, &nData); | ||
4819 | return pReader->pData+n; | ||
4820 | } | ||
4821 | |||
/* Initialize pReader over leaf node pData[nData].  A leaf node begins
** with a '\0' marker byte, followed by the first term encoded in full
** (length varint plus bytes); subsequent terms are prefix-compressed
** against their predecessor (see leafReaderStep()).  On return the
** reader is positioned at the first term's doclist.
*/
static void leafReaderInit(const char *pData, int nData,
                           LeafReader *pReader){
  int nTerm, n;

  assert( nData>0 );
  assert( pData[0]=='\0' );

  CLEAR(pReader);

  /* Read the first term, skipping the header byte. */
  n = getVarint32(pData+1, &nTerm);
  dataBufferInit(&pReader->term, nTerm);
  dataBufferReplace(&pReader->term, pData+1+n, nTerm);

  /* Position after the first term. */
  assert( 1+n+nTerm<nData );
  pReader->pData = pData+1+n+nTerm;
  pReader->nData = nData-1-n-nTerm;
}
4841 | |||
/* Step the reader forward to the next term.
**
** Skips the current term's doclist (length varint at pData), then, if
** data remains, rebuilds the next term from <nPrefix> bytes retained
** from the previous term plus <nSuffix> bytes stored inline.
*/
static void leafReaderStep(LeafReader *pReader){
  int n, nData, nPrefix, nSuffix;
  assert( !leafReaderAtEnd(pReader) );

  /* Skip previous entry's data block. */
  n = getVarint32(pReader->pData, &nData);
  assert( n+nData<=pReader->nData );
  pReader->pData += n+nData;
  pReader->nData -= n+nData;

  if( !leafReaderAtEnd(pReader) ){
    /* Construct the new term using a prefix from the old term plus a
    ** suffix from the leaf data.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);
    assert( n+nSuffix<pReader->nData );
    /* Truncate the buffered term to the shared prefix, then append
    ** the suffix bytes after it.
    */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
}
4867 | |||
4868 | /* strcmp-style comparison of pReader's current term against pTerm. | ||
4869 | ** If isPrefix, equality means equal through nTerm bytes. | ||
4870 | */ | ||
4871 | static int leafReaderTermCmp(LeafReader *pReader, | ||
4872 | const char *pTerm, int nTerm, int isPrefix){ | ||
4873 | int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm; | ||
4874 | if( n==0 ){ | ||
4875 | if( pReader->term.nData>0 ) return -1; | ||
4876 | if(nTerm>0 ) return 1; | ||
4877 | return 0; | ||
4878 | } | ||
4879 | |||
4880 | c = memcmp(pReader->term.pData, pTerm, n); | ||
4881 | if( c!=0 ) return c; | ||
4882 | if( isPrefix && n==nTerm ) return 0; | ||
4883 | return pReader->term.nData - nTerm; | ||
4884 | } | ||
4885 | |||
4886 | |||
4887 | /****************************************************************/ | ||
4888 | /* LeavesReader wraps LeafReader to allow iterating over the entire | ||
4889 | ** leaf layer of the tree. | ||
4890 | */ | ||
/* LeavesReader wraps LeafReader to iterate the entire leaf layer of a
** segment.  Leaves come either from rootData (segment inlined in the
** segdir row; pStmt stays NULL) or streamed row-by-row from pStmt.
*/
typedef struct LeavesReader {
  int idx;                  /* Index within the segment. */

  sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from. */
  int eof;                  /* we've seen SQLITE_DONE from pStmt. */

  LeafReader leafReader;    /* reader for the current leaf. */
  DataBuffer rootData;      /* root data for inline. */
} LeavesReader;
4900 | |||
4901 | /* Access the current term. */ | ||
4902 | static int leavesReaderTermBytes(LeavesReader *pReader){ | ||
4903 | assert( !pReader->eof ); | ||
4904 | return leafReaderTermBytes(&pReader->leafReader); | ||
4905 | } | ||
4906 | static const char *leavesReaderTerm(LeavesReader *pReader){ | ||
4907 | assert( !pReader->eof ); | ||
4908 | return leafReaderTerm(&pReader->leafReader); | ||
4909 | } | ||
4910 | |||
4911 | /* Access the doclist data for the current term. */ | ||
4912 | static int leavesReaderDataBytes(LeavesReader *pReader){ | ||
4913 | assert( !pReader->eof ); | ||
4914 | return leafReaderDataBytes(&pReader->leafReader); | ||
4915 | } | ||
4916 | static const char *leavesReaderData(LeavesReader *pReader){ | ||
4917 | assert( !pReader->eof ); | ||
4918 | return leafReaderData(&pReader->leafReader); | ||
4919 | } | ||
4920 | |||
4921 | static int leavesReaderAtEnd(LeavesReader *pReader){ | ||
4922 | return pReader->eof; | ||
4923 | } | ||
4924 | |||
/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
** leaving the statement handle open, which locks the table.  Reset it
** so the lock is released.
*/
/* TODO(shess) This "solution" is not satisfactory.  Really, there
** should be a check-in function for all statement handles which
** arranges to call sqlite3_reset().  This most likely will require
** modification to control flow all over the place, though, so for now
** just punt.
**
** Note that the current system assumes that segment merges will run
** to completion, which is why this particular problem hasn't arisen
** in this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}
4941 | |||
/* Release the embedded LeafReader and root-data buffer, then poison
** the structure.  Note: does not reset/finalize pStmt; callers use
** leavesReaderReset() for that.
*/
static void leavesReaderDestroy(LeavesReader *pReader){
  leafReaderDestroy(&pReader->leafReader);
  dataBufferDestroy(&pReader->rootData);
  SCRAMBLE(pReader);
}
4947 | |||
/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
**
** Returns SQLITE_OK with pReader->eof set when the blockid range
** yields no rows; callers must check leavesReaderAtEnd().
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    /* Position on the first leaf block of the segment. */
    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
4992 | |||
/* Step the current leaf forward to the next term.  If we reach the
** end of the current leaf, step forward to the next leaf block.
** Returns SQLITE_OK at eof (with pReader->eof set) or on success;
** otherwise the sqlite3_step() error code.
*/
static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
  assert( !leavesReaderAtEnd(pReader) );
  leafReaderStep(&pReader->leafReader);

  if( leafReaderAtEnd(&pReader->leafReader) ){
    int rc;
    /* Inline root data has exactly one leaf, so we are done. */
    if( pReader->rootData.pData ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_step(pReader->pStmt);
    if( rc!=SQLITE_ROW ){
      pReader->eof = 1;
      return rc==SQLITE_DONE ? SQLITE_OK : rc;
    }
    /* Re-seat the leaf reader on the freshly fetched block. */
    leafReaderDestroy(&pReader->leafReader);
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
5018 | |||
5019 | /* Order LeavesReaders by their term, ignoring idx. Readers at eof | ||
5020 | ** always sort to the end. | ||
5021 | */ | ||
5022 | static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){ | ||
5023 | if( leavesReaderAtEnd(lr1) ){ | ||
5024 | if( leavesReaderAtEnd(lr2) ) return 0; | ||
5025 | return 1; | ||
5026 | } | ||
5027 | if( leavesReaderAtEnd(lr2) ) return -1; | ||
5028 | |||
5029 | return leafReaderTermCmp(&lr1->leafReader, | ||
5030 | leavesReaderTerm(lr2), leavesReaderTermBytes(lr2), | ||
5031 | 0); | ||
5032 | } | ||
5033 | |||
5034 | /* Similar to leavesReaderTermCmp(), with additional ordering by idx | ||
5035 | ** so that older segments sort before newer segments. | ||
5036 | */ | ||
5037 | static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){ | ||
5038 | int c = leavesReaderTermCmp(lr1, lr2); | ||
5039 | if( c!=0 ) return c; | ||
5040 | return lr1->idx-lr2->idx; | ||
5041 | } | ||
5042 | |||
5043 | /* Assume that pLr[1]..pLr[nLr] are sorted. Bubble pLr[0] into its | ||
5044 | ** sorted position. | ||
5045 | */ | ||
5046 | static void leavesReaderReorder(LeavesReader *pLr, int nLr){ | ||
5047 | while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){ | ||
5048 | LeavesReader tmp = pLr[0]; | ||
5049 | pLr[0] = pLr[1]; | ||
5050 | pLr[1] = tmp; | ||
5051 | nLr--; | ||
5052 | pLr++; | ||
5053 | } | ||
5054 | } | ||
5055 | |||
/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order (by term, then by segment age via idx).
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  /* One reader per segdir row at this level. */
  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  /* On any error, unwind the readers created so far. */
  if( rc!=SQLITE_DONE ){
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age. */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}
5098 | |||
5099 | /* Merge doclists from pReaders[nReaders] into a single doclist, which | ||
5100 | ** is written to pWriter. Assumes pReaders is ordered oldest to | ||
5101 | ** newest. | ||
5102 | */ | ||
5103 | /* TODO(shess) Consider putting this inline in segmentMerge(). */ | ||
5104 | static int leavesReadersMerge(fulltext_vtab *v, | ||
5105 | LeavesReader *pReaders, int nReaders, | ||
5106 | LeafWriter *pWriter){ | ||
5107 | DLReader dlReaders[MERGE_COUNT]; | ||
5108 | const char *pTerm = leavesReaderTerm(pReaders); | ||
5109 | int i, nTerm = leavesReaderTermBytes(pReaders); | ||
5110 | |||
5111 | assert( nReaders<=MERGE_COUNT ); | ||
5112 | |||
5113 | for(i=0; i<nReaders; i++){ | ||
5114 | dlrInit(&dlReaders[i], DL_DEFAULT, | ||
5115 | leavesReaderData(pReaders+i), | ||
5116 | leavesReaderDataBytes(pReaders+i)); | ||
5117 | } | ||
5118 | |||
5119 | return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders); | ||
5120 | } | ||
5121 | |||
5122 | /* Forward ref due to mutual recursion with segdirNextIndex(). */ | ||
5123 | static int segmentMerge(fulltext_vtab *v, int iLevel); | ||
5124 | |||
5125 | /* Put the next available index at iLevel into *pidx. If iLevel | ||
5126 | ** already has MERGE_COUNT segments, they are merged to a higher | ||
5127 | ** level to make room. | ||
5128 | */ | ||
5129 | static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){ | ||
5130 | int rc = segdir_max_index(v, iLevel, pidx); | ||
5131 | if( rc==SQLITE_DONE ){ /* No segments at iLevel. */ | ||
5132 | *pidx = 0; | ||
5133 | }else if( rc==SQLITE_ROW ){ | ||
5134 | if( *pidx==(MERGE_COUNT-1) ){ | ||
5135 | rc = segmentMerge(v, iLevel); | ||
5136 | if( rc!=SQLITE_OK ) return rc; | ||
5137 | *pidx = 0; | ||
5138 | }else{ | ||
5139 | (*pidx)++; | ||
5140 | } | ||
5141 | }else{ | ||
5142 | return rc; | ||
5143 | } | ||
5144 | return SQLITE_OK; | ||
5145 | } | ||
5146 | |||
/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged first to make room (via segdirNextIndex(), which recurses
** back into segmentMerge()).
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term. */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    /* Emit the merged doclist for that term into the new segment. */
    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged. */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

err:
  /* Destroying all MERGE_COUNT readers is safe because lrs was
  ** zeroed above, so never-initialized entries are inert.
  */
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}
5214 | |||
/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  assert( nTerm>0 );

  /* Process while the prefix matches. */
  while( !leavesReaderAtEnd(pReader) ){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int rc;
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int nData = leavesReaderDataBytes(pReader);
      if( out->nData==0 ){
        /* First match: just copy the doclist. */
        dataBufferReplace(out, pData, nData);
      }else{
        /* Union with the accumulated doclist; later entries win. */
        DataBuffer result;
        dataBufferInit(&result, out->nData+nData);
        docListUnion(out->pData, out->nData, pData, nData, &result);
        dataBufferDestroy(out);
        *out = result;
        /* TODO(shess) Rather than destroy out, we could retain it for
        ** later reuse.
        */
      }
    }
    if( c>0 ) break;      /* Past any possible matches. */

    rc = leavesReaderStep(v, pReader);
    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}
5256 | |||
5257 | /* Call loadSegmentLeavesInt() with pData/nData as input. */ | ||
5258 | static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData, | ||
5259 | const char *pTerm, int nTerm, int isPrefix, | ||
5260 | DataBuffer *out){ | ||
5261 | LeavesReader reader; | ||
5262 | int rc; | ||
5263 | |||
5264 | assert( nData>1 ); | ||
5265 | assert( *pData=='\0' ); | ||
5266 | rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader); | ||
5267 | if( rc!=SQLITE_OK ) return rc; | ||
5268 | |||
5269 | rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); | ||
5270 | leavesReaderReset(&reader); | ||
5271 | leavesReaderDestroy(&reader); | ||
5272 | return rc; | ||
5273 | } | ||
5274 | |||
5275 | /* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to | ||
5276 | ** iEndLeaf (inclusive) as input, and merge the resulting doclist into | ||
5277 | ** out. | ||
5278 | */ | ||
5279 | static int loadSegmentLeaves(fulltext_vtab *v, | ||
5280 | sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf, | ||
5281 | const char *pTerm, int nTerm, int isPrefix, | ||
5282 | DataBuffer *out){ | ||
5283 | int rc; | ||
5284 | LeavesReader reader; | ||
5285 | |||
5286 | assert( iStartLeaf<=iEndLeaf ); | ||
5287 | rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader); | ||
5288 | if( rc!=SQLITE_OK ) return rc; | ||
5289 | |||
5290 | rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); | ||
5291 | leavesReaderReset(&reader); | ||
5292 | leavesReaderDestroy(&reader); | ||
5293 | return rc; | ||
5294 | } | ||
5295 | |||
/* Taking pData/nData as an interior node, find the sequence of child
** nodes which could include pTerm/nTerm/isPrefix.  Note that the
** interior node terms logically come between the blocks, so there is
** one more blockid than there are terms (that block contains terms >=
** the last interior-node term).
*/
/* TODO(shess) The calling code may already know that the end child is
** not worth calculating, because the end may be in a later sibling
** node.  Consider whether breaking symmetry is worthwhile.  I suspect
** it's not worthwhile.
*/
static void getChildrenContaining(const char *pData, int nData,
                                  const char *pTerm, int nTerm, int isPrefix,
                                  sqlite_int64 *piStartChild,
                                  sqlite_int64 *piEndChild){
  InteriorReader reader;

  assert( nData>1 );
  assert( *pData!='\0' );   /* Interior nodes never start with '\0'. */
  interiorReaderInit(pData, nData, &reader);

  /* Scan for the first child which could contain pTerm/nTerm. */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piStartChild = interiorReaderCurrentBlockid(&reader);

  /* Keep scanning to find a term greater than our term, using prefix
  ** comparison if indicated.  If isPrefix is false, this will be the
  ** same blockid as the starting block.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piEndChild = interiorReaderCurrentBlockid(&reader);

  interiorReaderDestroy(&reader);

  /* Children must ascend, and if !prefix, both must be the same. */
  assert( *piEndChild>=*piStartChild );
  assert( isPrefix || *piStartChild==*piEndChild );
}
5340 | |||
/* Read the interior node stored at iBlockid and pass it with the
** other params to getChildrenContaining().  Returns SQLITE_ERROR if
** the block is missing or the blockid unexpectedly matches more than
** one row.
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
  if( rc!=SQLITE_ROW ) return rc;

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain
   * locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}
5381 | |||
/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf. */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    ** Blockids greater than iLeavesEnd belong to interior nodes.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too. */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}
5428 | |||
/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge its doclist over *out (any duplicate doclists read from the
** segment rooted at pData will overwrite those in *out).
*/
/* TODO(shess) Consider changing this to determine the depth of the
** leaves using either the first characters of interior nodes (when
** ==1, we're one level above the leaves), or the first character of
** the root (which will describe the height of the tree directly).
** Either feels somewhat tricky to me.
*/
/* TODO(shess) The current merge is likely to be slow for large
** doclists (though it should process from newest/smallest to
** oldest/largest, so it may not be that bad).  It might be useful to
** modify things to allow for N-way merging.  This could either be
** within a segment, with pairwise merges across segments, or across
** all segments at once.
*/
static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
                       DataBuffer *out){
  DataBuffer result;
  int rc;

  assert( nData>1 );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&result, 0);
  rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                      pTerm, nTerm, isPrefix, &result);
  if( rc==SQLITE_OK && result.nData>0 ){
    if( out->nData==0 ){
      /* Nothing accumulated yet; swap buffers instead of copying. */
      DataBuffer tmp = *out;
      *out = result;
      result = tmp;
    }else{
      /* Merge this segment's doclist over the accumulated one;
      ** readers[1] (this segment, newer) takes precedence.
      */
      DataBuffer merged;
      DLReader readers[2];

      dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
      dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
      dataBufferInit(&merged, out->nData+result.nData);
      docListMerge(&merged, readers, 2);
      dataBufferDestroy(out);
      *out = merged;
      dlrDestroy(&readers[0]);
      dlrDestroy(&readers[1]);
    }
  }
  dataBufferDestroy(&result);
  return rc;
}
5483 | |||
/* Scan the database and merge together the posting lists for the term
** into *out, restricted to iColumn (or all columns when iColumn equals
** v->nColumn) and projected to iType.
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out){
  DataBuffer doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&doclist, 0);

  /* Traverse the segments from oldest to newest so that newer doclist
  ** elements for given docids overwrite older elements.
  */
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    const char *pData = sqlite3_column_blob(s, 0);
    const int nData = sqlite3_column_bytes(s, 0);
    const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
    rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                     &doclist);
    if( rc!=SQLITE_OK ) goto err;
  }
  if( rc==SQLITE_DONE ){
    if( doclist.nData!=0 ){
      /* TODO(shess) The old term_select_all() code applied the column
      ** restrict as we merged segments, leading to smaller buffers.
      ** This is probably worthwhile to bring back, once the new storage
      ** system is checked in.
      */
      if( iColumn==v->nColumn) iColumn = -1;
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  iColumn, iType, out);
    }
    rc = SQLITE_OK;
  }

err:
  dataBufferDestroy(&doclist);
  return rc;
}
5529 | |||
/****************************************************************/
/* Used to hold hashtable data for sorting. */
typedef struct TermData {
  const char *pTerm;        /* Term text, borrowed from the hash-table key */
  int nTerm;                /* Length of pTerm in bytes */
  DLCollector *pCollector;  /* Pending doclist accumulated for this term */
} TermData;
5537 | |||
5538 | /* Orders TermData elements in strcmp fashion ( <0 for less-than, 0 | ||
5539 | ** for equal, >0 for greater-than). | ||
5540 | */ | ||
5541 | static int termDataCmp(const void *av, const void *bv){ | ||
5542 | const TermData *a = (const TermData *)av; | ||
5543 | const TermData *b = (const TermData *)bv; | ||
5544 | int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm; | ||
5545 | int c = memcmp(a->pTerm, b->pTerm, n); | ||
5546 | if( c!=0 ) return c; | ||
5547 | return a->nTerm-b->nTerm; | ||
5548 | } | ||
5549 | |||
5550 | /* Order pTerms data by term, then write a new level 0 segment using | ||
5551 | ** LeafWriter. | ||
5552 | */ | ||
5553 | static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){ | ||
5554 | fts2HashElem *e; | ||
5555 | int idx, rc, i, n; | ||
5556 | TermData *pData; | ||
5557 | LeafWriter writer; | ||
5558 | DataBuffer dl; | ||
5559 | |||
5560 | /* Determine the next index at level 0, merging as necessary. */ | ||
5561 | rc = segdirNextIndex(v, 0, &idx); | ||
5562 | if( rc!=SQLITE_OK ) return rc; | ||
5563 | |||
5564 | n = fts2HashCount(pTerms); | ||
5565 | pData = malloc(n*sizeof(TermData)); | ||
5566 | |||
5567 | for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){ | ||
5568 | assert( i<n ); | ||
5569 | pData[i].pTerm = fts2HashKey(e); | ||
5570 | pData[i].nTerm = fts2HashKeysize(e); | ||
5571 | pData[i].pCollector = fts2HashData(e); | ||
5572 | } | ||
5573 | assert( i==n ); | ||
5574 | |||
5575 | /* TODO(shess) Should we allow user-defined collation sequences, | ||
5576 | ** here? I think we only need that once we support prefix searches. | ||
5577 | */ | ||
5578 | if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp); | ||
5579 | |||
5580 | /* TODO(shess) Refactor so that we can write directly to the segment | ||
5581 | ** DataBuffer, as happens for segment merges. | ||
5582 | */ | ||
5583 | leafWriterInit(0, idx, &writer); | ||
5584 | dataBufferInit(&dl, 0); | ||
5585 | for(i=0; i<n; i++){ | ||
5586 | dataBufferReset(&dl); | ||
5587 | dlcAddDoclist(pData[i].pCollector, &dl); | ||
5588 | rc = leafWriterStep(v, &writer, | ||
5589 | pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData); | ||
5590 | if( rc!=SQLITE_OK ) goto err; | ||
5591 | } | ||
5592 | rc = leafWriterFinalize(v, &writer); | ||
5593 | |||
5594 | err: | ||
5595 | dataBufferDestroy(&dl); | ||
5596 | free(pData); | ||
5597 | leafWriterDestroy(&writer); | ||
5598 | return rc; | ||
5599 | } | ||
5600 | |||
5601 | /* If pendingTerms has data, free it. */ | ||
5602 | static int clearPendingTerms(fulltext_vtab *v){ | ||
5603 | if( v->nPendingData>=0 ){ | ||
5604 | fts2HashElem *e; | ||
5605 | for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){ | ||
5606 | dlcDelete(fts2HashData(e)); | ||
5607 | } | ||
5608 | fts2HashClear(&v->pendingTerms); | ||
5609 | v->nPendingData = -1; | ||
5610 | } | ||
5611 | return SQLITE_OK; | ||
5612 | } | ||
5613 | |||
5614 | /* If pendingTerms has data, flush it to a level-zero segment, and | ||
5615 | ** free it. | ||
5616 | */ | ||
5617 | static int flushPendingTerms(fulltext_vtab *v){ | ||
5618 | if( v->nPendingData>=0 ){ | ||
5619 | int rc = writeZeroSegment(v, &v->pendingTerms); | ||
5620 | if( rc==SQLITE_OK ) clearPendingTerms(v); | ||
5621 | return rc; | ||
5622 | } | ||
5623 | return SQLITE_OK; | ||
5624 | } | ||
5625 | |||
5626 | /* If pendingTerms is "too big", or docid is out of order, flush it. | ||
5627 | ** Regardless, be certain that pendingTerms is initialized for use. | ||
5628 | */ | ||
5629 | static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){ | ||
5630 | /* TODO(shess) Explore whether partially flushing the buffer on | ||
5631 | ** forced-flush would provide better performance. I suspect that if | ||
5632 | ** we ordered the doclists by size and flushed the largest until the | ||
5633 | ** buffer was half empty, that would let the less frequent terms | ||
5634 | ** generate longer doclists. | ||
5635 | */ | ||
5636 | if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){ | ||
5637 | int rc = flushPendingTerms(v); | ||
5638 | if( rc!=SQLITE_OK ) return rc; | ||
5639 | } | ||
5640 | if( v->nPendingData<0 ){ | ||
5641 | fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1); | ||
5642 | v->nPendingData = 0; | ||
5643 | } | ||
5644 | v->iPrevDocid = iDocid; | ||
5645 | return SQLITE_OK; | ||
5646 | } | ||
5647 | |||
5648 | /* This function implements the xUpdate callback; it's the top-level entry | ||
5649 | * point for inserting, deleting or updating a row in a full-text table. */ | ||
5650 | static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, | ||
5651 | sqlite_int64 *pRowid){ | ||
5652 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5653 | int rc; | ||
5654 | |||
5655 | TRACE(("FTS2 Update %p\n", pVtab)); | ||
5656 | |||
5657 | if( nArg<2 ){ | ||
5658 | rc = index_delete(v, sqlite3_value_int64(ppArg[0])); | ||
5659 | } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ | ||
5660 | /* An update: | ||
5661 | * ppArg[0] = old rowid | ||
5662 | * ppArg[1] = new rowid | ||
5663 | * ppArg[2..2+v->nColumn-1] = values | ||
5664 | * ppArg[2+v->nColumn] = value for magic column (we ignore this) | ||
5665 | */ | ||
5666 | sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]); | ||
5667 | if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER || | ||
5668 | sqlite3_value_int64(ppArg[1]) != rowid ){ | ||
5669 | rc = SQLITE_ERROR; /* we don't allow changing the rowid */ | ||
5670 | } else { | ||
5671 | assert( nArg==2+v->nColumn+1); | ||
5672 | rc = index_update(v, rowid, &ppArg[2]); | ||
5673 | } | ||
5674 | } else { | ||
5675 | /* An insert: | ||
5676 | * ppArg[1] = requested rowid | ||
5677 | * ppArg[2..2+v->nColumn-1] = values | ||
5678 | * ppArg[2+v->nColumn] = value for magic column (we ignore this) | ||
5679 | */ | ||
5680 | assert( nArg==2+v->nColumn+1); | ||
5681 | rc = index_insert(v, ppArg[1], &ppArg[2], pRowid); | ||
5682 | } | ||
5683 | |||
5684 | return rc; | ||
5685 | } | ||
5686 | |||
5687 | static int fulltextSync(sqlite3_vtab *pVtab){ | ||
5688 | TRACE(("FTS2 xSync()\n")); | ||
5689 | return flushPendingTerms((fulltext_vtab *)pVtab); | ||
5690 | } | ||
5691 | |||
5692 | static int fulltextBegin(sqlite3_vtab *pVtab){ | ||
5693 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5694 | TRACE(("FTS2 xBegin()\n")); | ||
5695 | |||
5696 | /* Any buffered updates should have been cleared by the previous | ||
5697 | ** transaction. | ||
5698 | */ | ||
5699 | assert( v->nPendingData<0 ); | ||
5700 | return clearPendingTerms(v); | ||
5701 | } | ||
5702 | |||
5703 | static int fulltextCommit(sqlite3_vtab *pVtab){ | ||
5704 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5705 | TRACE(("FTS2 xCommit()\n")); | ||
5706 | |||
5707 | /* Buffered updates should have been cleared by fulltextSync(). */ | ||
5708 | assert( v->nPendingData<0 ); | ||
5709 | return clearPendingTerms(v); | ||
5710 | } | ||
5711 | |||
5712 | static int fulltextRollback(sqlite3_vtab *pVtab){ | ||
5713 | TRACE(("FTS2 xRollback()\n")); | ||
5714 | return clearPendingTerms((fulltext_vtab *)pVtab); | ||
5715 | } | ||
5716 | |||
5717 | /* | ||
5718 | ** Implementation of the snippet() function for FTS2 | ||
5719 | */ | ||
5720 | static void snippetFunc( | ||
5721 | sqlite3_context *pContext, | ||
5722 | int argc, | ||
5723 | sqlite3_value **argv | ||
5724 | ){ | ||
5725 | fulltext_cursor *pCursor; | ||
5726 | if( argc<1 ) return; | ||
5727 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
5728 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
5729 | sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1); | ||
5730 | }else{ | ||
5731 | const char *zStart = "<b>"; | ||
5732 | const char *zEnd = "</b>"; | ||
5733 | const char *zEllipsis = "<b>...</b>"; | ||
5734 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
5735 | if( argc>=2 ){ | ||
5736 | zStart = (const char*)sqlite3_value_text(argv[1]); | ||
5737 | if( argc>=3 ){ | ||
5738 | zEnd = (const char*)sqlite3_value_text(argv[2]); | ||
5739 | if( argc>=4 ){ | ||
5740 | zEllipsis = (const char*)sqlite3_value_text(argv[3]); | ||
5741 | } | ||
5742 | } | ||
5743 | } | ||
5744 | snippetAllOffsets(pCursor); | ||
5745 | snippetText(pCursor, zStart, zEnd, zEllipsis); | ||
5746 | sqlite3_result_text(pContext, pCursor->snippet.zSnippet, | ||
5747 | pCursor->snippet.nSnippet, SQLITE_STATIC); | ||
5748 | } | ||
5749 | } | ||
5750 | |||
5751 | /* | ||
5752 | ** Implementation of the offsets() function for FTS2 | ||
5753 | */ | ||
5754 | static void snippetOffsetsFunc( | ||
5755 | sqlite3_context *pContext, | ||
5756 | int argc, | ||
5757 | sqlite3_value **argv | ||
5758 | ){ | ||
5759 | fulltext_cursor *pCursor; | ||
5760 | if( argc<1 ) return; | ||
5761 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
5762 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
5763 | sqlite3_result_error(pContext, "illegal first argument to offsets",-1); | ||
5764 | }else{ | ||
5765 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
5766 | snippetAllOffsets(pCursor); | ||
5767 | snippetOffsetText(&pCursor->snippet); | ||
5768 | sqlite3_result_text(pContext, | ||
5769 | pCursor->snippet.zOffset, pCursor->snippet.nOffset, | ||
5770 | SQLITE_STATIC); | ||
5771 | } | ||
5772 | } | ||
5773 | |||
5774 | /* | ||
5775 | ** This routine implements the xFindFunction method for the FTS2 | ||
5776 | ** virtual table. | ||
5777 | */ | ||
5778 | static int fulltextFindFunction( | ||
5779 | sqlite3_vtab *pVtab, | ||
5780 | int nArg, | ||
5781 | const char *zName, | ||
5782 | void (**pxFunc)(sqlite3_context*,int,sqlite3_value**), | ||
5783 | void **ppArg | ||
5784 | ){ | ||
5785 | if( strcmp(zName,"snippet")==0 ){ | ||
5786 | *pxFunc = snippetFunc; | ||
5787 | return 1; | ||
5788 | }else if( strcmp(zName,"offsets")==0 ){ | ||
5789 | *pxFunc = snippetOffsetsFunc; | ||
5790 | return 1; | ||
5791 | } | ||
5792 | return 0; | ||
5793 | } | ||
5794 | |||
5795 | /* | ||
5796 | ** Rename an fts2 table. | ||
5797 | */ | ||
5798 | static int fulltextRename( | ||
5799 | sqlite3_vtab *pVtab, | ||
5800 | const char *zName | ||
5801 | ){ | ||
5802 | fulltext_vtab *p = (fulltext_vtab *)pVtab; | ||
5803 | int rc = SQLITE_NOMEM; | ||
5804 | char *zSql = sqlite3_mprintf( | ||
5805 | "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';" | ||
5806 | "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';" | ||
5807 | "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';" | ||
5808 | , p->zDb, p->zName, zName | ||
5809 | , p->zDb, p->zName, zName | ||
5810 | , p->zDb, p->zName, zName | ||
5811 | ); | ||
5812 | if( zSql ){ | ||
5813 | rc = sqlite3_exec(p->db, zSql, 0, 0, 0); | ||
5814 | sqlite3_free(zSql); | ||
5815 | } | ||
5816 | return rc; | ||
5817 | } | ||
5818 | |||
/* Virtual-table method dispatch table registered with
** sqlite3_create_module_v2() in sqlite3Fts2Init().  SQLite calls the
** fts2 implementation exclusively through these callbacks.
*/
static const sqlite3_module fts2Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename       */ fulltextRename,
};
5841 | |||
5842 | static void hashDestroy(void *p){ | ||
5843 | fts2Hash *pHash = (fts2Hash *)p; | ||
5844 | sqlite3Fts2HashClear(pHash); | ||
5845 | sqlite3_free(pHash); | ||
5846 | } | ||
5847 | |||
/*
** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
** in files fts2_tokenizer1.c and fts2_porter.c respectively. The following
** forward declarations are for functions declared in these files
** used to retrieve the respective implementations.
**
** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
** Function ...PorterTokenizerModule() sets *pModule to point to the
** porter tokenizer/stemmer implementation.  The ICU variant is only
** used when SQLITE_ENABLE_ICU is defined (see sqlite3Fts2Init() below).
*/
void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);

/* Registers the tokenizer hash table with the given connection under the
** given SQL function name ("fts2_tokenizer").  NOTE(review): defined
** outside this file -- confirm exact semantics in fts2_tokenizer.c. */
int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
5864 | |||
/*
** Initialise the fts2 extension. If this extension is built as part
** of the sqlite library, then this function is called directly by
** SQLite. If fts2 is built as a dynamically loadable extension, this
** function is called by the sqlite3_extension_init() entry point.
**
** Returns SQLITE_OK on success; SQLITE_NOMEM if the tokenizer hash
** table cannot be allocated or populated; otherwise the error code
** from the registration calls below.
*/
int sqlite3Fts2Init(sqlite3 *db){
  int rc = SQLITE_OK;
  fts2Hash *pHash = 0;
  const sqlite3_tokenizer_module *pSimple = 0;
  const sqlite3_tokenizer_module *pPorter = 0;
  const sqlite3_tokenizer_module *pIcu = 0;   /* stays NULL without ICU */

  sqlite3Fts2SimpleTokenizerModule(&pSimple);
  sqlite3Fts2PorterTokenizerModule(&pPorter);
#ifdef SQLITE_ENABLE_ICU
  sqlite3Fts2IcuTokenizerModule(&pIcu);
#endif

  /* Allocate and initialise the hash-table used to store tokenizers. */
  pHash = sqlite3_malloc(sizeof(fts2Hash));
  if( !pHash ){
    rc = SQLITE_NOMEM;
  }else{
    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
  }

  /* Load the built-in tokenizers into the hash table.  The key lengths
  ** (7, 7, 4) include each name's terminating NUL byte. */
  if( rc==SQLITE_OK ){
    if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
     || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter)
     || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
    ){
      rc = SQLITE_NOMEM;   /* HashInsert returns the data pointer on OOM */
    }
  }

  /* Create the virtual table wrapper around the hash-table and overload
  ** the two scalar functions. If this is successful, register the
  ** module with sqlite.  On success, ownership of pHash passes to the
  ** module; hashDestroy() frees it when the module is unloaded.
  */
  if( SQLITE_OK==rc
   && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
  ){
    return sqlite3_create_module_v2(
        db, "fts2", &fts2Module, (void *)pHash, hashDestroy
    );
  }

  /* An error has occurred. Delete the hash table and return the error code. */
  assert( rc!=SQLITE_OK );
  if( pHash ){
    sqlite3Fts2HashClear(pHash);
    sqlite3_free(pHash);
  }
  return rc;
}
5924 | |||
#if !SQLITE_CORE
/* Entry point used when fts2 is loaded as a run-time loadable
** extension: initialise the extension API shim and delegate to
** sqlite3Fts2Init().  pzErrMsg is not used by this implementation.
*/
int sqlite3_extension_init(
  sqlite3 *db,
  char **pzErrMsg,
  const sqlite3_api_routines *pApi
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts2Init(db);
}
#endif
5935 | |||
5936 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h deleted file mode 100644 index 4da4c38..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h +++ /dev/null | |||
@@ -1,26 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 Oct 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ****************************************************************************** | ||
12 | ** | ||
13 | ** This header file is used by programs that want to link against the | ||
14 | ** FTS2 library. All it does is declare the sqlite3Fts2Init() interface. | ||
15 | */ | ||
16 | #include "sqlite3.h" | ||
17 | |||
18 | #ifdef __cplusplus | ||
19 | extern "C" { | ||
20 | #endif /* __cplusplus */ | ||
21 | |||
22 | int sqlite3Fts2Init(sqlite3 *db); | ||
23 | |||
24 | #ifdef __cplusplus | ||
25 | } /* extern "C" */ | ||
26 | #endif /* __cplusplus */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c deleted file mode 100644 index fcd5cc2..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c +++ /dev/null | |||
@@ -1,369 +0,0 @@ | |||
1 | /* | ||
2 | ** 2001 September 22 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** This is the implementation of generic hash-tables used in SQLite. | ||
13 | ** We've modified it slightly to serve as a standalone hash table | ||
14 | ** implementation for the full-text indexing module. | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | ** The code in this file is only compiled if: | ||
19 | ** | ||
20 | ** * The FTS2 module is being built as an extension | ||
21 | ** (in which case SQLITE_CORE is not defined), or | ||
22 | ** | ||
23 | ** * The FTS2 module is being built into the core of | ||
24 | ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). | ||
25 | */ | ||
26 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) | ||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <string.h> | ||
31 | |||
32 | #include "fts2_hash.h" | ||
33 | |||
/* Allocate an n-byte block of zero-filled memory, or return NULL on
** allocation failure.  Matches the void *(*)(int) signature stored in
** fts2Hash.xMalloc by sqlite3Fts2HashInit().
*/
static void *malloc_and_zero(int n){
  /* calloc() allocates and zeroes in one call; equivalent to the old
  ** malloc()+memset() pair but simpler and potentially cheaper (the
  ** allocator can hand back already-zeroed pages). */
  return calloc(1, (size_t)n);
}
41 | |||
42 | /* Turn bulk memory into a hash table object by initializing the | ||
43 | ** fields of the Hash structure. | ||
44 | ** | ||
45 | ** "pNew" is a pointer to the hash table that is to be initialized. | ||
46 | ** keyClass is one of the constants | ||
47 | ** FTS2_HASH_BINARY or FTS2_HASH_STRING. The value of keyClass | ||
48 | ** determines what kind of key the hash table will use. "copyKey" is | ||
49 | ** true if the hash table should make its own private copy of keys and | ||
50 | ** false if it should just use the supplied pointer. | ||
51 | */ | ||
52 | void sqlite3Fts2HashInit(fts2Hash *pNew, int keyClass, int copyKey){ | ||
53 | assert( pNew!=0 ); | ||
54 | assert( keyClass>=FTS2_HASH_STRING && keyClass<=FTS2_HASH_BINARY ); | ||
55 | pNew->keyClass = keyClass; | ||
56 | pNew->copyKey = copyKey; | ||
57 | pNew->first = 0; | ||
58 | pNew->count = 0; | ||
59 | pNew->htsize = 0; | ||
60 | pNew->ht = 0; | ||
61 | pNew->xMalloc = malloc_and_zero; | ||
62 | pNew->xFree = free; | ||
63 | } | ||
64 | |||
65 | /* Remove all entries from a hash table. Reclaim all memory. | ||
66 | ** Call this routine to delete a hash table or to reset a hash table | ||
67 | ** to the empty state. | ||
68 | */ | ||
69 | void sqlite3Fts2HashClear(fts2Hash *pH){ | ||
70 | fts2HashElem *elem; /* For looping over all elements of the table */ | ||
71 | |||
72 | assert( pH!=0 ); | ||
73 | elem = pH->first; | ||
74 | pH->first = 0; | ||
75 | if( pH->ht ) pH->xFree(pH->ht); | ||
76 | pH->ht = 0; | ||
77 | pH->htsize = 0; | ||
78 | while( elem ){ | ||
79 | fts2HashElem *next_elem = elem->next; | ||
80 | if( pH->copyKey && elem->pKey ){ | ||
81 | pH->xFree(elem->pKey); | ||
82 | } | ||
83 | pH->xFree(elem); | ||
84 | elem = next_elem; | ||
85 | } | ||
86 | pH->count = 0; | ||
87 | } | ||
88 | |||
/*
** Key hashing for FTS2_HASH_STRING mode.  A non-positive nKey means
** "use strlen(pKey)".  The result is always non-negative.
*/
static int strHash(const void *pKey, int nKey){
  const char *z = (const char *)pKey;
  int i;
  int h = 0;
  if( nKey<=0 ) nKey = (int) strlen(z);
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ z[i];
  }
  return h & 0x7fffffff;
}
/* Key comparison for FTS2_HASH_STRING mode: keys of unequal length are
** unequal (returns 1); otherwise the sign of strncmp() decides. */
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  if( n1==n2 ){
    return strncmp((const char*)pKey1,(const char*)pKey2,n1);
  }
  return 1;
}
106 | |||
/*
** Key hashing for FTS2_HASH_BINARY mode: same mixing function as
** strHash() but nKey is taken at face value (no strlen fallback).
*/
static int binHash(const void *pKey, int nKey){
  const char *z = (const char *)pKey;
  int i;
  int h = 0;
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ z[i];
  }
  return h & 0x7fffffff;
}
/* Key comparison for FTS2_HASH_BINARY mode: unequal lengths compare as
** unequal (returns 1); otherwise the sign of memcmp() decides. */
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  if( n1==n2 ){
    return memcmp(pKey1,pKey2,n1);
  }
  return 1;
}
122 | |||
123 | /* | ||
124 | ** Return a pointer to the appropriate hash function given the key class. | ||
125 | ** | ||
126 | ** The C syntax in this function definition may be unfamilar to some | ||
127 | ** programmers, so we provide the following additional explanation: | ||
128 | ** | ||
129 | ** The name of the function is "hashFunction". The function takes a | ||
130 | ** single parameter "keyClass". The return value of hashFunction() | ||
131 | ** is a pointer to another function. Specifically, the return value | ||
132 | ** of hashFunction() is a pointer to a function that takes two parameters | ||
133 | ** with types "const void*" and "int" and returns an "int". | ||
134 | */ | ||
135 | static int (*hashFunction(int keyClass))(const void*,int){ | ||
136 | if( keyClass==FTS2_HASH_STRING ){ | ||
137 | return &strHash; | ||
138 | }else{ | ||
139 | assert( keyClass==FTS2_HASH_BINARY ); | ||
140 | return &binHash; | ||
141 | } | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | ** Return a pointer to the appropriate hash function given the key class. | ||
146 | ** | ||
147 | ** For help in interpreted the obscure C code in the function definition, | ||
148 | ** see the header comment on the previous function. | ||
149 | */ | ||
150 | static int (*compareFunction(int keyClass))(const void*,int,const void*,int){ | ||
151 | if( keyClass==FTS2_HASH_STRING ){ | ||
152 | return &strCompare; | ||
153 | }else{ | ||
154 | assert( keyClass==FTS2_HASH_BINARY ); | ||
155 | return &binCompare; | ||
156 | } | ||
157 | } | ||
158 | |||
/* Link an element into the hash table.  pNew becomes the head of
** pEntry's chain, and is also spliced into the table-wide doubly-linked
** list (rooted at pH->first) immediately before the bucket's previous
** head, so each bucket's elements form a contiguous run of that list.
*/
static void insertElement(
  fts2Hash *pH,            /* The complete hash table */
  struct _fts2ht *pEntry,  /* The entry into which pNew is inserted */
  fts2HashElem *pNew       /* The element to be inserted */
){
  fts2HashElem *pHead;     /* First element already in pEntry */
  pHead = pEntry->chain;
  if( pHead ){
    /* Non-empty bucket: splice pNew into the global list just before
    ** the current bucket head. */
    pNew->next = pHead;
    pNew->prev = pHead->prev;
    if( pHead->prev ){ pHead->prev->next = pNew; }
    else             { pH->first = pNew; }
    pHead->prev = pNew;
  }else{
    /* Empty bucket: push pNew onto the front of the global list. */
    pNew->next = pH->first;
    if( pH->first ){ pH->first->prev = pNew; }
    pNew->prev = 0;
    pH->first = pNew;
  }
  pEntry->count++;
  pEntry->chain = pNew;    /* pNew is the new bucket head */
}
183 | |||
184 | |||
/* Resize the hash table so that it contains "new_size" buckets.
** "new_size" must be a power of 2.  The hash table might fail
** to resize if xMalloc() fails; in that case the table is left
** untouched (callers detect this by re-checking pH->htsize).
**
** NOTE(review): this relies on pH->xMalloc returning zeroed memory
** (malloc_and_zero does), since insertElement() reads the fresh
** buckets' chain/count fields.
*/
static void rehash(fts2Hash *pH, int new_size){
  struct _fts2ht *new_ht;          /* The new hash table */
  fts2HashElem *elem, *next_elem;  /* For looping over existing elements */
  int (*xHash)(const void*,int);   /* The hash function */

  assert( (new_size & (new_size-1))==0 );   /* power of 2 */
  new_ht = (struct _fts2ht *)pH->xMalloc( new_size*sizeof(struct _fts2ht) );
  if( new_ht==0 ) return;
  if( pH->ht ) pH->xFree(pH->ht);
  pH->ht = new_ht;
  pH->htsize = new_size;
  xHash = hashFunction(pH->keyClass);
  /* Re-bucket every element; the mask works because new_size is 2^k. */
  for(elem=pH->first, pH->first=0; elem; elem = next_elem){
    int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
    next_elem = elem->next;
    insertElement(pH, &new_ht[h], elem);
  }
}
207 | |||
208 | /* This function (for internal use only) locates an element in an | ||
209 | ** hash table that matches the given key. The hash for this key has | ||
210 | ** already been computed and is passed as the 4th parameter. | ||
211 | */ | ||
212 | static fts2HashElem *findElementGivenHash( | ||
213 | const fts2Hash *pH, /* The pH to be searched */ | ||
214 | const void *pKey, /* The key we are searching for */ | ||
215 | int nKey, | ||
216 | int h /* The hash for this key. */ | ||
217 | ){ | ||
218 | fts2HashElem *elem; /* Used to loop thru the element list */ | ||
219 | int count; /* Number of elements left to test */ | ||
220 | int (*xCompare)(const void*,int,const void*,int); /* comparison function */ | ||
221 | |||
222 | if( pH->ht ){ | ||
223 | struct _fts2ht *pEntry = &pH->ht[h]; | ||
224 | elem = pEntry->chain; | ||
225 | count = pEntry->count; | ||
226 | xCompare = compareFunction(pH->keyClass); | ||
227 | while( count-- && elem ){ | ||
228 | if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){ | ||
229 | return elem; | ||
230 | } | ||
231 | elem = elem->next; | ||
232 | } | ||
233 | } | ||
234 | return 0; | ||
235 | } | ||
236 | |||
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.
**
** Unlinks elem from the table-wide list and its bucket chain, frees the
** key copy when the table owns it, then frees elem itself.  When the
** last element is removed the whole table is reset via fts2HashClear
** (NOTE(review): presumably a macro alias for sqlite3Fts2HashClear in
** fts2_hash.h -- confirm there).
*/
static void removeElementGivenHash(
  fts2Hash *pH,         /* The pH containing "elem" */
  fts2HashElem* elem,   /* The element to be removed from the pH */
  int h                 /* Hash value for the element */
){
  struct _fts2ht *pEntry;
  /* Unlink from the table-wide doubly-linked list. */
  if( elem->prev ){
    elem->prev->next = elem->next;
  }else{
    pH->first = elem->next;
  }
  if( elem->next ){
    elem->next->prev = elem->prev;
  }
  /* Fix the bucket: if elem was the bucket head, its successor in the
  ** global list is the next element of the same bucket (or the bucket
  ** is now empty, handled by the count check below). */
  pEntry = &pH->ht[h];
  if( pEntry->chain==elem ){
    pEntry->chain = elem->next;
  }
  pEntry->count--;
  if( pEntry->count<=0 ){
    pEntry->chain = 0;
  }
  if( pH->copyKey && elem->pKey ){
    pH->xFree(elem->pKey);
  }
  pH->xFree( elem );
  pH->count--;
  if( pH->count<=0 ){
    /* Table is empty: release the bucket array too. */
    assert( pH->first==0 );
    assert( pH->count==0 );
    fts2HashClear(pH);
  }
}
273 | |||
274 | /* Attempt to locate an element of the hash table pH with a key | ||
275 | ** that matches pKey,nKey. Return the data for this element if it is | ||
276 | ** found, or NULL if there is no match. | ||
277 | */ | ||
278 | void *sqlite3Fts2HashFind(const fts2Hash *pH, const void *pKey, int nKey){ | ||
279 | int h; /* A hash on key */ | ||
280 | fts2HashElem *elem; /* The element that matches key */ | ||
281 | int (*xHash)(const void*,int); /* The hash function */ | ||
282 | |||
283 | if( pH==0 || pH->ht==0 ) return 0; | ||
284 | xHash = hashFunction(pH->keyClass); | ||
285 | assert( xHash!=0 ); | ||
286 | h = (*xHash)(pKey,nKey); | ||
287 | assert( (pH->htsize & (pH->htsize-1))==0 ); | ||
288 | elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1)); | ||
289 | return elem ? elem->data : 0; | ||
290 | } | ||
291 | |||
292 | /* Insert an element into the hash table pH. The key is pKey,nKey | ||
293 | ** and the data is "data". | ||
294 | ** | ||
295 | ** If no element exists with a matching key, then a new | ||
296 | ** element is created. A copy of the key is made if the copyKey | ||
297 | ** flag is set. NULL is returned. | ||
298 | ** | ||
299 | ** If another element already exists with the same key, then the | ||
300 | ** new data replaces the old data and the old data is returned. | ||
301 | ** The key is not copied in this instance. If a malloc fails, then | ||
302 | ** the new data is returned and the hash table is unchanged. | ||
303 | ** | ||
304 | ** If the "data" parameter to this function is NULL, then the | ||
305 | ** element corresponding to "key" is removed from the hash table. | ||
306 | */ | ||
307 | void *sqlite3Fts2HashInsert( | ||
308 | fts2Hash *pH, /* The hash table to insert into */ | ||
309 | const void *pKey, /* The key */ | ||
310 | int nKey, /* Number of bytes in the key */ | ||
311 | void *data /* The data */ | ||
312 | ){ | ||
313 | int hraw; /* Raw hash value of the key */ | ||
314 | int h; /* the hash of the key modulo hash table size */ | ||
315 | fts2HashElem *elem; /* Used to loop thru the element list */ | ||
316 | fts2HashElem *new_elem; /* New element added to the pH */ | ||
317 | int (*xHash)(const void*,int); /* The hash function */ | ||
318 | |||
319 | assert( pH!=0 ); | ||
320 | xHash = hashFunction(pH->keyClass); | ||
321 | assert( xHash!=0 ); | ||
322 | hraw = (*xHash)(pKey, nKey); | ||
323 | assert( (pH->htsize & (pH->htsize-1))==0 ); | ||
324 | h = hraw & (pH->htsize-1); | ||
325 | elem = findElementGivenHash(pH,pKey,nKey,h); | ||
326 | if( elem ){ | ||
327 | void *old_data = elem->data; | ||
328 | if( data==0 ){ | ||
329 | removeElementGivenHash(pH,elem,h); | ||
330 | }else{ | ||
331 | elem->data = data; | ||
332 | } | ||
333 | return old_data; | ||
334 | } | ||
335 | if( data==0 ) return 0; | ||
336 | new_elem = (fts2HashElem*)pH->xMalloc( sizeof(fts2HashElem) ); | ||
337 | if( new_elem==0 ) return data; | ||
338 | if( pH->copyKey && pKey!=0 ){ | ||
339 | new_elem->pKey = pH->xMalloc( nKey ); | ||
340 | if( new_elem->pKey==0 ){ | ||
341 | pH->xFree(new_elem); | ||
342 | return data; | ||
343 | } | ||
344 | memcpy((void*)new_elem->pKey, pKey, nKey); | ||
345 | }else{ | ||
346 | new_elem->pKey = (void*)pKey; | ||
347 | } | ||
348 | new_elem->nKey = nKey; | ||
349 | pH->count++; | ||
350 | if( pH->htsize==0 ){ | ||
351 | rehash(pH,8); | ||
352 | if( pH->htsize==0 ){ | ||
353 | pH->count = 0; | ||
354 | pH->xFree(new_elem); | ||
355 | return data; | ||
356 | } | ||
357 | } | ||
358 | if( pH->count > pH->htsize ){ | ||
359 | rehash(pH,pH->htsize*2); | ||
360 | } | ||
361 | assert( pH->htsize>0 ); | ||
362 | assert( (pH->htsize & (pH->htsize-1))==0 ); | ||
363 | h = hraw & (pH->htsize-1); | ||
364 | insertElement(pH, &pH->ht[h], new_elem); | ||
365 | new_elem->data = data; | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h deleted file mode 100644 index 97f3529..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h +++ /dev/null | |||
@@ -1,112 +0,0 @@ | |||
/*
** 2001 September 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This is the header file for the generic hash-table implementation
** used in SQLite.  We've modified it slightly to serve as a standalone
** hash table implementation for the full-text indexing module.
*/
#ifndef _FTS2_HASH_H_
#define _FTS2_HASH_H_

/* Forward declarations of structures. */
typedef struct fts2Hash fts2Hash;
typedef struct fts2HashElem fts2HashElem;

/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly.  Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct fts2Hash {
  char keyClass;          /* HASH_INT, _POINTER, _STRING, _BINARY */
  char copyKey;           /* True if copy of key made on insert */
  int count;              /* Number of entries in this table */
  fts2HashElem *first;    /* Head of the doubly-linked list of all entries */
  void *(*xMalloc)(int);  /* malloc() function to use */
  void (*xFree)(void *);  /* free() function to use */
  int htsize;             /* Number of buckets in the hash table */
  struct _fts2ht {        /* the hash table */
    int count;               /* Number of entries with this hash */
    fts2HashElem *chain;     /* Pointer to first entry with this hash */
  } *ht;
};

/* Each element in the hash table is an instance of the following
** structure.  All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct fts2HashElem {
  fts2HashElem *next, *prev; /* Next and previous elements in the table */
  void *data;                /* Data associated with this element */
  void *pKey; int nKey;      /* Key associated with this element */
};

/*
** There are 2 different modes of operation for a hash table:
**
**   FTS2_HASH_STRING        pKey points to a string that is nKey bytes long
**                           (including the null-terminator, if any).  Case
**                           is respected in comparisons.
**
**   FTS2_HASH_BINARY        pKey points to binary data nKey bytes long.
**                           memcmp() is used to compare keys.
**
** A copy of the key is made if the copyKey parameter to fts2HashInit is 1.
*/
#define FTS2_HASH_STRING    1
#define FTS2_HASH_BINARY    2

/*
** Access routines.  To delete, insert a NULL pointer.
*/
void sqlite3Fts2HashInit(fts2Hash*, int keytype, int copyKey);
void *sqlite3Fts2HashInsert(fts2Hash*, const void *pKey, int nKey, void *pData);
void *sqlite3Fts2HashFind(const fts2Hash*, const void *pKey, int nKey);
void sqlite3Fts2HashClear(fts2Hash*);

/*
** Shorthand for the functions above
*/
#define fts2HashInit   sqlite3Fts2HashInit
#define fts2HashInsert sqlite3Fts2HashInsert
#define fts2HashFind   sqlite3Fts2HashFind
#define fts2HashClear  sqlite3Fts2HashClear

/*
** Macros for looping over all elements of a hash table.  The idiom is
** like this:
**
**   fts2Hash h;
**   fts2HashElem *p;
**   ...
**   for(p=fts2HashFirst(&h); p; p=fts2HashNext(p)){
**     SomeStructure *pData = fts2HashData(p);
**     // do something with pData
**   }
*/
#define fts2HashFirst(H)  ((H)->first)
#define fts2HashNext(E)   ((E)->next)
#define fts2HashData(E)   ((E)->data)
#define fts2HashKey(E)    ((E)->pKey)
#define fts2HashKeysize(E) ((E)->nKey)

/*
** Number of entries in a hash table
*/
#define fts2HashCount(H)  ((H)->count)

#endif /* _FTS2_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c deleted file mode 100644 index ed15f33..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c +++ /dev/null | |||
@@ -1,257 +0,0 @@ | |||
1 | /* | ||
2 | ** 2007 June 22 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** This file implements a tokenizer for fts2 based on the ICU library. | ||
13 | ** | ||
14 | ** $Id: fts2_icu.c,v 1.1 2007/06/22 15:21:16 danielk1977 Exp $ | ||
15 | */ | ||
16 | |||
17 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) | ||
18 | #ifdef SQLITE_ENABLE_ICU | ||
19 | |||
20 | #include <assert.h> | ||
21 | #include <string.h> | ||
22 | #include "fts2_tokenizer.h" | ||
23 | |||
24 | #include <unicode/ubrk.h> | ||
25 | #include <unicode/ucol.h> | ||
26 | #include <unicode/ustring.h> | ||
27 | #include <unicode/utf16.h> | ||
28 | |||
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;

/* Tokenizer object: an sqlite3_tokenizer "subclass".  zLocale is the
** ICU locale used for word-break iteration; it is stored in the same
** allocation as the object itself (see icuCreate()). */
struct IcuTokenizer {
  sqlite3_tokenizer base;
  char *zLocale;
};

/* Tokenization cursor returned by icuOpen() and advanced by icuNext(). */
struct IcuCursor {
  sqlite3_tokenizer_cursor base;

  UBreakIterator *pIter;      /* ICU break-iterator object */
  int nChar;                  /* Number of UChar elements in pInput */
  UChar *aChar;               /* Copy of input using utf-16 encoding */
  int *aOffset;               /* Offsets of each character in utf-8 input */

  int nBuffer;                /* Allocated size of zBuffer, in bytes */
  char *zBuffer;              /* UTF-8 text of the current token */

  int iToken;                 /* Position counter for the next token */
};
50 | |||
51 | /* | ||
52 | ** Create a new tokenizer instance. | ||
53 | */ | ||
54 | static int icuCreate( | ||
55 | int argc, /* Number of entries in argv[] */ | ||
56 | const char * const *argv, /* Tokenizer creation arguments */ | ||
57 | sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ | ||
58 | ){ | ||
59 | IcuTokenizer *p; | ||
60 | int n = 0; | ||
61 | |||
62 | if( argc>0 ){ | ||
63 | n = strlen(argv[0])+1; | ||
64 | } | ||
65 | p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); | ||
66 | if( !p ){ | ||
67 | return SQLITE_NOMEM; | ||
68 | } | ||
69 | memset(p, 0, sizeof(IcuTokenizer)); | ||
70 | |||
71 | if( n ){ | ||
72 | p->zLocale = (char *)&p[1]; | ||
73 | memcpy(p->zLocale, argv[0], n); | ||
74 | } | ||
75 | |||
76 | *ppTokenizer = (sqlite3_tokenizer *)p; | ||
77 | |||
78 | return SQLITE_OK; | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | ** Destroy a tokenizer | ||
83 | */ | ||
84 | static int icuDestroy(sqlite3_tokenizer *pTokenizer){ | ||
85 | IcuTokenizer *p = (IcuTokenizer *)pTokenizer; | ||
86 | sqlite3_free(p); | ||
87 | return SQLITE_OK; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | ** Prepare to begin tokenizing a particular string. The input | ||
92 | ** string to be tokenized is pInput[0..nBytes-1]. A cursor | ||
93 | ** used to incrementally tokenize this string is returned in | ||
94 | ** *ppCursor. | ||
95 | */ | ||
96 | static int icuOpen( | ||
97 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
98 | const char *zInput, /* Input string */ | ||
99 | int nInput, /* Length of zInput in bytes */ | ||
100 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
101 | ){ | ||
102 | IcuTokenizer *p = (IcuTokenizer *)pTokenizer; | ||
103 | IcuCursor *pCsr; | ||
104 | |||
105 | const int32_t opt = U_FOLD_CASE_DEFAULT; | ||
106 | UErrorCode status = U_ZERO_ERROR; | ||
107 | int nChar; | ||
108 | |||
109 | UChar32 c; | ||
110 | int iInput = 0; | ||
111 | int iOut = 0; | ||
112 | |||
113 | *ppCursor = 0; | ||
114 | |||
115 | nChar = nInput+1; | ||
116 | pCsr = (IcuCursor *)sqlite3_malloc( | ||
117 | sizeof(IcuCursor) + /* IcuCursor */ | ||
118 | nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ | ||
119 | (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ | ||
120 | ); | ||
121 | if( !pCsr ){ | ||
122 | return SQLITE_NOMEM; | ||
123 | } | ||
124 | memset(pCsr, 0, sizeof(IcuCursor)); | ||
125 | pCsr->aChar = (UChar *)&pCsr[1]; | ||
126 | pCsr->aOffset = (int *)&pCsr->aChar[nChar]; | ||
127 | |||
128 | pCsr->aOffset[iOut] = iInput; | ||
129 | U8_NEXT(zInput, iInput, nInput, c); | ||
130 | while( c>0 ){ | ||
131 | int isError = 0; | ||
132 | c = u_foldCase(c, opt); | ||
133 | U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); | ||
134 | if( isError ){ | ||
135 | sqlite3_free(pCsr); | ||
136 | return SQLITE_ERROR; | ||
137 | } | ||
138 | pCsr->aOffset[iOut] = iInput; | ||
139 | |||
140 | if( iInput<nInput ){ | ||
141 | U8_NEXT(zInput, iInput, nInput, c); | ||
142 | }else{ | ||
143 | c = 0; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); | ||
148 | if( !U_SUCCESS(status) ){ | ||
149 | sqlite3_free(pCsr); | ||
150 | return SQLITE_ERROR; | ||
151 | } | ||
152 | pCsr->nChar = iOut; | ||
153 | |||
154 | ubrk_first(pCsr->pIter); | ||
155 | *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; | ||
156 | return SQLITE_OK; | ||
157 | } | ||
158 | |||
/*
** Close a tokenization cursor previously opened by a call to icuOpen().
** Releases the ICU break iterator, the UTF-8 token buffer (allocated
** lazily by icuNext()), and the cursor object itself (which owns the
** aChar[] and aOffset[] arrays in the same allocation).
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pCsr = (IcuCursor *)pCursor;
  ubrk_close(pCsr->pIter);
  sqlite3_free(pCsr->zBuffer);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}
169 | |||
/*
** Extract the next token from a tokenization cursor.
**
** Word boundaries come from the ICU break iterator.  Whitespace at the
** start of a break segment is skipped, and empty segments are skipped
** entirely.  The UTF-16 token text is converted back to UTF-8 into
** pCsr->zBuffer (grown as needed), and byte offsets into the original
** UTF-8 input are reported via pCsr->aOffset[].
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  /* Advance until a non-empty segment remains after whitespace
  ** stripping, or the input is exhausted. */
  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    /* Skip leading whitespace within the segment.
    ** NOTE(review): aChar is a UChar (UTF-16) array, but U8_NEXT reads
    ** it as UTF-8; U16_NEXT looks like the intended macro here -- works
    ** for ASCII whitespace only.  Confirm against later fts3_icu.c. */
    while( iStart<iEnd ){
      int iWhite = iStart;
      U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  /* Transcode aChar[iStart..iEnd) back to UTF-8.  On the first pass
  ** nByte==0, so no reallocation happens; u_strToUTF8() reports the
  ** required size in nByte, and the loop runs once more with a
  ** large-enough buffer if the current one was too small. */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
234 | |||
/*
** The set of routines that implement the ICU tokenizer.
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy (was mislabeled "xCreate") */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};
246 | |||
247 | /* | ||
248 | ** Set *ppModule to point at the implementation of the ICU tokenizer. | ||
249 | */ | ||
250 | void sqlite3Fts2IcuTokenizerModule( | ||
251 | sqlite3_tokenizer_module const**ppModule | ||
252 | ){ | ||
253 | *ppModule = &icuTokenizerModule; | ||
254 | } | ||
255 | |||
256 | #endif /* defined(SQLITE_ENABLE_ICU) */ | ||
257 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c deleted file mode 100644 index dab1849..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c +++ /dev/null | |||
@@ -1,642 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 September 30 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** Implementation of the full-text-search tokenizer that implements | ||
13 | ** a Porter stemmer. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | ** The code in this file is only compiled if: | ||
18 | ** | ||
19 | ** * The FTS2 module is being built as an extension | ||
20 | ** (in which case SQLITE_CORE is not defined), or | ||
21 | ** | ||
22 | ** * The FTS2 module is being built into the core of | ||
23 | ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). | ||
24 | */ | ||
25 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) | ||
26 | |||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <stdio.h> | ||
31 | #include <string.h> | ||
32 | #include <ctype.h> | ||
33 | |||
34 | #include "fts2_tokenizer.h" | ||
35 | |||
/*
** Class derived from sqlite3_tokenizer
*/
typedef struct porter_tokenizer {
  sqlite3_tokenizer base;      /* Base class */
} porter_tokenizer;

/*
** Class derived from sqlite3_tokenizer_cursor
*/
typedef struct porter_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *zInput;          /* input we are tokenizing */
  int nInput;                  /* size of the input */
  int iOffset;                 /* current position in zInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  int nAllocated;              /* space allocated to zToken buffer */
} porter_tokenizer_cursor;
55 | |||
56 | |||
57 | /* Forward declaration */ | ||
58 | static const sqlite3_tokenizer_module porterTokenizerModule; | ||
59 | |||
60 | |||
61 | /* | ||
62 | ** Create a new tokenizer instance. | ||
63 | */ | ||
64 | static int porterCreate( | ||
65 | int argc, const char * const *argv, | ||
66 | sqlite3_tokenizer **ppTokenizer | ||
67 | ){ | ||
68 | porter_tokenizer *t; | ||
69 | t = (porter_tokenizer *) calloc(sizeof(*t), 1); | ||
70 | if( t==NULL ) return SQLITE_NOMEM; | ||
71 | |||
72 | *ppTokenizer = &t->base; | ||
73 | return SQLITE_OK; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | ** Destroy a tokenizer | ||
78 | */ | ||
79 | static int porterDestroy(sqlite3_tokenizer *pTokenizer){ | ||
80 | free(pTokenizer); | ||
81 | return SQLITE_OK; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | ** Prepare to begin tokenizing a particular string. The input | ||
86 | ** string to be tokenized is zInput[0..nInput-1]. A cursor | ||
87 | ** used to incrementally tokenize this string is returned in | ||
88 | ** *ppCursor. | ||
89 | */ | ||
90 | static int porterOpen( | ||
91 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
92 | const char *zInput, int nInput, /* String to be tokenized */ | ||
93 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
94 | ){ | ||
95 | porter_tokenizer_cursor *c; | ||
96 | |||
97 | c = (porter_tokenizer_cursor *) malloc(sizeof(*c)); | ||
98 | if( c==NULL ) return SQLITE_NOMEM; | ||
99 | |||
100 | c->zInput = zInput; | ||
101 | if( zInput==0 ){ | ||
102 | c->nInput = 0; | ||
103 | }else if( nInput<0 ){ | ||
104 | c->nInput = (int)strlen(zInput); | ||
105 | }else{ | ||
106 | c->nInput = nInput; | ||
107 | } | ||
108 | c->iOffset = 0; /* start tokenizing at the beginning */ | ||
109 | c->iToken = 0; | ||
110 | c->zToken = NULL; /* no space allocated, yet. */ | ||
111 | c->nAllocated = 0; | ||
112 | |||
113 | *ppCursor = &c->base; | ||
114 | return SQLITE_OK; | ||
115 | } | ||
116 | |||
/*
** Close a tokenization cursor previously opened by a call to
** porterOpen() above.  Frees the lazily-allocated token buffer
** (free(NULL) is a no-op) and the cursor itself.  The input string
** is owned by the caller and is not freed here.
*/
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  free(c->zToken);
  free(c);
  return SQLITE_OK;
}
/*
** Classification of 'a'..'z' for the Porter rules:
** 0 = vowel, 1 = consonant, 2 = 'y' (context-dependent).
*/
static const char cType[] = {
   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
   1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() classify the first character of the
** string they point to, according to Porter rules:
**
**   A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
**   'y' is a consonant unless it follows another consonant, in which
**   case it is a vowel.
**
** The strings handled here are stored in REVERSE order, so the 'y'
** rule becomes: 'y' is a consonant unless it is FOLLOWED (in the
** reversed string) by another consonant.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  int cls;
  char ch = z[0];
  if( ch==0 ) return 0;
  assert( ch>='a' && ch<='z' );
  cls = cType[ch-'a'];
  if( cls<2 ) return cls;
  /* 'y' at end of string counts as a consonant */
  return z[1]==0 || isVowel(&z[1]);
}
static int isVowel(const char *z){
  int cls;
  char ch = z[0];
  if( ch==0 ) return 0;
  assert( ch>='a' && ch<='z' );
  cls = cType[ch-'a'];
  if( cls<2 ) return 1-cls;
  return isConsonant(&z[1]);
}

/*
** Every word decomposes as [C](VC){m}[V], where C is a run of one or
** more consonants and V a run of one or more vowels.  This routine
** returns true if the m-value of the reversed word z is 1 or more --
** i.e. the reversed string contains a vowel followed by a consonant
** (which is a consonant followed by a vowel in forward order).
*/
static int m_gt_0(const char *z){
  for(; isVowel(z); z++){}
  if( z[0]==0 ) return 0;
  for(; isConsonant(z); z++){}
  return z[0]!=0;
}

/* Like m_gt_0() above, but true only when m is exactly 1. */
static int m_eq_1(const char *z){
  for(; isVowel(z); z++){}
  if( z[0]==0 ) return 0;
  for(; isConsonant(z); z++){}
  if( z[0]==0 ) return 0;
  for(; isVowel(z); z++){}
  if( z[0]==0 ) return 1;
  for(; isConsonant(z); z++){}
  return z[0]==0;
}

/* Like m_gt_0() above, but true only when m is greater than 1. */
static int m_gt_1(const char *z){
  for(; isVowel(z); z++){}
  if( z[0]==0 ) return 0;
  for(; isConsonant(z); z++){}
  if( z[0]==0 ) return 0;
  for(; isVowel(z); z++){}
  if( z[0]==0 ) return 0;
  for(; isConsonant(z); z++){}
  return z[0]!=0;
}

/*
** Return TRUE if the (reversed) word z contains at least one vowel.
*/
static int hasVowel(const char *z){
  for(; isConsonant(z); z++){}
  return z[0]!=0;
}

/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here, so this really examines the first two
** characters of z[].
*/
static int doubleConsonant(const char *z){
  return isConsonant(z) && z[0]==z[1] && isConsonant(&z[1]);
}

/*
** Return TRUE if the word ends with three letters that are
** consonant-vowel-consonant where the final consonant is not
** 'w', 'x' or 'y'.
**
** The word is reversed here, so this really checks the first three
** letters, and it is the FIRST one that must not be in [wxy].
*/
static int star_oh(const char *z){
  if( z[0]==0 || !isConsonant(z) ) return 0;
  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
  if( z[1]==0 || !isVowel(&z[1]) ) return 0;
  return z[2]!=0 && isConsonant(&z[2]);
}
255 | |||
/*
** If the word ends with zFrom and xCond() is true for the stem of
** the word that precedes the zFrom ending, then change the ending
** to zTo.
**
** The input word *pz and zFrom are both in reverse order.  zTo is
** in normal order.
**
** Return TRUE whenever zFrom matches, even if xCond() fails and no
** substitution occurs.  Return FALSE only when zFrom does not match.
*/
static int stem(
  char **pz,                /* The word being stemmed (Reversed) */
  const char *zFrom,        /* If the ending matches this... (Reversed) */
  const char *zTo,          /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*) /* Condition that must be true */
){
  char *zStem = *pz;

  /* The reversed word must begin with the reversed suffix. */
  for(; *zFrom; zStem++, zFrom++){
    if( *zFrom!=*zStem ) return 0;
  }

  /* Suffix matched; substitute only if the condition holds for the
  ** remaining stem. */
  if( xCond && !xCond(zStem) ) return 1;

  /* Write zTo (given forwards) backwards, immediately before the stem.
  ** The caller guarantees slack space in front of the buffer. */
  while( *zTo ){
    zStem--;
    *zStem = *zTo;
    zTo++;
  }
  *pz = zStem;
  return 1;
}
284 | |||
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate.  The input word is copied into the output with
** US-ASCII case folding.  If the input word is too long (more
** than 20 bytes if it contains no digits or more than 6 bytes if
** it contains digits) then the word is truncated to 20 or 6 bytes
** by taking 10 or 3 bytes from the beginning and end.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i;
  int nOut;
  int seenDigit = 0;

  /* Copy with US-ASCII lower-casing, remembering whether any digit
  ** was seen (digits shorten the truncation limit below). */
  for(i=0; i<nIn; i++){
    int c = zIn[i];
    if( c>='A' && c<='Z' ){
      c = c - 'A' + 'a';
    }else if( c>='0' && c<='9' ){
      seenDigit = 1;
    }
    zOut[i] = c;
  }
  nOut = nIn;

  /* Truncate long words: keep mx bytes from each end. */
  {
    int mx = seenDigit ? 3 : 10;
    if( nIn>mx*2 ){
      int j;
      for(j=mx, i=nIn-mx; i<nIn; i++, j++){
        zOut[j] = zOut[i];
      }
      nOut = j;
    }
  }
  zOut[nOut] = 0;
  *pnOut = nOut;
}
315 | |||
316 | |||
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
**
** If the input word contains no digits but does contain characters
** not in [a-zA-Z], then no stemming is attempted and this routine
** just copies the input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word into zReverse, lower-cased and in reverse order,
  ** placed at the END of the buffer.  The unused front of zReverse
  ** is slack space for stem() to grow the word into; the 5 zero
  ** bytes written below terminate the reversed string and pad the
  ** lookahead reads (z[1]..z[3]) in the steps that follow. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  z = &zReverse[j+1];


  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2.  Suffix substitutions keyed on the second character of
  ** the (reversed) word; each suffix string below is reversed. */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4.  Remove residual suffixes when m>1; the lookahead reads
  ** z[2]/z[3] are safe because of the zero padding written above. */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
556 | |||
557 | /* | ||
558 | ** Characters that can be part of a token. We assume any character | ||
559 | ** whose value is greater than 0x80 (any UTF character) can be | ||
560 | ** part of a token. In other words, delimiters all must have | ||
561 | ** values of 0x7f or lower. | ||
562 | */ | ||
563 | static const char porterIdChar[] = { | ||
564 | /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ | ||
565 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ | ||
566 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ | ||
567 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ | ||
568 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ | ||
569 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ | ||
570 | }; | ||
571 | #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30])) | ||
572 | |||
573 | /* | ||
574 | ** Extract the next token from a tokenization cursor. The cursor must | ||
575 | ** have been opened by a prior call to porterOpen(). | ||
576 | */ | ||
577 | static int porterNext( | ||
578 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */ | ||
579 | const char **pzToken, /* OUT: *pzToken is the token text */ | ||
580 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
581 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
582 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
583 | int *piPosition /* OUT: Position integer of token */ | ||
584 | ){ | ||
585 | porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; | ||
586 | const char *z = c->zInput; | ||
587 | |||
588 | while( c->iOffset<c->nInput ){ | ||
589 | int iStartOffset, ch; | ||
590 | |||
591 | /* Scan past delimiter characters */ | ||
592 | while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){ | ||
593 | c->iOffset++; | ||
594 | } | ||
595 | |||
596 | /* Count non-delimiter characters. */ | ||
597 | iStartOffset = c->iOffset; | ||
598 | while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ | ||
599 | c->iOffset++; | ||
600 | } | ||
601 | |||
602 | if( c->iOffset>iStartOffset ){ | ||
603 | int n = c->iOffset-iStartOffset; | ||
604 | if( n>c->nAllocated ){ | ||
605 | c->nAllocated = n+20; | ||
606 | c->zToken = realloc(c->zToken, c->nAllocated); | ||
607 | if( c->zToken==NULL ) return SQLITE_NOMEM; | ||
608 | } | ||
609 | porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); | ||
610 | *pzToken = c->zToken; | ||
611 | *piStartOffset = iStartOffset; | ||
612 | *piEndOffset = c->iOffset; | ||
613 | *piPosition = c->iToken++; | ||
614 | return SQLITE_OK; | ||
615 | } | ||
616 | } | ||
617 | return SQLITE_DONE; | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | ** The set of routines that implement the porter-stemmer tokenizer | ||
622 | */ | ||
623 | static const sqlite3_tokenizer_module porterTokenizerModule = { | ||
624 | 0, | ||
625 | porterCreate, | ||
626 | porterDestroy, | ||
627 | porterOpen, | ||
628 | porterClose, | ||
629 | porterNext, | ||
630 | }; | ||
631 | |||
632 | /* | ||
633 | ** Allocate a new porter tokenizer. Return a pointer to the new | ||
634 | ** tokenizer in *ppModule | ||
635 | */ | ||
636 | void sqlite3Fts2PorterTokenizerModule( | ||
637 | sqlite3_tokenizer_module const**ppModule | ||
638 | ){ | ||
639 | *ppModule = &porterTokenizerModule; | ||
640 | } | ||
641 | |||
642 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c deleted file mode 100644 index cbf771b..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c +++ /dev/null | |||
@@ -1,371 +0,0 @@ | |||
1 | /* | ||
2 | ** 2007 June 22 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ****************************************************************************** | ||
12 | ** | ||
13 | ** This is part of an SQLite module implementing full-text search. | ||
14 | ** This particular file implements the generic tokenizer interface. | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | ** The code in this file is only compiled if: | ||
19 | ** | ||
20 | ** * The FTS2 module is being built as an extension | ||
21 | ** (in which case SQLITE_CORE is not defined), or | ||
22 | ** | ||
23 | ** * The FTS2 module is being built into the core of | ||
24 | ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). | ||
25 | */ | ||
26 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) | ||
27 | |||
28 | |||
29 | #include "sqlite3.h" | ||
30 | #include "sqlite3ext.h" | ||
31 | SQLITE_EXTENSION_INIT1 | ||
32 | |||
33 | #include "fts2_hash.h" | ||
34 | #include "fts2_tokenizer.h" | ||
35 | #include <assert.h> | ||
36 | |||
37 | /* | ||
38 | ** Implementation of the SQL scalar function for accessing the underlying | ||
39 | ** hash table. This function may be called as follows: | ||
40 | ** | ||
41 | ** SELECT <function-name>(<key-name>); | ||
42 | ** SELECT <function-name>(<key-name>, <pointer>); | ||
43 | ** | ||
44 | ** where <function-name> is the name passed as the second argument | ||
45 | ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer'). | ||
46 | ** | ||
47 | ** If the <pointer> argument is specified, it must be a blob value | ||
48 | ** containing a pointer to be stored as the hash data corresponding | ||
49 | ** to the string <key-name>. If <pointer> is not specified, then | ||
50 | ** the string <key-name> must already exist in the has table. Otherwise, | ||
51 | ** an error is returned. | ||
52 | ** | ||
53 | ** Whether or not the <pointer> argument is specified, the value returned | ||
54 | ** is a blob containing the pointer stored as the hash data corresponding | ||
55 | ** to string <key-name> (after the hash-table is updated, if applicable). | ||
56 | */ | ||
57 | static void scalarFunc( | ||
58 | sqlite3_context *context, | ||
59 | int argc, | ||
60 | sqlite3_value **argv | ||
61 | ){ | ||
62 | fts2Hash *pHash; | ||
63 | void *pPtr = 0; | ||
64 | const unsigned char *zName; | ||
65 | int nName; | ||
66 | |||
67 | assert( argc==1 || argc==2 ); | ||
68 | |||
69 | pHash = (fts2Hash *)sqlite3_user_data(context); | ||
70 | |||
71 | zName = sqlite3_value_text(argv[0]); | ||
72 | nName = sqlite3_value_bytes(argv[0])+1; | ||
73 | |||
74 | if( argc==2 ){ | ||
75 | void *pOld; | ||
76 | int n = sqlite3_value_bytes(argv[1]); | ||
77 | if( n!=sizeof(pPtr) ){ | ||
78 | sqlite3_result_error(context, "argument type mismatch", -1); | ||
79 | return; | ||
80 | } | ||
81 | pPtr = *(void **)sqlite3_value_blob(argv[1]); | ||
82 | pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr); | ||
83 | if( pOld==pPtr ){ | ||
84 | sqlite3_result_error(context, "out of memory", -1); | ||
85 | return; | ||
86 | } | ||
87 | }else{ | ||
88 | pPtr = sqlite3Fts2HashFind(pHash, zName, nName); | ||
89 | if( !pPtr ){ | ||
90 | char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); | ||
91 | sqlite3_result_error(context, zErr, -1); | ||
92 | sqlite3_free(zErr); | ||
93 | return; | ||
94 | } | ||
95 | } | ||
96 | |||
97 | sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT); | ||
98 | } | ||
99 | |||
100 | #ifdef SQLITE_TEST | ||
101 | |||
102 | #include <tcl.h> | ||
103 | #include <string.h> | ||
104 | |||
105 | /* | ||
106 | ** Implementation of a special SQL scalar function for testing tokenizers | ||
107 | ** designed to be used in concert with the Tcl testing framework. This | ||
108 | ** function must be called with two arguments: | ||
109 | ** | ||
110 | ** SELECT <function-name>(<key-name>, <input-string>); | ||
111 | ** SELECT <function-name>(<key-name>, <pointer>); | ||
112 | ** | ||
113 | ** where <function-name> is the name passed as the second argument | ||
114 | ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer') | ||
115 | ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test'). | ||
116 | ** | ||
117 | ** The return value is a string that may be interpreted as a Tcl | ||
118 | ** list. For each token in the <input-string>, three elements are | ||
119 | ** added to the returned list. The first is the token position, the | ||
120 | ** second is the token text (folded, stemmed, etc.) and the third is the | ||
121 | ** substring of <input-string> associated with the token. For example, | ||
122 | ** using the built-in "simple" tokenizer: | ||
123 | ** | ||
124 | ** SELECT fts_tokenizer_test('simple', 'I don't see how'); | ||
125 | ** | ||
126 | ** will return the string: | ||
127 | ** | ||
128 | ** "{0 i I 1 dont don't 2 see see 3 how how}" | ||
129 | ** | ||
130 | */ | ||
131 | static void testFunc( | ||
132 | sqlite3_context *context, | ||
133 | int argc, | ||
134 | sqlite3_value **argv | ||
135 | ){ | ||
136 | fts2Hash *pHash; | ||
137 | sqlite3_tokenizer_module *p; | ||
138 | sqlite3_tokenizer *pTokenizer = 0; | ||
139 | sqlite3_tokenizer_cursor *pCsr = 0; | ||
140 | |||
141 | const char *zErr = 0; | ||
142 | |||
143 | const char *zName; | ||
144 | int nName; | ||
145 | const char *zInput; | ||
146 | int nInput; | ||
147 | |||
148 | const char *zArg = 0; | ||
149 | |||
150 | const char *zToken; | ||
151 | int nToken; | ||
152 | int iStart; | ||
153 | int iEnd; | ||
154 | int iPos; | ||
155 | |||
156 | Tcl_Obj *pRet; | ||
157 | |||
158 | assert( argc==2 || argc==3 ); | ||
159 | |||
160 | nName = sqlite3_value_bytes(argv[0]); | ||
161 | zName = (const char *)sqlite3_value_text(argv[0]); | ||
162 | nInput = sqlite3_value_bytes(argv[argc-1]); | ||
163 | zInput = (const char *)sqlite3_value_text(argv[argc-1]); | ||
164 | |||
165 | if( argc==3 ){ | ||
166 | zArg = (const char *)sqlite3_value_text(argv[1]); | ||
167 | } | ||
168 | |||
169 | pHash = (fts2Hash *)sqlite3_user_data(context); | ||
170 | p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1); | ||
171 | |||
172 | if( !p ){ | ||
173 | char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); | ||
174 | sqlite3_result_error(context, zErr, -1); | ||
175 | sqlite3_free(zErr); | ||
176 | return; | ||
177 | } | ||
178 | |||
179 | pRet = Tcl_NewObj(); | ||
180 | Tcl_IncrRefCount(pRet); | ||
181 | |||
182 | if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ | ||
183 | zErr = "error in xCreate()"; | ||
184 | goto finish; | ||
185 | } | ||
186 | pTokenizer->pModule = p; | ||
187 | if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){ | ||
188 | zErr = "error in xOpen()"; | ||
189 | goto finish; | ||
190 | } | ||
191 | pCsr->pTokenizer = pTokenizer; | ||
192 | |||
193 | while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){ | ||
194 | Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos)); | ||
195 | Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); | ||
196 | zToken = &zInput[iStart]; | ||
197 | nToken = iEnd-iStart; | ||
198 | Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); | ||
199 | } | ||
200 | |||
201 | if( SQLITE_OK!=p->xClose(pCsr) ){ | ||
202 | zErr = "error in xClose()"; | ||
203 | goto finish; | ||
204 | } | ||
205 | if( SQLITE_OK!=p->xDestroy(pTokenizer) ){ | ||
206 | zErr = "error in xDestroy()"; | ||
207 | goto finish; | ||
208 | } | ||
209 | |||
210 | finish: | ||
211 | if( zErr ){ | ||
212 | sqlite3_result_error(context, zErr, -1); | ||
213 | }else{ | ||
214 | sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT); | ||
215 | } | ||
216 | Tcl_DecrRefCount(pRet); | ||
217 | } | ||
218 | |||
219 | static | ||
220 | int registerTokenizer( | ||
221 | sqlite3 *db, | ||
222 | char *zName, | ||
223 | const sqlite3_tokenizer_module *p | ||
224 | ){ | ||
225 | int rc; | ||
226 | sqlite3_stmt *pStmt; | ||
227 | const char zSql[] = "SELECT fts2_tokenizer(?, ?)"; | ||
228 | |||
229 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
230 | if( rc!=SQLITE_OK ){ | ||
231 | return rc; | ||
232 | } | ||
233 | |||
234 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
235 | sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); | ||
236 | sqlite3_step(pStmt); | ||
237 | |||
238 | return sqlite3_finalize(pStmt); | ||
239 | } | ||
240 | |||
241 | static | ||
242 | int queryTokenizer( | ||
243 | sqlite3 *db, | ||
244 | char *zName, | ||
245 | const sqlite3_tokenizer_module **pp | ||
246 | ){ | ||
247 | int rc; | ||
248 | sqlite3_stmt *pStmt; | ||
249 | const char zSql[] = "SELECT fts2_tokenizer(?)"; | ||
250 | |||
251 | *pp = 0; | ||
252 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
253 | if( rc!=SQLITE_OK ){ | ||
254 | return rc; | ||
255 | } | ||
256 | |||
257 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
258 | if( SQLITE_ROW==sqlite3_step(pStmt) ){ | ||
259 | if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ | ||
260 | memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); | ||
261 | } | ||
262 | } | ||
263 | |||
264 | return sqlite3_finalize(pStmt); | ||
265 | } | ||
266 | |||
267 | void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
268 | |||
269 | /* | ||
270 | ** Implementation of the scalar function fts2_tokenizer_internal_test(). | ||
271 | ** This function is used for testing only, it is not included in the | ||
272 | ** build unless SQLITE_TEST is defined. | ||
273 | ** | ||
274 | ** The purpose of this is to test that the fts2_tokenizer() function | ||
275 | ** can be used as designed by the C-code in the queryTokenizer and | ||
276 | ** registerTokenizer() functions above. These two functions are repeated | ||
277 | ** in the README.tokenizer file as an example, so it is important to | ||
278 | ** test them. | ||
279 | ** | ||
280 | ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar | ||
281 | ** function with no arguments. An assert() will fail if a problem is | ||
282 | ** detected. i.e.: | ||
283 | ** | ||
284 | ** SELECT fts2_tokenizer_internal_test(); | ||
285 | ** | ||
286 | */ | ||
287 | static void intTestFunc( | ||
288 | sqlite3_context *context, | ||
289 | int argc, | ||
290 | sqlite3_value **argv | ||
291 | ){ | ||
292 | int rc; | ||
293 | const sqlite3_tokenizer_module *p1; | ||
294 | const sqlite3_tokenizer_module *p2; | ||
295 | sqlite3 *db = (sqlite3 *)sqlite3_user_data(context); | ||
296 | |||
297 | /* Test the query function */ | ||
298 | sqlite3Fts2SimpleTokenizerModule(&p1); | ||
299 | rc = queryTokenizer(db, "simple", &p2); | ||
300 | assert( rc==SQLITE_OK ); | ||
301 | assert( p1==p2 ); | ||
302 | rc = queryTokenizer(db, "nosuchtokenizer", &p2); | ||
303 | assert( rc==SQLITE_ERROR ); | ||
304 | assert( p2==0 ); | ||
305 | assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") ); | ||
306 | |||
307 | /* Test the storage function */ | ||
308 | rc = registerTokenizer(db, "nosuchtokenizer", p1); | ||
309 | assert( rc==SQLITE_OK ); | ||
310 | rc = queryTokenizer(db, "nosuchtokenizer", &p2); | ||
311 | assert( rc==SQLITE_OK ); | ||
312 | assert( p2==p1 ); | ||
313 | |||
314 | sqlite3_result_text(context, "ok", -1, SQLITE_STATIC); | ||
315 | } | ||
316 | |||
317 | #endif | ||
318 | |||
319 | /* | ||
320 | ** Set up SQL objects in database db used to access the contents of | ||
321 | ** the hash table pointed to by argument pHash. The hash table must | ||
322 | ** been initialised to use string keys, and to take a private copy | ||
323 | ** of the key when a value is inserted. i.e. by a call similar to: | ||
324 | ** | ||
325 | ** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1); | ||
326 | ** | ||
327 | ** This function adds a scalar function (see header comment above | ||
328 | ** scalarFunc() in this file for details) and, if ENABLE_TABLE is | ||
329 | ** defined at compilation time, a temporary virtual table (see header | ||
330 | ** comment above struct HashTableVtab) to the database schema. Both | ||
331 | ** provide read/write access to the contents of *pHash. | ||
332 | ** | ||
333 | ** The third argument to this function, zName, is used as the name | ||
334 | ** of both the scalar and, if created, the virtual table. | ||
335 | */ | ||
336 | int sqlite3Fts2InitHashTable( | ||
337 | sqlite3 *db, | ||
338 | fts2Hash *pHash, | ||
339 | const char *zName | ||
340 | ){ | ||
341 | int rc = SQLITE_OK; | ||
342 | void *p = (void *)pHash; | ||
343 | const int any = SQLITE_ANY; | ||
344 | char *zTest = 0; | ||
345 | char *zTest2 = 0; | ||
346 | |||
347 | #ifdef SQLITE_TEST | ||
348 | void *pdb = (void *)db; | ||
349 | zTest = sqlite3_mprintf("%s_test", zName); | ||
350 | zTest2 = sqlite3_mprintf("%s_internal_test", zName); | ||
351 | if( !zTest || !zTest2 ){ | ||
352 | rc = SQLITE_NOMEM; | ||
353 | } | ||
354 | #endif | ||
355 | |||
356 | if( rc!=SQLITE_OK | ||
357 | || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0)) | ||
358 | || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0)) | ||
359 | #ifdef SQLITE_TEST | ||
360 | || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0)) | ||
361 | || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0)) | ||
362 | || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0)) | ||
363 | #endif | ||
364 | ); | ||
365 | |||
366 | sqlite3_free(zTest); | ||
367 | sqlite3_free(zTest2); | ||
368 | return rc; | ||
369 | } | ||
370 | |||
371 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h deleted file mode 100644 index 8c256b2..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h +++ /dev/null | |||
@@ -1,145 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 July 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. | ||
5 | ** | ||
6 | ************************************************************************* | ||
7 | ** Defines the interface to tokenizers used by fulltext-search. There | ||
8 | ** are three basic components: | ||
9 | ** | ||
10 | ** sqlite3_tokenizer_module is a singleton defining the tokenizer | ||
11 | ** interface functions. This is essentially the class structure for | ||
12 | ** tokenizers. | ||
13 | ** | ||
14 | ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps | ||
15 | ** including customization information defined at creation time. | ||
16 | ** | ||
17 | ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate | ||
18 | ** tokens from a particular input. | ||
19 | */ | ||
20 | #ifndef _FTS2_TOKENIZER_H_ | ||
21 | #define _FTS2_TOKENIZER_H_ | ||
22 | |||
23 | /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. | ||
24 | ** If tokenizers are to be allowed to call sqlite3_*() functions, then | ||
25 | ** we will need a way to register the API consistently. | ||
26 | */ | ||
27 | #include "sqlite3.h" | ||
28 | |||
29 | /* | ||
30 | ** Structures used by the tokenizer interface. When a new tokenizer | ||
31 | ** implementation is registered, the caller provides a pointer to | ||
32 | ** an sqlite3_tokenizer_module containing pointers to the callback | ||
33 | ** functions that make up an implementation. | ||
34 | ** | ||
35 | ** When an fts2 table is created, it passes any arguments passed to | ||
36 | ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the | ||
37 | ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer | ||
38 | ** implementation. The xCreate() function in turn returns an | ||
39 | ** sqlite3_tokenizer structure representing the specific tokenizer to | ||
40 | ** be used for the fts2 table (customized by the tokenizer clause arguments). | ||
41 | ** | ||
42 | ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen() | ||
43 | ** method is called. It returns an sqlite3_tokenizer_cursor object | ||
44 | ** that may be used to tokenize a specific input buffer based on | ||
45 | ** the tokenization rules supplied by a specific sqlite3_tokenizer | ||
46 | ** object. | ||
47 | */ | ||
48 | typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; | ||
49 | typedef struct sqlite3_tokenizer sqlite3_tokenizer; | ||
50 | typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; | ||
51 | |||
52 | struct sqlite3_tokenizer_module { | ||
53 | |||
54 | /* | ||
55 | ** Structure version. Should always be set to 0. | ||
56 | */ | ||
57 | int iVersion; | ||
58 | |||
59 | /* | ||
60 | ** Create a new tokenizer. The values in the argv[] array are the | ||
61 | ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL | ||
62 | ** TABLE statement that created the fts2 table. For example, if | ||
63 | ** the following SQL is executed: | ||
64 | ** | ||
65 | ** CREATE .. USING fts2( ... , tokenizer <tokenizer-name> arg1 arg2) | ||
66 | ** | ||
67 | ** then argc is set to 2, and the argv[] array contains pointers | ||
68 | ** to the strings "arg1" and "arg2". | ||
69 | ** | ||
70 | ** This method should return either SQLITE_OK (0), or an SQLite error | ||
71 | ** code. If SQLITE_OK is returned, then *ppTokenizer should be set | ||
72 | ** to point at the newly created tokenizer structure. The generic | ||
73 | ** sqlite3_tokenizer.pModule variable should not be initialised by | ||
74 | ** this callback. The caller will do so. | ||
75 | */ | ||
76 | int (*xCreate)( | ||
77 | int argc, /* Size of argv array */ | ||
78 | const char *const*argv, /* Tokenizer argument strings */ | ||
79 | sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ | ||
80 | ); | ||
81 | |||
82 | /* | ||
83 | ** Destroy an existing tokenizer. The fts2 module calls this method | ||
84 | ** exactly once for each successful call to xCreate(). | ||
85 | */ | ||
86 | int (*xDestroy)(sqlite3_tokenizer *pTokenizer); | ||
87 | |||
88 | /* | ||
89 | ** Create a tokenizer cursor to tokenize an input buffer. The caller | ||
90 | ** is responsible for ensuring that the input buffer remains valid | ||
91 | ** until the cursor is closed (using the xClose() method). | ||
92 | */ | ||
93 | int (*xOpen)( | ||
94 | sqlite3_tokenizer *pTokenizer, /* Tokenizer object */ | ||
95 | const char *pInput, int nBytes, /* Input buffer */ | ||
96 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */ | ||
97 | ); | ||
98 | |||
99 | /* | ||
100 | ** Destroy an existing tokenizer cursor. The fts2 module calls this | ||
101 | ** method exactly once for each successful call to xOpen(). | ||
102 | */ | ||
103 | int (*xClose)(sqlite3_tokenizer_cursor *pCursor); | ||
104 | |||
105 | /* | ||
106 | ** Retrieve the next token from the tokenizer cursor pCursor. This | ||
107 | ** method should either return SQLITE_OK and set the values of the | ||
108 | ** "OUT" variables identified below, or SQLITE_DONE to indicate that | ||
109 | ** the end of the buffer has been reached, or an SQLite error code. | ||
110 | ** | ||
111 | ** *ppToken should be set to point at a buffer containing the | ||
112 | ** normalized version of the token (i.e. after any case-folding and/or | ||
113 | ** stemming has been performed). *pnBytes should be set to the length | ||
114 | ** of this buffer in bytes. The input text that generated the token is | ||
115 | ** identified by the byte offsets returned in *piStartOffset and | ||
116 | ** *piEndOffset. | ||
117 | ** | ||
118 | ** The buffer *ppToken is set to point at is managed by the tokenizer | ||
119 | ** implementation. It is only required to be valid until the next call | ||
120 | ** to xNext() or xClose(). | ||
121 | */ | ||
122 | /* TODO(shess) current implementation requires pInput to be | ||
123 | ** nul-terminated. This should either be fixed, or pInput/nBytes | ||
124 | ** should be converted to zInput. | ||
125 | */ | ||
126 | int (*xNext)( | ||
127 | sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */ | ||
128 | const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */ | ||
129 | int *piStartOffset, /* OUT: Byte offset of token in input buffer */ | ||
130 | int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ | ||
131 | int *piPosition /* OUT: Number of tokens returned before this one */ | ||
132 | ); | ||
133 | }; | ||
134 | |||
135 | struct sqlite3_tokenizer { | ||
136 | const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ | ||
137 | /* Tokenizer implementations will typically add additional fields */ | ||
138 | }; | ||
139 | |||
140 | struct sqlite3_tokenizer_cursor { | ||
141 | sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */ | ||
142 | /* Tokenizer implementations will typically add additional fields */ | ||
143 | }; | ||
144 | |||
145 | #endif /* _FTS2_TOKENIZER_H_ */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c deleted file mode 100644 index 540ba27..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c +++ /dev/null | |||
@@ -1,229 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 Oct 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ****************************************************************************** | ||
12 | ** | ||
13 | ** Implementation of the "simple" full-text-search tokenizer. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | ** The code in this file is only compiled if: | ||
18 | ** | ||
19 | ** * The FTS2 module is being built as an extension | ||
20 | ** (in which case SQLITE_CORE is not defined), or | ||
21 | ** | ||
22 | ** * The FTS2 module is being built into the core of | ||
23 | ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). | ||
24 | */ | ||
25 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) | ||
26 | |||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <stdio.h> | ||
31 | #include <string.h> | ||
32 | #include <ctype.h> | ||
33 | |||
34 | #include "fts2_tokenizer.h" | ||
35 | |||
36 | typedef struct simple_tokenizer { | ||
37 | sqlite3_tokenizer base; | ||
38 | char delim[128]; /* flag ASCII delimiters */ | ||
39 | } simple_tokenizer; | ||
40 | |||
41 | typedef struct simple_tokenizer_cursor { | ||
42 | sqlite3_tokenizer_cursor base; | ||
43 | const char *pInput; /* input we are tokenizing */ | ||
44 | int nBytes; /* size of the input */ | ||
45 | int iOffset; /* current position in pInput */ | ||
46 | int iToken; /* index of next token to be returned */ | ||
47 | char *pToken; /* storage for current token */ | ||
48 | int nTokenAllocated; /* space allocated to zToken buffer */ | ||
49 | } simple_tokenizer_cursor; | ||
50 | |||
51 | |||
52 | /* Forward declaration */ | ||
53 | static const sqlite3_tokenizer_module simpleTokenizerModule; | ||
54 | |||
55 | static int simpleDelim(simple_tokenizer *t, unsigned char c){ | ||
56 | return c<0x80 && t->delim[c]; | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | ** Create a new tokenizer instance. | ||
61 | */ | ||
62 | static int simpleCreate( | ||
63 | int argc, const char * const *argv, | ||
64 | sqlite3_tokenizer **ppTokenizer | ||
65 | ){ | ||
66 | simple_tokenizer *t; | ||
67 | |||
68 | t = (simple_tokenizer *) calloc(sizeof(*t), 1); | ||
69 | if( t==NULL ) return SQLITE_NOMEM; | ||
70 | |||
71 | /* TODO(shess) Delimiters need to remain the same from run to run, | ||
72 | ** else we need to reindex. One solution would be a meta-table to | ||
73 | ** track such information in the database, then we'd only want this | ||
74 | ** information on the initial create. | ||
75 | */ | ||
76 | if( argc>1 ){ | ||
77 | int i, n = strlen(argv[1]); | ||
78 | for(i=0; i<n; i++){ | ||
79 | unsigned char ch = argv[1][i]; | ||
80 | /* We explicitly don't support UTF-8 delimiters for now. */ | ||
81 | if( ch>=0x80 ){ | ||
82 | free(t); | ||
83 | return SQLITE_ERROR; | ||
84 | } | ||
85 | t->delim[ch] = 1; | ||
86 | } | ||
87 | } else { | ||
88 | /* Mark non-alphanumeric ASCII characters as delimiters */ | ||
89 | int i; | ||
90 | for(i=1; i<0x80; i++){ | ||
91 | t->delim[i] = !isalnum(i); | ||
92 | } | ||
93 | } | ||
94 | |||
95 | *ppTokenizer = &t->base; | ||
96 | return SQLITE_OK; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | ** Destroy a tokenizer | ||
101 | */ | ||
102 | static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ | ||
103 | free(pTokenizer); | ||
104 | return SQLITE_OK; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | ** Prepare to begin tokenizing a particular string. The input | ||
109 | ** string to be tokenized is pInput[0..nBytes-1]. A cursor | ||
110 | ** used to incrementally tokenize this string is returned in | ||
111 | ** *ppCursor. | ||
112 | */ | ||
113 | static int simpleOpen( | ||
114 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
115 | const char *pInput, int nBytes, /* String to be tokenized */ | ||
116 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
117 | ){ | ||
118 | simple_tokenizer_cursor *c; | ||
119 | |||
120 | c = (simple_tokenizer_cursor *) malloc(sizeof(*c)); | ||
121 | if( c==NULL ) return SQLITE_NOMEM; | ||
122 | |||
123 | c->pInput = pInput; | ||
124 | if( pInput==0 ){ | ||
125 | c->nBytes = 0; | ||
126 | }else if( nBytes<0 ){ | ||
127 | c->nBytes = (int)strlen(pInput); | ||
128 | }else{ | ||
129 | c->nBytes = nBytes; | ||
130 | } | ||
131 | c->iOffset = 0; /* start tokenizing at the beginning */ | ||
132 | c->iToken = 0; | ||
133 | c->pToken = NULL; /* no space allocated, yet. */ | ||
134 | c->nTokenAllocated = 0; | ||
135 | |||
136 | *ppCursor = &c->base; | ||
137 | return SQLITE_OK; | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | ** Close a tokenization cursor previously opened by a call to | ||
142 | ** simpleOpen() above. | ||
143 | */ | ||
144 | static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ | ||
145 | simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; | ||
146 | free(c->pToken); | ||
147 | free(c); | ||
148 | return SQLITE_OK; | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | ** Extract the next token from a tokenization cursor. The cursor must | ||
153 | ** have been opened by a prior call to simpleOpen(). | ||
154 | */ | ||
155 | static int simpleNext( | ||
156 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ | ||
157 | const char **ppToken, /* OUT: *ppToken is the token text */ | ||
158 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
159 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
160 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
161 | int *piPosition /* OUT: Position integer of token */ | ||
162 | ){ | ||
163 | simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; | ||
164 | simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; | ||
165 | unsigned char *p = (unsigned char *)c->pInput; | ||
166 | |||
167 | while( c->iOffset<c->nBytes ){ | ||
168 | int iStartOffset; | ||
169 | |||
170 | /* Scan past delimiter characters */ | ||
171 | while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){ | ||
172 | c->iOffset++; | ||
173 | } | ||
174 | |||
175 | /* Count non-delimiter characters. */ | ||
176 | iStartOffset = c->iOffset; | ||
177 | while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){ | ||
178 | c->iOffset++; | ||
179 | } | ||
180 | |||
181 | if( c->iOffset>iStartOffset ){ | ||
182 | int i, n = c->iOffset-iStartOffset; | ||
183 | if( n>c->nTokenAllocated ){ | ||
184 | c->nTokenAllocated = n+20; | ||
185 | c->pToken = realloc(c->pToken, c->nTokenAllocated); | ||
186 | if( c->pToken==NULL ) return SQLITE_NOMEM; | ||
187 | } | ||
188 | for(i=0; i<n; i++){ | ||
189 | /* TODO(shess) This needs expansion to handle UTF-8 | ||
190 | ** case-insensitivity. | ||
191 | */ | ||
192 | unsigned char ch = p[iStartOffset+i]; | ||
193 | c->pToken[i] = ch<0x80 ? tolower(ch) : ch; | ||
194 | } | ||
195 | *ppToken = c->pToken; | ||
196 | *pnBytes = n; | ||
197 | *piStartOffset = iStartOffset; | ||
198 | *piEndOffset = c->iOffset; | ||
199 | *piPosition = c->iToken++; | ||
200 | |||
201 | return SQLITE_OK; | ||
202 | } | ||
203 | } | ||
204 | return SQLITE_DONE; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | ** The set of routines that implement the simple tokenizer | ||
209 | */ | ||
210 | static const sqlite3_tokenizer_module simpleTokenizerModule = { | ||
211 | 0, | ||
212 | simpleCreate, | ||
213 | simpleDestroy, | ||
214 | simpleOpen, | ||
215 | simpleClose, | ||
216 | simpleNext, | ||
217 | }; | ||
218 | |||
219 | /* | ||
220 | ** Allocate a new simple tokenizer. Return a pointer to the new | ||
221 | ** tokenizer in *ppModule | ||
222 | */ | ||
223 | void sqlite3Fts2SimpleTokenizerModule( | ||
224 | sqlite3_tokenizer_module const**ppModule | ||
225 | ){ | ||
226 | *ppModule = &simpleTokenizerModule; | ||
227 | } | ||
228 | |||
229 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl deleted file mode 100644 index 5c8d1e9..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl +++ /dev/null | |||
@@ -1,116 +0,0 @@ | |||
1 | #!/usr/bin/tclsh | ||
2 | # | ||
3 | # This script builds a single C code file holding all of FTS2 code. | ||
4 | # The name of the output file is fts2amal.c. To build this file, | ||
5 | # first do: | ||
6 | # | ||
7 | # make target_source | ||
8 | # | ||
9 | # The make target above moves all of the source code files into | ||
10 | # a subdirectory named "tsrc". (This script expects to find the files | ||
11 | # there and will not work if they are not found.) | ||
12 | # | ||
13 | # After the "tsrc" directory has been created and populated, run | ||
14 | # this script: | ||
15 | # | ||
16 | # tclsh mkfts2amal.tcl | ||
17 | # | ||
18 | # The amalgamated FTS2 code will be written into fts2amal.c | ||
19 | # | ||
20 | |||
21 | # Open the output file and write a header comment at the beginning | ||
22 | # of the file. | ||
23 | # | ||
24 | set out [open fts2amal.c w] | ||
25 | set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1] | ||
26 | puts $out [subst \ | ||
27 | {/****************************************************************************** | ||
28 | ** This file is an amalgamation of separate C source files from the SQLite | ||
29 | ** Full Text Search extension 2 (fts2). By combining all the individual C | ||
30 | ** code files into this single large file, the entire code can be compiled | ||
31 | ** as a one translation unit. This allows many compilers to do optimizations | ||
32 | ** that would not be possible if the files were compiled separately. It also | ||
33 | ** makes the code easier to import into other projects. | ||
34 | ** | ||
35 | ** This amalgamation was generated on $today. | ||
36 | */}] | ||
37 | |||
38 | # These are the header files used by FTS2. The first time any of these | ||
39 | # files are seen in a #include statement in the C code, include the complete | ||
40 | # text of the file in-line. The file only needs to be included once. | ||
41 | # | ||
42 | foreach hdr { | ||
43 | fts2.h | ||
44 | fts2_hash.h | ||
45 | fts2_tokenizer.h | ||
46 | sqlite3.h | ||
47 | sqlite3ext.h | ||
48 | } { | ||
49 | set available_hdr($hdr) 1 | ||
50 | } | ||
51 | |||
52 | # 78 stars used for comment formatting. | ||
53 | set s78 \ | ||
54 | {*****************************************************************************} | ||
55 | |||
# Emit a banner comment into the amalgamation so a reader can see
# where each original source file begins, is interrupted, and ends.
#
proc section_comment {text} {
  global out s78
  set nstar [expr {60 - [string length $text]}]
  set stars [string range $s78 0 $nstar]
  puts $out "/************** $text $stars/"
}
65 | |||
# Read the source file named $filename and write it into the
# amalgamation output file. When a #include statement is seen:
#   - a known FTS2 header (in available_hdr) is expanded in-line,
#     bracketed by section comments, so it is included exactly once;
#   - any other header is passed through the first time it is seen
#     and suppressed afterwards (tracked in seen_hdr).
# "#ifdef __cplusplus" guards are disabled and #line directives are
# dropped so the combined file stays a single clean translation unit.
#
proc copy_file {filename} {
  global seen_hdr available_hdr out
  set tail [file tail $filename]
  section_comment "Begin file $tail"
  set in [open $filename r]
  while {![eof $in]} {
    set line [gets $in]
    if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
      if {[info exists available_hdr($hdr)]} {
        if {$available_hdr($hdr)} {
          section_comment "Include $hdr in the middle of $tail"
          copy_file tsrc/$hdr
          section_comment "Continuing where we left off in $tail"
        }
      } elseif {![info exists seen_hdr($hdr)]} {
        set seen_hdr($hdr) 1
        puts $out $line
      }
    } elseif {[regexp {^#ifdef __cplusplus} $line]} {
      puts $out "#if 0"
    } elseif {[regexp {^#line} $line]} {
      # Skip #line directives.
    } else {
      puts $out $line
    }
  }
  close $in
  section_comment "End of $tail"
}
99 | |||
100 | |||
101 | # Process the source files. Process files containing commonly | ||
102 | # used subroutines first in order to help the compiler find | ||
103 | # inlining opportunities. | ||
104 | # | ||
105 | foreach file { | ||
106 | fts2.c | ||
107 | fts2_hash.c | ||
108 | fts2_porter.c | ||
109 | fts2_tokenizer.c | ||
110 | fts2_tokenizer1.c | ||
111 | fts2_icu.c | ||
112 | } { | ||
113 | copy_file tsrc/$file | ||
114 | } | ||
115 | |||
116 | close $out | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers deleted file mode 100644 index f214b24..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | |||
2 | 1. FTS3 Tokenizers | ||
3 | |||
4 | When creating a new full-text table, FTS3 allows the user to select | ||
5 | the text tokenizer implementation to be used when indexing text | ||
6 | by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE | ||
7 | statement: | ||
8 | |||
9 | CREATE VIRTUAL TABLE <table-name> USING fts3( | ||
10 | <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]] | ||
11 | ); | ||
12 | |||
13 | The built-in tokenizers (valid values to pass as <tokenizer name>) are | ||
14 | "simple" and "porter". | ||
15 | |||
16 | <tokenizer-args> should consist of zero or more white-space separated | ||
17 | arguments to pass to the selected tokenizer implementation. The | ||
18 | interpretation of the arguments, if any, depends on the individual | ||
19 | tokenizer. | ||
20 | |||
21 | 2. Custom Tokenizers | ||
22 | |||
23 | FTS3 allows users to provide custom tokenizer implementations. The | ||
24 | interface used to create a new tokenizer is defined and described in | ||
25 | the fts3_tokenizer.h source file. | ||
26 | |||
27 | Registering a new FTS3 tokenizer is similar to registering a new | ||
28 | virtual table module with SQLite. The user passes a pointer to a | ||
29 | structure containing pointers to various callback functions that | ||
30 | make up the implementation of the new tokenizer type. For tokenizers, | ||
31 | the structure (defined in fts3_tokenizer.h) is called | ||
32 | "sqlite3_tokenizer_module". | ||
33 | |||
34 | FTS3 does not expose a C-function that users call to register new | ||
35 | tokenizer types with a database handle. Instead, the pointer must | ||
36 | be encoded as an SQL blob value and passed to FTS3 through the SQL | ||
37 | engine by evaluating a special scalar function, "fts3_tokenizer()". | ||
38 | The fts3_tokenizer() function may be called with one or two arguments, | ||
39 | as follows: | ||
40 | |||
41 | SELECT fts3_tokenizer(<tokenizer-name>); | ||
42 | SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>); | ||
43 | |||
44 | Where <tokenizer-name> is a string identifying the tokenizer and | ||
45 | <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module | ||
46 | structure encoded as an SQL blob. If the second argument is present, | ||
47 | it is registered as tokenizer <tokenizer-name> and a copy of it | ||
48 | returned. If only one argument is passed, a pointer to the tokenizer | ||
49 | implementation currently registered as <tokenizer-name> is returned, | ||
50 | encoded as a blob. Or, if no such tokenizer exists, an SQL exception | ||
51 | (error) is raised. | ||
52 | |||
53 | SECURITY: If the fts3 extension is used in an environment where potentially | ||
54 | malicious users may execute arbitrary SQL (i.e. gears), they should be | ||
55 | prevented from invoking the fts3_tokenizer() function, possibly using the | ||
56 | authorisation callback. | ||
57 | |||
58 | See "Sample code" below for an example of calling the fts3_tokenizer() | ||
59 | function from C code. | ||
60 | |||
61 | 3. ICU Library Tokenizers | ||
62 | |||
63 | If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor | ||
64 | symbol defined, then there exists a built-in tokenizer named "icu" | ||
65 | implemented using the ICU library. The first argument passed to the | ||
66 | xCreate() method (see fts3_tokenizer.h) of this tokenizer may be | ||
67 | an ICU locale identifier. For example "tr_TR" for Turkish as used | ||
68 | in Turkey, or "en_AU" for English as used in Australia. For example: | ||
69 | |||
70 | "CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenizer icu th_TH)" | ||
71 | |||
72 | The ICU tokenizer implementation is very simple. It splits the input | ||
73 | text according to the ICU rules for finding word boundaries and discards | ||
74 | any tokens that consist entirely of white-space. This may be suitable | ||
75 | for some applications in some locales, but not all. If more complex | ||
76 | processing is required, for example to implement stemming or | ||
77 | discard punctuation, this can be done by creating a tokenizer | ||
implementation that uses the ICU tokenizer as part of its implementation.
79 | |||
80 | When using the ICU tokenizer this way, it is safe to overwrite the | ||
81 | contents of the strings returned by the xNext() method (see | ||
82 | fts3_tokenizer.h). | ||
83 | |||
84 | 4. Sample code. | ||
85 | |||
86 | The following two code samples illustrate the way C code should invoke | ||
87 | the fts3_tokenizer() scalar function: | ||
88 | |||
89 | int registerTokenizer( | ||
90 | sqlite3 *db, | ||
91 | char *zName, | ||
92 | const sqlite3_tokenizer_module *p | ||
93 | ){ | ||
94 | int rc; | ||
95 | sqlite3_stmt *pStmt; | ||
96 | const char zSql[] = "SELECT fts3_tokenizer(?, ?)"; | ||
97 | |||
98 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
99 | if( rc!=SQLITE_OK ){ | ||
100 | return rc; | ||
101 | } | ||
102 | |||
103 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
104 | sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); | ||
105 | sqlite3_step(pStmt); | ||
106 | |||
107 | return sqlite3_finalize(pStmt); | ||
108 | } | ||
109 | |||
110 | int queryTokenizer( | ||
111 | sqlite3 *db, | ||
112 | char *zName, | ||
113 | const sqlite3_tokenizer_module **pp | ||
114 | ){ | ||
115 | int rc; | ||
116 | sqlite3_stmt *pStmt; | ||
117 | const char zSql[] = "SELECT fts3_tokenizer(?)"; | ||
118 | |||
119 | *pp = 0; | ||
120 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
121 | if( rc!=SQLITE_OK ){ | ||
122 | return rc; | ||
123 | } | ||
124 | |||
125 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
126 | if( SQLITE_ROW==sqlite3_step(pStmt) ){ | ||
127 | if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ | ||
128 | memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | return sqlite3_finalize(pStmt); | ||
133 | } | ||
134 | |||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt deleted file mode 100644 index 517a2a0..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt +++ /dev/null | |||
@@ -1,4 +0,0 @@ | |||
1 | This folder contains source code to the second full-text search | ||
2 | extension for SQLite. While the API is the same, this version uses a | ||
3 | substantially different storage schema from fts1, so tables will need | ||
4 | to be rebuilt. | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c deleted file mode 100644 index b392919..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c +++ /dev/null | |||
@@ -1,5971 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 Oct 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ****************************************************************************** | ||
12 | ** | ||
13 | ** This is an SQLite module implementing full-text search. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | ** The code in this file is only compiled if: | ||
18 | ** | ||
19 | ** * The FTS3 module is being built as an extension | ||
20 | ** (in which case SQLITE_CORE is not defined), or | ||
21 | ** | ||
22 | ** * The FTS3 module is being built into the core of | ||
23 | ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | ||
24 | */ | ||
25 | |||
26 | /* TODO(shess) Consider exporting this comment to an HTML file or the | ||
27 | ** wiki. | ||
28 | */ | ||
29 | /* The full-text index is stored in a series of b+tree (-like) | ||
30 | ** structures called segments which map terms to doclists. The | ||
31 | ** structures are like b+trees in layout, but are constructed from the | ||
32 | ** bottom up in optimal fashion and are not updatable. Since trees | ||
33 | ** are built from the bottom up, things will be described from the | ||
34 | ** bottom up. | ||
35 | ** | ||
36 | ** | ||
37 | **** Varints **** | ||
38 | ** The basic unit of encoding is a variable-length integer called a | ||
39 | ** varint. We encode variable-length integers in little-endian order | ||
** using seven bits per byte as follows:
41 | ** | ||
42 | ** KEY: | ||
43 | ** A = 0xxxxxxx 7 bits of data and one flag bit | ||
44 | ** B = 1xxxxxxx 7 bits of data and one flag bit | ||
45 | ** | ||
46 | ** 7 bits - A | ||
47 | ** 14 bits - BA | ||
48 | ** 21 bits - BBA | ||
49 | ** and so on. | ||
50 | ** | ||
51 | ** This is identical to how sqlite encodes varints (see util.c). | ||
52 | ** | ||
53 | ** | ||
54 | **** Document lists **** | ||
55 | ** A doclist (document list) holds a docid-sorted list of hits for a | ||
56 | ** given term. Doclists hold docids, and can optionally associate | ||
57 | ** token positions and offsets with docids. | ||
58 | ** | ||
59 | ** A DL_POSITIONS_OFFSETS doclist is stored like this: | ||
60 | ** | ||
61 | ** array { | ||
62 | ** varint docid; | ||
63 | ** array { (position list for column 0) | ||
64 | ** varint position; (delta from previous position plus POS_BASE) | ||
65 | ** varint startOffset; (delta from previous startOffset) | ||
66 | ** varint endOffset; (delta from startOffset) | ||
67 | ** } | ||
68 | ** array { | ||
69 | ** varint POS_COLUMN; (marks start of position list for new column) | ||
70 | ** varint column; (index of new column) | ||
71 | ** array { | ||
72 | ** varint position; (delta from previous position plus POS_BASE) | ||
73 | ** varint startOffset;(delta from previous startOffset) | ||
74 | ** varint endOffset; (delta from startOffset) | ||
75 | ** } | ||
76 | ** } | ||
77 | ** varint POS_END; (marks end of positions for this document. | ||
78 | ** } | ||
79 | ** | ||
80 | ** Here, array { X } means zero or more occurrences of X, adjacent in | ||
81 | ** memory. A "position" is an index of a token in the token stream | ||
82 | ** generated by the tokenizer, while an "offset" is a byte offset, | ||
83 | ** both based at 0. Note that POS_END and POS_COLUMN occur in the | ||
** same logical place as the position element, and act as sentinels
85 | ** ending a position list array. | ||
86 | ** | ||
87 | ** A DL_POSITIONS doclist omits the startOffset and endOffset | ||
88 | ** information. A DL_DOCIDS doclist omits both the position and | ||
89 | ** offset information, becoming an array of varint-encoded docids. | ||
90 | ** | ||
91 | ** On-disk data is stored as type DL_DEFAULT, so we don't serialize | ||
92 | ** the type. Due to how deletion is implemented in the segmentation | ||
93 | ** system, on-disk doclists MUST store at least positions. | ||
94 | ** | ||
95 | ** | ||
96 | **** Segment leaf nodes **** | ||
97 | ** Segment leaf nodes store terms and doclists, ordered by term. Leaf | ||
98 | ** nodes are written using LeafWriter, and read using LeafReader (to | ||
99 | ** iterate through a single leaf node's data) and LeavesReader (to | ||
100 | ** iterate through a segment's entire leaf layer). Leaf nodes have | ||
101 | ** the format: | ||
102 | ** | ||
103 | ** varint iHeight; (height from leaf level, always 0) | ||
104 | ** varint nTerm; (length of first term) | ||
105 | ** char pTerm[nTerm]; (content of first term) | ||
106 | ** varint nDoclist; (length of term's associated doclist) | ||
107 | ** char pDoclist[nDoclist]; (content of doclist) | ||
108 | ** array { | ||
109 | ** (further terms are delta-encoded) | ||
110 | ** varint nPrefix; (length of prefix shared with previous term) | ||
111 | ** varint nSuffix; (length of unshared suffix) | ||
112 | ** char pTermSuffix[nSuffix];(unshared suffix of next term) | ||
113 | ** varint nDoclist; (length of term's associated doclist) | ||
114 | ** char pDoclist[nDoclist]; (content of doclist) | ||
115 | ** } | ||
116 | ** | ||
117 | ** Here, array { X } means zero or more occurrences of X, adjacent in | ||
118 | ** memory. | ||
119 | ** | ||
120 | ** Leaf nodes are broken into blocks which are stored contiguously in | ||
121 | ** the %_segments table in sorted order. This means that when the end | ||
122 | ** of a node is reached, the next term is in the node with the next | ||
123 | ** greater node id. | ||
124 | ** | ||
125 | ** New data is spilled to a new leaf node when the current node | ||
126 | ** exceeds LEAF_MAX bytes (default 2048). New data which itself is | ||
127 | ** larger than STANDALONE_MIN (default 1024) is placed in a standalone | ||
128 | ** node (a leaf node with a single term and doclist). The goal of | ||
129 | ** these settings is to pack together groups of small doclists while | ||
130 | ** making it efficient to directly access large doclists. The | ||
131 | ** assumption is that large doclists represent terms which are more | ||
132 | ** likely to be query targets. | ||
133 | ** | ||
134 | ** TODO(shess) It may be useful for blocking decisions to be more | ||
135 | ** dynamic. For instance, it may make more sense to have a 2.5k leaf | ||
136 | ** node rather than splitting into 2k and .5k nodes. My intuition is | ||
137 | ** that this might extend through 2x or 4x the pagesize. | ||
138 | ** | ||
139 | ** | ||
140 | **** Segment interior nodes **** | ||
141 | ** Segment interior nodes store blockids for subtree nodes and terms | ||
** to describe what data is stored by each subtree.  Interior
143 | ** nodes are written using InteriorWriter, and read using | ||
144 | ** InteriorReader. InteriorWriters are created as needed when | ||
145 | ** SegmentWriter creates new leaf nodes, or when an interior node | ||
146 | ** itself grows too big and must be split. The format of interior | ||
147 | ** nodes: | ||
148 | ** | ||
149 | ** varint iHeight; (height from leaf level, always >0) | ||
150 | ** varint iBlockid; (block id of node's leftmost subtree) | ||
151 | ** optional { | ||
152 | ** varint nTerm; (length of first term) | ||
153 | ** char pTerm[nTerm]; (content of first term) | ||
154 | ** array { | ||
155 | ** (further terms are delta-encoded) | ||
156 | ** varint nPrefix; (length of shared prefix with previous term) | ||
157 | ** varint nSuffix; (length of unshared suffix) | ||
158 | ** char pTermSuffix[nSuffix]; (unshared suffix of next term) | ||
159 | ** } | ||
160 | ** } | ||
161 | ** | ||
162 | ** Here, optional { X } means an optional element, while array { X } | ||
163 | ** means zero or more occurrences of X, adjacent in memory. | ||
164 | ** | ||
165 | ** An interior node encodes n terms separating n+1 subtrees. The | ||
166 | ** subtree blocks are contiguous, so only the first subtree's blockid | ||
167 | ** is encoded. The subtree at iBlockid will contain all terms less | ||
168 | ** than the first term encoded (or all terms if no term is encoded). | ||
169 | ** Otherwise, for terms greater than or equal to pTerm[i] but less | ||
170 | ** than pTerm[i+1], the subtree for that term will be rooted at | ||
171 | ** iBlockid+i. Interior nodes only store enough term data to | ||
172 | ** distinguish adjacent children (if the rightmost term of the left | ||
173 | ** child is "something", and the leftmost term of the right child is | ||
174 | ** "wicked", only "w" is stored). | ||
175 | ** | ||
176 | ** New data is spilled to a new interior node at the same height when | ||
177 | ** the current node exceeds INTERIOR_MAX bytes (default 2048). | ||
178 | ** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing | ||
179 | ** interior nodes and making the tree too skinny. The interior nodes | ||
180 | ** at a given height are naturally tracked by interior nodes at | ||
181 | ** height+1, and so on. | ||
182 | ** | ||
183 | ** | ||
184 | **** Segment directory **** | ||
185 | ** The segment directory in table %_segdir stores meta-information for | ||
186 | ** merging and deleting segments, and also the root node of the | ||
187 | ** segment's tree. | ||
188 | ** | ||
189 | ** The root node is the top node of the segment's tree after encoding | ||
190 | ** the entire segment, restricted to ROOT_MAX bytes (default 1024). | ||
191 | ** This could be either a leaf node or an interior node. If the top | ||
192 | ** node requires more than ROOT_MAX bytes, it is flushed to %_segments | ||
193 | ** and a new root interior node is generated (which should always fit | ||
194 | ** within ROOT_MAX because it only needs space for 2 varints, the | ||
195 | ** height and the blockid of the previous root). | ||
196 | ** | ||
197 | ** The meta-information in the segment directory is: | ||
198 | ** level - segment level (see below) | ||
199 | ** idx - index within level | ||
200 | ** - (level,idx uniquely identify a segment) | ||
201 | ** start_block - first leaf node | ||
202 | ** leaves_end_block - last leaf node | ||
203 | ** end_block - last block (including interior nodes) | ||
204 | ** root - contents of root node | ||
205 | ** | ||
206 | ** If the root node is a leaf node, then start_block, | ||
207 | ** leaves_end_block, and end_block are all 0. | ||
208 | ** | ||
209 | ** | ||
210 | **** Segment merging **** | ||
** To amortize update costs, segments are grouped into levels and
** merged in batches.  Each increase in level represents exponentially
213 | ** more documents. | ||
214 | ** | ||
215 | ** New documents (actually, document updates) are tokenized and | ||
216 | ** written individually (using LeafWriter) to a level 0 segment, with | ||
217 | ** incrementing idx. When idx reaches MERGE_COUNT (default 16), all | ||
218 | ** level 0 segments are merged into a single level 1 segment. Level 1 | ||
219 | ** is populated like level 0, and eventually MERGE_COUNT level 1 | ||
220 | ** segments are merged to a single level 2 segment (representing | ||
221 | ** MERGE_COUNT^2 updates), and so on. | ||
222 | ** | ||
223 | ** A segment merge traverses all segments at a given level in | ||
224 | ** parallel, performing a straightforward sorted merge. Since segment | ||
225 | ** leaf nodes are written in to the %_segments table in order, this | ||
226 | ** merge traverses the underlying sqlite disk structures efficiently. | ||
227 | ** After the merge, all segment blocks from the merged level are | ||
228 | ** deleted. | ||
229 | ** | ||
230 | ** MERGE_COUNT controls how often we merge segments. 16 seems to be | ||
231 | ** somewhat of a sweet spot for insertion performance. 32 and 64 show | ||
232 | ** very similar performance numbers to 16 on insertion, though they're | ||
233 | ** a tiny bit slower (perhaps due to more overhead in merge-time | ||
234 | ** sorting). 8 is about 20% slower than 16, 4 about 50% slower than | ||
235 | ** 16, 2 about 66% slower than 16. | ||
236 | ** | ||
237 | ** At query time, high MERGE_COUNT increases the number of segments | ||
238 | ** which need to be scanned and merged. For instance, with 100k docs | ||
239 | ** inserted: | ||
240 | ** | ||
241 | ** MERGE_COUNT segments | ||
242 | ** 16 25 | ||
243 | ** 8 12 | ||
244 | ** 4 10 | ||
245 | ** 2 6 | ||
246 | ** | ||
247 | ** This appears to have only a moderate impact on queries for very | ||
248 | ** frequent terms (which are somewhat dominated by segment merge | ||
249 | ** costs), and infrequent and non-existent terms still seem to be fast | ||
250 | ** even with many segments. | ||
251 | ** | ||
252 | ** TODO(shess) That said, it would be nice to have a better query-side | ||
253 | ** argument for MERGE_COUNT of 16. Also, it's possible/likely that | ||
254 | ** optimizations to things like doclist merging will swing the sweet | ||
255 | ** spot around. | ||
256 | ** | ||
257 | ** | ||
258 | ** | ||
259 | **** Handling of deletions and updates **** | ||
260 | ** Since we're using a segmented structure, with no docid-oriented | ||
261 | ** index into the term index, we clearly cannot simply update the term | ||
262 | ** index when a document is deleted or updated. For deletions, we | ||
263 | ** write an empty doclist (varint(docid) varint(POS_END)), for updates | ||
264 | ** we simply write the new doclist. Segment merges overwrite older | ||
265 | ** data for a particular docid with newer data, so deletes or updates | ||
266 | ** will eventually overtake the earlier data and knock it out. The | ||
267 | ** query logic likewise merges doclists so that newer data knocks out | ||
268 | ** older data. | ||
269 | ** | ||
270 | ** TODO(shess) Provide a VACUUM type operation to clear out all | ||
271 | ** deletions and duplications. This would basically be a forced merge | ||
272 | ** into a single segment. | ||
273 | */ | ||
274 | |||
275 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | ||
276 | |||
277 | #if defined(SQLITE_ENABLE_FTS3) && !defined(SQLITE_CORE) | ||
278 | # define SQLITE_CORE 1 | ||
279 | #endif | ||
280 | |||
281 | #include <assert.h> | ||
282 | #include <stdlib.h> | ||
283 | #include <stdio.h> | ||
284 | #include <string.h> | ||
285 | #include <ctype.h> | ||
286 | |||
287 | #include "fts3.h" | ||
288 | #include "fts3_hash.h" | ||
289 | #include "fts3_tokenizer.h" | ||
290 | #include "sqlite3.h" | ||
291 | #include "sqlite3ext.h" | ||
292 | SQLITE_EXTENSION_INIT1 | ||
293 | |||
294 | |||
295 | /* TODO(shess) MAN, this thing needs some refactoring. At minimum, it | ||
296 | ** would be nice to order the file better, perhaps something along the | ||
297 | ** lines of: | ||
298 | ** | ||
299 | ** - utility functions | ||
300 | ** - table setup functions | ||
301 | ** - table update functions | ||
302 | ** - table query functions | ||
303 | ** | ||
304 | ** Put the query functions last because they're likely to reference | ||
305 | ** typedefs or functions from the table update section. | ||
306 | */ | ||
307 | |||
308 | #if 0 | ||
309 | # define TRACE(A) printf A; fflush(stdout) | ||
310 | #else | ||
311 | # define TRACE(A) | ||
312 | #endif | ||
313 | |||
/* The <ctype.h> classifiers may not be called with hi-bit-set char
** values (behavior is undefined for arguments outside unsigned char
** range other than EOF). These wrappers consult <ctype.h> only for
** 7-bit ASCII bytes and treat every hi-bit-set byte as non-space,
** non-alnum, and caseless. Same solution as used in the tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
static int safe_isspace(char c){
  if( c&0x80 ) return 0;
  return isspace(c);
}
static int safe_tolower(char c){
  if( c&0x80 ) return c;
  return tolower(c);
}
static int safe_isalnum(char c){
  if( c&0x80 ) return 0;
  return isalnum(c);
}
332 | |||
333 | typedef enum DocListType { | ||
334 | DL_DOCIDS, /* docids only */ | ||
335 | DL_POSITIONS, /* docids + positions */ | ||
336 | DL_POSITIONS_OFFSETS /* docids + positions + offsets */ | ||
337 | } DocListType; | ||
338 | |||
339 | /* | ||
340 | ** By default, only positions and not offsets are stored in the doclists. | ||
341 | ** To change this so that offsets are stored too, compile with | ||
342 | ** | ||
343 | ** -DDL_DEFAULT=DL_POSITIONS_OFFSETS | ||
344 | ** | ||
345 | ** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted | ||
346 | ** into (no deletes or updates). | ||
347 | */ | ||
348 | #ifndef DL_DEFAULT | ||
349 | # define DL_DEFAULT DL_POSITIONS | ||
350 | #endif | ||
351 | |||
352 | enum { | ||
353 | POS_END = 0, /* end of this position list */ | ||
354 | POS_COLUMN, /* followed by new column number */ | ||
355 | POS_BASE | ||
356 | }; | ||
357 | |||
358 | /* MERGE_COUNT controls how often we merge segments (see comment at | ||
359 | ** top of file). | ||
360 | */ | ||
361 | #define MERGE_COUNT 16 | ||
362 | |||
363 | /* utility functions */ | ||
364 | |||
365 | /* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single | ||
366 | ** record to prevent errors of the form: | ||
367 | ** | ||
368 | ** my_function(SomeType *b){ | ||
369 | ** memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b) | ||
370 | ** } | ||
371 | */ | ||
372 | /* TODO(shess) Obvious candidates for a header file. */ | ||
373 | #define CLEAR(b) memset(b, '\0', sizeof(*(b))) | ||
374 | |||
375 | #ifndef NDEBUG | ||
376 | # define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b))) | ||
377 | #else | ||
378 | # define SCRAMBLE(b) | ||
379 | #endif | ||
380 | |||
381 | /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ | ||
382 | #define VARINT_MAX 10 | ||
383 | |||
384 | /* Write a 64-bit variable-length integer to memory starting at p[0]. | ||
385 | * The length of data written will be between 1 and VARINT_MAX bytes. | ||
386 | * The number of bytes written is returned. */ | ||
387 | static int putVarint(char *p, sqlite_int64 v){ | ||
388 | unsigned char *q = (unsigned char *) p; | ||
389 | sqlite_uint64 vu = v; | ||
390 | do{ | ||
391 | *q++ = (unsigned char) ((vu & 0x7f) | 0x80); | ||
392 | vu >>= 7; | ||
393 | }while( vu!=0 ); | ||
394 | q[-1] &= 0x7f; /* turn off high bit in final byte */ | ||
395 | assert( q - (unsigned char *)p <= VARINT_MAX ); | ||
396 | return (int) (q - (unsigned char *)p); | ||
397 | } | ||
398 | |||
399 | /* Read a 64-bit variable-length integer from memory starting at p[0]. | ||
400 | * Return the number of bytes read, or 0 on error. | ||
401 | * The value is stored in *v. */ | ||
402 | static int getVarint(const char *p, sqlite_int64 *v){ | ||
403 | const unsigned char *q = (const unsigned char *) p; | ||
404 | sqlite_uint64 x = 0, y = 1; | ||
405 | while( (*q & 0x80) == 0x80 ){ | ||
406 | x += y * (*q++ & 0x7f); | ||
407 | y <<= 7; | ||
408 | if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ | ||
409 | assert( 0 ); | ||
410 | return 0; | ||
411 | } | ||
412 | } | ||
413 | x += y * (*q++); | ||
414 | *v = (sqlite_int64) x; | ||
415 | return (int) (q - (unsigned char *)p); | ||
416 | } | ||
417 | |||
418 | static int getVarint32(const char *p, int *pi){ | ||
419 | sqlite_int64 i; | ||
420 | int ret = getVarint(p, &i); | ||
421 | *pi = (int) i; | ||
422 | assert( *pi==i ); | ||
423 | return ret; | ||
424 | } | ||
425 | |||
426 | /*******************************************************************/ | ||
427 | /* DataBuffer is used to collect data into a buffer in piecemeal | ||
428 | ** fashion. It implements the usual distinction between amount of | ||
429 | ** data currently stored (nData) and buffer capacity (nCapacity). | ||
430 | ** | ||
431 | ** dataBufferInit - create a buffer with given initial capacity. | ||
432 | ** dataBufferReset - forget buffer's data, retaining capacity. | ||
433 | ** dataBufferDestroy - free buffer's data. | ||
434 | ** dataBufferExpand - expand capacity without adding data. | ||
435 | ** dataBufferAppend - append data. | ||
436 | ** dataBufferAppend2 - append two pieces of data at once. | ||
437 | ** dataBufferReplace - replace buffer's data. | ||
438 | */ | ||
439 | typedef struct DataBuffer { | ||
440 | char *pData; /* Pointer to malloc'ed buffer. */ | ||
441 | int nCapacity; /* Size of pData buffer. */ | ||
442 | int nData; /* End of data loaded into pData. */ | ||
443 | } DataBuffer; | ||
444 | |||
445 | static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){ | ||
446 | assert( nCapacity>=0 ); | ||
447 | pBuffer->nData = 0; | ||
448 | pBuffer->nCapacity = nCapacity; | ||
449 | pBuffer->pData = nCapacity==0 ? NULL : malloc(nCapacity); | ||
450 | } | ||
451 | static void dataBufferReset(DataBuffer *pBuffer){ | ||
452 | pBuffer->nData = 0; | ||
453 | } | ||
454 | static void dataBufferDestroy(DataBuffer *pBuffer){ | ||
455 | if( pBuffer->pData!=NULL ) free(pBuffer->pData); | ||
456 | SCRAMBLE(pBuffer); | ||
457 | } | ||
/* Ensure the buffer has room for nAddCapacity additional bytes beyond
** the data already stored.  Does not change nData.
** NOTE(review): the realloc() result is assigned straight back to
** pData without a check (leaks/crashes on OOM), and
** nData+nAddCapacity could overflow int for very large buffers --
** confirm this matches the file's overall OOM policy.
*/
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
469 | static void dataBufferAppend(DataBuffer *pBuffer, | ||
470 | const char *pSource, int nSource){ | ||
471 | assert( nSource>0 && pSource!=NULL ); | ||
472 | dataBufferExpand(pBuffer, nSource); | ||
473 | memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource); | ||
474 | pBuffer->nData += nSource; | ||
475 | } | ||
476 | static void dataBufferAppend2(DataBuffer *pBuffer, | ||
477 | const char *pSource1, int nSource1, | ||
478 | const char *pSource2, int nSource2){ | ||
479 | assert( nSource1>0 && pSource1!=NULL ); | ||
480 | assert( nSource2>0 && pSource2!=NULL ); | ||
481 | dataBufferExpand(pBuffer, nSource1+nSource2); | ||
482 | memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1); | ||
483 | memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2); | ||
484 | pBuffer->nData += nSource1+nSource2; | ||
485 | } | ||
486 | static void dataBufferReplace(DataBuffer *pBuffer, | ||
487 | const char *pSource, int nSource){ | ||
488 | dataBufferReset(pBuffer); | ||
489 | dataBufferAppend(pBuffer, pSource, nSource); | ||
490 | } | ||
491 | |||
/* StringBuffer is a null-terminated version of DataBuffer.  The
** wrapped buffer always contains the string's bytes plus a trailing
** NUL, so b.nData is string length + 1.
*/
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;
496 | |||
497 | static void initStringBuffer(StringBuffer *sb){ | ||
498 | dataBufferInit(&sb->b, 100); | ||
499 | dataBufferReplace(&sb->b, "", 1); | ||
500 | } | ||
501 | static int stringBufferLength(StringBuffer *sb){ | ||
502 | return sb->b.nData-1; | ||
503 | } | ||
504 | static char *stringBufferData(StringBuffer *sb){ | ||
505 | return sb->b.pData; | ||
506 | } | ||
507 | static void stringBufferDestroy(StringBuffer *sb){ | ||
508 | dataBufferDestroy(&sb->b); | ||
509 | } | ||
510 | |||
511 | static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){ | ||
512 | assert( sb->b.nData>0 ); | ||
513 | if( nFrom>0 ){ | ||
514 | sb->b.nData--; | ||
515 | dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1); | ||
516 | } | ||
517 | } | ||
518 | static void append(StringBuffer *sb, const char *zFrom){ | ||
519 | nappend(sb, zFrom, strlen(zFrom)); | ||
520 | } | ||
521 | |||
522 | /* Append a list of strings separated by commas. */ | ||
523 | static void appendList(StringBuffer *sb, int nString, char **azString){ | ||
524 | int i; | ||
525 | for(i=0; i<nString; ++i){ | ||
526 | if( i>0 ) append(sb, ", "); | ||
527 | append(sb, azString[i]); | ||
528 | } | ||
529 | } | ||
530 | |||
531 | static int endsInWhiteSpace(StringBuffer *p){ | ||
532 | return stringBufferLength(p)>0 && | ||
533 | safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]); | ||
534 | } | ||
535 | |||
536 | /* If the StringBuffer ends in something other than white space, add a | ||
537 | ** single space character to the end. | ||
538 | */ | ||
539 | static void appendWhiteSpace(StringBuffer *p){ | ||
540 | if( stringBufferLength(p)==0 ) return; | ||
541 | if( !endsInWhiteSpace(p) ) append(p, " "); | ||
542 | } | ||
543 | |||
544 | /* Remove white space from the end of the StringBuffer */ | ||
545 | static void trimWhiteSpace(StringBuffer *p){ | ||
546 | while( endsInWhiteSpace(p) ){ | ||
547 | p->b.pData[--p->b.nData-1] = '\0'; | ||
548 | } | ||
549 | } | ||
550 | |||
/*******************************************************************/
/* DLReader is used to read document elements from a doclist.  The
** current docid is cached, so dlrDocid() is fast.  DLReader does not
** own the doclist buffer.
**
** dlrAtEnd - true if there's no more data to read.
** dlrDocid - docid of current document.
** dlrDocData - doclist data for current document (including docid).
** dlrDocDataBytes - length of same.
** dlrAllDataBytes - length of all remaining data.
** dlrPosData - position data for current document.
** dlrPosDataLen - length of pos data for current document (incl POS_END).
** dlrStep - step to current document.
** dlrInit - initialize for doclist of given type against given data.
** dlrDestroy - clean up.
**
** Expected usage is something like:
**
**   DLReader reader;
**   dlrInit(&reader, pData, nData);
**   while( !dlrAtEnd(&reader) ){
**     // calls to dlrDocid() and kin.
**     dlrStep(&reader);
**   }
**   dlrDestroy(&reader);
*/
typedef struct DLReader {
  DocListType iType;       /* Doclist layout (docids/positions/offsets). */
  const char *pData;       /* Current element's data (not owned). */
  int nData;               /* Bytes remaining, starting at pData. */

  sqlite_int64 iDocid;     /* Current docid (accumulated from deltas). */
  int nElement;            /* Byte length of the current element. */
} DLReader;
585 | |||
586 | static int dlrAtEnd(DLReader *pReader){ | ||
587 | assert( pReader->nData>=0 ); | ||
588 | return pReader->nData==0; | ||
589 | } | ||
590 | static sqlite_int64 dlrDocid(DLReader *pReader){ | ||
591 | assert( !dlrAtEnd(pReader) ); | ||
592 | return pReader->iDocid; | ||
593 | } | ||
594 | static const char *dlrDocData(DLReader *pReader){ | ||
595 | assert( !dlrAtEnd(pReader) ); | ||
596 | return pReader->pData; | ||
597 | } | ||
598 | static int dlrDocDataBytes(DLReader *pReader){ | ||
599 | assert( !dlrAtEnd(pReader) ); | ||
600 | return pReader->nElement; | ||
601 | } | ||
602 | static int dlrAllDataBytes(DLReader *pReader){ | ||
603 | assert( !dlrAtEnd(pReader) ); | ||
604 | return pReader->nData; | ||
605 | } | ||
606 | /* TODO(shess) Consider adding a field to track iDocid varint length | ||
607 | ** to make these two functions faster. This might matter (a tiny bit) | ||
608 | ** for queries. | ||
609 | */ | ||
610 | static const char *dlrPosData(DLReader *pReader){ | ||
611 | sqlite_int64 iDummy; | ||
612 | int n = getVarint(pReader->pData, &iDummy); | ||
613 | assert( !dlrAtEnd(pReader) ); | ||
614 | return pReader->pData+n; | ||
615 | } | ||
616 | static int dlrPosDataLen(DLReader *pReader){ | ||
617 | sqlite_int64 iDummy; | ||
618 | int n = getVarint(pReader->pData, &iDummy); | ||
619 | assert( !dlrAtEnd(pReader) ); | ||
620 | return pReader->nElement-n; | ||
621 | } | ||
/* Advance the reader past the current element and decode the next
** one, if any: accumulate the docid delta into iDocid and measure the
** element's byte length (docid varint plus, for position-bearing
** doclists, everything through the POS_END marker) into nElement.
*/
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      /* Scan forward to POS_END to find the element's full length. */
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          /* POS_COLUMN is followed by a column-number varint. */
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          /* Offset doclists carry start- and end-offset varints. */
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
655 | static void dlrInit(DLReader *pReader, DocListType iType, | ||
656 | const char *pData, int nData){ | ||
657 | assert( pData!=NULL && nData!=0 ); | ||
658 | pReader->iType = iType; | ||
659 | pReader->pData = pData; | ||
660 | pReader->nData = nData; | ||
661 | pReader->nElement = 0; | ||
662 | pReader->iDocid = 0; | ||
663 | |||
664 | /* Load the first element's data. There must be a first element. */ | ||
665 | dlrStep(pReader); | ||
666 | } | ||
667 | static void dlrDestroy(DLReader *pReader){ | ||
668 | SCRAMBLE(pReader); | ||
669 | } | ||
670 | |||
#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it's convenient in other assertions for
** DLWriter.  Debug builds only; release builds compile the
** ASSERT_VALID_DOCLIST macro to a no-op.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    /* Docids are delta-encoded; accumulate to recover the last one. */
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      /* Walk the position data through its POS_END terminator. */
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif
710 | |||
/*******************************************************************/
/* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
** always appends to the buffer and does not own it.
**
** dlwInit - initialize to write a given type doclist to a buffer.
** dlwDestroy - clear the writer's memory.  Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwCopy - copy next doclist from reader to writer.
** dlwAdd - construct doclist element and append to buffer.
**    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
  DocListType iType;        /* Layout of the doclist being written. */
  DataBuffer *b;            /* Output buffer (not owned). */
  sqlite_int64 iPrevDocid;  /* Last docid written; the next docid is
                            ** delta-encoded against this value. */
#ifndef NDEBUG
  int has_iPrevDocid;       /* True once a docid has been written; used
                            ** only to assert ascending docid order. */
#endif
} DLWriter;
730 | |||
731 | static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){ | ||
732 | pWriter->b = b; | ||
733 | pWriter->iType = iType; | ||
734 | pWriter->iPrevDocid = 0; | ||
735 | #ifndef NDEBUG | ||
736 | pWriter->has_iPrevDocid = 0; | ||
737 | #endif | ||
738 | } | ||
739 | static void dlwDestroy(DLWriter *pWriter){ | ||
740 | SCRAMBLE(pWriter); | ||
741 | } | ||
/* Append a run of raw doclist data, re-encoding only its first docid.
**
** iFirstDocid is the first docid in the doclist in pData.  It is
** needed because pData may point within a larger doclist, in which
** case the first item would be delta-encoded.
**
** iLastDocid is the final docid in the doclist in pData.  It is
** needed to create the new iPrevDocid for future delta-encoding.  The
** code could decode the passed doclist to recreate iLastDocid, but
** the only current user (docListMerge) already has decoded this
** information.
*/
/* TODO(shess) This has become just a helper for docListMerge.
** Consider a refactor to make this cleaner.
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
788 | static void dlwCopy(DLWriter *pWriter, DLReader *pReader){ | ||
789 | dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader), | ||
790 | dlrDocid(pReader), dlrDocid(pReader)); | ||
791 | } | ||
792 | static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){ | ||
793 | char c[VARINT_MAX]; | ||
794 | int n = putVarint(c, iDocid-pWriter->iPrevDocid); | ||
795 | |||
796 | /* Docids must ascend. */ | ||
797 | assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid ); | ||
798 | assert( pWriter->iType==DL_DOCIDS ); | ||
799 | |||
800 | dataBufferAppend(pWriter->b, c, n); | ||
801 | pWriter->iPrevDocid = iDocid; | ||
802 | #ifndef NDEBUG | ||
803 | pWriter->has_iPrevDocid = 1; | ||
804 | #endif | ||
805 | } | ||
806 | |||
/*******************************************************************/
/* PLReader is used to read data from a document's position list.  As
** the caller steps through the list, data is cached so that varints
** only need to be decoded once.
**
** plrInit, plrDestroy - create/destroy a reader.
** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
** plrAtEnd - at end of stream, only call plrDestroy once true.
** plrStep - step to the next element.
*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;       /* Not owned by the reader. */
  int nData;

  DocListType iType;       /* Whether offsets are present in the data. */
  int iColumn;             /* the last column read */
  int iPosition;           /* the last position read */
  int iStartOffset;        /* the last start offset read */
  int iEndOffset;          /* the last end offset read */
} PLReader;
831 | |||
832 | static int plrAtEnd(PLReader *pReader){ | ||
833 | return pReader->pData==NULL; | ||
834 | } | ||
835 | static int plrColumn(PLReader *pReader){ | ||
836 | assert( !plrAtEnd(pReader) ); | ||
837 | return pReader->iColumn; | ||
838 | } | ||
839 | static int plrPosition(PLReader *pReader){ | ||
840 | assert( !plrAtEnd(pReader) ); | ||
841 | return pReader->iPosition; | ||
842 | } | ||
843 | static int plrStartOffset(PLReader *pReader){ | ||
844 | assert( !plrAtEnd(pReader) ); | ||
845 | return pReader->iStartOffset; | ||
846 | } | ||
847 | static int plrEndOffset(PLReader *pReader){ | ||
848 | assert( !plrAtEnd(pReader) ); | ||
849 | return pReader->iEndOffset; | ||
850 | } | ||
/* Advance to the next element of the position list, decoding its
** varints into the cached iColumn/iPosition/iStartOffset/iEndOffset
** fields.  Signals end-of-stream by setting pData to NULL.
*/
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  /* nData==0 means the previous element was the last one. */
  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  /* A column marker resets the position/offset deltas for the new
  ** column, then the real position varint follows.
  */
  if( i==POS_COLUMN ){
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  /* Positions are delta-encoded and biased by POS_BASE. */
  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    /* Start offset is a delta from the previous start offset; end
    ** offset is a delta from this element's start offset.
    */
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}
888 | |||
889 | static void plrInit(PLReader *pReader, DLReader *pDLReader){ | ||
890 | pReader->pData = dlrPosData(pDLReader); | ||
891 | pReader->nData = dlrPosDataLen(pDLReader); | ||
892 | pReader->iType = pDLReader->iType; | ||
893 | pReader->iColumn = 0; | ||
894 | pReader->iPosition = 0; | ||
895 | pReader->iStartOffset = 0; | ||
896 | pReader->iEndOffset = 0; | ||
897 | plrStep(pReader); | ||
898 | } | ||
899 | static void plrDestroy(PLReader *pReader){ | ||
900 | SCRAMBLE(pReader); | ||
901 | } | ||
902 | |||
/*******************************************************************/
/* PLWriter is used in constructing a document's position list.  As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
** plwDestroy - clear a writer.
** plwAdd - append position and offset information.
** plwCopy - copy next position's data from reader to writer.
** plwTerminate - add any necessary doclist terminator.
**
** Calling plwAdd() after plwTerminate() may result in a corrupt
** doclist.
*/
/* TODO(shess) Until we've written the second item, we can cache the
** first item's information.  Then we'd have three states:
**
** - initialized with docid, no positions.
** - docid and one position.
** - docid and multiple positions.
**
** Only the last state needs to actually write to dlw->b, which would
** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
  DLWriter *dlw;           /* Destination writer (not owned). */

  int iColumn;             /* the last column written */
  int iPos;                /* the last position written */
  int iOffset;             /* the last start offset written */
} PLWriter;
934 | |||
/* Append one occurrence (column, position, and optionally offsets) to
** the poslist under construction.  All values are delta-encoded
** against the previous call, as the asserts below enforce, so
** positions within a column must be supplied in ascending order.
** No-op when the doclist type is DL_DOCIDS.
*/
/* TODO(shess) In the case where the parent is reading these values
** from a PLReader, we could optimize to a copy if that PLReader has
** the same type as pWriter.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  /* A column change emits a POS_COLUMN marker and resets the
  ** position/offset delta state for the new column.
  */
  if( iColumn!=pWriter->iColumn ){
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
971 | static void plwCopy(PLWriter *pWriter, PLReader *pReader){ | ||
972 | plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader), | ||
973 | plrStartOffset(pReader), plrEndOffset(pReader)); | ||
974 | } | ||
/* Begin writing poslist data for docid iDocid through dlw.  The
** (delta-encoded) docid varint is written to dlw's buffer
** immediately, and the column/position/offset delta state is reset.
*/
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
/* Close out the current document's poslist by appending POS_END (for
** doclist types that carry position data).  In debug builds the
** writer is marked so that a later plwAdd() trips an assert.
*/
/* TODO(shess) Should plwDestroy() also terminate the doclist?  But
** then plwDestroy() would no longer be just a destructor, it would
** also be doing work, which isn't consistent with the overall idiom.
** Another option would be for plwAdd() to always append any necessary
** terminator, so that the output is always correct.  But that would
** add incremental work to the common case with the only benefit being
** API elegance.  Punt for now.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
1013 | static void plwDestroy(PLWriter *pWriter){ | ||
1014 | SCRAMBLE(pWriter); | ||
1015 | } | ||
1016 | |||
/*******************************************************************/
/* DLCollector wraps PLWriter and DLWriter to provide a
** dynamically-allocated doclist area to use during tokenization.
**
** dlcNew - malloc up and initialize a collector.
** dlcDelete - destroy a collector and all contained items.
** dlcAddPos - append position and offset information.
** dlcAddDoclist - add the collected doclist to the given buffer.
** dlcNext - terminate the current document and open another.
*/
typedef struct DLCollector {
  DataBuffer b;            /* Owned storage the writers append into. */
  DLWriter dlw;            /* Doclist writer targeting b. */
  PLWriter plw;            /* Poslist writer layered on dlw. */
} DLCollector;
1032 | |||
1033 | /* TODO(shess) This could also be done by calling plwTerminate() and | ||
1034 | ** dataBufferAppend(). I tried that, expecting nominal performance | ||
1035 | ** differences, but it seemed to pretty reliably be worth 1% to code | ||
1036 | ** it this way. I suspect it's the incremental malloc overhead (some | ||
1037 | ** percentage of the plwTerminate() calls will cause a realloc), so | ||
1038 | ** this might be worth revisiting if the DataBuffer implementation | ||
1039 | ** changes. | ||
1040 | */ | ||
1041 | static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){ | ||
1042 | if( pCollector->dlw.iType>DL_DOCIDS ){ | ||
1043 | char c[VARINT_MAX]; | ||
1044 | int n = putVarint(c, POS_END); | ||
1045 | dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n); | ||
1046 | }else{ | ||
1047 | dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData); | ||
1048 | } | ||
1049 | } | ||
1050 | static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){ | ||
1051 | plwTerminate(&pCollector->plw); | ||
1052 | plwDestroy(&pCollector->plw); | ||
1053 | plwInit(&pCollector->plw, &pCollector->dlw, iDocid); | ||
1054 | } | ||
1055 | static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos, | ||
1056 | int iStartOffset, int iEndOffset){ | ||
1057 | plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset); | ||
1058 | } | ||
1059 | |||
1060 | static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){ | ||
1061 | DLCollector *pCollector = malloc(sizeof(DLCollector)); | ||
1062 | dataBufferInit(&pCollector->b, 0); | ||
1063 | dlwInit(&pCollector->dlw, iType, &pCollector->b); | ||
1064 | plwInit(&pCollector->plw, &pCollector->dlw, iDocid); | ||
1065 | return pCollector; | ||
1066 | } | ||
1067 | static void dlcDelete(DLCollector *pCollector){ | ||
1068 | plwDestroy(&pCollector->plw); | ||
1069 | dlwDestroy(&pCollector->dlw); | ||
1070 | dataBufferDestroy(&pCollector->b); | ||
1071 | SCRAMBLE(pCollector); | ||
1072 | free(pCollector); | ||
1073 | } | ||
1074 | |||
1075 | |||
/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go.  Only columns matching iColumn are
** copied, all columns copied if iColumn is -1.  Elements with no
** matching columns are dropped.  The output is an iOutType doclist.
*/
/* NOTE(shess) This code is only valid after all doclists are merged.
** If this is run before merges, then doclist items which represent
** deletion will be trimmed, and will thus not effect a deletion
** during the merge.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  /* Trimming can only drop information, never add it. */
  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;       /* plWriter is initialized lazily, on first match. */

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        if( !match ){
          /* First matching position: emit the document's docid. */
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}
1125 | |||
/* Used by docListMerge() to keep doclists in the ascending order by
** docid, then ascending order by age (so the newest comes first).
*/
typedef struct OrderedDLReader {
  DLReader *pReader;       /* Underlying reader (not owned). */

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;                 /* Position in the original array; higher
                           ** idx means newer data. */
} OrderedDLReader;
1138 | |||
1139 | /* Order eof to end, then by docid asc, idx desc. */ | ||
1140 | static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){ | ||
1141 | if( dlrAtEnd(r1->pReader) ){ | ||
1142 | if( dlrAtEnd(r2->pReader) ) return 0; /* Both atEnd(). */ | ||
1143 | return 1; /* Only r1 atEnd(). */ | ||
1144 | } | ||
1145 | if( dlrAtEnd(r2->pReader) ) return -1; /* Only r2 atEnd(). */ | ||
1146 | |||
1147 | if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1; | ||
1148 | if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1; | ||
1149 | |||
1150 | /* Descending on idx. */ | ||
1151 | return r2->idx-r1->idx; | ||
1152 | } | ||
1153 | |||
1154 | /* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that | ||
1155 | ** p[1..n-1] is already sorted. | ||
1156 | */ | ||
1157 | /* TODO(shess) Is this frequent enough to warrant a binary search? | ||
1158 | ** Before implementing that, instrument the code to check. In most | ||
1159 | ** current usage, I expect that p[0] will be less than p[1] a very | ||
1160 | ** high proportion of the time. | ||
1161 | */ | ||
1162 | static void orderedDLReaderReorder(OrderedDLReader *p, int n){ | ||
1163 | while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){ | ||
1164 | OrderedDLReader tmp = p[0]; | ||
1165 | p[0] = p[1]; | ||
1166 | p[1] = tmp; | ||
1167 | n--; | ||
1168 | p++; | ||
1169 | } | ||
1170 | } | ||
1171 | |||
/* Given an array of doclist readers, merge their doclist elements
** into out in sorted order (by docid), dropping elements from older
** readers when there is a duplicate docid.  pReaders is assumed to be
** ordered by age, oldest first.
*/
/* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
** be fixed.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  /* pStart/nStart accumulate a contiguous run of elements from the
  ** newest reader so that it can be flushed with a single append.
  */
  const char *pStart = 0;
  int nStart = 0;
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    /* Single input: its data can be copied through verbatim. */
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  /* readers[0] always holds the smallest docid (newest on ties). */
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
1251 | |||
1252 | /* Helper function for posListUnion(). Compares the current position | ||
1253 | ** between left and right, returning as standard C idiom of <0 if | ||
1254 | ** left<right, >0 if left>right, and 0 if left==right. "End" always | ||
1255 | ** compares greater. | ||
1256 | */ | ||
1257 | static int posListCmp(PLReader *pLeft, PLReader *pRight){ | ||
1258 | assert( pLeft->iType==pRight->iType ); | ||
1259 | if( pLeft->iType==DL_DOCIDS ) return 0; | ||
1260 | |||
1261 | if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1; | ||
1262 | if( plrAtEnd(pRight) ) return -1; | ||
1263 | |||
1264 | if( plrColumn(pLeft)<plrColumn(pRight) ) return -1; | ||
1265 | if( plrColumn(pLeft)>plrColumn(pRight) ) return 1; | ||
1266 | |||
1267 | if( plrPosition(pLeft)<plrPosition(pRight) ) return -1; | ||
1268 | if( plrPosition(pLeft)>plrPosition(pRight) ) return 1; | ||
1269 | if( pLeft->iType==DL_POSITIONS ) return 0; | ||
1270 | |||
1271 | if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1; | ||
1272 | if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1; | ||
1273 | |||
1274 | if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1; | ||
1275 | if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1; | ||
1276 | |||
1277 | return 0; | ||
1278 | } | ||
1279 | |||
/* Write the union of position lists in pLeft and pRight to pOut.
** "Union" in this case meaning "All unique position tuples". Should
** work with any doclist type, though both inputs and the output
** should be the same type.
*/
static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;

  /* Both readers must be positioned on the same document. */
  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pLeft->iType==pRight->iType );
  assert( pLeft->iType==pOut->iType );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);
  plwInit(&writer, pOut, dlrDocid(pLeft));

  /* Classic sorted-merge: emit the smaller tuple, or a single copy
  ** when both sides carry an identical tuple (deduplication).
  */
  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
    int c = posListCmp(&left, &right);
    if( c<0 ){
      plwCopy(&writer, &left);
      plrStep(&left);
    }else if( c>0 ){
      plwCopy(&writer, &right);
      plrStep(&right);
    }else{
      plwCopy(&writer, &left);
      plrStep(&left);
      plrStep(&right);
    }
  }

  plwTerminate(&writer);
  plwDestroy(&writer);
  plrDestroy(&left);
  plrDestroy(&right);
}
1317 | |||
/* Write the union of doclists in pLeft and pRight to pOut. For
** docids in common between the inputs, the union of the position
** lists is written. Inputs and outputs are always type DL_DEFAULT.
*/
static void docListUnion(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* An empty side contributes nothing; copy the other side verbatim. */
  if( nLeft==0 ){
    dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
  dlrInit(&right, DL_DEFAULT, pRight, nRight);
  dlwInit(&writer, DL_DEFAULT, pOut);

  /* Sorted-merge by docid; equal docids merge their position lists. */
  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrDocid(&left)>dlrDocid(&right) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else{
      posListUnion(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1367 | |||
/* pLeft and pRight are DLReaders positioned to the same docid.
**
** If there are no instances in pLeft or pRight where the position
** of pLeft is one less than the position of pRight, then this
** routine adds nothing to pOut.
**
** If there are one or more instances where positions from pLeft
** are exactly one less than positions from pRight, then add a new
** document record to pOut. If pOut wants to hold positions, then
** include the positions from pRight that are one more than a
** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
*/
static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;     /* Only initialized once the first match is found. */
  int match = 0;       /* True once writer has been initialized. */

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pOut->iType!=DL_POSITIONS_OFFSETS );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);

  /* Walk both sorted position lists, advancing whichever side is
  ** behind; a hit is same column with right position == left+1.
  */
  while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
    if( plrColumn(&left)<plrColumn(&right) ){
      plrStep(&left);
    }else if( plrColumn(&left)>plrColumn(&right) ){
      plrStep(&right);
    }else if( plrPosition(&left)+1<plrPosition(&right) ){
      plrStep(&left);
    }else if( plrPosition(&left)+1>plrPosition(&right) ){
      plrStep(&right);
    }else{
      if( !match ){
        /* Lazily start the document record so that docids with no
        ** adjacent pairs contribute nothing to pOut.
        */
        plwInit(&writer, pOut, dlrDocid(pLeft));
        match = 1;
      }
      plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
      plrStep(&left);
      plrStep(&right);
    }
  }

  if( match ){
    plwTerminate(&writer);
    plwDestroy(&writer);
  }

  plrDestroy(&left);
  plrDestroy(&right);
}
1420 | |||
/* We have two doclists with positions: pLeft and pRight.
** Write the phrase intersection of these two doclists into pOut.
**
** A phrase intersection means that two documents only match
** if pLeft.iPos+1==pRight.iPos.
**
** iType controls the type of data written to pOut. If iType is
** DL_POSITIONS, the positions are those from pRight.
*/
static void docListPhraseMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DocListType iType,
  DataBuffer *pOut /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* Either side empty means no document can contain the phrase. */
  if( nLeft==0 || nRight==0 ) return;

  assert( iType!=DL_POSITIONS_OFFSETS );

  dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
  dlrInit(&right, DL_POSITIONS, pRight, nRight);
  dlwInit(&writer, iType, pOut);

  /* Intersect by docid; matching docids get their positions checked
  ** for adjacency by posListPhraseMerge().
  */
  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      posListPhraseMerge(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1463 | |||
1464 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1465 | ** Write the intersection of these two doclists into pOut as a | ||
1466 | ** DL_DOCIDS doclist. | ||
1467 | */ | ||
1468 | static void docListAndMerge( | ||
1469 | const char *pLeft, int nLeft, | ||
1470 | const char *pRight, int nRight, | ||
1471 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1472 | ){ | ||
1473 | DLReader left, right; | ||
1474 | DLWriter writer; | ||
1475 | |||
1476 | if( nLeft==0 || nRight==0 ) return; | ||
1477 | |||
1478 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); | ||
1479 | dlrInit(&right, DL_DOCIDS, pRight, nRight); | ||
1480 | dlwInit(&writer, DL_DOCIDS, pOut); | ||
1481 | |||
1482 | while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ | ||
1483 | if( dlrDocid(&left)<dlrDocid(&right) ){ | ||
1484 | dlrStep(&left); | ||
1485 | }else if( dlrDocid(&right)<dlrDocid(&left) ){ | ||
1486 | dlrStep(&right); | ||
1487 | }else{ | ||
1488 | dlwAdd(&writer, dlrDocid(&left)); | ||
1489 | dlrStep(&left); | ||
1490 | dlrStep(&right); | ||
1491 | } | ||
1492 | } | ||
1493 | |||
1494 | dlrDestroy(&left); | ||
1495 | dlrDestroy(&right); | ||
1496 | dlwDestroy(&writer); | ||
1497 | } | ||
1498 | |||
/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the union of these two doclists into pOut as a
** DL_DOCIDS doclist.
*/
static void docListOrMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* An empty side contributes nothing; copy the other side verbatim. */
  if( nLeft==0 ){
    dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  /* Sorted-merge by docid; equal docids are emitted exactly once. */
  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else{
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1548 | |||
1549 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1550 | ** Write into pOut as DL_DOCIDS doclist containing all documents that | ||
1551 | ** occur in pLeft but not in pRight. | ||
1552 | */ | ||
1553 | static void docListExceptMerge( | ||
1554 | const char *pLeft, int nLeft, | ||
1555 | const char *pRight, int nRight, | ||
1556 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1557 | ){ | ||
1558 | DLReader left, right; | ||
1559 | DLWriter writer; | ||
1560 | |||
1561 | if( nLeft==0 ) return; | ||
1562 | if( nRight==0 ){ | ||
1563 | dataBufferAppend(pOut, pLeft, nLeft); | ||
1564 | return; | ||
1565 | } | ||
1566 | |||
1567 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); | ||
1568 | dlrInit(&right, DL_DOCIDS, pRight, nRight); | ||
1569 | dlwInit(&writer, DL_DOCIDS, pOut); | ||
1570 | |||
1571 | while( !dlrAtEnd(&left) ){ | ||
1572 | while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){ | ||
1573 | dlrStep(&right); | ||
1574 | } | ||
1575 | if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){ | ||
1576 | dlwAdd(&writer, dlrDocid(&left)); | ||
1577 | } | ||
1578 | dlrStep(&left); | ||
1579 | } | ||
1580 | |||
1581 | dlrDestroy(&left); | ||
1582 | dlrDestroy(&right); | ||
1583 | dlwDestroy(&writer); | ||
1584 | } | ||
1585 | |||
/* Return a malloced copy of the first n bytes of s with a '\0'
** terminator appended. Returns NULL if the allocation fails. The
** caller must free() the result.
*/
static char *string_dup_n(const char *s, int n){
  char *str = malloc(n + 1);
  /* Propagate OOM to the caller instead of invoking undefined
  ** behavior via memcpy() through a NULL pointer.
  */
  if( str==NULL ) return NULL;
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
1592 | |||
/* Duplicate a NUL-terminated string; the caller must free() the returned
 * string.
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
static char *string_dup(const char *s){
  /* NOTE(review): strlen()'s size_t result is narrowed to int here;
   * assumes strings shorter than INT_MAX bytes — confirm callers. */
  return string_dup_n(s, strlen(s));
}
1599 | |||
/* Format a string, replacing each occurrence of the % character with
 * zDb.zName. This may be more convenient than sqlite_mprintf()
 * when one string is used repeatedly in a format string.
 * The caller must free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nSub = nDb + 1 + nName;   /* bytes emitted for each '%': "zDb.zName" */
  size_t nOut = 1;                 /* start at 1 for the NUL terminator */
  const char *pIn;
  char *zResult;
  char *pOut;

  /* First pass: measure the output. */
  for(pIn=zFormat; *pIn; ++pIn){
    nOut += (*pIn=='%') ? nSub : 1;
  }

  /* Second pass: render into the freshly-sized buffer. */
  zResult = malloc(nOut);
  pOut = zResult;
  for(pIn=zFormat; *pIn; ++pIn){
    if( *pIn!='%' ){
      *pOut++ = *pIn;
    }else{
      memcpy(pOut, zDb, nDb);
      pOut += nDb;
      *pOut++ = '.';
      memcpy(pOut, zName, nName);
      pOut += nName;
    }
  }
  *pOut++ = '\0';
  assert( pOut == zResult + nOut );
  return zResult;
}
1636 | |||
/* Execute zFormat against db, after expanding each '%' in zFormat to
** "zDb.zName" (see string_format()). Result rows, if any, are
** discarded. Returns the SQLite error code from sqlite3_exec().
*/
static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
                    const char *zFormat){
  char *zCommand = string_format(zFormat, zDb, zName);
  int rc;
  TRACE(("FTS3 sql: %s\n", zCommand));
  rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
  free(zCommand);
  return rc;
}
1646 | |||
/* Prepare zFormat against db, after expanding each '%' in zFormat to
** "zDb.zName" (see string_format()). On success *ppStmt holds the
** prepared statement; the caller must eventually finalize it.
** Returns the SQLite error code from sqlite3_prepare_v2().
*/
static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
                       sqlite3_stmt **ppStmt, const char *zFormat){
  char *zCommand = string_format(zFormat, zDb, zName);
  int rc;
  TRACE(("FTS3 prepare: %s\n", zCommand));
  rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
  free(zCommand);
  return rc;
}
1656 | |||
/* end utility functions */

/* Forward reference */
typedef struct fulltext_vtab fulltext_vtab;

/* A single term in a query is represented by an instance of
** the following structure.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;  /* text of the term. '\000' terminated. malloced */
  int nTerm;   /* Number of bytes in pTerm[] */
} QueryTerm;
1675 | |||
1676 | |||
/* A query string is parsed into a Query structure.
 *
 * We could, in theory, allow query strings to be complicated
 * nested expressions with precedence determined by parentheses.
 * But none of the major search engines do this. (Perhaps the
 * feeling is that a parenthesized expression is too complex of
 * an idea for the average user to grasp.) Taking our lead from
 * the major search engines, we will allow queries to be a list
 * of terms (with an implied AND operator) or phrases in double-quotes,
 * with a single optional "-" before each non-phrase term to designate
 * negation and an optional OR connector.
 *
 * OR binds more tightly than the implied AND, which is what the
 * major search engines seem to do. So, for example:
 *
 *  [one two OR three] ==> one AND (two OR three)
 *  [one OR two three] ==> (one OR two) AND three
 *
 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
 * space. This is how the search engines do it.
 *
 * A NOT term cannot be the right-hand operand of an OR. If this
 * occurs in the query string, the NOT is ignored:
 *
 *  [one OR -two] ==> one OR two
 *
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms. Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;
1713 | |||
1714 | |||
/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;       /* Status flag for use while constructing snippets */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    short int nByte;     /* Number of bytes in the term */
    int iStart;          /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;
1734 | |||
1735 | |||
/* The kind of query plan selected in xBestIndex and executed by the
** cursor; stored in fulltext_cursor.iCursorType.
*/
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_DOCID,     /* lookup by docid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
1741 | |||
/* Indices of the precompiled statements cached in
** fulltext_vtab.pFulltextStatements[]; must stay in sync with
** fulltext_zStatement[] below.
*/
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,

  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,

  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_ALL_STMT,

  MAX_STMT /* Always at end! */
} fulltext_statement;
1761 | |||
/* SQL text for each cached statement; NULL entries are generated at
** runtime because they depend on the table's column list.
*/
/* These must exactly match the enum above. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g. if a query joins a virtual table to itself?
** If so perhaps we should move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ NULL,  /* generated in contentSelectStatement() */
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where docid = ?",

  /* BLOCK_INSERT */
  "insert into %_segments (blockid, block) values (null, ?)",
  /* BLOCK_SELECT */ "select block from %_segments where blockid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where blockid between ? and ?",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
  /* SEGDIR_SELECT_ALL */
  "select root, leaves_end_block from %_segdir order by level desc, idx",
};
1790 | |||
/*
** A connection to a fulltext index is an instance of the following
** structure. The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
** arguments.
*/
struct fulltext_vtab {
  sqlite3_vtab base;               /* Base class used by SQLite core */
  sqlite3 *db;                     /* The database connection */
  const char *zDb;                 /* logical database name */
  const char *zName;               /* virtual table name */
  int nColumn;                     /* number of columns in virtual table */
  char **azColumn;                 /* column names.  malloced */
  char **azContentColumn;          /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open. Indexed by the fulltext_statement enum.
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];

  /* Precompiled statements used for segment merges. We run a
  ** separate select across the leaf level of each tree being merged.
  */
  sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
  /* The statement used to prepare pLeafSelectStmts. */
#define LEAF_SELECT \
  "select block from %_segments where blockid between ? and ? order by blockid"

  /* These buffer pending index updates during transactions.
  ** nPendingData estimates the memory size of the pending data. It
  ** doesn't include the hash-bucket overhead, nor any malloc
  ** overhead. When nPendingData exceeds kPendingThreshold, the
  ** buffer is flushed even before the transaction closes.
  ** pendingTerms stores the data, and is only valid when nPendingData
  ** is >=0 (nPendingData<0 means pendingTerms has not been
  ** initialized). iPrevDocid is the last docid written, used to make
  ** certain we're inserting in sorted order.
  */
  int nPendingData;
#define kPendingThreshold (1*1024*1024)
  sqlite_int64 iPrevDocid;
  fts3Hash pendingTerms;
};
1836 | |||
/*
** When the core wants to do a query, it create a cursor using a
** call to xOpen. This structure is an instance of a cursor. It
** is destroyed by xClose.
*/
typedef struct fulltext_cursor {
  sqlite3_vtab_cursor base;        /* Base class used by SQLite core */
  QueryType iCursorType;           /* Copy of sqlite3_index_info.idxNum */
  sqlite3_stmt *pStmt;             /* Prepared statement in use by the cursor */
  int eof;                         /* True if at End Of Results */
  Query q;                         /* Parsed query string */
  Snippet snippet;                 /* Cached snippet for the current row */
  int iColumn;                     /* Column being searched */
  DataBuffer result;               /* Doclist results from fulltextQuery */
  DLReader reader;                 /* Result reader if result not empty */
} fulltext_cursor;
1853 | |||
1854 | static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ | ||
1855 | return (fulltext_vtab *) c->base.pVtab; | ||
1856 | } | ||
1857 | |||
1858 | static const sqlite3_module fts3Module; /* forward declaration */ | ||
1859 | |||
1860 | /* Return a dynamically generated statement of the form | ||
1861 | * insert into %_content (docid, ...) values (?, ...) | ||
1862 | */ | ||
1863 | static const char *contentInsertStatement(fulltext_vtab *v){ | ||
1864 | StringBuffer sb; | ||
1865 | int i; | ||
1866 | |||
1867 | initStringBuffer(&sb); | ||
1868 | append(&sb, "insert into %_content (docid, "); | ||
1869 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
1870 | append(&sb, ") values (?"); | ||
1871 | for(i=0; i<v->nColumn; ++i) | ||
1872 | append(&sb, ", ?"); | ||
1873 | append(&sb, ")"); | ||
1874 | return stringBufferData(&sb); | ||
1875 | } | ||
1876 | |||
1877 | /* Return a dynamically generated statement of the form | ||
1878 | * select <content columns> from %_content where docid = ? | ||
1879 | */ | ||
1880 | static const char *contentSelectStatement(fulltext_vtab *v){ | ||
1881 | StringBuffer sb; | ||
1882 | initStringBuffer(&sb); | ||
1883 | append(&sb, "SELECT "); | ||
1884 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
1885 | append(&sb, " FROM %_content WHERE docid = ?"); | ||
1886 | return stringBufferData(&sb); | ||
1887 | } | ||
1888 | |||
1889 | /* Return a dynamically generated statement of the form | ||
1890 | * update %_content set [col_0] = ?, [col_1] = ?, ... | ||
1891 | * where docid = ? | ||
1892 | */ | ||
1893 | static const char *contentUpdateStatement(fulltext_vtab *v){ | ||
1894 | StringBuffer sb; | ||
1895 | int i; | ||
1896 | |||
1897 | initStringBuffer(&sb); | ||
1898 | append(&sb, "update %_content set "); | ||
1899 | for(i=0; i<v->nColumn; ++i) { | ||
1900 | if( i>0 ){ | ||
1901 | append(&sb, ", "); | ||
1902 | } | ||
1903 | append(&sb, v->azContentColumn[i]); | ||
1904 | append(&sb, " = ?"); | ||
1905 | } | ||
1906 | append(&sb, " where docid = ?"); | ||
1907 | return stringBufferData(&sb); | ||
1908 | } | ||
1909 | |||
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
**
** Returns SQLITE_OK on success, otherwise the error from
** sqlite3_prepare_v2()/sqlite3_reset(). The returned statement stays
** owned by the vtab; the caller must not finalize it.
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    switch( iStmt ){
      /* The content statements depend on the column list, so their
      ** SQL is generated on demand; all others come from the static
      ** fulltext_zStatement[] table.
      */
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_SELECT_STMT:
        zStmt = contentSelectStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                     zStmt);
    /* Generated statements were malloced; static table entries must
    ** not be freed.
    */
    if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}
1942 | |||
1943 | /* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and | ||
1944 | ** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE, | ||
1945 | ** where we expect no results. | ||
1946 | */ | ||
1947 | static int sql_single_step(sqlite3_stmt *s){ | ||
1948 | int rc = sqlite3_step(s); | ||
1949 | return (rc==SQLITE_DONE) ? SQLITE_OK : rc; | ||
1950 | } | ||
1951 | |||
/* Like sql_get_statement(), but for special replicated LEAF_SELECT
** statements. Each merge participant gets its own cached statement
** slot (idx in [0, MERGE_COUNT)) so the selects can run concurrently.
*/
/* TODO(shess) Write version for generic statements and then share
** that between the cached-statement functions.
*/
static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
                                  sqlite3_stmt **ppStmt){
  assert( idx>=0 && idx<MERGE_COUNT );
  if( v->pLeafSelectStmts[idx]==NULL ){
    /* First use of this slot: prepare and cache. */
    int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
                         LEAF_SELECT);
    if( rc!=SQLITE_OK ) return rc;
  }else{
    /* Reuse the cached statement after resetting it. */
    int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pLeafSelectStmts[idx];
  return SQLITE_OK;
}
1973 | |||
/* insert into %_content (docid, ...) values ([docid], [pValues])
** If the docid contains SQL NULL, then a unique docid will be
** generated.
**
** pValues must hold v->nColumn values. Returns SQLITE_OK or the
** first bind/step error encountered.
*/
static int content_insert(fulltext_vtab *v, sqlite3_value *docid,
                          sqlite3_value **pValues){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* Parameter 1 is the docid; content columns follow at 2..nColumn+1. */
  rc = sqlite3_bind_value(s, 1, docid);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 2+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  return sql_single_step(s);
}
1995 | |||
/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
 * where docid = [iDocid]
 *
 * pValues must hold v->nColumn values. Returns SQLITE_OK or the
 * first bind/step error encountered. */
static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
                          sqlite_int64 iDocid){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* Content columns bind at 1..nColumn; the docid is the final parameter. */
  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 1+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  rc = sqlite3_bind_int64(s, 1+v->nColumn, iDocid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2015 | |||
/*
** Release an array of nString heap-allocated C strings together with
** the array itself (the layout produced by content_select()).
** Individual entries may be NULL.
*/
static void freeStringArray(int nString, const char **pString){
  int ii;

  for(ii=0; ii<nString; ii++){
    /* free() is a no-op on NULL, so NULL entries need no guard. */
    free((void *) pString[ii]);
  }
  free((void *) pString);
}
2024 | |||
2025 | /* select * from %_content where docid = [iDocid] | ||
2026 | * The caller must delete the returned array and all strings in it. | ||
2027 | * null fields will be NULL in the returned array. | ||
2028 | * | ||
2029 | * TODO: Perhaps we should return pointer/length strings here for consistency | ||
2030 | * with other code which uses pointer/length. */ | ||
2031 | static int content_select(fulltext_vtab *v, sqlite_int64 iDocid, | ||
2032 | const char ***pValues){ | ||
2033 | sqlite3_stmt *s; | ||
2034 | const char **values; | ||
2035 | int i; | ||
2036 | int rc; | ||
2037 | |||
2038 | *pValues = NULL; | ||
2039 | |||
2040 | rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); | ||
2041 | if( rc!=SQLITE_OK ) return rc; | ||
2042 | |||
2043 | rc = sqlite3_bind_int64(s, 1, iDocid); | ||
2044 | if( rc!=SQLITE_OK ) return rc; | ||
2045 | |||
2046 | rc = sqlite3_step(s); | ||
2047 | if( rc!=SQLITE_ROW ) return rc; | ||
2048 | |||
2049 | values = (const char **) malloc(v->nColumn * sizeof(const char *)); | ||
2050 | for(i=0; i<v->nColumn; ++i){ | ||
2051 | if( sqlite3_column_type(s, i)==SQLITE_NULL ){ | ||
2052 | values[i] = NULL; | ||
2053 | }else{ | ||
2054 | values[i] = string_dup((char*)sqlite3_column_text(s, i)); | ||
2055 | } | ||
2056 | } | ||
2057 | |||
2058 | /* We expect only one row. We must execute another sqlite3_step() | ||
2059 | * to complete the iteration; otherwise the table will remain locked. */ | ||
2060 | rc = sqlite3_step(s); | ||
2061 | if( rc==SQLITE_DONE ){ | ||
2062 | *pValues = values; | ||
2063 | return SQLITE_OK; | ||
2064 | } | ||
2065 | |||
2066 | freeStringArray(v->nColumn, values); | ||
2067 | return rc; | ||
2068 | } | ||
2069 | |||
/* delete from %_content where docid = [iDocid ]
** Returns SQLITE_OK on success, else a bind/step error code.
*/
static int content_delete(fulltext_vtab *v, sqlite_int64 iDocid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iDocid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2081 | |||
/* insert into %_segments values ([pData])
** returns assigned blockid in *piBlockid
**
** pData/nData is the raw segment-block blob; it is bound SQLITE_STATIC,
** so the buffer must stay valid until the step below completes.
*/
static int block_insert(fulltext_vtab *v, const char *pData, int nData,
                        sqlite_int64 *piBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* An INSERT should never return a row. */
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  /* blockid column is an alias for rowid. */
  *piBlockid = sqlite3_last_insert_rowid(v->db);
  return SQLITE_OK;
}
2102 | |||
/* delete from %_segments
** where blockid between [iStartBlockid] and [iEndBlockid]
**
** Deletes the range of blocks, inclusive, used to delete the blocks
** which form a segment.
*/
static int block_delete(fulltext_vtab *v,
                        sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 2, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2123 | |||
/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
** at iLevel.  Returns SQLITE_DONE if there are no segments at
** iLevel.  Otherwise returns an error.
**
** The statement is stepped to completion in every success path so
** that the underlying table is not left locked.
*/
static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* Should always get at least one row due to how max() works. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;
  if( rc!=SQLITE_ROW ) return rc;

  /* NULL means that there were no inputs to max(). */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* Drain the statement; report SQLITE_DONE ("no segments") unless a
    ** second row unexpectedly appears. */
    rc = sqlite3_step(s);
    if( rc==SQLITE_ROW ) return SQLITE_ERROR;
    return rc;
  }

  *pidx = sqlite3_column_int(s, 0);

  /* We expect only one row. We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2157 | |||
/* insert into %_segdir values (
**   [iLevel], [idx],
**   [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
**   [pRootData]
** )
**
** Records the directory entry for one segment.  pRootData is bound
** SQLITE_STATIC, so the buffer must outlive the step in
** sql_single_step().
*/
static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
                      sqlite_int64 iStartBlockid,
                      sqlite_int64 iLeavesEndBlockid,
                      sqlite_int64 iEndBlockid,
                      const char *pRootData, int nRootData){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 2, idx);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 3, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 5, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2193 | |||
/* Queries %_segdir for the block span of the segments in level
** iLevel.  Returns SQLITE_DONE if there are no blocks for iLevel,
** SQLITE_ROW if there are blocks, else an error.
**
** On SQLITE_ROW, *piStartBlockid/*piEndBlockid bound the blocks used
** by all segments at that level.
*/
static int segdir_span(fulltext_vtab *v, int iLevel,
                       sqlite_int64 *piStartBlockid,
                       sqlite_int64 *piEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
  if( rc!=SQLITE_ROW ) return rc;

  /* This happens if all segments at this level are entirely inline. */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* We expect only one row. We must execute another sqlite3_step()
     * to complete the iteration; otherwise the table will remain locked. */
    int rc2 = sqlite3_step(s);
    if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
    return rc2;
  }

  *piStartBlockid = sqlite3_column_int64(s, 0);
  *piEndBlockid = sqlite3_column_int64(s, 1);

  /* We expect only one row. We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2231 | |||
/* Delete the segment blocks and segment directory records for all
** segments at iLevel.
**
** First deletes the %_segments rows spanned by the level (if any),
** then the %_segdir rows themselves.
*/
static int segdir_delete(fulltext_vtab *v, int iLevel){
  sqlite3_stmt *s;
  sqlite_int64 iStartBlockid, iEndBlockid;
  int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
  /* SQLITE_DONE means the level has no out-of-line blocks; still
  ** proceed to delete the directory rows. */
  if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;

  if( rc==SQLITE_ROW ){
    rc = block_delete(v, iStartBlockid, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* Delete the segment directory itself. */
  rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
2255 | |||
2256 | /* TODO(shess) clearPendingTerms() is far down the file because | ||
2257 | ** writeZeroSegment() is far down the file because LeafWriter is far | ||
2258 | ** down the file. Consider refactoring the code to move the non-vtab | ||
2259 | ** code above the vtab code so that we don't need this forward | ||
2260 | ** reference. | ||
2261 | */ | ||
2262 | static int clearPendingTerms(fulltext_vtab *v); | ||
2263 | |||
/*
** Free the memory used to contain a fulltext_vtab structure.
** Finalizes every cached prepared statement, destroys the tokenizer
** via its module's xDestroy, discards pending terms, and releases
** the column-name arrays before freeing v itself.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
  int iStmt, i;

  TRACE(("FTS3 Destroy %p\n", v));
  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
    if( v->pFulltextStatements[iStmt]!=NULL ){
      sqlite3_finalize(v->pFulltextStatements[iStmt]);
      v->pFulltextStatements[iStmt] = NULL;
    }
  }

  for( i=0; i<MERGE_COUNT; i++ ){
    if( v->pLeafSelectStmts[i]!=NULL ){
      sqlite3_finalize(v->pLeafSelectStmts[i]);
      v->pLeafSelectStmts[i] = NULL;
    }
  }

  if( v->pTokenizer!=NULL ){
    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
    v->pTokenizer = NULL;
  }

  clearPendingTerms(v);

  /* azContentColumn entries came from sqlite3_mprintf() and need
  ** sqlite3_free(); azColumn and the arrays came from malloc(). */
  free(v->azColumn);
  for(i = 0; i < v->nColumn; ++i) {
    sqlite3_free(v->azContentColumn[i]);
  }
  free(v->azContentColumn);
  free(v);
}
2299 | |||
/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */

/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters,
** sqlite3IsIdChar[X] must be 1.
**
** Ticket #1066.  the SQL standard does not allow '$' in the
** middle of identfiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
/* Lookup table covering characters 0x20..0x7f; index with c-0x20. */
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* NOTE: evaluates C twice via the assignment to a local `c` that the
** caller must have in scope (see getToken()). */
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
2332 | |||
2333 | |||
/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
**
** Quoted strings ('', "", ``) honor doubled-quote escapes; the
** returned length includes the closing quote only if one was found
** (the `i + (c!=0)` below).  Anything that is not EOF, whitespace,
** a string, a [bracketed] id, or an identifier is a one-character
** TOKEN_PUNCT.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      /* Collapse a run of whitespace into a single TOKEN_SPACE. */
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;          /* Doubled delimiter: skip the escape */
          }else{
            break;        /* Closing delimiter found */
          }
        }
      }
      *tokenType = TOKEN_STRING;
      return i + (c!=0);
    }
    case '[': {
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;
      }
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  *tokenType = TOKEN_PUNCT;
  return 1;
}
2383 | |||
/*
** A token extracted from a string is an instance of the following
** structure.  The text is a slice of the original input, not a copy.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes. */
} Token;
2392 | |||
2393 | /* | ||
2394 | ** Given a input string (which is really one of the argv[] parameters | ||
2395 | ** passed into xConnect or xCreate) split the string up into tokens. | ||
2396 | ** Return an array of pointers to '\000' terminated strings, one string | ||
2397 | ** for each non-whitespace token. | ||
2398 | ** | ||
2399 | ** The returned array is terminated by a single NULL pointer. | ||
2400 | ** | ||
2401 | ** Space to hold the returned array is obtained from a single | ||
2402 | ** malloc and should be freed by passing the return value to free(). | ||
2403 | ** The individual strings within the token list are all a part of | ||
2404 | ** the single memory allocation and will all be freed at once. | ||
2405 | */ | ||
2406 | static char **tokenizeString(const char *z, int *pnToken){ | ||
2407 | int nToken = 0; | ||
2408 | Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) ); | ||
2409 | int n = 1; | ||
2410 | int e, i; | ||
2411 | int totalSize = 0; | ||
2412 | char **azToken; | ||
2413 | char *zCopy; | ||
2414 | while( n>0 ){ | ||
2415 | n = getToken(z, &e); | ||
2416 | if( e!=TOKEN_SPACE ){ | ||
2417 | aToken[nToken].z = z; | ||
2418 | aToken[nToken].n = n; | ||
2419 | nToken++; | ||
2420 | totalSize += n+1; | ||
2421 | } | ||
2422 | z += n; | ||
2423 | } | ||
2424 | azToken = (char**)malloc( nToken*sizeof(char*) + totalSize ); | ||
2425 | zCopy = (char*)&azToken[nToken]; | ||
2426 | nToken--; | ||
2427 | for(i=0; i<nToken; i++){ | ||
2428 | azToken[i] = zCopy; | ||
2429 | n = aToken[i].n; | ||
2430 | memcpy(zCopy, aToken[i].z, n); | ||
2431 | zCopy[n] = 0; | ||
2432 | zCopy += n+1; | ||
2433 | } | ||
2434 | azToken[nToken] = 0; | ||
2435 | free(aToken); | ||
2436 | *pnToken = nToken; | ||
2437 | return azToken; | ||
2438 | } | ||
2439 | |||
/*
** Convert an SQL-style quoted string into a normal string by removing
** the quote characters.  The conversion is done in-place.  If the
** input does not begin with a quote character, then this routine
** is a no-op.
**
** Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
**
** A doubled quote character inside the literal encodes a single
** literal quote.
*/
static void dequoteString(char *z){
  char close;        /* Character that terminates the literal */
  int rd, wr;        /* Read and write cursors within z */

  if( z==0 ) return;
  switch( z[0] ){
    case '\'':  close = '\'';  break;
    case '"':   close = '"';   break;
    case '`':   close = '`';   break;   /* For MySQL compatibility */
    case '[':   close = ']';   break;   /* For MS SqlServer compatibility */
    default:    return;                 /* Not quoted: leave unchanged */
  }
  rd = 1;
  wr = 0;
  while( z[rd] ){
    if( z[rd]!=close ){
      z[wr++] = z[rd];
    }else if( z[rd+1]==close ){
      z[wr++] = close;   /* Doubled quote: emit one and skip the pair */
      rd++;
    }else{
      z[wr++] = 0;       /* Closing quote: terminate and stop */
      break;
    }
    rd++;
  }
}
2479 | |||
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.
**
** Example:
**
**     input:      tokenize chinese ( 'simplifed' , 'mixed' )
**     output:     chinese simplifed mixed
**
** Another example:
**
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
**
** Works in place: surviving tokens are shifted left and the list is
** re-terminated with NULL.
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    /* j starts at -1 so the first kept token (the keyword itself,
    ** e.g. "tokenize") is dropped rather than copied. */
    for(i=0, j=-1; azIn[i]; i++){
      /* Keep alphanumeric-leading tokens and multi-char tokens
      ** (string literals); single punctuation chars are discarded. */
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    azIn[j] = 0;
  }
}
2510 | |||
2511 | |||
/*
** Find the first alphanumeric token in the string zIn.  Null-terminate
** this token.  Remove any quotation marks.  And return a pointer to
** the result.
**
** *pzTail is set to the character after the token's first character
** (the dequoted token is NUL-terminated in place, so the tail may
** overlap it).  Returns 0 if only whitespace precedes end-of-string.
*/
static char *firstToken(char *zIn, char **pzTail){
  int n, ttype;
  while(1){
    n = getToken(zIn, &ttype);
    if( ttype==TOKEN_SPACE ){
      zIn += n;
    }else if( ttype==TOKEN_EOF ){
      *pzTail = zIn;
      return 0;
    }else{
      zIn[n] = 0;
      *pzTail = &zIn[1];
      dequoteString(zIn);
      return zIn;
    }
  }
  /*NOTREACHED*/
}
2535 | |||
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not a alphanumeric
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( safe_isspace(*s) ){ s++; }
  while( *t ){
    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
  }
  /* '_' would extend the identifier, so it also disqualifies a match. */
  return *s!='_' && !safe_isalnum(*s);
}
2554 | |||
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and use by fulltextConnect and fulltextCreate.
**
** All pointers reference slices of the single allocation owned by
** azColumn (see parseSpec); azContentColumn and azTokenizer are
** separately allocated arrays released by clearTableSpec().
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
2568 | |||
/*
** Reclaim all of the memory used by a TableSpec.
** Safe after constructVtab() has taken ownership, because ownership
** transfer zeroes the transferred pointers and free(NULL) is a no-op.
*/
static void clearTableSpec(TableSpec *p) {
  free(p->azColumn);
  free(p->azContentColumn);
  free(p->azTokenizer);
}
2577 | |||
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 *
 * CREATE VIRTUAL TABLE email
 *        USING fts3(subject, body, tokenize mytokenizer(myarg))
 *
 * We return parsed information in a TableSpec structure.
 *
 * Returns SQLITE_OK or SQLITE_NOMEM.  On success the caller owns the
 * allocations inside pSpec and must release them with clearTableSpec().
 */
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                     char**pzErr){
  int i, n;
  char *z, *zDummy;
  char **azArg;
  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */

  assert( argc>=3 );
  /* Current interface:
  ** argv[0] - module name
  ** argv[1] - database name
  ** argv[2] - table name
  ** argv[3..] - columns, optionally followed by tokenizer specification
  **             and snippet delimiters specification.
  */

  /* Make a copy of the complete argv[][] array in a single allocation.
  ** The argv[][] array is read-only and transient.  We can write to the
  ** copy in order to modify things and the copy is persistent.
  */
  CLEAR(pSpec);
  for(i=n=0; i<argc; i++){
    n += strlen(argv[i]) + 1;
  }
  azArg = malloc( sizeof(char*)*argc + n );
  if( azArg==0 ){
    return SQLITE_NOMEM;
  }
  z = (char*)&azArg[argc];
  for(i=0; i<argc; i++){
    azArg[i] = z;
    strcpy(z, argv[i]);
    z += strlen(z)+1;
  }

  /* Identify the column names and the tokenizer and delimiter arguments
  ** in the argv[][] array.
  */
  pSpec->zDb = azArg[1];
  pSpec->zName = azArg[2];
  pSpec->nColumn = 0;
  pSpec->azColumn = azArg;
  zTokenizer = "tokenize simple";    /* Default if none specified */
  for(i=3; i<argc; ++i){
    if( startsWith(azArg[i],"tokenize") ){
      zTokenizer = azArg[i];
    }else{
      /* Compact column names to the front of azArg. */
      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
      pSpec->nColumn++;
    }
  }
  if( pSpec->nColumn==0 ){
    azArg[0] = "content";    /* No columns given: single default column */
    pSpec->nColumn = 1;
  }

  /*
  ** Construct the list of content column names.
  **
  ** Each content column name will be of the form cNNAAAA
  ** where NN is the column number and AAAA is the sanitized
  ** column name.  "sanitized" means that special characters are
  ** converted to "_".  The cNN prefix guarantees that all column
  ** names are unique.
  **
  ** The AAAA suffix is not strictly necessary.  It is included
  ** for the convenience of people who might examine the generated
  ** %_content table and wonder what the columns are used for.
  */
  pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
  if( pSpec->azContentColumn==0 ){
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    /* NOTE(review): sqlite3_mprintf() can return NULL on OOM, which
    ** the loop below would dereference — confirm and guard upstream. */
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  tokenListToIdList(pSpec->azTokenizer);

  return SQLITE_OK;
}
2676 | |||
/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table.  Return a pointer to this schema string.
**
** The schema is the user columns in order, then a HIDDEN column named
** after the table (the MATCH target), then a HIDDEN docid column.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int i;
  char *zSchema, *zNext;
  const char *zSep = "(";
  zSchema = sqlite3_mprintf("CREATE TABLE x");
  for(i=0; i<nColumn; i++){
    /* Rebuild the string each time, appending one quoted column. */
    zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
    sqlite3_free(zSchema);
    zSchema = zNext;
    zSep = ",";
  }
  zNext = sqlite3_mprintf("%s,%Q HIDDEN", zSchema, zTableName);
  sqlite3_free(zSchema);
  zSchema = zNext;
  zNext = sqlite3_mprintf("%s,docid HIDDEN)", zSchema);
  sqlite3_free(zSchema);
  return zNext;
}
2706 | |||
/*
** Build a new sqlite3_vtab structure that will describe the
** fulltext index defined by spec.
**
** On success, ownership of spec->azContentColumn and spec->azColumn
** is transferred to the new vtab (the spec pointers are zeroed so a
** later clearTableSpec() is safe).  On failure the vtab and anything
** already attached to it are destroyed via fulltext_vtab_destroy().
*/
static int constructVtab(
  sqlite3 *db,              /* The SQLite database connection */
  fts3Hash *pHash,          /* Hash table containing tokenizers */
  TableSpec *spec,          /* Parsed spec information from parseSpec() */
  sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
  char **pzErr              /* Write any error message here */
){
  int rc;
  int n;
  fulltext_vtab *v = 0;
  const sqlite3_tokenizer_module *m = NULL;
  char *schema;

  char const *zTok;         /* Name of tokenizer to use for this fts table */
  int nTok;                 /* Length of zTok, including nul terminator */

  v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
  if( v==0 ) return SQLITE_NOMEM;
  CLEAR(v);
  /* sqlite will initialize v->base */
  v->db = db;
  v->zDb = spec->zDb;       /* Freed when azColumn is freed */
  v->zName = spec->zName;   /* Freed when azColumn is freed */
  v->nColumn = spec->nColumn;
  v->azContentColumn = spec->azContentColumn;
  spec->azContentColumn = 0;
  v->azColumn = spec->azColumn;
  spec->azColumn = 0;

  if( spec->azTokenizer==0 ){
    /* NOTE(review): returning here leaks v (unlike the goto err
    ** paths below) — confirm whether `rc=SQLITE_NOMEM; goto err;`
    ** is safe given clearPendingTerms() on a zeroed vtab. */
    return SQLITE_NOMEM;
  }

  zTok = spec->azTokenizer[0];
  if( !zTok ){
    zTok = "simple";        /* Default tokenizer name */
  }
  nTok = strlen(zTok)+1;

  m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zTok, nTok);
  if( !m ){
    *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
    rc = SQLITE_ERROR;
    goto err;
  }

  /* Count tokenizer arguments; azTokenizer[1..] are passed to xCreate. */
  for(n=0; spec->azTokenizer[n]; n++){}
  if( n ){
    rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
                    &v->pTokenizer);
  }else{
    rc = m->xCreate(0, 0, &v->pTokenizer);
  }
  if( rc!=SQLITE_OK ) goto err;
  v->pTokenizer->pModule = m;

  /* TODO: verify the existence of backing tables foo_content, foo_term */

  schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
                          spec->zName);
  rc = sqlite3_declare_vtab(db, schema);
  sqlite3_free(schema);
  if( rc!=SQLITE_OK ) goto err;

  memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));

  /* Indicate that the buffer is not live. */
  v->nPendingData = -1;

  *ppVTab = &v->base;
  TRACE(("FTS3 Connect %p\n", v));

  return rc;

err:
  fulltext_vtab_destroy(v);
  return rc;
}
2789 | |||
2790 | static int fulltextConnect( | ||
2791 | sqlite3 *db, | ||
2792 | void *pAux, | ||
2793 | int argc, const char *const*argv, | ||
2794 | sqlite3_vtab **ppVTab, | ||
2795 | char **pzErr | ||
2796 | ){ | ||
2797 | TableSpec spec; | ||
2798 | int rc = parseSpec(&spec, argc, argv, pzErr); | ||
2799 | if( rc!=SQLITE_OK ) return rc; | ||
2800 | |||
2801 | rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr); | ||
2802 | clearTableSpec(&spec); | ||
2803 | return rc; | ||
2804 | } | ||
2805 | |||
/* The %_content table holds the text of each document, with
** the docid column exposed as the SQLite rowid for the table.
*/
/* TODO(shess) This comment needs elaboration to match the updated
** code.  Work it into the top-of-file comment at that time.
*/
/* xCreate method: create the %_content, %_segments and %_segdir
** backing tables, then build the vtab structure as in xConnect.
*/
static int fulltextCreate(sqlite3 *db, void *pAux,
                          int argc, const char * const *argv,
                          sqlite3_vtab **ppVTab, char **pzErr){
  int rc;
  TableSpec spec;
  StringBuffer schema;
  TRACE(("FTS3 Create\n"));

  rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  /* %_content has one sanitized column per user column plus docid. */
  initStringBuffer(&schema);
  append(&schema, "CREATE TABLE %_content(");
  append(&schema, "  docid INTEGER PRIMARY KEY,");
  appendList(&schema, spec.nColumn, spec.azContentColumn);
  append(&schema, ")");
  rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
  stringBufferDestroy(&schema);
  if( rc!=SQLITE_OK ) goto out;

  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segments("
                "  blockid INTEGER PRIMARY KEY,"
                "  block blob"
                ");"
                );
  if( rc!=SQLITE_OK ) goto out;

  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segdir("
                "  level integer,"
                "  idx integer,"
                "  start_block integer,"
                "  leaves_end_block integer,"
                "  end_block integer,"
                "  root blob,"
                "  primary key(level, idx)"
                ");");
  if( rc!=SQLITE_OK ) goto out;

  rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);

out:
  clearTableSpec(&spec);
  return rc;
}
2858 | |||
/* Decide how to handle an SQL query.
**
** Picks the first usable constraint: an EQ constraint on rowid or the
** hidden docid column (v->nColumn+1) selects QUERY_DOCID; a MATCH
** constraint on a content column (or the hidden table-name column)
** selects QUERY_FULLTEXT + column.  Otherwise QUERY_GENERIC (full
** scan).
*/
static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
  fulltext_vtab *v = (fulltext_vtab *)pVTab;
  int i;
  TRACE(("FTS3 BestIndex\n"));

  for(i=0; i<pInfo->nConstraint; ++i){
    const struct sqlite3_index_constraint *pConstraint;
    pConstraint = &pInfo->aConstraint[i];
    if( pConstraint->usable ) {
      if( (pConstraint->iColumn==-1 || pConstraint->iColumn==v->nColumn+1) &&
          pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
        pInfo->idxNum = QUERY_DOCID;      /* lookup by docid */
        TRACE(("FTS3 QUERY_DOCID\n"));
      } else if( pConstraint->iColumn>=0 && pConstraint->iColumn<=v->nColumn &&
                 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
        /* full-text search */
        pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
        TRACE(("FTS3 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
      } else continue;

      /* Pass the constraint value as argv[0] to xFilter and suppress
      ** SQLite's redundant re-check of the constraint. */
      pInfo->aConstraintUsage[i].argvIndex = 1;
      pInfo->aConstraintUsage[i].omit = 1;

      /* An arbitrary value for now.
       * TODO: Perhaps docid matches should be considered cheaper than
       * full-text searches. */
      pInfo->estimatedCost = 1.0;

      return SQLITE_OK;
    }
  }
  pInfo->idxNum = QUERY_GENERIC;
  return SQLITE_OK;
}
2894 | |||
2895 | static int fulltextDisconnect(sqlite3_vtab *pVTab){ | ||
2896 | TRACE(("FTS3 Disconnect %p\n", pVTab)); | ||
2897 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2898 | return SQLITE_OK; | ||
2899 | } | ||
2900 | |||
2901 | static int fulltextDestroy(sqlite3_vtab *pVTab){ | ||
2902 | fulltext_vtab *v = (fulltext_vtab *)pVTab; | ||
2903 | int rc; | ||
2904 | |||
2905 | TRACE(("FTS3 Destroy %p\n", pVTab)); | ||
2906 | rc = sql_exec(v->db, v->zDb, v->zName, | ||
2907 | "drop table if exists %_content;" | ||
2908 | "drop table if exists %_segments;" | ||
2909 | "drop table if exists %_segdir;" | ||
2910 | ); | ||
2911 | if( rc!=SQLITE_OK ) return rc; | ||
2912 | |||
2913 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2914 | return SQLITE_OK; | ||
2915 | } | ||
2916 | |||
2917 | static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ | ||
2918 | fulltext_cursor *c; | ||
2919 | |||
2920 | c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); | ||
2921 | /* sqlite will initialize c->base */ | ||
2922 | *ppCursor = &c->base; | ||
2923 | TRACE(("FTS3 Open %p: %p\n", pVTab, c)); | ||
2924 | |||
2925 | return SQLITE_OK; | ||
2926 | } | ||
2927 | |||
2928 | |||
2929 | /* Free all of the dynamically allocated memory held by *q | ||
2930 | */ | ||
2931 | static void queryClear(Query *q){ | ||
2932 | int i; | ||
2933 | for(i = 0; i < q->nTerms; ++i){ | ||
2934 | free(q->pTerms[i].pTerm); | ||
2935 | } | ||
2936 | free(q->pTerms); | ||
2937 | CLEAR(q); | ||
2938 | } | ||
2939 | |||
2940 | /* Free all of the dynamically allocated memory held by the | ||
2941 | ** Snippet | ||
2942 | */ | ||
2943 | static void snippetClear(Snippet *p){ | ||
2944 | free(p->aMatch); | ||
2945 | free(p->zOffset); | ||
2946 | free(p->zSnippet); | ||
2947 | CLEAR(p); | ||
2948 | } | ||
2949 | /* | ||
2950 | ** Append a single entry to the p->aMatch[] log. | ||
2951 | */ | ||
2952 | static void snippetAppendMatch( | ||
2953 | Snippet *p, /* Append the entry to this snippet */ | ||
2954 | int iCol, int iTerm, /* The column and query term */ | ||
2955 | int iStart, int nByte /* Offset and size of the match */ | ||
2956 | ){ | ||
2957 | int i; | ||
2958 | struct snippetMatch *pMatch; | ||
2959 | if( p->nMatch+1>=p->nAlloc ){ | ||
2960 | p->nAlloc = p->nAlloc*2 + 10; | ||
2961 | p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); | ||
2962 | if( p->aMatch==0 ){ | ||
2963 | p->nMatch = 0; | ||
2964 | p->nAlloc = 0; | ||
2965 | return; | ||
2966 | } | ||
2967 | } | ||
2968 | i = p->nMatch++; | ||
2969 | pMatch = &p->aMatch[i]; | ||
2970 | pMatch->iCol = iCol; | ||
2971 | pMatch->iTerm = iTerm; | ||
2972 | pMatch->iStart = iStart; | ||
2973 | pMatch->nByte = nByte; | ||
2974 | } | ||
2975 | |||
2976 | /* | ||
2977 | ** Sizing information for the circular buffer used in snippetOffsetsOfColumn() | ||
2978 | */ | ||
2979 | #define FTS3_ROTOR_SZ (32) | ||
2980 | #define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1) | ||
2981 | |||
/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
**
** The document is re-tokenized with the table's own tokenizer.  Each
** token is tested against every query term; a circular buffer of the
** last FTS3_ROTOR_SZ token offsets makes it possible, when the final
** term of a multi-token phrase matches, to log the offsets of the
** earlier phrase tokens as well.  Tokenizer failures cause a silent
** return with whatever matches were logged so far.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  const QueryTerm *aTerm;              /* Query string terms */
  int nTerm;                           /* Number of query string terms */
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS3_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS3_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;  /* cannot tokenize: no offsets logged for this column */
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  /* match/prevMatch have one bit per term, so only the first
  ** FTS3_ROTOR_SZ-1 terms can participate in phrase tracking. */
  if( nTerm>=FTS3_ROTOR_SZ ){
    nTerm = FTS3_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;  /* end of document (or tokenizer error) */
    /* Record this token's offset and length in the circular buffer */
    iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      /* Skip terms restricted to a different column */
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      /* Non-prefix terms must match the token's full length */
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      /* A non-leading phrase term matches only if the preceding phrase
      ** term matched the previous token */
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        /* Final term of a phrase (or a standalone term): log the
        ** offsets of every token in the phrase from the rotor */
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS3_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    /* Shift so that on the next token, term i can test whether term
    ** i-1 matched the current token */
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
3055 | |||
3056 | |||
3057 | /* | ||
3058 | ** Compute all offsets for the current row of the query. | ||
3059 | ** If the offsets have already been computed, this routine is a no-op. | ||
3060 | */ | ||
3061 | static void snippetAllOffsets(fulltext_cursor *p){ | ||
3062 | int nColumn; | ||
3063 | int iColumn, i; | ||
3064 | int iFirst, iLast; | ||
3065 | fulltext_vtab *pFts; | ||
3066 | |||
3067 | if( p->snippet.nMatch ) return; | ||
3068 | if( p->q.nTerms==0 ) return; | ||
3069 | pFts = p->q.pFts; | ||
3070 | nColumn = pFts->nColumn; | ||
3071 | iColumn = (p->iCursorType - QUERY_FULLTEXT); | ||
3072 | if( iColumn<0 || iColumn>=nColumn ){ | ||
3073 | iFirst = 0; | ||
3074 | iLast = nColumn-1; | ||
3075 | }else{ | ||
3076 | iFirst = iColumn; | ||
3077 | iLast = iColumn; | ||
3078 | } | ||
3079 | for(i=iFirst; i<=iLast; i++){ | ||
3080 | const char *zDoc; | ||
3081 | int nDoc; | ||
3082 | zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); | ||
3083 | nDoc = sqlite3_column_bytes(p->pStmt, i+1); | ||
3084 | snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); | ||
3085 | } | ||
3086 | } | ||
3087 | |||
3088 | /* | ||
3089 | ** Convert the information in the aMatch[] array of the snippet | ||
3090 | ** into the string zOffset[0..nOffset-1]. | ||
3091 | */ | ||
3092 | static void snippetOffsetText(Snippet *p){ | ||
3093 | int i; | ||
3094 | int cnt = 0; | ||
3095 | StringBuffer sb; | ||
3096 | char zBuf[200]; | ||
3097 | if( p->zOffset ) return; | ||
3098 | initStringBuffer(&sb); | ||
3099 | for(i=0; i<p->nMatch; i++){ | ||
3100 | struct snippetMatch *pMatch = &p->aMatch[i]; | ||
3101 | zBuf[0] = ' '; | ||
3102 | sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol, | ||
3103 | pMatch->iTerm, pMatch->iStart, pMatch->nByte); | ||
3104 | append(&sb, zBuf); | ||
3105 | cnt++; | ||
3106 | } | ||
3107 | p->zOffset = stringBufferData(&sb); | ||
3108 | p->nOffset = stringBufferLength(&sb); | ||
3109 | } | ||
3110 | |||
3111 | /* | ||
3112 | ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set | ||
3113 | ** of matching words some of which might be in zDoc. zDoc is column | ||
3114 | ** number iCol. | ||
3115 | ** | ||
3116 | ** iBreak is suggested spot in zDoc where we could begin or end an | ||
3117 | ** excerpt. Return a value similar to iBreak but possibly adjusted | ||
3118 | ** to be a little left or right so that the break point is better. | ||
3119 | */ | ||
3120 | static int wordBoundary( | ||
3121 | int iBreak, /* The suggested break point */ | ||
3122 | const char *zDoc, /* Document text */ | ||
3123 | int nDoc, /* Number of bytes in zDoc[] */ | ||
3124 | struct snippetMatch *aMatch, /* Matching words */ | ||
3125 | int nMatch, /* Number of entries in aMatch[] */ | ||
3126 | int iCol /* The column number for zDoc[] */ | ||
3127 | ){ | ||
3128 | int i; | ||
3129 | if( iBreak<=10 ){ | ||
3130 | return 0; | ||
3131 | } | ||
3132 | if( iBreak>=nDoc-10 ){ | ||
3133 | return nDoc; | ||
3134 | } | ||
3135 | for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){} | ||
3136 | while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; } | ||
3137 | if( i<nMatch ){ | ||
3138 | if( aMatch[i].iStart<iBreak+10 ){ | ||
3139 | return aMatch[i].iStart; | ||
3140 | } | ||
3141 | if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ | ||
3142 | return aMatch[i-1].iStart; | ||
3143 | } | ||
3144 | } | ||
3145 | for(i=1; i<=10; i++){ | ||
3146 | if( safe_isspace(zDoc[iBreak-i]) ){ | ||
3147 | return iBreak - i + 1; | ||
3148 | } | ||
3149 | if( safe_isspace(zDoc[iBreak+i]) ){ | ||
3150 | return iBreak + i + 1; | ||
3151 | } | ||
3152 | } | ||
3153 | return iBreak; | ||
3154 | } | ||
3155 | |||
3156 | |||
3157 | |||
3158 | /* | ||
3159 | ** Allowed values for Snippet.aMatch[].snStatus | ||
3160 | */ | ||
3161 | #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */ | ||
3162 | #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */ | ||
3163 | |||
/*
** Generate the text of a snippet.
**
** Builds a human-readable excerpt of the current row into
** pCursor->snippet.zSnippet.  For each distinct query term, the first
** DESIRED match is expanded to roughly 40 bytes of context on either
** side (snapped to word boundaries), the matched words are wrapped in
** zStartMark/zEndMark, and non-adjacent regions are joined with
** zEllipsis.  Matches of a term already shown are demoted to IGNORE
** so each term is excerpted at most once.
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;  /* Match log computed by snippetAllOffsets() */
  int nMatch;                   /* Number of entries in aMatch[] */
  int nDesired;                 /* Matches still wanted in the snippet */
  StringBuffer sb;              /* Snippet text under construction */
  int tailCol;                  /* Column of the previous excerpt region */
  int tailOffset;               /* End offset of the previous region */
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;             /* Byte range of the current region */
  int tailEllipsis = 0;         /* True if a trailing ellipsis is needed */
  int iMatch;


  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  /* Mark the first match of each query term DESIRED; all others IGNORE */
  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  /* Emit one context region per remaining DESIRED match */
  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    /* Start ~40 bytes before the match, snapped to a word boundary */
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    /* Merge with the previous region if they nearly touch */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    /* Separate discontiguous regions with an ellipsis */
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    /* End ~40 bytes after the match, snapped to a word boundary */
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    /* Copy the region, wrapping every match that falls inside it */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
          && aMatch[iMatch].iCol==iCol ){
        /* Emit text up to the match, then the marked-up match itself */
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* This term is now shown: demote its later DESIRED matches */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        /* No more matches inside this region: copy the remainder */
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  pCursor->snippet.zSnippet = stringBufferData(&sb);
  pCursor->snippet.nSnippet = stringBufferLength(&sb);
}
3276 | |||
3277 | |||
3278 | /* | ||
3279 | ** Close the cursor. For additional information see the documentation | ||
3280 | ** on the xClose method of the virtual table interface. | ||
3281 | */ | ||
3282 | static int fulltextClose(sqlite3_vtab_cursor *pCursor){ | ||
3283 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3284 | TRACE(("FTS3 Close %p\n", c)); | ||
3285 | sqlite3_finalize(c->pStmt); | ||
3286 | queryClear(&c->q); | ||
3287 | snippetClear(&c->snippet); | ||
3288 | if( c->result.nData!=0 ) dlrDestroy(&c->reader); | ||
3289 | dataBufferDestroy(&c->result); | ||
3290 | free(c); | ||
3291 | return SQLITE_OK; | ||
3292 | } | ||
3293 | |||
3294 | static int fulltextNext(sqlite3_vtab_cursor *pCursor){ | ||
3295 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3296 | int rc; | ||
3297 | |||
3298 | TRACE(("FTS3 Next %p\n", pCursor)); | ||
3299 | snippetClear(&c->snippet); | ||
3300 | if( c->iCursorType < QUERY_FULLTEXT ){ | ||
3301 | /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | ||
3302 | rc = sqlite3_step(c->pStmt); | ||
3303 | switch( rc ){ | ||
3304 | case SQLITE_ROW: | ||
3305 | c->eof = 0; | ||
3306 | return SQLITE_OK; | ||
3307 | case SQLITE_DONE: | ||
3308 | c->eof = 1; | ||
3309 | return SQLITE_OK; | ||
3310 | default: | ||
3311 | c->eof = 1; | ||
3312 | return rc; | ||
3313 | } | ||
3314 | } else { /* full-text query */ | ||
3315 | rc = sqlite3_reset(c->pStmt); | ||
3316 | if( rc!=SQLITE_OK ) return rc; | ||
3317 | |||
3318 | if( c->result.nData==0 || dlrAtEnd(&c->reader) ){ | ||
3319 | c->eof = 1; | ||
3320 | return SQLITE_OK; | ||
3321 | } | ||
3322 | rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader)); | ||
3323 | dlrStep(&c->reader); | ||
3324 | if( rc!=SQLITE_OK ) return rc; | ||
3325 | /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | ||
3326 | rc = sqlite3_step(c->pStmt); | ||
3327 | if( rc==SQLITE_ROW ){ /* the case we expect */ | ||
3328 | c->eof = 0; | ||
3329 | return SQLITE_OK; | ||
3330 | } | ||
3331 | /* an error occurred; abort */ | ||
3332 | return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | ||
3333 | } | ||
3334 | } | ||
3335 | |||
3336 | |||
3337 | /* TODO(shess) If we pushed LeafReader to the top of the file, or to | ||
3338 | ** another file, term_select() could be pushed above | ||
3339 | ** docListOfTerm(). | ||
3340 | */ | ||
3341 | static int termSelect(fulltext_vtab *v, int iColumn, | ||
3342 | const char *pTerm, int nTerm, int isPrefix, | ||
3343 | DocListType iType, DataBuffer *out); | ||
3344 | |||
/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** The resulting DL_DOCIDS doclist is stored in pResult, which is
** overwritten.  On success the caller owns pResult and must
** dataBufferDestroy() it; on failure all intermediate buffers are
** destroyed here.
*/
static int docListOfTerm(
  fulltext_vtab *v,   /* The full text index */
  int iColumn,        /* column to restrict to.  No restriction if >=nColumn */
  QueryTerm *pQTerm,  /* Term we are looking for, or 1st term of a phrase */
  DataBuffer *pResult /* Write the result here */
){
  DataBuffer left, right, new;
  int i, rc;

  /* No phrase search if no position info. */
  assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  /* Fetch the doclist for the leading term.  Position data is only
  ** needed when further phrase terms must be intersected. */
  dataBufferInit(&left, 0);
  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                  0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
  if( rc ) return rc;
  /* Fold in each subsequent phrase term.  The loop stops early if the
  ** running intersection becomes empty.  The final merge drops down to
  ** bare docids since no more position matching is needed. */
  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
    dataBufferInit(&right, 0);
    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                    pQTerm[i].isPrefix, DL_POSITIONS, &right);
    if( rc ){
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                       i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
    dataBufferDestroy(&left);
    dataBufferDestroy(&right);
    left = new;
  }
  /* Ownership of left's buffer transfers to the caller via pResult */
  *pResult = left;
  return SQLITE_OK;
}
3389 | |||
3390 | /* Add a new term pTerm[0..nTerm-1] to the query *q. | ||
3391 | */ | ||
3392 | static void queryAdd(Query *q, const char *pTerm, int nTerm){ | ||
3393 | QueryTerm *t; | ||
3394 | ++q->nTerms; | ||
3395 | q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); | ||
3396 | if( q->pTerms==0 ){ | ||
3397 | q->nTerms = 0; | ||
3398 | return; | ||
3399 | } | ||
3400 | t = &q->pTerms[q->nTerms - 1]; | ||
3401 | CLEAR(t); | ||
3402 | t->pTerm = malloc(nTerm+1); | ||
3403 | memcpy(t->pTerm, pTerm, nTerm); | ||
3404 | t->pTerm[nTerm] = 0; | ||
3405 | t->nTerm = nTerm; | ||
3406 | t->isOr = q->nextIsOr; | ||
3407 | t->isPrefix = 0; | ||
3408 | q->nextIsOr = 0; | ||
3409 | t->iColumn = q->nextColumn; | ||
3410 | q->nextColumn = q->dfltColumn; | ||
3411 | } | ||
3412 | |||
3413 | /* | ||
3414 | ** Check to see if the string zToken[0...nToken-1] matches any | ||
3415 | ** column name in the virtual table. If it does, | ||
3416 | ** return the zero-indexed column number. If not, return -1. | ||
3417 | */ | ||
3418 | static int checkColumnSpecifier( | ||
3419 | fulltext_vtab *pVtab, /* The virtual table */ | ||
3420 | const char *zToken, /* Text of the token */ | ||
3421 | int nToken /* Number of characters in the token */ | ||
3422 | ){ | ||
3423 | int i; | ||
3424 | for(i=0; i<pVtab->nColumn; i++){ | ||
3425 | if( memcmp(pVtab->azColumn[i], zToken, nToken)==0 | ||
3426 | && pVtab->azColumn[i][nToken]==0 ){ | ||
3427 | return i; | ||
3428 | } | ||
3429 | } | ||
3430 | return -1; | ||
3431 | } | ||
3432 | |||
/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assemblied in pQuery.
**
** inPhrase is true if pSegment[0..nSegement-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;  /* index of the phrase's first term */
  int iCol;
  int nTerm = 1;                    /* 1-based position within a phrase */

  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;  /* end of segment */
    /* "column:" prefix selects the column for the next term only */
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    /* A bare "OR" between terms marks the next term as an OR branch */
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
     && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    /* Leading "-" negates the term (NOT); trailing "*" makes it a prefix */
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    if( iEnd<nSegment && pSegment[iEnd]=='*' ){
      pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;  /* every quoted term advances the phrase position */
    }
  }

  /* Record on the first phrase term how many terms follow it */
  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
3497 | |||
/* Parse a query string, yielding a Query object pQuery.
**
** The calling function will need to queryClear() to clean up
** the dynamically allocated memory held by pQuery.
**
** The input is split at double-quote characters; each segment between
** quotes is tokenized with inPhrase toggling on every quote, so
** quoted segments become phrase queries.  An unmatched trailing quote
** makes the whole query an error.
*/
static int parseQuery(
  fulltext_vtab *v,        /* The fulltext index */
  const char *zInput,      /* Input text of the query string */
  int nInput,              /* Size of the input text */
  int dfltColumn,          /* Default column of the index to match against */
  Query *pQuery            /* Write the parse results here. */
){
  int iInput, inPhrase = 0;

  /* NULL input is treated as empty; negative nInput means NUL-terminated */
  if( zInput==0 ) nInput = 0;
  if( nInput<0 ) nInput = strlen(zInput);
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->nextColumn = dfltColumn;
  pQuery->dfltColumn = dfltColumn;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    /* Find the next double-quote (or end of input) */
    for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
    if( i>iInput ){
      /* NOTE(review): the tokenizeSegment() return code is ignored
      ** here, so tokenizer failures are silently dropped — presumably
      ** intentional best-effort parsing, but worth confirming. */
      tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                       pQuery);
    }
    iInput = i;
    if( i<nInput ){
      assert( zInput[i]=='"' );
      inPhrase = !inPhrase;
    }
  }

  if( inPhrase ){
    /* unmatched quote */
    queryClear(pQuery);
    return SQLITE_ERROR;
  }
  return SQLITE_OK;
}
3542 | |||
3543 | /* TODO(shess) Refactor the code to remove this forward decl. */ | ||
3544 | static int flushPendingTerms(fulltext_vtab *v); | ||
3545 | |||
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
**
** Processing order: buffered updates are flushed, the expression is
** parsed into pQuery, then non-NOT terms (with their OR chains) are
** AND-merged left to right, and finally each NOT term's doclist is
** subtracted from the running result.  On success the caller owns
** pResult and must dataBufferDestroy() it.
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;  /* working doclist buffers */
  int nNot = 0;                     /* count of NOT terms seen */
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */

  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  aTerm = pQuery->pTerms;
  /* iNext always skips past a term and its whole phrase; i==nNot is
  ** true exactly when no non-NOT term has been processed yet, i.e.
  ** `left` is not yet initialized. */
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    /* OR-merge any following terms flagged isOr into `right` */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){           /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  /* Ownership of `left` transfers to the caller */
  *pResult = left;
  return rc;
}
3657 | |||
/*
** This is the xFilter interface for the virtual table.  See
** the virtual table xFilter method documentation for additional
** information.
**
** If idxNum==QUERY_GENERIC then do a full table scan against
** the %_content table.
**
** If idxNum==QUERY_DOCID then do a docid lookup for a single entry
** in the %_content table.
**
** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
** column on the left-hand side of the MATCH operator is column
** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
** side of the MATCH operator.
*/
/* TODO(shess) Upgrade the cursor initialization and destruction to
** account for fulltextFilter() being called multiple times on the
** same cursor.  The current solution is very fragile.  Apply fix to
** fts3 as appropriate.
*/
static int fulltextFilter(
  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);
  int rc;
  StringBuffer sb;

  TRACE(("FTS3 Filter %p\n",pCursor));

  /* Build "SELECT docid, <content columns> FROM %_content" with an
  ** optional "WHERE docid = ?".  The same statement serves all three
  ** strategies; full-text mode binds successive docids from the
  ** doclist into the WHERE clause in fulltextNext(). */
  initStringBuffer(&sb);
  append(&sb, "SELECT docid, ");
  appendList(&sb, v->nColumn, v->azContentColumn);
  append(&sb, " FROM %_content");
  if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?");
  /* Discard any statement left over from a previous Filter call */
  sqlite3_finalize(c->pStmt);
  rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, stringBufferData(&sb));
  stringBufferDestroy(&sb);
  if( rc!=SQLITE_OK ) return rc;

  c->iCursorType = idxNum;
  switch( idxNum ){
    case QUERY_GENERIC:
      break;

    case QUERY_DOCID:
      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
      if( rc!=SQLITE_OK ) return rc;
      break;

    default:   /* full-text search */
    {
      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
      assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
      assert( argc==1 );
      queryClear(&c->q);
      if( c->result.nData!=0 ){
        /* This case happens if the same cursor is used repeatedly. */
        dlrDestroy(&c->reader);
        dataBufferReset(&c->result);
      }else{
        dataBufferInit(&c->result, 0);
      }
      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
      if( rc!=SQLITE_OK ) return rc;
      if( c->result.nData!=0 ){
        /* Open a doclist reader over the query result */
        dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
      }
      break;
    }
  }

  /* Position the cursor on the first row of results */
  return fulltextNext(pCursor);
}
3735 | |||
3736 | /* This is the xEof method of the virtual table. The SQLite core | ||
3737 | ** calls this routine to find out if it has reached the end of | ||
3738 | ** a query's results set. | ||
3739 | */ | ||
3740 | static int fulltextEof(sqlite3_vtab_cursor *pCursor){ | ||
3741 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3742 | return c->eof; | ||
3743 | } | ||
3744 | |||
3745 | /* This is the xColumn method of the virtual table. The SQLite | ||
3746 | ** core calls this method during a query when it needs the value | ||
3747 | ** of a column from the virtual table. This method needs to use | ||
3748 | ** one of the sqlite3_result_*() routines to store the requested | ||
3749 | ** value back in the pContext. | ||
3750 | */ | ||
3751 | static int fulltextColumn(sqlite3_vtab_cursor *pCursor, | ||
3752 | sqlite3_context *pContext, int idxCol){ | ||
3753 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3754 | fulltext_vtab *v = cursor_vtab(c); | ||
3755 | |||
3756 | if( idxCol<v->nColumn ){ | ||
3757 | sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1); | ||
3758 | sqlite3_result_value(pContext, pVal); | ||
3759 | }else if( idxCol==v->nColumn ){ | ||
3760 | /* The extra column whose name is the same as the table. | ||
3761 | ** Return a blob which is a pointer to the cursor | ||
3762 | */ | ||
3763 | sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT); | ||
3764 | }else if( idxCol==v->nColumn+1 ){ | ||
3765 | /* The docid column, which is an alias for rowid. */ | ||
3766 | sqlite3_value *pVal = sqlite3_column_value(c->pStmt, 0); | ||
3767 | sqlite3_result_value(pContext, pVal); | ||
3768 | } | ||
3769 | return SQLITE_OK; | ||
3770 | } | ||
3771 | |||
3772 | /* This is the xRowid method. The SQLite core calls this routine to | ||
3773 | ** retrieve the rowid for the current row of the result set. fts3 | ||
3774 | ** exposes %_content.docid as the rowid for the virtual table. The | ||
3775 | ** rowid should be written to *pRowid. | ||
3776 | */ | ||
3777 | static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ | ||
3778 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3779 | |||
3780 | *pRowid = sqlite3_column_int64(c->pStmt, 0); | ||
3781 | return SQLITE_OK; | ||
3782 | } | ||
3783 | |||
/* Add all terms in [zText] to pendingTerms table. If [iColumn] >= 0,
** we also store positions and offsets in the hash table using that
** column number.
*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DLCollector *p;
    int nData;                   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    p = fts3HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      /* First occurrence of this term: start a new doclist collector. */
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts3HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts3HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      /* Open a new docid entry unless this term already saw iDocid. */
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  return rc;
}
3842 | |||
3843 | /* Add doclists for all terms in [pValues] to pendingTerms table. */ | ||
3844 | static int insertTerms(fulltext_vtab *v, sqlite_int64 iDocid, | ||
3845 | sqlite3_value **pValues){ | ||
3846 | int i; | ||
3847 | for(i = 0; i < v->nColumn ; ++i){ | ||
3848 | char *zText = (char*)sqlite3_value_text(pValues[i]); | ||
3849 | int rc = buildTerms(v, iDocid, zText, i); | ||
3850 | if( rc!=SQLITE_OK ) return rc; | ||
3851 | } | ||
3852 | return SQLITE_OK; | ||
3853 | } | ||
3854 | |||
3855 | /* Add empty doclists for all terms in the given row's content to | ||
3856 | ** pendingTerms. | ||
3857 | */ | ||
3858 | static int deleteTerms(fulltext_vtab *v, sqlite_int64 iDocid){ | ||
3859 | const char **pValues; | ||
3860 | int i, rc; | ||
3861 | |||
3862 | /* TODO(shess) Should we allow such tables at all? */ | ||
3863 | if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR; | ||
3864 | |||
3865 | rc = content_select(v, iDocid, &pValues); | ||
3866 | if( rc!=SQLITE_OK ) return rc; | ||
3867 | |||
3868 | for(i = 0 ; i < v->nColumn; ++i) { | ||
3869 | rc = buildTerms(v, iDocid, pValues[i], -1); | ||
3870 | if( rc!=SQLITE_OK ) break; | ||
3871 | } | ||
3872 | |||
3873 | freeStringArray(v->nColumn, pValues); | ||
3874 | return SQLITE_OK; | ||
3875 | } | ||
3876 | |||
3877 | /* TODO(shess) Refactor the code to remove this forward decl. */ | ||
3878 | static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid); | ||
3879 | |||
3880 | /* Insert a row into the %_content table; set *piDocid to be the ID of the | ||
3881 | ** new row. Add doclists for terms to pendingTerms. | ||
3882 | */ | ||
3883 | static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestDocid, | ||
3884 | sqlite3_value **pValues, sqlite_int64 *piDocid){ | ||
3885 | int rc; | ||
3886 | |||
3887 | rc = content_insert(v, pRequestDocid, pValues); /* execute an SQL INSERT */ | ||
3888 | if( rc!=SQLITE_OK ) return rc; | ||
3889 | |||
3890 | /* docid column is an alias for rowid. */ | ||
3891 | *piDocid = sqlite3_last_insert_rowid(v->db); | ||
3892 | rc = initPendingTerms(v, *piDocid); | ||
3893 | if( rc!=SQLITE_OK ) return rc; | ||
3894 | |||
3895 | return insertTerms(v, *piDocid, pValues); | ||
3896 | } | ||
3897 | |||
3898 | /* Delete a row from the %_content table; add empty doclists for terms | ||
3899 | ** to pendingTerms. | ||
3900 | */ | ||
3901 | static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ | ||
3902 | int rc = initPendingTerms(v, iRow); | ||
3903 | if( rc!=SQLITE_OK ) return rc; | ||
3904 | |||
3905 | rc = deleteTerms(v, iRow); | ||
3906 | if( rc!=SQLITE_OK ) return rc; | ||
3907 | |||
3908 | return content_delete(v, iRow); /* execute an SQL DELETE */ | ||
3909 | } | ||
3910 | |||
3911 | /* Update a row in the %_content table; add delete doclists to | ||
3912 | ** pendingTerms for old terms not in the new data, add insert doclists | ||
3913 | ** to pendingTerms for terms in the new data. | ||
3914 | */ | ||
3915 | static int index_update(fulltext_vtab *v, sqlite_int64 iRow, | ||
3916 | sqlite3_value **pValues){ | ||
3917 | int rc = initPendingTerms(v, iRow); | ||
3918 | if( rc!=SQLITE_OK ) return rc; | ||
3919 | |||
3920 | /* Generate an empty doclist for each term that previously appeared in this | ||
3921 | * row. */ | ||
3922 | rc = deleteTerms(v, iRow); | ||
3923 | if( rc!=SQLITE_OK ) return rc; | ||
3924 | |||
3925 | rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */ | ||
3926 | if( rc!=SQLITE_OK ) return rc; | ||
3927 | |||
3928 | /* Now add positions for terms which appear in the updated row. */ | ||
3929 | return insertTerms(v, iRow, pValues); | ||
3930 | } | ||
3931 | |||
3932 | /*******************************************************************/ | ||
3933 | /* InteriorWriter is used to collect terms and block references into | ||
3934 | ** interior nodes in %_segments. See commentary at top of file for | ||
3935 | ** format. | ||
3936 | */ | ||
3937 | |||
3938 | /* How large interior nodes can grow. */ | ||
3939 | #define INTERIOR_MAX 2048 | ||
3940 | |||
3941 | /* Minimum number of terms per interior node (except the root). This | ||
3942 | ** prevents large terms from making the tree too skinny - must be >0 | ||
3943 | ** so that the tree always makes progress. Note that the min tree | ||
3944 | ** fanout will be INTERIOR_MIN_TERMS+1. | ||
3945 | */ | ||
3946 | #define INTERIOR_MIN_TERMS 7 | ||
3947 | #if INTERIOR_MIN_TERMS<1 | ||
3948 | # error INTERIOR_MIN_TERMS must be greater than 0. | ||
3949 | #endif | ||
3950 | |||
3951 | /* ROOT_MAX controls how much data is stored inline in the segment | ||
3952 | ** directory. | ||
3953 | */ | ||
3954 | /* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's | ||
3955 | ** only here so that interiorWriterRootInfo() and leafWriterRootInfo() | ||
3956 | ** can both see it, but if the caller passed it in, we wouldn't even | ||
3957 | ** need a define. | ||
3958 | */ | ||
3959 | #define ROOT_MAX 1024 | ||
3960 | #if ROOT_MAX<VARINT_MAX*2 | ||
3961 | # error ROOT_MAX must have enough space for a header. | ||
3962 | #endif | ||
3963 | |||
/* InteriorBlock stores a linked-list of interior blocks while a lower
** layer is being constructed.
*/
typedef struct InteriorBlock {
  DataBuffer term;             /* Leftmost term in block's subtree. */
  DataBuffer data;             /* Accumulated data for the block. */
  struct InteriorBlock *next;  /* Next block at this level, or NULL. */
} InteriorBlock;
3972 | |||
/* Allocate a new InteriorBlock at height iHeight whose leftmost child
** block is iChildBlock and whose leftmost term is pTerm[nTerm].  The
** block's data is seeded with the node header: varint(iHeight)
** followed by varint(iChildBlock).
**
** NOTE(review): the calloc() result is not checked; on allocation
** failure the dataBufferInit() call below dereferences NULL.
*/
static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
                                       const char *pTerm, int nTerm){
  InteriorBlock *block = calloc(1, sizeof(InteriorBlock));
  char c[VARINT_MAX+VARINT_MAX];
  int n;

  dataBufferInit(&block->term, 0);
  dataBufferReplace(&block->term, pTerm, nTerm);

  /* Encode the node header: height, then leftmost child blockid. */
  n = putVarint(c, iHeight);
  n += putVarint(c+n, iChildBlock);
  dataBufferInit(&block->data, INTERIOR_MAX);
  dataBufferReplace(&block->data, c, n);

  return block;
}
3989 | |||
#ifndef NDEBUG
/* Verify that the data is readable as an interior node.  Walks the
** encoded block from front to back, asserting that every varint and
** every term stays within the buffer.  Compiled out under NDEBUG via
** the #else branch below.
*/
static void interiorBlockValidate(InteriorBlock *pBlock){
  const char *pData = pBlock->data.pData;
  int nData = pBlock->data.nData;
  int n, iDummy;
  sqlite_int64 iBlockid;

  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with height of node as a varint(n), n>0 */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Must contain iBlockid (leftmost child block). */
  n = getVarint(pData, &iBlockid);
  assert( n>0 );
  assert( n<=nData );
  pData += n;
  nData -= n;

  /* Zero or more terms of positive length */
  if( nData!=0 ){
    /* First term is not delta-encoded. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0);
    assert( n+iDummy<=nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Following terms delta-encoded. */
    while( nData!=0 ){
      /* Length of shared prefix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>=0 );
      assert( n<nData );
      pData += n;
      nData -= n;

      /* Length and data of distinct suffix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>0 );
      assert( n+iDummy>0);
      assert( n+iDummy<=nData );
      pData += n+iDummy;
      nData -= n+iDummy;
    }
  }
}
#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
#else
#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
#endif
4053 | |||
/* InteriorWriter accumulates one level of interior nodes.  Completed
** blocks are chained from "first" to "last"; flushing may lazily
** create a parentWriter for the level above.
*/
typedef struct InteriorWriter {
  int iHeight;                   /* from 0 at leaves. */
  InteriorBlock *first, *last;   /* Chain of blocks at this level. */
  struct InteriorWriter *parentWriter;  /* Level above, once created. */

  DataBuffer term;               /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;  /* for consistency checks. */
#endif
} InteriorWriter;
4065 | |||
4066 | /* Initialize an interior node where pTerm[nTerm] marks the leftmost | ||
4067 | ** term in the tree. iChildBlock is the leftmost child block at the | ||
4068 | ** next level down the tree. | ||
4069 | */ | ||
4070 | static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm, | ||
4071 | sqlite_int64 iChildBlock, | ||
4072 | InteriorWriter *pWriter){ | ||
4073 | InteriorBlock *block; | ||
4074 | assert( iHeight>0 ); | ||
4075 | CLEAR(pWriter); | ||
4076 | |||
4077 | pWriter->iHeight = iHeight; | ||
4078 | pWriter->iOpeningChildBlock = iChildBlock; | ||
4079 | #ifndef NDEBUG | ||
4080 | pWriter->iLastChildBlock = iChildBlock; | ||
4081 | #endif | ||
4082 | block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm); | ||
4083 | pWriter->last = pWriter->first = block; | ||
4084 | ASSERT_VALID_INTERIOR_BLOCK(pWriter->last); | ||
4085 | dataBufferInit(&pWriter->term, 0); | ||
4086 | } | ||
4087 | |||
/* Append the child node rooted at iChildBlock to the interior node,
** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function).  That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    n = putVarint(c, nTerm);
  }else{
    /* Delta-encode against the previous term: count the shared prefix,
    ** then emit varint(nPrefix) followed by varint(nSuffix).
    */
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  /* Children must be appended in contiguous blockid order. */
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    /* Start a fresh block with pTerm as its leftmost term; resetting
    ** pWriter->term makes the next appended term encode straight up.
    */
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
4139 | |||
4140 | /* Free the space used by pWriter, including the linked-list of | ||
4141 | ** InteriorBlocks, and parentWriter, if present. | ||
4142 | */ | ||
4143 | static int interiorWriterDestroy(InteriorWriter *pWriter){ | ||
4144 | InteriorBlock *block = pWriter->first; | ||
4145 | |||
4146 | while( block!=NULL ){ | ||
4147 | InteriorBlock *b = block; | ||
4148 | block = block->next; | ||
4149 | dataBufferDestroy(&b->term); | ||
4150 | dataBufferDestroy(&b->data); | ||
4151 | free(b); | ||
4152 | } | ||
4153 | if( pWriter->parentWriter!=NULL ){ | ||
4154 | interiorWriterDestroy(pWriter->parentWriter); | ||
4155 | free(pWriter->parentWriter); | ||
4156 | } | ||
4157 | dataBufferDestroy(&pWriter->term); | ||
4158 | SCRAMBLE(pWriter); | ||
4159 | return SQLITE_OK; | ||
4160 | } | ||
4161 | |||
/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
** directly, leaving *piEndBlockid unchanged.  Otherwise, flush
** pWriter to %_segments, building a new layer of interior nodes, and
** recursively ask for their root info.
**
** NOTE(review): the malloc() of parentWriter below is not checked
** before interiorWriterInit() writes through the pointer.
*/
static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
                                  char **ppRootInfo, int *pnRootInfo,
                                  sqlite_int64 *piEndBlockid){
  InteriorBlock *block = pWriter->first;
  sqlite_int64 iBlockid = 0;
  int rc;

  /* If we can fit the segment inline */
  if( block==pWriter->last && block->data.nData<ROOT_MAX ){
    *ppRootInfo = block->data.pData;
    *pnRootInfo = block->data.nData;
    return SQLITE_OK;
  }

  /* Flush the first block to %_segments, and create a new level of
  ** interior node.
  */
  ASSERT_VALID_INTERIOR_BLOCK(block);
  rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  *piEndBlockid = iBlockid;

  pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
  interiorWriterInit(pWriter->iHeight+1,
                     block->term.pData, block->term.nData,
                     iBlockid, pWriter->parentWriter);

  /* Flush additional blocks and append to the higher interior
  ** node.
  */
  for(block=block->next; block!=NULL; block=block->next){
    ASSERT_VALID_INTERIOR_BLOCK(block);
    rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
    if( rc!=SQLITE_OK ) return rc;
    *piEndBlockid = iBlockid;

    interiorWriterAppend(pWriter->parentWriter,
                         block->term.pData, block->term.nData, iBlockid);
  }

  /* Parent node gets the chance to be the root. */
  return interiorWriterRootInfo(v, pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
4211 | |||
4212 | /****************************************************************/ | ||
4213 | /* InteriorReader is used to read off the data from an interior node | ||
4214 | ** (see comment at top of file for the format). | ||
4215 | */ | ||
4216 | typedef struct InteriorReader { | ||
4217 | const char *pData; | ||
4218 | int nData; | ||
4219 | |||
4220 | DataBuffer term; /* previous term, for decoding term delta. */ | ||
4221 | |||
4222 | sqlite_int64 iBlockid; | ||
4223 | } InteriorReader; | ||
4224 | |||
4225 | static void interiorReaderDestroy(InteriorReader *pReader){ | ||
4226 | dataBufferDestroy(&pReader->term); | ||
4227 | SCRAMBLE(pReader); | ||
4228 | } | ||
4229 | |||
/* TODO(shess) The assertions are great, but what if we're in NDEBUG
** and the blob is empty or otherwise contains suspect data?
*/
/* Initialize pReader to iterate the terms of the interior node in
** pData[nData].  Layout: one non-zero height byte, varint(iBlockid)
** for the leftmost child, then the first term as varint(nTerm)
** followed by the term bytes.
*/
static void interiorReaderInit(const char *pData, int nData,
                               InteriorReader *pReader){
  int n, nTerm;

  /* Require at least the leading flag byte */
  assert( nData>0 );
  assert( pData[0]!='\0' );

  CLEAR(pReader);

  /* Decode the base blockid, and set the cursor to the first term.
  ** (The height varint is taken to occupy exactly the single leading
  ** byte, hence pData+1.)
  */
  n = getVarint(pData+1, &pReader->iBlockid);
  assert( 1+n<=nData );
  pReader->pData = pData+1+n;
  pReader->nData = nData-(1+n);

  /* A single-child interior node (such as when a leaf node was too
  ** large for the segment directory) won't have any terms.
  ** Otherwise, decode the first term.
  */
  if( pReader->nData==0 ){
    dataBufferInit(&pReader->term, 0);
  }else{
    n = getVarint32(pReader->pData, &nTerm);
    dataBufferInit(&pReader->term, nTerm);
    dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
    assert( n+nTerm<=pReader->nData );
    pReader->pData += n+nTerm;
    pReader->nData -= n+nTerm;
  }
}
4264 | |||
4265 | static int interiorReaderAtEnd(InteriorReader *pReader){ | ||
4266 | return pReader->term.nData==0; | ||
4267 | } | ||
4268 | |||
4269 | static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){ | ||
4270 | return pReader->iBlockid; | ||
4271 | } | ||
4272 | |||
4273 | static int interiorReaderTermBytes(InteriorReader *pReader){ | ||
4274 | assert( !interiorReaderAtEnd(pReader) ); | ||
4275 | return pReader->term.nData; | ||
4276 | } | ||
4277 | static const char *interiorReaderTerm(InteriorReader *pReader){ | ||
4278 | assert( !interiorReaderAtEnd(pReader) ); | ||
4279 | return pReader->term.pData; | ||
4280 | } | ||
4281 | |||
/* Step forward to the next term in the node. */
static void interiorReaderStep(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );

  /* If the last term has been read, signal eof, else construct the
  ** next term.
  */
  if( pReader->nData==0 ){
    /* Empty term buffer is the at-end signal (see interiorReaderAtEnd). */
    dataBufferReset(&pReader->term);
  }else{
    int n, nPrefix, nSuffix;

    /* Terms after the first are delta-encoded:
    ** varint(nPrefix) varint(nSuffix) suffix-bytes.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);

    /* Truncate the current term and append suffix data. */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    assert( n+nSuffix<=pReader->nData );
    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
  /* Child blockids are contiguous, so each term step advances the id. */
  pReader->iBlockid++;
}
4307 | |||
4308 | /* Compare the current term to pTerm[nTerm], returning strcmp-style | ||
4309 | ** results. If isPrefix, equality means equal through nTerm bytes. | ||
4310 | */ | ||
4311 | static int interiorReaderTermCmp(InteriorReader *pReader, | ||
4312 | const char *pTerm, int nTerm, int isPrefix){ | ||
4313 | const char *pReaderTerm = interiorReaderTerm(pReader); | ||
4314 | int nReaderTerm = interiorReaderTermBytes(pReader); | ||
4315 | int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm; | ||
4316 | |||
4317 | if( n==0 ){ | ||
4318 | if( nReaderTerm>0 ) return -1; | ||
4319 | if( nTerm>0 ) return 1; | ||
4320 | return 0; | ||
4321 | } | ||
4322 | |||
4323 | c = memcmp(pReaderTerm, pTerm, n); | ||
4324 | if( c!=0 ) return c; | ||
4325 | if( isPrefix && n==nTerm ) return 0; | ||
4326 | return nReaderTerm - nTerm; | ||
4327 | } | ||
4328 | |||
4329 | /****************************************************************/ | ||
4330 | /* LeafWriter is used to collect terms and associated doclist data | ||
4331 | ** into leaf blocks in %_segments (see top of file for format info). | ||
4332 | ** Expected usage is: | ||
4333 | ** | ||
4334 | ** LeafWriter writer; | ||
4335 | ** leafWriterInit(0, 0, &writer); | ||
4336 | ** while( sorted_terms_left_to_process ){ | ||
4337 | ** // data is doclist data for that term. | ||
4338 | ** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData); | ||
4339 | ** if( rc!=SQLITE_OK ) goto err; | ||
4340 | ** } | ||
4341 | ** rc = leafWriterFinalize(v, &writer); | ||
4342 | **err: | ||
4343 | ** leafWriterDestroy(&writer); | ||
4344 | ** return rc; | ||
4345 | ** | ||
4346 | ** leafWriterStep() may write a collected leaf out to %_segments. | ||
4347 | ** leafWriterFinalize() finishes writing any buffered data and stores | ||
4348 | ** a root node in %_segdir. leafWriterDestroy() frees all buffers and | ||
4349 | ** InteriorWriters allocated as part of writing this segment. | ||
4350 | ** | ||
4351 | ** TODO(shess) Document leafWriterStepMerge(). | ||
4352 | */ | ||
4353 | |||
/* Put terms with data this big in their own block. */
#define STANDALONE_MIN 1024

/* Keep leaf blocks below this size. */
#define LEAF_MAX 2048

/* LeafWriter accumulates one segment's worth of leaf-node data,
** delegating overflow to an InteriorWriter once more than one leaf
** node is needed.
*/
typedef struct LeafWriter {
  int iLevel;                   /* Segment level. */
  int idx;                      /* Segment index within the level. */
  sqlite_int64 iStartBlockid;   /* needed to create the root info */
  sqlite_int64 iEndBlockid;     /* when we're done writing. */

  DataBuffer term;              /* previous encoded term */
  DataBuffer data;              /* encoding buffer */

  /* bytes of first term in the current node which distinguishes that
  ** term from the last term of the previous node.
  */
  int nTermDistinct;

  InteriorWriter parentWriter;  /* if we overflow */
  int has_parent;               /* True once parentWriter is initialized. */
} LeafWriter;
4377 | |||
4378 | static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){ | ||
4379 | CLEAR(pWriter); | ||
4380 | pWriter->iLevel = iLevel; | ||
4381 | pWriter->idx = idx; | ||
4382 | |||
4383 | dataBufferInit(&pWriter->term, 32); | ||
4384 | |||
4385 | /* Start out with a reasonably sized block, though it can grow. */ | ||
4386 | dataBufferInit(&pWriter->data, LEAF_MAX); | ||
4387 | } | ||
4388 | |||
#ifndef NDEBUG
/* Verify that the data is readable as a leaf node.  Walks the buffer
** front to back, asserting that every varint, term, and doclist stays
** within bounds.  Compiled out under NDEBUG via the #else branch.
*/
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = getVarint32(pData, &iDummy);
  assert( iDummy==0 );
  assert( n>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Leading term length and data must fit in buffer. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    /* Shared-prefix length of the delta-encoded term. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;
    /* Suffix length and bytes. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Doclist length and data. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif
4456 | |||
/* Flush the current leaf node to %_segments, and add the resulting
** blockid and the starting term to the interior node which will
** contain it.  iData/nData select the slice of pWriter->data to
** write.
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.  The +1 skips the leading varint(0) flag byte.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  /* Only the distinguishing prefix is needed by the interior node. */
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes. */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    pWriter->iEndBlockid++;
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
4508 | static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){ | ||
4509 | int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData); | ||
4510 | if( rc!=SQLITE_OK ) return rc; | ||
4511 | |||
4512 | /* Re-initialize the output buffer. */ | ||
4513 | dataBufferReset(&pWriter->data); | ||
4514 | |||
4515 | return SQLITE_OK; | ||
4516 | } | ||
4517 | |||
/* Fetch the root info for the segment.  If the entire leaf fits
** within ROOT_MAX, then it will be returned directly, otherwise it
** will be flushed and the root info will be returned from the
** interior node.  *piEndBlockid is set to the blockid of the last
** interior or leaf node written to disk (0 if none are written at
** all).
**
** *ppRootInfo points into the writer's internal buffers, so it is
** only valid until the writer is next modified or destroyed.
*/
static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                              char **ppRootInfo, int *pnRootInfo,
                              sqlite_int64 *piEndBlockid){
  /* we can fit the segment entirely inline */
  if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
    *ppRootInfo = pWriter->data.pData;
    *pnRootInfo = pWriter->data.nData;
    *piEndBlockid = 0;
    return SQLITE_OK;
  }

  /* Flush remaining leaf data. */
  if( pWriter->data.nData>0 ){
    int rc = leafWriterFlush(v, pWriter);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* We must have flushed a leaf at some point. */
  assert( pWriter->has_parent );

  /* Tentatively set the end leaf blockid as the end blockid.  If the
  ** interior node can be returned inline, this will be the final
  ** blockid, otherwise it will be overwritten by
  ** interiorWriterRootInfo().
  */
  *piEndBlockid = pWriter->iEndBlockid;

  return interiorWriterRootInfo(v, &pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
4555 | |||
/* Collect the rootInfo data and store it into the segment directory.
** This has the effect of flushing the segment's leaf data to
** %_segments, and also flushing any interior nodes to %_segments.
**
** Returns SQLITE_OK, or an error code from the flush or the
** segment-directory insert.
*/
static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
  sqlite_int64 iEndBlockid;
  char *pRootInfo;
  int rc, nRootInfo;

  rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  /* Don't bother storing an entirely empty segment. */
  if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;

  return segdir_set(v, pWriter->iLevel, pWriter->idx,
                    pWriter->iStartBlockid, pWriter->iEndBlockid,
                    iEndBlockid, pRootInfo, nRootInfo);
}
4575 | |||
4576 | static void leafWriterDestroy(LeafWriter *pWriter){ | ||
4577 | if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter); | ||
4578 | dataBufferDestroy(&pWriter->term); | ||
4579 | dataBufferDestroy(&pWriter->data); | ||
4580 | } | ||
4581 | |||
/* Encode a term into the leafWriter, delta-encoding as appropriate.
** Returns the length of the new term which distinguishes it from the
** previous term, which can be used to set nTermDistinct when a node
** boundary is crossed.
**
** Terms must be presented in strictly ascending order.
*/
static int leafWriterEncodeTerm(LeafWriter *pWriter,
                                const char *pTerm, int nTerm){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  assert( nTerm>0 );
  /* Measure the prefix shared with the previous term. */
  while( nPrefix<pWriter->term.nData &&
         pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
    nPrefix++;
    /* Failing this implies that the terms weren't in order. */
    assert( nPrefix<nTerm );
  }

  if( pWriter->data.nData==0 ){
    /* Encode the node header and leading term as:
    **  varint(0)
    **  varint(nTerm)
    **  char pTerm[nTerm]
    */
    n = putVarint(c, '\0');
    n += putVarint(c+n, nTerm);
    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  }else{
    /* Delta-encode the term as:
    **  varint(nPrefix)
    **  varint(nSuffix)
    **  char pTermSuffix[nSuffix]
    */
    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
    dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  }
  /* Remember this term so the next call can delta against it. */
  dataBufferReplace(&pWriter->term, pTerm, nTerm);

  /* The shared prefix plus one differing byte is enough to
  ** distinguish this term from its predecessor.
  */
  return nPrefix+1;
}
4623 | |||
/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer.  This constructs a node and term header immediately
** before iDoclistData (the offset where the doclist begins in
** pWriter->data) and flushes the resulting complete node using
** leafWriterInternalFlush().
*/
static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 int iDoclistData){
  char c[VARINT_MAX+VARINT_MAX];
  int iData, n = putVarint(c, 0);
  n += putVarint(c+n, nTerm);

  /* There should always be room for the header.  Even if pTerm shared
  ** a substantial prefix with the previous term, the entire prefix
  ** could be constructed from earlier data in the doclist, so there
  ** should be room.
  */
  assert( iDoclistData>=n+nTerm );

  /* Overwrite the bytes just before the doclist with the header and
  ** the full (non-delta-encoded) term.
  */
  iData = iDoclistData-(n+nTerm);
  memcpy(pWriter->data.pData+iData, c, n);
  memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);

  return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
}
4649 | |||
/* Push pTerm[nTerm] along with the merged doclist data from
** pReaders[0..nReaders-1] to the leaf layer of %_segments.
**
** The doclist length is first encoded using a worst-case estimate,
** then patched to the actual length after the merge; oversized nodes
** are flushed as the buffer grows.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = putVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded.  Additionally, the space required to
  ** encode this length could be smaller.  For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = putVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough for force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length.  Shifting the start forward by
    ** n-nActual leaves exactly nActual bytes for the length varint.
    */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
4774 | |||
4775 | /* Push pTerm[nTerm] along with the doclist data to the leaf layer of | ||
4776 | ** %_segments. | ||
4777 | */ | ||
4778 | /* TODO(shess) Revise writeZeroSegment() so that doclists are | ||
4779 | ** constructed directly in pWriter->data. | ||
4780 | */ | ||
4781 | static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter, | ||
4782 | const char *pTerm, int nTerm, | ||
4783 | const char *pData, int nData){ | ||
4784 | int rc; | ||
4785 | DLReader reader; | ||
4786 | |||
4787 | dlrInit(&reader, DL_DEFAULT, pData, nData); | ||
4788 | rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1); | ||
4789 | dlrDestroy(&reader); | ||
4790 | |||
4791 | return rc; | ||
4792 | } | ||
4793 | |||
4794 | |||
/****************************************************************/
/* LeafReader is used to iterate over an individual leaf node. */
typedef struct LeafReader {
  DataBuffer term;          /* copy of current term. */

  const char *pData;        /* data for current term. */
  int nData;                /* bytes remaining in the leaf node. */
} LeafReader;
4803 | |||
4804 | static void leafReaderDestroy(LeafReader *pReader){ | ||
4805 | dataBufferDestroy(&pReader->term); | ||
4806 | SCRAMBLE(pReader); | ||
4807 | } | ||
4808 | |||
4809 | static int leafReaderAtEnd(LeafReader *pReader){ | ||
4810 | return pReader->nData<=0; | ||
4811 | } | ||
4812 | |||
/* Access the current term. */
/* Number of bytes in the current term. */
static int leafReaderTermBytes(LeafReader *pReader){
  return pReader->term.nData;
}
/* Pointer to the current term's bytes (not NUL-terminated). */
static const char *leafReaderTerm(LeafReader *pReader){
  assert( pReader->term.nData>0 );
  return pReader->term.pData;
}
4821 | |||
/* Access the doclist data for the current term. */
/* Number of bytes in the current term's doclist (excluding the
** leading varint length header itself).
*/
static int leafReaderDataBytes(LeafReader *pReader){
  int nData;
  assert( pReader->term.nData>0 );
  getVarint32(pReader->pData, &nData);
  return nData;
}
/* Pointer to the current term's doclist, just past its varint
** length header.
*/
static const char *leafReaderData(LeafReader *pReader){
  int n, nData;
  assert( pReader->term.nData>0 );
  n = getVarint32(pReader->pData, &nData);
  return pReader->pData+n;
}
4835 | |||
/* Initialize pReader to iterate over the leaf node pData[0..nData).
** The node must start with the varint(0) leaf flag; the first term is
** stored in full (not delta-encoded) and is copied into pReader->term.
*/
static void leafReaderInit(const char *pData, int nData,
                           LeafReader *pReader){
  int nTerm, n;

  assert( nData>0 );
  assert( pData[0]=='\0' );

  CLEAR(pReader);

  /* Read the first term, skipping the header byte. */
  n = getVarint32(pData+1, &nTerm);
  dataBufferInit(&pReader->term, nTerm);
  dataBufferReplace(&pReader->term, pData+1+n, nTerm);

  /* Position after the first term. */
  assert( 1+n+nTerm<nData );
  pReader->pData = pData+1+n+nTerm;
  pReader->nData = nData-1-n-nTerm;
}
4855 | |||
/* Step the reader forward to the next term.  Skips the current
** term's doclist, then (unless the node is exhausted) rebuilds the
** next term from its delta encoding.
*/
static void leafReaderStep(LeafReader *pReader){
  int n, nData, nPrefix, nSuffix;
  assert( !leafReaderAtEnd(pReader) );

  /* Skip previous entry's data block. */
  n = getVarint32(pReader->pData, &nData);
  assert( n+nData<=pReader->nData );
  pReader->pData += n+nData;
  pReader->nData -= n+nData;

  if( !leafReaderAtEnd(pReader) ){
    /* Construct the new term using a prefix from the old term plus a
    ** suffix from the leaf data.  Truncating term.nData to nPrefix
    ** keeps the shared prefix already in the buffer.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);
    assert( n+nSuffix<pReader->nData );
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
}
4881 | |||
4882 | /* strcmp-style comparison of pReader's current term against pTerm. | ||
4883 | ** If isPrefix, equality means equal through nTerm bytes. | ||
4884 | */ | ||
4885 | static int leafReaderTermCmp(LeafReader *pReader, | ||
4886 | const char *pTerm, int nTerm, int isPrefix){ | ||
4887 | int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm; | ||
4888 | if( n==0 ){ | ||
4889 | if( pReader->term.nData>0 ) return -1; | ||
4890 | if(nTerm>0 ) return 1; | ||
4891 | return 0; | ||
4892 | } | ||
4893 | |||
4894 | c = memcmp(pReader->term.pData, pTerm, n); | ||
4895 | if( c!=0 ) return c; | ||
4896 | if( isPrefix && n==nTerm ) return 0; | ||
4897 | return pReader->term.nData - nTerm; | ||
4898 | } | ||
4899 | |||
4900 | |||
/****************************************************************/
/* LeavesReader wraps LeafReader to allow iterating over the entire
** leaf layer of the tree, streaming successive leaf blocks from
** %_segments (or reading a single inline root node).
*/
typedef struct LeavesReader {
  int idx;                  /* Index within the segment. */

  sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from. */
  int eof;                  /* we've seen SQLITE_DONE from pStmt. */

  LeafReader leafReader;    /* reader for the current leaf. */
  DataBuffer rootData;      /* root data for inline. */
} LeavesReader;
4914 | |||
/* Access the current term.  Valid only while !eof. */
static int leavesReaderTermBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTermBytes(&pReader->leafReader);
}
static const char *leavesReaderTerm(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTerm(&pReader->leafReader);
}

/* Access the doclist data for the current term.  Valid only while
** !eof.
*/
static int leavesReaderDataBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderDataBytes(&pReader->leafReader);
}
static const char *leavesReaderData(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderData(&pReader->leafReader);
}

/* True once the last leaf of the segment has been consumed. */
static int leavesReaderAtEnd(LeavesReader *pReader){
  return pReader->eof;
}
4938 | |||
/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
** leaving the statement handle open, which locks the table.
*/
/* TODO(shess) This "solution" is not satisfactory.  Really, there
** should be a check-in function for all statement handles which
** arranges to call sqlite3_reset().  This most likely will require
** modification to control flow all over the place, though, so for now
** just punt.
**
** Note that the current system assumes that segment merges will run
** to completion, which is why this particular problem hasn't arisen
** in this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}
4955 | |||
4956 | static void leavesReaderDestroy(LeavesReader *pReader){ | ||
4957 | leafReaderDestroy(&pReader->leafReader); | ||
4958 | dataBufferDestroy(&pReader->rootData); | ||
4959 | SCRAMBLE(pReader); | ||
4960 | } | ||
4961 | |||
/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
**
** NOTE(review): on a bind/step error the statement from
** sql_get_leaf_statement() is returned without being reset, and the
** partially-initialized reader is not destroyed here — presumably
** callers and the statement cache tolerate this; verify.
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      /* An empty block range: the reader starts out at eof. */
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
5006 | |||
/* Step the current leaf forward to the next term.  If we reach the
** end of the current leaf, step forward to the next leaf block.
** Sets eof at the end of the final leaf.
*/
static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
  assert( !leavesReaderAtEnd(pReader) );
  leafReaderStep(&pReader->leafReader);

  if( leafReaderAtEnd(&pReader->leafReader) ){
    int rc;
    /* A non-NULL rootData means the entire segment was inline; there
    ** are no further blocks to stream.
    */
    if( pReader->rootData.pData ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_step(pReader->pStmt);
    if( rc!=SQLITE_ROW ){
      pReader->eof = 1;
      return rc==SQLITE_DONE ? SQLITE_OK : rc;
    }
    /* Re-point the leaf reader at the freshly-fetched block. */
    leafReaderDestroy(&pReader->leafReader);
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
5032 | |||
5033 | /* Order LeavesReaders by their term, ignoring idx. Readers at eof | ||
5034 | ** always sort to the end. | ||
5035 | */ | ||
5036 | static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){ | ||
5037 | if( leavesReaderAtEnd(lr1) ){ | ||
5038 | if( leavesReaderAtEnd(lr2) ) return 0; | ||
5039 | return 1; | ||
5040 | } | ||
5041 | if( leavesReaderAtEnd(lr2) ) return -1; | ||
5042 | |||
5043 | return leafReaderTermCmp(&lr1->leafReader, | ||
5044 | leavesReaderTerm(lr2), leavesReaderTermBytes(lr2), | ||
5045 | 0); | ||
5046 | } | ||
5047 | |||
5048 | /* Similar to leavesReaderTermCmp(), with additional ordering by idx | ||
5049 | ** so that older segments sort before newer segments. | ||
5050 | */ | ||
5051 | static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){ | ||
5052 | int c = leavesReaderTermCmp(lr1, lr2); | ||
5053 | if( c!=0 ) return c; | ||
5054 | return lr1->idx-lr2->idx; | ||
5055 | } | ||
5056 | |||
5057 | /* Assume that pLr[1]..pLr[nLr] are sorted. Bubble pLr[0] into its | ||
5058 | ** sorted position. | ||
5059 | */ | ||
5060 | static void leavesReaderReorder(LeavesReader *pLr, int nLr){ | ||
5061 | while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){ | ||
5062 | LeavesReader tmp = pLr[0]; | ||
5063 | pLr[0] = pLr[1]; | ||
5064 | pLr[1] = tmp; | ||
5065 | nLr--; | ||
5066 | pLr++; | ||
5067 | } | ||
5068 | } | ||
5069 | |||
/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order (by term, then by age/idx).  On error, any readers already
** initialized are destroyed before returning.
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  if( rc!=SQLITE_DONE ){
    /* Unwind the readers initialized so far. */
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age.  Working from the
  ** back, each pass bubbles one more reader into place.
  */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}
5112 | |||
/* Merge doclists from pReaders[0..nReaders-1] (which must all be
** positioned at the same term) into a single doclist, which is
** written to pWriter.  Assumes pReaders is ordered oldest to newest,
** so later readers' entries override earlier ones on docid ties.
*/
/* TODO(shess) Consider putting this inline in segmentMerge(). */
static int leavesReadersMerge(fulltext_vtab *v,
                              LeavesReader *pReaders, int nReaders,
                              LeafWriter *pWriter){
  DLReader dlReaders[MERGE_COUNT];
  const char *pTerm = leavesReaderTerm(pReaders);
  int i, nTerm = leavesReaderTermBytes(pReaders);

  assert( nReaders<=MERGE_COUNT );

  for(i=0; i<nReaders; i++){
    dlrInit(&dlReaders[i], DL_DEFAULT,
            leavesReaderData(pReaders+i),
            leavesReaderDataBytes(pReaders+i));
  }

  return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
}
5135 | |||
5136 | /* Forward ref due to mutual recursion with segdirNextIndex(). */ | ||
5137 | static int segmentMerge(fulltext_vtab *v, int iLevel); | ||
5138 | |||
/* Put the next available index at iLevel into *pidx.  If iLevel
** already has MERGE_COUNT segments, they are merged to a higher
** level to make room (mutually recursive with segmentMerge()).
*/
static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
  int rc = segdir_max_index(v, iLevel, pidx);
  if( rc==SQLITE_DONE ){              /* No segments at iLevel. */
    *pidx = 0;
  }else if( rc==SQLITE_ROW ){
    if( *pidx==(MERGE_COUNT-1) ){
      /* Level full; merge it up to free all MERGE_COUNT slots. */
      rc = segmentMerge(v, iLevel);
      if( rc!=SQLITE_OK ) return rc;
      *pidx = 0;
    }else{
      (*pidx)++;
    }
  }else{
    return rc;
  }
  return SQLITE_OK;
}
5160 | |||
/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged to make room.  On success the merged input segments are
** deleted from the segment directory.
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term. */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged. */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}
5228 | |||
/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
**
** Relies on the leaf layer being sorted by term: iteration stops as
** soon as the reader's term sorts past pTerm.
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  assert( nTerm>0 );

  /* Process while the prefix matches. */
  while( !leavesReaderAtEnd(pReader) ){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int rc;
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int nData = leavesReaderDataBytes(pReader);
      if( out->nData==0 ){
        /* First match: just take the doclist wholesale. */
        dataBufferReplace(out, pData, nData);
      }else{
        DataBuffer result;
        dataBufferInit(&result, out->nData+nData);
        docListUnion(out->pData, out->nData, pData, nData, &result);
        dataBufferDestroy(out);
        *out = result;
        /* TODO(shess) Rather than destroy out, we could retain it for
        ** later reuse.
        */
      }
    }
    if( c>0 ) break;              /* Past any possible matches. */

    rc = leavesReaderStep(v, pReader);
    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}
5270 | |||
5271 | /* Call loadSegmentLeavesInt() with pData/nData as input. */ | ||
5272 | static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData, | ||
5273 | const char *pTerm, int nTerm, int isPrefix, | ||
5274 | DataBuffer *out){ | ||
5275 | LeavesReader reader; | ||
5276 | int rc; | ||
5277 | |||
5278 | assert( nData>1 ); | ||
5279 | assert( *pData=='\0' ); | ||
5280 | rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader); | ||
5281 | if( rc!=SQLITE_OK ) return rc; | ||
5282 | |||
5283 | rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); | ||
5284 | leavesReaderReset(&reader); | ||
5285 | leavesReaderDestroy(&reader); | ||
5286 | return rc; | ||
5287 | } | ||
5288 | |||
5289 | /* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to | ||
5290 | ** iEndLeaf (inclusive) as input, and merge the resulting doclist into | ||
5291 | ** out. | ||
5292 | */ | ||
5293 | static int loadSegmentLeaves(fulltext_vtab *v, | ||
5294 | sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf, | ||
5295 | const char *pTerm, int nTerm, int isPrefix, | ||
5296 | DataBuffer *out){ | ||
5297 | int rc; | ||
5298 | LeavesReader reader; | ||
5299 | |||
5300 | assert( iStartLeaf<=iEndLeaf ); | ||
5301 | rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader); | ||
5302 | if( rc!=SQLITE_OK ) return rc; | ||
5303 | |||
5304 | rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); | ||
5305 | leavesReaderReset(&reader); | ||
5306 | leavesReaderDestroy(&reader); | ||
5307 | return rc; | ||
5308 | } | ||
5309 | |||
/* Taking pData/nData as an interior node, find the sequence of child
** nodes which could include pTerm/nTerm/isPrefix.  Note that the
** interior node terms logically come between the blocks, so there is
** one more blockid than there are terms (that block contains terms >=
** the last interior-node term).
*/
/* TODO(shess) The calling code may already know that the end child is
** not worth calculating, because the end may be in a later sibling
** node.  Consider whether breaking symmetry is worthwhile.  I suspect
** it's not worthwhile.
*/
static void getChildrenContaining(const char *pData, int nData,
                                  const char *pTerm, int nTerm, int isPrefix,
                                  sqlite_int64 *piStartChild,
                                  sqlite_int64 *piEndChild){
  InteriorReader reader;

  assert( nData>1 );
  assert( *pData!='\0' );   /* a leading NUL byte would mark a leaf node */
  interiorReaderInit(pData, nData, &reader);

  /* Scan for the first child which could contain pTerm/nTerm.  Note
  ** that this first scan always uses an exact (non-prefix) compare.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piStartChild = interiorReaderCurrentBlockid(&reader);

  /* Keep scanning to find a term greater than our term, using prefix
  ** comparison if indicated.  If isPrefix is false, this will be the
  ** same blockid as the starting block.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piEndChild = interiorReaderCurrentBlockid(&reader);

  interiorReaderDestroy(&reader);

  /* Children must ascend, and if !prefix, both must be the same. */
  assert( *piEndChild>=*piStartChild );
  assert( isPrefix || *piStartChild==*piEndChild );
}
5354 | |||
/* Read block at iBlockid and pass it with other params to
** getChildrenContaining().
**
** Returns SQLITE_ERROR if the block row does not exist or if more
** than one row matches; otherwise SQLITE_OK or an error from the
** statement calls.
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;   /* requested block missing */
  if( rc!=SQLITE_ROW ) return rc;
  /* NOTE(review): the early returns above leave s un-reset; presumably
  ** sql_get_statement() resets cached statements on reuse — confirm
  ** against its definition.
  */

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain
   * locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}
5395 | |||
/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf (leaf nodes start with a NUL byte). */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    ** Blockids greater than iLeavesEnd refer to interior nodes which
    ** must be fetched and descended through; the rest are leaves.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too. */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}
5442 | |||
/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge its doclist over *out (any duplicate doclists read from the
** segment rooted at pData will overwrite those in *out).
*/
/* TODO(shess) Consider changing this to determine the depth of the
** leaves using either the first characters of interior nodes (when
** ==1, we're one level above the leaves), or the first character of
** the root (which will describe the height of the tree directly).
** Either feels somewhat tricky to me.
*/
/* TODO(shess) The current merge is likely to be slow for large
** doclists (though it should process from newest/smallest to
** oldest/largest, so it may not be that bad).  It might be useful to
** modify things to allow for N-way merging.  This could either be
** within a segment, with pairwise merges across segments, or across
** all segments at once.
*/
static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
                       DataBuffer *out){
  DataBuffer result;
  int rc;

  assert( nData>1 );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&result, 0);
  rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                      pTerm, nTerm, isPrefix, &result);
  if( rc==SQLITE_OK && result.nData>0 ){
    if( out->nData==0 ){
      /* *out is empty: steal result's buffer by swapping, so the
      ** dataBufferDestroy() below frees the (empty) old *out buffer.
      */
      DataBuffer tmp = *out;
      *out = result;
      result = tmp;
    }else{
      /* Merge, letting this (newer) segment's entries win. */
      DataBuffer merged;
      DLReader readers[2];

      dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
      dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
      dataBufferInit(&merged, out->nData+result.nData);
      docListMerge(&merged, readers, 2);
      dataBufferDestroy(out);
      *out = merged;
      dlrDestroy(&readers[0]);
      dlrDestroy(&readers[1]);
    }
  }
  dataBufferDestroy(&result);
  return rc;
}
5497 | |||
/* Scan the database and merge together the posting lists for the term
** into *out.
**
** iColumn==v->nColumn is treated as "all columns" (mapped to -1 for
** docListTrim()); iType selects how much doclist detail is retained.
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out){
  DataBuffer doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&doclist, 0);

  /* Traverse the segments from oldest to newest so that newer doclist
  ** elements for given docids overwrite older elements.
  */
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    const char *pData = sqlite3_column_blob(s, 0);
    const int nData = sqlite3_column_bytes(s, 0);
    const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
    rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                     &doclist);
    if( rc!=SQLITE_OK ) goto err;
  }
  if( rc==SQLITE_DONE ){
    if( doclist.nData!=0 ){
      /* TODO(shess) The old term_select_all() code applied the column
      ** restrict as we merged segments, leading to smaller buffers.
      ** This is probably worthwhile to bring back, once the new storage
      ** system is checked in.
      */
      if( iColumn==v->nColumn) iColumn = -1;
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  iColumn, iType, out);
    }
    rc = SQLITE_OK;
  }

 err:
  /* NOTE(review): s is not reset on the error path; presumably
  ** sql_get_statement() recovers on reuse — confirm.
  */
  dataBufferDestroy(&doclist);
  return rc;
}
5543 | |||
/****************************************************************/
/* Used to hold hashtable data for sorting. */
typedef struct TermData {
  const char *pTerm;        /* term bytes (hash-table key; not NUL-terminated) */
  int nTerm;                /* number of bytes in pTerm */
  DLCollector *pCollector;  /* doclist accumulated for this term */
} TermData;
5551 | |||
5552 | /* Orders TermData elements in strcmp fashion ( <0 for less-than, 0 | ||
5553 | ** for equal, >0 for greater-than). | ||
5554 | */ | ||
5555 | static int termDataCmp(const void *av, const void *bv){ | ||
5556 | const TermData *a = (const TermData *)av; | ||
5557 | const TermData *b = (const TermData *)bv; | ||
5558 | int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm; | ||
5559 | int c = memcmp(a->pTerm, b->pTerm, n); | ||
5560 | if( c!=0 ) return c; | ||
5561 | return a->nTerm-b->nTerm; | ||
5562 | } | ||
5563 | |||
5564 | /* Order pTerms data by term, then write a new level 0 segment using | ||
5565 | ** LeafWriter. | ||
5566 | */ | ||
5567 | static int writeZeroSegment(fulltext_vtab *v, fts3Hash *pTerms){ | ||
5568 | fts3HashElem *e; | ||
5569 | int idx, rc, i, n; | ||
5570 | TermData *pData; | ||
5571 | LeafWriter writer; | ||
5572 | DataBuffer dl; | ||
5573 | |||
5574 | /* Determine the next index at level 0, merging as necessary. */ | ||
5575 | rc = segdirNextIndex(v, 0, &idx); | ||
5576 | if( rc!=SQLITE_OK ) return rc; | ||
5577 | |||
5578 | n = fts3HashCount(pTerms); | ||
5579 | pData = malloc(n*sizeof(TermData)); | ||
5580 | |||
5581 | for(i = 0, e = fts3HashFirst(pTerms); e; i++, e = fts3HashNext(e)){ | ||
5582 | assert( i<n ); | ||
5583 | pData[i].pTerm = fts3HashKey(e); | ||
5584 | pData[i].nTerm = fts3HashKeysize(e); | ||
5585 | pData[i].pCollector = fts3HashData(e); | ||
5586 | } | ||
5587 | assert( i==n ); | ||
5588 | |||
5589 | /* TODO(shess) Should we allow user-defined collation sequences, | ||
5590 | ** here? I think we only need that once we support prefix searches. | ||
5591 | */ | ||
5592 | if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp); | ||
5593 | |||
5594 | /* TODO(shess) Refactor so that we can write directly to the segment | ||
5595 | ** DataBuffer, as happens for segment merges. | ||
5596 | */ | ||
5597 | leafWriterInit(0, idx, &writer); | ||
5598 | dataBufferInit(&dl, 0); | ||
5599 | for(i=0; i<n; i++){ | ||
5600 | dataBufferReset(&dl); | ||
5601 | dlcAddDoclist(pData[i].pCollector, &dl); | ||
5602 | rc = leafWriterStep(v, &writer, | ||
5603 | pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData); | ||
5604 | if( rc!=SQLITE_OK ) goto err; | ||
5605 | } | ||
5606 | rc = leafWriterFinalize(v, &writer); | ||
5607 | |||
5608 | err: | ||
5609 | dataBufferDestroy(&dl); | ||
5610 | free(pData); | ||
5611 | leafWriterDestroy(&writer); | ||
5612 | return rc; | ||
5613 | } | ||
5614 | |||
5615 | /* If pendingTerms has data, free it. */ | ||
5616 | static int clearPendingTerms(fulltext_vtab *v){ | ||
5617 | if( v->nPendingData>=0 ){ | ||
5618 | fts3HashElem *e; | ||
5619 | for(e=fts3HashFirst(&v->pendingTerms); e; e=fts3HashNext(e)){ | ||
5620 | dlcDelete(fts3HashData(e)); | ||
5621 | } | ||
5622 | fts3HashClear(&v->pendingTerms); | ||
5623 | v->nPendingData = -1; | ||
5624 | } | ||
5625 | return SQLITE_OK; | ||
5626 | } | ||
5627 | |||
5628 | /* If pendingTerms has data, flush it to a level-zero segment, and | ||
5629 | ** free it. | ||
5630 | */ | ||
5631 | static int flushPendingTerms(fulltext_vtab *v){ | ||
5632 | if( v->nPendingData>=0 ){ | ||
5633 | int rc = writeZeroSegment(v, &v->pendingTerms); | ||
5634 | if( rc==SQLITE_OK ) clearPendingTerms(v); | ||
5635 | return rc; | ||
5636 | } | ||
5637 | return SQLITE_OK; | ||
5638 | } | ||
5639 | |||
/* If pendingTerms is "too big", or docid is out of order, flush it.
** Regardless, be certain that pendingTerms is initialized for use.
*/
static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
  /* TODO(shess) Explore whether partially flushing the buffer on
  ** forced-flush would provide better performance.  I suspect that if
  ** we ordered the doclists by size and flushed the largest until the
  ** buffer was half empty, that would let the less frequent terms
  ** generate longer doclists.
  */
  /* Flush when the incoming docid is not strictly ascending
  ** (presumably doclists must be appended in docid order — confirm),
  ** or when the buffer exceeds kPendingThreshold.
  */
  if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
    int rc = flushPendingTerms(v);
    if( rc!=SQLITE_OK ) return rc;
  }
  if( v->nPendingData<0 ){
    /* Buffer not currently in use; set it up fresh. */
    fts3HashInit(&v->pendingTerms, FTS3_HASH_STRING, 1);
    v->nPendingData = 0;
  }
  v->iPrevDocid = iDocid;
  return SQLITE_OK;
}
5661 | |||
/* This function implements the xUpdate callback; it's the top-level entry
 * point for inserting, deleting or updating a row in a full-text table. */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                          sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  int rc;

  TRACE(("FTS3 Update %p\n", pVtab));

  if( nArg<2 ){
    /* A delete: ppArg[0] holds the rowid to remove. */
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     * ppArg[2+v->nColumn+1] = value for docid
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
        sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    }else if( sqlite3_value_type(ppArg[2+v->nColumn+1]) != SQLITE_INTEGER ||
              sqlite3_value_int64(ppArg[2+v->nColumn+1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the docid */
    }else{
      assert( nArg==2+v->nColumn+2);
      rc = index_update(v, rowid, &ppArg[2]);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     * ppArg[2+v->nColumn+1] = value for docid
     */
    sqlite3_value *pRequestDocid = ppArg[2+v->nColumn+1];
    assert( nArg==2+v->nColumn+2);
    if( SQLITE_NULL != sqlite3_value_type(pRequestDocid) &&
        SQLITE_NULL != sqlite3_value_type(ppArg[1]) ){
      /* TODO(shess) Consider allowing this to work if the values are
      ** identical.  I'm inclined to discourage that usage, though,
      ** given that both rowid and docid are special columns.  Better
      ** would be to define one or the other as the default winner,
      ** but should it be fts3-centric (docid) or SQLite-centric
      ** (rowid)?
      */
      rc = SQLITE_ERROR;  /* can't specify both rowid and docid */
    }else{
      /* Prefer the explicit docid; fall back to the requested rowid. */
      if( SQLITE_NULL == sqlite3_value_type(pRequestDocid) ){
        pRequestDocid = ppArg[1];
      }
      rc = index_insert(v, pRequestDocid, &ppArg[2], pRowid);
    }
  }

  return rc;
}
5721 | |||
5722 | static int fulltextSync(sqlite3_vtab *pVtab){ | ||
5723 | TRACE(("FTS3 xSync()\n")); | ||
5724 | return flushPendingTerms((fulltext_vtab *)pVtab); | ||
5725 | } | ||
5726 | |||
5727 | static int fulltextBegin(sqlite3_vtab *pVtab){ | ||
5728 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5729 | TRACE(("FTS3 xBegin()\n")); | ||
5730 | |||
5731 | /* Any buffered updates should have been cleared by the previous | ||
5732 | ** transaction. | ||
5733 | */ | ||
5734 | assert( v->nPendingData<0 ); | ||
5735 | return clearPendingTerms(v); | ||
5736 | } | ||
5737 | |||
5738 | static int fulltextCommit(sqlite3_vtab *pVtab){ | ||
5739 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5740 | TRACE(("FTS3 xCommit()\n")); | ||
5741 | |||
5742 | /* Buffered updates should have been cleared by fulltextSync(). */ | ||
5743 | assert( v->nPendingData<0 ); | ||
5744 | return clearPendingTerms(v); | ||
5745 | } | ||
5746 | |||
5747 | static int fulltextRollback(sqlite3_vtab *pVtab){ | ||
5748 | TRACE(("FTS3 xRollback()\n")); | ||
5749 | return clearPendingTerms((fulltext_vtab *)pVtab); | ||
5750 | } | ||
5751 | |||
5752 | /* | ||
5753 | ** Implementation of the snippet() function for FTS3 | ||
5754 | */ | ||
5755 | static void snippetFunc( | ||
5756 | sqlite3_context *pContext, | ||
5757 | int argc, | ||
5758 | sqlite3_value **argv | ||
5759 | ){ | ||
5760 | fulltext_cursor *pCursor; | ||
5761 | if( argc<1 ) return; | ||
5762 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
5763 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
5764 | sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1); | ||
5765 | }else{ | ||
5766 | const char *zStart = "<b>"; | ||
5767 | const char *zEnd = "</b>"; | ||
5768 | const char *zEllipsis = "<b>...</b>"; | ||
5769 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
5770 | if( argc>=2 ){ | ||
5771 | zStart = (const char*)sqlite3_value_text(argv[1]); | ||
5772 | if( argc>=3 ){ | ||
5773 | zEnd = (const char*)sqlite3_value_text(argv[2]); | ||
5774 | if( argc>=4 ){ | ||
5775 | zEllipsis = (const char*)sqlite3_value_text(argv[3]); | ||
5776 | } | ||
5777 | } | ||
5778 | } | ||
5779 | snippetAllOffsets(pCursor); | ||
5780 | snippetText(pCursor, zStart, zEnd, zEllipsis); | ||
5781 | sqlite3_result_text(pContext, pCursor->snippet.zSnippet, | ||
5782 | pCursor->snippet.nSnippet, SQLITE_STATIC); | ||
5783 | } | ||
5784 | } | ||
5785 | |||
5786 | /* | ||
5787 | ** Implementation of the offsets() function for FTS3 | ||
5788 | */ | ||
5789 | static void snippetOffsetsFunc( | ||
5790 | sqlite3_context *pContext, | ||
5791 | int argc, | ||
5792 | sqlite3_value **argv | ||
5793 | ){ | ||
5794 | fulltext_cursor *pCursor; | ||
5795 | if( argc<1 ) return; | ||
5796 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
5797 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
5798 | sqlite3_result_error(pContext, "illegal first argument to offsets",-1); | ||
5799 | }else{ | ||
5800 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
5801 | snippetAllOffsets(pCursor); | ||
5802 | snippetOffsetText(&pCursor->snippet); | ||
5803 | sqlite3_result_text(pContext, | ||
5804 | pCursor->snippet.zOffset, pCursor->snippet.nOffset, | ||
5805 | SQLITE_STATIC); | ||
5806 | } | ||
5807 | } | ||
5808 | |||
5809 | /* | ||
5810 | ** This routine implements the xFindFunction method for the FTS3 | ||
5811 | ** virtual table. | ||
5812 | */ | ||
5813 | static int fulltextFindFunction( | ||
5814 | sqlite3_vtab *pVtab, | ||
5815 | int nArg, | ||
5816 | const char *zName, | ||
5817 | void (**pxFunc)(sqlite3_context*,int,sqlite3_value**), | ||
5818 | void **ppArg | ||
5819 | ){ | ||
5820 | if( strcmp(zName,"snippet")==0 ){ | ||
5821 | *pxFunc = snippetFunc; | ||
5822 | return 1; | ||
5823 | }else if( strcmp(zName,"offsets")==0 ){ | ||
5824 | *pxFunc = snippetOffsetsFunc; | ||
5825 | return 1; | ||
5826 | } | ||
5827 | return 0; | ||
5828 | } | ||
5829 | |||
5830 | /* | ||
5831 | ** Rename an fts3 table. | ||
5832 | */ | ||
5833 | static int fulltextRename( | ||
5834 | sqlite3_vtab *pVtab, | ||
5835 | const char *zName | ||
5836 | ){ | ||
5837 | fulltext_vtab *p = (fulltext_vtab *)pVtab; | ||
5838 | int rc = SQLITE_NOMEM; | ||
5839 | char *zSql = sqlite3_mprintf( | ||
5840 | "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';" | ||
5841 | "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';" | ||
5842 | "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';" | ||
5843 | , p->zDb, p->zName, zName | ||
5844 | , p->zDb, p->zName, zName | ||
5845 | , p->zDb, p->zName, zName | ||
5846 | ); | ||
5847 | if( zSql ){ | ||
5848 | rc = sqlite3_exec(p->db, zSql, 0, 0, 0); | ||
5849 | sqlite3_free(zSql); | ||
5850 | } | ||
5851 | return rc; | ||
5852 | } | ||
5853 | |||
/* Virtual-table method dispatch structure registered for "fts3"
** tables by sqlite3Fts3Init().
*/
static const sqlite3_module fts3Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename */       fulltextRename,
};
5876 | |||
5877 | static void hashDestroy(void *p){ | ||
5878 | fts3Hash *pHash = (fts3Hash *)p; | ||
5879 | sqlite3Fts3HashClear(pHash); | ||
5880 | sqlite3_free(pHash); | ||
5881 | } | ||
5882 | |||
5883 | /* | ||
5884 | ** The fts3 built-in tokenizers - "simple" and "porter" - are implemented | ||
5885 | ** in files fts3_tokenizer1.c and fts3_porter.c respectively. The following | ||
5886 | ** two forward declarations are for functions declared in these files | ||
5887 | ** used to retrieve the respective implementations. | ||
5888 | ** | ||
5889 | ** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed | ||
** to by the argument to point at the "simple" tokenizer implementation.
5891 | ** Function ...PorterTokenizerModule() sets *pModule to point to the | ||
5892 | ** porter tokenizer/stemmer implementation. | ||
5893 | */ | ||
5894 | void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
5895 | void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
5896 | void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
5897 | |||
5898 | int sqlite3Fts3InitHashTable(sqlite3 *, fts3Hash *, const char *); | ||
5899 | |||
/*
** Initialise the fts3 extension.  If this extension is built as part
** of the sqlite library, then this function is called directly by
** SQLite.  If fts3 is built as a dynamically loadable extension, this
** function is called by the sqlite3_extension_init() entry point.
*/
int sqlite3Fts3Init(sqlite3 *db){
  int rc = SQLITE_OK;
  fts3Hash *pHash = 0;
  const sqlite3_tokenizer_module *pSimple = 0;
  const sqlite3_tokenizer_module *pPorter = 0;
  const sqlite3_tokenizer_module *pIcu = 0;

  sqlite3Fts3SimpleTokenizerModule(&pSimple);
  sqlite3Fts3PorterTokenizerModule(&pPorter);
#ifdef SQLITE_ENABLE_ICU
  sqlite3Fts3IcuTokenizerModule(&pIcu);
#endif

  /* Allocate and initialise the hash-table used to store tokenizers. */
  pHash = sqlite3_malloc(sizeof(fts3Hash));
  if( !pHash ){
    rc = SQLITE_NOMEM;
  }else{
    sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
  }

  /* Load the built-in tokenizers into the hash table */
  if( rc==SQLITE_OK ){
    /* fts3HashInsert returns non-NULL (the displaced value) or signals
    ** OOM; either way a truthy result here is treated as failure.
    */
    if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
     || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
     || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
    ){
      rc = SQLITE_NOMEM;
    }
  }

  /* Create the virtual table wrapper around the hash-table and overload
  ** the two scalar functions.  If this is successful, register the
  ** module with sqlite.  hashDestroy() then owns pHash.
  */
  if( SQLITE_OK==rc
   && SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
  ){
    return sqlite3_create_module_v2(
        db, "fts3", &fts3Module, (void *)pHash, hashDestroy
    );
  }

  /* An error has occurred. Delete the hash table and return the error code. */
  assert( rc!=SQLITE_OK );
  if( pHash ){
    sqlite3Fts3HashClear(pHash);
    sqlite3_free(pHash);
  }
  return rc;
}
5959 | |||
#if !SQLITE_CORE
/* Loadable-extension entry point: invoked by SQLite when fts3 is
** loaded dynamically; forwards to sqlite3Fts3Init().
*/
int sqlite3_extension_init(
  sqlite3 *db,
  char **pzErrMsg,
  const sqlite3_api_routines *pApi
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts3Init(db);
}
#endif
5970 | |||
5971 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h deleted file mode 100644 index c1aa8ca..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h +++ /dev/null | |||
@@ -1,26 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 Oct 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ****************************************************************************** | ||
12 | ** | ||
13 | ** This header file is used by programs that want to link against the | ||
14 | ** FTS3 library. All it does is declare the sqlite3Fts3Init() interface. | ||
15 | */ | ||
16 | #include "sqlite3.h" | ||
17 | |||
18 | #ifdef __cplusplus | ||
19 | extern "C" { | ||
20 | #endif /* __cplusplus */ | ||
21 | |||
22 | int sqlite3Fts3Init(sqlite3 *db); | ||
23 | |||
24 | #ifdef __cplusplus | ||
25 | } /* extern "C" */ | ||
26 | #endif /* __cplusplus */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c deleted file mode 100644 index b14511a..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c +++ /dev/null | |||
@@ -1,373 +0,0 @@ | |||
1 | /* | ||
2 | ** 2001 September 22 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** This is the implementation of generic hash-tables used in SQLite. | ||
13 | ** We've modified it slightly to serve as a standalone hash table | ||
14 | ** implementation for the full-text indexing module. | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | ** The code in this file is only compiled if: | ||
19 | ** | ||
20 | ** * The FTS3 module is being built as an extension | ||
21 | ** (in which case SQLITE_CORE is not defined), or | ||
22 | ** | ||
23 | ** * The FTS3 module is being built into the core of | ||
24 | ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | ||
25 | */ | ||
26 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | ||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <string.h> | ||
31 | |||
32 | #include "fts3_hash.h" | ||
33 | |||
/*
** Allocate an n-byte block from the SQLite allocator and zero it.
** Returns NULL if the allocation fails.
*/
static void *fts3HashMalloc(int n){
  void *pNew = sqlite3_malloc(n);
  if( pNew==0 ) return 0;
  memset(pNew, 0, n);
  return pNew;
}
/* Release memory obtained from fts3HashMalloc().  NULL is a no-op. */
static void fts3HashFree(void *p){
  sqlite3_free(p);
}
47 | |||
/* Turn bulk memory into a hash table object by initializing the
** fields of the Hash structure.
**
** "pNew" is a pointer to the hash table that is to be initialized.
** keyClass is one of the constants
** FTS3_HASH_BINARY or FTS3_HASH_STRING.  The value of keyClass
** determines what kind of key the hash table will use.  "copyKey" is
** true if the hash table should make its own private copy of keys and
** false if it should just use the supplied pointer.
*/
void sqlite3Fts3HashInit(fts3Hash *pNew, int keyClass, int copyKey){
  assert( pNew!=0 );
  assert( keyClass>=FTS3_HASH_STRING && keyClass<=FTS3_HASH_BINARY );
  pNew->keyClass = keyClass;
  pNew->copyKey = copyKey;
  pNew->first = 0;     /* global element list starts empty */
  pNew->count = 0;     /* no entries yet */
  pNew->htsize = 0;    /* bucket array is allocated lazily on first insert */
  pNew->ht = 0;
}
68 | |||
/* Remove all entries from a hash table.  Reclaim all memory.
** Call this routine to delete a hash table or to reset a hash table
** to the empty state.  The table may be reused after this call.
*/
void sqlite3Fts3HashClear(fts3Hash *pH){
  fts3HashElem *elem;         /* For looping over all elements of the table */

  assert( pH!=0 );
  elem = pH->first;
  pH->first = 0;
  fts3HashFree(pH->ht);       /* free the bucket array */
  pH->ht = 0;
  pH->htsize = 0;
  /* Walk the global element list, freeing each element and, if this
  ** table owns its keys (copyKey), the key copy as well. */
  while( elem ){
    fts3HashElem *next_elem = elem->next;
    if( pH->copyKey && elem->pKey ){
      fts3HashFree(elem->pKey);
    }
    fts3HashFree(elem);
    elem = next_elem;
  }
  pH->count = 0;
}
92 | |||
/*
** Hash function used when the mode is FTS3_HASH_STRING.
**
** Case-sensitive.  If nKey is non-positive the key is treated as a
** nul-terminated string and its length is computed with strlen().
** The result is always non-negative.
*/
static int fts3StrHash(const void *pKey, int nKey){
  const char *zKey = (const char *)pKey;
  int h = 0;
  int i;
  if( nKey<=0 ) nKey = (int) strlen(zKey);
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ zKey[i];
  }
  return h & 0x7fffffff;
}
/*
** Comparison function used when the mode is FTS3_HASH_STRING.
** Returns 0 only when both keys have the same length and content.
*/
static int fts3StrCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  if( n1==n2 ){
    return strncmp((const char*)pKey1, (const char*)pKey2, n1);
  }
  return 1;   /* different lengths can never match */
}
110 | |||
/*
** Hash function used when the mode is FTS3_HASH_BINARY.  The key is
** exactly nKey bytes of binary data; the result is non-negative.
*/
static int fts3BinHash(const void *pKey, int nKey){
  const char *z = (const char *)pKey;
  int h = 0;
  int i;
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ z[i];
  }
  return h & 0x7fffffff;
}
/*
** Comparison function used when the mode is FTS3_HASH_BINARY.
** Returns 0 only when both keys have the same length and bytes.
*/
static int fts3BinCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1==n2) ? memcmp(pKey1, pKey2, n1) : 1;
}
126 | |||
127 | /* | ||
128 | ** Return a pointer to the appropriate hash function given the key class. | ||
129 | ** | ||
130 | ** The C syntax in this function definition may be unfamilar to some | ||
131 | ** programmers, so we provide the following additional explanation: | ||
132 | ** | ||
133 | ** The name of the function is "hashFunction". The function takes a | ||
134 | ** single parameter "keyClass". The return value of hashFunction() | ||
135 | ** is a pointer to another function. Specifically, the return value | ||
136 | ** of hashFunction() is a pointer to a function that takes two parameters | ||
137 | ** with types "const void*" and "int" and returns an "int". | ||
138 | */ | ||
139 | static int (*hashFunction(int keyClass))(const void*,int){ | ||
140 | if( keyClass==FTS3_HASH_STRING ){ | ||
141 | return &fts3StrHash; | ||
142 | }else{ | ||
143 | assert( keyClass==FTS3_HASH_BINARY ); | ||
144 | return &fts3BinHash; | ||
145 | } | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | ** Return a pointer to the appropriate hash function given the key class. | ||
150 | ** | ||
151 | ** For help in interpreted the obscure C code in the function definition, | ||
152 | ** see the header comment on the previous function. | ||
153 | */ | ||
154 | static int (*compareFunction(int keyClass))(const void*,int,const void*,int){ | ||
155 | if( keyClass==FTS3_HASH_STRING ){ | ||
156 | return &fts3StrCompare; | ||
157 | }else{ | ||
158 | assert( keyClass==FTS3_HASH_BINARY ); | ||
159 | return &fts3BinCompare; | ||
160 | } | ||
161 | } | ||
162 | |||
/* Link an element into the hash table.
**
** pNew is made the new head of bucket pEntry's chain.  All elements of
** the table also live on a single doubly-linked list rooted at
** pH->first, and each bucket's chain is a contiguous run of that list;
** so pNew must be spliced into the global list immediately before the
** bucket's current head to keep that invariant.
*/
static void fts3HashInsertElement(
  fts3Hash *pH,            /* The complete hash table */
  struct _fts3ht *pEntry,  /* The entry into which pNew is inserted */
  fts3HashElem *pNew       /* The element to be inserted */
){
  fts3HashElem *pHead;     /* First element already in pEntry */
  pHead = pEntry->chain;
  if( pHead ){
    /* Bucket non-empty: insert pNew just before the current head. */
    pNew->next = pHead;
    pNew->prev = pHead->prev;
    if( pHead->prev ){ pHead->prev->next = pNew; }
    else             { pH->first = pNew; }
    pHead->prev = pNew;
  }else{
    /* Bucket empty: push pNew onto the front of the global list. */
    pNew->next = pH->first;
    if( pH->first ){ pH->first->prev = pNew; }
    pNew->prev = 0;
    pH->first = pNew;
  }
  pEntry->count++;
  pEntry->chain = pNew;    /* pNew is the new head of this bucket */
}
187 | |||
188 | |||
/* Resize the hash table so that it contains "new_size" buckets.
** "new_size" must be a power of 2.  The hash table might fail
** to resize if memory allocation fails; in that case the old
** bucket array is kept and the table remains fully usable.
*/
static void fts3Rehash(fts3Hash *pH, int new_size){
  struct _fts3ht *new_ht;          /* The new hash table */
  fts3HashElem *elem, *next_elem;  /* For looping over existing elements */
  int (*xHash)(const void*,int);   /* The hash function */

  assert( (new_size & (new_size-1))==0 );   /* power of two */
  new_ht = (struct _fts3ht *)fts3HashMalloc( new_size*sizeof(struct _fts3ht) );
  if( new_ht==0 ) return;          /* allocation failed: keep old table */
  fts3HashFree(pH->ht);
  pH->ht = new_ht;
  pH->htsize = new_size;
  xHash = hashFunction(pH->keyClass);
  /* Re-bucket every element.  pH->first is reset here and rebuilt by
  ** fts3HashInsertElement as each element is re-linked. */
  for(elem=pH->first, pH->first=0; elem; elem = next_elem){
    int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
    next_elem = elem->next;
    fts3HashInsertElement(pH, &new_ht[h], elem);
  }
}
211 | |||
/* This function (for internal use only) locates an element in an
** hash table that matches the given key.  The hash for this key has
** already been computed and is passed as the 4th parameter (it must
** already be reduced modulo the bucket count).
*/
static fts3HashElem *fts3FindElementByHash(
  const fts3Hash *pH, /* The pH to be searched */
  const void *pKey,   /* The key we are searching for */
  int nKey,           /* Number of bytes in the key */
  int h               /* The hash for this key. */
){
  fts3HashElem *elem;            /* Used to loop thru the element list */
  int count;                     /* Number of elements left to test */
  int (*xCompare)(const void*,int,const void*,int);  /* comparison function */

  if( pH->ht ){
    struct _fts3ht *pEntry = &pH->ht[h];
    elem = pEntry->chain;
    /* Bucket chains are contiguous runs of the global list, so walk at
    ** most pEntry->count links starting from the bucket head. */
    count = pEntry->count;
    xCompare = compareFunction(pH->keyClass);
    while( count-- && elem ){
      if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
        return elem;
      }
      elem = elem->next;
    }
  }
  return 0;                      /* not found, or table is empty */
}
240 | |||
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.  Frees the element (and its
** key copy when this table owns the keys).
*/
static void fts3RemoveElementByHash(
  fts3Hash *pH,         /* The pH containing "elem" */
  fts3HashElem* elem,   /* The element to be removed from the pH */
  int h                 /* Hash value for the element */
){
  struct _fts3ht *pEntry;
  /* Unlink elem from the global doubly-linked element list. */
  if( elem->prev ){
    elem->prev->next = elem->next;
  }else{
    pH->first = elem->next;
  }
  if( elem->next ){
    elem->next->prev = elem->prev;
  }
  /* Fix up the bucket that elem belongs to. */
  pEntry = &pH->ht[h];
  if( pEntry->chain==elem ){
    pEntry->chain = elem->next;
  }
  pEntry->count--;
  if( pEntry->count<=0 ){
    pEntry->chain = 0;
  }
  if( pH->copyKey && elem->pKey ){
    fts3HashFree(elem->pKey);   /* this table owns the key copy */
  }
  fts3HashFree( elem );
  pH->count--;
  if( pH->count<=0 ){
    /* Table is now empty: release the bucket array too. */
    assert( pH->first==0 );
    assert( pH->count==0 );
    fts3HashClear(pH);
  }
}
277 | |||
/* Attempt to locate an element of the hash table pH with a key
** that matches pKey,nKey.  Return the data for this element if it is
** found, or NULL if there is no match.
*/
void *sqlite3Fts3HashFind(const fts3Hash *pH, const void *pKey, int nKey){
  int h;                 /* A hash on key */
  fts3HashElem *elem;    /* The element that matches key */
  int (*xHash)(const void*,int);  /* The hash function */

  if( pH==0 || pH->ht==0 ) return 0;   /* no table or no buckets yet */
  xHash = hashFunction(pH->keyClass);
  assert( xHash!=0 );
  h = (*xHash)(pKey,nKey);
  assert( (pH->htsize & (pH->htsize-1))==0 );  /* htsize is a power of 2 */
  elem = fts3FindElementByHash(pH,pKey,nKey, h & (pH->htsize-1));
  return elem ? elem->data : 0;
}
295 | |||
296 | /* Insert an element into the hash table pH. The key is pKey,nKey | ||
297 | ** and the data is "data". | ||
298 | ** | ||
299 | ** If no element exists with a matching key, then a new | ||
300 | ** element is created. A copy of the key is made if the copyKey | ||
301 | ** flag is set. NULL is returned. | ||
302 | ** | ||
303 | ** If another element already exists with the same key, then the | ||
304 | ** new data replaces the old data and the old data is returned. | ||
305 | ** The key is not copied in this instance. If a malloc fails, then | ||
306 | ** the new data is returned and the hash table is unchanged. | ||
307 | ** | ||
308 | ** If the "data" parameter to this function is NULL, then the | ||
309 | ** element corresponding to "key" is removed from the hash table. | ||
310 | */ | ||
311 | void *sqlite3Fts3HashInsert( | ||
312 | fts3Hash *pH, /* The hash table to insert into */ | ||
313 | const void *pKey, /* The key */ | ||
314 | int nKey, /* Number of bytes in the key */ | ||
315 | void *data /* The data */ | ||
316 | ){ | ||
317 | int hraw; /* Raw hash value of the key */ | ||
318 | int h; /* the hash of the key modulo hash table size */ | ||
319 | fts3HashElem *elem; /* Used to loop thru the element list */ | ||
320 | fts3HashElem *new_elem; /* New element added to the pH */ | ||
321 | int (*xHash)(const void*,int); /* The hash function */ | ||
322 | |||
323 | assert( pH!=0 ); | ||
324 | xHash = hashFunction(pH->keyClass); | ||
325 | assert( xHash!=0 ); | ||
326 | hraw = (*xHash)(pKey, nKey); | ||
327 | assert( (pH->htsize & (pH->htsize-1))==0 ); | ||
328 | h = hraw & (pH->htsize-1); | ||
329 | elem = fts3FindElementByHash(pH,pKey,nKey,h); | ||
330 | if( elem ){ | ||
331 | void *old_data = elem->data; | ||
332 | if( data==0 ){ | ||
333 | fts3RemoveElementByHash(pH,elem,h); | ||
334 | }else{ | ||
335 | elem->data = data; | ||
336 | } | ||
337 | return old_data; | ||
338 | } | ||
339 | if( data==0 ) return 0; | ||
340 | new_elem = (fts3HashElem*)fts3HashMalloc( sizeof(fts3HashElem) ); | ||
341 | if( new_elem==0 ) return data; | ||
342 | if( pH->copyKey && pKey!=0 ){ | ||
343 | new_elem->pKey = fts3HashMalloc( nKey ); | ||
344 | if( new_elem->pKey==0 ){ | ||
345 | fts3HashFree(new_elem); | ||
346 | return data; | ||
347 | } | ||
348 | memcpy((void*)new_elem->pKey, pKey, nKey); | ||
349 | }else{ | ||
350 | new_elem->pKey = (void*)pKey; | ||
351 | } | ||
352 | new_elem->nKey = nKey; | ||
353 | pH->count++; | ||
354 | if( pH->htsize==0 ){ | ||
355 | fts3Rehash(pH,8); | ||
356 | if( pH->htsize==0 ){ | ||
357 | pH->count = 0; | ||
358 | fts3HashFree(new_elem); | ||
359 | return data; | ||
360 | } | ||
361 | } | ||
362 | if( pH->count > pH->htsize ){ | ||
363 | fts3Rehash(pH,pH->htsize*2); | ||
364 | } | ||
365 | assert( pH->htsize>0 ); | ||
366 | assert( (pH->htsize & (pH->htsize-1))==0 ); | ||
367 | h = hraw & (pH->htsize-1); | ||
368 | fts3HashInsertElement(pH, &pH->ht[h], new_elem); | ||
369 | new_elem->data = data; | ||
370 | return 0; | ||
371 | } | ||
372 | |||
373 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h deleted file mode 100644 index e01954e..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h +++ /dev/null | |||
@@ -1,110 +0,0 @@ | |||
/*
** 2001 September 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This is the header file for the generic hash-table implementation
** used in SQLite.  We've modified it slightly to serve as a standalone
** hash table implementation for the full-text indexing module.
**
*/
#ifndef _FTS3_HASH_H_
#define _FTS3_HASH_H_

/* Forward declarations of structures. */
typedef struct fts3Hash fts3Hash;
typedef struct fts3HashElem fts3HashElem;

/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly.  Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct fts3Hash {
  char keyClass;          /* FTS3_HASH_STRING or FTS3_HASH_BINARY */
  char copyKey;           /* True if copy of key made on insert */
  int count;              /* Number of entries in this table */
  fts3HashElem *first;    /* The first element of the array */
  int htsize;             /* Number of buckets in the hash table */
  struct _fts3ht {        /* the hash table */
    int count;               /* Number of entries with this hash */
    fts3HashElem *chain;     /* Pointer to first entry with this hash */
  } *ht;
};

/* Each element in the hash table is an instance of the following
** structure.  All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct fts3HashElem {
  fts3HashElem *next, *prev;  /* Next and previous elements in the table */
  void *data;                 /* Data associated with this element */
  void *pKey; int nKey;       /* Key associated with this element */
};

/*
** There are 2 different modes of operation for a hash table:
**
**   FTS3_HASH_STRING        pKey points to a string that is nKey bytes long
**                           (including the null-terminator, if any).  Case
**                           is respected in comparisons.
**
**   FTS3_HASH_BINARY        pKey points to binary data nKey bytes long.
**                           memcmp() is used to compare keys.
**
** A copy of the key is made if the copyKey parameter to fts3HashInit is 1.
*/
#define FTS3_HASH_STRING    1
#define FTS3_HASH_BINARY    2

/*
** Access routines.  To delete, insert a NULL pointer.
*/
void sqlite3Fts3HashInit(fts3Hash*, int keytype, int copyKey);
void *sqlite3Fts3HashInsert(fts3Hash*, const void *pKey, int nKey, void *pData);
void *sqlite3Fts3HashFind(const fts3Hash*, const void *pKey, int nKey);
void sqlite3Fts3HashClear(fts3Hash*);

/*
** Shorthand for the functions above
*/
#define fts3HashInit   sqlite3Fts3HashInit
#define fts3HashInsert sqlite3Fts3HashInsert
#define fts3HashFind   sqlite3Fts3HashFind
#define fts3HashClear  sqlite3Fts3HashClear

/*
** Macros for looping over all elements of a hash table.  The idiom is
** like this:
**
**   fts3Hash h;
**   fts3HashElem *p;
**   ...
**   for(p=fts3HashFirst(&h); p; p=fts3HashNext(p)){
**     SomeStructure *pData = fts3HashData(p);
**     // do something with pData
**   }
*/
#define fts3HashFirst(H)  ((H)->first)
#define fts3HashNext(E)   ((E)->next)
#define fts3HashData(E)   ((E)->data)
#define fts3HashKey(E)    ((E)->pKey)
#define fts3HashKeysize(E) ((E)->nKey)

/*
** Number of entries in a hash table
*/
#define fts3HashCount(H)  ((H)->count)

#endif /* _FTS3_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c deleted file mode 100644 index 86a9a50..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c +++ /dev/null | |||
@@ -1,257 +0,0 @@ | |||
1 | /* | ||
2 | ** 2007 June 22 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** This file implements a tokenizer for fts3 based on the ICU library. | ||
13 | ** | ||
14 | ** $Id: fts3_icu.c,v 1.1 2007/08/20 17:37:04 shess Exp $ | ||
15 | */ | ||
16 | |||
17 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | ||
18 | #ifdef SQLITE_ENABLE_ICU | ||
19 | |||
20 | #include <assert.h> | ||
21 | #include <string.h> | ||
22 | #include "fts3_tokenizer.h" | ||
23 | |||
24 | #include <unicode/ubrk.h> | ||
25 | #include <unicode/ucol.h> | ||
26 | #include <unicode/ustring.h> | ||
27 | #include <unicode/utf16.h> | ||
28 | |||
29 | typedef struct IcuTokenizer IcuTokenizer; | ||
30 | typedef struct IcuCursor IcuCursor; | ||
31 | |||
32 | struct IcuTokenizer { | ||
33 | sqlite3_tokenizer base; | ||
34 | char *zLocale; | ||
35 | }; | ||
36 | |||
37 | struct IcuCursor { | ||
38 | sqlite3_tokenizer_cursor base; | ||
39 | |||
40 | UBreakIterator *pIter; /* ICU break-iterator object */ | ||
41 | int nChar; /* Number of UChar elements in pInput */ | ||
42 | UChar *aChar; /* Copy of input using utf-16 encoding */ | ||
43 | int *aOffset; /* Offsets of each character in utf-8 input */ | ||
44 | |||
45 | int nBuffer; | ||
46 | char *zBuffer; | ||
47 | |||
48 | int iToken; | ||
49 | }; | ||
50 | |||
/*
** Create a new tokenizer instance.
**
** argv[0], if supplied, is the ICU locale to use for word-breaking.
** The locale string is copied into the trailing bytes of the same
** allocation as the tokenizer object, so a single sqlite3_free()
** in icuDestroy() releases everything.
*/
static int icuCreate(
  int argc,                            /* Number of entries in argv[] */
  const char * const *argv,            /* Tokenizer creation arguments */
  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
){
  IcuTokenizer *p;
  int n = 0;

  if( argc>0 ){
    n = strlen(argv[0])+1;             /* locale plus nul terminator */
  }
  p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
  if( !p ){
    return SQLITE_NOMEM;
  }
  memset(p, 0, sizeof(IcuTokenizer));  /* zLocale stays NULL if no locale */

  if( n ){
    p->zLocale = (char *)&p[1];        /* points into trailing bytes */
    memcpy(p->zLocale, argv[0], n);
  }

  *ppTokenizer = (sqlite3_tokenizer *)p;

  return SQLITE_OK;
}
80 | |||
/*
** Destroy a tokenizer created by icuCreate().  The locale string
** lives in the same allocation, so one free suffices.
*/
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}
89 | |||
90 | /* | ||
91 | ** Prepare to begin tokenizing a particular string. The input | ||
92 | ** string to be tokenized is pInput[0..nBytes-1]. A cursor | ||
93 | ** used to incrementally tokenize this string is returned in | ||
94 | ** *ppCursor. | ||
95 | */ | ||
96 | static int icuOpen( | ||
97 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
98 | const char *zInput, /* Input string */ | ||
99 | int nInput, /* Length of zInput in bytes */ | ||
100 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
101 | ){ | ||
102 | IcuTokenizer *p = (IcuTokenizer *)pTokenizer; | ||
103 | IcuCursor *pCsr; | ||
104 | |||
105 | const int32_t opt = U_FOLD_CASE_DEFAULT; | ||
106 | UErrorCode status = U_ZERO_ERROR; | ||
107 | int nChar; | ||
108 | |||
109 | UChar32 c; | ||
110 | int iInput = 0; | ||
111 | int iOut = 0; | ||
112 | |||
113 | *ppCursor = 0; | ||
114 | |||
115 | nChar = nInput+1; | ||
116 | pCsr = (IcuCursor *)sqlite3_malloc( | ||
117 | sizeof(IcuCursor) + /* IcuCursor */ | ||
118 | nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ | ||
119 | (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ | ||
120 | ); | ||
121 | if( !pCsr ){ | ||
122 | return SQLITE_NOMEM; | ||
123 | } | ||
124 | memset(pCsr, 0, sizeof(IcuCursor)); | ||
125 | pCsr->aChar = (UChar *)&pCsr[1]; | ||
126 | pCsr->aOffset = (int *)&pCsr->aChar[nChar]; | ||
127 | |||
128 | pCsr->aOffset[iOut] = iInput; | ||
129 | U8_NEXT(zInput, iInput, nInput, c); | ||
130 | while( c>0 ){ | ||
131 | int isError = 0; | ||
132 | c = u_foldCase(c, opt); | ||
133 | U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); | ||
134 | if( isError ){ | ||
135 | sqlite3_free(pCsr); | ||
136 | return SQLITE_ERROR; | ||
137 | } | ||
138 | pCsr->aOffset[iOut] = iInput; | ||
139 | |||
140 | if( iInput<nInput ){ | ||
141 | U8_NEXT(zInput, iInput, nInput, c); | ||
142 | }else{ | ||
143 | c = 0; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); | ||
148 | if( !U_SUCCESS(status) ){ | ||
149 | sqlite3_free(pCsr); | ||
150 | return SQLITE_ERROR; | ||
151 | } | ||
152 | pCsr->nChar = iOut; | ||
153 | |||
154 | ubrk_first(pCsr->pIter); | ||
155 | *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; | ||
156 | return SQLITE_OK; | ||
157 | } | ||
158 | |||
/*
** Close a tokenization cursor previously opened by a call to icuOpen().
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pCsr = (IcuCursor *)pCursor;
  ubrk_close(pCsr->pIter);        /* release the ICU break iterator */
  sqlite3_free(pCsr->zBuffer);    /* utf-8 token buffer (may be NULL) */
  sqlite3_free(pCsr);             /* aChar/aOffset live inside this block */
  return SQLITE_OK;
}
169 | |||
170 | /* | ||
171 | ** Extract the next token from a tokenization cursor. | ||
172 | */ | ||
173 | static int icuNext( | ||
174 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ | ||
175 | const char **ppToken, /* OUT: *ppToken is the token text */ | ||
176 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
177 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
178 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
179 | int *piPosition /* OUT: Position integer of token */ | ||
180 | ){ | ||
181 | IcuCursor *pCsr = (IcuCursor *)pCursor; | ||
182 | |||
183 | int iStart = 0; | ||
184 | int iEnd = 0; | ||
185 | int nByte = 0; | ||
186 | |||
187 | while( iStart==iEnd ){ | ||
188 | UChar32 c; | ||
189 | |||
190 | iStart = ubrk_current(pCsr->pIter); | ||
191 | iEnd = ubrk_next(pCsr->pIter); | ||
192 | if( iEnd==UBRK_DONE ){ | ||
193 | return SQLITE_DONE; | ||
194 | } | ||
195 | |||
196 | while( iStart<iEnd ){ | ||
197 | int iWhite = iStart; | ||
198 | U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); | ||
199 | if( u_isspace(c) ){ | ||
200 | iStart = iWhite; | ||
201 | }else{ | ||
202 | break; | ||
203 | } | ||
204 | } | ||
205 | assert(iStart<=iEnd); | ||
206 | } | ||
207 | |||
208 | do { | ||
209 | UErrorCode status = U_ZERO_ERROR; | ||
210 | if( nByte ){ | ||
211 | char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); | ||
212 | if( !zNew ){ | ||
213 | return SQLITE_NOMEM; | ||
214 | } | ||
215 | pCsr->zBuffer = zNew; | ||
216 | pCsr->nBuffer = nByte; | ||
217 | } | ||
218 | |||
219 | u_strToUTF8( | ||
220 | pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ | ||
221 | &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ | ||
222 | &status /* Output success/failure */ | ||
223 | ); | ||
224 | } while( nByte>pCsr->nBuffer ); | ||
225 | |||
226 | *ppToken = pCsr->zBuffer; | ||
227 | *pnBytes = nByte; | ||
228 | *piStartOffset = pCsr->aOffset[iStart]; | ||
229 | *piEndOffset = pCsr->aOffset[iEnd]; | ||
230 | *piPosition = pCsr->iToken++; | ||
231 | |||
232 | return SQLITE_OK; | ||
233 | } | ||
234 | |||
/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};
246 | |||
/*
** Set *ppModule to point at the implementation of the ICU tokenizer.
*/
void sqlite3Fts3IcuTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &icuTokenizerModule;
}
255 | |||
256 | #endif /* defined(SQLITE_ENABLE_ICU) */ | ||
257 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c deleted file mode 100644 index 14e129f..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c +++ /dev/null | |||
@@ -1,642 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 September 30 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** Implementation of the full-text-search tokenizer that implements | ||
13 | ** a Porter stemmer. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | ** The code in this file is only compiled if: | ||
18 | ** | ||
19 | ** * The FTS3 module is being built as an extension | ||
20 | ** (in which case SQLITE_CORE is not defined), or | ||
21 | ** | ||
22 | ** * The FTS3 module is being built into the core of | ||
23 | ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | ||
24 | */ | ||
25 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | ||
26 | |||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <stdio.h> | ||
31 | #include <string.h> | ||
32 | #include <ctype.h> | ||
33 | |||
34 | #include "fts3_tokenizer.h" | ||
35 | |||
36 | /* | ||
37 | ** Class derived from sqlite3_tokenizer | ||
38 | */ | ||
39 | typedef struct porter_tokenizer { | ||
40 | sqlite3_tokenizer base; /* Base class */ | ||
41 | } porter_tokenizer; | ||
42 | |||
43 | /* | ||
44 | ** Class derived from sqlit3_tokenizer_cursor | ||
45 | */ | ||
46 | typedef struct porter_tokenizer_cursor { | ||
47 | sqlite3_tokenizer_cursor base; | ||
48 | const char *zInput; /* input we are tokenizing */ | ||
49 | int nInput; /* size of the input */ | ||
50 | int iOffset; /* current position in zInput */ | ||
51 | int iToken; /* index of next token to be returned */ | ||
52 | char *zToken; /* storage for current token */ | ||
53 | int nAllocated; /* space allocated to zToken buffer */ | ||
54 | } porter_tokenizer_cursor; | ||
55 | |||
56 | |||
57 | /* Forward declaration */ | ||
58 | static const sqlite3_tokenizer_module porterTokenizerModule; | ||
59 | |||
60 | |||
61 | /* | ||
62 | ** Create a new tokenizer instance. | ||
63 | */ | ||
64 | static int porterCreate( | ||
65 | int argc, const char * const *argv, | ||
66 | sqlite3_tokenizer **ppTokenizer | ||
67 | ){ | ||
68 | porter_tokenizer *t; | ||
69 | t = (porter_tokenizer *) calloc(sizeof(*t), 1); | ||
70 | if( t==NULL ) return SQLITE_NOMEM; | ||
71 | |||
72 | *ppTokenizer = &t->base; | ||
73 | return SQLITE_OK; | ||
74 | } | ||
75 | |||
/*
** Destroy a tokenizer previously allocated by porterCreate().
** The tokenizer struct was obtained with calloc(), so plain free()
** is the matching release.
*/
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
  free(pTokenizer);
  return SQLITE_OK;
}
83 | |||
84 | /* | ||
85 | ** Prepare to begin tokenizing a particular string. The input | ||
86 | ** string to be tokenized is zInput[0..nInput-1]. A cursor | ||
87 | ** used to incrementally tokenize this string is returned in | ||
88 | ** *ppCursor. | ||
89 | */ | ||
90 | static int porterOpen( | ||
91 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
92 | const char *zInput, int nInput, /* String to be tokenized */ | ||
93 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
94 | ){ | ||
95 | porter_tokenizer_cursor *c; | ||
96 | |||
97 | c = (porter_tokenizer_cursor *) malloc(sizeof(*c)); | ||
98 | if( c==NULL ) return SQLITE_NOMEM; | ||
99 | |||
100 | c->zInput = zInput; | ||
101 | if( zInput==0 ){ | ||
102 | c->nInput = 0; | ||
103 | }else if( nInput<0 ){ | ||
104 | c->nInput = (int)strlen(zInput); | ||
105 | }else{ | ||
106 | c->nInput = nInput; | ||
107 | } | ||
108 | c->iOffset = 0; /* start tokenizing at the beginning */ | ||
109 | c->iToken = 0; | ||
110 | c->zToken = NULL; /* no space allocated, yet. */ | ||
111 | c->nAllocated = 0; | ||
112 | |||
113 | *ppCursor = &c->base; | ||
114 | return SQLITE_OK; | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | ** Close a tokenization cursor previously opened by a call to | ||
119 | ** porterOpen() above. | ||
120 | */ | ||
121 | static int porterClose(sqlite3_tokenizer_cursor *pCursor){ | ||
122 | porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; | ||
123 | free(c->zToken); | ||
124 | free(c); | ||
125 | return SQLITE_OK; | ||
126 | } | ||
/*
** Letter classification table for 'a'..'z':
**   0 = vowel, 1 = consonant, 2 = 'y' (context dependent).
*/
static const char cType[] = {
   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
   1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() classify the first character of the
** string they point to according to Porter rules:
**
** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'y' is a consonant unless it follows another consonant, in which
** case it is a vowel.
**
** The strings handed to these routines are in REVERSE order, so the
** 'y' rule becomes: 'y' is a consonant unless it is FOLLOWED by
** another consonant.  Only lower-case [a-z] input is valid.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  int cls;
  char c = z[0];

  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cls = cType[c-'a'];
  if( cls<2 ) return cls;
  /* 'y': a consonant at end-of-word or when the next (reversed)
  ** letter is a vowel. */
  return z[1]==0 || isVowel(&z[1]);
}
static int isVowel(const char *z){
  int cls;
  char c = z[0];

  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cls = cType[c-'a'];
  if( cls<2 ) return !cls;
  /* 'y': a vowel exactly when it is not a consonant. */
  return isConsonant(&z[1]);
}
167 | |||
/*
** Let any sequence of one or more vowels be represented by V and let
** C be a sequence of one or more consonants.  Then every word can be
** represented as:
**
**           [C] (VC){m} [V]
**
** In prose:  A word is an optional consonant followed by zero or
** more vowel-consonant pairs followed by an optional vowel.  "m" is
** the number of vowel-consonant pairs.
**
** Return true if the m-value for z is 1 or more.  In other words,
** return true if z contains at least one vowel that is followed
** by a consonant.
**
** In this routine z[] is in reverse order.  So we are really looking
** for an instance of a consonant followed by a vowel.
*/
static int m_gt_0(const char *z){
  while( isVowel(z) ){ z++; }      /* skip the optional [V] (reversed) */
  if( *z==0 ) return 0;
  while( isConsonant(z) ){ z++; }  /* consume one consonant group */
  return *z!=0;                    /* non-empty remainder => m >= 1 */
}
193 | |||
/* Like m_gt_0 above except we are looking for a value of m which is
** exactly 1.  The input z[] is in reverse order.
*/
static int m_eq_1(const char *z){
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isConsonant(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 1;
  while( isConsonant(z) ){ z++; }
  return *z==0;   /* true only if nothing follows the second C group */
}
207 | |||
/* Like m_gt_0 above except we are looking for a value of m>1 instead
** of m>0.  The input z[] is in reverse order.
*/
static int m_gt_1(const char *z){
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isConsonant(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isConsonant(z) ){ z++; }
  return *z!=0;   /* anything left after two VC groups => m > 1 */
}
221 | |||
/*
** Return TRUE if there is a vowel anywhere within the NUL-terminated
** (reversed) word z[].
*/
static int hasVowel(const char *z){
  while( isConsonant(z) ){ z++; }
  return *z!=0;
}
229 | |||
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here.  So we are really looking at
** the first two characters of z[].
*/
static int doubleConsonant(const char *z){
  return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
}
239 | |||
/*
** Return TRUE if the word ends with three letters which
** are consonant-vowel-consonant and where the final consonant
** is not 'w', 'x', or 'y' (the "*o" condition of the Porter
** algorithm).
**
** The word is reversed here.  So we are really checking the
** first three letters and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
  return
    z[0]!=0 && isConsonant(z) &&
    z[0]!='w' && z[0]!='x' && z[0]!='y' &&
    z[1]!=0 && isVowel(z+1) &&
    z[2]!=0 && isConsonant(z+2);
}
255 | |||
/*
** If the word ends with zFrom and xCond() is true for the stem of
** the word that precedes the zFrom ending, then change the ending
** to zTo.
**
** The input word *pz and zFrom are both in reverse order.  zTo
** is in normal order.
**
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
** match.  Note that TRUE is returned even if xCond() fails and
** no substitution occurs: the caller only learns whether the
** ending was present.
*/
static int stem(
  char **pz,                /* The word being stemmed (Reversed) */
  const char *zFrom,        /* If the ending matches this... (Reversed) */
  const char *zTo,          /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*) /* Condition that must be true */
){
  char *zStem = *pz;

  /* Match the (reversed) ending character by character. */
  for(; *zFrom; zStem++, zFrom++){
    if( *zFrom!=*zStem ) return 0;
  }

  /* Ending matched.  Substitute only if the condition holds. */
  if( xCond && !xCond(zStem) ) return 1;
  while( *zTo ){
    *(--zStem) = *(zTo++);
  }
  *pz = zStem;
  return 1;
}
284 | |||
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate.  The input word is copied into the output with
** US-ASCII case folding.  If the folded word is too long (more
** than 20 bytes if it contains no digits or more than 6 bytes if
** it contains digits) then it is truncated to 20 or 6 bytes by
** keeping 10 or 3 bytes from each end.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int nOut;            /* Bytes written to zOut so far */
  int nKeep;           /* Bytes retained at each end when truncating */
  int seenDigit = 0;   /* True if any byte of zIn is an ASCII digit */
  int k;

  /* Copy with ASCII lower-casing, noting whether a digit occurs. */
  for(nOut=0; nOut<nIn; nOut++){
    int ch = zIn[nOut];
    if( ch>='A' && ch<='Z' ){
      zOut[nOut] = ch - 'A' + 'a';
    }else{
      if( ch>='0' && ch<='9' ) seenDigit = 1;
      zOut[nOut] = ch;
    }
  }

  /* Truncate long words: keep nKeep bytes from each end. */
  nKeep = seenDigit ? 3 : 10;
  if( nIn>nKeep*2 ){
    for(k=0; k<nKeep; k++){
      zOut[nKeep+k] = zOut[nIn-nKeep+k];
    }
    nOut = nKeep*2;
  }

  zOut[nOut] = 0;
  *pnOut = nOut;
}
315 | |||
316 | |||
317 | /* | ||
318 | ** Stem the input word zIn[0..nIn-1]. Store the output in zOut. | ||
319 | ** zOut is at least big enough to hold nIn bytes. Write the actual | ||
320 | ** size of the output word (exclusive of the '\0' terminator) into *pnOut. | ||
321 | ** | ||
322 | ** Any upper-case characters in the US-ASCII character set ([A-Z]) | ||
323 | ** are converted to lower case. Upper-case UTF characters are | ||
324 | ** unchanged. | ||
325 | ** | ||
326 | ** Words that are longer than about 20 bytes are stemmed by retaining | ||
327 | ** a few bytes from the beginning and the end of the word. If the | ||
328 | ** word contains digits, 3 bytes are taken from the beginning and | ||
329 | ** 3 bytes from the end. For long words without digits, 10 bytes | ||
330 | ** are taken from each end. US-ASCII case folding still applies. | ||
331 | ** | ||
332 | ** If the input word contains not digits but does characters not | ||
333 | ** in [a-zA-Z] then no stemming is attempted and this routine just | ||
334 | ** copies the input into the input into the output with US-ASCII | ||
335 | ** case folding. | ||
336 | ** | ||
337 | ** Stemming never increases the length of the word. So there is | ||
338 | ** no chance of overflowing the zOut buffer. | ||
339 | */ | ||
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];   /* Reversed, lower-cased copy of the word plus guard bytes */
  char *z, *z2;

  /* Words shorter than 3 bytes, or too long to fit in zReverse[] with
  ** its guard bytes, are handled by the simpler copy stemmer.
  ** NOTE(review): nIn is promoted to size_t in the second comparison;
  ** safe here because the first test guarantees nIn>=3. */
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }

  /* Copy the word into zReverse[] in REVERSE order, folding A-Z to a-z.
  ** All subsequent stemming steps operate on this reversed copy. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);  /* NUL terminator + guards */
  z = &zReverse[j+1];                           /* z -> reversed word */


  /* Step 1a: plural endings.  All zFrom strings below are reversed,
  ** e.g. "sess" is the ending "-sses". */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;   /* plain "-s": drop it */
    }
  }

  /* Step 1b: "-eed"/"-ed"/"-ing" endings, with cleanup substitutions
  ** when "-ed" or "-ing" was actually removed (z!=z2). */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;   /* drop one letter of a double consonant */
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';   /* restore a trailing 'e' */
     }
  }

  /* Step 1c: terminal "y" -> "i" when the stem contains a vowel */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2: double-suffix reductions, dispatched on the second
  ** character of the reversed stem.  The "||" chains rely on stem()
  ** returning TRUE as soon as an ending matches. */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3: further suffix reductions */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4: strip residual suffixes when the measure m exceeds 1 */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a: drop a trailing "e" */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b: "-ll" -> "-l" when m > 1 */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
556 | |||
/*
** Characters that can be part of a token.  We assume any character
** whose value is greater than 0x80 (any UTF character) can be
** part of a token.  In other words, delimiters all must have
** values of 0x7f or lower.  The table covers the range 0x30..0x7f;
** anything below 0x30 is always a delimiter.
*/
static const char porterIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* True if C is a delimiter.  NOTE: assigns to a variable named "ch"
** as a side effect, so every caller must have an int "ch" in scope. */
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
572 | |||
573 | /* | ||
574 | ** Extract the next token from a tokenization cursor. The cursor must | ||
575 | ** have been opened by a prior call to porterOpen(). | ||
576 | */ | ||
577 | static int porterNext( | ||
578 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */ | ||
579 | const char **pzToken, /* OUT: *pzToken is the token text */ | ||
580 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
581 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
582 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
583 | int *piPosition /* OUT: Position integer of token */ | ||
584 | ){ | ||
585 | porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; | ||
586 | const char *z = c->zInput; | ||
587 | |||
588 | while( c->iOffset<c->nInput ){ | ||
589 | int iStartOffset, ch; | ||
590 | |||
591 | /* Scan past delimiter characters */ | ||
592 | while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){ | ||
593 | c->iOffset++; | ||
594 | } | ||
595 | |||
596 | /* Count non-delimiter characters. */ | ||
597 | iStartOffset = c->iOffset; | ||
598 | while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ | ||
599 | c->iOffset++; | ||
600 | } | ||
601 | |||
602 | if( c->iOffset>iStartOffset ){ | ||
603 | int n = c->iOffset-iStartOffset; | ||
604 | if( n>c->nAllocated ){ | ||
605 | c->nAllocated = n+20; | ||
606 | c->zToken = realloc(c->zToken, c->nAllocated); | ||
607 | if( c->zToken==NULL ) return SQLITE_NOMEM; | ||
608 | } | ||
609 | porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); | ||
610 | *pzToken = c->zToken; | ||
611 | *piStartOffset = iStartOffset; | ||
612 | *piEndOffset = c->iOffset; | ||
613 | *piPosition = c->iToken++; | ||
614 | return SQLITE_OK; | ||
615 | } | ||
616 | } | ||
617 | return SQLITE_DONE; | ||
618 | } | ||
619 | |||
/*
** The set of routines that implement the porter-stemmer tokenizer
*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
  0,             /* iVersion */
  porterCreate,  /* xCreate  */
  porterDestroy, /* xDestroy */
  porterOpen,    /* xOpen    */
  porterClose,   /* xClose   */
  porterNext,    /* xNext    */
};
631 | |||
/*
** Set *ppModule to point at the implementation of the porter-stemmer
** tokenizer.  The returned module has static storage duration; the
** caller must not free it.
*/
void sqlite3Fts3PorterTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &porterTokenizerModule;
}
641 | |||
642 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c deleted file mode 100644 index 7398227..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c +++ /dev/null | |||
@@ -1,371 +0,0 @@ | |||
1 | /* | ||
2 | ** 2007 June 22 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ****************************************************************************** | ||
12 | ** | ||
13 | ** This is part of an SQLite module implementing full-text search. | ||
14 | ** This particular file implements the generic tokenizer interface. | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | ** The code in this file is only compiled if: | ||
19 | ** | ||
20 | ** * The FTS3 module is being built as an extension | ||
21 | ** (in which case SQLITE_CORE is not defined), or | ||
22 | ** | ||
23 | ** * The FTS3 module is being built into the core of | ||
24 | ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | ||
25 | */ | ||
26 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | ||
27 | |||
28 | |||
29 | #include "sqlite3.h" | ||
30 | #include "sqlite3ext.h" | ||
31 | SQLITE_EXTENSION_INIT1 | ||
32 | |||
33 | #include "fts3_hash.h" | ||
34 | #include "fts3_tokenizer.h" | ||
35 | #include <assert.h> | ||
36 | |||
37 | /* | ||
38 | ** Implementation of the SQL scalar function for accessing the underlying | ||
39 | ** hash table. This function may be called as follows: | ||
40 | ** | ||
41 | ** SELECT <function-name>(<key-name>); | ||
42 | ** SELECT <function-name>(<key-name>, <pointer>); | ||
43 | ** | ||
44 | ** where <function-name> is the name passed as the second argument | ||
45 | ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer'). | ||
46 | ** | ||
47 | ** If the <pointer> argument is specified, it must be a blob value | ||
48 | ** containing a pointer to be stored as the hash data corresponding | ||
49 | ** to the string <key-name>. If <pointer> is not specified, then | ||
50 | ** the string <key-name> must already exist in the has table. Otherwise, | ||
51 | ** an error is returned. | ||
52 | ** | ||
53 | ** Whether or not the <pointer> argument is specified, the value returned | ||
54 | ** is a blob containing the pointer stored as the hash data corresponding | ||
55 | ** to string <key-name> (after the hash-table is updated, if applicable). | ||
56 | */ | ||
static void scalarFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts3Hash *pHash;               /* Name -> tokenizer-pointer hash table */
  void *pPtr = 0;                /* Tokenizer module pointer (hash data) */
  const unsigned char *zName;    /* Tokenizer name (hash key) */
  int nName;                     /* Key length, INCLUDING the NUL byte */

  assert( argc==1 || argc==2 );

  pHash = (fts3Hash *)sqlite3_user_data(context);

  zName = sqlite3_value_text(argv[0]);
  nName = sqlite3_value_bytes(argv[0])+1;  /* +1: the NUL is part of the key */

  if( argc==2 ){
    /* Two-argument form: register the pointer in argv[1] under zName. */
    void *pOld;
    int n = sqlite3_value_bytes(argv[1]);
    if( n!=sizeof(pPtr) ){
      /* The blob must be exactly one pointer wide. */
      sqlite3_result_error(context, "argument type mismatch", -1);
      return;
    }
    pPtr = *(void **)sqlite3_value_blob(argv[1]);
    pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
    /* NOTE(review): this appears to rely on sqlite3Fts3HashInsert()
    ** returning the new data value itself when allocation fails --
    ** confirm against the fts3_hash implementation. */
    if( pOld==pPtr ){
      sqlite3_result_error(context, "out of memory", -1);
      return;
    }
  }else{
    /* One-argument form: look up an existing tokenizer by name. */
    pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
    if( !pPtr ){
      char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
      sqlite3_result_error(context, zErr, -1);
      sqlite3_free(zErr);
      return;
    }
  }

  /* Either way, return the pointer value as a pointer-sized blob. */
  sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
}
99 | |||
100 | #ifdef SQLITE_TEST | ||
101 | |||
102 | #include <tcl.h> | ||
103 | #include <string.h> | ||
104 | |||
105 | /* | ||
106 | ** Implementation of a special SQL scalar function for testing tokenizers | ||
107 | ** designed to be used in concert with the Tcl testing framework. This | ||
108 | ** function must be called with two arguments: | ||
109 | ** | ||
110 | ** SELECT <function-name>(<key-name>, <input-string>); | ||
111 | ** SELECT <function-name>(<key-name>, <pointer>); | ||
112 | ** | ||
113 | ** where <function-name> is the name passed as the second argument | ||
114 | ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer') | ||
115 | ** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test'). | ||
116 | ** | ||
117 | ** The return value is a string that may be interpreted as a Tcl | ||
118 | ** list. For each token in the <input-string>, three elements are | ||
119 | ** added to the returned list. The first is the token position, the | ||
120 | ** second is the token text (folded, stemmed, etc.) and the third is the | ||
121 | ** substring of <input-string> associated with the token. For example, | ||
122 | ** using the built-in "simple" tokenizer: | ||
123 | ** | ||
124 | ** SELECT fts_tokenizer_test('simple', 'I don't see how'); | ||
125 | ** | ||
126 | ** will return the string: | ||
127 | ** | ||
128 | ** "{0 i I 1 dont don't 2 see see 3 how how}" | ||
129 | ** | ||
130 | */ | ||
static void testFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts3Hash *pHash;                      /* Registered-tokenizer hash table */
  sqlite3_tokenizer_module *p;          /* Tokenizer module looked up by name */
  sqlite3_tokenizer *pTokenizer = 0;    /* Tokenizer instance */
  sqlite3_tokenizer_cursor *pCsr = 0;   /* Cursor over zInput */

  const char *zErr = 0;                 /* Static error message, if any */

  const char *zName;                    /* Tokenizer name (argv[0]) */
  int nName;
  const char *zInput;                   /* Text to tokenize (last argument) */
  int nInput;

  const char *zArg = 0;                 /* Optional tokenizer argument */

  const char *zToken;                   /* Current token text */
  int nToken;
  int iStart;                           /* Token start offset in zInput */
  int iEnd;                             /* Token end offset in zInput */
  int iPos;                             /* Token position index */

  Tcl_Obj *pRet;                        /* Tcl list accumulating the result */

  assert( argc==2 || argc==3 );

  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[argc-1]);
  zInput = (const char *)sqlite3_value_text(argv[argc-1]);

  if( argc==3 ){
    zArg = (const char *)sqlite3_value_text(argv[1]);
  }

  pHash = (fts3Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);

  if( !p ){
    /* This inner zErr (a malloc'd string) intentionally shadows the
    ** outer static-string zErr; it is freed before returning. */
    char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    sqlite3_result_error(context, zErr, -1);
    sqlite3_free(zErr);
    return;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
  }
  pCsr->pTokenizer = pTokenizer;

  /* Append {position, processed-token, raw-substring} triples to the
  ** result list until the tokenizer reports completion. */
  while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    zToken = &zInput[iStart];
    nToken = iEnd-iStart;
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  }

  if( SQLITE_OK!=p->xClose(pCsr) ){
    zErr = "error in xClose()";
    goto finish;
  }
  if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
    zErr = "error in xDestroy()";
    goto finish;
  }

finish:
  if( zErr ){
    sqlite3_result_error(context, zErr, -1);
  }else{
    /* SQLITE_TRANSIENT makes SQLite copy the string, so it is safe to
    ** release pRet immediately afterwards. */
    sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
  }
  Tcl_DecrRefCount(pRet);
}
218 | |||
219 | static | ||
220 | int registerTokenizer( | ||
221 | sqlite3 *db, | ||
222 | char *zName, | ||
223 | const sqlite3_tokenizer_module *p | ||
224 | ){ | ||
225 | int rc; | ||
226 | sqlite3_stmt *pStmt; | ||
227 | const char zSql[] = "SELECT fts3_tokenizer(?, ?)"; | ||
228 | |||
229 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
230 | if( rc!=SQLITE_OK ){ | ||
231 | return rc; | ||
232 | } | ||
233 | |||
234 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
235 | sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); | ||
236 | sqlite3_step(pStmt); | ||
237 | |||
238 | return sqlite3_finalize(pStmt); | ||
239 | } | ||
240 | |||
241 | static | ||
242 | int queryTokenizer( | ||
243 | sqlite3 *db, | ||
244 | char *zName, | ||
245 | const sqlite3_tokenizer_module **pp | ||
246 | ){ | ||
247 | int rc; | ||
248 | sqlite3_stmt *pStmt; | ||
249 | const char zSql[] = "SELECT fts3_tokenizer(?)"; | ||
250 | |||
251 | *pp = 0; | ||
252 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
253 | if( rc!=SQLITE_OK ){ | ||
254 | return rc; | ||
255 | } | ||
256 | |||
257 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
258 | if( SQLITE_ROW==sqlite3_step(pStmt) ){ | ||
259 | if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ | ||
260 | memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); | ||
261 | } | ||
262 | } | ||
263 | |||
264 | return sqlite3_finalize(pStmt); | ||
265 | } | ||
266 | |||
267 | void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
268 | |||
/*
** Implementation of the scalar function fts3_tokenizer_internal_test().
** This function is used for testing only, it is not included in the
** build unless SQLITE_TEST is defined.
**
** The purpose of this is to test that the fts3_tokenizer() function
** can be used as designed by the C-code in the queryTokenizer and
** registerTokenizer() functions above. These two functions are repeated
** in the README.tokenizer file as an example, so it is important to
** test them.
**
** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
** function with no arguments. An assert() will fail if a problem is
** detected. i.e.:
**
**   SELECT fts3_tokenizer_internal_test();
**
*/
static void intTestFunc(
  sqlite3_context *context,       /* Used to return the result text "ok" */
  int argc,                       /* Number of SQL arguments (unused) */
  sqlite3_value **argv            /* SQL argument values (unused) */
){
  int rc;
  const sqlite3_tokenizer_module *p1;
  const sqlite3_tokenizer_module *p2;
  /* The database handle was registered as the function's user-data
  ** (see sqlite3Fts3InitHashTable() below). */
  sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);

  /* Test the query function: the built-in "simple" tokenizer must be
  ** found, and querying an unknown name must fail with SQLITE_ERROR
  ** while leaving p2 at zero. */
  sqlite3Fts3SimpleTokenizerModule(&p1);
  rc = queryTokenizer(db, "simple", &p2);
  assert( rc==SQLITE_OK );
  assert( p1==p2 );
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_ERROR );
  assert( p2==0 );
  assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );

  /* Test the storage function: a module pointer stored via
  ** registerTokenizer() must round-trip through the database. */
  rc = registerTokenizer(db, "nosuchtokenizer", p1);
  assert( rc==SQLITE_OK );
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_OK );
  assert( p2==p1 );

  sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
}
316 | |||
317 | #endif | ||
318 | |||
/*
** Set up SQL objects in database db used to access the contents of
** the hash table pointed to by argument pHash. The hash table must
** have been initialised to use string keys, and to take a private copy
** of the key when a value is inserted. i.e. by a call similar to:
**
**   sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
**
** This function adds a scalar function (see header comment above
** scalarFunc() in this file for details) and, if ENABLE_TABLE is
** defined at compilation time, a temporary virtual table (see header
** comment above struct HashTableVtab) to the database schema. Both
** provide read/write access to the contents of *pHash.
** NOTE(review): only the scalar-function registrations are visible in
** the code below; no virtual table is created here.
**
** The third argument to this function, zName, is used as the name
** of both the scalar and, if created, the virtual table.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the test-only name
** strings cannot be allocated, or the error code of the first
** sqlite3_create_function() call that fails.
*/
int sqlite3Fts3InitHashTable(
  sqlite3 *db,                 /* Database handle to register with */
  fts3Hash *pHash,             /* Hash table exposed via the SQL function */
  const char *zName            /* Name for the scalar function(s) */
){
  int rc = SQLITE_OK;
  void *p = (void *)pHash;
  const int any = SQLITE_ANY;
  char *zTest = 0;             /* "<zName>_test" (SQLITE_TEST builds only) */
  char *zTest2 = 0;            /* "<zName>_internal_test" (ditto) */

#ifdef SQLITE_TEST
  void *pdb = (void *)db;
  zTest = sqlite3_mprintf("%s_test", zName);
  zTest2 = sqlite3_mprintf("%s_internal_test", zName);
  if( !zTest || !zTest2 ){
    rc = SQLITE_NOMEM;
  }
#endif

  /* Short-circuit chain: registration stops at the first failure,
  ** leaving rc set to that call's error code. */
  if( rc!=SQLITE_OK
   || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
#ifdef SQLITE_TEST
   || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
#endif
  );

  /* sqlite3_free() is a no-op on NULL, so this is safe in all builds. */
  sqlite3_free(zTest);
  sqlite3_free(zTest2);
  return rc;
}
370 | |||
371 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h deleted file mode 100644 index 4faef56..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h +++ /dev/null | |||
@@ -1,145 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 July 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. | ||
5 | ** | ||
6 | ************************************************************************* | ||
7 | ** Defines the interface to tokenizers used by fulltext-search. There | ||
8 | ** are three basic components: | ||
9 | ** | ||
10 | ** sqlite3_tokenizer_module is a singleton defining the tokenizer | ||
11 | ** interface functions. This is essentially the class structure for | ||
12 | ** tokenizers. | ||
13 | ** | ||
14 | ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps | ||
15 | ** including customization information defined at creation time. | ||
16 | ** | ||
17 | ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate | ||
18 | ** tokens from a particular input. | ||
19 | */ | ||
20 | #ifndef _FTS3_TOKENIZER_H_ | ||
21 | #define _FTS3_TOKENIZER_H_ | ||
22 | |||
23 | /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. | ||
24 | ** If tokenizers are to be allowed to call sqlite3_*() functions, then | ||
25 | ** we will need a way to register the API consistently. | ||
26 | */ | ||
27 | #include "sqlite3.h" | ||
28 | |||
29 | /* | ||
30 | ** Structures used by the tokenizer interface. When a new tokenizer | ||
31 | ** implementation is registered, the caller provides a pointer to | ||
32 | ** an sqlite3_tokenizer_module containing pointers to the callback | ||
33 | ** functions that make up an implementation. | ||
34 | ** | ||
35 | ** When an fts3 table is created, it passes any arguments passed to | ||
36 | ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the | ||
37 | ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer | ||
38 | ** implementation. The xCreate() function in turn returns an | ||
39 | ** sqlite3_tokenizer structure representing the specific tokenizer to | ||
40 | ** be used for the fts3 table (customized by the tokenizer clause arguments). | ||
41 | ** | ||
42 | ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen() | ||
43 | ** method is called. It returns an sqlite3_tokenizer_cursor object | ||
44 | ** that may be used to tokenize a specific input buffer based on | ||
45 | ** the tokenization rules supplied by a specific sqlite3_tokenizer | ||
46 | ** object. | ||
47 | */ | ||
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

/*
** A tokenizer implementation is described by an instance of this
** structure: a table of callbacks shared by all tokenizers of that
** implementation (essentially the "class" vtable).
*/
struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,            /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer    /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts3 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts3 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated. This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};

/* Base "class" for a tokenizer instance.  Implementations embed this
** as their first member so that pointers can be cast back and forth. */
struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

/* Base "class" for a tokenizer cursor; same embedding convention. */
struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

#endif /* _FTS3_TOKENIZER_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c deleted file mode 100644 index f53cc1d..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c +++ /dev/null | |||
@@ -1,229 +0,0 @@ | |||
1 | /* | ||
2 | ** 2006 Oct 10 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ****************************************************************************** | ||
12 | ** | ||
13 | ** Implementation of the "simple" full-text-search tokenizer. | ||
14 | */ | ||
15 | |||
16 | /* | ||
17 | ** The code in this file is only compiled if: | ||
18 | ** | ||
19 | ** * The FTS3 module is being built as an extension | ||
20 | ** (in which case SQLITE_CORE is not defined), or | ||
21 | ** | ||
22 | ** * The FTS3 module is being built into the core of | ||
23 | ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | ||
24 | */ | ||
25 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | ||
26 | |||
27 | |||
28 | #include <assert.h> | ||
29 | #include <stdlib.h> | ||
30 | #include <stdio.h> | ||
31 | #include <string.h> | ||
32 | #include <ctype.h> | ||
33 | |||
34 | #include "fts3_tokenizer.h" | ||
35 | |||
/* Concrete tokenizer for the "simple" implementation.  The embedded
** base must be the first member: the framework hands back the &base
** pointer and the methods below cast it to this type. */
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  char delim[128];         /* flag ASCII delimiters: delim[c]!=0 => delimiter */
} simple_tokenizer;

/* Per-tokenization cursor state.  Same first-member convention. */
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to the pToken buffer */
} simple_tokenizer_cursor;


/* Forward declaration */
static const sqlite3_tokenizer_module simpleTokenizerModule;
54 | |||
/* Return true if byte c is flagged as a delimiter in tokenizer t.
** Bytes >= 0x80 (non-ASCII) are never delimiters. */
static int simpleDelim(simple_tokenizer *t, unsigned char c){
  return c<0x80 && t->delim[c];
}
58 | |||
59 | /* | ||
60 | ** Create a new tokenizer instance. | ||
61 | */ | ||
62 | static int simpleCreate( | ||
63 | int argc, const char * const *argv, | ||
64 | sqlite3_tokenizer **ppTokenizer | ||
65 | ){ | ||
66 | simple_tokenizer *t; | ||
67 | |||
68 | t = (simple_tokenizer *) calloc(sizeof(*t), 1); | ||
69 | if( t==NULL ) return SQLITE_NOMEM; | ||
70 | |||
71 | /* TODO(shess) Delimiters need to remain the same from run to run, | ||
72 | ** else we need to reindex. One solution would be a meta-table to | ||
73 | ** track such information in the database, then we'd only want this | ||
74 | ** information on the initial create. | ||
75 | */ | ||
76 | if( argc>1 ){ | ||
77 | int i, n = strlen(argv[1]); | ||
78 | for(i=0; i<n; i++){ | ||
79 | unsigned char ch = argv[1][i]; | ||
80 | /* We explicitly don't support UTF-8 delimiters for now. */ | ||
81 | if( ch>=0x80 ){ | ||
82 | free(t); | ||
83 | return SQLITE_ERROR; | ||
84 | } | ||
85 | t->delim[ch] = 1; | ||
86 | } | ||
87 | } else { | ||
88 | /* Mark non-alphanumeric ASCII characters as delimiters */ | ||
89 | int i; | ||
90 | for(i=1; i<0x80; i++){ | ||
91 | t->delim[i] = !isalnum(i); | ||
92 | } | ||
93 | } | ||
94 | |||
95 | *ppTokenizer = &t->base; | ||
96 | return SQLITE_OK; | ||
97 | } | ||
98 | |||
/*
** Destroy a tokenizer.  The tokenizer was allocated as a single
** calloc() block in simpleCreate(), so one free() releases it.
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  free(pTokenizer);
  return SQLITE_OK;
}
106 | |||
107 | /* | ||
108 | ** Prepare to begin tokenizing a particular string. The input | ||
109 | ** string to be tokenized is pInput[0..nBytes-1]. A cursor | ||
110 | ** used to incrementally tokenize this string is returned in | ||
111 | ** *ppCursor. | ||
112 | */ | ||
113 | static int simpleOpen( | ||
114 | sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | ||
115 | const char *pInput, int nBytes, /* String to be tokenized */ | ||
116 | sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | ||
117 | ){ | ||
118 | simple_tokenizer_cursor *c; | ||
119 | |||
120 | c = (simple_tokenizer_cursor *) malloc(sizeof(*c)); | ||
121 | if( c==NULL ) return SQLITE_NOMEM; | ||
122 | |||
123 | c->pInput = pInput; | ||
124 | if( pInput==0 ){ | ||
125 | c->nBytes = 0; | ||
126 | }else if( nBytes<0 ){ | ||
127 | c->nBytes = (int)strlen(pInput); | ||
128 | }else{ | ||
129 | c->nBytes = nBytes; | ||
130 | } | ||
131 | c->iOffset = 0; /* start tokenizing at the beginning */ | ||
132 | c->iToken = 0; | ||
133 | c->pToken = NULL; /* no space allocated, yet. */ | ||
134 | c->nTokenAllocated = 0; | ||
135 | |||
136 | *ppCursor = &c->base; | ||
137 | return SQLITE_OK; | ||
138 | } | ||
139 | |||
/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.  Releases the token buffer (free(NULL) is a
** no-op) and the cursor itself; the input buffer is owned by the
** caller and is not touched.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  free(c->pToken);
  free(c);
  return SQLITE_OK;
}
150 | |||
151 | /* | ||
152 | ** Extract the next token from a tokenization cursor. The cursor must | ||
153 | ** have been opened by a prior call to simpleOpen(). | ||
154 | */ | ||
155 | static int simpleNext( | ||
156 | sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ | ||
157 | const char **ppToken, /* OUT: *ppToken is the token text */ | ||
158 | int *pnBytes, /* OUT: Number of bytes in token */ | ||
159 | int *piStartOffset, /* OUT: Starting offset of token */ | ||
160 | int *piEndOffset, /* OUT: Ending offset of token */ | ||
161 | int *piPosition /* OUT: Position integer of token */ | ||
162 | ){ | ||
163 | simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; | ||
164 | simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; | ||
165 | unsigned char *p = (unsigned char *)c->pInput; | ||
166 | |||
167 | while( c->iOffset<c->nBytes ){ | ||
168 | int iStartOffset; | ||
169 | |||
170 | /* Scan past delimiter characters */ | ||
171 | while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){ | ||
172 | c->iOffset++; | ||
173 | } | ||
174 | |||
175 | /* Count non-delimiter characters. */ | ||
176 | iStartOffset = c->iOffset; | ||
177 | while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){ | ||
178 | c->iOffset++; | ||
179 | } | ||
180 | |||
181 | if( c->iOffset>iStartOffset ){ | ||
182 | int i, n = c->iOffset-iStartOffset; | ||
183 | if( n>c->nTokenAllocated ){ | ||
184 | c->nTokenAllocated = n+20; | ||
185 | c->pToken = realloc(c->pToken, c->nTokenAllocated); | ||
186 | if( c->pToken==NULL ) return SQLITE_NOMEM; | ||
187 | } | ||
188 | for(i=0; i<n; i++){ | ||
189 | /* TODO(shess) This needs expansion to handle UTF-8 | ||
190 | ** case-insensitivity. | ||
191 | */ | ||
192 | unsigned char ch = p[iStartOffset+i]; | ||
193 | c->pToken[i] = ch<0x80 ? tolower(ch) : ch; | ||
194 | } | ||
195 | *ppToken = c->pToken; | ||
196 | *pnBytes = n; | ||
197 | *piStartOffset = iStartOffset; | ||
198 | *piEndOffset = c->iOffset; | ||
199 | *piPosition = c->iToken++; | ||
200 | |||
201 | return SQLITE_OK; | ||
202 | } | ||
203 | } | ||
204 | return SQLITE_DONE; | ||
205 | } | ||
206 | |||
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,                       /* iVersion */
  simpleCreate,            /* xCreate  */
  simpleDestroy,           /* xDestroy */
  simpleOpen,              /* xOpen    */
  simpleClose,             /* xClose   */
  simpleNext,              /* xNext    */
};
218 | |||
/*
** Allocate a new simple tokenizer.  Return a pointer to the new
** tokenizer in *ppModule.  (No allocation actually occurs: *ppModule
** is simply pointed at the static simpleTokenizerModule vtable above.)
*/
void sqlite3Fts3SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule   /* OUT: the simple tokenizer module */
){
  *ppModule = &simpleTokenizerModule;
}
228 | |||
229 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl deleted file mode 100644 index cfea5d2..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl +++ /dev/null | |||
@@ -1,116 +0,0 @@ | |||
1 | #!/usr/bin/tclsh | ||
2 | # | ||
3 | # This script builds a single C code file holding all of FTS3 code. | ||
4 | # The name of the output file is fts3amal.c. To build this file, | ||
5 | # first do: | ||
6 | # | ||
7 | # make target_source | ||
8 | # | ||
9 | # The make target above moves all of the source code files into | ||
10 | # a subdirectory named "tsrc". (This script expects to find the files | ||
11 | # there and will not work if they are not found.) | ||
12 | # | ||
13 | # After the "tsrc" directory has been created and populated, run | ||
14 | # this script: | ||
15 | # | ||
16 | # tclsh mkfts3amal.tcl | ||
17 | # | ||
18 | # The amalgamated FTS3 code will be written into fts3amal.c | ||
19 | # | ||
20 | |||
21 | # Open the output file and write a header comment at the beginning | ||
22 | # of the file. | ||
23 | # | ||
24 | set out [open fts3amal.c w] | ||
25 | set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1] | ||
26 | puts $out [subst \ | ||
27 | {/****************************************************************************** | ||
28 | ** This file is an amalgamation of separate C source files from the SQLite | ||
29 | ** Full Text Search extension 2 (fts3). By combining all the individual C | ||
30 | ** code files into this single large file, the entire code can be compiled | ||
31 | ** as a one translation unit. This allows many compilers to do optimizations | ||
32 | ** that would not be possible if the files were compiled separately. It also | ||
33 | ** makes the code easier to import into other projects. | ||
34 | ** | ||
35 | ** This amalgamation was generated on $today. | ||
36 | */}] | ||
37 | |||
38 | # These are the header files used by FTS3. The first time any of these | ||
39 | # files are seen in a #include statement in the C code, include the complete | ||
40 | # text of the file in-line. The file only needs to be included once. | ||
41 | # | ||
42 | foreach hdr { | ||
43 | fts3.h | ||
44 | fts3_hash.h | ||
45 | fts3_tokenizer.h | ||
46 | sqlite3.h | ||
47 | sqlite3ext.h | ||
48 | } { | ||
49 | set available_hdr($hdr) 1 | ||
50 | } | ||
51 | |||
52 | # 78 stars used for comment formatting. | ||
53 | set s78 \ | ||
54 | {*****************************************************************************} | ||
55 | |||
# Write a section-divider comment into the amalgamation output.  The
# divider is padded with '*' characters (taken from $s78) so that every
# divider line has a uniform width.
#
proc section_comment {text} {
  global out s78
  set pad [string range $s78 0 [expr {60 - [string length $text]}]]
  puts $out "/************** $text $pad/"
}
65 | |||
# Read the source file named $filename and write it into the
# fts3amal.c output file.  If any #include statements are seen,
# process them appropriately.
#
# NOTE: the argument list was garbled in this copy of the script
# ("proc copy_file (unknown) {"); the body and both call sites use a
# single path argument, so it is restored as {filename}.
#
proc copy_file {filename} {
  global seen_hdr available_hdr out
  set tail [file tail $filename]
  section_comment "Begin file $tail"
  set in [open $filename r]
  while {![eof $in]} {
    set line [gets $in]
    if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
      if {[info exists available_hdr($hdr)]} {
        if {$available_hdr($hdr)} {
          # Inline each known FTS3 header the first time it is included.
          section_comment "Include $hdr in the middle of $tail"
          copy_file tsrc/$hdr
          section_comment "Continuing where we left off in $tail"
        }
      } elseif {![info exists seen_hdr($hdr)]} {
        set seen_hdr($hdr) 1
        puts $out $line
      }
    } elseif {[regexp {^#ifdef __cplusplus} $line]} {
      # Disable extern "C" wrappers in the amalgamation.
      puts $out "#if 0"
    } elseif {[regexp {^#line} $line]} {
      # Skip #line directives.
    } else {
      puts $out $line
    }
  }
  close $in
  section_comment "End of $tail"
}
99 | |||
100 | |||
101 | # Process the source files. Process files containing commonly | ||
102 | # used subroutines first in order to help the compiler find | ||
103 | # inlining opportunities. | ||
104 | # | ||
105 | foreach file { | ||
106 | fts3.c | ||
107 | fts3_hash.c | ||
108 | fts3_porter.c | ||
109 | fts3_tokenizer.c | ||
110 | fts3_tokenizer1.c | ||
111 | fts3_icu.c | ||
112 | } { | ||
113 | copy_file tsrc/$file | ||
114 | } | ||
115 | |||
116 | close $out | ||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/README.txt deleted file mode 100644 index 5c995cc..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/README.txt +++ /dev/null | |||
@@ -1,170 +0,0 @@ | |||
1 | |||
2 | This directory contains source code for the SQLite "ICU" extension, an | ||
3 | integration of the "International Components for Unicode" library with | ||
4 | SQLite. Documentation follows. | ||
5 | |||
6 | 1. Features | ||
7 | |||
8 | 1.1 SQL Scalars upper() and lower() | ||
9 | 1.2 Unicode Aware LIKE Operator | ||
10 | 1.3 ICU Collation Sequences | ||
11 | 1.4 SQL REGEXP Operator | ||
12 | |||
13 | 2. Compilation and Usage | ||
14 | |||
15 | 3. Bugs, Problems and Security Issues | ||
16 | |||
17 | 3.1 The "case_sensitive_like" Pragma | ||
18 | 3.2 The SQLITE_MAX_LIKE_PATTERN_LENGTH Macro | ||
19 | 3.3 Collation Sequence Security Issue | ||
20 | |||
21 | |||
22 | 1. FEATURES | ||
23 | |||
24 | 1.1 SQL Scalars upper() and lower() | ||
25 | |||
26 | SQLite's built-in implementations of these two functions only | ||
27 | provide case mapping for the 26 letters used in the English | ||
28 | language. The ICU based functions provided by this extension | ||
29 | provide case mapping, where defined, for the full range of | ||
30 | unicode characters. | ||
31 | |||
32 | ICU provides two types of case mapping, "general" case mapping and | ||
33 | "language specific". Refer to ICU documentation for the differences | ||
34 | between the two. Specifically: | ||
35 | |||
36 | http://www.icu-project.org/userguide/caseMappings.html | ||
37 | http://www.icu-project.org/userguide/posix.html#case_mappings | ||
38 | |||
39 | To utilise "general" case mapping, the upper() or lower() scalar | ||
40 | functions are invoked with one argument: | ||
41 | |||
42 | upper('abc') -> 'ABC' | ||
43 | lower('ABC') -> 'abc' | ||
44 | |||
45 | To access ICU "language specific" case mapping, upper() or lower() | ||
46 | should be invoked with two arguments. The second argument is the name | ||
47 | of the locale to use. Passing an empty string ("") or SQL NULL value | ||
48 | as the second argument is the same as invoking the 1 argument version | ||
49 | of upper() or lower(): | ||
50 | |||
51 | lower('I', 'en_us') -> 'i' | ||
52 | lower('I', 'tr_tr') -> 'ı' (small dotless i) | ||
53 | |||
54 | 1.2 Unicode Aware LIKE Operator | ||
55 | |||
56 | Similarly to the upper() and lower() functions, the built-in SQLite LIKE | ||
57 | operator understands case equivalence for the 26 letters of the English | ||
58 | language alphabet. The implementation of LIKE included in this | ||
59 | extension uses the ICU function u_foldCase() to provide case | ||
60 | independent comparisons for the full range of unicode characters. | ||
61 | |||
62 | The U_FOLD_CASE_DEFAULT flag is passed to u_foldCase(), meaning the | ||
63 | dotless 'I' character used in the Turkish language is considered | ||
64 | to be in the same equivalence class as the dotted 'I' character | ||
65 | used by many languages (including English). | ||
66 | |||
67 | 1.3 ICU Collation Sequences | ||
68 | |||
69 | A special SQL scalar function, icu_load_collation() is provided that | ||
70 | may be used to register ICU collation sequences with SQLite. It | ||
71 | is always called with exactly two arguments, the ICU locale | ||
72 | identifying the collation sequence to ICU, and the name of the | ||
73 | SQLite collation sequence to create. For example, to create an | ||
74 | SQLite collation sequence named "turkish" using Turkish language | ||
75 | sorting rules, the SQL statement: | ||
76 | |||
77 | SELECT icu_load_collation('tr_TR', 'turkish'); | ||
78 | |||
79 | Or, for Australian English: | ||
80 | |||
81 | SELECT icu_load_collation('en_AU', 'australian'); | ||
82 | |||
83 | The identifiers "turkish" and "australian" may then be used | ||
84 | as collation sequence identifiers in SQL statements: | ||
85 | |||
86 | CREATE TABLE aust_turkish_penpals( | ||
87 | australian_penpal_name TEXT COLLATE australian, | ||
88 | turkish_penpal_name TEXT COLLATE turkish | ||
89 | ); | ||
90 | |||
91 | 1.4 SQL REGEXP Operator | ||
92 | |||
93 | This extension provides an implementation of the SQL binary | ||
94 | comparison operator "REGEXP", based on the regular expression functions | ||
95 | provided by the ICU library. The syntax of the operator is as described | ||
96 | in SQLite documentation: | ||
97 | |||
98 | <string> REGEXP <re-pattern> | ||
99 | |||
100 | This extension uses the ICU defaults for regular expression matching | ||
101 | behaviour. Specifically, this means that: | ||
102 | |||
103 | * Matching is case-sensitive, | ||
104 | * Regular expression comments are not allowed within patterns, and | ||
105 | * The '^' and '$' characters match the beginning and end of the | ||
106 | <string> argument, not the beginning and end of lines within | ||
107 | the <string> argument. | ||
108 | |||
109 | Even more specifically, the value passed to the "flags" parameter | ||
110 | of ICU C function uregex_open() is 0. | ||
111 | |||
112 | |||
113 | 2 COMPILATION AND USAGE | ||
114 | |||
115 | The easiest way to compile and use the ICU extension is to build | ||
116 | and use it as a dynamically loadable SQLite extension. To do this | ||
117 | using gcc on *nix: | ||
118 | |||
119 | gcc -shared icu.c `icu-config --ldflags` -o libSqliteIcu.so | ||
120 | |||
121 | You may need to add "-I" flags so that gcc can find sqlite3ext.h | ||
122 | and sqlite3.h. The resulting shared lib, libSqliteIcu.so, may be | ||
123 | loaded into sqlite in the same way as any other dynamically loadable | ||
124 | extension. | ||
125 | |||
126 | |||
127 | 3 BUGS, PROBLEMS AND SECURITY ISSUES | ||
128 | |||
129 | 3.1 The "case_sensitive_like" Pragma | ||
130 | |||
131 | This extension does not work well with the "case_sensitive_like" | ||
132 | pragma. If this pragma is used before the ICU extension is loaded, | ||
133 | then the pragma has no effect. If the pragma is used after the ICU | ||
134 | extension is loaded, then SQLite ignores the ICU implementation and | ||
135 | always uses the built-in LIKE operator. | ||
136 | |||
137 | The ICU extension LIKE operator is always case insensitive. | ||
138 | |||
139 | 3.2 The SQLITE_MAX_LIKE_PATTERN_LENGTH Macro | ||
140 | |||
141 | Passing very long patterns to the built-in SQLite LIKE operator can | ||
142 | cause a stack overflow. To curb this problem, SQLite defines the | ||
143 | SQLITE_MAX_LIKE_PATTERN_LENGTH macro as the maximum length of a | ||
144 | pattern in bytes (irrespective of encoding). The default value is | ||
145 | defined in internal header file "limits.h". | ||
146 | |||
147 | The ICU extension LIKE implementation suffers from the same | ||
148 | problem and uses the same solution. However, since the ICU extension | ||
149 | code does not include the SQLite file "limits.h", modifying | ||
150 | the default value therein does not affect the ICU extension. | ||
151 | The default value of SQLITE_MAX_LIKE_PATTERN_LENGTH used by | ||
152 | the ICU extension LIKE operator is 50000, defined in source | ||
153 | file "icu.c". | ||
154 | |||
155 | 3.3 Collation Sequence Security Issue | ||
156 | |||
157 | Internally, SQLite assumes that indices stored in database files | ||
158 | are sorted according to the collation sequence indicated by the | ||
159 | SQL schema. Changing the definition of a collation sequence after | ||
160 | an index has been built is therefore equivalent to database | ||
161 | corruption. The SQLite library is not very well tested under | ||
162 | these conditions, and may contain potential buffer overruns | ||
163 | or other programming errors that could be exploited by a malicious | ||
164 | programmer. | ||
165 | |||
166 | If the ICU extension is used in an environment where potentially | ||
167 | malicious users may execute arbitrary SQL (i.e. gears), they | ||
168 | should be prevented from invoking the icu_load_collation() function, | ||
169 | possibly using the authorisation callback. | ||
170 | |||
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/icu.c deleted file mode 100644 index 11bb116..0000000 --- a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/icu.c +++ /dev/null | |||
@@ -1,499 +0,0 @@ | |||
1 | /* | ||
2 | ** 2007 May 6 | ||
3 | ** | ||
4 | ** The author disclaims copyright to this source code. In place of | ||
5 | ** a legal notice, here is a blessing: | ||
6 | ** | ||
7 | ** May you do good and not evil. | ||
8 | ** May you find forgiveness for yourself and forgive others. | ||
9 | ** May you share freely, never taking more than you give. | ||
10 | ** | ||
11 | ************************************************************************* | ||
12 | ** $Id: icu.c,v 1.6 2007/06/22 15:21:16 danielk1977 Exp $ | ||
13 | ** | ||
14 | ** This file implements an integration between the ICU library | ||
15 | ** ("International Components for Unicode", an open-source library | ||
16 | ** for handling unicode data) and SQLite. The integration uses | ||
17 | ** ICU to provide the following to SQLite: | ||
18 | ** | ||
19 | ** * An implementation of the SQL regexp() function (and hence REGEXP | ||
20 | ** operator) using the ICU uregex_XX() APIs. | ||
21 | ** | ||
22 | ** * Implementations of the SQL scalar upper() and lower() functions | ||
23 | ** for case mapping. | ||
24 | ** | ||
25 | **     * Integration of ICU and SQLite collation sequences. | ||
26 | ** | ||
27 | ** * An implementation of the LIKE operator that uses ICU to | ||
28 | ** provide case-independent matching. | ||
29 | */ | ||
30 | |||
31 | #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU) | ||
32 | |||
33 | /* Include ICU headers */ | ||
34 | #include <unicode/utypes.h> | ||
35 | #include <unicode/uregex.h> | ||
36 | #include <unicode/ustring.h> | ||
37 | #include <unicode/ucol.h> | ||
38 | |||
39 | #include <assert.h> | ||
40 | |||
41 | #ifndef SQLITE_CORE | ||
42 | #include "sqlite3ext.h" | ||
43 | SQLITE_EXTENSION_INIT1 | ||
44 | #else | ||
45 | #include "sqlite3.h" | ||
46 | #endif | ||
47 | |||
48 | /* | ||
49 | ** Maximum length (in bytes) of the pattern in a LIKE or GLOB | ||
50 | ** operator. | ||
51 | */ | ||
52 | #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH | ||
53 | # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000 | ||
54 | #endif | ||
55 | |||
/*
** A wrapper around sqlite3_free() that is guaranteed to be a real
** function (never a macro), suitable for use as a destructor
** callback passed to sqlite3_result_text16() and friends.
*/
static void xFree(void *p){
  sqlite3_free(p);
}
62 | |||
/*
** Compare two UTF-8 strings for equality where the first string is
** a "LIKE" expression. Return true (1) if they are the same and
** false (0) if they are different.
**
** Matching is case-independent: ordinary characters on both sides are
** passed through u_foldCase() before comparison. The function recurses
** once per "%" wildcard; the caller (icuLikeFunc) bounds the pattern
** length with SQLITE_MAX_LIKE_PATTERN_LENGTH to limit recursion depth.
**
** NOTE(review): the U8_*_UNSAFE macros assume well-formed UTF-8 in
** both inputs — presumably guaranteed because both strings come from
** sqlite3_value_text(); confirm against callers before reusing this
** helper elsewhere.
*/
static int icuLikeCompare(
  const uint8_t *zPattern,    /* LIKE pattern */
  const uint8_t *zString,     /* The UTF-8 string to compare against */
  const UChar32 uEsc          /* The escape character */
){
  static const int MATCH_ONE = (UChar32)'_';  /* Matches exactly one char */
  static const int MATCH_ALL = (UChar32)'%';  /* Matches zero or more chars */

  int iPattern = 0;       /* Current byte index in zPattern */
  int iString = 0;        /* Current byte index in zString */

  int prevEscape = 0;     /* True if the previous character was uEsc */

  while( zPattern[iPattern]!=0 ){

    /* Read (and consume) the next character from the input pattern. */
    UChar32 uPattern;
    U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
    assert(uPattern!=0);

    /* There are now 4 possibilities:
    **
    **     1. uPattern is an unescaped match-all character "%",
    **     2. uPattern is an unescaped match-one character "_",
    **     3. uPattern is an unescaped escape character, or
    **     4. uPattern is to be handled as an ordinary character
    */
    if( !prevEscape && uPattern==MATCH_ALL ){
      /* Case 1. */
      uint8_t c;

      /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
      ** MATCH_ALL. For each MATCH_ONE, skip one character in the
      ** test string.
      */
      while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
        if( c==MATCH_ONE ){
          if( zString[iString]==0 ) return 0;
          U8_FWD_1_UNSAFE(zString, iString);
        }
        iPattern++;
      }

      /* A trailing "%" matches any remainder of the string. */
      if( zPattern[iPattern]==0 ) return 1;

      /* Recursive step: try the rest of the pattern at each
      ** successive character position of the string. */
      while( zString[iString] ){
        if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
          return 1;
        }
        U8_FWD_1_UNSAFE(zString, iString);
      }
      return 0;

    }else if( !prevEscape && uPattern==MATCH_ONE ){
      /* Case 2. Consume exactly one character from the string. */
      if( zString[iString]==0 ) return 0;
      U8_FWD_1_UNSAFE(zString, iString);

    }else if( !prevEscape && uPattern==uEsc){
      /* Case 3. The next pattern character is matched literally. */
      prevEscape = 1;

    }else{
      /* Case 4. Compare one case-folded character from each side. */
      UChar32 uString;
      U8_NEXT_UNSAFE(zString, iString, uString);
      uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
      uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
      if( uString!=uPattern ){
        return 0;
      }
      prevEscape = 0;
    }
  }

  /* Pattern exhausted: a match only if the string is exhausted too. */
  return zString[iString]==0;
}
145 | |||
146 | /* | ||
147 | ** Implementation of the like() SQL function. This function implements | ||
148 | ** the build-in LIKE operator. The first argument to the function is the | ||
149 | ** pattern and the second argument is the string. So, the SQL statements: | ||
150 | ** | ||
151 | ** A LIKE B | ||
152 | ** | ||
153 | ** is implemented as like(B, A). If there is an escape character E, | ||
154 | ** | ||
155 | ** A LIKE B ESCAPE E | ||
156 | ** | ||
157 | ** is mapped to like(B, A, E). | ||
158 | */ | ||
159 | static void icuLikeFunc( | ||
160 | sqlite3_context *context, | ||
161 | int argc, | ||
162 | sqlite3_value **argv | ||
163 | ){ | ||
164 | const unsigned char *zA = sqlite3_value_text(argv[0]); | ||
165 | const unsigned char *zB = sqlite3_value_text(argv[1]); | ||
166 | UChar32 uEsc = 0; | ||
167 | |||
168 | /* Limit the length of the LIKE or GLOB pattern to avoid problems | ||
169 | ** of deep recursion and N*N behavior in patternCompare(). | ||
170 | */ | ||
171 | if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){ | ||
172 | sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1); | ||
173 | return; | ||
174 | } | ||
175 | |||
176 | |||
177 | if( argc==3 ){ | ||
178 | /* The escape character string must consist of a single UTF-8 character. | ||
179 | ** Otherwise, return an error. | ||
180 | */ | ||
181 | int nE= sqlite3_value_bytes(argv[2]); | ||
182 | const unsigned char *zE = sqlite3_value_text(argv[2]); | ||
183 | int i = 0; | ||
184 | if( zE==0 ) return; | ||
185 | U8_NEXT(zE, i, nE, uEsc); | ||
186 | if( i!=nE){ | ||
187 | sqlite3_result_error(context, | ||
188 | "ESCAPE expression must be a single character", -1); | ||
189 | return; | ||
190 | } | ||
191 | } | ||
192 | |||
193 | if( zA && zB ){ | ||
194 | sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc)); | ||
195 | } | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | ** This function is called when an ICU function called from within | ||
200 | ** the implementation of an SQL scalar function returns an error. | ||
201 | ** | ||
202 | ** The scalar function context passed as the first argument is | ||
203 | ** loaded with an error message based on the following two args. | ||
204 | */ | ||
205 | static void icuFunctionError( | ||
206 | sqlite3_context *pCtx, /* SQLite scalar function context */ | ||
207 | const char *zName, /* Name of ICU function that failed */ | ||
208 | UErrorCode e /* Error code returned by ICU function */ | ||
209 | ){ | ||
210 | char zBuf[128]; | ||
211 | sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e)); | ||
212 | zBuf[127] = '\0'; | ||
213 | sqlite3_result_error(pCtx, zBuf, -1); | ||
214 | } | ||
215 | |||
216 | /* | ||
217 | ** Function to delete compiled regexp objects. Registered as | ||
218 | ** a destructor function with sqlite3_set_auxdata(). | ||
219 | */ | ||
220 | static void icuRegexpDelete(void *p){ | ||
221 | URegularExpression *pExpr = (URegularExpression *)p; | ||
222 | uregex_close(pExpr); | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | ** Implementation of SQLite REGEXP operator. This scalar function takes | ||
227 | ** two arguments. The first is a regular expression pattern to compile | ||
228 | ** the second is a string to match against that pattern. If either | ||
229 | ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result | ||
230 | ** is 1 if the string matches the pattern, or 0 otherwise. | ||
231 | ** | ||
232 | ** SQLite maps the regexp() function to the regexp() operator such | ||
233 | ** that the following two are equivalent: | ||
234 | ** | ||
235 | ** zString REGEXP zPattern | ||
236 | ** regexp(zPattern, zString) | ||
237 | ** | ||
238 | ** Uses the following ICU regexp APIs: | ||
239 | ** | ||
240 | ** uregex_open() | ||
241 | ** uregex_matches() | ||
242 | ** uregex_close() | ||
243 | */ | ||
244 | static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){ | ||
245 | UErrorCode status = U_ZERO_ERROR; | ||
246 | URegularExpression *pExpr; | ||
247 | UBool res; | ||
248 | const UChar *zString = sqlite3_value_text16(apArg[1]); | ||
249 | |||
250 | /* If the left hand side of the regexp operator is NULL, | ||
251 | ** then the result is also NULL. | ||
252 | */ | ||
253 | if( !zString ){ | ||
254 | return; | ||
255 | } | ||
256 | |||
257 | pExpr = sqlite3_get_auxdata(p, 0); | ||
258 | if( !pExpr ){ | ||
259 | const UChar *zPattern = sqlite3_value_text16(apArg[0]); | ||
260 | if( !zPattern ){ | ||
261 | return; | ||
262 | } | ||
263 | pExpr = uregex_open(zPattern, -1, 0, 0, &status); | ||
264 | |||
265 | if( U_SUCCESS(status) ){ | ||
266 | sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete); | ||
267 | }else{ | ||
268 | assert(!pExpr); | ||
269 | icuFunctionError(p, "uregex_open", status); | ||
270 | return; | ||
271 | } | ||
272 | } | ||
273 | |||
274 | /* Configure the text that the regular expression operates on. */ | ||
275 | uregex_setText(pExpr, zString, -1, &status); | ||
276 | if( !U_SUCCESS(status) ){ | ||
277 | icuFunctionError(p, "uregex_setText", status); | ||
278 | return; | ||
279 | } | ||
280 | |||
281 | /* Attempt the match */ | ||
282 | res = uregex_matches(pExpr, 0, &status); | ||
283 | if( !U_SUCCESS(status) ){ | ||
284 | icuFunctionError(p, "uregex_matches", status); | ||
285 | return; | ||
286 | } | ||
287 | |||
288 | /* Set the text that the regular expression operates on to a NULL | ||
289 | ** pointer. This is not really necessary, but it is tidier than | ||
290 | ** leaving the regular expression object configured with an invalid | ||
291 | ** pointer after this function returns. | ||
292 | */ | ||
293 | uregex_setText(pExpr, 0, 0, &status); | ||
294 | |||
295 | /* Return 1 or 0. */ | ||
296 | sqlite3_result_int(p, res ? 1 : 0); | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | ** Implementations of scalar functions for case mapping - upper() and | ||
301 | ** lower(). Function upper() converts it's input to upper-case (ABC). | ||
302 | ** Function lower() converts to lower-case (abc). | ||
303 | ** | ||
304 | ** ICU provides two types of case mapping, "general" case mapping and | ||
305 | ** "language specific". Refer to ICU documentation for the differences | ||
306 | ** between the two. | ||
307 | ** | ||
308 | ** To utilise "general" case mapping, the upper() or lower() scalar | ||
309 | ** functions are invoked with one argument: | ||
310 | ** | ||
311 | ** upper('ABC') -> 'abc' | ||
312 | ** lower('abc') -> 'ABC' | ||
313 | ** | ||
314 | ** To access ICU "language specific" case mapping, upper() or lower() | ||
315 | ** should be invoked with two arguments. The second argument is the name | ||
316 | ** of the locale to use. Passing an empty string ("") or SQL NULL value | ||
317 | ** as the second argument is the same as invoking the 1 argument version | ||
318 | ** of upper() or lower(). | ||
319 | ** | ||
320 | ** lower('I', 'en_us') -> 'i' | ||
321 | ** lower('I', 'tr_tr') -> 'ı' (small dotless i) | ||
322 | ** | ||
323 | ** http://www.icu-project.org/userguide/posix.html#case_mappings | ||
324 | */ | ||
325 | static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){ | ||
326 | const UChar *zInput; | ||
327 | UChar *zOutput; | ||
328 | int nInput; | ||
329 | int nOutput; | ||
330 | |||
331 | UErrorCode status = U_ZERO_ERROR; | ||
332 | const char *zLocale = 0; | ||
333 | |||
334 | assert(nArg==1 || nArg==2); | ||
335 | if( nArg==2 ){ | ||
336 | zLocale = (const char *)sqlite3_value_text(apArg[1]); | ||
337 | } | ||
338 | |||
339 | zInput = sqlite3_value_text16(apArg[0]); | ||
340 | if( !zInput ){ | ||
341 | return; | ||
342 | } | ||
343 | nInput = sqlite3_value_bytes16(apArg[0]); | ||
344 | |||
345 | nOutput = nInput * 2 + 2; | ||
346 | zOutput = sqlite3_malloc(nOutput); | ||
347 | if( !zOutput ){ | ||
348 | return; | ||
349 | } | ||
350 | |||
351 | if( sqlite3_user_data(p) ){ | ||
352 | u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status); | ||
353 | }else{ | ||
354 | u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status); | ||
355 | } | ||
356 | |||
357 | if( !U_SUCCESS(status) ){ | ||
358 | icuFunctionError(p, "u_strToLower()/u_strToUpper", status); | ||
359 | return; | ||
360 | } | ||
361 | |||
362 | sqlite3_result_text16(p, zOutput, -1, xFree); | ||
363 | } | ||
364 | |||
365 | /* | ||
366 | ** Collation sequence destructor function. The pCtx argument points to | ||
367 | ** a UCollator structure previously allocated using ucol_open(). | ||
368 | */ | ||
369 | static void icuCollationDel(void *pCtx){ | ||
370 | UCollator *p = (UCollator *)pCtx; | ||
371 | ucol_close(p); | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | ** Collation sequence comparison function. The pCtx argument points to | ||
376 | ** a UCollator structure previously allocated using ucol_open(). | ||
377 | */ | ||
378 | static int icuCollationColl( | ||
379 | void *pCtx, | ||
380 | int nLeft, | ||
381 | const void *zLeft, | ||
382 | int nRight, | ||
383 | const void *zRight | ||
384 | ){ | ||
385 | UCollationResult res; | ||
386 | UCollator *p = (UCollator *)pCtx; | ||
387 | res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2); | ||
388 | switch( res ){ | ||
389 | case UCOL_LESS: return -1; | ||
390 | case UCOL_GREATER: return +1; | ||
391 | case UCOL_EQUAL: return 0; | ||
392 | } | ||
393 | assert(!"Unexpected return value from ucol_strcoll()"); | ||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | ** Implementation of the scalar function icu_load_collation(). | ||
399 | ** | ||
400 | ** This scalar function is used to add ICU collation based collation | ||
401 | ** types to an SQLite database connection. It is intended to be called | ||
402 | ** as follows: | ||
403 | ** | ||
404 | ** SELECT icu_load_collation(<locale>, <collation-name>); | ||
405 | ** | ||
406 | ** Where <locale> is a string containing an ICU locale identifier (i.e. | ||
407 | ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the | ||
408 | ** collation sequence to create. | ||
409 | */ | ||
410 | static void icuLoadCollation( | ||
411 | sqlite3_context *p, | ||
412 | int nArg, | ||
413 | sqlite3_value **apArg | ||
414 | ){ | ||
415 | sqlite3 *db = (sqlite3 *)sqlite3_user_data(p); | ||
416 | UErrorCode status = U_ZERO_ERROR; | ||
417 | const char *zLocale; /* Locale identifier - (eg. "jp_JP") */ | ||
418 | const char *zName; /* SQL Collation sequence name (eg. "japanese") */ | ||
419 | UCollator *pUCollator; /* ICU library collation object */ | ||
420 | int rc; /* Return code from sqlite3_create_collation_x() */ | ||
421 | |||
422 | assert(nArg==2); | ||
423 | zLocale = (const char *)sqlite3_value_text(apArg[0]); | ||
424 | zName = (const char *)sqlite3_value_text(apArg[1]); | ||
425 | |||
426 | if( !zLocale || !zName ){ | ||
427 | return; | ||
428 | } | ||
429 | |||
430 | pUCollator = ucol_open(zLocale, &status); | ||
431 | if( !U_SUCCESS(status) ){ | ||
432 | icuFunctionError(p, "ucol_open", status); | ||
433 | return; | ||
434 | } | ||
435 | assert(p); | ||
436 | |||
437 | rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, | ||
438 | icuCollationColl, icuCollationDel | ||
439 | ); | ||
440 | if( rc!=SQLITE_OK ){ | ||
441 | ucol_close(pUCollator); | ||
442 | sqlite3_result_error(p, "Error registering collation function", -1); | ||
443 | } | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | ** Register the ICU extension functions with database db. | ||
448 | */ | ||
449 | int sqlite3IcuInit(sqlite3 *db){ | ||
450 | struct IcuScalar { | ||
451 | const char *zName; /* Function name */ | ||
452 | int nArg; /* Number of arguments */ | ||
453 | int enc; /* Optimal text encoding */ | ||
454 | void *pContext; /* sqlite3_user_data() context */ | ||
455 | void (*xFunc)(sqlite3_context*,int,sqlite3_value**); | ||
456 | } scalars[] = { | ||
457 | {"regexp",-1, SQLITE_ANY, 0, icuRegexpFunc}, | ||
458 | |||
459 | {"lower", 1, SQLITE_UTF16, 0, icuCaseFunc16}, | ||
460 | {"lower", 2, SQLITE_UTF16, 0, icuCaseFunc16}, | ||
461 | {"upper", 1, SQLITE_UTF16, (void*)1, icuCaseFunc16}, | ||
462 | {"upper", 2, SQLITE_UTF16, (void*)1, icuCaseFunc16}, | ||
463 | |||
464 | {"lower", 1, SQLITE_UTF8, 0, icuCaseFunc16}, | ||
465 | {"lower", 2, SQLITE_UTF8, 0, icuCaseFunc16}, | ||
466 | {"upper", 1, SQLITE_UTF8, (void*)1, icuCaseFunc16}, | ||
467 | {"upper", 2, SQLITE_UTF8, (void*)1, icuCaseFunc16}, | ||
468 | |||
469 | {"like", 2, SQLITE_UTF8, 0, icuLikeFunc}, | ||
470 | {"like", 3, SQLITE_UTF8, 0, icuLikeFunc}, | ||
471 | |||
472 | {"icu_load_collation", 2, SQLITE_UTF8, (void*)db, icuLoadCollation}, | ||
473 | }; | ||
474 | |||
475 | int rc = SQLITE_OK; | ||
476 | int i; | ||
477 | |||
478 | for(i=0; rc==SQLITE_OK && i<(sizeof(scalars)/sizeof(struct IcuScalar)); i++){ | ||
479 | struct IcuScalar *p = &scalars[i]; | ||
480 | rc = sqlite3_create_function( | ||
481 | db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0 | ||
482 | ); | ||
483 | } | ||
484 | |||
485 | return rc; | ||
486 | } | ||
487 | |||
#if !SQLITE_CORE
/*
** Entry point used when this file is built as a run-time loadable
** extension (omitted when compiled into the SQLite core). Initialises
** the extension API shim and registers the ICU functions on the
** supplied database connection; returns the result of sqlite3IcuInit().
*/
int sqlite3_extension_init(
  sqlite3 *db,                          /* Connection to register with */
  char **pzErrMsg,                      /* OUT: error message (unused) */
  const sqlite3_api_routines *pApi      /* Extension API function table */
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3IcuInit(db);
}
#endif
498 | |||
499 | #endif | ||