Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c')
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c  5971
1 files changed, 5971 insertions, 0 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c
new file mode 100644
index 0000000..b392919
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c
@@ -0,0 +1,5971 @@
/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This is an SQLite module implementing full-text search.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS3 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS3 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/

/* TODO(shess) Consider exporting this comment to an HTML file or the
** wiki.
*/
/* The full-text index is stored in a series of b+tree (-like)
** structures called segments which map terms to doclists.  The
** structures are like b+trees in layout, but are constructed from the
** bottom up in optimal fashion and are not updatable.  Since trees
** are built from the bottom up, things will be described from the
** bottom up.
**
**
**** Varints ****
** The basic unit of encoding is a variable-length integer called a
** varint.  We encode variable-length integers in little-endian order
** using seven bits per byte as follows:
**
** KEY:
**   A = 0xxxxxxx    7 bits of data and one flag bit
**   B = 1xxxxxxx    7 bits of data and one flag bit
**
**  7 bits - A
** 14 bits - BA
** 21 bits - BBA
** and so on.
**
** This is the same 7-bits-per-byte scheme sqlite uses for varints
** (see util.c), though the core encoding is big-endian while this
** one is little-endian.
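**
** For example (an illustrative "BA" case): the value 300 (0x12C)
** splits into a low seven bits of 0101100 (44) and a high seven
** bits of 0000010 (2), so it is encoded as the two bytes 0xAC 0x02
** (10101100 00000010).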
**
**
**** Document lists ****
** A doclist (document list) holds a docid-sorted list of hits for a
** given term.  Doclists hold docids, and can optionally associate
** token positions and offsets with docids.
**
** A DL_POSITIONS_OFFSETS doclist is stored like this:
**
** array {
**   varint docid;
**   array {                (position list for column 0)
**     varint position;     (delta from previous position plus POS_BASE)
**     varint startOffset;  (delta from previous startOffset)
**     varint endOffset;    (delta from startOffset)
**   }
**   array {
**     varint POS_COLUMN;   (marks start of position list for new column)
**     varint column;       (index of new column)
**     array {
**       varint position;   (delta from previous position plus POS_BASE)
**       varint startOffset;(delta from previous startOffset)
**       varint endOffset;  (delta from startOffset)
**     }
**   }
**   varint POS_END;        (marks end of positions for this document)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.  A "position" is an index of a token in the token stream
** generated by the tokenizer, while an "offset" is a byte offset,
** both based at 0.  Note that POS_END and POS_COLUMN occur in the
** same logical place as the position element, and act as sentinels
** ending a position list array.
**
** A DL_POSITIONS doclist omits the startOffset and endOffset
** information.  A DL_DOCIDS doclist omits both the position and
** offset information, becoming an array of varint-encoded docids.
**
** On-disk data is stored as type DL_DEFAULT, so we don't serialize
** the type.  Due to how deletion is implemented in the segmentation
** system, on-disk doclists MUST store at least positions.
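**
** For example (illustrative bytes): a DL_POSITIONS doclist recording
** docid 5 with hits at positions 1 and 4 of column 0 encodes as
**
**   05    (docid 5)
**   03    (position 1: delta 1 plus POS_BASE==2)
**   05    (position 4: delta 3 plus POS_BASE==2)
**   00    (POS_END)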
**
**
**** Segment leaf nodes ****
** Segment leaf nodes store terms and doclists, ordered by term.  Leaf
** nodes are written using LeafWriter, and read using LeafReader (to
** iterate through a single leaf node's data) and LeavesReader (to
** iterate through a segment's entire leaf layer).  Leaf nodes have
** the format:
**
** varint iHeight;             (height from leaf level, always 0)
** varint nTerm;               (length of first term)
** char pTerm[nTerm];          (content of first term)
** varint nDoclist;            (length of term's associated doclist)
** char pDoclist[nDoclist];    (content of doclist)
** array {
**                             (further terms are delta-encoded)
**   varint nPrefix;           (length of prefix shared with previous term)
**   varint nSuffix;           (length of unshared suffix)
**   char pTermSuffix[nSuffix];(unshared suffix of next term)
**   varint nDoclist;          (length of term's associated doclist)
**   char pDoclist[nDoclist];  (content of doclist)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.
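**
** For example (illustrative, with doclist bytes elided): if a leaf
** holds the adjacent terms "link" and "linux", the second term is
** encoded with nPrefix==3 (sharing "lin") and nSuffix==2, so only
** the two suffix bytes "ux" are stored, followed by that term's
** doclist.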
**
** Leaf nodes are broken into blocks which are stored contiguously in
** the %_segments table in sorted order.  This means that when the end
** of a node is reached, the next term is in the node with the next
** greater node id.
**
** New data is spilled to a new leaf node when the current node
** exceeds LEAF_MAX bytes (default 2048).  New data which itself is
** larger than STANDALONE_MIN (default 1024) is placed in a standalone
** node (a leaf node with a single term and doclist).  The goal of
** these settings is to pack together groups of small doclists while
** making it efficient to directly access large doclists.  The
** assumption is that large doclists represent terms which are more
** likely to be query targets.
**
** TODO(shess) It may be useful for blocking decisions to be more
** dynamic.  For instance, it may make more sense to have a 2.5k leaf
** node rather than splitting into 2k and .5k nodes.  My intuition is
** that this might extend through 2x or 4x the pagesize.
**
**
**** Segment interior nodes ****
** Segment interior nodes store blockids for subtree nodes and terms
** to describe what data is stored by each subtree.  Interior
** nodes are written using InteriorWriter, and read using
** InteriorReader.  InteriorWriters are created as needed when
** SegmentWriter creates new leaf nodes, or when an interior node
** itself grows too big and must be split.  The format of interior
** nodes:
**
** varint iHeight;           (height from leaf level, always >0)
** varint iBlockid;          (block id of node's leftmost subtree)
** optional {
**   varint nTerm;           (length of first term)
**   char pTerm[nTerm];      (content of first term)
**   array {
**                                (further terms are delta-encoded)
**     varint nPrefix;            (length of shared prefix with previous term)
**     varint nSuffix;            (length of unshared suffix)
**     char pTermSuffix[nSuffix]; (unshared suffix of next term)
**   }
** }
**
** Here, optional { X } means an optional element, while array { X }
** means zero or more occurrences of X, adjacent in memory.
**
** An interior node encodes n terms separating n+1 subtrees.  The
** subtree blocks are contiguous, so only the first subtree's blockid
** is encoded.  The subtree at iBlockid will contain all terms less
** than the first term encoded (or all terms if no term is encoded).
** Otherwise, for terms greater than or equal to pTerm[i] but less
** than pTerm[i+1], the subtree for that term will be rooted at
** iBlockid+i.  Interior nodes only store enough term data to
** distinguish adjacent children (if the rightmost term of the left
** child is "something", and the leftmost term of the right child is
** "wicked", only "w" is stored).
**
** New data is spilled to a new interior node at the same height when
** the current node exceeds INTERIOR_MAX bytes (default 2048).
** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
** interior nodes and making the tree too skinny.  The interior nodes
** at a given height are naturally tracked by interior nodes at
** height+1, and so on.
**
**
**** Segment directory ****
** The segment directory in table %_segdir stores meta-information for
** merging and deleting segments, and also the root node of the
** segment's tree.
**
** The root node is the top node of the segment's tree after encoding
** the entire segment, restricted to ROOT_MAX bytes (default 1024).
** This could be either a leaf node or an interior node.  If the top
** node requires more than ROOT_MAX bytes, it is flushed to %_segments
** and a new root interior node is generated (which should always fit
** within ROOT_MAX because it only needs space for 2 varints, the
** height and the blockid of the previous root).
**
** The meta-information in the segment directory is:
**   level               - segment level (see below)
**   idx                 - index within level
**                       - (level,idx uniquely identify a segment)
**   start_block         - first leaf node
**   leaves_end_block    - last leaf node
**   end_block           - last block (including interior nodes)
**   root                - contents of root node
**
** If the root node is a leaf node, then start_block,
** leaves_end_block, and end_block are all 0.
**
**
**** Segment merging ****
** To amortize update costs, segments are grouped into levels and
** merged in batches.  Each increase in level represents exponentially
** more documents.
**
** New documents (actually, document updates) are tokenized and
** written individually (using LeafWriter) to a level 0 segment, with
** incrementing idx.  When idx reaches MERGE_COUNT (default 16), all
** level 0 segments are merged into a single level 1 segment.  Level 1
** is populated like level 0, and eventually MERGE_COUNT level 1
** segments are merged to a single level 2 segment (representing
** MERGE_COUNT^2 updates), and so on.
**
** A segment merge traverses all segments at a given level in
** parallel, performing a straightforward sorted merge.  Since segment
** leaf nodes are written into the %_segments table in order, this
** merge traverses the underlying sqlite disk structures efficiently.
** After the merge, all segment blocks from the merged level are
** deleted.
**
** MERGE_COUNT controls how often we merge segments.  16 seems to be
** somewhat of a sweet spot for insertion performance.  32 and 64 show
** very similar performance numbers to 16 on insertion, though they're
** a tiny bit slower (perhaps due to more overhead in merge-time
** sorting).  8 is about 20% slower than 16, 4 about 50% slower than
** 16, 2 about 66% slower than 16.
**
** At query time, high MERGE_COUNT increases the number of segments
** which need to be scanned and merged.  For instance, with 100k docs
** inserted:
**
**    MERGE_COUNT   segments
**       16           25
**        8           12
**        4           10
**        2            6
**
** This appears to have only a moderate impact on queries for very
** frequent terms (which are somewhat dominated by segment merge
** costs), and infrequent and non-existent terms still seem to be fast
** even with many segments.
**
** TODO(shess) That said, it would be nice to have a better query-side
** argument for MERGE_COUNT of 16.  Also, it's possible/likely that
** optimizations to things like doclist merging will swing the sweet
** spot around.
**
**
**
**** Handling of deletions and updates ****
** Since we're using a segmented structure, with no docid-oriented
** index into the term index, we clearly cannot simply update the term
** index when a document is deleted or updated.  For deletions, we
** write an empty doclist (varint(docid) varint(POS_END)), for updates
** we simply write the new doclist.  Segment merges overwrite older
** data for a particular docid with newer data, so deletes or updates
** will eventually overtake the earlier data and knock it out.  The
** query logic likewise merges doclists so that newer data knocks out
** older data.
**
** TODO(shess) Provide a VACUUM type operation to clear out all
** deletions and duplications.  This would basically be a forced merge
** into a single segment.
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#if defined(SQLITE_ENABLE_FTS3) && !defined(SQLITE_CORE)
# define SQLITE_CORE 1
#endif

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts3.h"
#include "fts3_hash.h"
#include "fts3_tokenizer.h"
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1


/* TODO(shess) MAN, this thing needs some refactoring.  At minimum, it
** would be nice to order the file better, perhaps something along the
** lines of:
**
**  - utility functions
**  - table setup functions
**  - table update functions
**  - table query functions
**
** Put the query functions last because they're likely to reference
** typedefs or functions from the table update section.
*/

#if 0
# define TRACE(A)  printf A; fflush(stdout)
#else
# define TRACE(A)
#endif

/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters.  This is the same solution used in the
** tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
static int safe_isspace(char c){
  return (c&0x80)==0 ? isspace(c) : 0;
}
static int safe_tolower(char c){
  return (c&0x80)==0 ? tolower(c) : c;
}
static int safe_isalnum(char c){
  return (c&0x80)==0 ? isalnum(c) : 0;
}

typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif

enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};

/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file).
*/
#define MERGE_COUNT 16

/* utility functions */

/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
** record to prevent errors of the form:
**
** my_function(SomeType *b){
**   memset(b, '\0', sizeof(b));  // sizeof(b)!=sizeof(*b)
** }
*/
/* TODO(shess) Obvious candidates for a header file. */
#define CLEAR(b) memset(b, '\0', sizeof(*(b)))

#ifndef NDEBUG
#  define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
#else
#  define SCRAMBLE(b)
#endif

/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10

/* Write a 64-bit variable-length integer to memory starting at p[0].
 * The length of data written will be between 1 and VARINT_MAX bytes.
 * The number of bytes written is returned. */
static int putVarint(char *p, sqlite_int64 v){
  unsigned char *q = (unsigned char *) p;
  sqlite_uint64 vu = v;
  do{
    *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    vu >>= 7;
  }while( vu!=0 );
  q[-1] &= 0x7f;  /* turn off high bit in final byte */
  assert( q - (unsigned char *)p <= VARINT_MAX );
  return (int) (q - (unsigned char *)p);
}

/* Read a 64-bit variable-length integer from memory starting at p[0].
 * Return the number of bytes read, or 0 on error.
 * The value is stored in *v. */
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}

static int getVarint32(const char *p, int *pi){
  sqlite_int64 i;
  int ret = getVarint(p, &i);
  *pi = (int) i;
  assert( *pi==i );
  return ret;
}
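
/* A minimal varint round-trip sketch (illustrative only, not part of
** the module):
**
**   char buf[VARINT_MAX];
**   sqlite_int64 v;
**   int nWrite = putVarint(buf, 300);  // nWrite==2, buf holds 0xAC 0x02
**   int nRead = getVarint(buf, &v);    // nRead==2, v==300
**   assert( nWrite==nRead && v==300 );
*/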

/*******************************************************************/
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion.  It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;

static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  assert( nCapacity>=0 );
  pBuffer->nData = 0;
  pBuffer->nCapacity = nCapacity;
  pBuffer->pData = nCapacity==0 ? NULL : malloc(nCapacity);
}
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
static void dataBufferDestroy(DataBuffer *pBuffer){
  if( pBuffer->pData!=NULL ) free(pBuffer->pData);
  SCRAMBLE(pBuffer);
}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
static void dataBufferAppend(DataBuffer *pBuffer,
                             const char *pSource, int nSource){
  assert( nSource>0 && pSource!=NULL );
  dataBufferExpand(pBuffer, nSource);
  memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
  pBuffer->nData += nSource;
}
static void dataBufferAppend2(DataBuffer *pBuffer,
                              const char *pSource1, int nSource1,
                              const char *pSource2, int nSource2){
  assert( nSource1>0 && pSource1!=NULL );
  assert( nSource2>0 && pSource2!=NULL );
  dataBufferExpand(pBuffer, nSource1+nSource2);
  memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
  memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
  pBuffer->nData += nSource1+nSource2;
}
static void dataBufferReplace(DataBuffer *pBuffer,
                              const char *pSource, int nSource){
  dataBufferReset(pBuffer);
  dataBufferAppend(pBuffer, pSource, nSource);
}
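
/* Typical DataBuffer lifecycle (illustrative sketch):
**
**   DataBuffer buf;
**   dataBufferInit(&buf, 0);           // empty; allocation is deferred
**   dataBufferAppend(&buf, "abc", 3);  // nData==3, capacity grown on demand
**   dataBufferAppend(&buf, "de", 2);   // buf.pData now holds "abcde"
**   // ... consume buf.pData / buf.nData ...
**   dataBufferDestroy(&buf);
*/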

/* StringBuffer is a null-terminated version of DataBuffer. */
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;

static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}

static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  assert( sb->b.nData>0 );
  if( nFrom>0 ){
    sb->b.nData--;
    dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  }
}
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}

/* Append a list of strings separated by commas. */
static void appendList(StringBuffer *sb, int nString, char **azString){
  int i;
  for(i=0; i<nString; ++i){
    if( i>0 ) append(sb, ", ");
    append(sb, azString[i]);
  }
}

static int endsInWhiteSpace(StringBuffer *p){
  return stringBufferLength(p)>0 &&
    safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
  if( !endsInWhiteSpace(p) ) append(p, " ");
}

/* Remove white space from the end of the StringBuffer */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    p->b.pData[--p->b.nData-1] = '\0';
  }
}
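
/* StringBuffer sketch (illustrative; azCols and nCols are
** hypothetical locals): building a comma-separated SQL fragment.
**
**   StringBuffer sb;
**   initStringBuffer(&sb);
**   append(&sb, "create table t(");
**   appendList(&sb, nCols, azCols);    // "c0, c1, c2"
**   append(&sb, ")");
**   // ... stringBufferData(&sb) is always null-terminated ...
**   stringBufferDestroy(&sb);
*/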

/*******************************************************************/
/* DLReader is used to read document elements from a doclist.  The
** current docid is cached, so dlrDocid() is fast.  DLReader does not
** own the doclist buffer.
**
** dlrAtEnd - true if there's no more data to read.
** dlrDocid - docid of current document.
** dlrDocData - doclist data for current document (including docid).
** dlrDocDataBytes - length of same.
** dlrAllDataBytes - length of all remaining data.
** dlrPosData - position data for current document.
** dlrPosDataLen - length of pos data for current document (incl POS_END).
** dlrStep - step to the next document.
** dlrInit - initialize for a doclist of the given type over the given data.
** dlrDestroy - clean up.
**
** Expected usage is something like:
**
**   DLReader reader;
**   dlrInit(&reader, DL_DEFAULT, pData, nData);
**   while( !dlrAtEnd(&reader) ){
**     // calls to dlrDocid() and kin.
**     dlrStep(&reader);
**   }
**   dlrDestroy(&reader);
*/
typedef struct DLReader {
  DocListType iType;
  const char *pData;
  int nData;

  sqlite_int64 iDocid;
  int nElement;
} DLReader;

static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
/* TODO(shess) Consider adding a field to track iDocid varint length
** to make these two functions faster.  This might matter (a tiny bit)
** for queries.
*/
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
static void dlrInit(DLReader *pReader, DocListType iType,
                    const char *pData, int nData){
  assert( pData!=NULL && nData!=0 );
  pReader->iType = iType;
  pReader->pData = pData;
  pReader->nData = nData;
  pReader->nElement = 0;
  pReader->iDocid = 0;

  /* Load the first element's data.  There must be a first element. */
  dlrStep(pReader);
}
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}

#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it's convenient in other assertions for
** DLWriter.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif

/*******************************************************************/
/* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
** always appends to the buffer and does not own it.
**
** dlwInit - initialize to write a given type doclist to a buffer.
** dlwDestroy - clear the writer's memory.  Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwCopy - copy next doclist from reader to writer.
** dlwAdd - construct doclist element and append to buffer.
**    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
  DocListType iType;
  DataBuffer *b;
  sqlite_int64 iPrevDocid;
#ifndef NDEBUG
  int has_iPrevDocid;
#endif
} DLWriter;

static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
  pWriter->b = b;
  pWriter->iType = iType;
  pWriter->iPrevDocid = 0;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 0;
#endif
}
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
/* iFirstDocid is the first docid in the doclist in pData.  It is
** needed because pData may point within a larger doclist, in which
** case the first item would be delta-encoded.
**
** iLastDocid is the final docid in the doclist in pData.  It is
** needed to create the new iPrevDocid for future delta-encoding.  The
** code could decode the passed doclist to recreate iLastDocid, but
** the only current user (docListMerge) already has decoded this
** information.
*/
/* TODO(shess) This has become just a helper for docListMerge.
** Consider a refactor to make this cleaner.
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}
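
/* DLWriter sketch for a DL_DOCIDS doclist (illustrative only):
**
**   DataBuffer buf;
**   DLWriter writer;
**   dataBufferInit(&buf, 0);
**   dlwInit(&writer, DL_DOCIDS, &buf);
**   dlwAdd(&writer, 17);   // appends varint(17)
**   dlwAdd(&writer, 42);   // appends varint(42-17)==varint(25)
**   dlwDestroy(&writer);
**   // buf now holds the doclist; the caller still owns buf.
*/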

/*******************************************************************/
/* PLReader is used to read data from a document's position list.  As
** the caller steps through the list, data is cached so that varints
** only need to be decoded once.
**
** plrInit, plrDestroy - create/destroy a reader.
** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
** plrAtEnd - at end of stream, only call plrDestroy once true.
** plrStep - step to the next element.
*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;
  int iColumn;         /* the last column read */
  int iPosition;       /* the last position read */
  int iStartOffset;    /* the last start offset read */
  int iEndOffset;      /* the last end offset read */
} PLReader;

static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}

static void plrInit(PLReader *pReader, DLReader *pDLReader){
  pReader->pData = dlrPosData(pDLReader);
  pReader->nData = dlrPosDataLen(pDLReader);
  pReader->iType = pDLReader->iType;
  pReader->iColumn = 0;
  pReader->iPosition = 0;
  pReader->iStartOffset = 0;
  pReader->iEndOffset = 0;
  plrStep(pReader);
}
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}
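
/* Expected PLReader usage (illustrative sketch), nested inside a
** DLReader loop over a doclist:
**
**   PLReader plReader;
**   plrInit(&plReader, &dlReader);
**   while( !plrAtEnd(&plReader) ){
**     // calls to plrColumn(), plrPosition(), and kin.
**     plrStep(&plReader);
**   }
**   plrDestroy(&plReader);
*/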

/*******************************************************************/
/* PLWriter is used in constructing a document's position list.  As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
** plwDestroy - clear a writer.
** plwAdd - append position and offset information.
** plwCopy - copy next position's data from reader to writer.
** plwTerminate - add any necessary doclist terminator.
**
** Calling plwAdd() after plwTerminate() may result in a corrupt
** doclist.
*/
/* TODO(shess) Until we've written the second item, we can cache the
** first item's information.  Then we'd have three states:
**
** - initialized with docid, no positions.
** - docid and one position.
** - docid and multiple positions.
**
** Only the last state needs to actually write to dlw->b, which would
** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
  DLWriter *dlw;

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written */
  int iOffset;    /* the last start offset written */
} PLWriter;

/* TODO(shess) In the case where the parent is reading these values
** from a PLReader, we could optimize to a copy if that PLReader has
** the same type as pWriter.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
/* TODO(shess) Should plwDestroy() also terminate the doclist?  But
** then plwDestroy() would no longer be just a destructor, it would
** also be doing work, which isn't consistent with the overall idiom.
** Another option would be for plwAdd() to always append any necessary
** terminator, so that the output is always correct.  But that would
** add incremental work to the common case with the only benefit being
** API elegance.  Punt for now.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}
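
/* PLWriter sketch (illustrative only): writing one document's
** positions through an existing DLWriter.
**
**   PLWriter plWriter;
**   plwInit(&plWriter, &dlWriter, iDocid);  // emits the docid delta
**   plwAdd(&plWriter, 0, 1, 0, 0);          // column 0, position 1
**   plwAdd(&plWriter, 0, 4, 0, 0);          // column 0, position 4
**   plwTerminate(&plWriter);                // emits POS_END
**   plwDestroy(&plWriter);
*/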

/*******************************************************************/
/* DLCollector wraps PLWriter and DLWriter to provide a
** dynamically-allocated doclist area to use during tokenization.
**
** dlcNew - malloc up and initialize a collector.
** dlcDelete - destroy a collector and all contained items.
** dlcAddPos - append position and offset information.
** dlcAddDoclist - add the collected doclist to the given buffer.
** dlcNext - terminate the current document and open another.
*/
typedef struct DLCollector {
  DataBuffer b;
  DLWriter dlw;
  PLWriter plw;
} DLCollector;

/* TODO(shess) This could also be done by calling plwTerminate() and
** dataBufferAppend().  I tried that, expecting nominal performance
** differences, but it seemed to pretty reliably be worth 1% to code
** it this way.  I suspect it's the incremental malloc overhead (some
** percentage of the plwTerminate() calls will cause a realloc), so
** this might be worth revisiting if the DataBuffer implementation
** changes.
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  if( pCollector->dlw.iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
  }else{
    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
  }
}
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}

static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
  DLCollector *pCollector = malloc(sizeof(DLCollector));
  dataBufferInit(&pCollector->b, 0);
  dlwInit(&pCollector->dlw, iType, &pCollector->b);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
  return pCollector;
}
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  free(pCollector);
}
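
/* DLCollector sketch (illustrative only): collecting one term's hits
** while tokenizing a document.
**
**   DLCollector *p = dlcNew(iDocid, DL_DEFAULT);
**   dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
**   // ... one dlcAddPos() call per hit for this term ...
**   dlcAddDoclist(p, &buf);   // terminate and copy into buf
**   dlcDelete(p);
*/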


/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go.  Only columns matching iColumn are
** copied, all columns copied if iColumn is -1.  Elements with no
** matching columns are dropped.  The output is an iOutType doclist.
*/
/* NOTE(shess) This code is only valid after all doclists are merged.
** If this is run before merges, then doclist items which represent
** deletion will be trimmed, and will thus not effect a deletion
** during the merge.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}
/* Used by docListMerge() to keep doclists in ascending order by
** docid, then ascending order by age (so the newest comes first).
*/
typedef struct OrderedDLReader {
  DLReader *pReader;

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;
} OrderedDLReader;

/* Order eof to end, then by docid asc, idx desc. */
static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
  if( dlrAtEnd(r1->pReader) ){
    if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
    return 1;                              /* Only r1 atEnd(). */
  }
  if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */

  if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
  if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;

  /* Descending on idx. */
  return r2->idx-r1->idx;
}

/* Bubble p[0] to appropriate place in p[1..n-1].  Assumes that
** p[1..n-1] is already sorted.
*/
/* TODO(shess) Is this frequent enough to warrant a binary search?
** Before implementing that, instrument the code to check.  In most
** current usage, I expect that p[0] will be less than p[1] a very
** high proportion of the time.
*/
static void orderedDLReaderReorder(OrderedDLReader *p, int n){
  while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
    OrderedDLReader tmp = p[0];
    p[0] = p[1];
    p[1] = tmp;
    n--;
    p++;
  }
}

/* Given an array of doclist readers, merge their doclist elements
** into out in sorted order (by docid), dropping elements from older
** readers when there is a duplicate docid.  pReaders is assumed to be
** ordered by age, oldest first.
*/
/* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
** be fixed.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;
  int nStart = 0;
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
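
/* docListMerge sketch (illustrative only): merging two doclists for
** the same term, ordered oldest first so newer data wins on
** duplicate docids.
**
**   DLReader readers[2];
**   dlrInit(&readers[0], DL_DEFAULT, pOld, nOld);  // older segment
**   dlrInit(&readers[1], DL_DEFAULT, pNew, nNew);  // newer segment
**   docListMerge(&out, readers, 2);
**   dlrDestroy(&readers[0]);
**   dlrDestroy(&readers[1]);
*/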

/* Helper function for posListUnion().  Compares the current position
** between left and right, returning, per the standard C idiom, <0 if
1254 | ** left<right, >0 if left>right, and 0 if left==right. "End" always | ||
1255 | ** compares greater. | ||
1256 | */ | ||
1257 | static int posListCmp(PLReader *pLeft, PLReader *pRight){ | ||
1258 | assert( pLeft->iType==pRight->iType ); | ||
1259 | if( pLeft->iType==DL_DOCIDS ) return 0; | ||
1260 | |||
1261 | if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1; | ||
1262 | if( plrAtEnd(pRight) ) return -1; | ||
1263 | |||
1264 | if( plrColumn(pLeft)<plrColumn(pRight) ) return -1; | ||
1265 | if( plrColumn(pLeft)>plrColumn(pRight) ) return 1; | ||
1266 | |||
1267 | if( plrPosition(pLeft)<plrPosition(pRight) ) return -1; | ||
1268 | if( plrPosition(pLeft)>plrPosition(pRight) ) return 1; | ||
1269 | if( pLeft->iType==DL_POSITIONS ) return 0; | ||
1270 | |||
1271 | if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1; | ||
1272 | if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1; | ||
1273 | |||
1274 | if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1; | ||
1275 | if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1; | ||
1276 | |||
1277 | return 0; | ||
1278 | } | ||
1279 | |||
1280 | /* Write the union of position lists in pLeft and pRight to pOut. | ||
1281 | ** "Union" in this case meaning "All unique position tuples". Should | ||
1282 | ** work with any doclist type, though both inputs and the output | ||
1283 | ** should be the same type. | ||
1284 | */ | ||
1285 | static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){ | ||
1286 | PLReader left, right; | ||
1287 | PLWriter writer; | ||
1288 | |||
1289 | assert( dlrDocid(pLeft)==dlrDocid(pRight) ); | ||
1290 | assert( pLeft->iType==pRight->iType ); | ||
1291 | assert( pLeft->iType==pOut->iType ); | ||
1292 | |||
1293 | plrInit(&left, pLeft); | ||
1294 | plrInit(&right, pRight); | ||
1295 | plwInit(&writer, pOut, dlrDocid(pLeft)); | ||
1296 | |||
1297 | while( !plrAtEnd(&left) || !plrAtEnd(&right) ){ | ||
1298 | int c = posListCmp(&left, &right); | ||
1299 | if( c<0 ){ | ||
1300 | plwCopy(&writer, &left); | ||
1301 | plrStep(&left); | ||
1302 | }else if( c>0 ){ | ||
1303 | plwCopy(&writer, &right); | ||
1304 | plrStep(&right); | ||
1305 | }else{ | ||
1306 | plwCopy(&writer, &left); | ||
1307 | plrStep(&left); | ||
1308 | plrStep(&right); | ||
1309 | } | ||
1310 | } | ||
1311 | |||
1312 | plwTerminate(&writer); | ||
1313 | plwDestroy(&writer); | ||
1314 | plrDestroy(&left); | ||
1315 | plrDestroy(&right); | ||
1316 | } | ||
1317 | |||
1318 | /* Write the union of doclists in pLeft and pRight to pOut. For | ||
1319 | ** docids in common between the inputs, the union of the position | ||
1320 | ** lists is written. Inputs and outputs are always type DL_DEFAULT. | ||
1321 | */ | ||
1322 | static void docListUnion( | ||
1323 | const char *pLeft, int nLeft, | ||
1324 | const char *pRight, int nRight, | ||
1325 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1326 | ){ | ||
1327 | DLReader left, right; | ||
1328 | DLWriter writer; | ||
1329 | |||
1330 | if( nLeft==0 ){ | ||
1331 | dataBufferAppend(pOut, pRight, nRight); | ||
1332 | return; | ||
1333 | } | ||
1334 | if( nRight==0 ){ | ||
1335 | dataBufferAppend(pOut, pLeft, nLeft); | ||
1336 | return; | ||
1337 | } | ||
1338 | |||
1339 | dlrInit(&left, DL_DEFAULT, pLeft, nLeft); | ||
1340 | dlrInit(&right, DL_DEFAULT, pRight, nRight); | ||
1341 | dlwInit(&writer, DL_DEFAULT, pOut); | ||
1342 | |||
1343 | while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){ | ||
1344 | if( dlrAtEnd(&right) ){ | ||
1345 | dlwCopy(&writer, &left); | ||
1346 | dlrStep(&left); | ||
1347 | }else if( dlrAtEnd(&left) ){ | ||
1348 | dlwCopy(&writer, &right); | ||
1349 | dlrStep(&right); | ||
1350 | }else if( dlrDocid(&left)<dlrDocid(&right) ){ | ||
1351 | dlwCopy(&writer, &left); | ||
1352 | dlrStep(&left); | ||
1353 | }else if( dlrDocid(&left)>dlrDocid(&right) ){ | ||
1354 | dlwCopy(&writer, &right); | ||
1355 | dlrStep(&right); | ||
1356 | }else{ | ||
1357 | posListUnion(&left, &right, &writer); | ||
1358 | dlrStep(&left); | ||
1359 | dlrStep(&right); | ||
1360 | } | ||
1361 | } | ||
1362 | |||
1363 | dlrDestroy(&left); | ||
1364 | dlrDestroy(&right); | ||
1365 | dlwDestroy(&writer); | ||
1366 | } | ||
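/* Editor's illustrative sketch -- not part of the original file. A
** minimal example of how a caller might OR together the doclists of
** two terms, assuming the DataBuffer helpers dataBufferInit() and
** dataBufferDestroy() defined earlier in this file; pDl1/nDl1 and
** pDl2/nDl2 are hypothetical inputs. Guarded by #if 0 so it never
** compiles into the build.
*/
#if 0
static void exampleDocListUnion(const char *pDl1, int nDl1,
                                const char *pDl2, int nDl2){
  DataBuffer out;
  dataBufferInit(&out, 0);
  docListUnion(pDl1, nDl1, pDl2, nDl2, &out);
  /* out.pData/out.nData now hold a DL_DEFAULT doclist containing every
  ** docid from either input; docids present in both inputs carry the
  ** union of their position lists. */
  dataBufferDestroy(&out);
}
#endif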
1367 | |||
1368 | /* pLeft and pRight are DLReaders positioned to the same docid. | ||
1369 | ** | ||
1370 | ** If there are no instances in pLeft or pRight where the position | ||
1371 | ** of pLeft is one less than the position of pRight, then this | ||
1372 | ** routine adds nothing to pOut. | ||
1373 | ** | ||
1374 | ** If there are one or more instances where positions from pLeft | ||
1375 | ** are exactly one less than positions from pRight, then add a new | ||
1376 | ** document record to pOut. If pOut wants to hold positions, then | ||
1377 | ** include the positions from pRight that are one more than a | ||
1378 | ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1. | ||
1379 | */ | ||
1380 | static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight, | ||
1381 | DLWriter *pOut){ | ||
1382 | PLReader left, right; | ||
1383 | PLWriter writer; | ||
1384 | int match = 0; | ||
1385 | |||
1386 | assert( dlrDocid(pLeft)==dlrDocid(pRight) ); | ||
1387 | assert( pOut->iType!=DL_POSITIONS_OFFSETS ); | ||
1388 | |||
1389 | plrInit(&left, pLeft); | ||
1390 | plrInit(&right, pRight); | ||
1391 | |||
1392 | while( !plrAtEnd(&left) && !plrAtEnd(&right) ){ | ||
1393 | if( plrColumn(&left)<plrColumn(&right) ){ | ||
1394 | plrStep(&left); | ||
1395 | }else if( plrColumn(&left)>plrColumn(&right) ){ | ||
1396 | plrStep(&right); | ||
1397 | }else if( plrPosition(&left)+1<plrPosition(&right) ){ | ||
1398 | plrStep(&left); | ||
1399 | }else if( plrPosition(&left)+1>plrPosition(&right) ){ | ||
1400 | plrStep(&right); | ||
1401 | }else{ | ||
1402 | if( !match ){ | ||
1403 | plwInit(&writer, pOut, dlrDocid(pLeft)); | ||
1404 | match = 1; | ||
1405 | } | ||
1406 | plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0); | ||
1407 | plrStep(&left); | ||
1408 | plrStep(&right); | ||
1409 | } | ||
1410 | } | ||
1411 | |||
1412 | if( match ){ | ||
1413 | plwTerminate(&writer); | ||
1414 | plwDestroy(&writer); | ||
1415 | } | ||
1416 | |||
1417 | plrDestroy(&left); | ||
1418 | plrDestroy(&right); | ||
1419 | } | ||
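/* Worked example (editor's note): suppose that within one docid the
** left term occurs at positions {5, 20} and the right term at {6, 9},
** all in the same column. Only 5+1==6 matches, so a single document
** record is added to pOut carrying position 6 -- the position taken
** from the right-hand term, per the comment above posListPhraseMerge().
*/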
1420 | |||
1421 | /* We have two doclists with positions: pLeft and pRight. | ||
1422 | ** Write the phrase intersection of these two doclists into pOut. | ||
1423 | ** | ||
1424 | ** A document appears in the phrase intersection only if it contains | ||
1425 | ** positions with pLeft.iPos+1==pRight.iPos. | ||
1426 | ** | ||
1427 | ** iType controls the type of data written to pOut. If iType is | ||
1428 | ** DL_POSITIONS, the positions are those from pRight. | ||
1429 | */ | ||
1430 | static void docListPhraseMerge( | ||
1431 | const char *pLeft, int nLeft, | ||
1432 | const char *pRight, int nRight, | ||
1433 | DocListType iType, | ||
1434 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1435 | ){ | ||
1436 | DLReader left, right; | ||
1437 | DLWriter writer; | ||
1438 | |||
1439 | if( nLeft==0 || nRight==0 ) return; | ||
1440 | |||
1441 | assert( iType!=DL_POSITIONS_OFFSETS ); | ||
1442 | |||
1443 | dlrInit(&left, DL_POSITIONS, pLeft, nLeft); | ||
1444 | dlrInit(&right, DL_POSITIONS, pRight, nRight); | ||
1445 | dlwInit(&writer, iType, pOut); | ||
1446 | |||
1447 | while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ | ||
1448 | if( dlrDocid(&left)<dlrDocid(&right) ){ | ||
1449 | dlrStep(&left); | ||
1450 | }else if( dlrDocid(&right)<dlrDocid(&left) ){ | ||
1451 | dlrStep(&right); | ||
1452 | }else{ | ||
1453 | posListPhraseMerge(&left, &right, &writer); | ||
1454 | dlrStep(&left); | ||
1455 | dlrStep(&right); | ||
1456 | } | ||
1457 | } | ||
1458 | |||
1459 | dlrDestroy(&left); | ||
1460 | dlrDestroy(&right); | ||
1461 | dlwDestroy(&writer); | ||
1462 | } | ||
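/* Editor's sketch (assumptions flagged; guarded by #if 0): a
** three-word phrase "a b c" can be evaluated by chaining pairwise
** merges. Requesting DL_POSITIONS for the intermediate result keeps
** the positions of the rightmost word ("b"), which is exactly what the
** next merge against "c" needs. pA/nA, pB/nB and pC/nC are
** hypothetical doclists for the three words.
*/
#if 0
  DataBuffer ab, abc;
  dataBufferInit(&ab, 0);
  docListPhraseMerge(pA, nA, pB, nB, DL_POSITIONS, &ab);           /* "a b" */
  dataBufferInit(&abc, 0);
  docListPhraseMerge(ab.pData, ab.nData, pC, nC, DL_DOCIDS, &abc); /* "a b c" */
  /* abc.pData/abc.nData hold the final docid-only result here. */
  dataBufferDestroy(&ab);
  dataBufferDestroy(&abc);
#endif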
1463 | |||
1464 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1465 | ** Write the intersection of these two doclists into pOut as a | ||
1466 | ** DL_DOCIDS doclist. | ||
1467 | */ | ||
1468 | static void docListAndMerge( | ||
1469 | const char *pLeft, int nLeft, | ||
1470 | const char *pRight, int nRight, | ||
1471 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1472 | ){ | ||
1473 | DLReader left, right; | ||
1474 | DLWriter writer; | ||
1475 | |||
1476 | if( nLeft==0 || nRight==0 ) return; | ||
1477 | |||
1478 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); | ||
1479 | dlrInit(&right, DL_DOCIDS, pRight, nRight); | ||
1480 | dlwInit(&writer, DL_DOCIDS, pOut); | ||
1481 | |||
1482 | while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ | ||
1483 | if( dlrDocid(&left)<dlrDocid(&right) ){ | ||
1484 | dlrStep(&left); | ||
1485 | }else if( dlrDocid(&right)<dlrDocid(&left) ){ | ||
1486 | dlrStep(&right); | ||
1487 | }else{ | ||
1488 | dlwAdd(&writer, dlrDocid(&left)); | ||
1489 | dlrStep(&left); | ||
1490 | dlrStep(&right); | ||
1491 | } | ||
1492 | } | ||
1493 | |||
1494 | dlrDestroy(&left); | ||
1495 | dlrDestroy(&right); | ||
1496 | dlwDestroy(&writer); | ||
1497 | } | ||
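/* Editor's example: with pLeft holding docids {1, 3, 5} and pRight
** holding {3, 5, 7}, the loop above writes {3, 5}. Position data is
** deliberately absent: DL_DOCIDS doclists carry docids only.
*/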
1498 | |||
1499 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1500 | ** Write the union of these two doclists into pOut as a | ||
1501 | ** DL_DOCIDS doclist. | ||
1502 | */ | ||
1503 | static void docListOrMerge( | ||
1504 | const char *pLeft, int nLeft, | ||
1505 | const char *pRight, int nRight, | ||
1506 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1507 | ){ | ||
1508 | DLReader left, right; | ||
1509 | DLWriter writer; | ||
1510 | |||
1511 | if( nLeft==0 ){ | ||
1512 | dataBufferAppend(pOut, pRight, nRight); | ||
1513 | return; | ||
1514 | } | ||
1515 | if( nRight==0 ){ | ||
1516 | dataBufferAppend(pOut, pLeft, nLeft); | ||
1517 | return; | ||
1518 | } | ||
1519 | |||
1520 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); | ||
1521 | dlrInit(&right, DL_DOCIDS, pRight, nRight); | ||
1522 | dlwInit(&writer, DL_DOCIDS, pOut); | ||
1523 | |||
1524 | while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){ | ||
1525 | if( dlrAtEnd(&right) ){ | ||
1526 | dlwAdd(&writer, dlrDocid(&left)); | ||
1527 | dlrStep(&left); | ||
1528 | }else if( dlrAtEnd(&left) ){ | ||
1529 | dlwAdd(&writer, dlrDocid(&right)); | ||
1530 | dlrStep(&right); | ||
1531 | }else if( dlrDocid(&left)<dlrDocid(&right) ){ | ||
1532 | dlwAdd(&writer, dlrDocid(&left)); | ||
1533 | dlrStep(&left); | ||
1534 | }else if( dlrDocid(&right)<dlrDocid(&left) ){ | ||
1535 | dlwAdd(&writer, dlrDocid(&right)); | ||
1536 | dlrStep(&right); | ||
1537 | }else{ | ||
1538 | dlwAdd(&writer, dlrDocid(&left)); | ||
1539 | dlrStep(&left); | ||
1540 | dlrStep(&right); | ||
1541 | } | ||
1542 | } | ||
1543 | |||
1544 | dlrDestroy(&left); | ||
1545 | dlrDestroy(&right); | ||
1546 | dlwDestroy(&writer); | ||
1547 | } | ||
1548 | |||
1549 | /* We have two DL_DOCIDS doclists: pLeft and pRight. | ||
1550 | ** Write into pOut a DL_DOCIDS doclist containing all documents that | ||
1551 | ** occur in pLeft but not in pRight. | ||
1552 | */ | ||
1553 | static void docListExceptMerge( | ||
1554 | const char *pLeft, int nLeft, | ||
1555 | const char *pRight, int nRight, | ||
1556 | DataBuffer *pOut /* Write the combined doclist here */ | ||
1557 | ){ | ||
1558 | DLReader left, right; | ||
1559 | DLWriter writer; | ||
1560 | |||
1561 | if( nLeft==0 ) return; | ||
1562 | if( nRight==0 ){ | ||
1563 | dataBufferAppend(pOut, pLeft, nLeft); | ||
1564 | return; | ||
1565 | } | ||
1566 | |||
1567 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); | ||
1568 | dlrInit(&right, DL_DOCIDS, pRight, nRight); | ||
1569 | dlwInit(&writer, DL_DOCIDS, pOut); | ||
1570 | |||
1571 | while( !dlrAtEnd(&left) ){ | ||
1572 | while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){ | ||
1573 | dlrStep(&right); | ||
1574 | } | ||
1575 | if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){ | ||
1576 | dlwAdd(&writer, dlrDocid(&left)); | ||
1577 | } | ||
1578 | dlrStep(&left); | ||
1579 | } | ||
1580 | |||
1581 | dlrDestroy(&left); | ||
1582 | dlrDestroy(&right); | ||
1583 | dlwDestroy(&writer); | ||
1584 | } | ||
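/* Worked example (editor's note): with pLeft holding docids {1, 3, 5}
** and pRight holding {3}, the loop above emits {1, 5} -- i.e. the
** doclist for a query of the form [apple -banana].
*/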
1585 | |||
1586 | static char *string_dup_n(const char *s, int n){ | ||
1587 | char *str = malloc(n + 1); | ||
1588 | memcpy(str, s, n); | ||
1589 | str[n] = '\0'; | ||
1590 | return str; | ||
1591 | } | ||
1592 | |||
1593 | /* Duplicate a string; the caller must free() the returned string. | ||
1594 | * (We don't use strdup() since it's not part of the standard C library and | ||
1595 | * may not be available everywhere.) */ | ||
1596 | static char *string_dup(const char *s){ | ||
1597 | return string_dup_n(s, strlen(s)); | ||
1598 | } | ||
1599 | |||
1600 | /* Format a string, replacing each occurrence of the % character with | ||
1601 | * zDb.zName. This may be more convenient than sqlite3_mprintf() | ||
1602 | * when one string is used repeatedly in a format string. | ||
1603 | * The caller must free() the returned string. */ | ||
1604 | static char *string_format(const char *zFormat, | ||
1605 | const char *zDb, const char *zName){ | ||
1606 | const char *p; | ||
1607 | size_t len = 0; | ||
1608 | size_t nDb = strlen(zDb); | ||
1609 | size_t nName = strlen(zName); | ||
1610 | size_t nFullTableName = nDb+1+nName; | ||
1611 | char *result; | ||
1612 | char *r; | ||
1613 | |||
1614 | /* first compute length needed */ | ||
1615 | for(p = zFormat ; *p ; ++p){ | ||
1616 | len += (*p=='%' ? nFullTableName : 1); | ||
1617 | } | ||
1618 | len += 1; /* for null terminator */ | ||
1619 | |||
1620 | r = result = malloc(len); | ||
1621 | for(p = zFormat; *p; ++p){ | ||
1622 | if( *p=='%' ){ | ||
1623 | memcpy(r, zDb, nDb); | ||
1624 | r += nDb; | ||
1625 | *r++ = '.'; | ||
1626 | memcpy(r, zName, nName); | ||
1627 | r += nName; | ||
1628 | } else { | ||
1629 | *r++ = *p; | ||
1630 | } | ||
1631 | } | ||
1632 | *r++ = '\0'; | ||
1633 | assert( r == result + len ); | ||
1634 | return result; | ||
1635 | } | ||
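/* Usage example (editor's note): with zDb=="main" and zName=="t",
** string_format("delete from %_content", "main", "t") returns the
** string "delete from main.t_content", which the caller must free().
*/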
1636 | |||
1637 | static int sql_exec(sqlite3 *db, const char *zDb, const char *zName, | ||
1638 | const char *zFormat){ | ||
1639 | char *zCommand = string_format(zFormat, zDb, zName); | ||
1640 | int rc; | ||
1641 | TRACE(("FTS3 sql: %s\n", zCommand)); | ||
1642 | rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); | ||
1643 | free(zCommand); | ||
1644 | return rc; | ||
1645 | } | ||
1646 | |||
1647 | static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName, | ||
1648 | sqlite3_stmt **ppStmt, const char *zFormat){ | ||
1649 | char *zCommand = string_format(zFormat, zDb, zName); | ||
1650 | int rc; | ||
1651 | TRACE(("FTS3 prepare: %s\n", zCommand)); | ||
1652 | rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL); | ||
1653 | free(zCommand); | ||
1654 | return rc; | ||
1655 | } | ||
1656 | |||
1657 | /* end utility functions */ | ||
1658 | |||
1659 | /* Forward reference */ | ||
1660 | typedef struct fulltext_vtab fulltext_vtab; | ||
1661 | |||
1662 | /* A single term in a query is represented by an instance of | ||
1663 | ** the following structure. | ||
1664 | */ | ||
1665 | typedef struct QueryTerm { | ||
1666 | short int nPhrase; /* How many following terms are part of the same phrase */ | ||
1667 | short int iPhrase; /* This is the i-th term of a phrase. */ | ||
1668 | short int iColumn; /* Column of the index that must match this term */ | ||
1669 | signed char isOr; /* this term is preceded by "OR" */ | ||
1670 | signed char isNot; /* this term is preceded by "-" */ | ||
1671 | signed char isPrefix; /* this term is followed by "*" */ | ||
1672 | char *pTerm; /* text of the term. '\000' terminated. malloced */ | ||
1673 | int nTerm; /* Number of bytes in pTerm[] */ | ||
1674 | } QueryTerm; | ||
1675 | |||
1676 | |||
1677 | /* A query string is parsed into a Query structure. | ||
1678 | * | ||
1679 | * We could, in theory, allow query strings to be complicated | ||
1680 | * nested expressions with precedence determined by parentheses. | ||
1681 | * But none of the major search engines do this. (Perhaps the | ||
1682 | * feeling is that a parenthesized expression is too complex | ||
1683 | * an idea for the average user to grasp.) Taking our lead from | ||
1684 | * the major search engines, we will allow queries to be a list | ||
1685 | * of terms (with an implied AND operator) or phrases in double-quotes, | ||
1686 | * with a single optional "-" before each non-phrase term to designate | ||
1687 | * negation and an optional OR connector. | ||
1688 | * | ||
1689 | * OR binds more tightly than the implied AND, which is what the | ||
1690 | * major search engines seem to do. So, for example: | ||
1691 | * | ||
1692 | * [one two OR three] ==> one AND (two OR three) | ||
1693 | * [one OR two three] ==> (one OR two) AND three | ||
1694 | * | ||
1695 | * A "-" before a term matches all entries that lack that term. | ||
1696 | * The "-" must occur immediately before the term with in intervening | ||
1697 | * space. This is how the search engines do it. | ||
1698 | * | ||
1699 | * A NOT term cannot be the right-hand operand of an OR. If this | ||
1700 | * occurs in the query string, the NOT is ignored: | ||
1701 | * | ||
1702 | * [one OR -two] ==> one OR two | ||
1703 | * | ||
1704 | */ | ||
1705 | typedef struct Query { | ||
1706 | fulltext_vtab *pFts; /* The full text index */ | ||
1707 | int nTerms; /* Number of terms in the query */ | ||
1708 | QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */ | ||
1709 | int nextIsOr; /* Set the isOr flag on the next inserted term */ | ||
1710 | int nextColumn; /* Next word parsed must be in this column */ | ||
1711 | int dfltColumn; /* The default column */ | ||
1712 | } Query; | ||
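/* Editor's example, derived from the rules above: parsing
** [one two OR three] yields nTerms==3 with pTerms[2].isOr set, because
** the term "three" is preceded by OR; evaluation then groups the
** terms as: one AND (two OR three).
*/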
1713 | |||
1714 | |||
1715 | /* | ||
1716 | ** An instance of the following structure keeps track of generated | ||
1717 | ** matching-word offset information and snippets. | ||
1718 | */ | ||
1719 | typedef struct Snippet { | ||
1720 | int nMatch; /* Total number of matches */ | ||
1721 | int nAlloc; /* Space allocated for aMatch[] */ | ||
1722 | struct snippetMatch { /* One entry for each matching term */ | ||
1723 | char snStatus; /* Status flag for use while constructing snippets */ | ||
1724 | short int iCol; /* The column that contains the match */ | ||
1725 | short int iTerm; /* The index in Query.pTerms[] of the matching term */ | ||
1726 | short int nByte; /* Number of bytes in the term */ | ||
1727 | int iStart; /* The offset to the first character of the term */ | ||
1728 | } *aMatch; /* Points to space obtained from malloc */ | ||
1729 | char *zOffset; /* Text rendering of aMatch[] */ | ||
1730 | int nOffset; /* strlen(zOffset) */ | ||
1731 | char *zSnippet; /* Snippet text */ | ||
1732 | int nSnippet; /* strlen(zSnippet) */ | ||
1733 | } Snippet; | ||
1734 | |||
1735 | |||
1736 | typedef enum QueryType { | ||
1737 | QUERY_GENERIC, /* table scan */ | ||
1738 | QUERY_DOCID, /* lookup by docid */ | ||
1739 | QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i */ | ||
1740 | } QueryType; | ||
1741 | |||
1742 | typedef enum fulltext_statement { | ||
1743 | CONTENT_INSERT_STMT, | ||
1744 | CONTENT_SELECT_STMT, | ||
1745 | CONTENT_UPDATE_STMT, | ||
1746 | CONTENT_DELETE_STMT, | ||
1747 | |||
1748 | BLOCK_INSERT_STMT, | ||
1749 | BLOCK_SELECT_STMT, | ||
1750 | BLOCK_DELETE_STMT, | ||
1751 | |||
1752 | SEGDIR_MAX_INDEX_STMT, | ||
1753 | SEGDIR_SET_STMT, | ||
1754 | SEGDIR_SELECT_STMT, | ||
1755 | SEGDIR_SPAN_STMT, | ||
1756 | SEGDIR_DELETE_STMT, | ||
1757 | SEGDIR_SELECT_ALL_STMT, | ||
1758 | |||
1759 | MAX_STMT /* Always at end! */ | ||
1760 | } fulltext_statement; | ||
1761 | |||
1762 | /* These must exactly match the enum above. */ | ||
1763 | /* TODO(shess): Is there some risk that a statement will be used in two | ||
1764 | ** cursors at once, e.g. if a query joins a virtual table to itself? | ||
1765 | ** If so perhaps we should move some of these to the cursor object. | ||
1766 | */ | ||
1767 | static const char *const fulltext_zStatement[MAX_STMT] = { | ||
1768 | /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */ | ||
1769 | /* CONTENT_SELECT */ NULL, /* generated in contentSelectStatement() */ | ||
1770 | /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */ | ||
1771 | /* CONTENT_DELETE */ "delete from %_content where docid = ?", | ||
1772 | |||
1773 | /* BLOCK_INSERT */ | ||
1774 | "insert into %_segments (blockid, block) values (null, ?)", | ||
1775 | /* BLOCK_SELECT */ "select block from %_segments where blockid = ?", | ||
1776 | /* BLOCK_DELETE */ "delete from %_segments where blockid between ? and ?", | ||
1777 | |||
1778 | /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?", | ||
1779 | /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)", | ||
1780 | /* SEGDIR_SELECT */ | ||
1781 | "select start_block, leaves_end_block, root from %_segdir " | ||
1782 | " where level = ? order by idx", | ||
1783 | /* SEGDIR_SPAN */ | ||
1784 | "select min(start_block), max(end_block) from %_segdir " | ||
1785 | " where level = ? and start_block <> 0", | ||
1786 | /* SEGDIR_DELETE */ "delete from %_segdir where level = ?", | ||
1787 | /* SEGDIR_SELECT_ALL */ | ||
1788 | "select root, leaves_end_block from %_segdir order by level desc, idx", | ||
1789 | }; | ||
1790 | |||
1791 | /* | ||
1792 | ** A connection to a fulltext index is an instance of the following | ||
1793 | ** structure. The xCreate and xConnect methods create an instance | ||
1794 | ** of this structure and xDestroy and xDisconnect free that instance. | ||
1795 | ** All other methods receive a pointer to the structure as one of their | ||
1796 | ** arguments. | ||
1797 | */ | ||
1798 | struct fulltext_vtab { | ||
1799 | sqlite3_vtab base; /* Base class used by SQLite core */ | ||
1800 | sqlite3 *db; /* The database connection */ | ||
1801 | const char *zDb; /* logical database name */ | ||
1802 | const char *zName; /* virtual table name */ | ||
1803 | int nColumn; /* number of columns in virtual table */ | ||
1804 | char **azColumn; /* column names. malloced */ | ||
1805 | char **azContentColumn; /* column names in content table; malloced */ | ||
1806 | sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ | ||
1807 | |||
1808 | /* Precompiled statements which we keep as long as the table is | ||
1809 | ** open. | ||
1810 | */ | ||
1811 | sqlite3_stmt *pFulltextStatements[MAX_STMT]; | ||
1812 | |||
1813 | /* Precompiled statements used for segment merges. We run a | ||
1814 | ** separate select across the leaf level of each tree being merged. | ||
1815 | */ | ||
1816 | sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT]; | ||
1817 | /* The statement used to prepare pLeafSelectStmts. */ | ||
1818 | #define LEAF_SELECT \ | ||
1819 | "select block from %_segments where blockid between ? and ? order by blockid" | ||
1820 | |||
1821 | /* These buffer pending index updates during transactions. | ||
1822 | ** nPendingData estimates the memory size of the pending data. It | ||
1823 | ** doesn't include the hash-bucket overhead, nor any malloc | ||
1824 | ** overhead. When nPendingData exceeds kPendingThreshold, the | ||
1825 | ** buffer is flushed even before the transaction closes. | ||
1826 | ** pendingTerms stores the data, and is only valid when nPendingData | ||
1827 | ** is >=0 (nPendingData<0 means pendingTerms has not been | ||
1828 | ** initialized). iPrevDocid is the last docid written, used to make | ||
1829 | ** certain we're inserting in sorted order. | ||
1830 | */ | ||
1831 | int nPendingData; | ||
1832 | #define kPendingThreshold (1*1024*1024) | ||
1833 | sqlite_int64 iPrevDocid; | ||
1834 | fts3Hash pendingTerms; | ||
1835 | }; | ||
1836 | |||
1837 | /* | ||
1838 | ** When the core wants to do a query, it creates a cursor using a | ||
1839 | ** call to xOpen. This structure is an instance of a cursor. It | ||
1840 | ** is destroyed by xClose. | ||
1841 | */ | ||
1842 | typedef struct fulltext_cursor { | ||
1843 | sqlite3_vtab_cursor base; /* Base class used by SQLite core */ | ||
1844 | QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */ | ||
1845 | sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */ | ||
1846 | int eof; /* True if at End Of Results */ | ||
1847 | Query q; /* Parsed query string */ | ||
1848 | Snippet snippet; /* Cached snippet for the current row */ | ||
1849 | int iColumn; /* Column being searched */ | ||
1850 | DataBuffer result; /* Doclist results from fulltextQuery */ | ||
1851 | DLReader reader; /* Result reader if result not empty */ | ||
1852 | } fulltext_cursor; | ||
1853 | |||
1854 | static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ | ||
1855 | return (fulltext_vtab *) c->base.pVtab; | ||
1856 | } | ||
1857 | |||
1858 | static const sqlite3_module fts3Module; /* forward declaration */ | ||
1859 | |||
1860 | /* Return a dynamically generated statement of the form | ||
1861 | * insert into %_content (docid, ...) values (?, ...) | ||
1862 | */ | ||
1863 | static const char *contentInsertStatement(fulltext_vtab *v){ | ||
1864 | StringBuffer sb; | ||
1865 | int i; | ||
1866 | |||
1867 | initStringBuffer(&sb); | ||
1868 | append(&sb, "insert into %_content (docid, "); | ||
1869 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
1870 | append(&sb, ") values (?"); | ||
1871 | for(i=0; i<v->nColumn; ++i) | ||
1872 | append(&sb, ", ?"); | ||
1873 | append(&sb, ")"); | ||
1874 | return stringBufferData(&sb); | ||
1875 | } | ||
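/* Editor's example: for a table created with columns (subject, body),
** the generated statement reads:
**
**   insert into %_content (docid, c0subject, c1body) values (?, ?, ?)
**
** where the cNN prefixes come from the content-column naming scheme
** implemented in parseSpec() below.
*/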
1876 | |||
1877 | /* Return a dynamically generated statement of the form | ||
1878 | * select <content columns> from %_content where docid = ? | ||
1879 | */ | ||
1880 | static const char *contentSelectStatement(fulltext_vtab *v){ | ||
1881 | StringBuffer sb; | ||
1882 | initStringBuffer(&sb); | ||
1883 | append(&sb, "SELECT "); | ||
1884 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
1885 | append(&sb, " FROM %_content WHERE docid = ?"); | ||
1886 | return stringBufferData(&sb); | ||
1887 | } | ||
1888 | |||
1889 | /* Return a dynamically generated statement of the form | ||
1890 | * update %_content set [col_0] = ?, [col_1] = ?, ... | ||
1891 | * where docid = ? | ||
1892 | */ | ||
1893 | static const char *contentUpdateStatement(fulltext_vtab *v){ | ||
1894 | StringBuffer sb; | ||
1895 | int i; | ||
1896 | |||
1897 | initStringBuffer(&sb); | ||
1898 | append(&sb, "update %_content set "); | ||
1899 | for(i=0; i<v->nColumn; ++i) { | ||
1900 | if( i>0 ){ | ||
1901 | append(&sb, ", "); | ||
1902 | } | ||
1903 | append(&sb, v->azContentColumn[i]); | ||
1904 | append(&sb, " = ?"); | ||
1905 | } | ||
1906 | append(&sb, " where docid = ?"); | ||
1907 | return stringBufferData(&sb); | ||
1908 | } | ||
1909 | |||
1910 | /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. | ||
1911 | ** If the indicated statement has never been prepared, it is prepared | ||
1912 | ** and cached, otherwise the cached version is reset. | ||
1913 | */ | ||
1914 | static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, | ||
1915 | sqlite3_stmt **ppStmt){ | ||
1916 | assert( iStmt<MAX_STMT ); | ||
1917 | if( v->pFulltextStatements[iStmt]==NULL ){ | ||
1918 | const char *zStmt; | ||
1919 | int rc; | ||
1920 | switch( iStmt ){ | ||
1921 | case CONTENT_INSERT_STMT: | ||
1922 | zStmt = contentInsertStatement(v); break; | ||
1923 | case CONTENT_SELECT_STMT: | ||
1924 | zStmt = contentSelectStatement(v); break; | ||
1925 | case CONTENT_UPDATE_STMT: | ||
1926 | zStmt = contentUpdateStatement(v); break; | ||
1927 | default: | ||
1928 | zStmt = fulltext_zStatement[iStmt]; | ||
1929 | } | ||
1930 | rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt], | ||
1931 | zStmt); | ||
1932 | if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt); | ||
1933 | if( rc!=SQLITE_OK ) return rc; | ||
1934 | } else { | ||
1935 | int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); | ||
1936 | if( rc!=SQLITE_OK ) return rc; | ||
1937 | } | ||
1938 | |||
1939 | *ppStmt = v->pFulltextStatements[iStmt]; | ||
1940 | return SQLITE_OK; | ||
1941 | } | ||
1942 | |||
1943 | /* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK; any other | ||
1944 | ** result (including SQLITE_ROW) is passed back unchanged as an error. | ||
1945 | ** Useful for statements like UPDATE, where we expect no results. | ||
1946 | */ | ||
1947 | static int sql_single_step(sqlite3_stmt *s){ | ||
1948 | int rc = sqlite3_step(s); | ||
1949 | return (rc==SQLITE_DONE) ? SQLITE_OK : rc; | ||
1950 | } | ||
1951 | |||
1952 | /* Like sql_get_statement(), but for special replicated LEAF_SELECT | ||
1953 | ** statements. | ||
1954 | */ | ||
1955 | /* TODO(shess) Write version for generic statements and then share | ||
1956 | ** that between the cached-statement functions. | ||
1957 | */ | ||
1958 | static int sql_get_leaf_statement(fulltext_vtab *v, int idx, | ||
1959 | sqlite3_stmt **ppStmt){ | ||
1960 | assert( idx>=0 && idx<MERGE_COUNT ); | ||
1961 | if( v->pLeafSelectStmts[idx]==NULL ){ | ||
1962 | int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx], | ||
1963 | LEAF_SELECT); | ||
1964 | if( rc!=SQLITE_OK ) return rc; | ||
1965 | }else{ | ||
1966 | int rc = sqlite3_reset(v->pLeafSelectStmts[idx]); | ||
1967 | if( rc!=SQLITE_OK ) return rc; | ||
1968 | } | ||
1969 | |||
1970 | *ppStmt = v->pLeafSelectStmts[idx]; | ||
1971 | return SQLITE_OK; | ||
1972 | } | ||
1973 | |||
1974 | /* insert into %_content (docid, ...) values ([docid], [pValues]) | ||
1975 | ** If the docid contains SQL NULL, then a unique docid will be | ||
1976 | ** generated. | ||
1977 | */ | ||
1978 | static int content_insert(fulltext_vtab *v, sqlite3_value *docid, | ||
1979 | sqlite3_value **pValues){ | ||
1980 | sqlite3_stmt *s; | ||
1981 | int i; | ||
1982 | int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); | ||
1983 | if( rc!=SQLITE_OK ) return rc; | ||
1984 | |||
1985 | rc = sqlite3_bind_value(s, 1, docid); | ||
1986 | if( rc!=SQLITE_OK ) return rc; | ||
1987 | |||
1988 | for(i=0; i<v->nColumn; ++i){ | ||
1989 | rc = sqlite3_bind_value(s, 2+i, pValues[i]); | ||
1990 | if( rc!=SQLITE_OK ) return rc; | ||
1991 | } | ||
1992 | |||
1993 | return sql_single_step(s); | ||
1994 | } | ||
1995 | |||
1996 | /* update %_content set col0 = pValues[0], col1 = pValues[1], ... | ||
1997 | * where docid = [iDocid] */ | ||
1998 | static int content_update(fulltext_vtab *v, sqlite3_value **pValues, | ||
1999 | sqlite_int64 iDocid){ | ||
2000 | sqlite3_stmt *s; | ||
2001 | int i; | ||
2002 | int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s); | ||
2003 | if( rc!=SQLITE_OK ) return rc; | ||
2004 | |||
2005 | for(i=0; i<v->nColumn; ++i){ | ||
2006 | rc = sqlite3_bind_value(s, 1+i, pValues[i]); | ||
2007 | if( rc!=SQLITE_OK ) return rc; | ||
2008 | } | ||
2009 | |||
2010 | rc = sqlite3_bind_int64(s, 1+v->nColumn, iDocid); | ||
2011 | if( rc!=SQLITE_OK ) return rc; | ||
2012 | |||
2013 | return sql_single_step(s); | ||
2014 | } | ||
2015 | |||
2016 | static void freeStringArray(int nString, const char **pString){ | ||
2017 | int i; | ||
2018 | |||
2019 | for (i=0 ; i < nString ; ++i) { | ||
2020 | if( pString[i]!=NULL ) free((void *) pString[i]); | ||
2021 | } | ||
2022 | free((void *) pString); | ||
2023 | } | ||
2024 | |||
2025 | /* select * from %_content where docid = [iDocid] | ||
2026 | * The caller must free() the returned array and all strings in it. | ||
2027 | * Null fields will be NULL in the returned array. | ||
2028 | * | ||
2029 | * TODO: Perhaps we should return pointer/length strings here for consistency | ||
2030 | * with other code which uses pointer/length. */ | ||
2031 | static int content_select(fulltext_vtab *v, sqlite_int64 iDocid, | ||
2032 | const char ***pValues){ | ||
2033 | sqlite3_stmt *s; | ||
2034 | const char **values; | ||
2035 | int i; | ||
2036 | int rc; | ||
2037 | |||
2038 | *pValues = NULL; | ||
2039 | |||
2040 | rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); | ||
2041 | if( rc!=SQLITE_OK ) return rc; | ||
2042 | |||
2043 | rc = sqlite3_bind_int64(s, 1, iDocid); | ||
2044 | if( rc!=SQLITE_OK ) return rc; | ||
2045 | |||
2046 | rc = sqlite3_step(s); | ||
2047 | if( rc!=SQLITE_ROW ) return rc; | ||
2048 | |||
2049 | values = (const char **) malloc(v->nColumn * sizeof(const char *)); | ||
2050 | for(i=0; i<v->nColumn; ++i){ | ||
2051 | if( sqlite3_column_type(s, i)==SQLITE_NULL ){ | ||
2052 | values[i] = NULL; | ||
2053 | }else{ | ||
2054 | values[i] = string_dup((char*)sqlite3_column_text(s, i)); | ||
2055 | } | ||
2056 | } | ||
2057 | |||
2058 | /* We expect only one row. We must execute another sqlite3_step() | ||
2059 | * to complete the iteration; otherwise the table will remain locked. */ | ||
2060 | rc = sqlite3_step(s); | ||
2061 | if( rc==SQLITE_DONE ){ | ||
2062 | *pValues = values; | ||
2063 | return SQLITE_OK; | ||
2064 | } | ||
2065 | |||
2066 | freeStringArray(v->nColumn, values); | ||
2067 | return rc; | ||
2068 | } | ||
2069 | |||
2070 | /* delete from %_content where docid = [iDocid ] */ | ||
2071 | static int content_delete(fulltext_vtab *v, sqlite_int64 iDocid){ | ||
2072 | sqlite3_stmt *s; | ||
2073 | int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); | ||
2074 | if( rc!=SQLITE_OK ) return rc; | ||
2075 | |||
2076 | rc = sqlite3_bind_int64(s, 1, iDocid); | ||
2077 | if( rc!=SQLITE_OK ) return rc; | ||
2078 | |||
2079 | return sql_single_step(s); | ||
2080 | } | ||
2081 | |||
2082 | /* insert into %_segments values ([pData]) | ||
2083 | ** Returns the assigned blockid in *piBlockid. | ||
2084 | */ | ||
2085 | static int block_insert(fulltext_vtab *v, const char *pData, int nData, | ||
2086 | sqlite_int64 *piBlockid){ | ||
2087 | sqlite3_stmt *s; | ||
2088 | int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s); | ||
2089 | if( rc!=SQLITE_OK ) return rc; | ||
2090 | |||
2091 | rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC); | ||
2092 | if( rc!=SQLITE_OK ) return rc; | ||
2093 | |||
2094 | rc = sqlite3_step(s); | ||
2095 | if( rc==SQLITE_ROW ) return SQLITE_ERROR; | ||
2096 | if( rc!=SQLITE_DONE ) return rc; | ||
2097 | |||
2098 | /* blockid column is an alias for rowid. */ | ||
2099 | *piBlockid = sqlite3_last_insert_rowid(v->db); | ||
2100 | return SQLITE_OK; | ||
2101 | } | ||
2102 | |||
2103 | /* delete from %_segments | ||
2104 | ** where blockid between [iStartBlockid] and [iEndBlockid] | ||
2105 | ** | ||
2106 | ** Deletes the given range of blocks, inclusive. Used to delete the | ||
2107 | ** blocks which form a segment. | ||
2108 | */ | ||
2109 | static int block_delete(fulltext_vtab *v, | ||
2110 | sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){ | ||
2111 | sqlite3_stmt *s; | ||
2112 | int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s); | ||
2113 | if( rc!=SQLITE_OK ) return rc; | ||
2114 | |||
2115 | rc = sqlite3_bind_int64(s, 1, iStartBlockid); | ||
2116 | if( rc!=SQLITE_OK ) return rc; | ||
2117 | |||
2118 | rc = sqlite3_bind_int64(s, 2, iEndBlockid); | ||
2119 | if( rc!=SQLITE_OK ) return rc; | ||
2120 | |||
2121 | return sql_single_step(s); | ||
2122 | } | ||
2123 | |||
2124 | /* Returns SQLITE_ROW with *pidx set to the maximum segment idx found | ||
2125 | ** at iLevel. Returns SQLITE_DONE if there are no segments at | ||
2126 | ** iLevel. Otherwise returns an error. | ||
2127 | */ | ||
2128 | static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){ | ||
2129 | sqlite3_stmt *s; | ||
2130 | int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s); | ||
2131 | if( rc!=SQLITE_OK ) return rc; | ||
2132 | |||
2133 | rc = sqlite3_bind_int(s, 1, iLevel); | ||
2134 | if( rc!=SQLITE_OK ) return rc; | ||
2135 | |||
2136 | rc = sqlite3_step(s); | ||
2137 | /* Should always get at least one row due to how max() works. */ | ||
2138 | if( rc==SQLITE_DONE ) return SQLITE_DONE; | ||
2139 | if( rc!=SQLITE_ROW ) return rc; | ||
2140 | |||
2141 | /* NULL means that there were no inputs to max(). */ | ||
2142 | if( SQLITE_NULL==sqlite3_column_type(s, 0) ){ | ||
2143 | rc = sqlite3_step(s); | ||
2144 | if( rc==SQLITE_ROW ) return SQLITE_ERROR; | ||
2145 | return rc; | ||
2146 | } | ||
2147 | |||
2148 | *pidx = sqlite3_column_int(s, 0); | ||
2149 | |||
2150 | /* We expect only one row. We must execute another sqlite3_step() | ||
2151 | * to complete the iteration; otherwise the table will remain locked. */ | ||
2152 | rc = sqlite3_step(s); | ||
2153 | if( rc==SQLITE_ROW ) return SQLITE_ERROR; | ||
2154 | if( rc!=SQLITE_DONE ) return rc; | ||
2155 | return SQLITE_ROW; | ||
2156 | } | ||
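/* Editor's sketch of the expected caller pattern (hedged; guarded by
** #if 0). The three-way return value maps naturally onto choosing the
** next segment index at a level:
*/
#if 0
  int idx;
  int rc = segdir_max_index(v, iLevel, &idx);
  if( rc==SQLITE_ROW ){ idx++; }          /* append after existing segments */
  else if( rc==SQLITE_DONE ){ idx = 0; }  /* level currently empty */
  else return rc;                         /* genuine error */
#endif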
2157 | |||
2158 | /* insert into %_segdir values ( | ||
2159 | ** [iLevel], [idx], | ||
2160 | ** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid], | ||
2161 | ** [pRootData] | ||
2162 | ** ) | ||
2163 | */ | ||
2164 | static int segdir_set(fulltext_vtab *v, int iLevel, int idx, | ||
2165 | sqlite_int64 iStartBlockid, | ||
2166 | sqlite_int64 iLeavesEndBlockid, | ||
2167 | sqlite_int64 iEndBlockid, | ||
2168 | const char *pRootData, int nRootData){ | ||
2169 | sqlite3_stmt *s; | ||
2170 | int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s); | ||
2171 | if( rc!=SQLITE_OK ) return rc; | ||
2172 | |||
2173 | rc = sqlite3_bind_int(s, 1, iLevel); | ||
2174 | if( rc!=SQLITE_OK ) return rc; | ||
2175 | |||
2176 | rc = sqlite3_bind_int(s, 2, idx); | ||
2177 | if( rc!=SQLITE_OK ) return rc; | ||
2178 | |||
2179 | rc = sqlite3_bind_int64(s, 3, iStartBlockid); | ||
2180 | if( rc!=SQLITE_OK ) return rc; | ||
2181 | |||
2182 | rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid); | ||
2183 | if( rc!=SQLITE_OK ) return rc; | ||
2184 | |||
2185 | rc = sqlite3_bind_int64(s, 5, iEndBlockid); | ||
2186 | if( rc!=SQLITE_OK ) return rc; | ||
2187 | |||
2188 | rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC); | ||
2189 | if( rc!=SQLITE_OK ) return rc; | ||
2190 | |||
2191 | return sql_single_step(s); | ||
2192 | } | ||
2193 | |||
2194 | /* Queries %_segdir for the block span of the segments in level | ||
2195 | ** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel, | ||
2196 | ** SQLITE_ROW if there are blocks, else an error. | ||
2197 | */ | ||
2198 | static int segdir_span(fulltext_vtab *v, int iLevel, | ||
2199 | sqlite_int64 *piStartBlockid, | ||
2200 | sqlite_int64 *piEndBlockid){ | ||
2201 | sqlite3_stmt *s; | ||
2202 | int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s); | ||
2203 | if( rc!=SQLITE_OK ) return rc; | ||
2204 | |||
2205 | rc = sqlite3_bind_int(s, 1, iLevel); | ||
2206 | if( rc!=SQLITE_OK ) return rc; | ||
2207 | |||
2208 | rc = sqlite3_step(s); | ||
2209 | if( rc==SQLITE_DONE ) return SQLITE_DONE; /* Should never happen */ | ||
2210 | if( rc!=SQLITE_ROW ) return rc; | ||
2211 | |||
2212 | /* This happens if all segments at this level are entirely inline. */ | ||
2213 | if( SQLITE_NULL==sqlite3_column_type(s, 0) ){ | ||
2214 | /* We expect only one row. We must execute another sqlite3_step() | ||
2215 | * to complete the iteration; otherwise the table will remain locked. */ | ||
2216 | int rc2 = sqlite3_step(s); | ||
2217 | if( rc2==SQLITE_ROW ) return SQLITE_ERROR; | ||
2218 | return rc2; | ||
2219 | } | ||
2220 | |||
2221 | *piStartBlockid = sqlite3_column_int64(s, 0); | ||
2222 | *piEndBlockid = sqlite3_column_int64(s, 1); | ||
2223 | |||
2224 | /* We expect only one row. We must execute another sqlite3_step() | ||
2225 | * to complete the iteration; otherwise the table will remain locked. */ | ||
2226 | rc = sqlite3_step(s); | ||
2227 | if( rc==SQLITE_ROW ) return SQLITE_ERROR; | ||
2228 | if( rc!=SQLITE_DONE ) return rc; | ||
2229 | return SQLITE_ROW; | ||
2230 | } | ||
2231 | |||
2232 | /* Delete the segment blocks and segment directory records for all | ||
2233 | ** segments at iLevel. | ||
2234 | */ | ||
2235 | static int segdir_delete(fulltext_vtab *v, int iLevel){ | ||
2236 | sqlite3_stmt *s; | ||
2237 | sqlite_int64 iStartBlockid, iEndBlockid; | ||
2238 | int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid); | ||
2239 | if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; | ||
2240 | |||
2241 | if( rc==SQLITE_ROW ){ | ||
2242 | rc = block_delete(v, iStartBlockid, iEndBlockid); | ||
2243 | if( rc!=SQLITE_OK ) return rc; | ||
2244 | } | ||
2245 | |||
2246 | /* Delete the segment directory itself. */ | ||
2247 | rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s); | ||
2248 | if( rc!=SQLITE_OK ) return rc; | ||
2249 | |||
2250 | rc = sqlite3_bind_int64(s, 1, iLevel); | ||
2251 | if( rc!=SQLITE_OK ) return rc; | ||
2252 | |||
2253 | return sql_single_step(s); | ||
2254 | } | ||
2255 | |||
2256 | /* TODO(shess) clearPendingTerms() is far down the file because | ||
2257 | ** writeZeroSegment() is far down the file because LeafWriter is far | ||
2258 | ** down the file. Consider refactoring the code to move the non-vtab | ||
2259 | ** code above the vtab code so that we don't need this forward | ||
2260 | ** reference. | ||
2261 | */ | ||
2262 | static int clearPendingTerms(fulltext_vtab *v); | ||
2263 | |||
2264 | /* | ||
2265 | ** Free the memory used to contain a fulltext_vtab structure. | ||
2266 | */ | ||
2267 | static void fulltext_vtab_destroy(fulltext_vtab *v){ | ||
2268 | int iStmt, i; | ||
2269 | |||
2270 | TRACE(("FTS3 Destroy %p\n", v)); | ||
2271 | for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ | ||
2272 | if( v->pFulltextStatements[iStmt]!=NULL ){ | ||
2273 | sqlite3_finalize(v->pFulltextStatements[iStmt]); | ||
2274 | v->pFulltextStatements[iStmt] = NULL; | ||
2275 | } | ||
2276 | } | ||
2277 | |||
2278 | for( i=0; i<MERGE_COUNT; i++ ){ | ||
2279 | if( v->pLeafSelectStmts[i]!=NULL ){ | ||
2280 | sqlite3_finalize(v->pLeafSelectStmts[i]); | ||
2281 | v->pLeafSelectStmts[i] = NULL; | ||
2282 | } | ||
2283 | } | ||
2284 | |||
2285 | if( v->pTokenizer!=NULL ){ | ||
2286 | v->pTokenizer->pModule->xDestroy(v->pTokenizer); | ||
2287 | v->pTokenizer = NULL; | ||
2288 | } | ||
2289 | |||
2290 | clearPendingTerms(v); | ||
2291 | |||
2292 | free(v->azColumn); | ||
2293 | for(i = 0; i < v->nColumn; ++i) { | ||
2294 | sqlite3_free(v->azContentColumn[i]); | ||
2295 | } | ||
2296 | free(v->azContentColumn); | ||
2297 | free(v); | ||
2298 | } | ||
2299 | |||
2300 | /* | ||
2301 | ** Token types for parsing the arguments to xConnect or xCreate. | ||
2302 | */ | ||
2303 | #define TOKEN_EOF 0 /* End of file */ | ||
2304 | #define TOKEN_SPACE 1 /* Any kind of whitespace */ | ||
2305 | #define TOKEN_ID 2 /* An identifier */ | ||
2306 | #define TOKEN_STRING 3 /* A string literal */ | ||
2307 | #define TOKEN_PUNCT 4 /* A single punctuation character */ | ||
2308 | |||
2309 | /* | ||
2310 | ** If X is a character that can be used in an identifier then | ||
2311 | ** IdChar(X) will be true. Otherwise it is false. | ||
2312 | ** | ||
2313 | ** Any character with the high-order bit set (i.e. outside the 7-bit | ||
2314 | ** ASCII range) is allowed in an identifier. For 7-bit characters, | ||
2315 | ** isIdChar[X-0x20] must be 1 (see the IdChar() macro below). | ||
2316 | ** | ||
2317 | ** Ticket #1066. The SQL standard does not allow '$' in the | ||
2318 | ** middle of identifiers. But many SQL implementations do. | ||
2319 | ** SQLite will allow '$' in identifiers for compatibility. | ||
2320 | ** But the feature is undocumented. | ||
2321 | */ | ||
2322 | static const char isIdChar[] = { | ||
2323 | /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ | ||
2324 | 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */ | ||
2325 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ | ||
2326 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ | ||
2327 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ | ||
2328 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ | ||
2329 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ | ||
2330 | }; | ||
2331 | #define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20])) | ||
2332 | |||
2333 | |||
2334 | /* | ||
2335 | ** Return the length of the token that begins at z[0]. | ||
2336 | ** Store the token type in *tokenType before returning. | ||
2337 | */ | ||
2338 | static int getToken(const char *z, int *tokenType){ | ||
2339 | int i, c; | ||
2340 | switch( *z ){ | ||
2341 | case 0: { | ||
2342 | *tokenType = TOKEN_EOF; | ||
2343 | return 0; | ||
2344 | } | ||
2345 | case ' ': case '\t': case '\n': case '\f': case '\r': { | ||
2346 | for(i=1; safe_isspace(z[i]); i++){} | ||
2347 | *tokenType = TOKEN_SPACE; | ||
2348 | return i; | ||
2349 | } | ||
2350 | case '`': | ||
2351 | case '\'': | ||
2352 | case '"': { | ||
2353 | int delim = z[0]; | ||
2354 | for(i=1; (c=z[i])!=0; i++){ | ||
2355 | if( c==delim ){ | ||
2356 | if( z[i+1]==delim ){ | ||
2357 | i++; | ||
2358 | }else{ | ||
2359 | break; | ||
2360 | } | ||
2361 | } | ||
2362 | } | ||
2363 | *tokenType = TOKEN_STRING; | ||
2364 | return i + (c!=0); | ||
2365 | } | ||
2366 | case '[': { | ||
2367 | for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){} | ||
2368 | *tokenType = TOKEN_ID; | ||
2369 | return i; | ||
2370 | } | ||
2371 | default: { | ||
2372 | if( !IdChar(*z) ){ | ||
2373 | break; | ||
2374 | } | ||
2375 | for(i=1; IdChar(z[i]); i++){} | ||
2376 | *tokenType = TOKEN_ID; | ||
2377 | return i; | ||
2378 | } | ||
2379 | } | ||
2380 | *tokenType = TOKEN_PUNCT; | ||
2381 | return 1; | ||
2382 | } | ||
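/* Editor's example: scanning the argument "subject, body" with
** getToken() yields TOKEN_ID "subject" (length 7), TOKEN_PUNCT ","
** (length 1), TOKEN_SPACE " " (length 1), TOKEN_ID "body" (length 4),
** and finally TOKEN_EOF (length 0).
*/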
2383 | |||
2384 | /* | ||
2385 | ** A token extracted from a string is an instance of the following | ||
2386 | ** structure. | ||
2387 | */ | ||
2388 | typedef struct Token { | ||
2389 | const char *z; /* Pointer to token text. Not '\000' terminated */ | ||
2390 | short int n; /* Length of the token text in bytes. */ | ||
2391 | } Token; | ||
2392 | |||
2393 | /* | ||
2394 | ** Given an input string (which is really one of the argv[] parameters | ||
2395 | ** passed into xConnect or xCreate) split the string up into tokens. | ||
2396 | ** Return an array of pointers to '\000' terminated strings, one string | ||
2397 | ** for each non-whitespace token. | ||
2398 | ** | ||
2399 | ** The returned array is terminated by a single NULL pointer. | ||
2400 | ** | ||
2401 | ** Space to hold the returned array is obtained from a single | ||
2402 | ** malloc and should be freed by passing the return value to free(). | ||
2403 | ** The individual strings within the token list are all a part of | ||
2404 | ** the single memory allocation and will all be freed at once. | ||
2405 | */ | ||
2406 | static char **tokenizeString(const char *z, int *pnToken){ | ||
2407 | int nToken = 0; | ||
2408 | Token *aToken = malloc( (strlen(z)+1) * sizeof(aToken[0]) ); /* +1 for the zero-length EOF token */ | ||
2409 | int n = 1; | ||
2410 | int e, i; | ||
2411 | int totalSize = 0; | ||
2412 | char **azToken; | ||
2413 | char *zCopy; | ||
2414 | while( n>0 ){ | ||
2415 | n = getToken(z, &e); | ||
2416 | if( e!=TOKEN_SPACE ){ | ||
2417 | aToken[nToken].z = z; | ||
2418 | aToken[nToken].n = n; | ||
2419 | nToken++; | ||
2420 | totalSize += n+1; | ||
2421 | } | ||
2422 | z += n; | ||
2423 | } | ||
2424 | azToken = (char**)malloc( nToken*sizeof(char*) + totalSize ); | ||
2425 | zCopy = (char*)&azToken[nToken]; | ||
2426 | nToken--; | ||
2427 | for(i=0; i<nToken; i++){ | ||
2428 | azToken[i] = zCopy; | ||
2429 | n = aToken[i].n; | ||
2430 | memcpy(zCopy, aToken[i].z, n); | ||
2431 | zCopy[n] = 0; | ||
2432 | zCopy += n+1; | ||
2433 | } | ||
2434 | azToken[nToken] = 0; | ||
2435 | free(aToken); | ||
2436 | *pnToken = nToken; | ||
2437 | return azToken; | ||
2438 | } | ||
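/* Editor's example: tokenizeString("tokenize simple", &n) returns
** {"tokenize", "simple", NULL} with n set to 2; the trailing EOF token
** collected in the loop above is dropped by the nToken-- before the
** copy loop.
*/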
2439 | |||
2440 | /* | ||
2441 | ** Convert an SQL-style quoted string into a normal string by removing | ||
2442 | ** the quote characters. The conversion is done in-place. If the | ||
2443 | ** input does not begin with a quote character, then this routine | ||
2444 | ** is a no-op. | ||
2445 | ** | ||
2446 | ** Examples: | ||
2447 | ** | ||
2448 | ** "abc" becomes abc | ||
2449 | ** 'xyz' becomes xyz | ||
2450 | ** [pqr] becomes pqr | ||
2451 | ** `mno` becomes mno | ||
2452 | */ | ||
2453 | static void dequoteString(char *z){ | ||
2454 | int quote; | ||
2455 | int i, j; | ||
2456 | if( z==0 ) return; | ||
2457 | quote = z[0]; | ||
2458 | switch( quote ){ | ||
2459 | case '\'': break; | ||
2460 | case '"': break; | ||
2461 | case '`': break; /* For MySQL compatibility */ | ||
2462 | case '[': quote = ']'; break; /* For MS SqlServer compatibility */ | ||
2463 | default: return; | ||
2464 | } | ||
2465 | for(i=1, j=0; z[i]; i++){ | ||
2466 | if( z[i]==quote ){ | ||
2467 | if( z[i+1]==quote ){ | ||
2468 | z[j++] = quote; | ||
2469 | i++; | ||
2470 | }else{ | ||
2471 | z[j++] = 0; | ||
2472 | break; | ||
2473 | } | ||
2474 | }else{ | ||
2475 | z[j++] = z[i]; | ||
2476 | } | ||
2477 | } | ||
2478 | } | ||
2479 | |||
2480 | /* | ||
2481 | ** The input azIn is a NULL-terminated list of tokens. Remove the first | ||
2482 | ** token and all punctuation tokens. Remove the quotes from | ||
2483 | ** around string literal tokens. | ||
2484 | ** | ||
2485 | ** Example: | ||
2486 | ** | ||
2487 | ** input: tokenize chinese ( 'simplified' , 'mixed' ) | ||
2488 | ** output: chinese simplified mixed | ||
2489 | ** | ||
2490 | ** Another example: | ||
2491 | ** | ||
2492 | ** input: delimiters ( '[' , ']' , '...' ) | ||
2493 | ** output: [ ] ... | ||
2494 | */ | ||
2495 | static void tokenListToIdList(char **azIn){ | ||
2496 | int i, j; | ||
2497 | if( azIn ){ | ||
2498 | for(i=0, j=-1; azIn[i]; i++){ | ||
2499 | if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){ | ||
2500 | dequoteString(azIn[i]); | ||
2501 | if( j>=0 ){ | ||
2502 | azIn[j] = azIn[i]; | ||
2503 | } | ||
2504 | j++; | ||
2505 | } | ||
2506 | } | ||
2507 | azIn[j] = 0; | ||
2508 | } | ||
2509 | } | ||
2510 | |||
2511 | |||
2512 | /* | ||
2513 | ** Find the first alphanumeric token in the string zIn. Null-terminate | ||
2514 | ** this token. Remove any quotation marks. And return a pointer to | ||
2515 | ** the result. | ||
2516 | */ | ||
2517 | static char *firstToken(char *zIn, char **pzTail){ | ||
2518 | int n, ttype; | ||
2519 | while(1){ | ||
2520 | n = getToken(zIn, &ttype); | ||
2521 | if( ttype==TOKEN_SPACE ){ | ||
2522 | zIn += n; | ||
2523 | }else if( ttype==TOKEN_EOF ){ | ||
2524 | *pzTail = zIn; | ||
2525 | return 0; | ||
2526 | }else{ | ||
2527 | zIn[n] = 0; | ||
2528 | *pzTail = &zIn[1]; | ||
2529 | dequoteString(zIn); | ||
2530 | return zIn; | ||
2531 | } | ||
2532 | } | ||
2533 | /*NOTREACHED*/ | ||
2534 | } | ||
2535 | |||
2536 | /* Return true if... | ||
2537 | ** | ||
2538 | ** * s begins with the string t, ignoring case | ||
2539 | ** * s is longer than t | ||
2540 | ** * The first character of s beyond t is neither alphanumeric nor '_' | ||
2541 | ** | ||
2542 | ** Ignore leading space in *s. | ||
2543 | ** | ||
2544 | ** To put it another way, return true if the first token of | ||
2545 | ** s[] is t[]. | ||
2546 | */ | ||
2547 | static int startsWith(const char *s, const char *t){ | ||
2548 | while( safe_isspace(*s) ){ s++; } | ||
2549 | while( *t ){ | ||
2550 | if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0; | ||
2551 | } | ||
2552 | return *s!='_' && !safe_isalnum(*s); | ||
2553 | } | ||
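/* Editor's examples: startsWith("  tokenize icu", "tokenize") is true
** (leading space is skipped, and the next character ' ' is neither
** alphanumeric nor '_'), while startsWith("tokenizer(x)", "tokenize")
** is false because the 'r' continues the token.
*/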
2554 | |||
2555 | /* | ||
2556 | ** An instance of this structure defines the "spec" of a | ||
2557 | ** full text index. This structure is populated by parseSpec | ||
2558 | ** and used by fulltextConnect and fulltextCreate. | ||
2559 | */ | ||
2560 | typedef struct TableSpec { | ||
2561 | const char *zDb; /* Logical database name */ | ||
2562 | const char *zName; /* Name of the full-text index */ | ||
2563 | int nColumn; /* Number of columns to be indexed */ | ||
2564 | char **azColumn; /* Original names of columns to be indexed */ | ||
2565 | char **azContentColumn; /* Column names for %_content */ | ||
2566 | char **azTokenizer; /* Name of tokenizer and its arguments */ | ||
2567 | } TableSpec; | ||
2568 | |||
2569 | /* | ||
2570 | ** Reclaim all of the memory used by a TableSpec | ||
2571 | */ | ||
2572 | static void clearTableSpec(TableSpec *p) { | ||
2573 | free(p->azColumn); | ||
2574 | free(p->azContentColumn); | ||
2575 | free(p->azTokenizer); | ||
2576 | } | ||
2577 | |||
2578 | /* Parse a CREATE VIRTUAL TABLE statement, which looks like this: | ||
2579 | * | ||
2580 | * CREATE VIRTUAL TABLE email | ||
2581 | * USING fts3(subject, body, tokenize mytokenizer(myarg)) | ||
2582 | * | ||
2583 | * We return parsed information in a TableSpec structure. | ||
2584 | * | ||
2585 | */ | ||
2586 | static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv, | ||
2587 | char**pzErr){ | ||
2588 | int i, n; | ||
2589 | char *z, *zDummy; | ||
2590 | char **azArg; | ||
2591 | const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */ | ||
2592 | |||
2593 | assert( argc>=3 ); | ||
2594 | /* Current interface: | ||
2595 | ** argv[0] - module name | ||
2596 | ** argv[1] - database name | ||
2597 | ** argv[2] - table name | ||
2598 | ** argv[3..] - columns, optionally followed by tokenizer specification | ||
2599 | ** and snippet delimiters specification. | ||
2600 | */ | ||
2601 | |||
2602 | /* Make a copy of the complete argv[][] array in a single allocation. | ||
2603 | ** The argv[][] array is read-only and transient. We can write to the | ||
2604 | ** copy in order to modify things and the copy is persistent. | ||
2605 | */ | ||
2606 | CLEAR(pSpec); | ||
2607 | for(i=n=0; i<argc; i++){ | ||
2608 | n += strlen(argv[i]) + 1; | ||
2609 | } | ||
2610 | azArg = malloc( sizeof(char*)*argc + n ); | ||
2611 | if( azArg==0 ){ | ||
2612 | return SQLITE_NOMEM; | ||
2613 | } | ||
2614 | z = (char*)&azArg[argc]; | ||
2615 | for(i=0; i<argc; i++){ | ||
2616 | azArg[i] = z; | ||
2617 | strcpy(z, argv[i]); | ||
2618 | z += strlen(z)+1; | ||
2619 | } | ||
2620 | |||
2621 | /* Identify the column names and the tokenizer and delimiter arguments | ||
2622 | ** in the argv[][] array. | ||
2623 | */ | ||
2624 | pSpec->zDb = azArg[1]; | ||
2625 | pSpec->zName = azArg[2]; | ||
2626 | pSpec->nColumn = 0; | ||
2627 | pSpec->azColumn = azArg; | ||
2628 | zTokenizer = "tokenize simple"; | ||
2629 | for(i=3; i<argc; ++i){ | ||
2630 | if( startsWith(azArg[i],"tokenize") ){ | ||
2631 | zTokenizer = azArg[i]; | ||
2632 | }else{ | ||
2633 | z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy); | ||
2634 | pSpec->nColumn++; | ||
2635 | } | ||
2636 | } | ||
2637 | if( pSpec->nColumn==0 ){ | ||
2638 | azArg[0] = "content"; | ||
2639 | pSpec->nColumn = 1; | ||
2640 | } | ||
2641 | |||
2642 | /* | ||
2643 | ** Construct the list of content column names. | ||
2644 | ** | ||
2645 | ** Each content column name will be of the form cNNAAAA | ||
2646 | ** where NN is the column number and AAAA is the sanitized | ||
2647 | ** column name. "sanitized" means that special characters are | ||
2648 | ** converted to "_". The cNN prefix guarantees that all column | ||
2649 | ** names are unique. | ||
2650 | ** | ||
2651 | ** The AAAA suffix is not strictly necessary. It is included | ||
2652 | ** for the convenience of people who might examine the generated | ||
2653 | ** %_content table and wonder what the columns are used for. | ||
2654 | */ | ||
2655 | pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) ); | ||
2656 | if( pSpec->azContentColumn==0 ){ | ||
2657 | clearTableSpec(pSpec); | ||
2658 | return SQLITE_NOMEM; | ||
2659 | } | ||
2660 | for(i=0; i<pSpec->nColumn; i++){ | ||
2661 | char *p; | ||
2662 | pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); | ||
2663 | for (p = pSpec->azContentColumn[i]; *p ; ++p) { | ||
2664 | if( !safe_isalnum(*p) ) *p = '_'; | ||
2665 | } | ||
2666 | } | ||
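/* Editor's example: for columns (subject, "user name") the loop above
** produces c0subject and c1user_name -- the space is sanitized to '_'.
*/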
2667 | |||
2668 | /* | ||
2669 | ** Parse the tokenizer specification string. | ||
2670 | */ | ||
2671 | pSpec->azTokenizer = tokenizeString(zTokenizer, &n); | ||
2672 | tokenListToIdList(pSpec->azTokenizer); | ||
2673 | |||
2674 | return SQLITE_OK; | ||
2675 | } | ||
2676 | |||
2677 | /* | ||
2678 | ** Generate a CREATE TABLE statement that describes the schema of | ||
2679 | ** the virtual table. Return a pointer to this schema string. | ||
2680 | ** | ||
2681 | ** Space is obtained from sqlite3_mprintf() and should be freed | ||
2682 | ** using sqlite3_free(). | ||
2683 | */ | ||
2684 | static char *fulltextSchema( | ||
2685 | int nColumn, /* Number of columns */ | ||
2686 | const char *const* azColumn, /* List of columns */ | ||
2687 | const char *zTableName /* Name of the table */ | ||
2688 | ){ | ||
2689 | int i; | ||
2690 | char *zSchema, *zNext; | ||
2691 | const char *zSep = "("; | ||
2692 | zSchema = sqlite3_mprintf("CREATE TABLE x"); | ||
2693 | for(i=0; i<nColumn; i++){ | ||
2694 | zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]); | ||
2695 | sqlite3_free(zSchema); | ||
2696 | zSchema = zNext; | ||
2697 | zSep = ","; | ||
2698 | } | ||
2699 | zNext = sqlite3_mprintf("%s,%Q HIDDEN", zSchema, zTableName); | ||
2700 | sqlite3_free(zSchema); | ||
2701 | zSchema = zNext; | ||
2702 | zNext = sqlite3_mprintf("%s,docid HIDDEN)", zSchema); | ||
2703 | sqlite3_free(zSchema); | ||
2704 | return zNext; | ||
2705 | } | ||
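/* Editor's example: fulltextSchema(2, {"subject","body"}, "email")
** returns (with %Q quoting):
**
**   CREATE TABLE x('subject','body','email' HIDDEN,docid HIDDEN)
**
** The table-name column and docid are HIDDEN, so they do not appear
** in SELECT * results but remain addressable in MATCH expressions and
** docid lookups.
*/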
2706 | |||
2707 | /* | ||
2708 | ** Build a new sqlite3_vtab structure that will describe the | ||
2709 | ** fulltext index defined by spec. | ||
2710 | */ | ||
2711 | static int constructVtab( | ||
2712 | sqlite3 *db, /* The SQLite database connection */ | ||
2713 | fts3Hash *pHash, /* Hash table containing tokenizers */ | ||
2714 | TableSpec *spec, /* Parsed spec information from parseSpec() */ | ||
2715 | sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */ | ||
2716 | char **pzErr /* Write any error message here */ | ||
2717 | ){ | ||
2718 | int rc; | ||
2719 | int n; | ||
2720 | fulltext_vtab *v = 0; | ||
2721 | const sqlite3_tokenizer_module *m = NULL; | ||
2722 | char *schema; | ||
2723 | |||
2724 | char const *zTok; /* Name of tokenizer to use for this fts table */ | ||
2725 | int nTok; /* Length of zTok, including nul terminator */ | ||
2726 | |||
2727 | v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); | ||
2728 | if( v==0 ) return SQLITE_NOMEM; | ||
2729 | CLEAR(v); | ||
2730 | /* sqlite will initialize v->base */ | ||
2731 | v->db = db; | ||
2732 | v->zDb = spec->zDb; /* Freed when azColumn is freed */ | ||
2733 | v->zName = spec->zName; /* Freed when azColumn is freed */ | ||
2734 | v->nColumn = spec->nColumn; | ||
2735 | v->azContentColumn = spec->azContentColumn; | ||
2736 | spec->azContentColumn = 0; | ||
2737 | v->azColumn = spec->azColumn; | ||
2738 | spec->azColumn = 0; | ||
2739 | |||
2740 | if( spec->azTokenizer==0 ){ | ||
2741 | rc = SQLITE_NOMEM; goto err; /* free v rather than leaking it */ | ||
2742 | } | ||
2743 | |||
2744 | zTok = spec->azTokenizer[0]; | ||
2745 | if( !zTok ){ | ||
2746 | zTok = "simple"; | ||
2747 | } | ||
2748 | nTok = strlen(zTok)+1; | ||
2749 | |||
2750 | m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zTok, nTok); | ||
2751 | if( !m ){ | ||
2752 | *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]); | ||
2753 | rc = SQLITE_ERROR; | ||
2754 | goto err; | ||
2755 | } | ||
2756 | |||
2757 | for(n=0; spec->azTokenizer[n]; n++){} | ||
2758 | if( n ){ | ||
2759 | rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1], | ||
2760 | &v->pTokenizer); | ||
2761 | }else{ | ||
2762 | rc = m->xCreate(0, 0, &v->pTokenizer); | ||
2763 | } | ||
2764 | if( rc!=SQLITE_OK ) goto err; | ||
2765 | v->pTokenizer->pModule = m; | ||
2766 | |||
2767 | /* TODO: verify the existence of backing tables foo_content, foo_term */ | ||
2768 | |||
2769 | schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn, | ||
2770 | spec->zName); | ||
2771 | rc = sqlite3_declare_vtab(db, schema); | ||
2772 | sqlite3_free(schema); | ||
2773 | if( rc!=SQLITE_OK ) goto err; | ||
2774 | |||
2775 | memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); | ||
2776 | |||
2777 | /* Indicate that the buffer is not live. */ | ||
2778 | v->nPendingData = -1; | ||
2779 | |||
2780 | *ppVTab = &v->base; | ||
2781 | TRACE(("FTS3 Connect %p\n", v)); | ||
2782 | |||
2783 | return rc; | ||
2784 | |||
2785 | err: | ||
2786 | fulltext_vtab_destroy(v); | ||
2787 | return rc; | ||
2788 | } | ||
2789 | |||
2790 | static int fulltextConnect( | ||
2791 | sqlite3 *db, | ||
2792 | void *pAux, | ||
2793 | int argc, const char *const*argv, | ||
2794 | sqlite3_vtab **ppVTab, | ||
2795 | char **pzErr | ||
2796 | ){ | ||
2797 | TableSpec spec; | ||
2798 | int rc = parseSpec(&spec, argc, argv, pzErr); | ||
2799 | if( rc!=SQLITE_OK ) return rc; | ||
2800 | |||
2801 | rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr); | ||
2802 | clearTableSpec(&spec); | ||
2803 | return rc; | ||
2804 | } | ||
2805 | |||
2806 | /* The %_content table holds the text of each document, with | ||
2807 | ** the docid column exposed as the SQLite rowid for the table. | ||
2808 | */ | ||
2809 | /* TODO(shess) This comment needs elaboration to match the updated | ||
2810 | ** code. Work it into the top-of-file comment at that time. | ||
2811 | */ | ||
2812 | static int fulltextCreate(sqlite3 *db, void *pAux, | ||
2813 | int argc, const char * const *argv, | ||
2814 | sqlite3_vtab **ppVTab, char **pzErr){ | ||
2815 | int rc; | ||
2816 | TableSpec spec; | ||
2817 | StringBuffer schema; | ||
2818 | TRACE(("FTS3 Create\n")); | ||
2819 | |||
2820 | rc = parseSpec(&spec, argc, argv, pzErr); | ||
2821 | if( rc!=SQLITE_OK ) return rc; | ||
2822 | |||
2823 | initStringBuffer(&schema); | ||
2824 | append(&schema, "CREATE TABLE %_content("); | ||
2825 | append(&schema, " docid INTEGER PRIMARY KEY,"); | ||
2826 | appendList(&schema, spec.nColumn, spec.azContentColumn); | ||
2827 | append(&schema, ")"); | ||
2828 | rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema)); | ||
2829 | stringBufferDestroy(&schema); | ||
2830 | if( rc!=SQLITE_OK ) goto out; | ||
2831 | |||
2832 | rc = sql_exec(db, spec.zDb, spec.zName, | ||
2833 | "create table %_segments(" | ||
2834 | " blockid INTEGER PRIMARY KEY," | ||
2835 | " block blob" | ||
2836 | ");" | ||
2837 | ); | ||
2838 | if( rc!=SQLITE_OK ) goto out; | ||
2839 | |||
2840 | rc = sql_exec(db, spec.zDb, spec.zName, | ||
2841 | "create table %_segdir(" | ||
2842 | " level integer," | ||
2843 | " idx integer," | ||
2844 | " start_block integer," | ||
2845 | " leaves_end_block integer," | ||
2846 | " end_block integer," | ||
2847 | " root blob," | ||
2848 | " primary key(level, idx)" | ||
2849 | ");"); | ||
2850 | if( rc!=SQLITE_OK ) goto out; | ||
2851 | |||
2852 | rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr); | ||
2853 | |||
2854 | out: | ||
2855 | clearTableSpec(&spec); | ||
2856 | return rc; | ||
2857 | } | ||
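| /* Example (illustrative sketch): sql_exec() expands each '%' to the | ||
| ** table name, so CREATE VIRTUAL TABLE mail USING fts3(body) should | ||
| ** produce backing tables roughly like (content columns are named | ||
| ** c0<name>, c1<name>, ...): | ||
| ** | ||
| **   CREATE TABLE mail_content(docid INTEGER PRIMARY KEY, c0body); | ||
| **   CREATE TABLE mail_segments(blockid INTEGER PRIMARY KEY, block blob); | ||
| **   CREATE TABLE mail_segdir(level integer, idx integer, | ||
| **     start_block integer, leaves_end_block integer, end_block integer, | ||
| **     root blob, primary key(level, idx)); | ||
| */ | ||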
2858 | |||
2859 | /* Decide how to handle an SQL query. */ | ||
2860 | static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ | ||
2861 | fulltext_vtab *v = (fulltext_vtab *)pVTab; | ||
2862 | int i; | ||
2863 | TRACE(("FTS3 BestIndex\n")); | ||
2864 | |||
2865 | for(i=0; i<pInfo->nConstraint; ++i){ | ||
2866 | const struct sqlite3_index_constraint *pConstraint; | ||
2867 | pConstraint = &pInfo->aConstraint[i]; | ||
2868 | if( pConstraint->usable ) { | ||
2869 | if( (pConstraint->iColumn==-1 || pConstraint->iColumn==v->nColumn+1) && | ||
2870 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){ | ||
2871 | pInfo->idxNum = QUERY_DOCID; /* lookup by docid */ | ||
2872 | TRACE(("FTS3 QUERY_DOCID\n")); | ||
2873 | } else if( pConstraint->iColumn>=0 && pConstraint->iColumn<=v->nColumn && | ||
2874 | pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){ | ||
2875 | /* full-text search */ | ||
2876 | pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn; | ||
2877 | TRACE(("FTS3 QUERY_FULLTEXT %d\n", pConstraint->iColumn)); | ||
2878 | } else continue; | ||
2879 | |||
2880 | pInfo->aConstraintUsage[i].argvIndex = 1; | ||
2881 | pInfo->aConstraintUsage[i].omit = 1; | ||
2882 | |||
2883 | /* An arbitrary value for now. | ||
2884 | * TODO: Perhaps docid matches should be considered cheaper than | ||
2885 | * full-text searches. */ | ||
2886 | pInfo->estimatedCost = 1.0; | ||
2887 | |||
2888 | return SQLITE_OK; | ||
2889 | } | ||
2890 | } | ||
2891 | pInfo->idxNum = QUERY_GENERIC; | ||
2892 | return SQLITE_OK; | ||
2893 | } | ||
2894 | |||
2895 | static int fulltextDisconnect(sqlite3_vtab *pVTab){ | ||
2896 | TRACE(("FTS3 Disconnect %p\n", pVTab)); | ||
2897 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2898 | return SQLITE_OK; | ||
2899 | } | ||
2900 | |||
2901 | static int fulltextDestroy(sqlite3_vtab *pVTab){ | ||
2902 | fulltext_vtab *v = (fulltext_vtab *)pVTab; | ||
2903 | int rc; | ||
2904 | |||
2905 | TRACE(("FTS3 Destroy %p\n", pVTab)); | ||
2906 | rc = sql_exec(v->db, v->zDb, v->zName, | ||
2907 | "drop table if exists %_content;" | ||
2908 | "drop table if exists %_segments;" | ||
2909 | "drop table if exists %_segdir;" | ||
2910 | ); | ||
2911 | if( rc!=SQLITE_OK ) return rc; | ||
2912 | |||
2913 | fulltext_vtab_destroy((fulltext_vtab *)pVTab); | ||
2914 | return SQLITE_OK; | ||
2915 | } | ||
2916 | |||
2917 | static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ | ||
2918 | fulltext_cursor *c; | ||
2919 | |||
2920 | c = (fulltext_cursor *) calloc(1, sizeof(fulltext_cursor)); | ||
| if( c==0 ) return SQLITE_NOMEM; | ||
2921 | /* sqlite will initialize c->base */ | ||
2922 | *ppCursor = &c->base; | ||
2923 | TRACE(("FTS3 Open %p: %p\n", pVTab, c)); | ||
2924 | |||
2925 | return SQLITE_OK; | ||
2926 | } | ||
2927 | |||
2928 | |||
2929 | /* Free all of the dynamically allocated memory held by *q | ||
2930 | */ | ||
2931 | static void queryClear(Query *q){ | ||
2932 | int i; | ||
2933 | for(i = 0; i < q->nTerms; ++i){ | ||
2934 | free(q->pTerms[i].pTerm); | ||
2935 | } | ||
2936 | free(q->pTerms); | ||
2937 | CLEAR(q); | ||
2938 | } | ||
2939 | |||
2940 | /* Free all of the dynamically allocated memory held by the | ||
2941 | ** Snippet | ||
2942 | */ | ||
2943 | static void snippetClear(Snippet *p){ | ||
2944 | free(p->aMatch); | ||
2945 | free(p->zOffset); | ||
2946 | free(p->zSnippet); | ||
2947 | CLEAR(p); | ||
2948 | } | ||
2949 | /* | ||
2950 | ** Append a single entry to the p->aMatch[] log. | ||
2951 | */ | ||
2952 | static void snippetAppendMatch( | ||
2953 | Snippet *p, /* Append the entry to this snippet */ | ||
2954 | int iCol, int iTerm, /* The column and query term */ | ||
2955 | int iStart, int nByte /* Offset and size of the match */ | ||
2956 | ){ | ||
2957 | int i; | ||
2958 | struct snippetMatch *pMatch, *aNew; | ||
2959 | if( p->nMatch+1>=p->nAlloc ){ | ||
2960 | p->nAlloc = p->nAlloc*2 + 10; | ||
2961 | aNew = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) ); | ||
2962 | if( aNew==0 ){ | ||
| free(p->aMatch); /* don't leak the old array when realloc fails */ | ||
| p->aMatch = 0; | ||
2963 | p->nMatch = 0; | ||
2964 | p->nAlloc = 0; | ||
2965 | return; | ||
2966 | } | ||
| p->aMatch = aNew; | ||
2967 | } | ||
2968 | i = p->nMatch++; | ||
2969 | pMatch = &p->aMatch[i]; | ||
2970 | pMatch->iCol = iCol; | ||
2971 | pMatch->iTerm = iTerm; | ||
2972 | pMatch->iStart = iStart; | ||
2973 | pMatch->nByte = nByte; | ||
2974 | } | ||
2975 | |||
2976 | /* | ||
2977 | ** Sizing information for the circular buffer used in snippetOffsetsOfColumn() | ||
2978 | */ | ||
2979 | #define FTS3_ROTOR_SZ (32) | ||
2980 | #define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1) | ||
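| /* Note (editorial): FTS3_ROTOR_SZ must stay a power of two so that | ||
| ** (i & FTS3_ROTOR_MASK) is equivalent to i % FTS3_ROTOR_SZ. | ||
| */ | ||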
2981 | |||
2982 | /* | ||
2983 | ** Add entries to pSnippet->aMatch[] for every match that occurs against | ||
2984 | ** document zDoc[0..nDoc-1] which is stored in column iColumn. | ||
2985 | */ | ||
2986 | static void snippetOffsetsOfColumn( | ||
2987 | Query *pQuery, | ||
2988 | Snippet *pSnippet, | ||
2989 | int iColumn, | ||
2990 | const char *zDoc, | ||
2991 | int nDoc | ||
2992 | ){ | ||
2993 | const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */ | ||
2994 | sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */ | ||
2995 | sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */ | ||
2996 | fulltext_vtab *pVtab; /* The full text index */ | ||
2997 | int nColumn; /* Number of columns in the index */ | ||
2998 | const QueryTerm *aTerm; /* Query string terms */ | ||
2999 | int nTerm; /* Number of query string terms */ | ||
3000 | int i, j; /* Loop counters */ | ||
3001 | int rc; /* Return code */ | ||
3002 | unsigned int match, prevMatch; /* Phrase search bitmasks */ | ||
3003 | const char *zToken; /* Next token from the tokenizer */ | ||
3004 | int nToken; /* Size of zToken */ | ||
3005 | int iBegin, iEnd, iPos; /* Offsets of beginning and end */ | ||
3006 | |||
3007 | /* The following variables keep a circular buffer of the last | ||
3008 | ** few tokens */ | ||
3009 | unsigned int iRotor = 0; /* Index of current token */ | ||
3010 | int iRotorBegin[FTS3_ROTOR_SZ]; /* Beginning offset of token */ | ||
3011 | int iRotorLen[FTS3_ROTOR_SZ]; /* Length of token */ | ||
3012 | |||
3013 | pVtab = pQuery->pFts; | ||
3014 | nColumn = pVtab->nColumn; | ||
3015 | pTokenizer = pVtab->pTokenizer; | ||
3016 | pTModule = pTokenizer->pModule; | ||
3017 | rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor); | ||
3018 | if( rc ) return; | ||
3019 | pTCursor->pTokenizer = pTokenizer; | ||
3020 | aTerm = pQuery->pTerms; | ||
3021 | nTerm = pQuery->nTerms; | ||
3022 | if( nTerm>=FTS3_ROTOR_SZ ){ | ||
3023 | nTerm = FTS3_ROTOR_SZ - 1; | ||
3024 | } | ||
3025 | prevMatch = 0; | ||
3026 | while(1){ | ||
3027 | rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos); | ||
3028 | if( rc ) break; | ||
3029 | iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin; | ||
3030 | iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin; | ||
3031 | match = 0; | ||
3032 | for(i=0; i<nTerm; i++){ | ||
3033 | int iCol; | ||
3034 | iCol = aTerm[i].iColumn; | ||
3035 | if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue; | ||
3036 | if( aTerm[i].nTerm>nToken ) continue; | ||
3037 | if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue; | ||
3038 | assert( aTerm[i].nTerm<=nToken ); | ||
3039 | if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue; | ||
3040 | if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue; | ||
3041 | match |= 1<<i; | ||
3042 | if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){ | ||
3043 | for(j=aTerm[i].iPhrase-1; j>=0; j--){ | ||
3044 | int k = (iRotor-j) & FTS3_ROTOR_MASK; | ||
3045 | snippetAppendMatch(pSnippet, iColumn, i-j, | ||
3046 | iRotorBegin[k], iRotorLen[k]); | ||
3047 | } | ||
3048 | } | ||
3049 | } | ||
3050 | prevMatch = match<<1; | ||
3051 | iRotor++; | ||
3052 | } | ||
3053 | pTModule->xClose(pTCursor); | ||
3054 | } | ||
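| /* Worked example (illustrative sketch): for the phrase query "sleight | ||
| ** of hand", aTerm[0..2] carry iPhrase 1,2,3.  Scanning the document | ||
| ** text "sleight of hand" proceeds as: | ||
| ** | ||
| **   token      match bits set     prevMatch for next token | ||
| **   sleight    0x1 (iPhrase==1)   0x2 | ||
| **   of         0x2 (needs 0x2)    0x4 | ||
| **   hand       0x4 (needs 0x4)    - | ||
| ** | ||
| ** Only when the last phrase term matches does the inner loop walk the | ||
| ** rotor backwards and append one snippetMatch per term of the phrase. | ||
| */ | ||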
3055 | |||
3056 | |||
3057 | /* | ||
3058 | ** Compute all offsets for the current row of the query. | ||
3059 | ** If the offsets have already been computed, this routine is a no-op. | ||
3060 | */ | ||
3061 | static void snippetAllOffsets(fulltext_cursor *p){ | ||
3062 | int nColumn; | ||
3063 | int iColumn, i; | ||
3064 | int iFirst, iLast; | ||
3065 | fulltext_vtab *pFts; | ||
3066 | |||
3067 | if( p->snippet.nMatch ) return; | ||
3068 | if( p->q.nTerms==0 ) return; | ||
3069 | pFts = p->q.pFts; | ||
3070 | nColumn = pFts->nColumn; | ||
3071 | iColumn = (p->iCursorType - QUERY_FULLTEXT); | ||
3072 | if( iColumn<0 || iColumn>=nColumn ){ | ||
3073 | iFirst = 0; | ||
3074 | iLast = nColumn-1; | ||
3075 | }else{ | ||
3076 | iFirst = iColumn; | ||
3077 | iLast = iColumn; | ||
3078 | } | ||
3079 | for(i=iFirst; i<=iLast; i++){ | ||
3080 | const char *zDoc; | ||
3081 | int nDoc; | ||
3082 | zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1); | ||
3083 | nDoc = sqlite3_column_bytes(p->pStmt, i+1); | ||
3084 | snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); | ||
3085 | } | ||
3086 | } | ||
3087 | |||
3088 | /* | ||
3089 | ** Convert the information in the aMatch[] array of the snippet | ||
3090 | ** into the string zOffset[0..nOffset-1]. | ||
3091 | */ | ||
3092 | static void snippetOffsetText(Snippet *p){ | ||
3093 | int i; | ||
3094 | int cnt = 0; | ||
3095 | StringBuffer sb; | ||
3096 | char zBuf[200]; | ||
3097 | if( p->zOffset ) return; | ||
3098 | initStringBuffer(&sb); | ||
3099 | for(i=0; i<p->nMatch; i++){ | ||
3100 | struct snippetMatch *pMatch = &p->aMatch[i]; | ||
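| /* zBuf[0] is a separator space; &zBuf[cnt>0] skips it for the first | ||
| ** entry.  200 bytes easily holds four decimal ints plus separators. */ | ||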
3101 | zBuf[0] = ' '; | ||
3102 | sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol, | ||
3103 | pMatch->iTerm, pMatch->iStart, pMatch->nByte); | ||
3104 | append(&sb, zBuf); | ||
3105 | cnt++; | ||
3106 | } | ||
3107 | p->zOffset = stringBufferData(&sb); | ||
3108 | p->nOffset = stringBufferLength(&sb); | ||
3109 | } | ||
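| /* Example (illustrative sketch): the string built above is a space- | ||
| ** separated list of 4-tuples, "<column> <term> <byteOffset> <byteLength>". | ||
| ** For instance "0 0 6 5 1 0 12 5" says query term 0 matched 5 bytes at | ||
| ** offset 6 of column 0 and 5 bytes at offset 12 of column 1. | ||
| */ | ||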
3110 | |||
3111 | /* | ||
3112 | ** zDoc[0..nDoc-1] is a phrase of text. aMatch[0..nMatch-1] is a set | ||
3113 | ** of matching words, some of which might be in zDoc. zDoc is the text | ||
3114 | ** of column number iCol. | ||
3115 | ** | ||
3116 | ** iBreak is a suggested spot in zDoc where we could begin or end an | ||
3117 | ** excerpt. Return a value similar to iBreak but possibly adjusted | ||
3118 | ** a little to the left or right so that the break point is better. | ||
3119 | */ | ||
3120 | static int wordBoundary( | ||
3121 | int iBreak, /* The suggested break point */ | ||
3122 | const char *zDoc, /* Document text */ | ||
3123 | int nDoc, /* Number of bytes in zDoc[] */ | ||
3124 | struct snippetMatch *aMatch, /* Matching words */ | ||
3125 | int nMatch, /* Number of entries in aMatch[] */ | ||
3126 | int iCol /* The column number for zDoc[] */ | ||
3127 | ){ | ||
3128 | int i; | ||
3129 | if( iBreak<=10 ){ | ||
3130 | return 0; | ||
3131 | } | ||
3132 | if( iBreak>=nDoc-10 ){ | ||
3133 | return nDoc; | ||
3134 | } | ||
3135 | for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){} | ||
3136 | while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; } | ||
3137 | if( i<nMatch ){ | ||
3138 | if( aMatch[i].iStart<iBreak+10 ){ | ||
3139 | return aMatch[i].iStart; | ||
3140 | } | ||
3141 | if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ | ||
3142 | return aMatch[i-1].iStart; | ||
3143 | } | ||
3144 | } | ||
3145 | for(i=1; i<=10; i++){ | ||
3146 | if( safe_isspace(zDoc[iBreak-i]) ){ | ||
3147 | return iBreak - i + 1; | ||
3148 | } | ||
3149 | if( safe_isspace(zDoc[iBreak+i]) ){ | ||
3150 | return iBreak + i + 1; | ||
3151 | } | ||
3152 | } | ||
3153 | return iBreak; | ||
3154 | } | ||
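| /* Example (illustrative sketch): with nDoc=100 and iBreak=50, the code | ||
| ** first tries to snap to the start of a nearby match, then scans up to | ||
| ** 10 bytes left and right for whitespace, and otherwise returns 50 | ||
| ** unchanged.  iBreak<=10 or iBreak>=nDoc-10 snap to the document ends. | ||
| */ | ||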
3155 | |||
3156 | |||
3157 | |||
3158 | /* | ||
3159 | ** Allowed values for Snippet.aMatch[].snStatus | ||
3160 | */ | ||
3161 | #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */ | ||
3162 | #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */ | ||
3163 | |||
3164 | /* | ||
3165 | ** Generate the text of a snippet. | ||
3166 | */ | ||
3167 | static void snippetText( | ||
3168 | fulltext_cursor *pCursor, /* The cursor we need the snippet for */ | ||
3169 | const char *zStartMark, /* Markup to appear before each match */ | ||
3170 | const char *zEndMark, /* Markup to appear after each match */ | ||
3171 | const char *zEllipsis /* Ellipsis mark */ | ||
3172 | ){ | ||
3173 | int i, j; | ||
3174 | struct snippetMatch *aMatch; | ||
3175 | int nMatch; | ||
3176 | int nDesired; | ||
3177 | StringBuffer sb; | ||
3178 | int tailCol; | ||
3179 | int tailOffset; | ||
3180 | int iCol; | ||
3181 | int nDoc; | ||
3182 | const char *zDoc; | ||
3183 | int iStart, iEnd; | ||
3184 | int tailEllipsis = 0; | ||
3185 | int iMatch; | ||
3186 | |||
3187 | |||
3188 | free(pCursor->snippet.zSnippet); | ||
3189 | pCursor->snippet.zSnippet = 0; | ||
3190 | aMatch = pCursor->snippet.aMatch; | ||
3191 | nMatch = pCursor->snippet.nMatch; | ||
3192 | initStringBuffer(&sb); | ||
3193 | |||
3194 | for(i=0; i<nMatch; i++){ | ||
3195 | aMatch[i].snStatus = SNIPPET_IGNORE; | ||
3196 | } | ||
3197 | nDesired = 0; | ||
3198 | for(i=0; i<pCursor->q.nTerms; i++){ | ||
3199 | for(j=0; j<nMatch; j++){ | ||
3200 | if( aMatch[j].iTerm==i ){ | ||
3201 | aMatch[j].snStatus = SNIPPET_DESIRED; | ||
3202 | nDesired++; | ||
3203 | break; | ||
3204 | } | ||
3205 | } | ||
3206 | } | ||
3207 | |||
3208 | iMatch = 0; | ||
3209 | tailCol = -1; | ||
3210 | tailOffset = 0; | ||
3211 | for(i=0; i<nMatch && nDesired>0; i++){ | ||
3212 | if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue; | ||
3213 | nDesired--; | ||
3214 | iCol = aMatch[i].iCol; | ||
3215 | zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1); | ||
3216 | nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1); | ||
3217 | iStart = aMatch[i].iStart - 40; | ||
3218 | iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol); | ||
3219 | if( iStart<=10 ){ | ||
3220 | iStart = 0; | ||
3221 | } | ||
3222 | if( iCol==tailCol && iStart<=tailOffset+20 ){ | ||
3223 | iStart = tailOffset; | ||
3224 | } | ||
3225 | if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){ | ||
3226 | trimWhiteSpace(&sb); | ||
3227 | appendWhiteSpace(&sb); | ||
3228 | append(&sb, zEllipsis); | ||
3229 | appendWhiteSpace(&sb); | ||
3230 | } | ||
3231 | iEnd = aMatch[i].iStart + aMatch[i].nByte + 40; | ||
3232 | iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol); | ||
3233 | if( iEnd>=nDoc-10 ){ | ||
3234 | iEnd = nDoc; | ||
3235 | tailEllipsis = 0; | ||
3236 | }else{ | ||
3237 | tailEllipsis = 1; | ||
3238 | } | ||
3239 | while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; } | ||
3240 | while( iStart<iEnd ){ | ||
3241 | while( iMatch<nMatch && aMatch[iMatch].iStart<iStart | ||
3242 | && aMatch[iMatch].iCol<=iCol ){ | ||
3243 | iMatch++; | ||
3244 | } | ||
3245 | if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd | ||
3246 | && aMatch[iMatch].iCol==iCol ){ | ||
3247 | nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart); | ||
3248 | iStart = aMatch[iMatch].iStart; | ||
3249 | append(&sb, zStartMark); | ||
3250 | nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte); | ||
3251 | append(&sb, zEndMark); | ||
3252 | iStart += aMatch[iMatch].nByte; | ||
3253 | for(j=iMatch+1; j<nMatch; j++){ | ||
3254 | if( aMatch[j].iTerm==aMatch[iMatch].iTerm | ||
3255 | && aMatch[j].snStatus==SNIPPET_DESIRED ){ | ||
3256 | nDesired--; | ||
3257 | aMatch[j].snStatus = SNIPPET_IGNORE; | ||
3258 | } | ||
3259 | } | ||
3260 | }else{ | ||
3261 | nappend(&sb, &zDoc[iStart], iEnd - iStart); | ||
3262 | iStart = iEnd; | ||
3263 | } | ||
3264 | } | ||
3265 | tailCol = iCol; | ||
3266 | tailOffset = iEnd; | ||
3267 | } | ||
3268 | trimWhiteSpace(&sb); | ||
3269 | if( tailEllipsis ){ | ||
3270 | appendWhiteSpace(&sb); | ||
3271 | append(&sb, zEllipsis); | ||
3272 | } | ||
3273 | pCursor->snippet.zSnippet = stringBufferData(&sb); | ||
3274 | pCursor->snippet.nSnippet = stringBufferLength(&sb); | ||
3275 | } | ||
3276 | |||
3277 | |||
3278 | /* | ||
3279 | ** Close the cursor. For additional information see the documentation | ||
3280 | ** on the xClose method of the virtual table interface. | ||
3281 | */ | ||
3282 | static int fulltextClose(sqlite3_vtab_cursor *pCursor){ | ||
3283 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3284 | TRACE(("FTS3 Close %p\n", c)); | ||
3285 | sqlite3_finalize(c->pStmt); | ||
3286 | queryClear(&c->q); | ||
3287 | snippetClear(&c->snippet); | ||
3288 | if( c->result.nData!=0 ) dlrDestroy(&c->reader); | ||
3289 | dataBufferDestroy(&c->result); | ||
3290 | free(c); | ||
3291 | return SQLITE_OK; | ||
3292 | } | ||
3293 | |||
3294 | static int fulltextNext(sqlite3_vtab_cursor *pCursor){ | ||
3295 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3296 | int rc; | ||
3297 | |||
3298 | TRACE(("FTS3 Next %p\n", pCursor)); | ||
3299 | snippetClear(&c->snippet); | ||
3300 | if( c->iCursorType < QUERY_FULLTEXT ){ | ||
3301 | /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | ||
3302 | rc = sqlite3_step(c->pStmt); | ||
3303 | switch( rc ){ | ||
3304 | case SQLITE_ROW: | ||
3305 | c->eof = 0; | ||
3306 | return SQLITE_OK; | ||
3307 | case SQLITE_DONE: | ||
3308 | c->eof = 1; | ||
3309 | return SQLITE_OK; | ||
3310 | default: | ||
3311 | c->eof = 1; | ||
3312 | return rc; | ||
3313 | } | ||
3314 | } else { /* full-text query */ | ||
3315 | rc = sqlite3_reset(c->pStmt); | ||
3316 | if( rc!=SQLITE_OK ) return rc; | ||
3317 | |||
3318 | if( c->result.nData==0 || dlrAtEnd(&c->reader) ){ | ||
3319 | c->eof = 1; | ||
3320 | return SQLITE_OK; | ||
3321 | } | ||
3322 | rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader)); | ||
3323 | dlrStep(&c->reader); | ||
3324 | if( rc!=SQLITE_OK ) return rc; | ||
3325 | /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ | ||
3326 | rc = sqlite3_step(c->pStmt); | ||
3327 | if( rc==SQLITE_ROW ){ /* the case we expect */ | ||
3328 | c->eof = 0; | ||
3329 | return SQLITE_OK; | ||
3330 | } | ||
3331 | /* an error occurred; abort */ | ||
3332 | return rc==SQLITE_DONE ? SQLITE_ERROR : rc; | ||
3333 | } | ||
3334 | } | ||
3335 | |||
3336 | |||
3337 | /* TODO(shess) If we pushed LeafReader to the top of the file, or to | ||
3338 | ** another file, term_select() could be pushed above | ||
3339 | ** docListOfTerm(). | ||
3340 | */ | ||
3341 | static int termSelect(fulltext_vtab *v, int iColumn, | ||
3342 | const char *pTerm, int nTerm, int isPrefix, | ||
3343 | DocListType iType, DataBuffer *out); | ||
3344 | |||
3345 | /* Return a DocList corresponding to the query term *pQTerm. If *pQTerm | ||
3346 | ** is the first term of a phrase query, go ahead and evaluate the phrase | ||
3347 | ** query and return the doclist for the entire phrase query. | ||
3348 | ** | ||
3349 | ** The resulting DL_DOCIDS doclist is stored in pResult, which is | ||
3350 | ** overwritten. | ||
3351 | */ | ||
3352 | static int docListOfTerm( | ||
3353 | fulltext_vtab *v, /* The full text index */ | ||
3354 | int iColumn, /* column to restrict to. No restriction if >=nColumn */ | ||
3355 | QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */ | ||
3356 | DataBuffer *pResult /* Write the result here */ | ||
3357 | ){ | ||
3358 | DataBuffer left, right, new; | ||
3359 | int i, rc; | ||
3360 | |||
3361 | /* No phrase search if no position info. */ | ||
3362 | assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS ); | ||
3363 | |||
3364 | /* This code should never be called with buffered updates. */ | ||
3365 | assert( v->nPendingData<0 ); | ||
3366 | |||
3367 | dataBufferInit(&left, 0); | ||
3368 | rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix, | ||
3369 | 0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left); | ||
3370 | if( rc ) return rc; | ||
3371 | for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){ | ||
3372 | dataBufferInit(&right, 0); | ||
3373 | rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, | ||
3374 | pQTerm[i].isPrefix, DL_POSITIONS, &right); | ||
3375 | if( rc ){ | ||
3376 | dataBufferDestroy(&left); | ||
3377 | return rc; | ||
3378 | } | ||
3379 | dataBufferInit(&new, 0); | ||
3380 | docListPhraseMerge(left.pData, left.nData, right.pData, right.nData, | ||
3381 | i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new); | ||
3382 | dataBufferDestroy(&left); | ||
3383 | dataBufferDestroy(&right); | ||
3384 | left = new; | ||
3385 | } | ||
3386 | *pResult = left; | ||
3387 | return SQLITE_OK; | ||
3388 | } | ||
3389 | |||
3390 | /* Add a new term pTerm[0..nTerm-1] to the query *q. | ||
3391 | */ | ||
3392 | static void queryAdd(Query *q, const char *pTerm, int nTerm){ | ||
3393 | QueryTerm *t; | ||
3394 | ++q->nTerms; | ||
3395 | t = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0])); | ||
3396 | if( t==0 ){ | ||
| free(q->pTerms); /* don't leak the old array when realloc fails */ | ||
| q->pTerms = 0; | ||
3397 | q->nTerms = 0; | ||
3398 | return; | ||
3399 | } | ||
| q->pTerms = t; | ||
3400 | t = &q->pTerms[q->nTerms - 1]; | ||
3401 | CLEAR(t); | ||
3402 | t->pTerm = malloc(nTerm+1); | ||
| if( t->pTerm==0 ){ --q->nTerms; return; } /* OOM: drop this term */ | ||
3403 | memcpy(t->pTerm, pTerm, nTerm); | ||
3404 | t->pTerm[nTerm] = 0; | ||
3405 | t->nTerm = nTerm; | ||
3406 | t->isOr = q->nextIsOr; | ||
3407 | t->isPrefix = 0; | ||
3408 | q->nextIsOr = 0; | ||
3409 | t->iColumn = q->nextColumn; | ||
3410 | q->nextColumn = q->dfltColumn; | ||
3411 | } | ||
3412 | |||
3413 | /* | ||
3414 | ** Check to see if the string zToken[0..nToken-1] matches any | ||
3415 | ** column name in the virtual table. If it does, | ||
3416 | ** return the zero-indexed column number. If not, return -1. | ||
3417 | */ | ||
3418 | static int checkColumnSpecifier( | ||
3419 | fulltext_vtab *pVtab, /* The virtual table */ | ||
3420 | const char *zToken, /* Text of the token */ | ||
3421 | int nToken /* Number of characters in the token */ | ||
3422 | ){ | ||
3423 | int i; | ||
3424 | for(i=0; i<pVtab->nColumn; i++){ | ||
3425 | if( memcmp(pVtab->azColumn[i], zToken, nToken)==0 | ||
3426 | && pVtab->azColumn[i][nToken]==0 ){ | ||
3427 | return i; | ||
3428 | } | ||
3429 | } | ||
3430 | return -1; | ||
3431 | } | ||
3432 | |||
3433 | /* | ||
3434 | ** Parse the text at pSegment[0..nSegment-1]. Add additional terms | ||
3435 | ** to the query being assembled in pQuery. | ||
3436 | ** | ||
3437 | ** inPhrase is true if pSegment[0..nSegment-1] is contained within | ||
3438 | ** double-quotes. If inPhrase is true, then the first term | ||
3439 | ** is marked with the number of terms in the phrase minus one, and | ||
3440 | ** the OR and "-" syntax is ignored. If inPhrase is false, then every | ||
3441 | ** term found is marked with nPhrase=0 and the OR and "-" syntax is significant. | ||
3442 | */ | ||
3443 | static int tokenizeSegment( | ||
3444 | sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */ | ||
3445 | const char *pSegment, int nSegment, /* Query expression being parsed */ | ||
3446 | int inPhrase, /* True if within "..." */ | ||
3447 | Query *pQuery /* Append results here */ | ||
3448 | ){ | ||
3449 | const sqlite3_tokenizer_module *pModule = pTokenizer->pModule; | ||
3450 | sqlite3_tokenizer_cursor *pCursor; | ||
3451 | int firstIndex = pQuery->nTerms; | ||
3452 | int iCol; | ||
3453 | int nTerm = 1; | ||
3454 | |||
3455 | int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor); | ||
3456 | if( rc!=SQLITE_OK ) return rc; | ||
3457 | pCursor->pTokenizer = pTokenizer; | ||
3458 | |||
3459 | while( 1 ){ | ||
3460 | const char *pToken; | ||
3461 | int nToken, iBegin, iEnd, iPos; | ||
3462 | |||
3463 | rc = pModule->xNext(pCursor, | ||
3464 | &pToken, &nToken, | ||
3465 | &iBegin, &iEnd, &iPos); | ||
3466 | if( rc!=SQLITE_OK ) break; | ||
3467 | if( !inPhrase && | ||
3468 | pSegment[iEnd]==':' && | ||
3469 | (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){ | ||
3470 | pQuery->nextColumn = iCol; | ||
3471 | continue; | ||
3472 | } | ||
3473 | if( !inPhrase && pQuery->nTerms>0 && nToken==2 | ||
3474 | && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){ | ||
3475 | pQuery->nextIsOr = 1; | ||
3476 | continue; | ||
3477 | } | ||
3478 | queryAdd(pQuery, pToken, nToken); | ||
3479 | if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){ | ||
3480 | pQuery->pTerms[pQuery->nTerms-1].isNot = 1; | ||
3481 | } | ||
3482 | if( iEnd<nSegment && pSegment[iEnd]=='*' ){ | ||
3483 | pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1; | ||
3484 | } | ||
3485 | pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm; | ||
3486 | if( inPhrase ){ | ||
3487 | nTerm++; | ||
3488 | } | ||
3489 | } | ||
3490 | |||
3491 | if( inPhrase && pQuery->nTerms>firstIndex ){ | ||
3492 | pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1; | ||
3493 | } | ||
3494 | |||
3495 | return pModule->xClose(pCursor); | ||
3496 | } | ||
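| /* Example (illustrative sketch): with the default "simple" tokenizer, | ||
| ** the unquoted segment   title:sqlite OR data*   should yield: | ||
| ** | ||
| **   pTerms[0]: pTerm="sqlite", iColumn=<index of column "title">, isOr=0 | ||
| **   pTerms[1]: pTerm="data",   iColumn=dfltColumn, isOr=1, isPrefix=1 | ||
| ** | ||
| ** A leading '-' would instead set isNot on the term that follows it. | ||
| */ | ||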
3497 | |||
3498 | /* Parse a query string, yielding a Query object pQuery. | ||
3499 | ** | ||
3500 | ** The calling function will need to call queryClear() to clean up | ||
3501 | ** the dynamically allocated memory held by pQuery. | ||
3502 | */ | ||
3503 | static int parseQuery( | ||
3504 | fulltext_vtab *v, /* The fulltext index */ | ||
3505 | const char *zInput, /* Input text of the query string */ | ||
3506 | int nInput, /* Size of the input text */ | ||
3507 | int dfltColumn, /* Default column of the index to match against */ | ||
3508 | Query *pQuery /* Write the parse results here. */ | ||
3509 | ){ | ||
3510 | int iInput, inPhrase = 0; | ||
3511 | |||
3512 | if( zInput==0 ) nInput = 0; | ||
3513 | if( nInput<0 ) nInput = strlen(zInput); | ||
3514 | pQuery->nTerms = 0; | ||
3515 | pQuery->pTerms = NULL; | ||
3516 | pQuery->nextIsOr = 0; | ||
3517 | pQuery->nextColumn = dfltColumn; | ||
3518 | pQuery->dfltColumn = dfltColumn; | ||
3519 | pQuery->pFts = v; | ||
3520 | |||
3521 | for(iInput=0; iInput<nInput; ++iInput){ | ||
3522 | int i; | ||
3523 | for(i=iInput; i<nInput && zInput[i]!='"'; ++i){} | ||
3524 | if( i>iInput ){ | ||
3525 | tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase, | ||
3526 | pQuery); | ||
3527 | } | ||
3528 | iInput = i; | ||
3529 | if( i<nInput ){ | ||
3530 | assert( zInput[i]=='"' ); | ||
3531 | inPhrase = !inPhrase; | ||
3532 | } | ||
3533 | } | ||
3534 | |||
3535 | if( inPhrase ){ | ||
3536 | /* unmatched quote */ | ||
3537 | queryClear(pQuery); | ||
3538 | return SQLITE_ERROR; | ||
3539 | } | ||
3540 | return SQLITE_OK; | ||
3541 | } | ||
3542 | |||
3543 | /* TODO(shess) Refactor the code to remove this forward decl. */ | ||
3544 | static int flushPendingTerms(fulltext_vtab *v); | ||
3545 | |||
3546 | /* Perform a full-text query using the search expression in | ||
3547 | ** zInput[0..nInput-1]. Return a list of matching documents | ||
3548 | ** in pResult. | ||
3549 | ** | ||
3550 | ** Queries must match column iColumn. If iColumn>=nColumn, | ||
3551 | ** they are allowed to match against any column. | ||
3552 | */ | ||
3553 | static int fulltextQuery( | ||
3554 | fulltext_vtab *v, /* The full text index */ | ||
3555 | int iColumn, /* Match against this column by default */ | ||
3556 | const char *zInput, /* The query string */ | ||
3557 | int nInput, /* Number of bytes in zInput[] */ | ||
3558 | DataBuffer *pResult, /* Write the result doclist here */ | ||
3559 | Query *pQuery /* Put parsed query string here */ | ||
3560 | ){ | ||
3561 | int i, iNext, rc; | ||
3562 | DataBuffer left, right, or, new; | ||
3563 | int nNot = 0; | ||
3564 | QueryTerm *aTerm; | ||
3565 | |||
3566 | /* TODO(shess) Instead of flushing pendingTerms, we could query for | ||
3567 | ** the relevant term and merge the doclist into what we receive from | ||
3568 | ** the database. Wait and see if this is a common issue, first. | ||
3569 | ** | ||
3570 | ** A good reason not to flush is to not generate update-related | ||
3571 | ** error codes from here. | ||
3572 | */ | ||
3573 | |||
3574 | /* Flush any buffered updates before executing the query. */ | ||
3575 | rc = flushPendingTerms(v); | ||
3576 | if( rc!=SQLITE_OK ) return rc; | ||
3577 | |||
3578 | /* TODO(shess) I think that the queryClear() calls below are not | ||
3579 | ** necessary, because fulltextClose() already clears the query. | ||
3580 | */ | ||
3581 | rc = parseQuery(v, zInput, nInput, iColumn, pQuery); | ||
3582 | if( rc!=SQLITE_OK ) return rc; | ||
3583 | |||
3584 | /* Empty or NULL queries return no results. */ | ||
3585 | if( pQuery->nTerms==0 ){ | ||
3586 | dataBufferInit(pResult, 0); | ||
3587 | return SQLITE_OK; | ||
3588 | } | ||
3589 | |||
3590 | /* Merge AND terms. */ | ||
3591 | /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */ | ||
3592 | aTerm = pQuery->pTerms; | ||
3593 | for(i = 0; i<pQuery->nTerms; i=iNext){ | ||
3594 | if( aTerm[i].isNot ){ | ||
3595 | /* Handle all NOT terms in a separate pass */ | ||
3596 | nNot++; | ||
3597 | iNext = i + aTerm[i].nPhrase+1; | ||
3598 | continue; | ||
3599 | } | ||
3600 | iNext = i + aTerm[i].nPhrase + 1; | ||
3601 | rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right); | ||
3602 | if( rc ){ | ||
3603 | if( i!=nNot ) dataBufferDestroy(&left); | ||
3604 | queryClear(pQuery); | ||
3605 | return rc; | ||
3606 | } | ||
3607 | while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){ | ||
3608 | rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or); | ||
3609 | iNext += aTerm[iNext].nPhrase + 1; | ||
3610 | if( rc ){ | ||
3611 | if( i!=nNot ) dataBufferDestroy(&left); | ||
3612 | dataBufferDestroy(&right); | ||
3613 | queryClear(pQuery); | ||
3614 | return rc; | ||
3615 | } | ||
3616 | dataBufferInit(&new, 0); | ||
3617 | docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new); | ||
3618 | dataBufferDestroy(&right); | ||
3619 | dataBufferDestroy(&or); | ||
3620 | right = new; | ||
3621 | } | ||
3622 | if( i==nNot ){ /* first term processed. */ | ||
3623 | left = right; | ||
3624 | }else{ | ||
3625 | dataBufferInit(&new, 0); | ||
3626 | docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new); | ||
3627 | dataBufferDestroy(&right); | ||
3628 | dataBufferDestroy(&left); | ||
3629 | left = new; | ||
3630 | } | ||
3631 | } | ||
3632 | |||
3633 | if( nNot==pQuery->nTerms ){ | ||
3634 | /* We do not yet know how to handle a query of only NOT terms */ | ||
3635 | return SQLITE_ERROR; | ||
3636 | } | ||
3637 | |||
3638 | /* Do the EXCEPT terms */ | ||
3639 | for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){ | ||
3640 | if( !aTerm[i].isNot ) continue; | ||
3641 | rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right); | ||
3642 | if( rc ){ | ||
3643 | queryClear(pQuery); | ||
3644 | dataBufferDestroy(&left); | ||
3645 | return rc; | ||
3646 | } | ||
3647 | dataBufferInit(&new, 0); | ||
3648 | docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new); | ||
3649 | dataBufferDestroy(&right); | ||
3650 | dataBufferDestroy(&left); | ||
3651 | left = new; | ||
3652 | } | ||
3653 | |||
3654 | *pResult = left; | ||
3655 | return rc; | ||
3656 | } | ||
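| /* Example (illustrative sketch): for   apple OR pear cider -vinegar | ||
| ** the passes above evaluate, on DL_DOCIDS doclists: | ||
| ** | ||
| **   ((apple OR pear) AND cider) EXCEPT vinegar | ||
| ** | ||
| ** NOT terms are skipped in the first pass and subtracted in the second, | ||
| ** which is why a query made up solely of NOT terms is rejected. | ||
| */ | ||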
3657 | |||
3658 | /* | ||
3659 | ** This is the xFilter interface for the virtual table. See | ||
3660 | ** the virtual table xFilter method documentation for additional | ||
3661 | ** information. | ||
3662 | ** | ||
3663 | ** If idxNum==QUERY_GENERIC then do a full table scan against | ||
3664 | ** the %_content table. | ||
3665 | ** | ||
3666 | ** If idxNum==QUERY_DOCID then do a docid lookup for a single entry | ||
3667 | ** in the %_content table. | ||
3668 | ** | ||
3669 | ** If idxNum>=QUERY_FULLTEXT then use the full text index. The | ||
3670 | ** column on the left-hand side of the MATCH operator is column | ||
3671 | ** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand | ||
3672 | ** side of the MATCH operator. | ||
3673 | */ | ||
3674 | /* TODO(shess) Upgrade the cursor initialization and destruction to | ||
3675 | ** account for fulltextFilter() being called multiple times on the | ||
3676 | ** same cursor. The current solution is very fragile. Apply fix to | ||
3677 | ** fts3 as appropriate. | ||
3678 | */ | ||
3679 | static int fulltextFilter( | ||
3680 | sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */ | ||
3681 | int idxNum, const char *idxStr, /* Which indexing scheme to use */ | ||
3682 | int argc, sqlite3_value **argv /* Arguments for the indexing scheme */ | ||
3683 | ){ | ||
3684 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3685 | fulltext_vtab *v = cursor_vtab(c); | ||
3686 | int rc; | ||
3687 | StringBuffer sb; | ||
3688 | |||
3689 | TRACE(("FTS3 Filter %p\n",pCursor)); | ||
3690 | |||
3691 | initStringBuffer(&sb); | ||
3692 | append(&sb, "SELECT docid, "); | ||
3693 | appendList(&sb, v->nColumn, v->azContentColumn); | ||
3694 | append(&sb, " FROM %_content"); | ||
3695 | if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?"); | ||
3696 | sqlite3_finalize(c->pStmt); | ||
3697 | rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, stringBufferData(&sb)); | ||
3698 | stringBufferDestroy(&sb); | ||
3699 | if( rc!=SQLITE_OK ) return rc; | ||
3700 | |||
3701 | c->iCursorType = idxNum; | ||
3702 | switch( idxNum ){ | ||
3703 | case QUERY_GENERIC: | ||
3704 | break; | ||
3705 | |||
3706 | case QUERY_DOCID: | ||
3707 | rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0])); | ||
3708 | if( rc!=SQLITE_OK ) return rc; | ||
3709 | break; | ||
3710 | |||
3711 | default: /* full-text search */ | ||
3712 | { | ||
3713 | const char *zQuery = (const char *)sqlite3_value_text(argv[0]); | ||
3714 | assert( idxNum<=QUERY_FULLTEXT+v->nColumn); | ||
3715 | assert( argc==1 ); | ||
3716 | queryClear(&c->q); | ||
3717 | if( c->result.nData!=0 ){ | ||
3718 | /* This case happens if the same cursor is used repeatedly. */ | ||
3719 | dlrDestroy(&c->reader); | ||
3720 | dataBufferReset(&c->result); | ||
3721 | }else{ | ||
3722 | dataBufferInit(&c->result, 0); | ||
3723 | } | ||
3724 | rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q); | ||
3725 | if( rc!=SQLITE_OK ) return rc; | ||
3726 | if( c->result.nData!=0 ){ | ||
3727 | dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData); | ||
3728 | } | ||
3729 | break; | ||
3730 | } | ||
3731 | } | ||
3732 | |||
3733 | return fulltextNext(pCursor); | ||
3734 | } | ||
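| /* Example (illustrative sketch): for a hypothetical table t(a,b), | ||
| ** | ||
| **   SELECT docid FROM t WHERE a MATCH 'apple';   -- QUERY_FULLTEXT+0 | ||
| **   SELECT * FROM t WHERE docid = 42;            -- QUERY_DOCID | ||
| **   SELECT * FROM t;                             -- QUERY_GENERIC | ||
| ** | ||
| ** per the idxNum values chosen in fulltextBestIndex() above. | ||
| */ | ||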
3735 | |||
3736 | /* This is the xEof method of the virtual table. The SQLite core | ||
3737 | ** calls this routine to find out if it has reached the end of | ||
3738 | ** a query's result set. | ||
3739 | */ | ||
3740 | static int fulltextEof(sqlite3_vtab_cursor *pCursor){ | ||
3741 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3742 | return c->eof; | ||
3743 | } | ||
3744 | |||
3745 | /* This is the xColumn method of the virtual table. The SQLite | ||
3746 | ** core calls this method during a query when it needs the value | ||
3747 | ** of a column from the virtual table. This method needs to use | ||
3748 | ** one of the sqlite3_result_*() routines to store the requested | ||
3749 | ** value back in the pContext. | ||
3750 | */ | ||
3751 | static int fulltextColumn(sqlite3_vtab_cursor *pCursor, | ||
3752 | sqlite3_context *pContext, int idxCol){ | ||
3753 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3754 | fulltext_vtab *v = cursor_vtab(c); | ||
3755 | |||
3756 | if( idxCol<v->nColumn ){ | ||
3757 | sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1); | ||
3758 | sqlite3_result_value(pContext, pVal); | ||
3759 | }else if( idxCol==v->nColumn ){ | ||
3760 | /* The extra column whose name is the same as the table. | ||
3761 | ** Return a blob which is a pointer to the cursor | ||
3762 | */ | ||
3763 | sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT); | ||
3764 | }else if( idxCol==v->nColumn+1 ){ | ||
3765 | /* The docid column, which is an alias for rowid. */ | ||
3766 | sqlite3_value *pVal = sqlite3_column_value(c->pStmt, 0); | ||
3767 | sqlite3_result_value(pContext, pVal); | ||
3768 | } | ||
3769 | return SQLITE_OK; | ||
3770 | } | ||
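| /* Example (illustrative sketch): the "self" column is how the snippet() | ||
| ** and offsets() SQL functions locate this cursor: | ||
| ** | ||
| **   SELECT snippet(t) FROM t WHERE t MATCH 'apple'; | ||
| ** | ||
| ** The blob returned above is the cursor pointer those functions unpack. | ||
| */ | ||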
3771 | |||
3772 | /* This is the xRowid method. The SQLite core calls this routine to | ||
3773 | ** retrieve the rowid for the current row of the result set. fts3 | ||
3774 | ** exposes %_content.docid as the rowid for the virtual table. The | ||
3775 | ** rowid should be written to *pRowid. | ||
3776 | */ | ||
3777 | static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ | ||
3778 | fulltext_cursor *c = (fulltext_cursor *) pCursor; | ||
3779 | |||
3780 | *pRowid = sqlite3_column_int64(c->pStmt, 0); | ||
3781 | return SQLITE_OK; | ||
3782 | } | ||
3783 | |||
3784 | /* Add all terms in [zText] to pendingTerms table. If [iColumn] >= 0, | ||
3785 | ** we also store positions and offsets in the hash table using that | ||
3786 | ** column number. | ||
3787 | */ | ||
3788 | static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid, | ||
3789 | const char *zText, int iColumn){ | ||
3790 | sqlite3_tokenizer *pTokenizer = v->pTokenizer; | ||
3791 | sqlite3_tokenizer_cursor *pCursor; | ||
3792 | const char *pToken; | ||
3793 | int nTokenBytes; | ||
3794 | int iStartOffset, iEndOffset, iPosition; | ||
3795 | int rc; | ||
3796 | |||
3797 | rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); | ||
3798 | if( rc!=SQLITE_OK ) return rc; | ||
3799 | |||
3800 | pCursor->pTokenizer = pTokenizer; | ||
3801 | while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, | ||
3802 | &pToken, &nTokenBytes, | ||
3803 | &iStartOffset, &iEndOffset, | ||
3804 | &iPosition) ){ | ||
3805 | DLCollector *p; | ||
3806 | int nData; /* Size of doclist before our update. */ | ||
3807 | |||
3808 | /* Positions can't be negative; we use -1 as a terminator internally. */ | ||
3809 | if( iPosition<0 ){ | ||
3810 | pTokenizer->pModule->xClose(pCursor); | ||
3811 | return SQLITE_ERROR; | ||
3812 | } | ||
3813 | |||
3814 | p = fts3HashFind(&v->pendingTerms, pToken, nTokenBytes); | ||
3815 | if( p==NULL ){ | ||
3816 | nData = 0; | ||
3817 | p = dlcNew(iDocid, DL_DEFAULT); | ||
3818 | fts3HashInsert(&v->pendingTerms, pToken, nTokenBytes, p); | ||
3819 | |||
3820 | /* Overhead for our hash table entry, the key, and the value. */ | ||
3821 | v->nPendingData += sizeof(struct fts3HashElem)+sizeof(*p)+nTokenBytes; | ||
3822 | }else{ | ||
3823 | nData = p->b.nData; | ||
3824 | if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid); | ||
3825 | } | ||
3826 | if( iColumn>=0 ){ | ||
3827 | dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset); | ||
3828 | } | ||
3829 | |||
3830 | /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */ | ||
3831 | v->nPendingData += p->b.nData-nData; | ||
3832 | } | ||
3833 | |||
3834 | /* TODO(shess) Check return? Should this be able to cause errors at | ||
3835 | ** this point? Actually, same question about sqlite3_finalize(), | ||
3836 | ** though one could argue that failure there means that the data is | ||
3837 | ** not durable. *ponder* | ||
3838 | */ | ||
3839 | pTokenizer->pModule->xClose(pCursor); | ||
3840 | return rc; | ||
3841 | } | ||
3842 | |||
3843 | /* Add doclists for all terms in [pValues] to pendingTerms table. */ | ||
3844 | static int insertTerms(fulltext_vtab *v, sqlite_int64 iDocid, | ||
3845 | sqlite3_value **pValues){ | ||
3846 | int i; | ||
3847 | for(i = 0; i < v->nColumn ; ++i){ | ||
3848 | char *zText = (char*)sqlite3_value_text(pValues[i]); | ||
3849 | int rc = buildTerms(v, iDocid, zText, i); | ||
3850 | if( rc!=SQLITE_OK ) return rc; | ||
3851 | } | ||
3852 | return SQLITE_OK; | ||
3853 | } | ||
3854 | |||
3855 | /* Add empty doclists for all terms in the given row's content to | ||
3856 | ** pendingTerms. | ||
3857 | */ | ||
3858 | static int deleteTerms(fulltext_vtab *v, sqlite_int64 iDocid){ | ||
3859 | const char **pValues; | ||
3860 | int i, rc; | ||
3861 | |||
3862 | /* TODO(shess) Should we allow such tables at all? */ | ||
3863 | if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR; | ||
3864 | |||
3865 | rc = content_select(v, iDocid, &pValues); | ||
3866 | if( rc!=SQLITE_OK ) return rc; | ||
3867 | |||
3868 | for(i = 0 ; i < v->nColumn; ++i) { | ||
3869 | rc = buildTerms(v, iDocid, pValues[i], -1); | ||
3870 | if( rc!=SQLITE_OK ) break; | ||
3871 | } | ||
3872 | |||
3873 | freeStringArray(v->nColumn, pValues); | ||
3874 | return rc; /* propagate any error from buildTerms() */ | ||
3875 | } | ||
3876 | |||
3877 | /* TODO(shess) Refactor the code to remove this forward decl. */ | ||
3878 | static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid); | ||
3879 | |||
3880 | /* Insert a row into the %_content table; set *piDocid to be the ID of the | ||
3881 | ** new row. Add doclists for terms to pendingTerms. | ||
3882 | */ | ||
3883 | static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestDocid, | ||
3884 | sqlite3_value **pValues, sqlite_int64 *piDocid){ | ||
3885 | int rc; | ||
3886 | |||
3887 | rc = content_insert(v, pRequestDocid, pValues); /* execute an SQL INSERT */ | ||
3888 | if( rc!=SQLITE_OK ) return rc; | ||
3889 | |||
3890 | /* docid column is an alias for rowid. */ | ||
3891 | *piDocid = sqlite3_last_insert_rowid(v->db); | ||
3892 | rc = initPendingTerms(v, *piDocid); | ||
3893 | if( rc!=SQLITE_OK ) return rc; | ||
3894 | |||
3895 | return insertTerms(v, *piDocid, pValues); | ||
3896 | } | ||
3897 | |||
3898 | /* Delete a row from the %_content table; add empty doclists for terms | ||
3899 | ** to pendingTerms. | ||
3900 | */ | ||
3901 | static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ | ||
3902 | int rc = initPendingTerms(v, iRow); | ||
3903 | if( rc!=SQLITE_OK ) return rc; | ||
3904 | |||
3905 | rc = deleteTerms(v, iRow); | ||
3906 | if( rc!=SQLITE_OK ) return rc; | ||
3907 | |||
3908 | return content_delete(v, iRow); /* execute an SQL DELETE */ | ||
3909 | } | ||
3910 | |||
3911 | /* Update a row in the %_content table; add delete doclists to | ||
3912 | ** pendingTerms for old terms not in the new data, add insert doclists | ||
3913 | ** to pendingTerms for terms in the new data. | ||
3914 | */ | ||
3915 | static int index_update(fulltext_vtab *v, sqlite_int64 iRow, | ||
3916 | sqlite3_value **pValues){ | ||
3917 | int rc = initPendingTerms(v, iRow); | ||
3918 | if( rc!=SQLITE_OK ) return rc; | ||
3919 | |||
3920 | /* Generate an empty doclist for each term that previously appeared in this | ||
3921 | ** row. */ | ||
3922 | rc = deleteTerms(v, iRow); | ||
3923 | if( rc!=SQLITE_OK ) return rc; | ||
3924 | |||
3925 | rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */ | ||
3926 | if( rc!=SQLITE_OK ) return rc; | ||
3927 | |||
3928 | /* Now add positions for terms which appear in the updated row. */ | ||
3929 | return insertTerms(v, iRow, pValues); | ||
3930 | } | ||
3931 | |||
3932 | /*******************************************************************/ | ||
3933 | /* InteriorWriter is used to collect terms and block references into | ||
3934 | ** interior nodes in %_segments. See commentary at top of file for | ||
3935 | ** format. | ||
3936 | */ | ||
3937 | |||
3938 | /* How large interior nodes can grow. */ | ||
3939 | #define INTERIOR_MAX 2048 | ||
3940 | |||
3941 | /* Minimum number of terms per interior node (except the root). This | ||
3942 | ** prevents large terms from making the tree too skinny; it must be >0 | ||
3943 | ** so that the tree always makes progress. Note that the minimum tree | ||
3944 | ** fanout will be INTERIOR_MIN_TERMS+1. | ||
3945 | */ | ||
3946 | #define INTERIOR_MIN_TERMS 7 | ||
3947 | #if INTERIOR_MIN_TERMS<1 | ||
3948 | # error INTERIOR_MIN_TERMS must be greater than 0. | ||
3949 | #endif | ||
3950 | |||
3951 | /* ROOT_MAX controls how much data is stored inline in the segment | ||
3952 | ** directory. | ||
3953 | */ | ||
3954 | /* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's | ||
3955 | ** only here so that interiorWriterRootInfo() and leafWriterRootInfo() | ||
3956 | ** can both see it, but if the caller passed it in, we wouldn't even | ||
3957 | ** need a define. | ||
3958 | */ | ||
3959 | #define ROOT_MAX 1024 | ||
3960 | #if ROOT_MAX<VARINT_MAX*2 | ||
3961 | # error ROOT_MAX must have enough space for a header. | ||
3962 | #endif | ||
3963 | |||
3964 | /* InteriorBlock stores a linked-list of interior blocks while a lower | ||
3965 | ** layer is being constructed. | ||
3966 | */ | ||
3967 | typedef struct InteriorBlock { | ||
3968 | DataBuffer term; /* Leftmost term in block's subtree. */ | ||
3969 | DataBuffer data; /* Accumulated data for the block. */ | ||
3970 | struct InteriorBlock *next; | ||
3971 | } InteriorBlock; | ||
3972 | |||
3973 | static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock, | ||
3974 | const char *pTerm, int nTerm){ | ||
3975 | InteriorBlock *block = calloc(1, sizeof(InteriorBlock)); | ||
3976 | char c[VARINT_MAX+VARINT_MAX]; | ||
3977 | int n; | ||
3978 | |||
3979 | dataBufferInit(&block->term, 0); | ||
3980 | dataBufferReplace(&block->term, pTerm, nTerm); | ||
3981 | |||
3982 | n = putVarint(c, iHeight); | ||
3983 | n += putVarint(c+n, iChildBlock); | ||
3984 | dataBufferInit(&block->data, INTERIOR_MAX); | ||
3985 | dataBufferReplace(&block->data, c, n); | ||
3986 | |||
3987 | return block; | ||
3988 | } | ||
3989 | |||
3990 | #ifndef NDEBUG | ||
3991 | /* Verify that the data is readable as an interior node. */ | ||
3992 | static void interiorBlockValidate(InteriorBlock *pBlock){ | ||
3993 | const char *pData = pBlock->data.pData; | ||
3994 | int nData = pBlock->data.nData; | ||
3995 | int n, iDummy; | ||
3996 | sqlite_int64 iBlockid; | ||
3997 | |||
3998 | assert( nData>0 ); | ||
3999 | assert( pData!=0 ); | ||
4000 | assert( pData+nData>pData ); | ||
4001 | |||
4002 | /* Must lead with height of node as a varint(n), n>0 */ | ||
4003 | n = getVarint32(pData, &iDummy); | ||
4004 | assert( n>0 ); | ||
4005 | assert( iDummy>0 ); | ||
4006 | assert( n<nData ); | ||
4007 | pData += n; | ||
4008 | nData -= n; | ||
4009 | |||
4010 | /* Must contain iBlockid. */ | ||
4011 | n = getVarint(pData, &iBlockid); | ||
4012 | assert( n>0 ); | ||
4013 | assert( n<=nData ); | ||
4014 | pData += n; | ||
4015 | nData -= n; | ||
4016 | |||
4017 | /* Zero or more terms of positive length */ | ||
4018 | if( nData!=0 ){ | ||
4019 | /* First term is not delta-encoded. */ | ||
4020 | n = getVarint32(pData, &iDummy); | ||
4021 | assert( n>0 ); | ||
4022 | assert( iDummy>0 ); | ||
4023 | assert( n+iDummy>0); | ||
4024 | assert( n+iDummy<=nData ); | ||
4025 | pData += n+iDummy; | ||
4026 | nData -= n+iDummy; | ||
4027 | |||
4028 | /* Following terms delta-encoded. */ | ||
4029 | while( nData!=0 ){ | ||
4030 | /* Length of shared prefix. */ | ||
4031 | n = getVarint32(pData, &iDummy); | ||
4032 | assert( n>0 ); | ||
4033 | assert( iDummy>=0 ); | ||
4034 | assert( n<nData ); | ||
4035 | pData += n; | ||
4036 | nData -= n; | ||
4037 | |||
4038 | /* Length and data of distinct suffix. */ | ||
4039 | n = getVarint32(pData, &iDummy); | ||
4040 | assert( n>0 ); | ||
4041 | assert( iDummy>0 ); | ||
4042 | assert( n+iDummy>0); | ||
4043 | assert( n+iDummy<=nData ); | ||
4044 | pData += n+iDummy; | ||
4045 | nData -= n+iDummy; | ||
4046 | } | ||
4047 | } | ||
4048 | } | ||
4049 | #define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x) | ||
4050 | #else | ||
4051 | #define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 ) | ||
4052 | #endif | ||
4053 | |||
4054 | typedef struct InteriorWriter { | ||
4055 | int iHeight; /* from 0 at leaves. */ | ||
4056 | InteriorBlock *first, *last; | ||
4057 | struct InteriorWriter *parentWriter; | ||
4058 | |||
4059 | DataBuffer term; /* Last term written to block "last". */ | ||
4060 | sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */ | ||
4061 | #ifndef NDEBUG | ||
4062 | sqlite_int64 iLastChildBlock; /* for consistency checks. */ | ||
4063 | #endif | ||
4064 | } InteriorWriter; | ||
4065 | |||
4066 | /* Initialize an interior node where pTerm[nTerm] marks the leftmost | ||
4067 | ** term in the tree. iChildBlock is the leftmost child block at the | ||
4068 | ** next level down the tree. | ||
4069 | */ | ||
4070 | static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm, | ||
4071 | sqlite_int64 iChildBlock, | ||
4072 | InteriorWriter *pWriter){ | ||
4073 | InteriorBlock *block; | ||
4074 | assert( iHeight>0 ); | ||
4075 | CLEAR(pWriter); | ||
4076 | |||
4077 | pWriter->iHeight = iHeight; | ||
4078 | pWriter->iOpeningChildBlock = iChildBlock; | ||
4079 | #ifndef NDEBUG | ||
4080 | pWriter->iLastChildBlock = iChildBlock; | ||
4081 | #endif | ||
4082 | block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm); | ||
4083 | pWriter->last = pWriter->first = block; | ||
4084 | ASSERT_VALID_INTERIOR_BLOCK(pWriter->last); | ||
4085 | dataBufferInit(&pWriter->term, 0); | ||
4086 | } | ||
4087 | |||
4088 | /* Append the child node rooted at iChildBlock to the interior node, | ||
4089 | ** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree. | ||
4090 | */ | ||
4091 | static void interiorWriterAppend(InteriorWriter *pWriter, | ||
4092 | const char *pTerm, int nTerm, | ||
4093 | sqlite_int64 iChildBlock){ | ||
4094 | char c[VARINT_MAX+VARINT_MAX]; | ||
4095 | int n, nPrefix = 0; | ||
4096 | |||
4097 | ASSERT_VALID_INTERIOR_BLOCK(pWriter->last); | ||
4098 | |||
4099 | /* The first term written into an interior node is actually | ||
4100 | ** associated with the second child added (the first child was added | ||
4101 | ** in interiorWriterInit, or in the if clause at the bottom of this | ||
4102 | ** function). That term gets encoded straight up, with nPrefix left | ||
4103 | ** at 0. | ||
4104 | */ | ||
4105 | if( pWriter->term.nData==0 ){ | ||
4106 | n = putVarint(c, nTerm); | ||
4107 | }else{ | ||
4108 | while( nPrefix<pWriter->term.nData && | ||
4109 | pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){ | ||
4110 | nPrefix++; | ||
4111 | } | ||
4112 | |||
4113 | n = putVarint(c, nPrefix); | ||
4114 | n += putVarint(c+n, nTerm-nPrefix); | ||
4115 | } | ||
4116 | |||
4117 | #ifndef NDEBUG | ||
4118 | pWriter->iLastChildBlock++; | ||
4119 | #endif | ||
4120 | assert( pWriter->iLastChildBlock==iChildBlock ); | ||
4121 | |||
4122 | /* Overflow to a new block if the new term makes the current block | ||
4123 | ** too big, and the current block already has enough terms. | ||
4124 | */ | ||
4125 | if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX && | ||
4126 | iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){ | ||
4127 | pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock, | ||
4128 | pTerm, nTerm); | ||
4129 | pWriter->last = pWriter->last->next; | ||
4130 | pWriter->iOpeningChildBlock = iChildBlock; | ||
4131 | dataBufferReset(&pWriter->term); | ||
4132 | }else{ | ||
4133 | dataBufferAppend2(&pWriter->last->data, c, n, | ||
4134 | pTerm+nPrefix, nTerm-nPrefix); | ||
4135 | dataBufferReplace(&pWriter->term, pTerm, nTerm); | ||
4136 | } | ||
4137 | ASSERT_VALID_INTERIOR_BLOCK(pWriter->last); | ||
4138 | } | ||
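| /* Worked example (illustrative sketch): appending "applesauce" right | ||
| ** after "apple" encodes as | ||
| ** | ||
| **   varint(5) varint(5) "sauce" | ||
| ** | ||
| ** i.e. 5 bytes of prefix shared with the previous term plus a 5-byte | ||
| ** suffix.  The first term of a block is written as varint(nTerm) "term" | ||
| ** with no prefix count. | ||
| */ | ||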
4139 | |||
4140 | /* Free the space used by pWriter, including the linked-list of | ||
4141 | ** InteriorBlocks, and parentWriter, if present. | ||
4142 | */ | ||
4143 | static int interiorWriterDestroy(InteriorWriter *pWriter){ | ||
4144 | InteriorBlock *block = pWriter->first; | ||
4145 | |||
4146 | while( block!=NULL ){ | ||
4147 | InteriorBlock *b = block; | ||
4148 | block = block->next; | ||
4149 | dataBufferDestroy(&b->term); | ||
4150 | dataBufferDestroy(&b->data); | ||
4151 | free(b); | ||
4152 | } | ||
4153 | if( pWriter->parentWriter!=NULL ){ | ||
4154 | interiorWriterDestroy(pWriter->parentWriter); | ||
4155 | free(pWriter->parentWriter); | ||
4156 | } | ||
4157 | dataBufferDestroy(&pWriter->term); | ||
4158 | SCRAMBLE(pWriter); | ||
4159 | return SQLITE_OK; | ||
4160 | } | ||
4161 | |||
4162 | /* If pWriter can fit entirely in ROOT_MAX, return it as the root info | ||
4163 | ** directly, leaving *piEndBlockid unchanged. Otherwise, flush | ||
4164 | ** pWriter to %_segments, building a new layer of interior nodes, and | ||
4165 | ** recursively ask for their root info. | ||
4166 | */ | ||
4167 | static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter, | ||
4168 | char **ppRootInfo, int *pnRootInfo, | ||
4169 | sqlite_int64 *piEndBlockid){ | ||
4170 | InteriorBlock *block = pWriter->first; | ||
4171 | sqlite_int64 iBlockid = 0; | ||
4172 | int rc; | ||
4173 | |||
4174 | /* If we can fit the segment inline */ | ||
4175 | if( block==pWriter->last && block->data.nData<ROOT_MAX ){ | ||
4176 | *ppRootInfo = block->data.pData; | ||
4177 | *pnRootInfo = block->data.nData; | ||
4178 | return SQLITE_OK; | ||
4179 | } | ||
4180 | |||
4181 | /* Flush the first block to %_segments, and create a new level of | ||
4182 | ** interior node. | ||
4183 | */ | ||
4184 | ASSERT_VALID_INTERIOR_BLOCK(block); | ||
4185 | rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid); | ||
4186 | if( rc!=SQLITE_OK ) return rc; | ||
4187 | *piEndBlockid = iBlockid; | ||
4188 | |||
4189 | pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter)); | ||
| if( pWriter->parentWriter==0 ) return SQLITE_NOMEM; | ||
4190 | interiorWriterInit(pWriter->iHeight+1, | ||
4191 | block->term.pData, block->term.nData, | ||
4192 | iBlockid, pWriter->parentWriter); | ||
4193 | |||
4194 | /* Flush additional blocks and append to the higher interior | ||
4195 | ** node. | ||
4196 | */ | ||
4197 | for(block=block->next; block!=NULL; block=block->next){ | ||
4198 | ASSERT_VALID_INTERIOR_BLOCK(block); | ||
4199 | rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid); | ||
4200 | if( rc!=SQLITE_OK ) return rc; | ||
4201 | *piEndBlockid = iBlockid; | ||
4202 | |||
4203 | interiorWriterAppend(pWriter->parentWriter, | ||
4204 | block->term.pData, block->term.nData, iBlockid); | ||
4205 | } | ||
4206 | |||
4207 | /* Parent node gets the chance to be the root. */ | ||
4208 | return interiorWriterRootInfo(v, pWriter->parentWriter, | ||
4209 | ppRootInfo, pnRootInfo, piEndBlockid); | ||
4210 | } | ||
4211 | |||
4212 | /****************************************************************/ | ||
4213 | /* InteriorReader is used to read off the data from an interior node | ||
4214 | ** (see comment at top of file for the format). | ||
4215 | */ | ||
4216 | typedef struct InteriorReader { | ||
4217 | const char *pData; | ||
4218 | int nData; | ||
4219 | |||
4220 | DataBuffer term; /* previous term, for decoding term delta. */ | ||
4221 | |||
4222 | sqlite_int64 iBlockid; | ||
4223 | } InteriorReader; | ||
4224 | |||
4225 | static void interiorReaderDestroy(InteriorReader *pReader){ | ||
4226 | dataBufferDestroy(&pReader->term); | ||
4227 | SCRAMBLE(pReader); | ||
4228 | } | ||
4229 | |||
4230 | /* TODO(shess) The assertions are great, but what if we're in NDEBUG | ||
4231 | ** and the blob is empty or otherwise contains suspect data? | ||
4232 | */ | ||
4233 | static void interiorReaderInit(const char *pData, int nData, | ||
4234 | InteriorReader *pReader){ | ||
4235 | int n, nTerm; | ||
4236 | |||
4237 | /* Require at least the leading flag byte */ | ||
4238 | assert( nData>0 ); | ||
4239 | assert( pData[0]!='\0' ); | ||
4240 | |||
4241 | CLEAR(pReader); | ||
4242 | |||
4243 | /* Decode the base blockid, and set the cursor to the first term. */ | ||
4244 | n = getVarint(pData+1, &pReader->iBlockid); | ||
4245 | assert( 1+n<=nData ); | ||
4246 | pReader->pData = pData+1+n; | ||
4247 | pReader->nData = nData-(1+n); | ||
4248 | |||
4249 | /* A single-child interior node (such as when a leaf node was too | ||
4250 | ** large for the segment directory) won't have any terms. | ||
4251 | ** Otherwise, decode the first term. | ||
4252 | */ | ||
4253 | if( pReader->nData==0 ){ | ||
4254 | dataBufferInit(&pReader->term, 0); | ||
4255 | }else{ | ||
4256 | n = getVarint32(pReader->pData, &nTerm); | ||
4257 | dataBufferInit(&pReader->term, nTerm); | ||
4258 | dataBufferReplace(&pReader->term, pReader->pData+n, nTerm); | ||
4259 | assert( n+nTerm<=pReader->nData ); | ||
4260 | pReader->pData += n+nTerm; | ||
4261 | pReader->nData -= n+nTerm; | ||
4262 | } | ||
4263 | } | ||
4264 | |||
4265 | static int interiorReaderAtEnd(InteriorReader *pReader){ | ||
4266 | return pReader->term.nData==0; | ||
4267 | } | ||
4268 | |||
4269 | static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){ | ||
4270 | return pReader->iBlockid; | ||
4271 | } | ||
4272 | |||
4273 | static int interiorReaderTermBytes(InteriorReader *pReader){ | ||
4274 | assert( !interiorReaderAtEnd(pReader) ); | ||
4275 | return pReader->term.nData; | ||
4276 | } | ||
4277 | static const char *interiorReaderTerm(InteriorReader *pReader){ | ||
4278 | assert( !interiorReaderAtEnd(pReader) ); | ||
4279 | return pReader->term.pData; | ||
4280 | } | ||
4281 | |||
4282 | /* Step forward to the next term in the node. */ | ||
4283 | static void interiorReaderStep(InteriorReader *pReader){ | ||
4284 | assert( !interiorReaderAtEnd(pReader) ); | ||
4285 | |||
4286 | /* If the last term has been read, signal eof, else construct the | ||
4287 | ** next term. | ||
4288 | */ | ||
4289 | if( pReader->nData==0 ){ | ||
4290 | dataBufferReset(&pReader->term); | ||
4291 | }else{ | ||
4292 | int n, nPrefix, nSuffix; | ||
4293 | |||
4294 | n = getVarint32(pReader->pData, &nPrefix); | ||
4295 | n += getVarint32(pReader->pData+n, &nSuffix); | ||
4296 | |||
4297 | /* Truncate the current term and append suffix data. */ | ||
4298 | pReader->term.nData = nPrefix; | ||
4299 | dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix); | ||
4300 | |||
4301 | assert( n+nSuffix<=pReader->nData ); | ||
4302 | pReader->pData += n+nSuffix; | ||
4303 | pReader->nData -= n+nSuffix; | ||
4304 | } | ||
4305 | pReader->iBlockid++; | ||
4306 | } | ||
4307 | |||
4308 | /* Compare the current term to pTerm[nTerm], returning strcmp-style | ||
4309 | ** results. If isPrefix, equality means equal through nTerm bytes. | ||
4310 | */ | ||
4311 | static int interiorReaderTermCmp(InteriorReader *pReader, | ||
4312 | const char *pTerm, int nTerm, int isPrefix){ | ||
4313 | const char *pReaderTerm = interiorReaderTerm(pReader); | ||
4314 | int nReaderTerm = interiorReaderTermBytes(pReader); | ||
4315 | int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm; | ||
4316 | |||
4317 | if( n==0 ){ | ||
4318 | if( nReaderTerm>0 ) return -1; | ||
4319 | if( nTerm>0 ) return 1; | ||
4320 | return 0; | ||
4321 | } | ||
4322 | |||
4323 | c = memcmp(pReaderTerm, pTerm, n); | ||
4324 | if( c!=0 ) return c; | ||
4325 | if( isPrefix && n==nTerm ) return 0; | ||
4326 | return nReaderTerm - nTerm; | ||
4327 | } | ||
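
/* NOTE(editor) Quick illustration of the prefix semantics above
** (editorial, not original source).  With the reader positioned at
** "cattle":
**
**   interiorReaderTermCmp(r, "cat", 3, 0)  > 0   "cattle" is longer
**   interiorReaderTermCmp(r, "cat", 3, 1) == 0   equal through 3 bytes
**   interiorReaderTermCmp(r, "car", 3, 1)  > 0   't' > 'r' at byte 2
*/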
4328 | |||
4329 | /****************************************************************/ | ||
4330 | /* LeafWriter is used to collect terms and associated doclist data | ||
4331 | ** into leaf blocks in %_segments (see top of file for format info). | ||
4332 | ** Expected usage is: | ||
4333 | ** | ||
4334 | ** LeafWriter writer; | ||
4335 | ** leafWriterInit(0, 0, &writer); | ||
4336 | ** while( sorted_terms_left_to_process ){ | ||
4337 | ** // data is doclist data for that term. | ||
4338 | ** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData); | ||
4339 | ** if( rc!=SQLITE_OK ) goto err; | ||
4340 | ** } | ||
4341 | ** rc = leafWriterFinalize(v, &writer); | ||
4342 | **err: | ||
4343 | ** leafWriterDestroy(&writer); | ||
4344 | ** return rc; | ||
4345 | ** | ||
4346 | ** leafWriterStep() may write a collected leaf out to %_segments. | ||
4347 | ** leafWriterFinalize() finishes writing any buffered data and stores | ||
4348 | ** a root node in %_segdir. leafWriterDestroy() frees all buffers and | ||
4349 | ** InteriorWriters allocated as part of writing this segment. | ||
4350 | ** | ||
4351 | ** TODO(shess) Document leafWriterStepMerge(). | ||
4352 | */ | ||
4353 | |||
4354 | /* Put terms with data this big in their own block. */ | ||
4355 | #define STANDALONE_MIN 1024 | ||
4356 | |||
4357 | /* Keep leaf blocks below this size. */ | ||
4358 | #define LEAF_MAX 2048 | ||
4359 | |||
4360 | typedef struct LeafWriter { | ||
4361 | int iLevel; | ||
4362 | int idx; | ||
4363 | sqlite_int64 iStartBlockid; /* needed to create the root info */ | ||
4364 | sqlite_int64 iEndBlockid; /* when we're done writing. */ | ||
4365 | |||
4366 | DataBuffer term; /* previous encoded term */ | ||
4367 | DataBuffer data; /* encoding buffer */ | ||
4368 | |||
4369 | /* Bytes of the first term in the current node which distinguish that | ||
4370 | ** term from the last term of the previous node. | ||
4371 | */ | ||
4372 | int nTermDistinct; | ||
4373 | |||
4374 | InteriorWriter parentWriter; /* if we overflow */ | ||
4375 | int has_parent; | ||
4376 | } LeafWriter; | ||
4377 | |||
4378 | static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){ | ||
4379 | CLEAR(pWriter); | ||
4380 | pWriter->iLevel = iLevel; | ||
4381 | pWriter->idx = idx; | ||
4382 | |||
4383 | dataBufferInit(&pWriter->term, 32); | ||
4384 | |||
4385 | /* Start out with a reasonably sized block, though it can grow. */ | ||
4386 | dataBufferInit(&pWriter->data, LEAF_MAX); | ||
4387 | } | ||
4388 | |||
4389 | #ifndef NDEBUG | ||
4390 | /* Verify that the data is readable as a leaf node. */ | ||
4391 | static void leafNodeValidate(const char *pData, int nData){ | ||
4392 | int n, iDummy; | ||
4393 | |||
4394 | if( nData==0 ) return; | ||
4395 | assert( nData>0 ); | ||
4396 | assert( pData!=0 ); | ||
4397 | assert( pData+nData>pData ); | ||
4398 | |||
4399 | /* Must lead with a varint(0) */ | ||
4400 | n = getVarint32(pData, &iDummy); | ||
4401 | assert( iDummy==0 ); | ||
4402 | assert( n>0 ); | ||
4403 | assert( n<nData ); | ||
4404 | pData += n; | ||
4405 | nData -= n; | ||
4406 | |||
4407 | /* Leading term length and data must fit in buffer. */ | ||
4408 | n = getVarint32(pData, &iDummy); | ||
4409 | assert( n>0 ); | ||
4410 | assert( iDummy>0 ); | ||
4411 | assert( n+iDummy>0 ); | ||
4412 | assert( n+iDummy<nData ); | ||
4413 | pData += n+iDummy; | ||
4414 | nData -= n+iDummy; | ||
4415 | |||
4416 | /* Leading term's doclist length and data must fit. */ | ||
4417 | n = getVarint32(pData, &iDummy); | ||
4418 | assert( n>0 ); | ||
4419 | assert( iDummy>0 ); | ||
4420 | assert( n+iDummy>0 ); | ||
4421 | assert( n+iDummy<=nData ); | ||
4422 | ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL); | ||
4423 | pData += n+iDummy; | ||
4424 | nData -= n+iDummy; | ||
4425 | |||
4426 | /* Verify that trailing terms and doclists also are readable. */ | ||
4427 | while( nData!=0 ){ | ||
4428 | n = getVarint32(pData, &iDummy); | ||
4429 | assert( n>0 ); | ||
4430 | assert( iDummy>=0 ); | ||
4431 | assert( n<nData ); | ||
4432 | pData += n; | ||
4433 | nData -= n; | ||
4434 | n = getVarint32(pData, &iDummy); | ||
4435 | assert( n>0 ); | ||
4436 | assert( iDummy>0 ); | ||
4437 | assert( n+iDummy>0 ); | ||
4438 | assert( n+iDummy<nData ); | ||
4439 | pData += n+iDummy; | ||
4440 | nData -= n+iDummy; | ||
4441 | |||
4442 | n = getVarint32(pData, &iDummy); | ||
4443 | assert( n>0 ); | ||
4444 | assert( iDummy>0 ); | ||
4445 | assert( n+iDummy>0 ); | ||
4446 | assert( n+iDummy<=nData ); | ||
4447 | ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL); | ||
4448 | pData += n+iDummy; | ||
4449 | nData -= n+iDummy; | ||
4450 | } | ||
4451 | } | ||
4452 | #define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n) | ||
4453 | #else | ||
4454 | #define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 ) | ||
4455 | #endif | ||
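
/* NOTE(editor) Worked example of the leaf layout validated above
** (editorial sketch; doclist bytes elided as xx).  A leaf holding
** "cat" and "cow", each with a 2-byte doclist, is encoded as:
**
**   00               varint(0), the leaf-node header
**   03 63 61 74      varint(3), "cat"
**   02 xx xx         varint(2), doclist for "cat"
**   01 02 6f 77      varint(1) prefix, varint(2) suffix, "ow"
**   02 xx xx         varint(2), doclist for "cow"
*/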
4456 | |||
4457 | /* Flush the current leaf node to %_segments, adding the resulting | ||
4458 | ** blockid and the starting term to the interior node which will | ||
4459 | ** contain it. | ||
4460 | */ | ||
4461 | static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter, | ||
4462 | int iData, int nData){ | ||
4463 | sqlite_int64 iBlockid = 0; | ||
4464 | const char *pStartingTerm; | ||
4465 | int nStartingTerm, rc, n; | ||
4466 | |||
4467 | /* Must have the leading varint(0) flag, plus at least some | ||
4468 | ** valid-looking data. | ||
4469 | */ | ||
4470 | assert( nData>2 ); | ||
4471 | assert( iData>=0 ); | ||
4472 | assert( iData+nData<=pWriter->data.nData ); | ||
4473 | ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData); | ||
4474 | |||
4475 | rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid); | ||
4476 | if( rc!=SQLITE_OK ) return rc; | ||
4477 | assert( iBlockid!=0 ); | ||
4478 | |||
4479 | /* Reconstruct the first term in the leaf for purposes of building | ||
4480 | ** the interior node. | ||
4481 | */ | ||
4482 | n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm); | ||
4483 | pStartingTerm = pWriter->data.pData+iData+1+n; | ||
4484 | assert( pWriter->data.nData>iData+1+n+nStartingTerm ); | ||
4485 | assert( pWriter->nTermDistinct>0 ); | ||
4486 | assert( pWriter->nTermDistinct<=nStartingTerm ); | ||
4487 | nStartingTerm = pWriter->nTermDistinct; | ||
4488 | |||
4489 | if( pWriter->has_parent ){ | ||
4490 | interiorWriterAppend(&pWriter->parentWriter, | ||
4491 | pStartingTerm, nStartingTerm, iBlockid); | ||
4492 | }else{ | ||
4493 | interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid, | ||
4494 | &pWriter->parentWriter); | ||
4495 | pWriter->has_parent = 1; | ||
4496 | } | ||
4497 | |||
4498 | /* Track the span of this segment's leaf nodes. */ | ||
4499 | if( pWriter->iEndBlockid==0 ){ | ||
4500 | pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid; | ||
4501 | }else{ | ||
4502 | pWriter->iEndBlockid++; | ||
4503 | assert( iBlockid==pWriter->iEndBlockid ); | ||
4504 | } | ||
4505 | |||
4506 | return SQLITE_OK; | ||
4507 | } | ||
4508 | static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){ | ||
4509 | int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData); | ||
4510 | if( rc!=SQLITE_OK ) return rc; | ||
4511 | |||
4512 | /* Re-initialize the output buffer. */ | ||
4513 | dataBufferReset(&pWriter->data); | ||
4514 | |||
4515 | return SQLITE_OK; | ||
4516 | } | ||
4517 | |||
4518 | /* Fetch the root info for the segment. If the entire leaf fits | ||
4519 | ** within ROOT_MAX, then it will be returned directly, otherwise it | ||
4520 | ** will be flushed and the root info will be returned from the | ||
4521 | ** interior node. *piEndBlockid is set to the blockid of the last | ||
4522 | ** interior or leaf node written to disk (0 if none are written at | ||
4523 | ** all). | ||
4524 | */ | ||
4525 | static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter, | ||
4526 | char **ppRootInfo, int *pnRootInfo, | ||
4527 | sqlite_int64 *piEndBlockid){ | ||
4528 | /* we can fit the segment entirely inline */ | ||
4529 | if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){ | ||
4530 | *ppRootInfo = pWriter->data.pData; | ||
4531 | *pnRootInfo = pWriter->data.nData; | ||
4532 | *piEndBlockid = 0; | ||
4533 | return SQLITE_OK; | ||
4534 | } | ||
4535 | |||
4536 | /* Flush remaining leaf data. */ | ||
4537 | if( pWriter->data.nData>0 ){ | ||
4538 | int rc = leafWriterFlush(v, pWriter); | ||
4539 | if( rc!=SQLITE_OK ) return rc; | ||
4540 | } | ||
4541 | |||
4542 | /* We must have flushed a leaf at some point. */ | ||
4543 | assert( pWriter->has_parent ); | ||
4544 | |||
4545 | /* Tentatively set the end leaf blockid as the end blockid. If the | ||
4546 | ** interior node can be returned inline, this will be the final | ||
4547 | ** blockid, otherwise it will be overwritten by | ||
4548 | ** interiorWriterRootInfo(). | ||
4549 | */ | ||
4550 | *piEndBlockid = pWriter->iEndBlockid; | ||
4551 | |||
4552 | return interiorWriterRootInfo(v, &pWriter->parentWriter, | ||
4553 | ppRootInfo, pnRootInfo, piEndBlockid); | ||
4554 | } | ||
4555 | |||
4556 | /* Collect the rootInfo data and store it into the segment directory. | ||
4557 | ** This has the effect of flushing the segment's leaf data to | ||
4558 | ** %_segments, and also flushing any interior nodes to %_segments. | ||
4559 | */ | ||
4560 | static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){ | ||
4561 | sqlite_int64 iEndBlockid; | ||
4562 | char *pRootInfo; | ||
4563 | int rc, nRootInfo; | ||
4564 | |||
4565 | rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid); | ||
4566 | if( rc!=SQLITE_OK ) return rc; | ||
4567 | |||
4568 | /* Don't bother storing an entirely empty segment. */ | ||
4569 | if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK; | ||
4570 | |||
4571 | return segdir_set(v, pWriter->iLevel, pWriter->idx, | ||
4572 | pWriter->iStartBlockid, pWriter->iEndBlockid, | ||
4573 | iEndBlockid, pRootInfo, nRootInfo); | ||
4574 | } | ||
4575 | |||
4576 | static void leafWriterDestroy(LeafWriter *pWriter){ | ||
4577 | if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter); | ||
4578 | dataBufferDestroy(&pWriter->term); | ||
4579 | dataBufferDestroy(&pWriter->data); | ||
4580 | } | ||
4581 | |||
4582 | /* Encode a term into the leafWriter, delta-encoding as appropriate. | ||
4583 | ** Returns the length of the new term which distinguishes it from the | ||
4584 | ** previous term, which can be used to set nTermDistinct when a node | ||
4585 | ** boundary is crossed. | ||
4586 | */ | ||
4587 | static int leafWriterEncodeTerm(LeafWriter *pWriter, | ||
4588 | const char *pTerm, int nTerm){ | ||
4589 | char c[VARINT_MAX+VARINT_MAX]; | ||
4590 | int n, nPrefix = 0; | ||
4591 | |||
4592 | assert( nTerm>0 ); | ||
4593 | while( nPrefix<pWriter->term.nData && | ||
4594 | pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){ | ||
4595 | nPrefix++; | ||
4596 | /* Failing this implies that the terms weren't in order. */ | ||
4597 | assert( nPrefix<nTerm ); | ||
4598 | } | ||
4599 | |||
4600 | if( pWriter->data.nData==0 ){ | ||
4601 | /* Encode the node header and leading term as: | ||
4602 | ** varint(0) | ||
4603 | ** varint(nTerm) | ||
4604 | ** char pTerm[nTerm] | ||
4605 | */ | ||
4606 | n = putVarint(c, '\0'); | ||
4607 | n += putVarint(c+n, nTerm); | ||
4608 | dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm); | ||
4609 | }else{ | ||
4610 | /* Delta-encode the term as: | ||
4611 | ** varint(nPrefix) | ||
4612 | ** varint(nSuffix) | ||
4613 | ** char pTermSuffix[nSuffix] | ||
4614 | */ | ||
4615 | n = putVarint(c, nPrefix); | ||
4616 | n += putVarint(c+n, nTerm-nPrefix); | ||
4617 | dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix); | ||
4618 | } | ||
4619 | dataBufferReplace(&pWriter->term, pTerm, nTerm); | ||
4620 | |||
4621 | return nPrefix+1; | ||
4622 | } | ||
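
/* NOTE(editor) A minimal standalone sketch of the shared-prefix scan
** used above.  sharedPrefix() is a hypothetical helper for
** illustration only; it is not part of fts3.
*/
static int sharedPrefix(const char *a, int na, const char *b, int nb){
  int n = 0;
  while( n<na && n<nb && a[n]==b[n] ) n++;
  return n;
}
/* For the previous term "apple" followed by "apply", sharedPrefix()
** returns 4, so the new term is encoded as varint(4), varint(1), "y".
*/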
4623 | |||
4624 | /* Used to avoid a memmove when a large amount of doclist data is in | ||
4625 | ** the buffer. This constructs a node and term header before | ||
4626 | ** iDoclistData and flushes the resulting complete node using | ||
4627 | ** leafWriterInternalFlush(). | ||
4628 | */ | ||
4629 | static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter, | ||
4630 | const char *pTerm, int nTerm, | ||
4631 | int iDoclistData){ | ||
4632 | char c[VARINT_MAX+VARINT_MAX]; | ||
4633 | int iData, n = putVarint(c, 0); | ||
4634 | n += putVarint(c+n, nTerm); | ||
4635 | |||
4636 | /* There should always be room for the header. Even if pTerm shared | ||
4637 | ** a substantial prefix with the previous term, the entire prefix | ||
4638 | ** could be constructed from earlier data in the doclist, so there | ||
4639 | ** should be room. | ||
4640 | */ | ||
4641 | assert( iDoclistData>=n+nTerm ); | ||
4642 | |||
4643 | iData = iDoclistData-(n+nTerm); | ||
4644 | memcpy(pWriter->data.pData+iData, c, n); | ||
4645 | memcpy(pWriter->data.pData+iData+n, pTerm, nTerm); | ||
4646 | |||
4647 | return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData); | ||
4648 | } | ||
4649 | |||
4650 | /* Push pTerm[nTerm] along with the doclist data to the leaf layer of | ||
4651 | ** %_segments. | ||
4652 | */ | ||
4653 | static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter, | ||
4654 | const char *pTerm, int nTerm, | ||
4655 | DLReader *pReaders, int nReaders){ | ||
4656 | char c[VARINT_MAX+VARINT_MAX]; | ||
4657 | int iTermData = pWriter->data.nData, iDoclistData; | ||
4658 | int i, nData, n, nActualData, nActual, rc, nTermDistinct; | ||
4659 | |||
4660 | ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData); | ||
4661 | nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm); | ||
4662 | |||
4663 | /* Remember nTermDistinct if opening a new node. */ | ||
4664 | if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct; | ||
4665 | |||
4666 | iDoclistData = pWriter->data.nData; | ||
4667 | |||
4668 | /* Estimate the length of the merged doclist so we can leave space | ||
4669 | ** to encode it. | ||
4670 | */ | ||
4671 | for(i=0, nData=0; i<nReaders; i++){ | ||
4672 | nData += dlrAllDataBytes(&pReaders[i]); | ||
4673 | } | ||
4674 | n = putVarint(c, nData); | ||
4675 | dataBufferAppend(&pWriter->data, c, n); | ||
4676 | |||
4677 | docListMerge(&pWriter->data, pReaders, nReaders); | ||
4678 | ASSERT_VALID_DOCLIST(DL_DEFAULT, | ||
4679 | pWriter->data.pData+iDoclistData+n, | ||
4680 | pWriter->data.nData-iDoclistData-n, NULL); | ||
4681 | |||
4682 | /* The actual amount of doclist data at this point could be smaller | ||
4683 | ** than the length we encoded. Additionally, the space required to | ||
4684 | ** encode this length could be smaller. For small doclists, this is | ||
4685 | ** not a big deal; we can just use memmove() to adjust things. | ||
4686 | */ | ||
4687 | nActualData = pWriter->data.nData-(iDoclistData+n); | ||
4688 | nActual = putVarint(c, nActualData); | ||
4689 | assert( nActualData<=nData ); | ||
4690 | assert( nActual<=n ); | ||
4691 | |||
4692 | /* If the new doclist is big enough to force a standalone leaf | ||
4693 | ** node, we can immediately flush it inline without doing the | ||
4694 | ** memmove(). | ||
4695 | */ | ||
4696 | /* TODO(shess) This test matches leafWriterStep(), which does this | ||
4697 | ** test before it knows the cost to varint-encode the term and | ||
4698 | ** doclist lengths. At some point, change to | ||
4699 | ** pWriter->data.nData-iTermData>STANDALONE_MIN. | ||
4700 | */ | ||
4701 | if( nTerm+nActualData>STANDALONE_MIN ){ | ||
4702 | /* Push leaf node from before this term. */ | ||
4703 | if( iTermData>0 ){ | ||
4704 | rc = leafWriterInternalFlush(v, pWriter, 0, iTermData); | ||
4705 | if( rc!=SQLITE_OK ) return rc; | ||
4706 | |||
4707 | pWriter->nTermDistinct = nTermDistinct; | ||
4708 | } | ||
4709 | |||
4710 | /* Fix the encoded doclist length. */ | ||
4711 | iDoclistData += n - nActual; | ||
4712 | memcpy(pWriter->data.pData+iDoclistData, c, nActual); | ||
4713 | |||
4714 | /* Push the standalone leaf node. */ | ||
4715 | rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData); | ||
4716 | if( rc!=SQLITE_OK ) return rc; | ||
4717 | |||
4718 | /* Leave the node empty. */ | ||
4719 | dataBufferReset(&pWriter->data); | ||
4720 | |||
4721 | return rc; | ||
4722 | } | ||
4723 | |||
4724 | /* At this point, we know that the doclist was small, so do the | ||
4725 | ** memmove if indicated. | ||
4726 | */ | ||
4727 | if( nActual<n ){ | ||
4728 | memmove(pWriter->data.pData+iDoclistData+nActual, | ||
4729 | pWriter->data.pData+iDoclistData+n, | ||
4730 | pWriter->data.nData-(iDoclistData+n)); | ||
4731 | pWriter->data.nData -= n-nActual; | ||
4732 | } | ||
4733 | |||
4734 | /* Replace written length with actual length. */ | ||
4735 | memcpy(pWriter->data.pData+iDoclistData, c, nActual); | ||
4736 | |||
4737 | /* If the node is too large, break things up. */ | ||
4738 | /* TODO(shess) This test matches leafWriterStep(), which does this | ||
4739 | ** test before it knows the cost to varint-encode the term and | ||
4740 | ** doclist lengths. At some point, change to | ||
4741 | ** pWriter->data.nData>LEAF_MAX. | ||
4742 | */ | ||
4743 | if( iTermData+nTerm+nActualData>LEAF_MAX ){ | ||
4744 | /* Flush out the leading data as a node */ | ||
4745 | rc = leafWriterInternalFlush(v, pWriter, 0, iTermData); | ||
4746 | if( rc!=SQLITE_OK ) return rc; | ||
4747 | |||
4748 | pWriter->nTermDistinct = nTermDistinct; | ||
4749 | |||
4750 | /* Rebuild header using the current term */ | ||
4751 | n = putVarint(pWriter->data.pData, 0); | ||
4752 | n += putVarint(pWriter->data.pData+n, nTerm); | ||
4753 | memcpy(pWriter->data.pData+n, pTerm, nTerm); | ||
4754 | n += nTerm; | ||
4755 | |||
4756 | /* There should always be room, because the previous encoding | ||
4757 | ** included all data necessary to construct the term. | ||
4758 | */ | ||
4759 | assert( n<iDoclistData ); | ||
4760 | /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the | ||
4761 | ** following memcpy() is safe (as opposed to needing a memmove). | ||
4762 | */ | ||
4763 | assert( 2*STANDALONE_MIN<=LEAF_MAX ); | ||
4764 | assert( n+pWriter->data.nData-iDoclistData<iDoclistData ); | ||
4765 | memcpy(pWriter->data.pData+n, | ||
4766 | pWriter->data.pData+iDoclistData, | ||
4767 | pWriter->data.nData-iDoclistData); | ||
4768 | pWriter->data.nData -= iDoclistData-n; | ||
4769 | } | ||
4770 | ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData); | ||
4771 | |||
4772 | return SQLITE_OK; | ||
4773 | } | ||
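
/* NOTE(editor) Buffer-layout sketch for the function above
** (editorial).  After leafWriterEncodeTerm() and the estimate varint,
** pWriter->data is laid out as:
**
**   [0 .. iTermData)              earlier terms and their doclists
**   [iTermData .. iDoclistData)   this term's encoding (including the
**                                 node header if the buffer was empty)
**   [iDoclistData .. nData)       varint(estimate) + merged doclist
**
** The estimate varint is then shrunk in place to the actual doclist
** length, either via the memmove() or via the inline-flush path.
*/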
4774 | |||
4775 | /* Push pTerm[nTerm] along with the doclist data to the leaf layer of | ||
4776 | ** %_segments. | ||
4777 | */ | ||
4778 | /* TODO(shess) Revise writeZeroSegment() so that doclists are | ||
4779 | ** constructed directly in pWriter->data. | ||
4780 | */ | ||
4781 | static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter, | ||
4782 | const char *pTerm, int nTerm, | ||
4783 | const char *pData, int nData){ | ||
4784 | int rc; | ||
4785 | DLReader reader; | ||
4786 | |||
4787 | dlrInit(&reader, DL_DEFAULT, pData, nData); | ||
4788 | rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1); | ||
4789 | dlrDestroy(&reader); | ||
4790 | |||
4791 | return rc; | ||
4792 | } | ||
4793 | |||
4794 | |||
4795 | /****************************************************************/ | ||
4796 | /* LeafReader is used to iterate over an individual leaf node. */ | ||
4797 | typedef struct LeafReader { | ||
4798 | DataBuffer term; /* copy of current term. */ | ||
4799 | |||
4800 | const char *pData; /* data for current term. */ | ||
4801 | int nData; | ||
4802 | } LeafReader; | ||
4803 | |||
4804 | static void leafReaderDestroy(LeafReader *pReader){ | ||
4805 | dataBufferDestroy(&pReader->term); | ||
4806 | SCRAMBLE(pReader); | ||
4807 | } | ||
4808 | |||
4809 | static int leafReaderAtEnd(LeafReader *pReader){ | ||
4810 | return pReader->nData<=0; | ||
4811 | } | ||
4812 | |||
4813 | /* Access the current term. */ | ||
4814 | static int leafReaderTermBytes(LeafReader *pReader){ | ||
4815 | return pReader->term.nData; | ||
4816 | } | ||
4817 | static const char *leafReaderTerm(LeafReader *pReader){ | ||
4818 | assert( pReader->term.nData>0 ); | ||
4819 | return pReader->term.pData; | ||
4820 | } | ||
4821 | |||
4822 | /* Access the doclist data for the current term. */ | ||
4823 | static int leafReaderDataBytes(LeafReader *pReader){ | ||
4824 | int nData; | ||
4825 | assert( pReader->term.nData>0 ); | ||
4826 | getVarint32(pReader->pData, &nData); | ||
4827 | return nData; | ||
4828 | } | ||
4829 | static const char *leafReaderData(LeafReader *pReader){ | ||
4830 | int n, nData; | ||
4831 | assert( pReader->term.nData>0 ); | ||
4832 | n = getVarint32(pReader->pData, &nData); | ||
4833 | return pReader->pData+n; | ||
4834 | } | ||
4835 | |||
4836 | static void leafReaderInit(const char *pData, int nData, | ||
4837 | LeafReader *pReader){ | ||
4838 | int nTerm, n; | ||
4839 | |||
4840 | assert( nData>0 ); | ||
4841 | assert( pData[0]=='\0' ); | ||
4842 | |||
4843 | CLEAR(pReader); | ||
4844 | |||
4845 | /* Read the first term, skipping the header byte. */ | ||
4846 | n = getVarint32(pData+1, &nTerm); | ||
4847 | dataBufferInit(&pReader->term, nTerm); | ||
4848 | dataBufferReplace(&pReader->term, pData+1+n, nTerm); | ||
4849 | |||
4850 | /* Position after the first term. */ | ||
4851 | assert( 1+n+nTerm<nData ); | ||
4852 | pReader->pData = pData+1+n+nTerm; | ||
4853 | pReader->nData = nData-1-n-nTerm; | ||
4854 | } | ||
4855 | |||
4856 | /* Step the reader forward to the next term. */ | ||
4857 | static void leafReaderStep(LeafReader *pReader){ | ||
4858 | int n, nData, nPrefix, nSuffix; | ||
4859 | assert( !leafReaderAtEnd(pReader) ); | ||
4860 | |||
4861 | /* Skip previous entry's data block. */ | ||
4862 | n = getVarint32(pReader->pData, &nData); | ||
4863 | assert( n+nData<=pReader->nData ); | ||
4864 | pReader->pData += n+nData; | ||
4865 | pReader->nData -= n+nData; | ||
4866 | |||
4867 | if( !leafReaderAtEnd(pReader) ){ | ||
4868 | /* Construct the new term using a prefix from the old term plus a | ||
4869 | ** suffix from the leaf data. | ||
4870 | */ | ||
4871 | n = getVarint32(pReader->pData, &nPrefix); | ||
4872 | n += getVarint32(pReader->pData+n, &nSuffix); | ||
4873 | assert( n+nSuffix<pReader->nData ); | ||
4874 | pReader->term.nData = nPrefix; | ||
4875 | dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix); | ||
4876 | |||
4877 | pReader->pData += n+nSuffix; | ||
4878 | pReader->nData -= n+nSuffix; | ||
4879 | } | ||
4880 | } | ||
4881 | |||
4882 | /* strcmp-style comparison of pReader's current term against pTerm. | ||
4883 | ** If isPrefix, equality means equal through nTerm bytes. | ||
4884 | */ | ||
4885 | static int leafReaderTermCmp(LeafReader *pReader, | ||
4886 | const char *pTerm, int nTerm, int isPrefix){ | ||
4887 | int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm; | ||
4888 | if( n==0 ){ | ||
4889 | if( pReader->term.nData>0 ) return -1; | ||
4890 | if( nTerm>0 ) return 1; | ||
4891 | return 0; | ||
4892 | } | ||
4893 | |||
4894 | c = memcmp(pReader->term.pData, pTerm, n); | ||
4895 | if( c!=0 ) return c; | ||
4896 | if( isPrefix && n==nTerm ) return 0; | ||
4897 | return pReader->term.nData - nTerm; | ||
4898 | } | ||
4899 | |||
4900 | |||
4901 | /****************************************************************/ | ||
4902 | /* LeavesReader wraps LeafReader to allow iterating over the entire | ||
4903 | ** leaf layer of the tree. | ||
4904 | */ | ||
4905 | typedef struct LeavesReader { | ||
4906 | int idx; /* Index within the segment. */ | ||
4907 | |||
4908 | sqlite3_stmt *pStmt; /* Statement we're streaming leaves from. */ | ||
4909 | int eof; /* we've seen SQLITE_DONE from pStmt. */ | ||
4910 | |||
4911 | LeafReader leafReader; /* reader for the current leaf. */ | ||
4912 | DataBuffer rootData; /* root data for inline. */ | ||
4913 | } LeavesReader; | ||
4914 | |||
4915 | /* Access the current term. */ | ||
4916 | static int leavesReaderTermBytes(LeavesReader *pReader){ | ||
4917 | assert( !pReader->eof ); | ||
4918 | return leafReaderTermBytes(&pReader->leafReader); | ||
4919 | } | ||
4920 | static const char *leavesReaderTerm(LeavesReader *pReader){ | ||
4921 | assert( !pReader->eof ); | ||
4922 | return leafReaderTerm(&pReader->leafReader); | ||
4923 | } | ||
4924 | |||
4925 | /* Access the doclist data for the current term. */ | ||
4926 | static int leavesReaderDataBytes(LeavesReader *pReader){ | ||
4927 | assert( !pReader->eof ); | ||
4928 | return leafReaderDataBytes(&pReader->leafReader); | ||
4929 | } | ||
4930 | static const char *leavesReaderData(LeavesReader *pReader){ | ||
4931 | assert( !pReader->eof ); | ||
4932 | return leafReaderData(&pReader->leafReader); | ||
4933 | } | ||
4934 | |||
4935 | static int leavesReaderAtEnd(LeavesReader *pReader){ | ||
4936 | return pReader->eof; | ||
4937 | } | ||
4938 | |||
4939 | /* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus | ||
4940 | ** leaving the statement handle open, which locks the table. | ||
4941 | */ | ||
4942 | /* TODO(shess) This "solution" is not satisfactory. Really, there | ||
4943 | ** should be a check-in function for all statement handles which | ||
4944 | ** arranges to call sqlite3_reset(). This most likely will require | ||
4945 | ** modification to control flow all over the place, though, so for now | ||
4946 | ** just punt. | ||
4947 | ** | ||
4948 | ** Note that the current system assumes that segment merges will run | ||
4949 | ** to completion, which is why this particular problem probably hasn't | ||
4950 | ** arisen in this case. Probably a brittle assumption. | ||
4951 | */ | ||
4952 | static int leavesReaderReset(LeavesReader *pReader){ | ||
4953 | return sqlite3_reset(pReader->pStmt); | ||
4954 | } | ||
4955 | |||
4956 | static void leavesReaderDestroy(LeavesReader *pReader){ | ||
4957 | leafReaderDestroy(&pReader->leafReader); | ||
4958 | dataBufferDestroy(&pReader->rootData); | ||
4959 | SCRAMBLE(pReader); | ||
4960 | } | ||
4961 | |||
4962 | /* Initialize pReader with the given root data (if iStartBlockid==0 | ||
4963 | ** the leaf data was entirely contained in the root), or from the | ||
4964 | ** stream of blocks between iStartBlockid and iEndBlockid, inclusive. | ||
4965 | */ | ||
4966 | static int leavesReaderInit(fulltext_vtab *v, | ||
4967 | int idx, | ||
4968 | sqlite_int64 iStartBlockid, | ||
4969 | sqlite_int64 iEndBlockid, | ||
4970 | const char *pRootData, int nRootData, | ||
4971 | LeavesReader *pReader){ | ||
4972 | CLEAR(pReader); | ||
4973 | pReader->idx = idx; | ||
4974 | |||
4975 | dataBufferInit(&pReader->rootData, 0); | ||
4976 | if( iStartBlockid==0 ){ | ||
4977 | /* Entire leaf level fit in root data. */ | ||
4978 | dataBufferReplace(&pReader->rootData, pRootData, nRootData); | ||
4979 | leafReaderInit(pReader->rootData.pData, pReader->rootData.nData, | ||
4980 | &pReader->leafReader); | ||
4981 | }else{ | ||
4982 | sqlite3_stmt *s; | ||
4983 | int rc = sql_get_leaf_statement(v, idx, &s); | ||
4984 | if( rc!=SQLITE_OK ) return rc; | ||
4985 | |||
4986 | rc = sqlite3_bind_int64(s, 1, iStartBlockid); | ||
4987 | if( rc!=SQLITE_OK ) return rc; | ||
4988 | |||
4989 | rc = sqlite3_bind_int64(s, 2, iEndBlockid); | ||
4990 | if( rc!=SQLITE_OK ) return rc; | ||
4991 | |||
4992 | rc = sqlite3_step(s); | ||
4993 | if( rc==SQLITE_DONE ){ | ||
4994 | pReader->eof = 1; | ||
4995 | return SQLITE_OK; | ||
4996 | } | ||
4997 | if( rc!=SQLITE_ROW ) return rc; | ||
4998 | |||
4999 | pReader->pStmt = s; | ||
5000 | leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0), | ||
5001 | sqlite3_column_bytes(pReader->pStmt, 0), | ||
5002 | &pReader->leafReader); | ||
5003 | } | ||
5004 | return SQLITE_OK; | ||
5005 | } | ||
5006 | |||
5007 | /* Step the current leaf forward to the next term. If we reach the | ||
5008 | ** end of the current leaf, step forward to the next leaf block. | ||
5009 | */ | ||
5010 | static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){ | ||
5011 | assert( !leavesReaderAtEnd(pReader) ); | ||
5012 | leafReaderStep(&pReader->leafReader); | ||
5013 | |||
5014 | if( leafReaderAtEnd(&pReader->leafReader) ){ | ||
5015 | int rc; | ||
5016 | if( pReader->rootData.pData ){ | ||
5017 | pReader->eof = 1; | ||
5018 | return SQLITE_OK; | ||
5019 | } | ||
5020 | rc = sqlite3_step(pReader->pStmt); | ||
5021 | if( rc!=SQLITE_ROW ){ | ||
5022 | pReader->eof = 1; | ||
5023 | return rc==SQLITE_DONE ? SQLITE_OK : rc; | ||
5024 | } | ||
5025 | leafReaderDestroy(&pReader->leafReader); | ||
5026 | leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0), | ||
5027 | sqlite3_column_bytes(pReader->pStmt, 0), | ||
5028 | &pReader->leafReader); | ||
5029 | } | ||
5030 | return SQLITE_OK; | ||
5031 | } | ||
5032 | |||
5033 | /* Order LeavesReaders by their term, ignoring idx. Readers at eof | ||
5034 | ** always sort to the end. | ||
5035 | */ | ||
5036 | static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){ | ||
5037 | if( leavesReaderAtEnd(lr1) ){ | ||
5038 | if( leavesReaderAtEnd(lr2) ) return 0; | ||
5039 | return 1; | ||
5040 | } | ||
5041 | if( leavesReaderAtEnd(lr2) ) return -1; | ||
5042 | |||
5043 | return leafReaderTermCmp(&lr1->leafReader, | ||
5044 | leavesReaderTerm(lr2), leavesReaderTermBytes(lr2), | ||
5045 | 0); | ||
5046 | } | ||
5047 | |||
5048 | /* Similar to leavesReaderTermCmp(), with additional ordering by idx | ||
5049 | ** so that older segments sort before newer segments. | ||
5050 | */ | ||
5051 | static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){ | ||
5052 | int c = leavesReaderTermCmp(lr1, lr2); | ||
5053 | if( c!=0 ) return c; | ||
5054 | return lr1->idx-lr2->idx; | ||
5055 | } | ||
5056 | |||
5057 | /* Assume that pLr[1]..pLr[nLr-1] are sorted. Bubble pLr[0] into its | ||
5058 | ** sorted position. | ||
5059 | */ | ||
5060 | static void leavesReaderReorder(LeavesReader *pLr, int nLr){ | ||
5061 | while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){ | ||
5062 | LeavesReader tmp = pLr[0]; | ||
5063 | pLr[0] = pLr[1]; | ||
5064 | pLr[1] = tmp; | ||
5065 | nLr--; | ||
5066 | pLr++; | ||
5067 | } | ||
5068 | } | ||
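
/* NOTE(editor) An int-array analogue of leavesReaderReorder(), as an
** editorial illustration: given a[1..n-1] already sorted, bubble a[0]
** forward into place.
*/
static void reorderHead(int *a, int n){
  int i = 0;
  while( i+1<n && a[i]>a[i+1] ){
    int tmp = a[i];
    a[i] = a[i+1];
    a[i+1] = tmp;
    i++;
  }
}
/* e.g. {5,2,4,9} becomes {2,4,5,9}: 5 swaps past 2 and 4, then stops
** short of 9.
*/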
5069 | |||
5070 | /* Initializes pReaders with the segments from level iLevel, returning | ||
5071 | ** the number of segments in *piReaders. Leaves pReaders in sorted | ||
5072 | ** order. | ||
5073 | */ | ||
5074 | static int leavesReadersInit(fulltext_vtab *v, int iLevel, | ||
5075 | LeavesReader *pReaders, int *piReaders){ | ||
5076 | sqlite3_stmt *s; | ||
5077 | int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s); | ||
5078 | if( rc!=SQLITE_OK ) return rc; | ||
5079 | |||
5080 | rc = sqlite3_bind_int(s, 1, iLevel); | ||
5081 | if( rc!=SQLITE_OK ) return rc; | ||
5082 | |||
5083 | i = 0; | ||
5084 | while( (rc = sqlite3_step(s))==SQLITE_ROW ){ | ||
5085 | sqlite_int64 iStart = sqlite3_column_int64(s, 0); | ||
5086 | sqlite_int64 iEnd = sqlite3_column_int64(s, 1); | ||
5087 | const char *pRootData = sqlite3_column_blob(s, 2); | ||
5088 | int nRootData = sqlite3_column_bytes(s, 2); | ||
5089 | |||
5090 | assert( i<MERGE_COUNT ); | ||
5091 | rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData, | ||
5092 | &pReaders[i]); | ||
5093 | if( rc!=SQLITE_OK ) break; | ||
5094 | |||
5095 | i++; | ||
5096 | } | ||
5097 | if( rc!=SQLITE_DONE ){ | ||
5098 | while( i-->0 ){ | ||
5099 | leavesReaderDestroy(&pReaders[i]); | ||
5100 | } | ||
5101 | return rc; | ||
5102 | } | ||
5103 | |||
5104 | *piReaders = i; | ||
5105 | |||
5106 | /* Leave our results sorted by term, then age. */ | ||
5107 | while( i-- ){ | ||
5108 | leavesReaderReorder(pReaders+i, *piReaders-i); | ||
5109 | } | ||
5110 | return SQLITE_OK; | ||
5111 | } | ||
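
/* NOTE(editor) The final while( i-- ) loop above is an insertion sort
** run from the tail: each element is bubbled into the already-sorted
** suffix to its right.  O(n^2), but fine for at most MERGE_COUNT
** readers.
*/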
5112 | |||
5113 | /* Merge doclists from pReaders[0..nReaders-1] into a single doclist, which | ||
5114 | ** is written to pWriter. Assumes pReaders is ordered oldest to | ||
5115 | ** newest. | ||
5116 | */ | ||
5117 | /* TODO(shess) Consider putting this inline in segmentMerge(). */ | ||
5118 | static int leavesReadersMerge(fulltext_vtab *v, | ||
5119 | LeavesReader *pReaders, int nReaders, | ||
5120 | LeafWriter *pWriter){ | ||
5121 | DLReader dlReaders[MERGE_COUNT]; | ||
5122 | const char *pTerm = leavesReaderTerm(pReaders); | ||
5123 | int i, nTerm = leavesReaderTermBytes(pReaders); | ||
5124 | |||
5125 | assert( nReaders<=MERGE_COUNT ); | ||
5126 | |||
5127 | for(i=0; i<nReaders; i++){ | ||
5128 | dlrInit(&dlReaders[i], DL_DEFAULT, | ||
5129 | leavesReaderData(pReaders+i), | ||
5130 | leavesReaderDataBytes(pReaders+i)); | ||
5131 | } | ||
5132 | |||
5133 | return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders); | ||
5134 | } | ||
5135 | |||
5136 | /* Forward ref due to mutual recursion with segdirNextIndex(). */ | ||
5137 | static int segmentMerge(fulltext_vtab *v, int iLevel); | ||
5138 | |||
5139 | /* Put the next available index at iLevel into *pidx. If iLevel | ||
5140 | ** already has MERGE_COUNT segments, they are merged to a higher | ||
5141 | ** level to make room. | ||
5142 | */ | ||
5143 | static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){ | ||
5144 | int rc = segdir_max_index(v, iLevel, pidx); | ||
5145 | if( rc==SQLITE_DONE ){ /* No segments at iLevel. */ | ||
5146 | *pidx = 0; | ||
5147 | }else if( rc==SQLITE_ROW ){ | ||
5148 | if( *pidx==(MERGE_COUNT-1) ){ | ||
5149 | rc = segmentMerge(v, iLevel); | ||
5150 | if( rc!=SQLITE_OK ) return rc; | ||
5151 | *pidx = 0; | ||
5152 | }else{ | ||
5153 | (*pidx)++; | ||
5154 | } | ||
5155 | }else{ | ||
5156 | return rc; | ||
5157 | } | ||
5158 | return SQLITE_OK; | ||
5159 | } | ||
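
/* NOTE(editor) Editorial sketch of the level structure implied above:
** segment indexes behave like the digits of a base-MERGE_COUNT
** counter, where filling a level "carries" one merged segment into
** the level above.  aCount is a hypothetical per-level segment count,
** assumed large enough for the deepest level reached.
*/
static void addSegmentSketch(int *aCount, int iLevel){
  if( aCount[iLevel]==MERGE_COUNT ){
    aCount[iLevel] = 0;                  /* the full level merges away... */
    addSegmentSketch(aCount, iLevel+1);  /* ...into one segment above */
  }
  aCount[iLevel]++;
}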
5160 | |||
5161 | /* Merge MERGE_COUNT segments at iLevel into a new segment at | ||
5162 | ** iLevel+1. If iLevel+1 is already full of segments, those will be | ||
5163 | ** merged to make room. | ||
5164 | */ | ||
5165 | static int segmentMerge(fulltext_vtab *v, int iLevel){ | ||
5166 | LeafWriter writer; | ||
5167 | LeavesReader lrs[MERGE_COUNT]; | ||
5168 | int i, rc, idx = 0; | ||
5169 | |||
5170 | /* Determine the next available segment index at the next level, | ||
5171 | ** merging as necessary. | ||
5172 | */ | ||
5173 | rc = segdirNextIndex(v, iLevel+1, &idx); | ||
5174 | if( rc!=SQLITE_OK ) return rc; | ||
5175 | |||
5176 | /* TODO(shess) This assumes that we'll always see exactly | ||
5177 | ** MERGE_COUNT segments to merge at a given level. That will be | ||
5178 | ** broken if we allow the developer to request preemptive or | ||
5179 | ** deferred merging. | ||
5180 | */ | ||
5181 | memset(&lrs, '\0', sizeof(lrs)); | ||
5182 | rc = leavesReadersInit(v, iLevel, lrs, &i); | ||
5183 | if( rc!=SQLITE_OK ) return rc; | ||
5184 | assert( i==MERGE_COUNT ); | ||
5185 | |||
5186 | leafWriterInit(iLevel+1, idx, &writer); | ||
5187 | |||
5188 | /* Since leavesReaderReorder() pushes readers at eof to the end, | ||
5189 | ** when the first reader is empty, all will be empty. | ||
5190 | */ | ||
5191 | while( !leavesReaderAtEnd(lrs) ){ | ||
5192 | /* Figure out how many readers share their next term. */ | ||
5193 | for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){ | ||
5194 | if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break; | ||
5195 | } | ||
5196 | |||
5197 | rc = leavesReadersMerge(v, lrs, i, &writer); | ||
5198 | if( rc!=SQLITE_OK ) goto err; | ||
5199 | |||
5200 | /* Step forward those that were merged. */ | ||
5201 | while( i-->0 ){ | ||
5202 | rc = leavesReaderStep(v, lrs+i); | ||
5203 | if( rc!=SQLITE_OK ) goto err; | ||
5204 | |||
5205 | /* Reorder by term, then by age. */ | ||
5206 | leavesReaderReorder(lrs+i, MERGE_COUNT-i); | ||
5207 | } | ||
5208 | } | ||
5209 | |||
5210 | for(i=0; i<MERGE_COUNT; i++){ | ||
5211 | leavesReaderDestroy(&lrs[i]); | ||
5212 | } | ||
5213 | |||
5214 | rc = leafWriterFinalize(v, &writer); | ||
5215 | leafWriterDestroy(&writer); | ||
5216 | if( rc!=SQLITE_OK ) return rc; | ||
5217 | |||
5218 | /* Delete the merged segment data. */ | ||
5219 | return segdir_delete(v, iLevel); | ||
5220 | |||
5221 | err: | ||
5222 | for(i=0; i<MERGE_COUNT; i++){ | ||
5223 | leavesReaderDestroy(&lrs[i]); | ||
5224 | } | ||
5225 | leafWriterDestroy(&writer); | ||
5226 | return rc; | ||
5227 | } | ||
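
/* NOTE(editor) Editorial trace of the merge loop above.  With readers
** positioned at terms ("cat", "cat", "cow", ...), the inner scan
** stops at i==2 readers sharing "cat".  Their doclists are merged
** oldest-to-newest so newer hits win per docid, both readers step
** forward, and leavesReaderReorder() restores the invariant that the
** smallest remaining term is at lrs[0].
*/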
5228 | |||
5229 | /* Scan pReader for pTerm/nTerm, and merge the term's doclist over | ||
5230 | ** *out (any doclists with duplicate docids overwrite those in *out). | ||
5231 | ** Internal function for loadSegmentLeaf() and loadSegmentLeaves(). | ||
5232 | */ | ||
5233 | static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader, | ||
5234 | const char *pTerm, int nTerm, int isPrefix, | ||
5235 | DataBuffer *out){ | ||
5236 | assert( nTerm>0 ); | ||
5237 | |||
5238 | /* Process while the prefix matches. */ | ||
5239 | while( !leavesReaderAtEnd(pReader) ){ | ||
5240 | /* TODO(shess) Really want leavesReaderTermCmp(), but that name is | ||
5241 | ** already taken to compare the terms of two LeavesReaders. Think | ||
5242 | ** on a better name. [Meanwhile, break encapsulation rather than | ||
5243 | ** use a confusing name.] | ||
5244 | */ | ||
5245 | int rc; | ||
5246 | int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix); | ||
5247 | if( c==0 ){ | ||
5248 | const char *pData = leavesReaderData(pReader); | ||
5249 | int nData = leavesReaderDataBytes(pReader); | ||
5250 | if( out->nData==0 ){ | ||
5251 | dataBufferReplace(out, pData, nData); | ||
5252 | }else{ | ||
5253 | DataBuffer result; | ||
5254 | dataBufferInit(&result, out->nData+nData); | ||
5255 | docListUnion(out->pData, out->nData, pData, nData, &result); | ||
5256 | dataBufferDestroy(out); | ||
5257 | *out = result; | ||
5258 | /* TODO(shess) Rather than destroy out, we could retain it for | ||
5259 | ** later reuse. | ||
5260 | */ | ||
5261 | } | ||
5262 | } | ||
5263 | if( c>0 ) break; /* Past any possible matches. */ | ||
5264 | |||
5265 | rc = leavesReaderStep(v, pReader); | ||
5266 | if( rc!=SQLITE_OK ) return rc; | ||
5267 | } | ||
5268 | return SQLITE_OK; | ||
5269 | } | ||
5270 | |||
5271 | /* Call loadSegmentLeavesInt() with pData/nData as input. */ | ||
5272 | static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData, | ||
5273 | const char *pTerm, int nTerm, int isPrefix, | ||
5274 | DataBuffer *out){ | ||
5275 | LeavesReader reader; | ||
5276 | int rc; | ||
5277 | |||
5278 | assert( nData>1 ); | ||
5279 | assert( *pData=='\0' ); | ||
5280 | rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader); | ||
5281 | if( rc!=SQLITE_OK ) return rc; | ||
5282 | |||
5283 | rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); | ||
5284 | leavesReaderReset(&reader); | ||
5285 | leavesReaderDestroy(&reader); | ||
5286 | return rc; | ||
5287 | } | ||
5288 | |||
5289 | /* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to | ||
5290 | ** iEndLeaf (inclusive) as input, and merge the resulting doclist into | ||
5291 | ** out. | ||
5292 | */ | ||
5293 | static int loadSegmentLeaves(fulltext_vtab *v, | ||
5294 | sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf, | ||
5295 | const char *pTerm, int nTerm, int isPrefix, | ||
5296 | DataBuffer *out){ | ||
5297 | int rc; | ||
5298 | LeavesReader reader; | ||
5299 | |||
5300 | assert( iStartLeaf<=iEndLeaf ); | ||
5301 | rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader); | ||
5302 | if( rc!=SQLITE_OK ) return rc; | ||
5303 | |||
5304 | rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); | ||
5305 | leavesReaderReset(&reader); | ||
5306 | leavesReaderDestroy(&reader); | ||
5307 | return rc; | ||
5308 | } | ||
5309 | |||
5310 | /* Taking pData/nData as an interior node, find the sequence of child | ||
5311 | ** nodes which could include pTerm/nTerm/isPrefix. Note that the | ||
5312 | ** interior node terms logically come between the blocks, so there is | ||
5313 | ** one more blockid than there are terms (that block contains terms >= | ||
5314 | ** the last interior-node term). | ||
5315 | */ | ||
5316 | /* TODO(shess) The calling code may already know that the end child is | ||
5317 | ** not worth calculating, because the end may be in a later sibling | ||
5318 | ** node. Consider whether breaking symmetry is worthwhile. I suspect | ||
5319 | ** it's not worthwhile. | ||
5320 | */ | ||
5321 | static void getChildrenContaining(const char *pData, int nData, | ||
5322 | const char *pTerm, int nTerm, int isPrefix, | ||
5323 | sqlite_int64 *piStartChild, | ||
5324 | sqlite_int64 *piEndChild){ | ||
5325 | InteriorReader reader; | ||
5326 | |||
5327 | assert( nData>1 ); | ||
5328 | assert( *pData!='\0' ); | ||
5329 | interiorReaderInit(pData, nData, &reader); | ||
5330 | |||
5331 | /* Scan for the first child which could contain pTerm/nTerm. */ | ||
5332 | while( !interiorReaderAtEnd(&reader) ){ | ||
5333 | if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break; | ||
5334 | interiorReaderStep(&reader); | ||
5335 | } | ||
5336 | *piStartChild = interiorReaderCurrentBlockid(&reader); | ||
5337 | |||
5338 | /* Keep scanning to find a term greater than our term, using prefix | ||
5339 | ** comparison if indicated. If isPrefix is false, this will be the | ||
5340 | ** same blockid as the starting block. | ||
5341 | */ | ||
5342 | while( !interiorReaderAtEnd(&reader) ){ | ||
5343 | if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break; | ||
5344 | interiorReaderStep(&reader); | ||
5345 | } | ||
5346 | *piEndChild = interiorReaderCurrentBlockid(&reader); | ||
5347 | |||
5348 | interiorReaderDestroy(&reader); | ||
5349 | |||
5350 | /* Children must ascend, and if !prefix, both must be the same. */ | ||
5351 | assert( *piEndChild>=*piStartChild ); | ||
5352 | assert( isPrefix || *piStartChild==*piEndChild ); | ||
5353 | } | ||
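
/* NOTE(editor) Worked example (editorial).  Using the interior node
** sketched earlier -- children 20..22 separated by "cat" and "cow":
**
**   pTerm="cow", isPrefix=0  ->  *piStartChild==*piEndChild==22
**     (terms >= "cow" can only live in the last child)
**   pTerm="ca",  isPrefix=1  ->  *piStartChild==20, *piEndChild==21
**     (terms matching the prefix may sort below or above "cat")
*/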
5354 | |||
5355 | /* Read block at iBlockid and pass it with other params to | ||
5356 | ** getChildrenContaining(). | ||
5357 | */ | ||
5358 | static int loadAndGetChildrenContaining( | ||
5359 | fulltext_vtab *v, | ||
5360 | sqlite_int64 iBlockid, | ||
5361 | const char *pTerm, int nTerm, int isPrefix, | ||
5362 | sqlite_int64 *piStartChild, sqlite_int64 *piEndChild | ||
5363 | ){ | ||
5364 | sqlite3_stmt *s = NULL; | ||
5365 | int rc; | ||
5366 | |||
5367 | assert( iBlockid!=0 ); | ||
5368 | assert( pTerm!=NULL ); | ||
5369 | assert( nTerm!=0 ); /* TODO(shess) Why not allow this? */ | ||
5370 | assert( piStartChild!=NULL ); | ||
5371 | assert( piEndChild!=NULL ); | ||
5372 | |||
5373 | rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s); | ||
5374 | if( rc!=SQLITE_OK ) return rc; | ||
5375 | |||
5376 | rc = sqlite3_bind_int64(s, 1, iBlockid); | ||
5377 | if( rc!=SQLITE_OK ) return rc; | ||
5378 | |||
5379 | rc = sqlite3_step(s); | ||
5380 | if( rc==SQLITE_DONE ) return SQLITE_ERROR; | ||
5381 | if( rc!=SQLITE_ROW ) return rc; | ||
5382 | |||
5383 | getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0), | ||
5384 | pTerm, nTerm, isPrefix, piStartChild, piEndChild); | ||
5385 | |||
5386 | /* We expect only one row. We must execute another sqlite3_step() | ||
5387 | * to complete the iteration; otherwise the table will remain | ||
5388 | * locked. */ | ||
5389 | rc = sqlite3_step(s); | ||
5390 | if( rc==SQLITE_ROW ) return SQLITE_ERROR; | ||
5391 | if( rc!=SQLITE_DONE ) return rc; | ||
5392 | |||
5393 | return SQLITE_OK; | ||
5394 | } | ||
5395 | |||
5396 | /* Traverse the tree represented by pData[nData] looking for | ||
5397 | ** pTerm[nTerm], placing its doclist into *out. This is internal to | ||
5398 | ** loadSegment() to make error-handling cleaner. | ||
5399 | */ | ||
5400 | static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData, | ||
5401 | sqlite_int64 iLeavesEnd, | ||
5402 | const char *pTerm, int nTerm, int isPrefix, | ||
5403 | DataBuffer *out){ | ||
5404 | /* Special case where root is a leaf. */ | ||
5405 | if( *pData=='\0' ){ | ||
5406 | return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out); | ||
5407 | }else{ | ||
5408 | int rc; | ||
5409 | sqlite_int64 iStartChild, iEndChild; | ||
5410 | |||
5411 | /* Process pData as an interior node, then loop down the tree | ||
5412 | ** until we find the set of leaf nodes to scan for the term. | ||
5413 | */ | ||
5414 | getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix, | ||
5415 | &iStartChild, &iEndChild); | ||
5416 | while( iStartChild>iLeavesEnd ){ | ||
5417 | sqlite_int64 iNextStart, iNextEnd; | ||
5418 | rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix, | ||
5419 | &iNextStart, &iNextEnd); | ||
5420 | if( rc!=SQLITE_OK ) return rc; | ||
5421 | |||
5422 | /* If we've branched, follow the end branch, too. */ | ||
5423 | if( iStartChild!=iEndChild ){ | ||
5424 | sqlite_int64 iDummy; | ||
5425 | rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix, | ||
5426 | &iDummy, &iNextEnd); | ||
5427 | if( rc!=SQLITE_OK ) return rc; | ||
5428 | } | ||
5429 | |||
5430 | assert( iNextStart<=iNextEnd ); | ||
5431 | iStartChild = iNextStart; | ||
5432 | iEndChild = iNextEnd; | ||
5433 | } | ||
5434 | assert( iStartChild<=iLeavesEnd ); | ||
5435 | assert( iEndChild<=iLeavesEnd ); | ||
5436 | |||
5437 | /* Scan through the leaf segments for doclists. */ | ||
5438 | return loadSegmentLeaves(v, iStartChild, iEndChild, | ||
5439 | pTerm, nTerm, isPrefix, out); | ||
5440 | } | ||
5441 | } | ||
5442 | |||
5443 | /* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then | ||
5444 | ** merge its doclist over *out (any duplicate doclists read from the | ||
5445 | ** segment rooted at pData will overwrite those in *out). | ||
5446 | */ | ||
5447 | /* TODO(shess) Consider changing this to determine the depth of the | ||
5448 | ** leaves using either the first characters of interior nodes (when | ||
5449 | ** ==1, we're one level above the leaves), or the first character of | ||
5450 | ** the root (which will describe the height of the tree directly). | ||
5451 | ** Either feels somewhat tricky to me. | ||
5452 | */ | ||
5453 | /* TODO(shess) The current merge is likely to be slow for large | ||
5454 | ** doclists (though it should process from newest/smallest to | ||
5455 | ** oldest/largest, so it may not be that bad). It might be useful to | ||
5456 | ** modify things to allow for N-way merging. This could either be | ||
5457 | ** within a segment, with pairwise merges across segments, or across | ||
5458 | ** all segments at once. | ||
5459 | */ | ||
5460 | static int loadSegment(fulltext_vtab *v, const char *pData, int nData, | ||
5461 | sqlite_int64 iLeavesEnd, | ||
5462 | const char *pTerm, int nTerm, int isPrefix, | ||
5463 | DataBuffer *out){ | ||
5464 | DataBuffer result; | ||
5465 | int rc; | ||
5466 | |||
5467 | assert( nData>1 ); | ||
5468 | |||
5469 | /* This code should never be called with buffered updates. */ | ||
5470 | assert( v->nPendingData<0 ); | ||
5471 | |||
5472 | dataBufferInit(&result, 0); | ||
5473 | rc = loadSegmentInt(v, pData, nData, iLeavesEnd, | ||
5474 | pTerm, nTerm, isPrefix, &result); | ||
5475 | if( rc==SQLITE_OK && result.nData>0 ){ | ||
5476 | if( out->nData==0 ){ | ||
5477 | DataBuffer tmp = *out; | ||
5478 | *out = result; | ||
5479 | result = tmp; | ||
5480 | }else{ | ||
5481 | DataBuffer merged; | ||
5482 | DLReader readers[2]; | ||
5483 | |||
5484 | dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData); | ||
5485 | dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData); | ||
5486 | dataBufferInit(&merged, out->nData+result.nData); | ||
5487 | docListMerge(&merged, readers, 2); | ||
5488 | dataBufferDestroy(out); | ||
5489 | *out = merged; | ||
5490 | dlrDestroy(&readers[0]); | ||
5491 | dlrDestroy(&readers[1]); | ||
5492 | } | ||
5493 | } | ||
5494 | dataBufferDestroy(&result); | ||
5495 | return rc; | ||
5496 | } | ||
5497 | |||
5498 | /* Scan the database and merge together the posting lists for the term | ||
5499 | ** into *out. | ||
5500 | */ | ||
5501 | static int termSelect(fulltext_vtab *v, int iColumn, | ||
5502 | const char *pTerm, int nTerm, int isPrefix, | ||
5503 | DocListType iType, DataBuffer *out){ | ||
5504 | DataBuffer doclist; | ||
5505 | sqlite3_stmt *s; | ||
5506 | int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s); | ||
5507 | if( rc!=SQLITE_OK ) return rc; | ||
5508 | |||
5509 | /* This code should never be called with buffered updates. */ | ||
5510 | assert( v->nPendingData<0 ); | ||
5511 | |||
5512 | dataBufferInit(&doclist, 0); | ||
5513 | |||
5514 | /* Traverse the segments from oldest to newest so that newer doclist | ||
5515 | ** elements for given docids overwrite older elements. | ||
5516 | */ | ||
5517 | while( (rc = sqlite3_step(s))==SQLITE_ROW ){ | ||
5518 | const char *pData = sqlite3_column_blob(s, 0); | ||
5519 | const int nData = sqlite3_column_bytes(s, 0); | ||
5520 | const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1); | ||
5521 | rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix, | ||
5522 | &doclist); | ||
5523 | if( rc!=SQLITE_OK ) goto err; | ||
5524 | } | ||
5525 | if( rc==SQLITE_DONE ){ | ||
5526 | if( doclist.nData!=0 ){ | ||
5527 | /* TODO(shess) The old term_select_all() code applied the column | ||
5528 | ** restrict as we merged segments, leading to smaller buffers. | ||
5529 | ** This is probably worthwhile to bring back, once the new storage | ||
5530 | ** system is checked in. | ||
5531 | */ | ||
5532 | if( iColumn==v->nColumn ) iColumn = -1; | ||
5533 | docListTrim(DL_DEFAULT, doclist.pData, doclist.nData, | ||
5534 | iColumn, iType, out); | ||
5535 | } | ||
5536 | rc = SQLITE_OK; | ||
5537 | } | ||
5538 | |||
5539 | err: | ||
5540 | dataBufferDestroy(&doclist); | ||
5541 | return rc; | ||
5542 | } | ||
5543 | |||
5544 | /****************************************************************/ | ||
5545 | /* Used to hold hashtable data for sorting. */ | ||
5546 | typedef struct TermData { | ||
5547 | const char *pTerm; | ||
5548 | int nTerm; | ||
5549 | DLCollector *pCollector; | ||
5550 | } TermData; | ||
5551 | |||
5552 | /* Orders TermData elements in strcmp fashion ( <0 for less-than, 0 | ||
5553 | ** for equal, >0 for greater-than). | ||
5554 | */ | ||
5555 | static int termDataCmp(const void *av, const void *bv){ | ||
5556 | const TermData *a = (const TermData *)av; | ||
5557 | const TermData *b = (const TermData *)bv; | ||
5558 | int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm; | ||
5559 | int c = memcmp(a->pTerm, b->pTerm, n); | ||
5560 | if( c!=0 ) return c; | ||
5561 | return a->nTerm-b->nTerm; | ||
5562 | } | ||
5563 | |||
5564 | /* Order pTerms data by term, then write a new level 0 segment using | ||
5565 | ** LeafWriter. | ||
5566 | */ | ||
5567 | static int writeZeroSegment(fulltext_vtab *v, fts3Hash *pTerms){ | ||
5568 | fts3HashElem *e; | ||
5569 | int idx, rc, i, n; | ||
5570 | TermData *pData; | ||
5571 | LeafWriter writer; | ||
5572 | DataBuffer dl; | ||
5573 | |||
5574 | /* Determine the next index at level 0, merging as necessary. */ | ||
5575 | rc = segdirNextIndex(v, 0, &idx); | ||
5576 | if( rc!=SQLITE_OK ) return rc; | ||
5577 | |||
5578 | n = fts3HashCount(pTerms); | ||
5579 | pData = malloc(n*sizeof(TermData)); | ||
5579 | if( pData==NULL ) return SQLITE_NOMEM; | ||
5580 | |||
5581 | for(i = 0, e = fts3HashFirst(pTerms); e; i++, e = fts3HashNext(e)){ | ||
5582 | assert( i<n ); | ||
5583 | pData[i].pTerm = fts3HashKey(e); | ||
5584 | pData[i].nTerm = fts3HashKeysize(e); | ||
5585 | pData[i].pCollector = fts3HashData(e); | ||
5586 | } | ||
5587 | assert( i==n ); | ||
5588 | |||
5589 | /* TODO(shess) Should we allow user-defined collation sequences | ||
5590 | ** here? I think we only need that once we support prefix searches. | ||
5591 | */ | ||
5592 | if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp); | ||
5593 | |||
5594 | /* TODO(shess) Refactor so that we can write directly to the segment | ||
5595 | ** DataBuffer, as happens for segment merges. | ||
5596 | */ | ||
5597 | leafWriterInit(0, idx, &writer); | ||
5598 | dataBufferInit(&dl, 0); | ||
5599 | for(i=0; i<n; i++){ | ||
5600 | dataBufferReset(&dl); | ||
5601 | dlcAddDoclist(pData[i].pCollector, &dl); | ||
5602 | rc = leafWriterStep(v, &writer, | ||
5603 | pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData); | ||
5604 | if( rc!=SQLITE_OK ) goto err; | ||
5605 | } | ||
5606 | rc = leafWriterFinalize(v, &writer); | ||
5607 | |||
5608 | err: | ||
5609 | dataBufferDestroy(&dl); | ||
5610 | free(pData); | ||
5611 | leafWriterDestroy(&writer); | ||
5612 | return rc; | ||
5613 | } | ||
5614 | |||
5615 | /* If pendingTerms has data, free it. */ | ||
5616 | static int clearPendingTerms(fulltext_vtab *v){ | ||
5617 | if( v->nPendingData>=0 ){ | ||
5618 | fts3HashElem *e; | ||
5619 | for(e=fts3HashFirst(&v->pendingTerms); e; e=fts3HashNext(e)){ | ||
5620 | dlcDelete(fts3HashData(e)); | ||
5621 | } | ||
5622 | fts3HashClear(&v->pendingTerms); | ||
5623 | v->nPendingData = -1; | ||
5624 | } | ||
5625 | return SQLITE_OK; | ||
5626 | } | ||
5627 | |||
5628 | /* If pendingTerms has data, flush it to a level-zero segment, and | ||
5629 | ** free it. | ||
5630 | */ | ||
5631 | static int flushPendingTerms(fulltext_vtab *v){ | ||
5632 | if( v->nPendingData>=0 ){ | ||
5633 | int rc = writeZeroSegment(v, &v->pendingTerms); | ||
5634 | if( rc==SQLITE_OK ) clearPendingTerms(v); | ||
5635 | return rc; | ||
5636 | } | ||
5637 | return SQLITE_OK; | ||
5638 | } | ||
5639 | |||
5640 | /* If pendingTerms is "too big", or docid is out of order, flush it. | ||
5641 | ** Regardless, be certain that pendingTerms is initialized for use. | ||
5642 | */ | ||
5643 | static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){ | ||
5644 | /* TODO(shess) Explore whether partially flushing the buffer on | ||
5645 | ** forced-flush would provide better performance. I suspect that if | ||
5646 | ** we ordered the doclists by size and flushed the largest until the | ||
5647 | ** buffer was half empty, that would let the less frequent terms | ||
5648 | ** generate longer doclists. | ||
5649 | */ | ||
5650 | if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){ | ||
5651 | int rc = flushPendingTerms(v); | ||
5652 | if( rc!=SQLITE_OK ) return rc; | ||
5653 | } | ||
5654 | if( v->nPendingData<0 ){ | ||
5655 | fts3HashInit(&v->pendingTerms, FTS3_HASH_STRING, 1); | ||
5656 | v->nPendingData = 0; | ||
5657 | } | ||
5658 | v->iPrevDocid = iDocid; | ||
5659 | return SQLITE_OK; | ||
5660 | } | ||
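/* A sketch (added for this edit; pendingTermsLifecycle is a hypothetical
** helper, not part of the original file) of how the functions above
** cooperate on the insert path: initPendingTerms() readies (and possibly
** flushes) the buffer for a docid, the tokenizer loop fills it, and
** flushPendingTerms() hands the accumulated doclists to writeZeroSegment().
*/
static int pendingTermsLifecycle(fulltext_vtab *v, sqlite_int64 iDocid){
  int rc = initPendingTerms(v, iDocid); /* flushes if out-of-order/too big */
  if( rc!=SQLITE_OK ) return rc;

  /* ... tokenize the row's columns into v->pendingTerms here ... */

  /* Force a flush; ordinarily this is deferred to xSync at commit time. */
  return flushPendingTerms(v);
}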
5661 | |||
/* This function implements the xUpdate callback; it is the top-level
** entry point for inserting, deleting, or updating a row in a
** full-text table.
*/
5664 | static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, | ||
5665 | sqlite_int64 *pRowid){ | ||
5666 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5667 | int rc; | ||
5668 | |||
5669 | TRACE(("FTS3 Update %p\n", pVtab)); | ||
5670 | |||
5671 | if( nArg<2 ){ | ||
5672 | rc = index_delete(v, sqlite3_value_int64(ppArg[0])); | ||
5673 | } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ | ||
5674 | /* An update: | ||
5675 | * ppArg[0] = old rowid | ||
5676 | * ppArg[1] = new rowid | ||
5677 | * ppArg[2..2+v->nColumn-1] = values | ||
5678 | * ppArg[2+v->nColumn] = value for magic column (we ignore this) | ||
5679 | * ppArg[2+v->nColumn+1] = value for docid | ||
5680 | */ | ||
5681 | sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]); | ||
5682 | if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER || | ||
5683 | sqlite3_value_int64(ppArg[1]) != rowid ){ | ||
5684 | rc = SQLITE_ERROR; /* we don't allow changing the rowid */ | ||
5685 | }else if( sqlite3_value_type(ppArg[2+v->nColumn+1]) != SQLITE_INTEGER || | ||
5686 | sqlite3_value_int64(ppArg[2+v->nColumn+1]) != rowid ){ | ||
5687 | rc = SQLITE_ERROR; /* we don't allow changing the docid */ | ||
5688 | }else{ | ||
      assert( nArg==2+v->nColumn+2 );
5690 | rc = index_update(v, rowid, &ppArg[2]); | ||
5691 | } | ||
5692 | } else { | ||
5693 | /* An insert: | ||
5694 | * ppArg[1] = requested rowid | ||
5695 | * ppArg[2..2+v->nColumn-1] = values | ||
5696 | * ppArg[2+v->nColumn] = value for magic column (we ignore this) | ||
5697 | * ppArg[2+v->nColumn+1] = value for docid | ||
5698 | */ | ||
5699 | sqlite3_value *pRequestDocid = ppArg[2+v->nColumn+1]; | ||
    assert( nArg==2+v->nColumn+2 );
5701 | if( SQLITE_NULL != sqlite3_value_type(pRequestDocid) && | ||
5702 | SQLITE_NULL != sqlite3_value_type(ppArg[1]) ){ | ||
5703 | /* TODO(shess) Consider allowing this to work if the values are | ||
5704 | ** identical. I'm inclined to discourage that usage, though, | ||
5705 | ** given that both rowid and docid are special columns. Better | ||
5706 | ** would be to define one or the other as the default winner, | ||
5707 | ** but should it be fts3-centric (docid) or SQLite-centric | ||
5708 | ** (rowid)? | ||
5709 | */ | ||
5710 | rc = SQLITE_ERROR; | ||
5711 | }else{ | ||
5712 | if( SQLITE_NULL == sqlite3_value_type(pRequestDocid) ){ | ||
5713 | pRequestDocid = ppArg[1]; | ||
5714 | } | ||
5715 | rc = index_insert(v, pRequestDocid, &ppArg[2], pRowid); | ||
5716 | } | ||
5717 | } | ||
5718 | |||
5719 | return rc; | ||
5720 | } | ||
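/* Editorial sketch (not part of the original file): the three branches
** above map onto ordinary SQL against the virtual table.  Assuming
** CREATE VIRTUAL TABLE t USING fts3(content), SQLite invokes
** fulltextUpdate() roughly as follows:
**
**   DELETE FROM t WHERE rowid=5;             -- nArg==1, ppArg[0]==5
**   INSERT INTO t(content) VALUES('...');    -- ppArg[0] is NULL
**   UPDATE t SET content='...' WHERE rowid=5;
**                                            -- ppArg[0]==ppArg[1]==5
**
** Attempts to change rowid or docid in an UPDATE are rejected with
** SQLITE_ERROR by the checks above.
*/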
5721 | |||
5722 | static int fulltextSync(sqlite3_vtab *pVtab){ | ||
5723 | TRACE(("FTS3 xSync()\n")); | ||
5724 | return flushPendingTerms((fulltext_vtab *)pVtab); | ||
5725 | } | ||
5726 | |||
5727 | static int fulltextBegin(sqlite3_vtab *pVtab){ | ||
5728 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5729 | TRACE(("FTS3 xBegin()\n")); | ||
5730 | |||
5731 | /* Any buffered updates should have been cleared by the previous | ||
5732 | ** transaction. | ||
5733 | */ | ||
5734 | assert( v->nPendingData<0 ); | ||
5735 | return clearPendingTerms(v); | ||
5736 | } | ||
5737 | |||
5738 | static int fulltextCommit(sqlite3_vtab *pVtab){ | ||
5739 | fulltext_vtab *v = (fulltext_vtab *) pVtab; | ||
5740 | TRACE(("FTS3 xCommit()\n")); | ||
5741 | |||
5742 | /* Buffered updates should have been cleared by fulltextSync(). */ | ||
5743 | assert( v->nPendingData<0 ); | ||
5744 | return clearPendingTerms(v); | ||
5745 | } | ||
5746 | |||
5747 | static int fulltextRollback(sqlite3_vtab *pVtab){ | ||
5748 | TRACE(("FTS3 xRollback()\n")); | ||
5749 | return clearPendingTerms((fulltext_vtab *)pVtab); | ||
5750 | } | ||
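/* Editorial sketch (not part of the original file): for a typical write
** transaction the hooks above fire in this order:
**
**   BEGIN;              -- xBegin: asserts the buffer is already empty
**   INSERT INTO t ...;  -- xUpdate: terms accumulate in pendingTerms
**   COMMIT;             -- xSync flushes pendingTerms to a level-0
**                       -- segment, then xCommit clears the now-empty
**                       -- buffer.  ROLLBACK instead discards the
**                       -- buffered terms without writing anything.
*/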
5751 | |||
5752 | /* | ||
5753 | ** Implementation of the snippet() function for FTS3 | ||
5754 | */ | ||
5755 | static void snippetFunc( | ||
5756 | sqlite3_context *pContext, | ||
5757 | int argc, | ||
5758 | sqlite3_value **argv | ||
5759 | ){ | ||
5760 | fulltext_cursor *pCursor; | ||
5761 | if( argc<1 ) return; | ||
5762 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
5763 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
    sqlite3_result_error(pContext, "illegal first argument to snippet",-1);
5765 | }else{ | ||
5766 | const char *zStart = "<b>"; | ||
5767 | const char *zEnd = "</b>"; | ||
5768 | const char *zEllipsis = "<b>...</b>"; | ||
5769 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
5770 | if( argc>=2 ){ | ||
5771 | zStart = (const char*)sqlite3_value_text(argv[1]); | ||
5772 | if( argc>=3 ){ | ||
5773 | zEnd = (const char*)sqlite3_value_text(argv[2]); | ||
5774 | if( argc>=4 ){ | ||
5775 | zEllipsis = (const char*)sqlite3_value_text(argv[3]); | ||
5776 | } | ||
5777 | } | ||
5778 | } | ||
5779 | snippetAllOffsets(pCursor); | ||
5780 | snippetText(pCursor, zStart, zEnd, zEllipsis); | ||
5781 | sqlite3_result_text(pContext, pCursor->snippet.zSnippet, | ||
5782 | pCursor->snippet.nSnippet, SQLITE_STATIC); | ||
5783 | } | ||
5784 | } | ||
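/* Usage sketch (not part of the original file): snippet() takes the
** fts3 table as its first argument plus up to three optional strings
** overriding the match-start, match-end, and ellipsis markers:
**
**   SELECT snippet(t) FROM t WHERE t MATCH 'sqlite';
**   SELECT snippet(t, '[', ']', '...') FROM t WHERE t MATCH 'sqlite';
**
** The first form uses the <b>, </b>, and <b>...</b> defaults above.
*/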
5785 | |||
5786 | /* | ||
5787 | ** Implementation of the offsets() function for FTS3 | ||
5788 | */ | ||
5789 | static void snippetOffsetsFunc( | ||
5790 | sqlite3_context *pContext, | ||
5791 | int argc, | ||
5792 | sqlite3_value **argv | ||
5793 | ){ | ||
5794 | fulltext_cursor *pCursor; | ||
5795 | if( argc<1 ) return; | ||
5796 | if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || | ||
5797 | sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ | ||
5798 | sqlite3_result_error(pContext, "illegal first argument to offsets",-1); | ||
5799 | }else{ | ||
5800 | memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | ||
5801 | snippetAllOffsets(pCursor); | ||
5802 | snippetOffsetText(&pCursor->snippet); | ||
5803 | sqlite3_result_text(pContext, | ||
5804 | pCursor->snippet.zOffset, pCursor->snippet.nOffset, | ||
5805 | SQLITE_STATIC); | ||
5806 | } | ||
5807 | } | ||
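/* Usage sketch (not part of the original file):
**
**   SELECT offsets(t) FROM t WHERE t MATCH 'sqlite';
**
** returns a space-separated list of integers, four per match: the
** column number, the index of the matching query term, the byte
** offset of the match within the column value, and its size in bytes.
*/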
5808 | |||
5809 | /* | ||
5810 | ** This routine implements the xFindFunction method for the FTS3 | ||
5811 | ** virtual table. | ||
5812 | */ | ||
5813 | static int fulltextFindFunction( | ||
5814 | sqlite3_vtab *pVtab, | ||
5815 | int nArg, | ||
5816 | const char *zName, | ||
5817 | void (**pxFunc)(sqlite3_context*,int,sqlite3_value**), | ||
5818 | void **ppArg | ||
5819 | ){ | ||
5820 | if( strcmp(zName,"snippet")==0 ){ | ||
5821 | *pxFunc = snippetFunc; | ||
5822 | return 1; | ||
5823 | }else if( strcmp(zName,"offsets")==0 ){ | ||
5824 | *pxFunc = snippetOffsetsFunc; | ||
5825 | return 1; | ||
5826 | } | ||
5827 | return 0; | ||
5828 | } | ||
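/* Editorial note (not part of the original file): sqlite3Fts3Init()
** registers "snippet" and "offsets" via sqlite3_overload_function(),
** so that when either is applied to a column of an fts3 table, SQLite
** calls fulltextFindFunction() and substitutes the implementations
** above.  Their first argument then arrives as a BLOB carrying a
** fulltext_cursor pointer, which is why snippetFunc() and
** snippetOffsetsFunc() insist on sqlite3_value_bytes()==sizeof(pCursor).
*/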
5829 | |||
5830 | /* | ||
5831 | ** Rename an fts3 table. | ||
5832 | */ | ||
5833 | static int fulltextRename( | ||
5834 | sqlite3_vtab *pVtab, | ||
5835 | const char *zName | ||
5836 | ){ | ||
5837 | fulltext_vtab *p = (fulltext_vtab *)pVtab; | ||
5838 | int rc = SQLITE_NOMEM; | ||
5839 | char *zSql = sqlite3_mprintf( | ||
5840 | "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';" | ||
5841 | "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';" | ||
5842 | "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';" | ||
5843 | , p->zDb, p->zName, zName | ||
5844 | , p->zDb, p->zName, zName | ||
5845 | , p->zDb, p->zName, zName | ||
5846 | ); | ||
5847 | if( zSql ){ | ||
5848 | rc = sqlite3_exec(p->db, zSql, 0, 0, 0); | ||
5849 | sqlite3_free(zSql); | ||
5850 | } | ||
5851 | return rc; | ||
5852 | } | ||
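/* Usage sketch (not part of the original file): renaming the virtual
** table renames all three backing stores in one batch.  For example,
**
**   ALTER TABLE t RENAME TO t2;
**
** causes the %Q/%q-quoted statement above to rewrite t_content,
** t_segments, and t_segdir into t2_content, t2_segments, and t2_segdir.
*/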
5853 | |||
5854 | static const sqlite3_module fts3Module = { | ||
5855 | /* iVersion */ 0, | ||
5856 | /* xCreate */ fulltextCreate, | ||
5857 | /* xConnect */ fulltextConnect, | ||
5858 | /* xBestIndex */ fulltextBestIndex, | ||
5859 | /* xDisconnect */ fulltextDisconnect, | ||
5860 | /* xDestroy */ fulltextDestroy, | ||
5861 | /* xOpen */ fulltextOpen, | ||
5862 | /* xClose */ fulltextClose, | ||
5863 | /* xFilter */ fulltextFilter, | ||
5864 | /* xNext */ fulltextNext, | ||
5865 | /* xEof */ fulltextEof, | ||
5866 | /* xColumn */ fulltextColumn, | ||
5867 | /* xRowid */ fulltextRowid, | ||
5868 | /* xUpdate */ fulltextUpdate, | ||
5869 | /* xBegin */ fulltextBegin, | ||
5870 | /* xSync */ fulltextSync, | ||
5871 | /* xCommit */ fulltextCommit, | ||
5872 | /* xRollback */ fulltextRollback, | ||
5873 | /* xFindFunction */ fulltextFindFunction, | ||
5874 | /* xRename */ fulltextRename, | ||
5875 | }; | ||
5876 | |||
5877 | static void hashDestroy(void *p){ | ||
5878 | fts3Hash *pHash = (fts3Hash *)p; | ||
5879 | sqlite3Fts3HashClear(pHash); | ||
5880 | sqlite3_free(pHash); | ||
5881 | } | ||
5882 | |||
/*
** The fts3 built-in tokenizers - "simple" and "porter" - are implemented
** in files fts3_tokenizer1.c and fts3_porter.c respectively.  The
** forward declarations below are for the functions used to retrieve the
** respective implementations.
**
** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
** Function ...PorterTokenizerModule() sets *pModule to point to the
** porter tokenizer/stemmer implementation.  The ICU tokenizer is
** implemented in fts3_icu.c and is only available when SQLITE_ENABLE_ICU
** is defined.
*/
5894 | void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
5895 | void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
5896 | void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule); | ||
5897 | |||
5898 | int sqlite3Fts3InitHashTable(sqlite3 *, fts3Hash *, const char *); | ||
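/* Editorial note (not part of the original file):
** sqlite3Fts3InitHashTable(), defined in fts3_tokenizer.c, exposes the
** tokenizer hash table through the scalar function named by its third
** argument.  A sketch, with ?1 bound to a BLOB containing a pointer to
** a sqlite3_tokenizer_module:
**
**   SELECT fts3_tokenizer('mytokenizer', ?1);  -- register a tokenizer
**   SELECT fts3_tokenizer('simple');           -- look one up
*/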
5899 | |||
5900 | /* | ||
5901 | ** Initialise the fts3 extension. If this extension is built as part | ||
5902 | ** of the sqlite library, then this function is called directly by | ||
5903 | ** SQLite. If fts3 is built as a dynamically loadable extension, this | ||
5904 | ** function is called by the sqlite3_extension_init() entry point. | ||
5905 | */ | ||
5906 | int sqlite3Fts3Init(sqlite3 *db){ | ||
5907 | int rc = SQLITE_OK; | ||
5908 | fts3Hash *pHash = 0; | ||
5909 | const sqlite3_tokenizer_module *pSimple = 0; | ||
5910 | const sqlite3_tokenizer_module *pPorter = 0; | ||
5911 | const sqlite3_tokenizer_module *pIcu = 0; | ||
5912 | |||
5913 | sqlite3Fts3SimpleTokenizerModule(&pSimple); | ||
5914 | sqlite3Fts3PorterTokenizerModule(&pPorter); | ||
5915 | #ifdef SQLITE_ENABLE_ICU | ||
5916 | sqlite3Fts3IcuTokenizerModule(&pIcu); | ||
5917 | #endif | ||
5918 | |||
5919 | /* Allocate and initialise the hash-table used to store tokenizers. */ | ||
5920 | pHash = sqlite3_malloc(sizeof(fts3Hash)); | ||
5921 | if( !pHash ){ | ||
5922 | rc = SQLITE_NOMEM; | ||
5923 | }else{ | ||
5924 | sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1); | ||
5925 | } | ||
5926 | |||
5927 | /* Load the built-in tokenizers into the hash table */ | ||
5928 | if( rc==SQLITE_OK ){ | ||
5929 | if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple) | ||
5930 | || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter) | ||
5931 | || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu)) | ||
5932 | ){ | ||
5933 | rc = SQLITE_NOMEM; | ||
5934 | } | ||
5935 | } | ||
5936 | |||
5937 | /* Create the virtual table wrapper around the hash-table and overload | ||
5938 | ** the two scalar functions. If this is successful, register the | ||
5939 | ** module with sqlite. | ||
5940 | */ | ||
5941 | if( SQLITE_OK==rc | ||
5942 | && SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer")) | ||
5943 | && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1)) | ||
5944 | && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1)) | ||
5945 | ){ | ||
5946 | return sqlite3_create_module_v2( | ||
5947 | db, "fts3", &fts3Module, (void *)pHash, hashDestroy | ||
5948 | ); | ||
5949 | } | ||
5950 | |||
  /* An error has occurred. Delete the hash table and return the error code. */
5952 | assert( rc!=SQLITE_OK ); | ||
5953 | if( pHash ){ | ||
5954 | sqlite3Fts3HashClear(pHash); | ||
5955 | sqlite3_free(pHash); | ||
5956 | } | ||
5957 | return rc; | ||
5958 | } | ||
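/* Usage sketch (not part of the original file): once sqlite3Fts3Init()
** has run, the module is available under the name "fts3":
**
**   CREATE VIRTUAL TABLE pages USING fts3(title, body);
**   INSERT INTO pages(title, body) VALUES('greeting', 'hello world');
**   SELECT docid, snippet(pages) FROM pages WHERE pages MATCH 'world';
*/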
5959 | |||
5960 | #if !SQLITE_CORE | ||
5961 | int sqlite3_extension_init( | ||
5962 | sqlite3 *db, | ||
5963 | char **pzErrMsg, | ||
5964 | const sqlite3_api_routines *pApi | ||
5965 | ){ | ||
5966 | SQLITE_EXTENSION_INIT2(pApi) | ||
5967 | return sqlite3Fts3Init(db); | ||
5968 | } | ||
5969 | #endif | ||
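/* Usage sketch (not part of the original file; the shared-library path
** is hypothetical): loading fts3 as a run-time extension from C.
**
**   sqlite3 *db;
**   char *zErr = 0;
**   sqlite3_open(":memory:", &db);
**   sqlite3_enable_load_extension(db, 1);
**   if( sqlite3_load_extension(db, "./fts3.so", 0, &zErr)!=SQLITE_OK ){
**     fprintf(stderr, "load failed: %s\n", zErr);
**     sqlite3_free(zErr);
**   }
**
** Passing 0 as the entry point makes SQLite look for the default
** sqlite3_extension_init() defined above.
*/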
5970 | |||
5971 | #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | ||