path: root/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c
Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c')
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c  5936
1 file changed, 5936 insertions, 0 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c
new file mode 100644
index 0000000..65ad173
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c
@@ -0,0 +1,5936 @@
/* fts2 has a design flaw which can lead to database corruption (see
** below).  It is recommended not to use it any longer; instead, use
** fts3 (or higher).  If you believe that your use of fts2 is safe,
** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
*/
#ifndef SQLITE_ENABLE_BROKEN_FTS2
#error fts2 has a design flaw and has been deprecated.
#endif
/* The flaw is that fts2 uses the content table's unaliased rowid as
** the unique docid.  fts2 embeds the rowid in the index it builds,
** and expects the rowid to not change.  The SQLite VACUUM operation
** will renumber such rowids, thereby breaking fts2.  If you are using
** fts2 in a system which has disabled VACUUM, then you can continue
** to use it safely.  Note that PRAGMA auto_vacuum does NOT disable
** VACUUM, though systems using auto_vacuum are unlikely to invoke
** VACUUM.
**
** Unlike fts1, which is safe across VACUUM if you never delete
** documents, fts2 has a second exposure to this flaw, in the segments
** table.  So fts2 should be considered unsafe across VACUUM in all
** cases.
*/

/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This is an SQLite module implementing full-text search.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS2 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS2 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/

/* TODO(shess) Consider exporting this comment to an HTML file or the
** wiki.
*/
/* The full-text index is stored in a series of b+tree (-like)
** structures called segments which map terms to doclists.  The
** structures are like b+trees in layout, but are constructed from the
** bottom up in optimal fashion and are not updatable.  Since trees
** are built from the bottom up, things will be described from the
** bottom up.
**
**
**** Varints ****
** The basic unit of encoding is a variable-length integer called a
** varint.  We encode variable-length integers in little-endian order
** using seven bits per byte as follows:
**
** KEY:
**   A = 0xxxxxxx    7 bits of data and one flag bit
**   B = 1xxxxxxx    7 bits of data and one flag bit
**
**   7 bits - A
**  14 bits - BA
**  21 bits - BBA
** and so on.
**
** This is identical to how sqlite encodes varints (see util.c).
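**
** For example (an illustrative sketch, not part of the original
** comment): the value 300 is binary 100101100.  Split into 7-bit
** groups from the low end, that is 0000010 0101100; little-endian
** encoding emits the low group first with its continuation bit set,
** giving the two bytes 0xAC 0x02 (the BA form above).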
**
**
**** Document lists ****
** A doclist (document list) holds a docid-sorted list of hits for a
** given term.  Doclists hold docids, and can optionally associate
** token positions and offsets with docids.
**
** A DL_POSITIONS_OFFSETS doclist is stored like this:
**
** array {
**   varint docid;
**   array {                (position list for column 0)
**     varint position;     (delta from previous position plus POS_BASE)
**     varint startOffset;  (delta from previous startOffset)
**     varint endOffset;    (delta from startOffset)
**   }
**   array {
**     varint POS_COLUMN;   (marks start of position list for new column)
**     varint column;       (index of new column)
**     array {
**       varint position;     (delta from previous position plus POS_BASE)
**       varint startOffset;  (delta from previous startOffset)
**       varint endOffset;    (delta from startOffset)
**     }
**   }
**   varint POS_END;        (marks end of positions for this document)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.  A "position" is an index of a token in the token stream
** generated by the tokenizer, while an "offset" is a byte offset,
** both based at 0.  Note that POS_END and POS_COLUMN occur in the
** same logical place as the position element, and act as sentinels
** ending a position list array.
**
** A DL_POSITIONS doclist omits the startOffset and endOffset
** information.  A DL_DOCIDS doclist omits both the position and
** offset information, becoming an array of varint-encoded docids.
**
** On-disk data is stored as type DL_DEFAULT, so we don't serialize
** the type.  Due to how deletion is implemented in the segmentation
** system, on-disk doclists MUST store at least positions.
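**
** As an illustrative sketch (not part of the original comment), a
** DL_POSITIONS doclist for docid 5 with hits at positions 1 and 3 in
** column 0 encodes as varint(5), varint(POS_BASE+1), varint(POS_BASE+2),
** varint(POS_END), i.e. the bytes 0x05 0x03 0x04 0x00.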
**
**
**** Segment leaf nodes ****
** Segment leaf nodes store terms and doclists, ordered by term.  Leaf
** nodes are written using LeafWriter, and read using LeafReader (to
** iterate through a single leaf node's data) and LeavesReader (to
** iterate through a segment's entire leaf layer).  Leaf nodes have
** the format:
**
** varint iHeight;             (height from leaf level, always 0)
** varint nTerm;               (length of first term)
** char pTerm[nTerm];          (content of first term)
** varint nDoclist;            (length of term's associated doclist)
** char pDoclist[nDoclist];    (content of doclist)
** array {
**                             (further terms are delta-encoded)
**   varint nPrefix;           (length of prefix shared with previous term)
**   varint nSuffix;           (length of unshared suffix)
**   char pTermSuffix[nSuffix];(unshared suffix of next term)
**   varint nDoclist;          (length of term's associated doclist)
**   char pDoclist[nDoclist];  (content of doclist)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.
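**
** For example (an illustrative sketch, not part of the original
** comment): if the term "something" is followed by "somewhere", the
** second term is stored as nPrefix=4, nSuffix=5, pTermSuffix="where".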
**
** Leaf nodes are broken into blocks which are stored contiguously in
** the %_segments table in sorted order.  This means that when the end
** of a node is reached, the next term is in the node with the next
** greater node id.
**
** New data is spilled to a new leaf node when the current node
** exceeds LEAF_MAX bytes (default 2048).  New data which itself is
** larger than STANDALONE_MIN (default 1024) is placed in a standalone
** node (a leaf node with a single term and doclist).  The goal of
** these settings is to pack together groups of small doclists while
** making it efficient to directly access large doclists.  The
** assumption is that large doclists represent terms which are more
** likely to be query targets.
**
** TODO(shess) It may be useful for blocking decisions to be more
** dynamic.  For instance, it may make more sense to have a 2.5k leaf
** node rather than splitting into 2k and .5k nodes.  My intuition is
** that this might extend through 2x or 4x the pagesize.
**
**
**** Segment interior nodes ****
** Segment interior nodes store blockids for subtree nodes and terms
** to describe what data is stored by each subtree.  Interior
** nodes are written using InteriorWriter, and read using
** InteriorReader.  InteriorWriters are created as needed when
** SegmentWriter creates new leaf nodes, or when an interior node
** itself grows too big and must be split.  The format of interior
** nodes:
**
** varint iHeight;            (height from leaf level, always >0)
** varint iBlockid;           (block id of node's leftmost subtree)
** optional {
**   varint nTerm;            (length of first term)
**   char pTerm[nTerm];       (content of first term)
**   array {
**                            (further terms are delta-encoded)
**     varint nPrefix;        (length of shared prefix with previous term)
**     varint nSuffix;        (length of unshared suffix)
**     char pTermSuffix[nSuffix]; (unshared suffix of next term)
**   }
** }
**
** Here, optional { X } means an optional element, while array { X }
** means zero or more occurrences of X, adjacent in memory.
**
** An interior node encodes n terms separating n+1 subtrees.  The
** subtree blocks are contiguous, so only the first subtree's blockid
** is encoded.  The subtree at iBlockid will contain all terms less
** than the first term encoded (or all terms if no term is encoded).
** Otherwise, for terms greater than or equal to pTerm[i] but less
** than pTerm[i+1], the subtree for that term will be rooted at
** iBlockid+i.  Interior nodes only store enough term data to
** distinguish adjacent children (if the rightmost term of the left
** child is "something", and the leftmost term of the right child is
** "wicked", only "w" is stored).
**
** New data is spilled to a new interior node at the same height when
** the current node exceeds INTERIOR_MAX bytes (default 2048).
** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
** interior nodes and making the tree too skinny.  The interior nodes
** at a given height are naturally tracked by interior nodes at
** height+1, and so on.
**
**
**** Segment directory ****
** The segment directory in table %_segdir stores meta-information for
** merging and deleting segments, and also the root node of the
** segment's tree.
**
** The root node is the top node of the segment's tree after encoding
** the entire segment, restricted to ROOT_MAX bytes (default 1024).
** This could be either a leaf node or an interior node.  If the top
** node requires more than ROOT_MAX bytes, it is flushed to %_segments
** and a new root interior node is generated (which should always fit
** within ROOT_MAX because it only needs space for 2 varints, the
** height and the blockid of the previous root).
**
** The meta-information in the segment directory is:
**   level             - segment level (see below)
**   idx               - index within level
**                     - (level,idx uniquely identify a segment)
**   start_block       - first leaf node
**   leaves_end_block  - last leaf node
**   end_block         - last block (including interior nodes)
**   root              - contents of root node
**
** If the root node is a leaf node, then start_block,
** leaves_end_block, and end_block are all 0.
**
**
**** Segment merging ****
** To amortize update costs, segments are grouped into levels and
** merged in batches.  Each increase in level represents exponentially
** more documents.
**
** New documents (actually, document updates) are tokenized and
** written individually (using LeafWriter) to a level 0 segment, with
** incrementing idx.  When idx reaches MERGE_COUNT (default 16), all
** level 0 segments are merged into a single level 1 segment.  Level 1
** is populated like level 0, and eventually MERGE_COUNT level 1
** segments are merged to a single level 2 segment (representing
** MERGE_COUNT^2 updates), and so on.
**
** A segment merge traverses all segments at a given level in
** parallel, performing a straightforward sorted merge.  Since segment
** leaf nodes are written into the %_segments table in order, this
** merge traverses the underlying sqlite disk structures efficiently.
** After the merge, all segment blocks from the merged level are
** deleted.
**
** MERGE_COUNT controls how often we merge segments.  16 seems to be
** somewhat of a sweet spot for insertion performance.  32 and 64 show
** very similar performance numbers to 16 on insertion, though they're
** a tiny bit slower (perhaps due to more overhead in merge-time
** sorting).  8 is about 20% slower than 16, 4 about 50% slower than
** 16, 2 about 66% slower than 16.
**
** At query time, high MERGE_COUNT increases the number of segments
** which need to be scanned and merged.  For instance, with 100k docs
** inserted:
**
**    MERGE_COUNT   segments
**         16          25
**          8          12
**          4          10
**          2           6
**
** This appears to have only a moderate impact on queries for very
** frequent terms (which are somewhat dominated by segment merge
** costs), and infrequent and non-existent terms still seem to be fast
** even with many segments.
**
** TODO(shess) That said, it would be nice to have a better query-side
** argument for MERGE_COUNT of 16.  Also, it's possible/likely that
** optimizations to things like doclist merging will swing the sweet
** spot around.
**
**
**
**** Handling of deletions and updates ****
** Since we're using a segmented structure, with no docid-oriented
** index into the term index, we clearly cannot simply update the term
** index when a document is deleted or updated.  For deletions, we
** write an empty doclist (varint(docid) varint(POS_END)); for updates,
** we simply write the new doclist.  Segment merges overwrite older
** data for a particular docid with newer data, so deletes or updates
** will eventually overtake the earlier data and knock it out.  The
** query logic likewise merges doclists so that newer data knocks out
** older data.
**
** TODO(shess) Provide a VACUUM type operation to clear out all
** deletions and duplications.  This would basically be a forced merge
** into a single segment.
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)

#if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
# define SQLITE_CORE 1
#endif

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts2.h"
#include "fts2_hash.h"
#include "fts2_tokenizer.h"
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1


/* TODO(shess) MAN, this thing needs some refactoring.  At minimum, it
** would be nice to order the file better, perhaps something along the
** lines of:
**
**  - utility functions
**  - table setup functions
**  - table update functions
**  - table query functions
**
** Put the query functions last because they're likely to reference
** typedefs or functions from the table update section.
*/

#if 0
# define TRACE(A)  printf A; fflush(stdout)
#else
# define TRACE(A)
#endif

/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters.  This is the same solution used in the
** tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
static int safe_isspace(char c){
  return (c&0x80)==0 ? isspace(c) : 0;
}
static int safe_tolower(char c){
  return (c&0x80)==0 ? tolower(c) : c;
}
static int safe_isalnum(char c){
  return (c&0x80)==0 ? isalnum(c) : 0;
}

typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif

enum {
  POS_END = 0,    /* end of this position list */
  POS_COLUMN,     /* followed by new column number */
  POS_BASE
};

/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file).
*/
#define MERGE_COUNT 16

/* utility functions */

/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
** record to prevent errors of the form:
**
** my_function(SomeType *b){
**   memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
** }
*/
/* TODO(shess) Obvious candidates for a header file. */
#define CLEAR(b) memset(b, '\0', sizeof(*(b)))

#ifndef NDEBUG
# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
#else
# define SCRAMBLE(b)
#endif

/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10

/* Write a 64-bit variable-length integer to memory starting at p[0].
 * The length of data written will be between 1 and VARINT_MAX bytes.
 * The number of bytes written is returned. */
static int putVarint(char *p, sqlite_int64 v){
  unsigned char *q = (unsigned char *) p;
  sqlite_uint64 vu = v;
  do{
    *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    vu >>= 7;
  }while( vu!=0 );
  q[-1] &= 0x7f;  /* turn off high bit in final byte */
  assert( q - (unsigned char *)p <= VARINT_MAX );
  return (int) (q - (unsigned char *)p);
}

/* Read a 64-bit variable-length integer from memory starting at p[0].
 * Return the number of bytes read, or 0 on error.
 * The value is stored in *v. */
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}

static int getVarint32(const char *p, int *pi){
  sqlite_int64 i;
  int ret = getVarint(p, &i);
  *pi = (int) i;
  assert( *pi==i );
  return ret;
}
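
/* Example round-trip (an illustrative sketch, not from the original
** source): encode a value, then decode it back.
**
**   char buf[VARINT_MAX];
**   sqlite_int64 v;
**   int nWrite = putVarint(buf, 1000000);
**   int nRead = getVarint(buf, &v);
**   assert( nWrite==nRead && v==1000000 );
*/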

/*******************************************************************/
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion.  It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;

static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  assert( nCapacity>=0 );
  pBuffer->nData = 0;
  pBuffer->nCapacity = nCapacity;
  pBuffer->pData = nCapacity==0 ? NULL : malloc(nCapacity);
}
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
static void dataBufferDestroy(DataBuffer *pBuffer){
  if( pBuffer->pData!=NULL ) free(pBuffer->pData);
  SCRAMBLE(pBuffer);
}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
static void dataBufferAppend(DataBuffer *pBuffer,
                             const char *pSource, int nSource){
  assert( nSource>0 && pSource!=NULL );
  dataBufferExpand(pBuffer, nSource);
  memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
  pBuffer->nData += nSource;
}
static void dataBufferAppend2(DataBuffer *pBuffer,
                              const char *pSource1, int nSource1,
                              const char *pSource2, int nSource2){
  assert( nSource1>0 && pSource1!=NULL );
  assert( nSource2>0 && pSource2!=NULL );
  dataBufferExpand(pBuffer, nSource1+nSource2);
  memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
  memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
  pBuffer->nData += nSource1+nSource2;
}
static void dataBufferReplace(DataBuffer *pBuffer,
                              const char *pSource, int nSource){
  dataBufferReset(pBuffer);
  dataBufferAppend(pBuffer, pSource, nSource);
}
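
/* Typical DataBuffer usage (an illustrative sketch, not from the
** original source):
**
**   DataBuffer buf;
**   dataBufferInit(&buf, 0);
**   dataBufferAppend(&buf, "hello", 5);
**   dataBufferAppend(&buf, " world", 6);
**   // buf.pData holds the 11 bytes "hello world" (no terminator).
**   dataBufferDestroy(&buf);
*/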

/* StringBuffer is a null-terminated version of DataBuffer. */
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;

static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}

static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  assert( sb->b.nData>0 );
  if( nFrom>0 ){
    sb->b.nData--;
    dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  }
}
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}

/* Append a list of strings separated by commas. */
static void appendList(StringBuffer *sb, int nString, char **azString){
  int i;
  for(i=0; i<nString; ++i){
    if( i>0 ) append(sb, ", ");
    append(sb, azString[i]);
  }
}

static int endsInWhiteSpace(StringBuffer *p){
  return stringBufferLength(p)>0 &&
    safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
  if( !endsInWhiteSpace(p) ) append(p, " ");
}

/* Remove white space from the end of the StringBuffer */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    p->b.pData[--p->b.nData-1] = '\0';
  }
}
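
/* StringBuffer usage (an illustrative sketch, not from the original
** source; zTable is a hypothetical NUL-terminated table name):
**
**   StringBuffer sb;
**   initStringBuffer(&sb);
**   append(&sb, "SELECT rowid FROM ");
**   append(&sb, zTable);
**   // stringBufferData(&sb) is the assembled NUL-terminated string.
**   stringBufferDestroy(&sb);
*/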

/*******************************************************************/
/* DLReader is used to read document elements from a doclist.  The
** current docid is cached, so dlrDocid() is fast.  DLReader does not
** own the doclist buffer.
**
** dlrAtEnd - true if there's no more data to read.
** dlrDocid - docid of current document.
** dlrDocData - doclist data for current document (including docid).
** dlrDocDataBytes - length of same.
** dlrAllDataBytes - length of all remaining data.
** dlrPosData - position data for current document.
** dlrPosDataLen - length of pos data for current document (incl POS_END).
** dlrStep - step to the next document.
** dlrInit - initialize for a doclist of the given type over the given data.
** dlrDestroy - clean up.
**
** Expected usage is something like:
**
**   DLReader reader;
**   dlrInit(&reader, iType, pData, nData);
**   while( !dlrAtEnd(&reader) ){
**     // calls to dlrDocid() and kin.
**     dlrStep(&reader);
**   }
**   dlrDestroy(&reader);
*/
typedef struct DLReader {
  DocListType iType;
  const char *pData;
  int nData;

  sqlite_int64 iDocid;
  int nElement;
} DLReader;

static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
/* TODO(shess) Consider adding a field to track iDocid varint length
** to make these two functions faster.  This might matter (a tiny bit)
** for queries.
*/
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
static void dlrInit(DLReader *pReader, DocListType iType,
                    const char *pData, int nData){
  assert( pData!=NULL && nData!=0 );
  pReader->iType = iType;
  pReader->pData = pData;
  pReader->nData = nData;
  pReader->nElement = 0;
  pReader->iDocid = 0;

  /* Load the first element's data.  There must be a first element. */
  dlrStep(pReader);
}
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}

#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it's convenient in other assertions for
** DLWriter.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif

/*******************************************************************/
/* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
** always appends to the buffer and does not own it.
**
** dlwInit - initialize to write a given type doclist to a buffer.
** dlwDestroy - clear the writer's memory.  Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwCopy - copy next doclist from reader to writer.
** dlwAdd - construct doclist element and append to buffer.
**    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
  DocListType iType;
  DataBuffer *b;
  sqlite_int64 iPrevDocid;
#ifndef NDEBUG
  int has_iPrevDocid;
#endif
} DLWriter;

static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
  pWriter->b = b;
  pWriter->iType = iType;
  pWriter->iPrevDocid = 0;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 0;
#endif
}
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
/* iFirstDocid is the first docid in the doclist in pData.  It is
** needed because pData may point within a larger doclist, in which
** case the first item would be delta-encoded.
**
** iLastDocid is the final docid in the doclist in pData.  It is
** needed to create the new iPrevDocid for future delta-encoding.  The
** code could decode the passed doclist to recreate iLastDocid, but
** the only current user (docListMerge) already has decoded this
** information.
*/
/* TODO(shess) This has become just a helper for docListMerge.
** Consider a refactor to make this cleaner.
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}
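
/* DL_DOCIDS writing example (an illustrative sketch, not from the
** original source):
**
**   DataBuffer buf;
**   DLWriter writer;
**   dataBufferInit(&buf, 0);
**   dlwInit(&writer, DL_DOCIDS, &buf);
**   dlwAdd(&writer, 42);   // stored as varint(42)
**   dlwAdd(&writer, 50);   // delta-encoded as varint(8)
**   dlwDestroy(&writer);
**   // buf now holds the two-byte doclist 0x2A 0x08.
*/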

/*******************************************************************/
/* PLReader is used to read data from a document's position list.  As
** the caller steps through the list, data is cached so that varints
** only need to be decoded once.
**
** plrInit, plrDestroy - create/destroy a reader.
** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
** plrAtEnd - at end of stream, only call plrDestroy once true.
** plrStep - step to the next element.
*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;
  int iColumn;         /* the last column read */
  int iPosition;       /* the last position read */
  int iStartOffset;    /* the last start offset read */
  int iEndOffset;      /* the last end offset read */
} PLReader;

static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}

static void plrInit(PLReader *pReader, DLReader *pDLReader){
  pReader->pData = dlrPosData(pDLReader);
  pReader->nData = dlrPosDataLen(pDLReader);
  pReader->iType = pDLReader->iType;
  pReader->iColumn = 0;
  pReader->iPosition = 0;
  pReader->iStartOffset = 0;
  pReader->iEndOffset = 0;
  plrStep(pReader);
}
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}
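
/* PLReader iteration (an illustrative sketch, not from the original
** source), given a DLReader positioned on a document:
**
**   PLReader plReader;
**   plrInit(&plReader, &dlReader);
**   while( !plrAtEnd(&plReader) ){
**     // plrColumn(), plrPosition(), etc. describe the current hit.
**     plrStep(&plReader);
**   }
**   plrDestroy(&plReader);
*/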

/*******************************************************************/
/* PLWriter is used in constructing a document's position list.  As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
** plwDestroy - clear a writer.
** plwAdd - append position and offset information.
** plwCopy - copy next position's data from reader to writer.
** plwTerminate - add any necessary doclist terminator.
**
** Calling plwAdd() after plwTerminate() may result in a corrupt
** doclist.
*/
/* TODO(shess) Until we've written the second item, we can cache the
** first item's information.  Then we'd have three states:
**
** - initialized with docid, no positions.
** - docid and one position.
** - docid and multiple positions.
**
** Only the last state needs to actually write to dlw->b, which would
** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
  DLWriter *dlw;

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written */
  int iOffset;    /* the last start offset written */
} PLWriter;

/* TODO(shess) In the case where the parent is reading these values
** from a PLReader, we could optimize to a copy if that PLReader has
** the same type as pWriter.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
/* TODO(shess) Should plwDestroy() also terminate the doclist?  But
** then plwDestroy() would no longer be just a destructor, it would
** also be doing work, which isn't consistent with the overall idiom.
** Another option would be for plwAdd() to always append any necessary
** terminator, so that the output is always correct.  But that would
** add incremental work to the common case with the only benefit being
** API elegance.  Punt for now.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}
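
/* PLWriter usage (an illustrative sketch, not from the original
** source): write docid 7 with a single hit at column 0, position 2,
** into an already-initialized DL_POSITIONS DLWriter:
**
**   PLWriter plWriter;
**   plwInit(&plWriter, &dlWriter, 7);
**   plwAdd(&plWriter, 0, 2, 0, 0);
**   plwTerminate(&plWriter);
**   plwDestroy(&plWriter);
*/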

/*******************************************************************/
/* DLCollector wraps PLWriter and DLWriter to provide a
** dynamically-allocated doclist area to use during tokenization.
**
** dlcNew - malloc up and initialize a collector.
** dlcDelete - destroy a collector and all contained items.
** dlcAddPos - append position and offset information.
** dlcAddDoclist - add the collected doclist to the given buffer.
** dlcNext - terminate the current document and open another.
*/
typedef struct DLCollector {
  DataBuffer b;
  DLWriter dlw;
  PLWriter plw;
} DLCollector;

/* TODO(shess) This could also be done by calling plwTerminate() and
** dataBufferAppend().  I tried that, expecting nominal performance
** differences, but it seemed to pretty reliably be worth 1% to code
** it this way.  I suspect it's the incremental malloc overhead (some
** percentage of the plwTerminate() calls will cause a realloc), so
** this might be worth revisiting if the DataBuffer implementation
** changes.
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  if( pCollector->dlw.iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
  }else{
    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
  }
}
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}

static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
  DLCollector *pCollector = malloc(sizeof(DLCollector));
  dataBufferInit(&pCollector->b, 0);
  dlwInit(&pCollector->dlw, iType, &pCollector->b);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
  return pCollector;
}
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  free(pCollector);
}
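
/* DLCollector usage during tokenization (an illustrative sketch, not
** from the original source):
**
**   DLCollector *p = dlcNew(iDocid, DL_DEFAULT);
**   dlcAddPos(p, iColumn, iPos, iStartOffset, iEndOffset);
**   ...                        // one dlcAddPos() call per hit
**   dlcAddDoclist(p, &buffer); // appends the terminated doclist
**   dlcDelete(p);
*/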


/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go.  Only columns matching iColumn are
** copied, all columns copied if iColumn is -1.  Elements with no
** matching columns are dropped.  The output is an iOutType doclist.
*/
/* NOTE(shess) This code is only valid after all doclists are merged.
** If this is run before merges, then doclist items which represent
** deletion will be trimmed, and will thus not effect a deletion
** during the merge.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}

/* Used by docListMerge() to keep doclists in ascending order by
** docid, then ascending order by age (so the newest comes first).
*/
typedef struct OrderedDLReader {
  DLReader *pReader;

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;
} OrderedDLReader;

/* Order eof to end, then by docid asc, idx desc. */
static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
  if( dlrAtEnd(r1->pReader) ){
    if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
    return 1;                              /* Only r1 atEnd(). */
  }
  if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */

  if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
  if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;

  /* Descending on idx. */
  return r2->idx-r1->idx;
}

/* Bubble p[0] to appropriate place in p[1..n-1].  Assumes that
** p[1..n-1] is already sorted.
*/
/* TODO(shess) Is this frequent enough to warrant a binary search?
** Before implementing that, instrument the code to check.  In most
** current usage, I expect that p[0] will be less than p[1] a very
** high proportion of the time.
*/
static void orderedDLReaderReorder(OrderedDLReader *p, int n){
  while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
    OrderedDLReader tmp = p[0];
    p[0] = p[1];
    p[1] = tmp;
    n--;
    p++;
  }
}

/* Given an array of doclist readers, merge their doclist elements
** into out in sorted order (by docid), dropping elements from older
** readers when there is a duplicate docid.  pReaders is assumed to be
** ordered by age, oldest first.
*/
/* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
** be fixed.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;
  int nStart = 0;
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}

/* Helper function for posListUnion().  Compares the current position
** between left and right, returning as standard C idiom of <0 if
** left<right, >0 if left>right, and 0 if left==right.  "End" always
** compares greater.
*/
static int posListCmp(PLReader *pLeft, PLReader *pRight){
  assert( pLeft->iType==pRight->iType );
  if( pLeft->iType==DL_DOCIDS ) return 0;

  if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
  if( plrAtEnd(pRight) ) return -1;

  if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
  if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;

  if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
  if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
  if( pLeft->iType==DL_POSITIONS ) return 0;

  if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
  if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;

  if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
  if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;

  return 0;
}

/* Write the union of position lists in pLeft and pRight to pOut.
** "Union" in this case meaning "all unique position tuples".  Should
** work with any doclist type, though both inputs and the output
** should be the same type.
*/
static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pLeft->iType==pRight->iType );
  assert( pLeft->iType==pOut->iType );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);
  plwInit(&writer, pOut, dlrDocid(pLeft));

  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
    int c = posListCmp(&left, &right);
    if( c<0 ){
      plwCopy(&writer, &left);
      plrStep(&left);
    }else if( c>0 ){
      plwCopy(&writer, &right);
      plrStep(&right);
    }else{
      plwCopy(&writer, &left);
      plrStep(&left);
      plrStep(&right);
    }
  }

  plwTerminate(&writer);
  plwDestroy(&writer);
  plrDestroy(&left);
  plrDestroy(&right);
}

/* Write the union of doclists in pLeft and pRight to pOut.  For
** docids in common between the inputs, the union of the position
** lists is written.  Inputs and outputs are always type DL_DEFAULT.
*/
static void docListUnion(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ){
    dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
  dlrInit(&right, DL_DEFAULT, pRight, nRight);
  dlwInit(&writer, DL_DEFAULT, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrDocid(&left)>dlrDocid(&right) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else{
      posListUnion(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* pLeft and pRight are DLReaders positioned to the same docid.
**
** If there are no instances in pLeft or pRight where the position
** of pLeft is one less than the position of pRight, then this
** routine adds nothing to pOut.
**
** If there are one or more instances where positions from pLeft
** are exactly one less than positions from pRight, then add a new
** document record to pOut.  If pOut wants to hold positions, then
** include the positions from pRight that are one more than a
** position in pLeft.  In other words: pRight.iPos==pLeft.iPos+1.
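**
** For example (an illustrative sketch, not part of the original
** comment): for the phrase "full text", pLeft is the doclist for
** "full" and pRight the doclist for "text".  If, in one document,
** "full" has positions {4, 9} and "text" has positions {5, 11} in
** the same column, only the pair (4, 5) satisfies the adjacency
** test, so the output record holds position 5.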
*/
static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;
  int match = 0;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pOut->iType!=DL_POSITIONS_OFFSETS );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);

  while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
    if( plrColumn(&left)<plrColumn(&right) ){
      plrStep(&left);
    }else if( plrColumn(&left)>plrColumn(&right) ){
      plrStep(&right);
    }else if( plrPosition(&left)+1<plrPosition(&right) ){
      plrStep(&left);
    }else if( plrPosition(&left)+1>plrPosition(&right) ){
      plrStep(&right);
    }else{
      if( !match ){
        plwInit(&writer, pOut, dlrDocid(pLeft));
        match = 1;
      }
      plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
      plrStep(&left);
      plrStep(&right);
    }
  }

  if( match ){
    plwTerminate(&writer);
    plwDestroy(&writer);
  }

  plrDestroy(&left);
  plrDestroy(&right);
}

/* We have two doclists with positions: pLeft and pRight.
** Write the phrase intersection of these two doclists into pOut.
**
** A phrase intersection means that two documents only match
** if pLeft.iPos+1==pRight.iPos.
**
** iType controls the type of data written to pOut.  If iType is
** DL_POSITIONS, the positions are those from pRight.
*/
static void docListPhraseMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DocListType iType,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 || nRight==0 ) return;

  assert( iType!=DL_POSITIONS_OFFSETS );

  dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
  dlrInit(&right, DL_POSITIONS, pRight, nRight);
  dlwInit(&writer, iType, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      posListPhraseMerge(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the intersection of these two doclists into pOut as a
** DL_DOCIDS doclist.
*/
static void docListAndMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 || nRight==0 ) return;

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the union of these two doclists into pOut as a
** DL_DOCIDS doclist.
*/
static void docListOrMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ){
    dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else{
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write into pOut as DL_DOCIDS doclist containing all documents that
** occur in pLeft but not in pRight.
*/
static void docListExceptMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ) return;
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) ){
    while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }
    if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
    }
    dlrStep(&left);
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1608
1609static char *string_dup_n(const char *s, int n){
1610 char *str = malloc(n + 1);
1611 memcpy(str, s, n);
1612 str[n] = '\0';
1613 return str;
1614}
1615
1616/* Duplicate a string; the caller must free() the returned string.
1617 * (We don't use strdup() since it's not part of the standard C library and
1618 * may not be available everywhere.) */
1619static char *string_dup(const char *s){
1620 return string_dup_n(s, strlen(s));
1621}
1622
1623/* Format a string, replacing each occurrence of the % character with
1624 * zDb.zName. This may be more convenient than sqlite_mprintf()
1625 * when one string is used repeatedly in a format string.
1626 * The caller must free() the returned string. */
1627static char *string_format(const char *zFormat,
1628 const char *zDb, const char *zName){
1629 const char *p;
1630 size_t len = 0;
1631 size_t nDb = strlen(zDb);
1632 size_t nName = strlen(zName);
1633 size_t nFullTableName = nDb+1+nName;
1634 char *result;
1635 char *r;
1636
1637 /* first compute length needed */
1638 for(p = zFormat ; *p ; ++p){
1639 len += (*p=='%' ? nFullTableName : 1);
1640 }
1641 len += 1; /* for null terminator */
1642
1643 r = result = malloc(len);
1644 for(p = zFormat; *p; ++p){
1645 if( *p=='%' ){
1646 memcpy(r, zDb, nDb);
1647 r += nDb;
1648 *r++ = '.';
1649 memcpy(r, zName, nName);
1650 r += nName;
1651 } else {
1652 *r++ = *p;
1653 }
1654 }
1655 *r++ = '\0';
1656 assert( r == result + len );
1657 return result;
1658}
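/* Editor's example (illustrative, hypothetical values): with zDb=="main"
** and zName=="t",
**
**   char *z = string_format("select rowid from %_content", "main", "t");
**
** yields "select rowid from main.t_content"; the caller must free(z).
*/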
1659
1660static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
1661 const char *zFormat){
1662 char *zCommand = string_format(zFormat, zDb, zName);
1663 int rc;
1664 TRACE(("FTS2 sql: %s\n", zCommand));
1665 rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
1666 free(zCommand);
1667 return rc;
1668}
1669
1670static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
1671 sqlite3_stmt **ppStmt, const char *zFormat){
1672 char *zCommand = string_format(zFormat, zDb, zName);
1673 int rc;
1674 TRACE(("FTS2 prepare: %s\n", zCommand));
1675 rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
1676 free(zCommand);
1677 return rc;
1678}
1679
1680/* end utility functions */
1681
1682/* Forward reference */
1683typedef struct fulltext_vtab fulltext_vtab;
1684
1685/* A single term in a query is represented by an instance of
1686** the following structure.
1687*/
1688typedef struct QueryTerm {
1689 short int nPhrase; /* How many following terms are part of the same phrase */
1690 short int iPhrase; /* This is the i-th term of a phrase. */
1691 short int iColumn; /* Column of the index that must match this term */
1692 signed char isOr; /* this term is preceded by "OR" */
1693 signed char isNot; /* this term is preceded by "-" */
1694 signed char isPrefix; /* this term is followed by "*" */
1695 char *pTerm; /* text of the term. '\000' terminated. malloced */
1696 int nTerm; /* Number of bytes in pTerm[] */
1697} QueryTerm;
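/* Editor's example (not in the original source): the query ["cat dog"]
** parses to two QueryTerms, {nPhrase=1, iPhrase=1, pTerm="cat"} and
** {nPhrase=0, iPhrase=2, pTerm="dog"}; see tokenizeSegment() below for
** how nPhrase and iPhrase are assigned.
*/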
1698
1699
1700/* A query string is parsed into a Query structure.
1701 *
1702 * We could, in theory, allow query strings to be complicated
1703 * nested expressions with precedence determined by parentheses.
1704 * But none of the major search engines do this. (Perhaps the
1705 * feeling is that a parenthesized expression is too complex an
1706 * idea for the average user to grasp.)  Taking our lead from
1707 * the major search engines, we will allow queries to be a list
1708 * of terms (with an implied AND operator) or phrases in double-quotes,
1709 * with a single optional "-" before each non-phrase term to designate
1710 * negation and an optional OR connector.
1711 *
1712 * OR binds more tightly than the implied AND, which is what the
1713 * major search engines seem to do. So, for example:
1714 *
1715 * [one two OR three] ==> one AND (two OR three)
1716 * [one OR two three] ==> (one OR two) AND three
1717 *
1718 * A "-" before a term matches all entries that lack that term.
1719 * The "-" must occur immediately before the term, with no intervening
1720 * space.  This is how the major search engines do it.
1721 *
1722 * A NOT term cannot be the right-hand operand of an OR. If this
1723 * occurs in the query string, the NOT is ignored:
1724 *
1725 * [one OR -two] ==> one OR two
1726 *
1727 */
1728typedef struct Query {
1729 fulltext_vtab *pFts; /* The full text index */
1730 int nTerms; /* Number of terms in the query */
1731 QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
1732 int nextIsOr; /* Set the isOr flag on the next inserted term */
1733 int nextColumn; /* Next word parsed must be in this column */
1734 int dfltColumn; /* The default column */
1735} Query;
1736
1737
1738/*
1739** An instance of the following structure keeps track of generated
1740** matching-word offset information and snippets.
1741*/
1742typedef struct Snippet {
1743 int nMatch; /* Total number of matches */
1744 int nAlloc; /* Space allocated for aMatch[] */
1745 struct snippetMatch { /* One entry for each matching term */
1746 char snStatus; /* Status flag for use while constructing snippets */
1747 short int iCol; /* The column that contains the match */
1748 short int iTerm; /* The index in Query.pTerms[] of the matching term */
1749 short int nByte; /* Number of bytes in the term */
1750 int iStart; /* The offset to the first character of the term */
1751 } *aMatch; /* Points to space obtained from malloc */
1752 char *zOffset; /* Text rendering of aMatch[] */
1753 int nOffset; /* strlen(zOffset) */
1754 char *zSnippet; /* Snippet text */
1755 int nSnippet; /* strlen(zSnippet) */
1756} Snippet;
1757
1758
1759typedef enum QueryType {
1760 QUERY_GENERIC, /* table scan */
1761 QUERY_ROWID, /* lookup by rowid */
1762 QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
1763} QueryType;
1764
1765typedef enum fulltext_statement {
1766 CONTENT_INSERT_STMT,
1767 CONTENT_SELECT_STMT,
1768 CONTENT_UPDATE_STMT,
1769 CONTENT_DELETE_STMT,
1770
1771 BLOCK_INSERT_STMT,
1772 BLOCK_SELECT_STMT,
1773 BLOCK_DELETE_STMT,
1774
1775 SEGDIR_MAX_INDEX_STMT,
1776 SEGDIR_SET_STMT,
1777 SEGDIR_SELECT_STMT,
1778 SEGDIR_SPAN_STMT,
1779 SEGDIR_DELETE_STMT,
1780 SEGDIR_SELECT_ALL_STMT,
1781
1782 MAX_STMT /* Always at end! */
1783} fulltext_statement;
1784
1785/* These must exactly match the enum above. */
1786/* TODO(shess): Is there some risk that a statement will be used in two
1787** cursors at once, e.g. if a query joins a virtual table to itself?
1788** If so perhaps we should move some of these to the cursor object.
1789*/
1790static const char *const fulltext_zStatement[MAX_STMT] = {
1791 /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
1792 /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
1793 /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
1794 /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
1795
1796 /* BLOCK_INSERT */ "insert into %_segments values (?)",
1797 /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
1798 /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",
1799
1800 /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
1801 /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
1802 /* SEGDIR_SELECT */
1803 "select start_block, leaves_end_block, root from %_segdir "
1804 " where level = ? order by idx",
1805 /* SEGDIR_SPAN */
1806 "select min(start_block), max(end_block) from %_segdir "
1807 " where level = ? and start_block <> 0",
1808 /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
1809 /* SEGDIR_SELECT_ALL */
1810 "select root, leaves_end_block from %_segdir order by level desc, idx",
1811};
1812
1813/*
1814** A connection to a fulltext index is an instance of the following
1815** structure. The xCreate and xConnect methods create an instance
1816** of this structure and xDestroy and xDisconnect free that instance.
1817** All other methods receive a pointer to the structure as one of their
1818** arguments.
1819*/
1820struct fulltext_vtab {
1821 sqlite3_vtab base; /* Base class used by SQLite core */
1822 sqlite3 *db; /* The database connection */
1823 const char *zDb; /* logical database name */
1824 const char *zName; /* virtual table name */
1825 int nColumn; /* number of columns in virtual table */
1826 char **azColumn; /* column names. malloced */
1827 char **azContentColumn; /* column names in content table; malloced */
1828 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
1829
1830 /* Precompiled statements which we keep as long as the table is
1831 ** open.
1832 */
1833 sqlite3_stmt *pFulltextStatements[MAX_STMT];
1834
1835 /* Precompiled statements used for segment merges. We run a
1836 ** separate select across the leaf level of each tree being merged.
1837 */
1838 sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
1839 /* The statement used to prepare pLeafSelectStmts. */
1840#define LEAF_SELECT \
1841 "select block from %_segments where rowid between ? and ? order by rowid"
1842
1843 /* These buffer pending index updates during transactions.
1844 ** nPendingData estimates the memory size of the pending data. It
1845 ** doesn't include the hash-bucket overhead, nor any malloc
1846 ** overhead. When nPendingData exceeds kPendingThreshold, the
1847 ** buffer is flushed even before the transaction closes.
1848 ** pendingTerms stores the data, and is only valid when nPendingData
1849 ** is >=0 (nPendingData<0 means pendingTerms has not been
1850 ** initialized). iPrevDocid is the last docid written, used to make
1851 ** certain we're inserting in sorted order.
1852 */
1853 int nPendingData;
1854#define kPendingThreshold (1*1024*1024)
1855 sqlite_int64 iPrevDocid;
1856 fts2Hash pendingTerms;
1857};
1858
1859/*
1860** When the core wants to do a query, it creates a cursor using a
1861** call to xOpen. This structure is an instance of a cursor. It
1862** is destroyed by xClose.
1863*/
1864typedef struct fulltext_cursor {
1865 sqlite3_vtab_cursor base; /* Base class used by SQLite core */
1866 QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
1867 sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
1868 int eof; /* True if at End Of Results */
1869 Query q; /* Parsed query string */
1870 Snippet snippet; /* Cached snippet for the current row */
1871 int iColumn; /* Column being searched */
1872 DataBuffer result; /* Doclist results from fulltextQuery */
1873 DLReader reader; /* Result reader if result not empty */
1874} fulltext_cursor;
1875
1876static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
1877 return (fulltext_vtab *) c->base.pVtab;
1878}
1879
1880static const sqlite3_module fts2Module; /* forward declaration */
1881
1882/* Return a dynamically generated statement of the form
1883 * insert into %_content (rowid, ...) values (?, ...)
1884 */
1885static const char *contentInsertStatement(fulltext_vtab *v){
1886 StringBuffer sb;
1887 int i;
1888
1889 initStringBuffer(&sb);
1890 append(&sb, "insert into %_content (rowid, ");
1891 appendList(&sb, v->nColumn, v->azContentColumn);
1892 append(&sb, ") values (?");
1893 for(i=0; i<v->nColumn; ++i)
1894 append(&sb, ", ?");
1895 append(&sb, ")");
1896 return stringBufferData(&sb);
1897}
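/* Editor's illustration: for a hypothetical table created with
** fts2(subject, body), this produces
**
**   insert into %_content (rowid, c0subject, c1body) values (?, ?, ?)
**
** with the % later expanded by string_format() inside sql_prepare().
*/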
1898
1899/* Return a dynamically generated statement of the form
1900 * update %_content set [col_0] = ?, [col_1] = ?, ...
1901 * where rowid = ?
1902 */
1903static const char *contentUpdateStatement(fulltext_vtab *v){
1904 StringBuffer sb;
1905 int i;
1906
1907 initStringBuffer(&sb);
1908 append(&sb, "update %_content set ");
1909 for(i=0; i<v->nColumn; ++i) {
1910 if( i>0 ){
1911 append(&sb, ", ");
1912 }
1913 append(&sb, v->azContentColumn[i]);
1914 append(&sb, " = ?");
1915 }
1916 append(&sb, " where rowid = ?");
1917 return stringBufferData(&sb);
1918}
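/* Editor's illustration: for the same hypothetical fts2(subject, body)
** table this produces
**
**   update %_content set c0subject = ?, c1body = ? where rowid = ?
*/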
1919
1920/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
1921** If the indicated statement has never been prepared, it is prepared
1922** and cached, otherwise the cached version is reset.
1923*/
1924static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
1925 sqlite3_stmt **ppStmt){
1926 assert( iStmt<MAX_STMT );
1927 if( v->pFulltextStatements[iStmt]==NULL ){
1928 const char *zStmt;
1929 int rc;
1930 switch( iStmt ){
1931 case CONTENT_INSERT_STMT:
1932 zStmt = contentInsertStatement(v); break;
1933 case CONTENT_UPDATE_STMT:
1934 zStmt = contentUpdateStatement(v); break;
1935 default:
1936 zStmt = fulltext_zStatement[iStmt];
1937 }
1938 rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
1939 zStmt);
1940 if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
1941 if( rc!=SQLITE_OK ) return rc;
1942 } else {
1943 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
1944 if( rc!=SQLITE_OK ) return rc;
1945 }
1946
1947 *ppStmt = v->pFulltextStatements[iStmt];
1948 return SQLITE_OK;
1949}
1950
1951/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
1952** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE,
1953** where we expect no results.
1954*/
1955static int sql_single_step(sqlite3_stmt *s){
1956 int rc = sqlite3_step(s);
1957 return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1958}
1959
1960/* Like sql_get_statement(), but for special replicated LEAF_SELECT
1961** statements.
1962*/
1963/* TODO(shess) Write version for generic statements and then share
1964** that between the cached-statement functions.
1965*/
1966static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
1967 sqlite3_stmt **ppStmt){
1968 assert( idx>=0 && idx<MERGE_COUNT );
1969 if( v->pLeafSelectStmts[idx]==NULL ){
1970 int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
1971 LEAF_SELECT);
1972 if( rc!=SQLITE_OK ) return rc;
1973 }else{
1974 int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
1975 if( rc!=SQLITE_OK ) return rc;
1976 }
1977
1978 *ppStmt = v->pLeafSelectStmts[idx];
1979 return SQLITE_OK;
1980}
1981
1982/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
1983static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
1984 sqlite3_value **pValues){
1985 sqlite3_stmt *s;
1986 int i;
1987 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
1988 if( rc!=SQLITE_OK ) return rc;
1989
1990 rc = sqlite3_bind_value(s, 1, rowid);
1991 if( rc!=SQLITE_OK ) return rc;
1992
1993 for(i=0; i<v->nColumn; ++i){
1994 rc = sqlite3_bind_value(s, 2+i, pValues[i]);
1995 if( rc!=SQLITE_OK ) return rc;
1996 }
1997
1998 return sql_single_step(s);
1999}
2000
2001/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
2002 * where rowid = [iRowid] */
2003static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
2004 sqlite_int64 iRowid){
2005 sqlite3_stmt *s;
2006 int i;
2007 int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
2008 if( rc!=SQLITE_OK ) return rc;
2009
2010 for(i=0; i<v->nColumn; ++i){
2011 rc = sqlite3_bind_value(s, 1+i, pValues[i]);
2012 if( rc!=SQLITE_OK ) return rc;
2013 }
2014
2015 rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
2016 if( rc!=SQLITE_OK ) return rc;
2017
2018 return sql_single_step(s);
2019}
2020
2021static void freeStringArray(int nString, const char **pString){
2022 int i;
2023
2024 for (i=0 ; i < nString ; ++i) {
2025 if( pString[i]!=NULL ) free((void *) pString[i]);
2026 }
2027 free((void *) pString);
2028}
2029
2030/* select * from %_content where rowid = [iRow]
2031 * The caller must free() the returned array and all strings in it.
2032 * null fields will be NULL in the returned array.
2033 *
2034 * TODO: Perhaps we should return pointer/length strings here for consistency
2035 * with other code which uses pointer/length. */
2036static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
2037 const char ***pValues){
2038 sqlite3_stmt *s;
2039 const char **values;
2040 int i;
2041 int rc;
2042
2043 *pValues = NULL;
2044
2045 rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
2046 if( rc!=SQLITE_OK ) return rc;
2047
2048 rc = sqlite3_bind_int64(s, 1, iRow);
2049 if( rc!=SQLITE_OK ) return rc;
2050
2051 rc = sqlite3_step(s);
2052 if( rc!=SQLITE_ROW ) return rc;
2053
2054 values = (const char **) malloc(v->nColumn * sizeof(const char *));
2055 for(i=0; i<v->nColumn; ++i){
2056 if( sqlite3_column_type(s, i)==SQLITE_NULL ){
2057 values[i] = NULL;
2058 }else{
2059 values[i] = string_dup((char*)sqlite3_column_text(s, i));
2060 }
2061 }
2062
2063 /* We expect only one row. We must execute another sqlite3_step()
2064 * to complete the iteration; otherwise the table will remain locked. */
2065 rc = sqlite3_step(s);
2066 if( rc==SQLITE_DONE ){
2067 *pValues = values;
2068 return SQLITE_OK;
2069 }
2070
2071 freeStringArray(v->nColumn, values);
2072 return rc;
2073}
2074
2075/* delete from %_content where rowid = [iRow] */
2076static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
2077 sqlite3_stmt *s;
2078 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
2079 if( rc!=SQLITE_OK ) return rc;
2080
2081 rc = sqlite3_bind_int64(s, 1, iRow);
2082 if( rc!=SQLITE_OK ) return rc;
2083
2084 return sql_single_step(s);
2085}
2086
2087/* insert into %_segments values ([pData])
2088** returns assigned rowid in *piBlockid
2089*/
2090static int block_insert(fulltext_vtab *v, const char *pData, int nData,
2091 sqlite_int64 *piBlockid){
2092 sqlite3_stmt *s;
2093 int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
2094 if( rc!=SQLITE_OK ) return rc;
2095
2096 rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
2097 if( rc!=SQLITE_OK ) return rc;
2098
2099 rc = sqlite3_step(s);
2100 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2101 if( rc!=SQLITE_DONE ) return rc;
2102
2103 *piBlockid = sqlite3_last_insert_rowid(v->db);
2104 return SQLITE_OK;
2105}
2106
2107/* delete from %_segments
2108** where rowid between [iStartBlockid] and [iEndBlockid]
2109**
2110** Deletes the inclusive range of blocks; this is used to delete all
2111** of the blocks which form a segment.
2112*/
2113static int block_delete(fulltext_vtab *v,
2114 sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
2115 sqlite3_stmt *s;
2116 int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
2117 if( rc!=SQLITE_OK ) return rc;
2118
2119 rc = sqlite3_bind_int64(s, 1, iStartBlockid);
2120 if( rc!=SQLITE_OK ) return rc;
2121
2122 rc = sqlite3_bind_int64(s, 2, iEndBlockid);
2123 if( rc!=SQLITE_OK ) return rc;
2124
2125 return sql_single_step(s);
2126}
2127
2128/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
2129** at iLevel. Returns SQLITE_DONE if there are no segments at
2130** iLevel. Otherwise returns an error.
2131*/
2132static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
2133 sqlite3_stmt *s;
2134 int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
2135 if( rc!=SQLITE_OK ) return rc;
2136
2137 rc = sqlite3_bind_int(s, 1, iLevel);
2138 if( rc!=SQLITE_OK ) return rc;
2139
2140 rc = sqlite3_step(s);
2141 /* Should always get at least one row due to how max() works. */
2142 if( rc==SQLITE_DONE ) return SQLITE_DONE;
2143 if( rc!=SQLITE_ROW ) return rc;
2144
2145 /* NULL means that there were no inputs to max(). */
2146 if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
2147 rc = sqlite3_step(s);
2148 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2149 return rc;
2150 }
2151
2152 *pidx = sqlite3_column_int(s, 0);
2153
2154 /* We expect only one row. We must execute another sqlite3_step()
2155 * to complete the iteration; otherwise the table will remain locked. */
2156 rc = sqlite3_step(s);
2157 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2158 if( rc!=SQLITE_DONE ) return rc;
2159 return SQLITE_ROW;
2160}
2161
2162/* insert into %_segdir values (
2163** [iLevel], [idx],
2164** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
2165** [pRootData]
2166** )
2167*/
2168static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
2169 sqlite_int64 iStartBlockid,
2170 sqlite_int64 iLeavesEndBlockid,
2171 sqlite_int64 iEndBlockid,
2172 const char *pRootData, int nRootData){
2173 sqlite3_stmt *s;
2174 int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
2175 if( rc!=SQLITE_OK ) return rc;
2176
2177 rc = sqlite3_bind_int(s, 1, iLevel);
2178 if( rc!=SQLITE_OK ) return rc;
2179
2180 rc = sqlite3_bind_int(s, 2, idx);
2181 if( rc!=SQLITE_OK ) return rc;
2182
2183 rc = sqlite3_bind_int64(s, 3, iStartBlockid);
2184 if( rc!=SQLITE_OK ) return rc;
2185
2186 rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
2187 if( rc!=SQLITE_OK ) return rc;
2188
2189 rc = sqlite3_bind_int64(s, 5, iEndBlockid);
2190 if( rc!=SQLITE_OK ) return rc;
2191
2192 rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
2193 if( rc!=SQLITE_OK ) return rc;
2194
2195 return sql_single_step(s);
2196}
2197
2198/* Queries %_segdir for the block span of the segments in level
2199** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel,
2200** SQLITE_ROW if there are blocks, else an error.
2201*/
2202static int segdir_span(fulltext_vtab *v, int iLevel,
2203 sqlite_int64 *piStartBlockid,
2204 sqlite_int64 *piEndBlockid){
2205 sqlite3_stmt *s;
2206 int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
2207 if( rc!=SQLITE_OK ) return rc;
2208
2209 rc = sqlite3_bind_int(s, 1, iLevel);
2210 if( rc!=SQLITE_OK ) return rc;
2211
2212 rc = sqlite3_step(s);
2213 if( rc==SQLITE_DONE ) return SQLITE_DONE; /* Should never happen */
2214 if( rc!=SQLITE_ROW ) return rc;
2215
2216 /* This happens if all segments at this level are entirely inline. */
2217 if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
2218 /* We expect only one row. We must execute another sqlite3_step()
2219 * to complete the iteration; otherwise the table will remain locked. */
2220 int rc2 = sqlite3_step(s);
2221 if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
2222 return rc2;
2223 }
2224
2225 *piStartBlockid = sqlite3_column_int64(s, 0);
2226 *piEndBlockid = sqlite3_column_int64(s, 1);
2227
2228 /* We expect only one row. We must execute another sqlite3_step()
2229 * to complete the iteration; otherwise the table will remain locked. */
2230 rc = sqlite3_step(s);
2231 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2232 if( rc!=SQLITE_DONE ) return rc;
2233 return SQLITE_ROW;
2234}
2235
2236/* Delete the segment blocks and segment directory records for all
2237** segments at iLevel.
2238*/
2239static int segdir_delete(fulltext_vtab *v, int iLevel){
2240 sqlite3_stmt *s;
2241 sqlite_int64 iStartBlockid, iEndBlockid;
2242 int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
2243 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
2244
2245 if( rc==SQLITE_ROW ){
2246 rc = block_delete(v, iStartBlockid, iEndBlockid);
2247 if( rc!=SQLITE_OK ) return rc;
2248 }
2249
2250 /* Delete the segment directory itself. */
2251 rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
2252 if( rc!=SQLITE_OK ) return rc;
2253
2254 rc = sqlite3_bind_int64(s, 1, iLevel);
2255 if( rc!=SQLITE_OK ) return rc;
2256
2257 return sql_single_step(s);
2258}
2259
2260/* TODO(shess) clearPendingTerms() is far down the file because
2261** writeZeroSegment() is far down the file because LeafWriter is far
2262** down the file. Consider refactoring the code to move the non-vtab
2263** code above the vtab code so that we don't need this forward
2264** reference.
2265*/
2266static int clearPendingTerms(fulltext_vtab *v);
2267
2268/*
2269** Free the memory used to contain a fulltext_vtab structure.
2270*/
2271static void fulltext_vtab_destroy(fulltext_vtab *v){
2272 int iStmt, i;
2273
2274 TRACE(("FTS2 Destroy %p\n", v));
2275 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
2276 if( v->pFulltextStatements[iStmt]!=NULL ){
2277 sqlite3_finalize(v->pFulltextStatements[iStmt]);
2278 v->pFulltextStatements[iStmt] = NULL;
2279 }
2280 }
2281
2282 for( i=0; i<MERGE_COUNT; i++ ){
2283 if( v->pLeafSelectStmts[i]!=NULL ){
2284 sqlite3_finalize(v->pLeafSelectStmts[i]);
2285 v->pLeafSelectStmts[i] = NULL;
2286 }
2287 }
2288
2289 if( v->pTokenizer!=NULL ){
2290 v->pTokenizer->pModule->xDestroy(v->pTokenizer);
2291 v->pTokenizer = NULL;
2292 }
2293
2294 clearPendingTerms(v);
2295
2296 free(v->azColumn);
2297 for(i = 0; i < v->nColumn; ++i) {
2298 sqlite3_free(v->azContentColumn[i]);
2299 }
2300 free(v->azContentColumn);
2301 free(v);
2302}
2303
2304/*
2305** Token types for parsing the arguments to xConnect or xCreate.
2306*/
2307#define TOKEN_EOF 0 /* End of file */
2308#define TOKEN_SPACE 1 /* Any kind of whitespace */
2309#define TOKEN_ID 2 /* An identifier */
2310#define TOKEN_STRING 3 /* A string literal */
2311#define TOKEN_PUNCT 4 /* A single punctuation character */
2312
2313/*
2314** If X is a character that can be used in an identifier then
2315** IdChar(X) will be true. Otherwise it is false.
2316**
2317** For ASCII, any character with the high-order bit set is
2318** allowed in an identifier. For 7-bit characters,
2319** isIdChar[X-0x20] must be 1.
2320**
2321** Ticket #1066: the SQL standard does not allow '$' in the
2322** middle of identifiers.  But many SQL implementations do.
2323** SQLite will allow '$' in identifiers for compatibility.
2324** But the feature is undocumented.
2325*/
2326static const char isIdChar[] = {
2327/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
2328 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
2329 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
2330 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
2331 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
2332 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
2333 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
2334};
2335#define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
2336
2337
2338/*
2339** Return the length of the token that begins at z[0].
2340** Store the token type in *tokenType before returning.
2341*/
2342static int getToken(const char *z, int *tokenType){
2343 int i, c;
2344 switch( *z ){
2345 case 0: {
2346 *tokenType = TOKEN_EOF;
2347 return 0;
2348 }
2349 case ' ': case '\t': case '\n': case '\f': case '\r': {
2350 for(i=1; safe_isspace(z[i]); i++){}
2351 *tokenType = TOKEN_SPACE;
2352 return i;
2353 }
2354 case '`':
2355 case '\'':
2356 case '"': {
2357 int delim = z[0];
2358 for(i=1; (c=z[i])!=0; i++){
2359 if( c==delim ){
2360 if( z[i+1]==delim ){
2361 i++;
2362 }else{
2363 break;
2364 }
2365 }
2366 }
2367 *tokenType = TOKEN_STRING;
2368 return i + (c!=0);
2369 }
2370 case '[': {
2371 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
2372 *tokenType = TOKEN_ID;
2373 return i;
2374 }
2375 default: {
2376 if( !IdChar(*z) ){
2377 break;
2378 }
2379 for(i=1; IdChar(z[i]); i++){}
2380 *tokenType = TOKEN_ID;
2381 return i;
2382 }
2383 }
2384 *tokenType = TOKEN_PUNCT;
2385 return 1;
2386}
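/* Editor's examples (illustrative):
**
**   int t;
**   getToken("abc def", &t);    returns 3, t==TOKEN_ID
**   getToken("'it''s'", &t);    returns 7, t==TOKEN_STRING
**   getToken(", rest", &t);     returns 1, t==TOKEN_PUNCT
*/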
2387
2388/*
2389** A token extracted from a string is an instance of the following
2390** structure.
2391*/
2392typedef struct Token {
2393 const char *z; /* Pointer to token text. Not '\000' terminated */
2394 short int n; /* Length of the token text in bytes. */
2395} Token;
2396
2397/*
2398** Given an input string (which is really one of the argv[] parameters
2399** passed into xConnect or xCreate) split the string up into tokens.
2400** Return an array of pointers to '\000' terminated strings, one string
2401** for each non-whitespace token.
2402**
2403** The returned array is terminated by a single NULL pointer.
2404**
2405** Space to hold the returned array is obtained from a single
2406** malloc and should be freed by passing the return value to free().
2407** The individual strings within the token list are all a part of
2408** the single memory allocation and will all be freed at once.
2409*/
2410static char **tokenizeString(const char *z, int *pnToken){
2411 int nToken = 0;
2412  Token *aToken = malloc( (strlen(z)+1) * sizeof(aToken[0]) ); /* +1: the EOF token is recorded below */
2413 int n = 1;
2414 int e, i;
2415 int totalSize = 0;
2416 char **azToken;
2417 char *zCopy;
2418 while( n>0 ){
2419 n = getToken(z, &e);
2420 if( e!=TOKEN_SPACE ){
2421 aToken[nToken].z = z;
2422 aToken[nToken].n = n;
2423 nToken++;
2424 totalSize += n+1;
2425 }
2426 z += n;
2427 }
2428 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
2429 zCopy = (char*)&azToken[nToken];
2430 nToken--;
2431 for(i=0; i<nToken; i++){
2432 azToken[i] = zCopy;
2433 n = aToken[i].n;
2434 memcpy(zCopy, aToken[i].z, n);
2435 zCopy[n] = 0;
2436 zCopy += n+1;
2437 }
2438 azToken[nToken] = 0;
2439 free(aToken);
2440 *pnToken = nToken;
2441 return azToken;
2442}
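/* Editor's example (illustrative): tokenizeString("porter ( 'arg' )", &n)
** returns {"porter", "(", "'arg'", ")", NULL} with n==4; quotes are left
** in place here and removed later by tokenListToIdList().
*/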
2443
2444/*
2445** Convert an SQL-style quoted string into a normal string by removing
2446** the quote characters. The conversion is done in-place. If the
2447** input does not begin with a quote character, then this routine
2448** is a no-op.
2449**
2450** Examples:
2451**
2452** "abc" becomes abc
2453** 'xyz' becomes xyz
2454** [pqr] becomes pqr
2455** `mno` becomes mno
2456*/
2457static void dequoteString(char *z){
2458 int quote;
2459 int i, j;
2460 if( z==0 ) return;
2461 quote = z[0];
2462 switch( quote ){
2463 case '\'': break;
2464 case '"': break;
2465 case '`': break; /* For MySQL compatibility */
2466 case '[': quote = ']'; break; /* For MS SqlServer compatibility */
2467 default: return;
2468 }
2469 for(i=1, j=0; z[i]; i++){
2470 if( z[i]==quote ){
2471 if( z[i+1]==quote ){
2472 z[j++] = quote;
2473 i++;
2474 }else{
2475 z[j++] = 0;
2476 break;
2477 }
2478 }else{
2479 z[j++] = z[i];
2480 }
2481 }
2482}
2483
2484/*
2485** The input azIn is a NULL-terminated list of tokens. Remove the first
2486** token and all punctuation tokens. Remove the quotes from
2487** around string literal tokens.
2488**
2489** Example:
2490**
2491**     input:      tokenize chinese ( 'simplified' , 'mixed' )
2492**     output:     chinese simplified mixed
2493**
2494** Another example:
2495**
2496** input: delimiters ( '[' , ']' , '...' )
2497** output: [ ] ...
2498*/
2499static void tokenListToIdList(char **azIn){
2500 int i, j;
2501 if( azIn ){
2502 for(i=0, j=-1; azIn[i]; i++){
2503 if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
2504 dequoteString(azIn[i]);
2505 if( j>=0 ){
2506 azIn[j] = azIn[i];
2507 }
2508 j++;
2509 }
2510 }
2511 azIn[j] = 0;
2512 }
2513}
2514
2515
2516/*
2517** Find the first alphanumeric token in the string zIn. Null-terminate
2518** this token, remove any quotation marks, and return a pointer to
2519** the result.
2520*/
2521static char *firstToken(char *zIn, char **pzTail){
2522 int n, ttype;
2523 while(1){
2524 n = getToken(zIn, &ttype);
2525 if( ttype==TOKEN_SPACE ){
2526 zIn += n;
2527 }else if( ttype==TOKEN_EOF ){
2528 *pzTail = zIn;
2529 return 0;
2530 }else{
2531 zIn[n] = 0;
2532 *pzTail = &zIn[1];
2533 dequoteString(zIn);
2534 return zIn;
2535 }
2536 }
2537 /*NOTREACHED*/
2538}
2539
2540/* Return true if...
2541**
2542** * s begins with the string t, ignoring case
2543** * s is longer than t
2544** *  The first character of s beyond t is not alphanumeric
2545**
2546** Ignore leading space in *s.
2547**
2548** To put it another way, return true if the first token of
2549** s[] is t[].
2550*/
2551static int startsWith(const char *s, const char *t){
2552 while( safe_isspace(*s) ){ s++; }
2553 while( *t ){
2554 if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
2555 }
2556 return *s!='_' && !safe_isalnum(*s);
2557}
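/* Editor's example (illustrative): startsWith(" Tokenize porter",
** "tokenize") is true, while startsWith("tokenizer", "tokenize") is
** false because the trailing 'r' is alphanumeric.
*/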
2558
2559/*
2560** An instance of this structure defines the "spec" of a
2561** full text index. This structure is populated by parseSpec
2562** and used by fulltextConnect and fulltextCreate.
2563*/
2564typedef struct TableSpec {
2565 const char *zDb; /* Logical database name */
2566 const char *zName; /* Name of the full-text index */
2567 int nColumn; /* Number of columns to be indexed */
2568 char **azColumn; /* Original names of columns to be indexed */
2569 char **azContentColumn; /* Column names for %_content */
2570 char **azTokenizer; /* Name of tokenizer and its arguments */
2571} TableSpec;
2572
2573/*
2574** Reclaim all of the memory used by a TableSpec
2575*/
2576static void clearTableSpec(TableSpec *p) {
2577 free(p->azColumn);
2578 free(p->azContentColumn);
2579 free(p->azTokenizer);
2580}
2581
2582/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
2583 *
2584 * CREATE VIRTUAL TABLE email
2585 * USING fts2(subject, body, tokenize mytokenizer(myarg))
2586 *
2587 * We return parsed information in a TableSpec structure.
2588 *
2589 */
2590static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
2591 char**pzErr){
2592 int i, n;
2593 char *z, *zDummy;
2594 char **azArg;
2595 const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
2596
2597 assert( argc>=3 );
2598 /* Current interface:
2599 ** argv[0] - module name
2600 ** argv[1] - database name
2601 ** argv[2] - table name
2602 ** argv[3..] - columns, optionally followed by tokenizer specification
2603 ** and snippet delimiters specification.
2604 */
2605
2606 /* Make a copy of the complete argv[][] array in a single allocation.
2607 ** The argv[][] array is read-only and transient. We can write to the
2608 ** copy in order to modify things and the copy is persistent.
2609 */
2610 CLEAR(pSpec);
2611 for(i=n=0; i<argc; i++){
2612 n += strlen(argv[i]) + 1;
2613 }
2614 azArg = malloc( sizeof(char*)*argc + n );
2615 if( azArg==0 ){
2616 return SQLITE_NOMEM;
2617 }
2618 z = (char*)&azArg[argc];
2619 for(i=0; i<argc; i++){
2620 azArg[i] = z;
2621 strcpy(z, argv[i]);
2622 z += strlen(z)+1;
2623 }
2624
2625 /* Identify the column names and the tokenizer and delimiter arguments
2626 ** in the argv[][] array.
2627 */
2628 pSpec->zDb = azArg[1];
2629 pSpec->zName = azArg[2];
2630 pSpec->nColumn = 0;
2631 pSpec->azColumn = azArg;
2632 zTokenizer = "tokenize simple";
2633 for(i=3; i<argc; ++i){
2634 if( startsWith(azArg[i],"tokenize") ){
2635 zTokenizer = azArg[i];
2636 }else{
2637 z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
2638 pSpec->nColumn++;
2639 }
2640 }
2641 if( pSpec->nColumn==0 ){
2642 azArg[0] = "content";
2643 pSpec->nColumn = 1;
2644 }
2645
2646 /*
2647 ** Construct the list of content column names.
2648 **
2649 ** Each content column name will be of the form cNNAAAA
2650 ** where NN is the column number and AAAA is the sanitized
2651 ** column name. "sanitized" means that special characters are
2652 ** converted to "_". The cNN prefix guarantees that all column
2653 ** names are unique.
2654 **
2655 ** The AAAA suffix is not strictly necessary. It is included
2656 ** for the convenience of people who might examine the generated
2657 ** %_content table and wonder what the columns are used for.
2658 */
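  /* Editor's example (illustrative): for fts2(subject, "user id") the
  ** generated content columns are c0subject and c1user_id.
  */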
2659 pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
2660 if( pSpec->azContentColumn==0 ){
2661 clearTableSpec(pSpec);
2662 return SQLITE_NOMEM;
2663 }
2664 for(i=0; i<pSpec->nColumn; i++){
2665 char *p;
2666 pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
2667 for (p = pSpec->azContentColumn[i]; *p ; ++p) {
2668 if( !safe_isalnum(*p) ) *p = '_';
2669 }
2670 }
2671
2672 /*
2673 ** Parse the tokenizer specification string.
2674 */
2675 pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
2676 tokenListToIdList(pSpec->azTokenizer);
2677
2678 return SQLITE_OK;
2679}
2680
2681/*
2682** Generate a CREATE TABLE statement that describes the schema of
2683** the virtual table. Return a pointer to this schema string.
2684**
2685** Space is obtained from sqlite3_mprintf() and should be freed
2686** using sqlite3_free().
2687*/
2688static char *fulltextSchema(
2689 int nColumn, /* Number of columns */
2690 const char *const* azColumn, /* List of columns */
2691 const char *zTableName /* Name of the table */
2692){
2693 int i;
2694 char *zSchema, *zNext;
2695 const char *zSep = "(";
2696 zSchema = sqlite3_mprintf("CREATE TABLE x");
2697 for(i=0; i<nColumn; i++){
2698 zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
2699 sqlite3_free(zSchema);
2700 zSchema = zNext;
2701 zSep = ",";
2702 }
2703 zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
2704 sqlite3_free(zSchema);
2705 return zNext;
2706}
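/* Editor's example (illustrative): fulltextSchema(2, {"subject","body"},
** "email") returns
**
**   CREATE TABLE x('subject','body','email')
**
** The trailing column, named after the table itself, is the one matched
** by [email MATCH ...] expressions against the whole table.
*/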
2707
2708/*
2709** Build a new sqlite3_vtab structure that will describe the
2710** fulltext index defined by spec.
2711*/
2712static int constructVtab(
2713 sqlite3 *db, /* The SQLite database connection */
2714 fts2Hash *pHash, /* Hash table containing tokenizers */
2715 TableSpec *spec, /* Parsed spec information from parseSpec() */
2716 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
2717 char **pzErr /* Write any error message here */
2718){
2719 int rc;
2720 int n;
2721 fulltext_vtab *v = 0;
2722 const sqlite3_tokenizer_module *m = NULL;
2723 char *schema;
2724
2725 char const *zTok; /* Name of tokenizer to use for this fts table */
2726 int nTok; /* Length of zTok, including nul terminator */
2727
2728 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
2729 if( v==0 ) return SQLITE_NOMEM;
2730 CLEAR(v);
2731 /* sqlite will initialize v->base */
2732 v->db = db;
2733 v->zDb = spec->zDb; /* Freed when azColumn is freed */
2734 v->zName = spec->zName; /* Freed when azColumn is freed */
2735 v->nColumn = spec->nColumn;
2736 v->azContentColumn = spec->azContentColumn;
2737 spec->azContentColumn = 0;
2738 v->azColumn = spec->azColumn;
2739 spec->azColumn = 0;
2740
2741 if( spec->azTokenizer==0 ){
2742 return SQLITE_NOMEM;
2743 }
2744
2745 zTok = spec->azTokenizer[0];
2746 if( !zTok ){
2747 zTok = "simple";
2748 }
2749 nTok = strlen(zTok)+1;
2750
2751 m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok);
2752 if( !m ){
2753 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
2754 rc = SQLITE_ERROR;
2755 goto err;
2756 }
2757
2758 for(n=0; spec->azTokenizer[n]; n++){}
2759 if( n ){
2760 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
2761 &v->pTokenizer);
2762 }else{
2763 rc = m->xCreate(0, 0, &v->pTokenizer);
2764 }
2765 if( rc!=SQLITE_OK ) goto err;
2766 v->pTokenizer->pModule = m;
2767
2768 /* TODO: verify the existence of backing tables foo_content, foo_term */
2769
2770 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
2771 spec->zName);
2772 rc = sqlite3_declare_vtab(db, schema);
2773 sqlite3_free(schema);
2774 if( rc!=SQLITE_OK ) goto err;
2775
2776 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
2777
2778 /* Indicate that the buffer is not live. */
2779 v->nPendingData = -1;
2780
2781 *ppVTab = &v->base;
2782 TRACE(("FTS2 Connect %p\n", v));
2783
2784 return rc;
2785
2786err:
2787 fulltext_vtab_destroy(v);
2788 return rc;
2789}
2790
2791static int fulltextConnect(
2792 sqlite3 *db,
2793 void *pAux,
2794 int argc, const char *const*argv,
2795 sqlite3_vtab **ppVTab,
2796 char **pzErr
2797){
2798 TableSpec spec;
2799 int rc = parseSpec(&spec, argc, argv, pzErr);
2800 if( rc!=SQLITE_OK ) return rc;
2801
2802 rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
2803 clearTableSpec(&spec);
2804 return rc;
2805}
2806
2807/* The %_content table holds the text of each document, with
2808** the rowid used as the docid.
2809*/
2810/* TODO(shess) This comment needs elaboration to match the updated
2811** code. Work it into the top-of-file comment at that time.
2812*/
2813static int fulltextCreate(sqlite3 *db, void *pAux,
2814 int argc, const char * const *argv,
2815 sqlite3_vtab **ppVTab, char **pzErr){
2816 int rc;
2817 TableSpec spec;
2818 StringBuffer schema;
2819 TRACE(("FTS2 Create\n"));
2820
2821 rc = parseSpec(&spec, argc, argv, pzErr);
2822 if( rc!=SQLITE_OK ) return rc;
2823
2824 initStringBuffer(&schema);
2825 append(&schema, "CREATE TABLE %_content(");
2826 appendList(&schema, spec.nColumn, spec.azContentColumn);
2827 append(&schema, ")");
2828 rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
2829 stringBufferDestroy(&schema);
2830 if( rc!=SQLITE_OK ) goto out;
2831
2832 rc = sql_exec(db, spec.zDb, spec.zName,
2833 "create table %_segments(block blob);");
2834 if( rc!=SQLITE_OK ) goto out;
2835
2836 rc = sql_exec(db, spec.zDb, spec.zName,
2837 "create table %_segdir("
2838 " level integer,"
2839 " idx integer,"
2840 " start_block integer,"
2841 " leaves_end_block integer,"
2842 " end_block integer,"
2843 " root blob,"
2844 " primary key(level, idx)"
2845 ");");
2846 if( rc!=SQLITE_OK ) goto out;
2847
2848 rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
2849
2850out:
2851 clearTableSpec(&spec);
2852 return rc;
2853}
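/* Editor's example (illustrative): CREATE VIRTUAL TABLE email USING
** fts2(subject, body) in database main runs the statements above to
** create main.email_content, main.email_segments and main.email_segdir.
*/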
2854
2855/* Decide how to handle an SQL query. */
2856static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2857 int i;
2858 TRACE(("FTS2 BestIndex\n"));
2859
2860 for(i=0; i<pInfo->nConstraint; ++i){
2861 const struct sqlite3_index_constraint *pConstraint;
2862 pConstraint = &pInfo->aConstraint[i];
2863 if( pConstraint->usable ) {
2864 if( pConstraint->iColumn==-1 &&
2865 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2866 pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
2867 TRACE(("FTS2 QUERY_ROWID\n"));
2868 } else if( pConstraint->iColumn>=0 &&
2869 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2870 /* full-text search */
2871 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2872 TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2873 } else continue;
2874
2875 pInfo->aConstraintUsage[i].argvIndex = 1;
2876 pInfo->aConstraintUsage[i].omit = 1;
2877
2878 /* An arbitrary value for now.
2879 * TODO: Perhaps rowid matches should be considered cheaper than
2880 * full-text searches. */
2881 pInfo->estimatedCost = 1.0;
2882
2883 return SQLITE_OK;
2884 }
2885 }
2886 pInfo->idxNum = QUERY_GENERIC;
2887 return SQLITE_OK;
2888}
2889
2890static int fulltextDisconnect(sqlite3_vtab *pVTab){
2891 TRACE(("FTS2 Disconnect %p\n", pVTab));
2892 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2893 return SQLITE_OK;
2894}
2895
2896static int fulltextDestroy(sqlite3_vtab *pVTab){
2897 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2898 int rc;
2899
2900 TRACE(("FTS2 Destroy %p\n", pVTab));
2901 rc = sql_exec(v->db, v->zDb, v->zName,
2902 "drop table if exists %_content;"
2903 "drop table if exists %_segments;"
2904 "drop table if exists %_segdir;"
2905 );
2906 if( rc!=SQLITE_OK ) return rc;
2907
2908 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2909 return SQLITE_OK;
2910}
2911
2912static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2913 fulltext_cursor *c;
2914
2915  c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
  if( c==0 ) return SQLITE_NOMEM;  /* avoid NULL dereference on OOM */
2916 /* sqlite will initialize c->base */
2917 *ppCursor = &c->base;
2918 TRACE(("FTS2 Open %p: %p\n", pVTab, c));
2919
2920 return SQLITE_OK;
2921}
2922
2923
2924/* Free all of the dynamically allocated memory held by *q
2925*/
2926static void queryClear(Query *q){
2927 int i;
2928 for(i = 0; i < q->nTerms; ++i){
2929 free(q->pTerms[i].pTerm);
2930 }
2931 free(q->pTerms);
2932 CLEAR(q);
2933}
2934
2935/* Free all of the dynamically allocated memory held by the
2936** Snippet
2937*/
2938static void snippetClear(Snippet *p){
2939 free(p->aMatch);
2940 free(p->zOffset);
2941 free(p->zSnippet);
2942 CLEAR(p);
2943}
2944/*
2945** Append a single entry to the p->aMatch[] log.
2946*/
2947static void snippetAppendMatch(
2948 Snippet *p, /* Append the entry to this snippet */
2949 int iCol, int iTerm, /* The column and query term */
2950 int iStart, int nByte /* Offset and size of the match */
2951){
2952 int i;
2953 struct snippetMatch *pMatch;
2954 if( p->nMatch+1>=p->nAlloc ){
2955 p->nAlloc = p->nAlloc*2 + 10;
2956 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2957 if( p->aMatch==0 ){
2958 p->nMatch = 0;
2959 p->nAlloc = 0;
2960 return;
2961 }
2962 }
2963 i = p->nMatch++;
2964 pMatch = &p->aMatch[i];
2965 pMatch->iCol = iCol;
2966 pMatch->iTerm = iTerm;
2967 pMatch->iStart = iStart;
2968 pMatch->nByte = nByte;
2969}
2970
2971/*
2972** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
2973*/
2974#define FTS2_ROTOR_SZ (32)
2975#define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
2976
2977/*
2978** Add entries to pSnippet->aMatch[] for every match that occurs against
2979** document zDoc[0..nDoc-1] which is stored in column iColumn.
2980*/
2981static void snippetOffsetsOfColumn(
2982 Query *pQuery,
2983 Snippet *pSnippet,
2984 int iColumn,
2985 const char *zDoc,
2986 int nDoc
2987){
2988 const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
2989 sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
2990 sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
2991 fulltext_vtab *pVtab; /* The full text index */
2992 int nColumn; /* Number of columns in the index */
2993 const QueryTerm *aTerm; /* Query string terms */
2994 int nTerm; /* Number of query string terms */
2995 int i, j; /* Loop counters */
2996 int rc; /* Return code */
2997 unsigned int match, prevMatch; /* Phrase search bitmasks */
2998 const char *zToken; /* Next token from the tokenizer */
2999 int nToken; /* Size of zToken */
3000 int iBegin, iEnd, iPos; /* Offsets of beginning and end */
3001
3002 /* The following variables keep a circular buffer of the last
3003 ** few tokens */
3004 unsigned int iRotor = 0; /* Index of current token */
3005 int iRotorBegin[FTS2_ROTOR_SZ]; /* Beginning offset of token */
3006 int iRotorLen[FTS2_ROTOR_SZ]; /* Length of token */
3007
3008 pVtab = pQuery->pFts;
3009 nColumn = pVtab->nColumn;
3010 pTokenizer = pVtab->pTokenizer;
3011 pTModule = pTokenizer->pModule;
3012 rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
3013 if( rc ) return;
3014 pTCursor->pTokenizer = pTokenizer;
3015 aTerm = pQuery->pTerms;
3016 nTerm = pQuery->nTerms;
3017 if( nTerm>=FTS2_ROTOR_SZ ){
3018 nTerm = FTS2_ROTOR_SZ - 1;
3019 }
3020 prevMatch = 0;
3021 while(1){
3022 rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
3023 if( rc ) break;
3024 iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
3025 iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
3026 match = 0;
3027 for(i=0; i<nTerm; i++){
3028 int iCol;
3029 iCol = aTerm[i].iColumn;
3030 if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
3031 if( aTerm[i].nTerm>nToken ) continue;
3032 if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
3033 assert( aTerm[i].nTerm<=nToken );
3034 if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
3035 if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
3036 match |= 1<<i;
3037 if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
3038 for(j=aTerm[i].iPhrase-1; j>=0; j--){
3039 int k = (iRotor-j) & FTS2_ROTOR_MASK;
3040 snippetAppendMatch(pSnippet, iColumn, i-j,
3041 iRotorBegin[k], iRotorLen[k]);
3042 }
3043 }
3044 }
3045 prevMatch = match<<1;
3046 iRotor++;
3047 }
3048 pTModule->xClose(pTCursor);
3049}
3050
3051
3052/*
3053** Compute all offsets for the current row of the query.
3054** If the offsets have already been computed, this routine is a no-op.
3055*/
3056static void snippetAllOffsets(fulltext_cursor *p){
3057 int nColumn;
3058 int iColumn, i;
3059 int iFirst, iLast;
3060 fulltext_vtab *pFts;
3061
3062 if( p->snippet.nMatch ) return;
3063 if( p->q.nTerms==0 ) return;
3064 pFts = p->q.pFts;
3065 nColumn = pFts->nColumn;
3066 iColumn = (p->iCursorType - QUERY_FULLTEXT);
3067 if( iColumn<0 || iColumn>=nColumn ){
3068 iFirst = 0;
3069 iLast = nColumn-1;
3070 }else{
3071 iFirst = iColumn;
3072 iLast = iColumn;
3073 }
3074 for(i=iFirst; i<=iLast; i++){
3075 const char *zDoc;
3076 int nDoc;
3077 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
3078 nDoc = sqlite3_column_bytes(p->pStmt, i+1);
3079 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
3080 }
3081}
3082
3083/*
3084** Convert the information in the aMatch[] array of the snippet
3085** into the string zOffset[0..nOffset-1].
3086*/
3087static void snippetOffsetText(Snippet *p){
3088 int i;
3089 int cnt = 0;
3090 StringBuffer sb;
3091 char zBuf[200];
3092 if( p->zOffset ) return;
3093 initStringBuffer(&sb);
3094 for(i=0; i<p->nMatch; i++){
3095 struct snippetMatch *pMatch = &p->aMatch[i];
3096 zBuf[0] = ' ';
3097 sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
3098 pMatch->iTerm, pMatch->iStart, pMatch->nByte);
3099 append(&sb, zBuf);
3100 cnt++;
3101 }
3102 p->zOffset = stringBufferData(&sb);
3103 p->nOffset = stringBufferLength(&sb);
3104}
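/* Editor's example (illustrative): two matches might render as
**
**   "0 0 6 3 1 0 19 3"
**
** i.e. iCol, iTerm, iStart and nByte for each match, space-separated;
** this is the string ultimately returned by the offsets() SQL function.
*/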
3105
3106/*
3107** zDoc[0..nDoc-1] is a phrase of text.  aMatch[0..nMatch-1] is a set
3108** of matching words, some of which might be in zDoc.  zDoc is column
3109** number iCol.
3110**
3111** iBreak is the suggested spot in zDoc where we could begin or end an
3112** excerpt. Return a value similar to iBreak but possibly adjusted
3113** to be a little left or right so that the break point is better.
3114*/
3115static int wordBoundary(
3116 int iBreak, /* The suggested break point */
3117 const char *zDoc, /* Document text */
3118 int nDoc, /* Number of bytes in zDoc[] */
3119 struct snippetMatch *aMatch, /* Matching words */
3120 int nMatch, /* Number of entries in aMatch[] */
3121 int iCol /* The column number for zDoc[] */
3122){
3123 int i;
3124 if( iBreak<=10 ){
3125 return 0;
3126 }
3127 if( iBreak>=nDoc-10 ){
3128 return nDoc;
3129 }
3130 for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
3131 while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
3132 if( i<nMatch ){
3133 if( aMatch[i].iStart<iBreak+10 ){
3134 return aMatch[i].iStart;
3135 }
3136 if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
3137 return aMatch[i-1].iStart;
3138 }
3139 }
3140 for(i=1; i<=10; i++){
3141 if( safe_isspace(zDoc[iBreak-i]) ){
3142 return iBreak - i + 1;
3143 }
3144 if( safe_isspace(zDoc[iBreak+i]) ){
3145 return iBreak + i + 1;
3146 }
3147 }
3148 return iBreak;
3149}
3150
3151
3152
3153/*
3154** Allowed values for Snippet.aMatch[].snStatus
3155*/
3156#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
3157#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
3158
3159/*
3160** Generate the text of a snippet.
3161*/
3162static void snippetText(
3163 fulltext_cursor *pCursor, /* The cursor we need the snippet for */
3164 const char *zStartMark, /* Markup to appear before each match */
3165 const char *zEndMark, /* Markup to appear after each match */
3166 const char *zEllipsis /* Ellipsis mark */
3167){
3168 int i, j;
3169 struct snippetMatch *aMatch;
3170 int nMatch;
3171 int nDesired;
3172 StringBuffer sb;
3173 int tailCol;
3174 int tailOffset;
3175 int iCol;
3176 int nDoc;
3177 const char *zDoc;
3178 int iStart, iEnd;
3179 int tailEllipsis = 0;
3180 int iMatch;
3181
3182
3183 free(pCursor->snippet.zSnippet);
3184 pCursor->snippet.zSnippet = 0;
3185 aMatch = pCursor->snippet.aMatch;
3186 nMatch = pCursor->snippet.nMatch;
3187 initStringBuffer(&sb);
3188
3189 for(i=0; i<nMatch; i++){
3190 aMatch[i].snStatus = SNIPPET_IGNORE;
3191 }
3192 nDesired = 0;
3193 for(i=0; i<pCursor->q.nTerms; i++){
3194 for(j=0; j<nMatch; j++){
3195 if( aMatch[j].iTerm==i ){
3196 aMatch[j].snStatus = SNIPPET_DESIRED;
3197 nDesired++;
3198 break;
3199 }
3200 }
3201 }
3202
3203 iMatch = 0;
3204 tailCol = -1;
3205 tailOffset = 0;
3206 for(i=0; i<nMatch && nDesired>0; i++){
3207 if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
3208 nDesired--;
3209 iCol = aMatch[i].iCol;
3210 zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
3211 nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
3212 iStart = aMatch[i].iStart - 40;
3213 iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
3214 if( iStart<=10 ){
3215 iStart = 0;
3216 }
3217 if( iCol==tailCol && iStart<=tailOffset+20 ){
3218 iStart = tailOffset;
3219 }
3220 if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
3221 trimWhiteSpace(&sb);
3222 appendWhiteSpace(&sb);
3223 append(&sb, zEllipsis);
3224 appendWhiteSpace(&sb);
3225 }
3226 iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
3227 iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
3228 if( iEnd>=nDoc-10 ){
3229 iEnd = nDoc;
3230 tailEllipsis = 0;
3231 }else{
3232 tailEllipsis = 1;
3233 }
3234 while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
3235 while( iStart<iEnd ){
3236 while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
3237 && aMatch[iMatch].iCol<=iCol ){
3238 iMatch++;
3239 }
3240 if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
3241 && aMatch[iMatch].iCol==iCol ){
3242 nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
3243 iStart = aMatch[iMatch].iStart;
3244 append(&sb, zStartMark);
3245 nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
3246 append(&sb, zEndMark);
3247 iStart += aMatch[iMatch].nByte;
3248 for(j=iMatch+1; j<nMatch; j++){
3249 if( aMatch[j].iTerm==aMatch[iMatch].iTerm
3250 && aMatch[j].snStatus==SNIPPET_DESIRED ){
3251 nDesired--;
3252 aMatch[j].snStatus = SNIPPET_IGNORE;
3253 }
3254 }
3255 }else{
3256 nappend(&sb, &zDoc[iStart], iEnd - iStart);
3257 iStart = iEnd;
3258 }
3259 }
3260 tailCol = iCol;
3261 tailOffset = iEnd;
3262 }
3263 trimWhiteSpace(&sb);
3264 if( tailEllipsis ){
3265 appendWhiteSpace(&sb);
3266 append(&sb, zEllipsis);
3267 }
3268 pCursor->snippet.zSnippet = stringBufferData(&sb);
3269 pCursor->snippet.nSnippet = stringBufferLength(&sb);
3270}
3271
3272
3273/*
3274** Close the cursor. For additional information see the documentation
3275** on the xClose method of the virtual table interface.
3276*/
3277static int fulltextClose(sqlite3_vtab_cursor *pCursor){
3278 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3279 TRACE(("FTS2 Close %p\n", c));
3280 sqlite3_finalize(c->pStmt);
3281 queryClear(&c->q);
3282 snippetClear(&c->snippet);
3283 if( c->result.nData!=0 ) dlrDestroy(&c->reader);
3284 dataBufferDestroy(&c->result);
3285 free(c);
3286 return SQLITE_OK;
3287}
3288
3289static int fulltextNext(sqlite3_vtab_cursor *pCursor){
3290 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3291 int rc;
3292
3293 TRACE(("FTS2 Next %p\n", pCursor));
3294 snippetClear(&c->snippet);
3295 if( c->iCursorType < QUERY_FULLTEXT ){
3296 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
3297 rc = sqlite3_step(c->pStmt);
3298 switch( rc ){
3299 case SQLITE_ROW:
3300 c->eof = 0;
3301 return SQLITE_OK;
3302 case SQLITE_DONE:
3303 c->eof = 1;
3304 return SQLITE_OK;
3305 default:
3306 c->eof = 1;
3307 return rc;
3308 }
3309 } else { /* full-text query */
3310 rc = sqlite3_reset(c->pStmt);
3311 if( rc!=SQLITE_OK ) return rc;
3312
3313 if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
3314 c->eof = 1;
3315 return SQLITE_OK;
3316 }
3317 rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
3318 dlrStep(&c->reader);
3319 if( rc!=SQLITE_OK ) return rc;
3320 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
3321 rc = sqlite3_step(c->pStmt);
3322 if( rc==SQLITE_ROW ){ /* the case we expect */
3323 c->eof = 0;
3324 return SQLITE_OK;
3325 }
3326 /* an error occurred; abort */
3327 return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
3328 }
3329}
3330
3331
3332/* TODO(shess) If we pushed LeafReader to the top of the file, or to
3333** another file, term_select() could be pushed above
3334** docListOfTerm().
3335*/
3336static int termSelect(fulltext_vtab *v, int iColumn,
3337 const char *pTerm, int nTerm, int isPrefix,
3338 DocListType iType, DataBuffer *out);
3339
3340/* Return a DocList corresponding to the query term *pQTerm.  If *pQTerm
3341** is the first term of a phrase query, go ahead and evaluate the phrase
3342** query and return the doclist for the entire phrase query.
3343**
3344** The resulting DL_DOCIDS doclist is stored in pResult, which is
3345** overwritten.
3346*/
3347static int docListOfTerm(
3348 fulltext_vtab *v, /* The full text index */
3349 int iColumn, /* column to restrict to. No restriction if >=nColumn */
3350 QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
3351 DataBuffer *pResult /* Write the result here */
3352){
3353 DataBuffer left, right, new;
3354 int i, rc;
3355
3356 /* No phrase search if no position info. */
3357 assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );
3358
3359 /* This code should never be called with buffered updates. */
3360 assert( v->nPendingData<0 );
3361
3362 dataBufferInit(&left, 0);
3363 rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
3364 0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
3365 if( rc ) return rc;
3366 for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
3367 dataBufferInit(&right, 0);
3368 rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
3369 pQTerm[i].isPrefix, DL_POSITIONS, &right);
3370 if( rc ){
3371 dataBufferDestroy(&left);
3372 return rc;
3373 }
3374 dataBufferInit(&new, 0);
3375 docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
3376 i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
3377 dataBufferDestroy(&left);
3378 dataBufferDestroy(&right);
3379 left = new;
3380 }
3381 *pResult = left;
3382 return SQLITE_OK;
3383}
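/* Editor's note (illustrative): for the phrase ["new york"], the loop
** above first fetches DL_POSITIONS doclists for "new" and "york"; then
** docListPhraseMerge() keeps only docids where "york" appears at the
** position immediately after "new", emitting DL_DOCIDS on the last pass.
*/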
3384
3385/* Add a new term pTerm[0..nTerm-1] to the query *q.
3386*/
3387static void queryAdd(Query *q, const char *pTerm, int nTerm){
3388 QueryTerm *t;
3389 ++q->nTerms;
3390 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
3391 if( q->pTerms==0 ){
3392 q->nTerms = 0;
3393 return;
3394 }
3395 t = &q->pTerms[q->nTerms - 1];
3396 CLEAR(t);
3397 t->pTerm = malloc(nTerm+1);
3398 memcpy(t->pTerm, pTerm, nTerm);
3399 t->pTerm[nTerm] = 0;
3400 t->nTerm = nTerm;
3401 t->isOr = q->nextIsOr;
3402 t->isPrefix = 0;
3403 q->nextIsOr = 0;
3404 t->iColumn = q->nextColumn;
3405 q->nextColumn = q->dfltColumn;
3406}
3407
3408/*
3409** Check to see if the string zToken[0...nToken-1] matches any
3410** column name in the virtual table. If it does,
3411** return the zero-indexed column number. If not, return -1.
3412*/
3413static int checkColumnSpecifier(
3414 fulltext_vtab *pVtab, /* The virtual table */
3415 const char *zToken, /* Text of the token */
3416 int nToken /* Number of characters in the token */
3417){
3418 int i;
3419 for(i=0; i<pVtab->nColumn; i++){
3420 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
3421 && pVtab->azColumn[i][nToken]==0 ){
3422 return i;
3423 }
3424 }
3425 return -1;
3426}
3427
/*
** Parse the text at pSegment[0..nSegment-1]. Add additional terms
** to the query being assembled in pQuery.
**
** inPhrase is true if pSegment[0..nSegment-1] is contained within
** double-quotes. If inPhrase is true, then the first term is marked
** with the number of terms in the phrase less one, and OR and "-"
** syntax is ignored. If inPhrase is false, then every term found is
** marked with nPhrase=0, and OR and "-" syntax is significant.
3437*/
3438static int tokenizeSegment(
3439 sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
3440 const char *pSegment, int nSegment, /* Query expression being parsed */
3441 int inPhrase, /* True if within "..." */
3442 Query *pQuery /* Append results here */
3443){
3444 const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
3445 sqlite3_tokenizer_cursor *pCursor;
3446 int firstIndex = pQuery->nTerms;
3447 int iCol;
3448 int nTerm = 1;
3449
3450 int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
3451 if( rc!=SQLITE_OK ) return rc;
3452 pCursor->pTokenizer = pTokenizer;
3453
3454 while( 1 ){
3455 const char *pToken;
3456 int nToken, iBegin, iEnd, iPos;
3457
3458 rc = pModule->xNext(pCursor,
3459 &pToken, &nToken,
3460 &iBegin, &iEnd, &iPos);
3461 if( rc!=SQLITE_OK ) break;
3462 if( !inPhrase &&
3463 pSegment[iEnd]==':' &&
3464 (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
3465 pQuery->nextColumn = iCol;
3466 continue;
3467 }
3468 if( !inPhrase && pQuery->nTerms>0 && nToken==2
3469 && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
3470 pQuery->nextIsOr = 1;
3471 continue;
3472 }
3473 queryAdd(pQuery, pToken, nToken);
3474 if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
3475 pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
3476 }
3477 if( iEnd<nSegment && pSegment[iEnd]=='*' ){
3478 pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
3479 }
3480 pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
3481 if( inPhrase ){
3482 nTerm++;
3483 }
3484 }
3485
3486 if( inPhrase && pQuery->nTerms>firstIndex ){
3487 pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
3488 }
3489
3490 return pModule->xClose(pCursor);
3491}
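
/* Illustrative sketch: flags produced for the unquoted query segment
** [title:linux OR unix -windows ker*], assuming "title" names a
** column of the table:
**
**   pTerms[0] "linux"    iColumn = title's index (from "title:")
**   pTerms[1] "unix"     isOr = 1     (preceded by OR)
**   pTerms[2] "windows"  isNot = 1    (preceded by '-')
**   pTerms[3] "ker"      isPrefix = 1 (followed by '*')
**
** Outside of quotes every term gets iPhrase=1 and nPhrase=0.
*/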
3492
3493/* Parse a query string, yielding a Query object pQuery.
3494**
3495** The calling function will need to queryClear() to clean up
3496** the dynamically allocated memory held by pQuery.
3497*/
3498static int parseQuery(
3499 fulltext_vtab *v, /* The fulltext index */
3500 const char *zInput, /* Input text of the query string */
3501 int nInput, /* Size of the input text */
3502 int dfltColumn, /* Default column of the index to match against */
3503 Query *pQuery /* Write the parse results here. */
3504){
3505 int iInput, inPhrase = 0;
3506
3507 if( zInput==0 ) nInput = 0;
3508 if( nInput<0 ) nInput = strlen(zInput);
3509 pQuery->nTerms = 0;
3510 pQuery->pTerms = NULL;
3511 pQuery->nextIsOr = 0;
3512 pQuery->nextColumn = dfltColumn;
3513 pQuery->dfltColumn = dfltColumn;
3514 pQuery->pFts = v;
3515
3516 for(iInput=0; iInput<nInput; ++iInput){
3517 int i;
3518 for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
3519 if( i>iInput ){
3520 tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
3521 pQuery);
3522 }
3523 iInput = i;
3524 if( i<nInput ){
3525 assert( zInput[i]=='"' );
3526 inPhrase = !inPhrase;
3527 }
3528 }
3529
3530 if( inPhrase ){
3531 /* unmatched quote */
3532 queryClear(pQuery);
3533 return SQLITE_ERROR;
3534 }
3535 return SQLITE_OK;
3536}
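
/* Illustrative sketch: segmentation of [fast "full text" search].
** The loop above hands three segments to tokenizeSegment(), toggling
** inPhrase at each '"':
**
**   [fast ]      inPhrase=0  -> one ordinary term
**   [full text]  inPhrase=1  -> pTerms[1].nPhrase set to 1
**   [ search]    inPhrase=0  -> one ordinary term
**
** An unmatched trailing '"' would instead return SQLITE_ERROR.
*/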
3537
3538/* TODO(shess) Refactor the code to remove this forward decl. */
3539static int flushPendingTerms(fulltext_vtab *v);
3540
3541/* Perform a full-text query using the search expression in
3542** zInput[0..nInput-1]. Return a list of matching documents
3543** in pResult.
3544**
** Queries must match column iColumn; if iColumn>=nColumn they are
** allowed to match against any column.
3547*/
3548static int fulltextQuery(
3549 fulltext_vtab *v, /* The full text index */
3550 int iColumn, /* Match against this column by default */
3551 const char *zInput, /* The query string */
3552 int nInput, /* Number of bytes in zInput[] */
3553 DataBuffer *pResult, /* Write the result doclist here */
3554 Query *pQuery /* Put parsed query string here */
3555){
3556 int i, iNext, rc;
3557 DataBuffer left, right, or, new;
3558 int nNot = 0;
3559 QueryTerm *aTerm;
3560
3561 /* TODO(shess) Instead of flushing pendingTerms, we could query for
3562 ** the relevant term and merge the doclist into what we receive from
3563 ** the database. Wait and see if this is a common issue, first.
3564 **
3565 ** A good reason not to flush is to not generate update-related
3566 ** error codes from here.
3567 */
3568
3569 /* Flush any buffered updates before executing the query. */
3570 rc = flushPendingTerms(v);
3571 if( rc!=SQLITE_OK ) return rc;
3572
3573 /* TODO(shess) I think that the queryClear() calls below are not
3574 ** necessary, because fulltextClose() already clears the query.
3575 */
3576 rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
3577 if( rc!=SQLITE_OK ) return rc;
3578
3579 /* Empty or NULL queries return no results. */
3580 if( pQuery->nTerms==0 ){
3581 dataBufferInit(pResult, 0);
3582 return SQLITE_OK;
3583 }
3584
3585 /* Merge AND terms. */
3586 /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
3587 aTerm = pQuery->pTerms;
3588 for(i = 0; i<pQuery->nTerms; i=iNext){
3589 if( aTerm[i].isNot ){
3590 /* Handle all NOT terms in a separate pass */
3591 nNot++;
3592 iNext = i + aTerm[i].nPhrase+1;
3593 continue;
3594 }
3595 iNext = i + aTerm[i].nPhrase + 1;
3596 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
3597 if( rc ){
3598 if( i!=nNot ) dataBufferDestroy(&left);
3599 queryClear(pQuery);
3600 return rc;
3601 }
3602 while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
3603 rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
3604 iNext += aTerm[iNext].nPhrase + 1;
3605 if( rc ){
3606 if( i!=nNot ) dataBufferDestroy(&left);
3607 dataBufferDestroy(&right);
3608 queryClear(pQuery);
3609 return rc;
3610 }
3611 dataBufferInit(&new, 0);
3612 docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
3613 dataBufferDestroy(&right);
3614 dataBufferDestroy(&or);
3615 right = new;
3616 }
3617 if( i==nNot ){ /* first term processed. */
3618 left = right;
3619 }else{
3620 dataBufferInit(&new, 0);
3621 docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
3622 dataBufferDestroy(&right);
3623 dataBufferDestroy(&left);
3624 left = new;
3625 }
3626 }
3627
3628 if( nNot==pQuery->nTerms ){
3629 /* We do not yet know how to handle a query of only NOT terms */
3630 return SQLITE_ERROR;
3631 }
3632
3633 /* Do the EXCEPT terms */
3634 for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
3635 if( !aTerm[i].isNot ) continue;
3636 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
3637 if( rc ){
3638 queryClear(pQuery);
3639 dataBufferDestroy(&left);
3640 return rc;
3641 }
3642 dataBufferInit(&new, 0);
3643 docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
3644 dataBufferDestroy(&right);
3645 dataBufferDestroy(&left);
3646 left = new;
3647 }
3648
3649 *pResult = left;
3650 return rc;
3651}
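
/* Illustrative sketch: merge order for the query [a b OR c -d]:
**
**   left  = doclist(a)                 // first non-NOT term
**   right = doclist(b) OR doclist(c)   // docListOrMerge()
**   left  = left AND right             // docListAndMerge()
**   left  = left EXCEPT doclist(d)     // docListExceptMerge()
**
** OR binds tighter than the implicit AND between terms, and all NOT
** terms are subtracted in the final pass.
*/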
3652
3653/*
3654** This is the xFilter interface for the virtual table. See
3655** the virtual table xFilter method documentation for additional
3656** information.
3657**
3658** If idxNum==QUERY_GENERIC then do a full table scan against
3659** the %_content table.
3660**
3661** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
3662** in the %_content table.
3663**
3664** If idxNum>=QUERY_FULLTEXT then use the full text index. The
3665** column on the left-hand side of the MATCH operator is column
3666** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
3667** side of the MATCH operator.
3668*/
3669/* TODO(shess) Upgrade the cursor initialization and destruction to
3670** account for fulltextFilter() being called multiple times on the
3671** same cursor. The current solution is very fragile. Apply fix to
3672** fts2 as appropriate.
3673*/
3674static int fulltextFilter(
3675 sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
3676 int idxNum, const char *idxStr, /* Which indexing scheme to use */
3677 int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
3678){
3679 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3680 fulltext_vtab *v = cursor_vtab(c);
3681 int rc;
3682 char *zSql;
3683
3684 TRACE(("FTS2 Filter %p\n",pCursor));
3685
3686 zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
3687 idxNum==QUERY_GENERIC ? "" : "where rowid=?");
3688 sqlite3_finalize(c->pStmt);
3689 rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
3690 sqlite3_free(zSql);
3691 if( rc!=SQLITE_OK ) return rc;
3692
3693 c->iCursorType = idxNum;
3694 switch( idxNum ){
3695 case QUERY_GENERIC:
3696 break;
3697
3698 case QUERY_ROWID:
3699 rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
3700 if( rc!=SQLITE_OK ) return rc;
3701 break;
3702
3703 default: /* full-text search */
3704 {
3705 const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
3706 assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
3707 assert( argc==1 );
3708 queryClear(&c->q);
3709 if( c->result.nData!=0 ){
3710 /* This case happens if the same cursor is used repeatedly. */
3711 dlrDestroy(&c->reader);
3712 dataBufferReset(&c->result);
3713 }else{
3714 dataBufferInit(&c->result, 0);
3715 }
3716 rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
3717 if( rc!=SQLITE_OK ) return rc;
3718 if( c->result.nData!=0 ){
3719 dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
3720 }
3721 break;
3722 }
3723 }
3724
3725 return fulltextNext(pCursor);
3726}
3727
3728/* This is the xEof method of the virtual table. The SQLite core
3729** calls this routine to find out if it has reached the end of
** a query's result set.
3731*/
3732static int fulltextEof(sqlite3_vtab_cursor *pCursor){
3733 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3734 return c->eof;
3735}
3736
3737/* This is the xColumn method of the virtual table. The SQLite
3738** core calls this method during a query when it needs the value
3739** of a column from the virtual table. This method needs to use
3740** one of the sqlite3_result_*() routines to store the requested
3741** value back in the pContext.
3742*/
3743static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
3744 sqlite3_context *pContext, int idxCol){
3745 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3746 fulltext_vtab *v = cursor_vtab(c);
3747
3748 if( idxCol<v->nColumn ){
3749 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
3750 sqlite3_result_value(pContext, pVal);
3751 }else if( idxCol==v->nColumn ){
    /* The extra column whose name is the same as the table's.
    ** Return a blob which is a pointer to the cursor.
    */
3755 sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
3756 }
3757 return SQLITE_OK;
3758}
3759
3760/* This is the xRowid method. The SQLite core calls this routine to
** retrieve the rowid for the current row of the result set. The
3762** rowid should be written to *pRowid.
3763*/
3764static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
3765 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3766
3767 *pRowid = sqlite3_column_int64(c->pStmt, 0);
3768 return SQLITE_OK;
3769}
3770
/* Add all terms in [zText] to pendingTerms table. If [iColumn] >= 0,
3772** we also store positions and offsets in the hash table using that
3773** column number.
3774*/
3775static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
3776 const char *zText, int iColumn){
3777 sqlite3_tokenizer *pTokenizer = v->pTokenizer;
3778 sqlite3_tokenizer_cursor *pCursor;
3779 const char *pToken;
3780 int nTokenBytes;
3781 int iStartOffset, iEndOffset, iPosition;
3782 int rc;
3783
3784 rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
3785 if( rc!=SQLITE_OK ) return rc;
3786
3787 pCursor->pTokenizer = pTokenizer;
3788 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
3789 &pToken, &nTokenBytes,
3790 &iStartOffset, &iEndOffset,
3791 &iPosition) ){
3792 DLCollector *p;
3793 int nData; /* Size of doclist before our update. */
3794
3795 /* Positions can't be negative; we use -1 as a terminator internally. */
3796 if( iPosition<0 ){
3797 pTokenizer->pModule->xClose(pCursor);
3798 return SQLITE_ERROR;
3799 }
3800
3801 p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
3802 if( p==NULL ){
3803 nData = 0;
3804 p = dlcNew(iDocid, DL_DEFAULT);
3805 fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);
3806
3807 /* Overhead for our hash table entry, the key, and the value. */
3808 v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
3809 }else{
3810 nData = p->b.nData;
3811 if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
3812 }
3813 if( iColumn>=0 ){
3814 dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
3815 }
3816
3817 /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
3818 v->nPendingData += p->b.nData-nData;
3819 }
3820
3821 /* TODO(shess) Check return? Should this be able to cause errors at
3822 ** this point? Actually, same question about sqlite3_finalize(),
3823 ** though one could argue that failure there means that the data is
3824 ** not durable. *ponder*
3825 */
3826 pTokenizer->pModule->xClose(pCursor);
3827 return rc;
3828}
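
/* Illustrative sketch: pendingTerms bookkeeping while indexing docid
** 7 whose column 0 contains "red red blue":
**
**   "red"  pos 0: no hash entry -> dlcNew(7, DL_DEFAULT) and insert;
**                 nPendingData grows by entry/key/value overhead
**   "red"  pos 1: entry found, same docid -> dlcAddPos() only
**   "blue" pos 2: no hash entry -> dlcNew(7, DL_DEFAULT) and insert
**
** After each token nPendingData also grows by whatever doclist bytes
** dlcNew()/dlcNext() and dlcAddPos() appended.
*/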
3829
3830/* Add doclists for all terms in [pValues] to pendingTerms table. */
3831static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid,
3832 sqlite3_value **pValues){
3833 int i;
3834 for(i = 0; i < v->nColumn ; ++i){
3835 char *zText = (char*)sqlite3_value_text(pValues[i]);
3836 int rc = buildTerms(v, iRowid, zText, i);
3837 if( rc!=SQLITE_OK ) return rc;
3838 }
3839 return SQLITE_OK;
3840}
3841
3842/* Add empty doclists for all terms in the given row's content to
3843** pendingTerms.
3844*/
3845static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){
3846 const char **pValues;
3847 int i, rc;
3848
3849 /* TODO(shess) Should we allow such tables at all? */
3850 if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
3851
3852 rc = content_select(v, iRowid, &pValues);
3853 if( rc!=SQLITE_OK ) return rc;
3854
3855 for(i = 0 ; i < v->nColumn; ++i) {
3856 rc = buildTerms(v, iRowid, pValues[i], -1);
3857 if( rc!=SQLITE_OK ) break;
3858 }
3859
3860 freeStringArray(v->nColumn, pValues);
  return rc;
3862}
3863
3864/* TODO(shess) Refactor the code to remove this forward decl. */
3865static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
3866
3867/* Insert a row into the %_content table; set *piRowid to be the ID of the
3868** new row. Add doclists for terms to pendingTerms.
3869*/
3870static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
3871 sqlite3_value **pValues, sqlite_int64 *piRowid){
3872 int rc;
3873
3874 rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
3875 if( rc!=SQLITE_OK ) return rc;
3876
3877 *piRowid = sqlite3_last_insert_rowid(v->db);
3878 rc = initPendingTerms(v, *piRowid);
3879 if( rc!=SQLITE_OK ) return rc;
3880
3881 return insertTerms(v, *piRowid, pValues);
3882}
3883
3884/* Delete a row from the %_content table; add empty doclists for terms
3885** to pendingTerms.
3886*/
3887static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
3888 int rc = initPendingTerms(v, iRow);
3889 if( rc!=SQLITE_OK ) return rc;
3890
3891 rc = deleteTerms(v, iRow);
3892 if( rc!=SQLITE_OK ) return rc;
3893
3894 return content_delete(v, iRow); /* execute an SQL DELETE */
3895}
3896
/* Update a row in the %_content table; add empty doclists to
** pendingTerms for all old terms, then add position doclists to
** pendingTerms for all terms in the new data.
*/
3901static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
3902 sqlite3_value **pValues){
3903 int rc = initPendingTerms(v, iRow);
3904 if( rc!=SQLITE_OK ) return rc;
3905
3906 /* Generate an empty doclist for each term that previously appeared in this
3907 * row. */
3908 rc = deleteTerms(v, iRow);
3909 if( rc!=SQLITE_OK ) return rc;
3910
3911 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
3912 if( rc!=SQLITE_OK ) return rc;
3913
3914 /* Now add positions for terms which appear in the updated row. */
3915 return insertTerms(v, iRow, pValues);
3916}
3917
3918/*******************************************************************/
3919/* InteriorWriter is used to collect terms and block references into
3920** interior nodes in %_segments. See commentary at top of file for
3921** format.
3922*/
3923
3924/* How large interior nodes can grow. */
3925#define INTERIOR_MAX 2048
3926
3927/* Minimum number of terms per interior node (except the root). This
3928** prevents large terms from making the tree too skinny - must be >0
3929** so that the tree always makes progress. Note that the min tree
3930** fanout will be INTERIOR_MIN_TERMS+1.
3931*/
3932#define INTERIOR_MIN_TERMS 7
3933#if INTERIOR_MIN_TERMS<1
3934# error INTERIOR_MIN_TERMS must be greater than 0.
3935#endif
3936
3937/* ROOT_MAX controls how much data is stored inline in the segment
3938** directory.
3939*/
3940/* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's
3941** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
3942** can both see it, but if the caller passed it in, we wouldn't even
3943** need a define.
3944*/
3945#define ROOT_MAX 1024
3946#if ROOT_MAX<VARINT_MAX*2
3947# error ROOT_MAX must have enough space for a header.
3948#endif
3949
3950/* InteriorBlock stores a linked-list of interior blocks while a lower
3951** layer is being constructed.
3952*/
3953typedef struct InteriorBlock {
3954 DataBuffer term; /* Leftmost term in block's subtree. */
3955 DataBuffer data; /* Accumulated data for the block. */
3956 struct InteriorBlock *next;
3957} InteriorBlock;
3958
3959static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
3960 const char *pTerm, int nTerm){
3961 InteriorBlock *block = calloc(1, sizeof(InteriorBlock));
3962 char c[VARINT_MAX+VARINT_MAX];
3963 int n;
3964
3965 dataBufferInit(&block->term, 0);
3966 dataBufferReplace(&block->term, pTerm, nTerm);
3967
3968 n = putVarint(c, iHeight);
3969 n += putVarint(c+n, iChildBlock);
3970 dataBufferInit(&block->data, INTERIOR_MAX);
3971 dataBufferReplace(&block->data, c, n);
3972
3973 return block;
3974}
3975
3976#ifndef NDEBUG
3977/* Verify that the data is readable as an interior node. */
3978static void interiorBlockValidate(InteriorBlock *pBlock){
3979 const char *pData = pBlock->data.pData;
3980 int nData = pBlock->data.nData;
3981 int n, iDummy;
3982 sqlite_int64 iBlockid;
3983
3984 assert( nData>0 );
3985 assert( pData!=0 );
3986 assert( pData+nData>pData );
3987
3988 /* Must lead with height of node as a varint(n), n>0 */
3989 n = getVarint32(pData, &iDummy);
3990 assert( n>0 );
3991 assert( iDummy>0 );
3992 assert( n<nData );
3993 pData += n;
3994 nData -= n;
3995
3996 /* Must contain iBlockid. */
3997 n = getVarint(pData, &iBlockid);
3998 assert( n>0 );
3999 assert( n<=nData );
4000 pData += n;
4001 nData -= n;
4002
4003 /* Zero or more terms of positive length */
4004 if( nData!=0 ){
4005 /* First term is not delta-encoded. */
4006 n = getVarint32(pData, &iDummy);
4007 assert( n>0 );
4008 assert( iDummy>0 );
4009 assert( n+iDummy>0);
4010 assert( n+iDummy<=nData );
4011 pData += n+iDummy;
4012 nData -= n+iDummy;
4013
4014 /* Following terms delta-encoded. */
4015 while( nData!=0 ){
4016 /* Length of shared prefix. */
4017 n = getVarint32(pData, &iDummy);
4018 assert( n>0 );
4019 assert( iDummy>=0 );
4020 assert( n<nData );
4021 pData += n;
4022 nData -= n;
4023
4024 /* Length and data of distinct suffix. */
4025 n = getVarint32(pData, &iDummy);
4026 assert( n>0 );
4027 assert( iDummy>0 );
4028 assert( n+iDummy>0);
4029 assert( n+iDummy<=nData );
4030 pData += n+iDummy;
4031 nData -= n+iDummy;
4032 }
4033 }
4034}
4035#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
4036#else
4037#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
4038#endif
4039
4040typedef struct InteriorWriter {
4041 int iHeight; /* from 0 at leaves. */
4042 InteriorBlock *first, *last;
4043 struct InteriorWriter *parentWriter;
4044
4045 DataBuffer term; /* Last term written to block "last". */
4046 sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
4047#ifndef NDEBUG
4048 sqlite_int64 iLastChildBlock; /* for consistency checks. */
4049#endif
4050} InteriorWriter;
4051
4052/* Initialize an interior node where pTerm[nTerm] marks the leftmost
4053** term in the tree. iChildBlock is the leftmost child block at the
4054** next level down the tree.
4055*/
4056static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
4057 sqlite_int64 iChildBlock,
4058 InteriorWriter *pWriter){
4059 InteriorBlock *block;
4060 assert( iHeight>0 );
4061 CLEAR(pWriter);
4062
4063 pWriter->iHeight = iHeight;
4064 pWriter->iOpeningChildBlock = iChildBlock;
4065#ifndef NDEBUG
4066 pWriter->iLastChildBlock = iChildBlock;
4067#endif
4068 block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
4069 pWriter->last = pWriter->first = block;
4070 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4071 dataBufferInit(&pWriter->term, 0);
4072}
4073
4074/* Append the child node rooted at iChildBlock to the interior node,
4075** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
4076*/
4077static void interiorWriterAppend(InteriorWriter *pWriter,
4078 const char *pTerm, int nTerm,
4079 sqlite_int64 iChildBlock){
4080 char c[VARINT_MAX+VARINT_MAX];
4081 int n, nPrefix = 0;
4082
4083 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4084
4085 /* The first term written into an interior node is actually
4086 ** associated with the second child added (the first child was added
4087 ** in interiorWriterInit, or in the if clause at the bottom of this
4088 ** function). That term gets encoded straight up, with nPrefix left
4089 ** at 0.
4090 */
4091 if( pWriter->term.nData==0 ){
4092 n = putVarint(c, nTerm);
4093 }else{
4094 while( nPrefix<pWriter->term.nData &&
4095 pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
4096 nPrefix++;
4097 }
4098
4099 n = putVarint(c, nPrefix);
4100 n += putVarint(c+n, nTerm-nPrefix);
4101 }
4102
4103#ifndef NDEBUG
4104 pWriter->iLastChildBlock++;
4105#endif
4106 assert( pWriter->iLastChildBlock==iChildBlock );
4107
4108 /* Overflow to a new block if the new term makes the current block
4109 ** too big, and the current block already has enough terms.
4110 */
4111 if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
4112 iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
4113 pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
4114 pTerm, nTerm);
4115 pWriter->last = pWriter->last->next;
4116 pWriter->iOpeningChildBlock = iChildBlock;
4117 dataBufferReset(&pWriter->term);
4118 }else{
4119 dataBufferAppend2(&pWriter->last->data, c, n,
4120 pTerm+nPrefix, nTerm-nPrefix);
4121 dataBufferReplace(&pWriter->term, pTerm, nTerm);
4122 }
4123 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4124}
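
/* Illustrative sketch: delta-encoding of appended terms. Appending
** children for "apple" and then "applesauce" to one interior node
** encodes:
**
**   "apple"       first term:           varint(5) "apple"
**   "applesauce"  5-byte shared prefix: varint(5) varint(5) "sauce"
**
** Only the suffix which distinguishes a term from its predecessor is
** stored.
*/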
4125
4126/* Free the space used by pWriter, including the linked-list of
4127** InteriorBlocks, and parentWriter, if present.
4128*/
4129static int interiorWriterDestroy(InteriorWriter *pWriter){
4130 InteriorBlock *block = pWriter->first;
4131
4132 while( block!=NULL ){
4133 InteriorBlock *b = block;
4134 block = block->next;
4135 dataBufferDestroy(&b->term);
4136 dataBufferDestroy(&b->data);
4137 free(b);
4138 }
4139 if( pWriter->parentWriter!=NULL ){
4140 interiorWriterDestroy(pWriter->parentWriter);
4141 free(pWriter->parentWriter);
4142 }
4143 dataBufferDestroy(&pWriter->term);
4144 SCRAMBLE(pWriter);
4145 return SQLITE_OK;
4146}
4147
4148/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
4149** directly, leaving *piEndBlockid unchanged. Otherwise, flush
4150** pWriter to %_segments, building a new layer of interior nodes, and
** recursively ask for their root info.
4152*/
4153static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
4154 char **ppRootInfo, int *pnRootInfo,
4155 sqlite_int64 *piEndBlockid){
4156 InteriorBlock *block = pWriter->first;
4157 sqlite_int64 iBlockid = 0;
4158 int rc;
4159
4160 /* If we can fit the segment inline */
4161 if( block==pWriter->last && block->data.nData<ROOT_MAX ){
4162 *ppRootInfo = block->data.pData;
4163 *pnRootInfo = block->data.nData;
4164 return SQLITE_OK;
4165 }
4166
4167 /* Flush the first block to %_segments, and create a new level of
4168 ** interior node.
4169 */
4170 ASSERT_VALID_INTERIOR_BLOCK(block);
4171 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4172 if( rc!=SQLITE_OK ) return rc;
4173 *piEndBlockid = iBlockid;
4174
4175 pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
4176 interiorWriterInit(pWriter->iHeight+1,
4177 block->term.pData, block->term.nData,
4178 iBlockid, pWriter->parentWriter);
4179
4180 /* Flush additional blocks and append to the higher interior
4181 ** node.
4182 */
4183 for(block=block->next; block!=NULL; block=block->next){
4184 ASSERT_VALID_INTERIOR_BLOCK(block);
4185 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4186 if( rc!=SQLITE_OK ) return rc;
4187 *piEndBlockid = iBlockid;
4188
4189 interiorWriterAppend(pWriter->parentWriter,
4190 block->term.pData, block->term.nData, iBlockid);
4191 }
4192
4193 /* Parent node gets the chance to be the root. */
4194 return interiorWriterRootInfo(v, pWriter->parentWriter,
4195 ppRootInfo, pnRootInfo, piEndBlockid);
4196}
4197
4198/****************************************************************/
4199/* InteriorReader is used to read off the data from an interior node
4200** (see comment at top of file for the format).
4201*/
4202typedef struct InteriorReader {
4203 const char *pData;
4204 int nData;
4205
4206 DataBuffer term; /* previous term, for decoding term delta. */
4207
4208 sqlite_int64 iBlockid;
4209} InteriorReader;
4210
4211static void interiorReaderDestroy(InteriorReader *pReader){
4212 dataBufferDestroy(&pReader->term);
4213 SCRAMBLE(pReader);
4214}
4215
4216/* TODO(shess) The assertions are great, but what if we're in NDEBUG
4217** and the blob is empty or otherwise contains suspect data?
4218*/
4219static void interiorReaderInit(const char *pData, int nData,
4220 InteriorReader *pReader){
4221 int n, nTerm;
4222
  /* Require at least the leading height varint, which is nonzero
  ** for interior nodes (leaf nodes lead with '\0').
  */
4224 assert( nData>0 );
4225 assert( pData[0]!='\0' );
4226
4227 CLEAR(pReader);
4228
4229 /* Decode the base blockid, and set the cursor to the first term. */
4230 n = getVarint(pData+1, &pReader->iBlockid);
4231 assert( 1+n<=nData );
4232 pReader->pData = pData+1+n;
4233 pReader->nData = nData-(1+n);
4234
4235 /* A single-child interior node (such as when a leaf node was too
4236 ** large for the segment directory) won't have any terms.
4237 ** Otherwise, decode the first term.
4238 */
4239 if( pReader->nData==0 ){
4240 dataBufferInit(&pReader->term, 0);
4241 }else{
4242 n = getVarint32(pReader->pData, &nTerm);
4243 dataBufferInit(&pReader->term, nTerm);
4244 dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
4245 assert( n+nTerm<=pReader->nData );
4246 pReader->pData += n+nTerm;
4247 pReader->nData -= n+nTerm;
4248 }
4249}
4250
4251static int interiorReaderAtEnd(InteriorReader *pReader){
4252 return pReader->term.nData==0;
4253}
4254
4255static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
4256 return pReader->iBlockid;
4257}
4258
4259static int interiorReaderTermBytes(InteriorReader *pReader){
4260 assert( !interiorReaderAtEnd(pReader) );
4261 return pReader->term.nData;
4262}
4263static const char *interiorReaderTerm(InteriorReader *pReader){
4264 assert( !interiorReaderAtEnd(pReader) );
4265 return pReader->term.pData;
4266}
4267
4268/* Step forward to the next term in the node. */
4269static void interiorReaderStep(InteriorReader *pReader){
4270 assert( !interiorReaderAtEnd(pReader) );
4271
4272 /* If the last term has been read, signal eof, else construct the
4273 ** next term.
4274 */
4275 if( pReader->nData==0 ){
4276 dataBufferReset(&pReader->term);
4277 }else{
4278 int n, nPrefix, nSuffix;
4279
4280 n = getVarint32(pReader->pData, &nPrefix);
4281 n += getVarint32(pReader->pData+n, &nSuffix);
4282
4283 /* Truncate the current term and append suffix data. */
4284 pReader->term.nData = nPrefix;
4285 dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
4286
4287 assert( n+nSuffix<=pReader->nData );
4288 pReader->pData += n+nSuffix;
4289 pReader->nData -= n+nSuffix;
4290 }
4291 pReader->iBlockid++;
4292}
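
/* Illustrative sketch: reading back the node from the
** interiorWriterAppend() sketch above. After interiorReaderInit()
** the current term is "apple"; one step reconstructs the next term:
**
**   nPrefix=5, nSuffix=5   // two varints read from pData
**   term = first 5 bytes of "apple" + "sauce" -> "applesauce"
**   iBlockid++             // child blockids are sequential
**
** A further step with nData==0 resets term, which signals eof.
*/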
4293
4294/* Compare the current term to pTerm[nTerm], returning strcmp-style
4295** results. If isPrefix, equality means equal through nTerm bytes.
4296*/
4297static int interiorReaderTermCmp(InteriorReader *pReader,
4298 const char *pTerm, int nTerm, int isPrefix){
4299 const char *pReaderTerm = interiorReaderTerm(pReader);
4300 int nReaderTerm = interiorReaderTermBytes(pReader);
4301 int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
4302
4303 if( n==0 ){
4304 if( nReaderTerm>0 ) return -1;
4305 if( nTerm>0 ) return 1;
4306 return 0;
4307 }
4308
4309 c = memcmp(pReaderTerm, pTerm, n);
4310 if( c!=0 ) return c;
4311 if( isPrefix && n==nTerm ) return 0;
4312 return nReaderTerm - nTerm;
4313}
4314
4315/****************************************************************/
4316/* LeafWriter is used to collect terms and associated doclist data
4317** into leaf blocks in %_segments (see top of file for format info).
4318** Expected usage is:
4319**
4320** LeafWriter writer;
4321** leafWriterInit(0, 0, &writer);
4322** while( sorted_terms_left_to_process ){
4323** // data is doclist data for that term.
4324** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
4325** if( rc!=SQLITE_OK ) goto err;
4326** }
4327** rc = leafWriterFinalize(v, &writer);
4328**err:
4329** leafWriterDestroy(&writer);
4330** return rc;
4331**
4332** leafWriterStep() may write a collected leaf out to %_segments.
4333** leafWriterFinalize() finishes writing any buffered data and stores
4334** a root node in %_segdir. leafWriterDestroy() frees all buffers and
4335** InteriorWriters allocated as part of writing this segment.
4336**
4337** TODO(shess) Document leafWriterStepMerge().
4338*/
4339
4340/* Put terms with data this big in their own block. */
4341#define STANDALONE_MIN 1024
4342
4343/* Keep leaf blocks below this size. */
4344#define LEAF_MAX 2048
4345
4346typedef struct LeafWriter {
4347 int iLevel;
4348 int idx;
4349 sqlite_int64 iStartBlockid; /* needed to create the root info */
4350 sqlite_int64 iEndBlockid; /* when we're done writing. */
4351
4352 DataBuffer term; /* previous encoded term */
4353 DataBuffer data; /* encoding buffer */
4354
4355 /* bytes of first term in the current node which distinguishes that
4356 ** term from the last term of the previous node.
4357 */
4358 int nTermDistinct;
4359
4360 InteriorWriter parentWriter; /* if we overflow */
4361 int has_parent;
4362} LeafWriter;
4363
4364static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
4365 CLEAR(pWriter);
4366 pWriter->iLevel = iLevel;
4367 pWriter->idx = idx;
4368
4369 dataBufferInit(&pWriter->term, 32);
4370
4371 /* Start out with a reasonably sized block, though it can grow. */
4372 dataBufferInit(&pWriter->data, LEAF_MAX);
4373}
4374
4375#ifndef NDEBUG
4376/* Verify that the data is readable as a leaf node. */
4377static void leafNodeValidate(const char *pData, int nData){
4378 int n, iDummy;
4379
4380 if( nData==0 ) return;
4381 assert( nData>0 );
4382 assert( pData!=0 );
4383 assert( pData+nData>pData );
4384
4385 /* Must lead with a varint(0) */
4386 n = getVarint32(pData, &iDummy);
4387 assert( iDummy==0 );
4388 assert( n>0 );
4389 assert( n<nData );
4390 pData += n;
4391 nData -= n;
4392
4393 /* Leading term length and data must fit in buffer. */
4394 n = getVarint32(pData, &iDummy);
4395 assert( n>0 );
4396 assert( iDummy>0 );
4397 assert( n+iDummy>0 );
4398 assert( n+iDummy<nData );
4399 pData += n+iDummy;
4400 nData -= n+iDummy;
4401
4402 /* Leading term's doclist length and data must fit. */
4403 n = getVarint32(pData, &iDummy);
4404 assert( n>0 );
4405 assert( iDummy>0 );
4406 assert( n+iDummy>0 );
4407 assert( n+iDummy<=nData );
4408 ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
4409 pData += n+iDummy;
4410 nData -= n+iDummy;
4411
4412 /* Verify that trailing terms and doclists also are readable. */
4413 while( nData!=0 ){
4414 n = getVarint32(pData, &iDummy);
4415 assert( n>0 );
4416 assert( iDummy>=0 );
4417 assert( n<nData );
4418 pData += n;
4419 nData -= n;
4420 n = getVarint32(pData, &iDummy);
4421 assert( n>0 );
4422 assert( iDummy>0 );
4423 assert( n+iDummy>0 );
4424 assert( n+iDummy<nData );
4425 pData += n+iDummy;
4426 nData -= n+iDummy;
4427
4428 n = getVarint32(pData, &iDummy);
4429 assert( n>0 );
4430 assert( iDummy>0 );
4431 assert( n+iDummy>0 );
4432 assert( n+iDummy<=nData );
4433 ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
4434 pData += n+iDummy;
4435 nData -= n+iDummy;
4436 }
4437}
4438#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
4439#else
4440#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
4441#endif
4442
/* Flush the current leaf node to %_segments, adding the resulting
4444** blockid and the starting term to the interior node which will
4445** contain it.
4446*/
4447static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
4448 int iData, int nData){
4449 sqlite_int64 iBlockid = 0;
4450 const char *pStartingTerm;
4451 int nStartingTerm, rc, n;
4452
4453 /* Must have the leading varint(0) flag, plus at least some
4454 ** valid-looking data.
4455 */
4456 assert( nData>2 );
4457 assert( iData>=0 );
4458 assert( iData+nData<=pWriter->data.nData );
4459 ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);
4460
4461 rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
4462 if( rc!=SQLITE_OK ) return rc;
4463 assert( iBlockid!=0 );
4464
4465 /* Reconstruct the first term in the leaf for purposes of building
4466 ** the interior node.
4467 */
4468 n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
4469 pStartingTerm = pWriter->data.pData+iData+1+n;
4470 assert( pWriter->data.nData>iData+1+n+nStartingTerm );
4471 assert( pWriter->nTermDistinct>0 );
4472 assert( pWriter->nTermDistinct<=nStartingTerm );
4473 nStartingTerm = pWriter->nTermDistinct;
4474
4475 if( pWriter->has_parent ){
4476 interiorWriterAppend(&pWriter->parentWriter,
4477 pStartingTerm, nStartingTerm, iBlockid);
4478 }else{
4479 interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
4480 &pWriter->parentWriter);
4481 pWriter->has_parent = 1;
4482 }
4483
4484 /* Track the span of this segment's leaf nodes. */
4485 if( pWriter->iEndBlockid==0 ){
4486 pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
4487 }else{
4488 pWriter->iEndBlockid++;
4489 assert( iBlockid==pWriter->iEndBlockid );
4490 }
4491
4492 return SQLITE_OK;
4493}
4494static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
4495 int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
4496 if( rc!=SQLITE_OK ) return rc;
4497
4498 /* Re-initialize the output buffer. */
4499 dataBufferReset(&pWriter->data);
4500
4501 return SQLITE_OK;
4502}
4503
4504/* Fetch the root info for the segment. If the entire leaf fits
4505** within ROOT_MAX, then it will be returned directly, otherwise it
4506** will be flushed and the root info will be returned from the
4507** interior node. *piEndBlockid is set to the blockid of the last
4508** interior or leaf node written to disk (0 if none are written at
4509** all).
4510*/
4511static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
4512 char **ppRootInfo, int *pnRootInfo,
4513 sqlite_int64 *piEndBlockid){
4514 /* we can fit the segment entirely inline */
4515 if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
4516 *ppRootInfo = pWriter->data.pData;
4517 *pnRootInfo = pWriter->data.nData;
4518 *piEndBlockid = 0;
4519 return SQLITE_OK;
4520 }
4521
4522 /* Flush remaining leaf data. */
4523 if( pWriter->data.nData>0 ){
4524 int rc = leafWriterFlush(v, pWriter);
4525 if( rc!=SQLITE_OK ) return rc;
4526 }
4527
4528 /* We must have flushed a leaf at some point. */
4529 assert( pWriter->has_parent );
4530
  /* Tentatively set the end leaf blockid as the end blockid. If the
4532 ** interior node can be returned inline, this will be the final
4533 ** blockid, otherwise it will be overwritten by
4534 ** interiorWriterRootInfo().
4535 */
4536 *piEndBlockid = pWriter->iEndBlockid;
4537
4538 return interiorWriterRootInfo(v, &pWriter->parentWriter,
4539 ppRootInfo, pnRootInfo, piEndBlockid);
4540}
4541
4542/* Collect the rootInfo data and store it into the segment directory.
4543** This has the effect of flushing the segment's leaf data to
4544** %_segments, and also flushing any interior nodes to %_segments.
4545*/
4546static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
4547 sqlite_int64 iEndBlockid;
4548 char *pRootInfo;
4549 int rc, nRootInfo;
4550
4551 rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
4552 if( rc!=SQLITE_OK ) return rc;
4553
4554 /* Don't bother storing an entirely empty segment. */
4555 if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
4556
4557 return segdir_set(v, pWriter->iLevel, pWriter->idx,
4558 pWriter->iStartBlockid, pWriter->iEndBlockid,
4559 iEndBlockid, pRootInfo, nRootInfo);
4560}
4561
4562static void leafWriterDestroy(LeafWriter *pWriter){
4563 if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
4564 dataBufferDestroy(&pWriter->term);
4565 dataBufferDestroy(&pWriter->data);
4566}
4567
4568/* Encode a term into the leafWriter, delta-encoding as appropriate.
4569** Returns the length of the new term which distinguishes it from the
4570** previous term, which can be used to set nTermDistinct when a node
4571** boundary is crossed.
4572*/
4573static int leafWriterEncodeTerm(LeafWriter *pWriter,
4574 const char *pTerm, int nTerm){
4575 char c[VARINT_MAX+VARINT_MAX];
4576 int n, nPrefix = 0;
4577
4578 assert( nTerm>0 );
4579 while( nPrefix<pWriter->term.nData &&
4580 pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
4581 nPrefix++;
4582 /* Failing this implies that the terms weren't in order. */
4583 assert( nPrefix<nTerm );
4584 }
4585
4586 if( pWriter->data.nData==0 ){
4587 /* Encode the node header and leading term as:
4588 ** varint(0)
4589 ** varint(nTerm)
4590 ** char pTerm[nTerm]
4591 */
4592 n = putVarint(c, '\0');
4593 n += putVarint(c+n, nTerm);
4594 dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
4595 }else{
4596 /* Delta-encode the term as:
4597 ** varint(nPrefix)
4598 ** varint(nSuffix)
4599 ** char pTermSuffix[nSuffix]
4600 */
4601 n = putVarint(c, nPrefix);
4602 n += putVarint(c+n, nTerm-nPrefix);
4603 dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
4604 }
4605 dataBufferReplace(&pWriter->term, pTerm, nTerm);
4606
4607 return nPrefix+1;
4608}
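
/* Illustrative sketch: the first two terms written to an empty node,
** say "cat" followed by "cathode", are encoded as:
**
**   "cat"      header plus full term: varint(0) varint(3) "cat"
**   "cathode"  delta-encoded:         varint(3) varint(4) "hode"
**
** The returned distinct lengths are 1 (nPrefix==0) and 4 (nPrefix==3)
** respectively.
*/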
4609
4610/* Used to avoid a memmove when a large amount of doclist data is in
4611** the buffer. This constructs a node and term header before
4612** iDoclistData and flushes the resulting complete node using
4613** leafWriterInternalFlush().
4614*/
4615static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
4616 const char *pTerm, int nTerm,
4617 int iDoclistData){
4618 char c[VARINT_MAX+VARINT_MAX];
4619 int iData, n = putVarint(c, 0);
4620 n += putVarint(c+n, nTerm);
4621
4622 /* There should always be room for the header. Even if pTerm shared
4623 ** a substantial prefix with the previous term, the entire prefix
4624 ** could be constructed from earlier data in the doclist, so there
4625 ** should be room.
4626 */
4627 assert( iDoclistData>=n+nTerm );
4628
4629 iData = iDoclistData-(n+nTerm);
4630 memcpy(pWriter->data.pData+iData, c, n);
4631 memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);
4632
4633 return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
4634}
4635
4636/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
4637** %_segments.
4638*/
4639static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
4640 const char *pTerm, int nTerm,
4641 DLReader *pReaders, int nReaders){
4642 char c[VARINT_MAX+VARINT_MAX];
4643 int iTermData = pWriter->data.nData, iDoclistData;
4644 int i, nData, n, nActualData, nActual, rc, nTermDistinct;
4645
4646 ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
4647 nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);
4648
4649 /* Remember nTermDistinct if opening a new node. */
4650 if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;
4651
4652 iDoclistData = pWriter->data.nData;
4653
4654 /* Estimate the length of the merged doclist so we can leave space
4655 ** to encode it.
4656 */
4657 for(i=0, nData=0; i<nReaders; i++){
4658 nData += dlrAllDataBytes(&pReaders[i]);
4659 }
4660 n = putVarint(c, nData);
4661 dataBufferAppend(&pWriter->data, c, n);
4662
4663 docListMerge(&pWriter->data, pReaders, nReaders);
4664 ASSERT_VALID_DOCLIST(DL_DEFAULT,
4665 pWriter->data.pData+iDoclistData+n,
4666 pWriter->data.nData-iDoclistData-n, NULL);
4667
4668 /* The actual amount of doclist data at this point could be smaller
4669 ** than the length we encoded. Additionally, the space required to
4670 ** encode this length could be smaller. For small doclists, this is
4671 ** not a big deal, we can just use memmove() to adjust things.
4672 */
4673 nActualData = pWriter->data.nData-(iDoclistData+n);
4674 nActual = putVarint(c, nActualData);
4675 assert( nActualData<=nData );
4676 assert( nActual<=n );
4677
  /* If the new doclist is big enough to force a standalone leaf
4679 ** node, we can immediately flush it inline without doing the
4680 ** memmove().
4681 */
4682 /* TODO(shess) This test matches leafWriterStep(), which does this
4683 ** test before it knows the cost to varint-encode the term and
4684 ** doclist lengths. At some point, change to
4685 ** pWriter->data.nData-iTermData>STANDALONE_MIN.
4686 */
4687 if( nTerm+nActualData>STANDALONE_MIN ){
4688 /* Push leaf node from before this term. */
4689 if( iTermData>0 ){
4690 rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
4691 if( rc!=SQLITE_OK ) return rc;
4692
4693 pWriter->nTermDistinct = nTermDistinct;
4694 }
4695
4696 /* Fix the encoded doclist length. */
4697 iDoclistData += n - nActual;
4698 memcpy(pWriter->data.pData+iDoclistData, c, nActual);
4699
4700 /* Push the standalone leaf node. */
4701 rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
4702 if( rc!=SQLITE_OK ) return rc;
4703
4704 /* Leave the node empty. */
4705 dataBufferReset(&pWriter->data);
4706
4707 return rc;
4708 }
4709
4710 /* At this point, we know that the doclist was small, so do the
4711 ** memmove if indicated.
4712 */
4713 if( nActual<n ){
4714 memmove(pWriter->data.pData+iDoclistData+nActual,
4715 pWriter->data.pData+iDoclistData+n,
4716 pWriter->data.nData-(iDoclistData+n));
4717 pWriter->data.nData -= n-nActual;
4718 }
4719
4720 /* Replace written length with actual length. */
4721 memcpy(pWriter->data.pData+iDoclistData, c, nActual);
4722
4723 /* If the node is too large, break things up. */
4724 /* TODO(shess) This test matches leafWriterStep(), which does this
4725 ** test before it knows the cost to varint-encode the term and
4726 ** doclist lengths. At some point, change to
4727 ** pWriter->data.nData>LEAF_MAX.
4728 */
4729 if( iTermData+nTerm+nActualData>LEAF_MAX ){
4730 /* Flush out the leading data as a node */
4731 rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
4732 if( rc!=SQLITE_OK ) return rc;
4733
4734 pWriter->nTermDistinct = nTermDistinct;
4735
4736 /* Rebuild header using the current term */
4737 n = putVarint(pWriter->data.pData, 0);
4738 n += putVarint(pWriter->data.pData+n, nTerm);
4739 memcpy(pWriter->data.pData+n, pTerm, nTerm);
4740 n += nTerm;
4741
4742 /* There should always be room, because the previous encoding
4743 ** included all data necessary to construct the term.
4744 */
4745 assert( n<iDoclistData );
4746 /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
4747 ** following memcpy() is safe (as opposed to needing a memmove).
4748 */
4749 assert( 2*STANDALONE_MIN<=LEAF_MAX );
4750 assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
4751 memcpy(pWriter->data.pData+n,
4752 pWriter->data.pData+iDoclistData,
4753 pWriter->data.nData-iDoclistData);
4754 pWriter->data.nData -= iDoclistData-n;
4755 }
4756 ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
4757
4758 return SQLITE_OK;
4759}
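
/* Illustrative summary of the three ways leafWriterStepMerge() can
** leave the buffer, in terms of the sizes computed above:
**
**   nTerm+nActualData>STANDALONE_MIN : flush any prior data as one
**       node, then flush this term as a standalone node
**   node would exceed LEAF_MAX       : flush the prior data, then
**       restart the node with this term as its leading term
**   otherwise                        : the term and doclist simply
**       stay buffered in pWriter->data
*/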
4760
4761/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
4762** %_segments.
4763*/
4764/* TODO(shess) Revise writeZeroSegment() so that doclists are
4765** constructed directly in pWriter->data.
4766*/
4767static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
4768 const char *pTerm, int nTerm,
4769 const char *pData, int nData){
4770 int rc;
4771 DLReader reader;
4772
4773 dlrInit(&reader, DL_DEFAULT, pData, nData);
4774 rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
4775 dlrDestroy(&reader);
4776
4777 return rc;
4778}
4779
4780
4781/****************************************************************/
4782/* LeafReader is used to iterate over an individual leaf node. */
4783typedef struct LeafReader {
4784 DataBuffer term; /* copy of current term. */
4785
4786 const char *pData; /* data for current term. */
4787 int nData;
4788} LeafReader;
4789
4790static void leafReaderDestroy(LeafReader *pReader){
4791 dataBufferDestroy(&pReader->term);
4792 SCRAMBLE(pReader);
4793}
4794
4795static int leafReaderAtEnd(LeafReader *pReader){
4796 return pReader->nData<=0;
4797}
4798
4799/* Access the current term. */
4800static int leafReaderTermBytes(LeafReader *pReader){
4801 return pReader->term.nData;
4802}
4803static const char *leafReaderTerm(LeafReader *pReader){
4804 assert( pReader->term.nData>0 );
4805 return pReader->term.pData;
4806}
4807
4808/* Access the doclist data for the current term. */
4809static int leafReaderDataBytes(LeafReader *pReader){
4810 int nData;
4811 assert( pReader->term.nData>0 );
4812 getVarint32(pReader->pData, &nData);
4813 return nData;
4814}
4815static const char *leafReaderData(LeafReader *pReader){
4816 int n, nData;
4817 assert( pReader->term.nData>0 );
4818 n = getVarint32(pReader->pData, &nData);
4819 return pReader->pData+n;
4820}
4821
4822static void leafReaderInit(const char *pData, int nData,
4823 LeafReader *pReader){
4824 int nTerm, n;
4825
4826 assert( nData>0 );
4827 assert( pData[0]=='\0' );
4828
4829 CLEAR(pReader);
4830
4831 /* Read the first term, skipping the header byte. */
4832 n = getVarint32(pData+1, &nTerm);
4833 dataBufferInit(&pReader->term, nTerm);
4834 dataBufferReplace(&pReader->term, pData+1+n, nTerm);
4835
4836 /* Position after the first term. */
4837 assert( 1+n+nTerm<nData );
4838 pReader->pData = pData+1+n+nTerm;
4839 pReader->nData = nData-1-n-nTerm;
4840}
4841
4842/* Step the reader forward to the next term. */
4843static void leafReaderStep(LeafReader *pReader){
4844 int n, nData, nPrefix, nSuffix;
4845 assert( !leafReaderAtEnd(pReader) );
4846
4847 /* Skip previous entry's data block. */
4848 n = getVarint32(pReader->pData, &nData);
4849 assert( n+nData<=pReader->nData );
4850 pReader->pData += n+nData;
4851 pReader->nData -= n+nData;
4852
4853 if( !leafReaderAtEnd(pReader) ){
4854 /* Construct the new term using a prefix from the old term plus a
4855 ** suffix from the leaf data.
4856 */
4857 n = getVarint32(pReader->pData, &nPrefix);
4858 n += getVarint32(pReader->pData+n, &nSuffix);
4859 assert( n+nSuffix<pReader->nData );
4860 pReader->term.nData = nPrefix;
4861 dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
4862
4863 pReader->pData += n+nSuffix;
4864 pReader->nData -= n+nSuffix;
4865 }
4866}
4867
4868/* strcmp-style comparison of pReader's current term against pTerm.
4869** If isPrefix, equality means equal through nTerm bytes.
4870*/
4871static int leafReaderTermCmp(LeafReader *pReader,
4872 const char *pTerm, int nTerm, int isPrefix){
4873 int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
4874 if( n==0 ){
4875 if( pReader->term.nData>0 ) return -1;
    if( nTerm>0 ) return 1;
4877 return 0;
4878 }
4879
4880 c = memcmp(pReader->term.pData, pTerm, n);
4881 if( c!=0 ) return c;
4882 if( isPrefix && n==nTerm ) return 0;
4883 return pReader->term.nData - nTerm;
4884}
4885
4886
4887/****************************************************************/
4888/* LeavesReader wraps LeafReader to allow iterating over the entire
4889** leaf layer of the tree.
4890*/
4891typedef struct LeavesReader {
4892 int idx; /* Index within the segment. */
4893
4894 sqlite3_stmt *pStmt; /* Statement we're streaming leaves from. */
4895 int eof; /* we've seen SQLITE_DONE from pStmt. */
4896
4897 LeafReader leafReader; /* reader for the current leaf. */
4898 DataBuffer rootData; /* root data for inline. */
4899} LeavesReader;
4900
4901/* Access the current term. */
4902static int leavesReaderTermBytes(LeavesReader *pReader){
4903 assert( !pReader->eof );
4904 return leafReaderTermBytes(&pReader->leafReader);
4905}
4906static const char *leavesReaderTerm(LeavesReader *pReader){
4907 assert( !pReader->eof );
4908 return leafReaderTerm(&pReader->leafReader);
4909}
4910
4911/* Access the doclist data for the current term. */
4912static int leavesReaderDataBytes(LeavesReader *pReader){
4913 assert( !pReader->eof );
4914 return leafReaderDataBytes(&pReader->leafReader);
4915}
4916static const char *leavesReaderData(LeavesReader *pReader){
4917 assert( !pReader->eof );
4918 return leafReaderData(&pReader->leafReader);
4919}
4920
4921static int leavesReaderAtEnd(LeavesReader *pReader){
4922 return pReader->eof;
4923}
4924
4925/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
4926** leaving the statement handle open, which locks the table.
4927*/
4928/* TODO(shess) This "solution" is not satisfactory. Really, there
** should be a check-in function for all statement handles which
4930** arranges to call sqlite3_reset(). This most likely will require
4931** modification to control flow all over the place, though, so for now
4932** just punt.
4933**
** Note that the current system assumes that segment merges will run
** to completion, which is why this particular problem probably
** hasn't arisen in this case. Probably a brittle assumption.
4937*/
4938static int leavesReaderReset(LeavesReader *pReader){
4939 return sqlite3_reset(pReader->pStmt);
4940}
4941
4942static void leavesReaderDestroy(LeavesReader *pReader){
4943 leafReaderDestroy(&pReader->leafReader);
4944 dataBufferDestroy(&pReader->rootData);
4945 SCRAMBLE(pReader);
4946}
4947
4948/* Initialize pReader with the given root data (if iStartBlockid==0
4949** the leaf data was entirely contained in the root), or from the
4950** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
4951*/
4952static int leavesReaderInit(fulltext_vtab *v,
4953 int idx,
4954 sqlite_int64 iStartBlockid,
4955 sqlite_int64 iEndBlockid,
4956 const char *pRootData, int nRootData,
4957 LeavesReader *pReader){
4958 CLEAR(pReader);
4959 pReader->idx = idx;
4960
4961 dataBufferInit(&pReader->rootData, 0);
4962 if( iStartBlockid==0 ){
4963 /* Entire leaf level fit in root data. */
4964 dataBufferReplace(&pReader->rootData, pRootData, nRootData);
4965 leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
4966 &pReader->leafReader);
4967 }else{
4968 sqlite3_stmt *s;
4969 int rc = sql_get_leaf_statement(v, idx, &s);
4970 if( rc!=SQLITE_OK ) return rc;
4971
4972 rc = sqlite3_bind_int64(s, 1, iStartBlockid);
4973 if( rc!=SQLITE_OK ) return rc;
4974
4975 rc = sqlite3_bind_int64(s, 2, iEndBlockid);
4976 if( rc!=SQLITE_OK ) return rc;
4977
4978 rc = sqlite3_step(s);
4979 if( rc==SQLITE_DONE ){
4980 pReader->eof = 1;
4981 return SQLITE_OK;
4982 }
4983 if( rc!=SQLITE_ROW ) return rc;
4984
4985 pReader->pStmt = s;
4986 leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
4987 sqlite3_column_bytes(pReader->pStmt, 0),
4988 &pReader->leafReader);
4989 }
4990 return SQLITE_OK;
4991}
4992
4993/* Step the current leaf forward to the next term. If we reach the
4994** end of the current leaf, step forward to the next leaf block.
4995*/
4996static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
4997 assert( !leavesReaderAtEnd(pReader) );
4998 leafReaderStep(&pReader->leafReader);
4999
5000 if( leafReaderAtEnd(&pReader->leafReader) ){
5001 int rc;
5002 if( pReader->rootData.pData ){
5003 pReader->eof = 1;
5004 return SQLITE_OK;
5005 }
5006 rc = sqlite3_step(pReader->pStmt);
5007 if( rc!=SQLITE_ROW ){
5008 pReader->eof = 1;
5009 return rc==SQLITE_DONE ? SQLITE_OK : rc;
5010 }
5011 leafReaderDestroy(&pReader->leafReader);
5012 leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
5013 sqlite3_column_bytes(pReader->pStmt, 0),
5014 &pReader->leafReader);
5015 }
5016 return SQLITE_OK;
5017}
5018
5019/* Order LeavesReaders by their term, ignoring idx. Readers at eof
5020** always sort to the end.
5021*/
5022static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
5023 if( leavesReaderAtEnd(lr1) ){
5024 if( leavesReaderAtEnd(lr2) ) return 0;
5025 return 1;
5026 }
5027 if( leavesReaderAtEnd(lr2) ) return -1;
5028
5029 return leafReaderTermCmp(&lr1->leafReader,
5030 leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
5031 0);
5032}
5033
5034/* Similar to leavesReaderTermCmp(), with additional ordering by idx
5035** so that older segments sort before newer segments.
5036*/
5037static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
5038 int c = leavesReaderTermCmp(lr1, lr2);
5039 if( c!=0 ) return c;
5040 return lr1->idx-lr2->idx;
5041}
5042
/* Assume that pLr[1]..pLr[nLr-1] are sorted. Bubble pLr[0] into its
5044** sorted position.
5045*/
5046static void leavesReaderReorder(LeavesReader *pLr, int nLr){
5047 while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
5048 LeavesReader tmp = pLr[0];
5049 pLr[0] = pLr[1];
5050 pLr[1] = tmp;
5051 nLr--;
5052 pLr++;
5053 }
5054}
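
/* Illustrative sketch: suppose pLr[0] was just stepped past term "k"
** to "n", with the remaining readers sorted as ["m", "p", eof]:
**
**   ["n", "m", "p", eof]   // pLr[0] is out of place
**   ["m", "n", "p", eof]   // one swap; "n" < "p", so the loop stops
**
** Readers at eof compare greater than all others, so they collect at
** the tail.
*/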
5055
5056/* Initializes pReaders with the segments from level iLevel, returning
5057** the number of segments in *piReaders. Leaves pReaders in sorted
5058** order.
5059*/
5060static int leavesReadersInit(fulltext_vtab *v, int iLevel,
5061 LeavesReader *pReaders, int *piReaders){
5062 sqlite3_stmt *s;
5063 int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
5064 if( rc!=SQLITE_OK ) return rc;
5065
5066 rc = sqlite3_bind_int(s, 1, iLevel);
5067 if( rc!=SQLITE_OK ) return rc;
5068
5069 i = 0;
5070 while( (rc = sqlite3_step(s))==SQLITE_ROW ){
5071 sqlite_int64 iStart = sqlite3_column_int64(s, 0);
5072 sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
5073 const char *pRootData = sqlite3_column_blob(s, 2);
5074 int nRootData = sqlite3_column_bytes(s, 2);
5075
5076 assert( i<MERGE_COUNT );
5077 rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
5078 &pReaders[i]);
5079 if( rc!=SQLITE_OK ) break;
5080
5081 i++;
5082 }
5083 if( rc!=SQLITE_DONE ){
5084 while( i-->0 ){
5085 leavesReaderDestroy(&pReaders[i]);
5086 }
5087 return rc;
5088 }
5089
5090 *piReaders = i;
5091
5092 /* Leave our results sorted by term, then age. */
5093 while( i-- ){
5094 leavesReaderReorder(pReaders+i, *piReaders-i);
5095 }
5096 return SQLITE_OK;
5097}
5098
/* Merge doclists from pReaders[0..nReaders-1] into a single doclist,
** which is written to pWriter. Assumes pReaders is ordered oldest to
** newest.
5102*/
5103/* TODO(shess) Consider putting this inline in segmentMerge(). */
5104static int leavesReadersMerge(fulltext_vtab *v,
5105 LeavesReader *pReaders, int nReaders,
5106 LeafWriter *pWriter){
5107 DLReader dlReaders[MERGE_COUNT];
5108 const char *pTerm = leavesReaderTerm(pReaders);
5109 int i, nTerm = leavesReaderTermBytes(pReaders);
5110
5111 assert( nReaders<=MERGE_COUNT );
5112
5113 for(i=0; i<nReaders; i++){
5114 dlrInit(&dlReaders[i], DL_DEFAULT,
5115 leavesReaderData(pReaders+i),
5116 leavesReaderDataBytes(pReaders+i));
5117 }
5118
5119 return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
5120}
5121
5122/* Forward ref due to mutual recursion with segdirNextIndex(). */
5123static int segmentMerge(fulltext_vtab *v, int iLevel);
5124
5125/* Put the next available index at iLevel into *pidx. If iLevel
5126** already has MERGE_COUNT segments, they are merged to a higher
5127** level to make room.
5128*/
5129static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
5130 int rc = segdir_max_index(v, iLevel, pidx);
5131 if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
5132 *pidx = 0;
5133 }else if( rc==SQLITE_ROW ){
5134 if( *pidx==(MERGE_COUNT-1) ){
5135 rc = segmentMerge(v, iLevel);
5136 if( rc!=SQLITE_OK ) return rc;
5137 *pidx = 0;
5138 }else{
5139 (*pidx)++;
5140 }
5141 }else{
5142 return rc;
5143 }
5144 return SQLITE_OK;
5145}
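
/* Worked example (illustrative, assuming MERGE_COUNT==16 as defined
** earlier in this file): if segdir_max_index() reports a maximum
** index of 14 at iLevel, *pidx becomes 15. If it reports 15, the
** sixteen segments at iLevel are first merged into a single segment
** at iLevel+1, and *pidx becomes 0 again.
*/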
5146
5147/* Merge MERGE_COUNT segments at iLevel into a new segment at
5148** iLevel+1. If iLevel+1 is already full of segments, those will be
5149** merged to make room.
5150*/
5151static int segmentMerge(fulltext_vtab *v, int iLevel){
5152 LeafWriter writer;
5153 LeavesReader lrs[MERGE_COUNT];
5154 int i, rc, idx = 0;
5155
5156 /* Determine the next available segment index at the next level,
5157 ** merging as necessary.
5158 */
5159 rc = segdirNextIndex(v, iLevel+1, &idx);
5160 if( rc!=SQLITE_OK ) return rc;
5161
5162 /* TODO(shess) This assumes that we'll always see exactly
5163 ** MERGE_COUNT segments to merge at a given level. That will be
5164 ** broken if we allow the developer to request preemptive or
5165 ** deferred merging.
5166 */
5167 memset(&lrs, '\0', sizeof(lrs));
5168 rc = leavesReadersInit(v, iLevel, lrs, &i);
5169 if( rc!=SQLITE_OK ) return rc;
5170 assert( i==MERGE_COUNT );
5171
5172 leafWriterInit(iLevel+1, idx, &writer);
5173
5174 /* Since leavesReaderReorder() pushes readers at eof to the end,
5175 ** when the first reader is empty, all will be empty.
5176 */
5177 while( !leavesReaderAtEnd(lrs) ){
5178 /* Figure out how many readers share their next term. */
5179 for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
5180 if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
5181 }
5182
5183 rc = leavesReadersMerge(v, lrs, i, &writer);
5184 if( rc!=SQLITE_OK ) goto err;
5185
5186 /* Step forward those that were merged. */
5187 while( i-->0 ){
5188 rc = leavesReaderStep(v, lrs+i);
5189 if( rc!=SQLITE_OK ) goto err;
5190
5191 /* Reorder by term, then by age. */
5192 leavesReaderReorder(lrs+i, MERGE_COUNT-i);
5193 }
5194 }
5195
5196 for(i=0; i<MERGE_COUNT; i++){
5197 leavesReaderDestroy(&lrs[i]);
5198 }
5199
5200 rc = leafWriterFinalize(v, &writer);
5201 leafWriterDestroy(&writer);
5202 if( rc!=SQLITE_OK ) return rc;
5203
5204 /* Delete the merged segment data. */
5205 return segdir_delete(v, iLevel);
5206
5207 err:
5208 for(i=0; i<MERGE_COUNT; i++){
5209 leavesReaderDestroy(&lrs[i]);
5210 }
5211 leafWriterDestroy(&writer);
5212 return rc;
5213}
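
/* Resulting geometry (illustrative, assuming MERGE_COUNT==16):
** sixteen level-0 segments merge into one level-1 segment, sixteen
** level-1 segments into one level-2 segment, and so on, so a level-N
** segment holds roughly 16^N level-0 segments' worth of doclist data.
*/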
5214
5215/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
5216** *out (any doclists with duplicate docids overwrite those in *out).
5217** Internal function for loadSegmentLeaf().
5218*/
5219static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
5220 const char *pTerm, int nTerm, int isPrefix,
5221 DataBuffer *out){
5222 assert( nTerm>0 );
5223
5224 /* Process while the prefix matches. */
5225 while( !leavesReaderAtEnd(pReader) ){
5226 /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
5227 ** already taken to compare the terms of two LeavesReaders. Think
5228 ** on a better name. [Meanwhile, break encapsulation rather than
5229 ** use a confusing name.]
5230 */
5231 int rc;
5232 int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
5233 if( c==0 ){
5234 const char *pData = leavesReaderData(pReader);
5235 int nData = leavesReaderDataBytes(pReader);
5236 if( out->nData==0 ){
5237 dataBufferReplace(out, pData, nData);
5238 }else{
5239 DataBuffer result;
5240 dataBufferInit(&result, out->nData+nData);
5241 docListUnion(out->pData, out->nData, pData, nData, &result);
5242 dataBufferDestroy(out);
5243 *out = result;
5244 /* TODO(shess) Rather than destroy out, we could retain it for
5245 ** later reuse.
5246 */
5247 }
5248 }
5249 if( c>0 ) break; /* Past any possible matches. */
5250
5251 rc = leavesReaderStep(v, pReader);
5252 if( rc!=SQLITE_OK ) return rc;
5253 }
5254 return SQLITE_OK;
5255}
5256
5257/* Call loadSegmentLeavesInt() with pData/nData as input. */
5258static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
5259 const char *pTerm, int nTerm, int isPrefix,
5260 DataBuffer *out){
5261 LeavesReader reader;
5262 int rc;
5263
5264 assert( nData>1 );
5265 assert( *pData=='\0' );
5266 rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
5267 if( rc!=SQLITE_OK ) return rc;
5268
5269 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5270 leavesReaderReset(&reader);
5271 leavesReaderDestroy(&reader);
5272 return rc;
5273}
5274
5275/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
5276** iEndLeaf (inclusive) as input, and merge the resulting doclist into
5277** out.
5278*/
5279static int loadSegmentLeaves(fulltext_vtab *v,
5280 sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
5281 const char *pTerm, int nTerm, int isPrefix,
5282 DataBuffer *out){
5283 int rc;
5284 LeavesReader reader;
5285
5286 assert( iStartLeaf<=iEndLeaf );
5287 rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
5288 if( rc!=SQLITE_OK ) return rc;
5289
5290 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5291 leavesReaderReset(&reader);
5292 leavesReaderDestroy(&reader);
5293 return rc;
5294}
5295
5296/* Taking pData/nData as an interior node, find the sequence of child
5297** nodes which could include pTerm/nTerm/isPrefix. Note that the
5298** interior node terms logically come between the blocks, so there is
5299** one more blockid than there are terms (that block contains terms >=
5300** the last interior-node term).
5301*/
5302/* TODO(shess) The calling code may already know that the end child is
5303** not worth calculating, because the end may be in a later sibling
5304** node. Consider whether breaking symmetry is worthwhile. I suspect
5305** it's not worthwhile.
5306*/
5307static void getChildrenContaining(const char *pData, int nData,
5308 const char *pTerm, int nTerm, int isPrefix,
5309 sqlite_int64 *piStartChild,
5310 sqlite_int64 *piEndChild){
5311 InteriorReader reader;
5312
5313 assert( nData>1 );
5314 assert( *pData!='\0' );
5315 interiorReaderInit(pData, nData, &reader);
5316
5317 /* Scan for the first child which could contain pTerm/nTerm. */
5318 while( !interiorReaderAtEnd(&reader) ){
5319 if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
5320 interiorReaderStep(&reader);
5321 }
5322 *piStartChild = interiorReaderCurrentBlockid(&reader);
5323
5324 /* Keep scanning to find a term greater than our term, using prefix
5325 ** comparison if indicated. If isPrefix is false, this will be the
5326 ** same blockid as the starting block.
5327 */
5328 while( !interiorReaderAtEnd(&reader) ){
5329 if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
5330 interiorReaderStep(&reader);
5331 }
5332 *piEndChild = interiorReaderCurrentBlockid(&reader);
5333
5334 interiorReaderDestroy(&reader);
5335
5336 /* Children must ascend, and if !prefix, both must be the same. */
5337 assert( *piEndChild>=*piStartChild );
5338 assert( isPrefix || *piStartChild==*piEndChild );
5339}
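
/* Worked example (illustrative): an interior node holding the terms
** ("cat", "dog") references three children. For pTerm=="cow" both
** scans stop at "dog", selecting only the middle child. With
** isPrefix set, pTerm=="do" starts at the middle child, but the
** second scan steps past "dog" (a prefix match), so the range spans
** the middle and last children.
*/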
5340
5341/* Read block at iBlockid and pass it with other params to
5342** getChildrenContaining().
5343*/
5344static int loadAndGetChildrenContaining(
5345 fulltext_vtab *v,
5346 sqlite_int64 iBlockid,
5347 const char *pTerm, int nTerm, int isPrefix,
5348 sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
5349){
5350 sqlite3_stmt *s = NULL;
5351 int rc;
5352
5353 assert( iBlockid!=0 );
5354 assert( pTerm!=NULL );
5355 assert( nTerm!=0 ); /* TODO(shess) Why not allow this? */
5356 assert( piStartChild!=NULL );
5357 assert( piEndChild!=NULL );
5358
5359 rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
5360 if( rc!=SQLITE_OK ) return rc;
5361
5362 rc = sqlite3_bind_int64(s, 1, iBlockid);
5363 if( rc!=SQLITE_OK ) return rc;
5364
5365 rc = sqlite3_step(s);
5366 if( rc==SQLITE_DONE ) return SQLITE_ERROR;
5367 if( rc!=SQLITE_ROW ) return rc;
5368
5369 getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
5370 pTerm, nTerm, isPrefix, piStartChild, piEndChild);
5371
  /* We expect only one row. We must execute another sqlite3_step()
  ** to complete the iteration; otherwise the table will remain
  ** locked. */
5375 rc = sqlite3_step(s);
5376 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
5377 if( rc!=SQLITE_DONE ) return rc;
5378
5379 return SQLITE_OK;
5380}
5381
5382/* Traverse the tree represented by pData[nData] looking for
5383** pTerm[nTerm], placing its doclist into *out. This is internal to
5384** loadSegment() to make error-handling cleaner.
5385*/
5386static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
5387 sqlite_int64 iLeavesEnd,
5388 const char *pTerm, int nTerm, int isPrefix,
5389 DataBuffer *out){
5390 /* Special case where root is a leaf. */
5391 if( *pData=='\0' ){
5392 return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
5393 }else{
5394 int rc;
5395 sqlite_int64 iStartChild, iEndChild;
5396
5397 /* Process pData as an interior node, then loop down the tree
5398 ** until we find the set of leaf nodes to scan for the term.
5399 */
5400 getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
5401 &iStartChild, &iEndChild);
5402 while( iStartChild>iLeavesEnd ){
5403 sqlite_int64 iNextStart, iNextEnd;
5404 rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
5405 &iNextStart, &iNextEnd);
5406 if( rc!=SQLITE_OK ) return rc;
5407
5408 /* If we've branched, follow the end branch, too. */
5409 if( iStartChild!=iEndChild ){
5410 sqlite_int64 iDummy;
5411 rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
5412 &iDummy, &iNextEnd);
5413 if( rc!=SQLITE_OK ) return rc;
5414 }
5415
5416 assert( iNextStart<=iNextEnd );
5417 iStartChild = iNextStart;
5418 iEndChild = iNextEnd;
5419 }
5420 assert( iStartChild<=iLeavesEnd );
5421 assert( iEndChild<=iLeavesEnd );
5422
5423 /* Scan through the leaf segments for doclists. */
5424 return loadSegmentLeaves(v, iStartChild, iEndChild,
5425 pTerm, nTerm, isPrefix, out);
5426 }
5427}
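
/* Descent sketch (illustrative): within a segment the leaf blocks
** are written first, so leaves have blockids <= iLeavesEnd and
** interior blocks have larger blockids. From a root two levels above
** the leaves, the first getChildrenContaining() call yields interior
** blockids (> iLeavesEnd), one round of
** loadAndGetChildrenContaining() yields leaf blockids
** (<= iLeavesEnd), and the loop exits to scan those leaves.
*/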
5428
/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge that doclist over *out (elements for docids already present
** in *out are overwritten by the newer elements read from the
** segment rooted at pData).
*/
5433/* TODO(shess) Consider changing this to determine the depth of the
5434** leaves using either the first characters of interior nodes (when
5435** ==1, we're one level above the leaves), or the first character of
5436** the root (which will describe the height of the tree directly).
5437** Either feels somewhat tricky to me.
5438*/
5439/* TODO(shess) The current merge is likely to be slow for large
5440** doclists (though it should process from newest/smallest to
5441** oldest/largest, so it may not be that bad). It might be useful to
5442** modify things to allow for N-way merging. This could either be
5443** within a segment, with pairwise merges across segments, or across
5444** all segments at once.
5445*/
5446static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
5447 sqlite_int64 iLeavesEnd,
5448 const char *pTerm, int nTerm, int isPrefix,
5449 DataBuffer *out){
5450 DataBuffer result;
5451 int rc;
5452
5453 assert( nData>1 );
5454
5455 /* This code should never be called with buffered updates. */
5456 assert( v->nPendingData<0 );
5457
5458 dataBufferInit(&result, 0);
5459 rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
5460 pTerm, nTerm, isPrefix, &result);
5461 if( rc==SQLITE_OK && result.nData>0 ){
5462 if( out->nData==0 ){
5463 DataBuffer tmp = *out;
5464 *out = result;
5465 result = tmp;
5466 }else{
5467 DataBuffer merged;
5468 DLReader readers[2];
5469
5470 dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
5471 dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
5472 dataBufferInit(&merged, out->nData+result.nData);
5473 docListMerge(&merged, readers, 2);
5474 dataBufferDestroy(out);
5475 *out = merged;
5476 dlrDestroy(&readers[0]);
5477 dlrDestroy(&readers[1]);
5478 }
5479 }
5480 dataBufferDestroy(&result);
5481 return rc;
5482}
5483
5484/* Scan the database and merge together the posting lists for the term
5485** into *out.
5486*/
5487static int termSelect(fulltext_vtab *v, int iColumn,
5488 const char *pTerm, int nTerm, int isPrefix,
5489 DocListType iType, DataBuffer *out){
5490 DataBuffer doclist;
5491 sqlite3_stmt *s;
5492 int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
5493 if( rc!=SQLITE_OK ) return rc;
5494
5495 /* This code should never be called with buffered updates. */
5496 assert( v->nPendingData<0 );
5497
5498 dataBufferInit(&doclist, 0);
5499
5500 /* Traverse the segments from oldest to newest so that newer doclist
5501 ** elements for given docids overwrite older elements.
5502 */
5503 while( (rc = sqlite3_step(s))==SQLITE_ROW ){
5504 const char *pData = sqlite3_column_blob(s, 0);
5505 const int nData = sqlite3_column_bytes(s, 0);
5506 const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
5507 rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
5508 &doclist);
5509 if( rc!=SQLITE_OK ) goto err;
5510 }
5511 if( rc==SQLITE_DONE ){
5512 if( doclist.nData!=0 ){
5513 /* TODO(shess) The old term_select_all() code applied the column
5514 ** restrict as we merged segments, leading to smaller buffers.
5515 ** This is probably worthwhile to bring back, once the new storage
5516 ** system is checked in.
5517 */
      if( iColumn==v->nColumn ) iColumn = -1;
5519 docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
5520 iColumn, iType, out);
5521 }
5522 rc = SQLITE_OK;
5523 }
5524
5525 err:
5526 dataBufferDestroy(&doclist);
5527 return rc;
5528}
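
/* Example of the overwrite rule (illustrative): if docid 42 was
** indexed in an older segment and later updated, some newer segment
** also carries an element for docid 42. Because segments are loaded
** oldest first and newer elements take precedence during the merge,
** only the newest element for docid 42 survives in *out.
*/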
5529
5530/****************************************************************/
5531/* Used to hold hashtable data for sorting. */
5532typedef struct TermData {
5533 const char *pTerm;
5534 int nTerm;
5535 DLCollector *pCollector;
5536} TermData;
5537
5538/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
5539** for equal, >0 for greater-than).
5540*/
5541static int termDataCmp(const void *av, const void *bv){
5542 const TermData *a = (const TermData *)av;
5543 const TermData *b = (const TermData *)bv;
5544 int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
5545 int c = memcmp(a->pTerm, b->pTerm, n);
5546 if( c!=0 ) return c;
5547 return a->nTerm-b->nTerm;
5548}
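
/* Compiled-out demonstration (not part of fts2; the helper name is
** hypothetical) of the resulting order: terms sharing a prefix
** compare equal through memcmp(), then the shorter term sorts first.
*/
#if 0
static void termDataCmpExample(void){
  TermData a = { "a", 1, NULL };
  TermData ab = { "ab", 2, NULL };
  TermData b = { "b", 1, NULL };
  assert( termDataCmp(&a, &ab)<0 );  /* shared prefix: shorter sorts first */
  assert( termDataCmp(&ab, &b)<0 );  /* memcmp() decides on the first byte */
}
#endif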
5549
5550/* Order pTerms data by term, then write a new level 0 segment using
5551** LeafWriter.
5552*/
5553static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
5554 fts2HashElem *e;
5555 int idx, rc, i, n;
5556 TermData *pData;
5557 LeafWriter writer;
5558 DataBuffer dl;
5559
5560 /* Determine the next index at level 0, merging as necessary. */
5561 rc = segdirNextIndex(v, 0, &idx);
5562 if( rc!=SQLITE_OK ) return rc;
5563
5564 n = fts2HashCount(pTerms);
  pData = malloc(n*sizeof(TermData));
  if( pData==NULL && n>0 ) return SQLITE_NOMEM;
5566
5567 for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
5568 assert( i<n );
5569 pData[i].pTerm = fts2HashKey(e);
5570 pData[i].nTerm = fts2HashKeysize(e);
5571 pData[i].pCollector = fts2HashData(e);
5572 }
5573 assert( i==n );
5574
  /* TODO(shess) Should we allow user-defined collation sequences
  ** here? I think we only need that once we support prefix searches.
  */
5578 if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
5579
5580 /* TODO(shess) Refactor so that we can write directly to the segment
5581 ** DataBuffer, as happens for segment merges.
5582 */
5583 leafWriterInit(0, idx, &writer);
5584 dataBufferInit(&dl, 0);
5585 for(i=0; i<n; i++){
5586 dataBufferReset(&dl);
5587 dlcAddDoclist(pData[i].pCollector, &dl);
5588 rc = leafWriterStep(v, &writer,
5589 pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
5590 if( rc!=SQLITE_OK ) goto err;
5591 }
5592 rc = leafWriterFinalize(v, &writer);
5593
5594 err:
5595 dataBufferDestroy(&dl);
5596 free(pData);
5597 leafWriterDestroy(&writer);
5598 return rc;
5599}
5600
5601/* If pendingTerms has data, free it. */
5602static int clearPendingTerms(fulltext_vtab *v){
5603 if( v->nPendingData>=0 ){
5604 fts2HashElem *e;
5605 for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){
5606 dlcDelete(fts2HashData(e));
5607 }
5608 fts2HashClear(&v->pendingTerms);
5609 v->nPendingData = -1;
5610 }
5611 return SQLITE_OK;
5612}
5613
5614/* If pendingTerms has data, flush it to a level-zero segment, and
5615** free it.
5616*/
5617static int flushPendingTerms(fulltext_vtab *v){
5618 if( v->nPendingData>=0 ){
5619 int rc = writeZeroSegment(v, &v->pendingTerms);
5620 if( rc==SQLITE_OK ) clearPendingTerms(v);
5621 return rc;
5622 }
5623 return SQLITE_OK;
5624}
5625
5626/* If pendingTerms is "too big", or docid is out of order, flush it.
5627** Regardless, be certain that pendingTerms is initialized for use.
5628*/
5629static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
5630 /* TODO(shess) Explore whether partially flushing the buffer on
5631 ** forced-flush would provide better performance. I suspect that if
5632 ** we ordered the doclists by size and flushed the largest until the
5633 ** buffer was half empty, that would let the less frequent terms
5634 ** generate longer doclists.
5635 */
5636 if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
5637 int rc = flushPendingTerms(v);
5638 if( rc!=SQLITE_OK ) return rc;
5639 }
5640 if( v->nPendingData<0 ){
5641 fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1);
5642 v->nPendingData = 0;
5643 }
5644 v->iPrevDocid = iDocid;
5645 return SQLITE_OK;
5646}
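
/* Buffering policy in practice (illustrative): inserts with strictly
** ascending docids accumulate in pendingTerms until the buffered
** data exceeds kPendingThreshold, at which point flushPendingTerms()
** spills it as a level-0 segment. A non-increasing docid forces the
** same spill early, keeping every buffered doclist sorted by docid.
*/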
5647
5648/* This function implements the xUpdate callback; it's the top-level entry
5649 * point for inserting, deleting or updating a row in a full-text table. */
5650static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
5651 sqlite_int64 *pRowid){
5652 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5653 int rc;
5654
5655 TRACE(("FTS2 Update %p\n", pVtab));
5656
5657 if( nArg<2 ){
5658 rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
5659 } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
5660 /* An update:
5661 * ppArg[0] = old rowid
5662 * ppArg[1] = new rowid
5663 * ppArg[2..2+v->nColumn-1] = values
5664 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
5665 */
5666 sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
5667 if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
5668 sqlite3_value_int64(ppArg[1]) != rowid ){
5669 rc = SQLITE_ERROR; /* we don't allow changing the rowid */
5670 } else {
      assert( nArg==2+v->nColumn+1 );
5672 rc = index_update(v, rowid, &ppArg[2]);
5673 }
5674 } else {
5675 /* An insert:
5676 * ppArg[1] = requested rowid
5677 * ppArg[2..2+v->nColumn-1] = values
5678 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
5679 */
    assert( nArg==2+v->nColumn+1 );
5681 rc = index_insert(v, ppArg[1], &ppArg[2], pRowid);
5682 }
5683
5684 return rc;
5685}
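
/* How SQL maps onto the cases above (illustrative, for a table t):
**   DELETE FROM t WHERE rowid=5            -> nArg==1, ppArg[0]==5
**   INSERT INTO t(content) VALUES('...')   -> ppArg[0] is NULL,
**                                             ppArg[1] holds any
**                                             requested rowid
**   UPDATE t SET content='...' WHERE rowid=5
**                                          -> ppArg[0]==ppArg[1]==5
*/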
5686
5687static int fulltextSync(sqlite3_vtab *pVtab){
5688 TRACE(("FTS2 xSync()\n"));
5689 return flushPendingTerms((fulltext_vtab *)pVtab);
5690}
5691
5692static int fulltextBegin(sqlite3_vtab *pVtab){
5693 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5694 TRACE(("FTS2 xBegin()\n"));
5695
5696 /* Any buffered updates should have been cleared by the previous
5697 ** transaction.
5698 */
5699 assert( v->nPendingData<0 );
5700 return clearPendingTerms(v);
5701}
5702
5703static int fulltextCommit(sqlite3_vtab *pVtab){
5704 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5705 TRACE(("FTS2 xCommit()\n"));
5706
5707 /* Buffered updates should have been cleared by fulltextSync(). */
5708 assert( v->nPendingData<0 );
5709 return clearPendingTerms(v);
5710}
5711
5712static int fulltextRollback(sqlite3_vtab *pVtab){
5713 TRACE(("FTS2 xRollback()\n"));
5714 return clearPendingTerms((fulltext_vtab *)pVtab);
5715}
5716
5717/*
5718** Implementation of the snippet() function for FTS2
5719*/
5720static void snippetFunc(
5721 sqlite3_context *pContext,
5722 int argc,
5723 sqlite3_value **argv
5724){
5725 fulltext_cursor *pCursor;
5726 if( argc<1 ) return;
5727 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5728 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to snippet",-1);
5730 }else{
5731 const char *zStart = "<b>";
5732 const char *zEnd = "</b>";
5733 const char *zEllipsis = "<b>...</b>";
5734 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5735 if( argc>=2 ){
5736 zStart = (const char*)sqlite3_value_text(argv[1]);
5737 if( argc>=3 ){
5738 zEnd = (const char*)sqlite3_value_text(argv[2]);
5739 if( argc>=4 ){
5740 zEllipsis = (const char*)sqlite3_value_text(argv[3]);
5741 }
5742 }
5743 }
5744 snippetAllOffsets(pCursor);
5745 snippetText(pCursor, zStart, zEnd, zEllipsis);
5746 sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
5747 pCursor->snippet.nSnippet, SQLITE_STATIC);
5748 }
5749}
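
/* Typical usage (illustrative), relying on the defaults above:
**   SELECT snippet(t) FROM t WHERE t MATCH 'sqlite';
** or with explicit markers:
**   SELECT snippet(t, '[', ']', '...') FROM t WHERE t MATCH 'sqlite';
*/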
5750
5751/*
5752** Implementation of the offsets() function for FTS2
5753*/
5754static void snippetOffsetsFunc(
5755 sqlite3_context *pContext,
5756 int argc,
5757 sqlite3_value **argv
5758){
5759 fulltext_cursor *pCursor;
5760 if( argc<1 ) return;
5761 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5762 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5763 sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
5764 }else{
5765 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5766 snippetAllOffsets(pCursor);
5767 snippetOffsetText(&pCursor->snippet);
5768 sqlite3_result_text(pContext,
5769 pCursor->snippet.zOffset, pCursor->snippet.nOffset,
5770 SQLITE_STATIC);
5771 }
5772}
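
/* Typical usage (illustrative):
**   SELECT offsets(t) FROM t WHERE t MATCH 'sqlite';
** The result is a space-separated list of integers, four per match;
** see snippetOffsetText() for the exact format emitted.
*/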
5773
5774/*
5775** This routine implements the xFindFunction method for the FTS2
5776** virtual table.
5777*/
5778static int fulltextFindFunction(
5779 sqlite3_vtab *pVtab,
5780 int nArg,
5781 const char *zName,
5782 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
5783 void **ppArg
5784){
5785 if( strcmp(zName,"snippet")==0 ){
5786 *pxFunc = snippetFunc;
5787 return 1;
5788 }else if( strcmp(zName,"offsets")==0 ){
5789 *pxFunc = snippetOffsetsFunc;
5790 return 1;
5791 }
5792 return 0;
5793}
5794
5795/*
5796** Rename an fts2 table.
5797*/
5798static int fulltextRename(
5799 sqlite3_vtab *pVtab,
5800 const char *zName
5801){
5802 fulltext_vtab *p = (fulltext_vtab *)pVtab;
5803 int rc = SQLITE_NOMEM;
5804 char *zSql = sqlite3_mprintf(
5805 "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
5806 "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
5807 "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';"
5808 , p->zDb, p->zName, zName
5809 , p->zDb, p->zName, zName
5810 , p->zDb, p->zName, zName
5811 );
5812 if( zSql ){
5813 rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
5814 sqlite3_free(zSql);
5815 }
5816 return rc;
5817}
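
/* Triggered by (illustrative):
**   ALTER TABLE ft RENAME TO ft2;
** which renames the shadow tables ft_content, ft_segments and
** ft_segdir to ft2_content, ft2_segments and ft2_segdir.
*/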
5818
5819static const sqlite3_module fts2Module = {
5820 /* iVersion */ 0,
5821 /* xCreate */ fulltextCreate,
5822 /* xConnect */ fulltextConnect,
5823 /* xBestIndex */ fulltextBestIndex,
5824 /* xDisconnect */ fulltextDisconnect,
5825 /* xDestroy */ fulltextDestroy,
5826 /* xOpen */ fulltextOpen,
5827 /* xClose */ fulltextClose,
5828 /* xFilter */ fulltextFilter,
5829 /* xNext */ fulltextNext,
5830 /* xEof */ fulltextEof,
5831 /* xColumn */ fulltextColumn,
5832 /* xRowid */ fulltextRowid,
5833 /* xUpdate */ fulltextUpdate,
5834 /* xBegin */ fulltextBegin,
5835 /* xSync */ fulltextSync,
5836 /* xCommit */ fulltextCommit,
5837 /* xRollback */ fulltextRollback,
5838 /* xFindFunction */ fulltextFindFunction,
5839 /* xRename */ fulltextRename,
5840};
5841
5842static void hashDestroy(void *p){
5843 fts2Hash *pHash = (fts2Hash *)p;
5844 sqlite3Fts2HashClear(pHash);
5845 sqlite3_free(pHash);
5846}
5847
5848/*
5849** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
5850** in files fts2_tokenizer1.c and fts2_porter.c respectively. The following
5851** two forward declarations are for functions declared in these files
5852** used to retrieve the respective implementations.
5853**
** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
** Calling sqlite3Fts2PorterTokenizerModule() similarly sets *ppModule
** to point to the porter tokenizer/stemmer implementation.
5858*/
5859void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5860void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5861void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5862
5863int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
5864
5865/*
5866** Initialise the fts2 extension. If this extension is built as part
5867** of the sqlite library, then this function is called directly by
5868** SQLite. If fts2 is built as a dynamically loadable extension, this
5869** function is called by the sqlite3_extension_init() entry point.
5870*/
5871int sqlite3Fts2Init(sqlite3 *db){
5872 int rc = SQLITE_OK;
5873 fts2Hash *pHash = 0;
5874 const sqlite3_tokenizer_module *pSimple = 0;
5875 const sqlite3_tokenizer_module *pPorter = 0;
5876 const sqlite3_tokenizer_module *pIcu = 0;
5877
5878 sqlite3Fts2SimpleTokenizerModule(&pSimple);
5879 sqlite3Fts2PorterTokenizerModule(&pPorter);
5880#ifdef SQLITE_ENABLE_ICU
5881 sqlite3Fts2IcuTokenizerModule(&pIcu);
5882#endif
5883
5884 /* Allocate and initialise the hash-table used to store tokenizers. */
5885 pHash = sqlite3_malloc(sizeof(fts2Hash));
5886 if( !pHash ){
5887 rc = SQLITE_NOMEM;
5888 }else{
5889 sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
5890 }
5891
5892 /* Load the built-in tokenizers into the hash table */
5893 if( rc==SQLITE_OK ){
5894 if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
5895 || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter)
5896 || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
5897 ){
5898 rc = SQLITE_NOMEM;
5899 }
5900 }
5901
5902 /* Create the virtual table wrapper around the hash-table and overload
5903 ** the two scalar functions. If this is successful, register the
5904 ** module with sqlite.
5905 */
5906 if( SQLITE_OK==rc
5907 && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
5908 && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
5909 && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
5910 ){
5911 return sqlite3_create_module_v2(
5912 db, "fts2", &fts2Module, (void *)pHash, hashDestroy
5913 );
5914 }
5915
  /* An error has occurred. Delete the hash table and return the error code. */
5917 assert( rc!=SQLITE_OK );
5918 if( pHash ){
5919 sqlite3Fts2HashClear(pHash);
5920 sqlite3_free(pHash);
5921 }
5922 return rc;
5923}
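
/* Once registered, typical use looks like (illustrative):
**   CREATE VIRTUAL TABLE pages USING fts2(title, body);
**   INSERT INTO pages(title, body) VALUES('fts2', 'full-text search');
**   SELECT rowid, title FROM pages WHERE pages MATCH 'search';
*/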
5924
5925#if !SQLITE_CORE
5926int sqlite3_extension_init(
5927 sqlite3 *db,
5928 char **pzErrMsg,
5929 const sqlite3_api_routines *pApi
5930){
5931 SQLITE_EXTENSION_INIT2(pApi)
5932 return sqlite3Fts2Init(db);
5933}
5934#endif
5935
5936#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */