aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/sqlite/unix/sqlite-3.5.1/ext
diff options
context:
space:
mode:
authordan miller2007-10-21 08:36:32 +0000
committerdan miller2007-10-21 08:36:32 +0000
commit2f8d7092bc2c9609fa98d6888106b96f38b22828 (patch)
treeda6c37579258cc965b52a75aee6135fe44237698 /libraries/sqlite/unix/sqlite-3.5.1/ext
parent* Committing new PolicyManager based on an ACL system. (diff)
downloadopensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.zip
opensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.tar.gz
opensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.tar.bz2
opensim-SC_OLD-2f8d7092bc2c9609fa98d6888106b96f38b22828.tar.xz
libraries moved to opensim-libs, a new repository
Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext')
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/README.txt2
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt2
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c3344
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h11
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c369
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h112
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c643
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h90
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c221
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers134
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt4
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c5936
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h26
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c369
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h112
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c257
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c642
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c371
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h145
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c229
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl116
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers134
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt4
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c5971
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h26
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c373
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h110
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c257
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c642
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c371
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h145
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c229
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl116
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/icu/README.txt170
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/icu/icu.c499
35 files changed, 0 insertions, 22182 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/README.txt
deleted file mode 100644
index 009495f..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/README.txt
+++ /dev/null
@@ -1,2 +0,0 @@
1Version loadable extensions to SQLite are found in subfolders
2of this folder.
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt
deleted file mode 100644
index 292b7da..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt
+++ /dev/null
@@ -1,2 +0,0 @@
1This folder contains source code to the first full-text search
2extension for SQLite.
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c
deleted file mode 100644
index 5a69965..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c
+++ /dev/null
@@ -1,3344 +0,0 @@
1/* fts1 has a design flaw which can lead to database corruption (see
2** below). It is recommended not to use it any longer, instead use
3** fts3 (or higher). If you believe that your use of fts1 is safe,
4** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
5*/
6#ifndef SQLITE_ENABLE_BROKEN_FTS1
7#error fts1 has a design flaw and has been deprecated.
8#endif
9/* The flaw is that fts1 uses the content table's unaliased rowid as
10** the unique docid. fts1 embeds the rowid in the index it builds,
11** and expects the rowid to not change. The SQLite VACUUM operation
12** will renumber such rowids, thereby breaking fts1. If you are using
13** fts1 in a system which has disabled VACUUM, then you can continue
14** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
15** VACUUM, though systems using auto_vacuum are unlikely to invoke
16** VACUUM.
17**
18** fts1 should be safe even across VACUUM if you only insert documents
19** and never delete.
20*/
21
22/* The author disclaims copyright to this source code.
23 *
24 * This is an SQLite module implementing full-text search.
25 */
26
27/*
28** The code in this file is only compiled if:
29**
30** * The FTS1 module is being built as an extension
31** (in which case SQLITE_CORE is not defined), or
32**
33** * The FTS1 module is being built into the core of
34** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
35*/
36#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
37
38#if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
39# define SQLITE_CORE 1
40#endif
41
42#include <assert.h>
43#include <stdlib.h>
44#include <stdio.h>
45#include <string.h>
46#include <ctype.h>
47
48#include "fts1.h"
49#include "fts1_hash.h"
50#include "fts1_tokenizer.h"
51#include "sqlite3.h"
52#include "sqlite3ext.h"
53SQLITE_EXTENSION_INIT1
54
55
56#if 0
57# define TRACE(A) printf A; fflush(stdout)
58#else
59# define TRACE(A)
60#endif
61
62/* utility functions */
63
/*
** A simple growable, always-nul-terminated string buffer.
*/
typedef struct StringBuffer {
  int len;      /* length, not including null terminator */
  int alloced;  /* Space allocated for s[] */
  char *s;      /* Content of the string */
} StringBuffer;

/* Initialize sb to the empty string.
** On malloc failure, sb->s is NULL and sb->alloced is 0; nappend()
** recovers on first use because realloc(NULL,n) behaves like malloc(n).
** (The original left alloced==100 with s==NULL and crashed writing s[0].) */
static void initStringBuffer(StringBuffer *sb){
  sb->len = 0;
  sb->alloced = 100;
  sb->s = malloc(100);
  if( sb->s==NULL ){
    sb->alloced = 0;   /* claim no capacity we do not have */
  }else{
    sb->s[0] = '\0';
  }
}

/* Append the first nFrom bytes of zFrom to sb, growing the buffer as
** needed and keeping sb->s nul-terminated.
** On allocation failure the existing contents are preserved and the
** append is silently dropped.  (The original reset the buffer on
** failure, losing all accumulated data and leaking the old block.) */
static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  if( sb->len + nFrom >= sb->alloced ){
    int nNew = sb->len + nFrom + 100;      /* grow with slack */
    char *zNew = realloc(sb->s, nNew+1);   /* never clobber sb->s on failure */
    if( zNew==NULL ) return;
    sb->s = zNew;
    sb->alloced = nNew;
  }
  memcpy(sb->s + sb->len, zFrom, nFrom);
  sb->len += nFrom;
  sb->s[sb->len] = 0;
}

/* Append the nul-terminated string zFrom to sb. */
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}
93
94/* We encode variable-length integers in little-endian order using seven bits
95 * per byte as follows:
96**
97** KEY:
98** A = 0xxxxxxx 7 bits of data and one flag bit
99** B = 1xxxxxxx 7 bits of data and one flag bit
100**
101** 7 bits - A
102** 14 bits - BA
103** 21 bits - BBA
104** and so on.
105*/
106
107/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
108#define VARINT_MAX 10
109
110/* Write a 64-bit variable-length integer to memory starting at p[0].
111 * The length of data written will be between 1 and VARINT_MAX bytes.
112 * The number of bytes written is returned. */
113static int putVarint(char *p, sqlite_int64 v){
114 unsigned char *q = (unsigned char *) p;
115 sqlite_uint64 vu = v;
116 do{
117 *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
118 vu >>= 7;
119 }while( vu!=0 );
120 q[-1] &= 0x7f; /* turn off high bit in final byte */
121 assert( q - (unsigned char *)p <= VARINT_MAX );
122 return (int) (q - (unsigned char *)p);
123}
124
125/* Read a 64-bit variable-length integer from memory starting at p[0].
126 * Return the number of bytes read, or 0 on error.
127 * The value is stored in *v. */
128static int getVarint(const char *p, sqlite_int64 *v){
129 const unsigned char *q = (const unsigned char *) p;
130 sqlite_uint64 x = 0, y = 1;
131 while( (*q & 0x80) == 0x80 ){
132 x += y * (*q++ & 0x7f);
133 y <<= 7;
134 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
135 assert( 0 );
136 return 0;
137 }
138 }
139 x += y * (*q++);
140 *v = (sqlite_int64) x;
141 return (int) (q - (unsigned char *)p);
142}
143
144static int getVarint32(const char *p, int *pi){
145 sqlite_int64 i;
146 int ret = getVarint(p, &i);
147 *pi = (int) i;
148 assert( *pi==i );
149 return ret;
150}
151
152/*** Document lists ***
153 *
154 * A document list holds a sorted list of varint-encoded document IDs.
155 *
156 * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
157 *
158 * array {
159 * varint docid;
160 * array {
161 * varint position; (delta from previous position plus POS_BASE)
162 * varint startOffset; (delta from previous startOffset)
163 * varint endOffset; (delta from startOffset)
164 * }
165 * }
166 *
167 * Here, array { X } means zero or more occurrences of X, adjacent in memory.
168 *
169 * A position list may hold positions for text in multiple columns. A position
170 * POS_COLUMN is followed by a varint containing the index of the column for
171 * following positions in the list. Any positions appearing before any
172 * occurrences of POS_COLUMN are for column 0.
173 *
174 * A doclist with type DL_POSITIONS is like the above, but holds only docids
175 * and positions without offset information.
176 *
177 * A doclist with type DL_DOCIDS is like the above, but holds only docids
178 * without positions or offset information.
179 *
180 * On disk, every document list has positions and offsets, so we don't bother
181 * to serialize a doclist's type.
182 *
183 * We don't yet delta-encode document IDs; doing so will probably be a
184 * modest win.
185 *
186 * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
187 * After the first offset, estimate the next offset by using the
188 * current token position and the previous token position and offset,
189 * offset to handle some variance. So the estimate would be
190 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
191 * as normal. Offsets more than 64 chars from the estimate are
192 * encoded as the delta to the previous start offset + 128. An
193 * additional tiny increment can be gained by using the end offset of
194 * the previous token to make the estimate a tiny bit more precise.
195*/
196
/* The <ctype.h> classifiers have undefined behavior for values that are
** not representable as unsigned char (i.e. hi-bit-set chars on signed
** platforms), so each wrapper screens those out first.  This mirrors
** the approach used in the tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
static int safe_isspace(char c){
  if( c & 0x80 ) return 0;           /* hi-bit set: never whitespace */
  return isspace((unsigned char)c);
}
static int safe_tolower(char c){
  if( c & 0x80 ) return c;           /* hi-bit set: pass through unchanged */
  return tolower((unsigned char)c);
}
static int safe_isalnum(char c){
  if( c & 0x80 ) return 0;           /* hi-bit set: never alphanumeric */
  return isalnum((unsigned char)c);
}
215
/* The three doclist encodings, in increasing order of detail carried. */
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif

/* An in-memory doclist: a malloced buffer of varint-encoded data plus
** the bookkeeping needed by the writer routines to append delta-encoded
** entries incrementally. */
typedef struct DocList {
  char *pData;         /* The encoded doclist (see "Document lists" above) */
  int nData;           /* Number of valid bytes in pData[] */
  DocListType iType;   /* How much detail this doclist carries */
  int iLastColumn;     /* the last column written */
  int iLastPos;        /* the last position written */
  int iLastOffset;     /* the last start offset written */
} DocList;

/* Marker values used inside a position list.  Position deltas are
** stored offset by POS_BASE so that 0 and 1 stay free for markers. */
enum {
  POS_END = 0,         /* end of this position list */
  POS_COLUMN,          /* followed by new column number */
  POS_BASE
};
247
248/* Initialize a new DocList to hold the given data. */
249static void docListInit(DocList *d, DocListType iType,
250 const char *pData, int nData){
251 d->nData = nData;
252 if( nData>0 ){
253 d->pData = malloc(nData);
254 memcpy(d->pData, pData, nData);
255 } else {
256 d->pData = NULL;
257 }
258 d->iType = iType;
259 d->iLastColumn = 0;
260 d->iLastPos = d->iLastOffset = 0;
261}
262
263/* Create a new dynamically-allocated DocList. */
264static DocList *docListNew(DocListType iType){
265 DocList *d = (DocList *) malloc(sizeof(DocList));
266 docListInit(d, iType, 0, 0);
267 return d;
268}
269
270static void docListDestroy(DocList *d){
271 free(d->pData);
272#ifndef NDEBUG
273 memset(d, 0x55, sizeof(*d));
274#endif
275}
276
277static void docListDelete(DocList *d){
278 docListDestroy(d);
279 free(d);
280}
281
282static char *docListEnd(DocList *d){
283 return d->pData + d->nData;
284}
285
286/* Append a varint to a DocList's data. */
287static void appendVarint(DocList *d, sqlite_int64 i){
288 char c[VARINT_MAX];
289 int n = putVarint(c, i);
290 d->pData = realloc(d->pData, d->nData + n);
291 memcpy(d->pData + d->nData, c, n);
292 d->nData += n;
293}
294
295static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
296 appendVarint(d, iDocid);
297 if( d->iType>=DL_POSITIONS ){
298 appendVarint(d, POS_END); /* initially empty position list */
299 d->iLastColumn = 0;
300 d->iLastPos = d->iLastOffset = 0;
301 }
302}
303
/* helper function for docListAddPos and docListAddPosOffset.
** Append (iColumn, iPos) to the current (last) position list of d.
** The previous POS_END terminator is stripped first; callers are
** responsible for re-appending a terminator afterwards. */
static void addPos(DocList *d, int iColumn, int iPos){
  assert( d->nData>0 );
  --d->nData;  /* remove previous terminator */
  if( iColumn!=d->iLastColumn ){
    /* Columns must be written in strictly increasing order. */
    assert( iColumn>d->iLastColumn );
    appendVarint(d, POS_COLUMN);
    appendVarint(d, iColumn);
    d->iLastColumn = iColumn;
    d->iLastPos = d->iLastOffset = 0;   /* deltas restart per column */
  }
  assert( iPos>=d->iLastPos );          /* positions are non-decreasing */
  appendVarint(d, iPos-d->iLastPos+POS_BASE);  /* delta + POS_BASE bias */
  d->iLastPos = iPos;
}

/* Add a position to the last position list in a doclist. */
static void docListAddPos(DocList *d, int iColumn, int iPos){
  assert( d->iType==DL_POSITIONS );
  addPos(d, iColumn, iPos);
  appendVarint(d, POS_END); /* add new terminator */
}
326
/*
** Add a position and starting and ending offsets to a doclist.
**
** If the doclist is setup to handle only positions, then insert
** the position only and ignore the offsets.
*/
static void docListAddPosOffset(
  DocList *d,             /* Doclist under construction */
  int iColumn,            /* Column the inserted term is part of */
  int iPos,               /* Position of the inserted term */
  int iStartOffset,       /* Starting offset of inserted term */
  int iEndOffset          /* Ending offset of inserted term */
){
  assert( d->iType>=DL_POSITIONS );
  addPos(d, iColumn, iPos);
  if( d->iType==DL_POSITIONS_OFFSETS ){
    /* Offsets are delta-encoded: the start relative to the previous
    ** start offset, the end relative to this start. */
    assert( iStartOffset>=d->iLastOffset );
    appendVarint(d, iStartOffset-d->iLastOffset);
    d->iLastOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    appendVarint(d, iEndOffset-iStartOffset);
  }
  appendVarint(d, POS_END); /* add new terminator */
}
351
352/*
353** A DocListReader object is a cursor into a doclist. Initialize
354** the cursor to the beginning of the doclist by calling readerInit().
355** Then use routines
356**
357** peekDocid()
358** readDocid()
359** readPosition()
360** skipPositionList()
361** and so forth...
362**
363** to read information out of the doclist. When we reach the end
364** of the doclist, atEnd() returns TRUE.
365*/
366typedef struct DocListReader {
367 DocList *pDoclist; /* The document list we are stepping through */
368 char *p; /* Pointer to next unread byte in the doclist */
369 int iLastColumn;
370 int iLastPos; /* the last position read, or -1 when not in a position list */
371} DocListReader;
372
373/*
374** Initialize the DocListReader r to point to the beginning of pDoclist.
375*/
376static void readerInit(DocListReader *r, DocList *pDoclist){
377 r->pDoclist = pDoclist;
378 if( pDoclist!=NULL ){
379 r->p = pDoclist->pData;
380 }
381 r->iLastColumn = -1;
382 r->iLastPos = -1;
383}
384
/*
** Return TRUE if we have reached the end of pReader and there is
** nothing else left to read.  A reader over a NULL doclist is
** always at end.
*/
static int atEnd(DocListReader *pReader){
  return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
}

/* Peek at the next docid without advancing the read pointer.
** Requires: not atEnd(), and positioned between documents (not inside
** a position list).
*/
static sqlite_int64 peekDocid(DocListReader *pReader){
  sqlite_int64 ret;
  assert( !atEnd(pReader) );
  assert( pReader->iLastPos==-1 );   /* must be between documents */
  getVarint(pReader->p, &ret);
  return ret;
}

/* Read the next docid and advance past it.  If the doclist carries
** positions, the reader is left inside the docid's position list
** (iLastPos becomes 0).  See also nextDocid().
*/
static sqlite_int64 readDocid(DocListReader *pReader){
  sqlite_int64 ret;
  assert( !atEnd(pReader) );
  assert( pReader->iLastPos==-1 );
  pReader->p += getVarint(pReader->p, &ret);
  if( pReader->pDoclist->iType>=DL_POSITIONS ){
    pReader->iLastColumn = 0;
    pReader->iLastPos = 0;   /* now inside this docid's position list */
  }
  return ret;
}
416
/* Read the next position and column index from a position list.
 * Returns the position, or -1 at the end of the list (or when the
 * reader is not currently inside a position list).  On success,
 * *iColumn receives the column of the returned position; at end of
 * list it is set to -1. */
static int readPosition(DocListReader *pReader, int *iColumn){
  int i;
  int iType = pReader->pDoclist->iType;

  if( pReader->iLastPos==-1 ){
    return -1;               /* not inside a position list */
  }
  assert( !atEnd(pReader) );

  if( iType<DL_POSITIONS ){
    return -1;               /* this doclist carries no positions */
  }
  pReader->p += getVarint32(pReader->p, &i);
  if( i==POS_END ){
    /* Terminator: drop back to the "between documents" state. */
    pReader->iLastColumn = pReader->iLastPos = -1;
    *iColumn = -1;
    return -1;
  }
  if( i==POS_COLUMN ){
    /* Column change: read the new column number, reset the position
    ** delta base, then read the first position of the new column. */
    pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
    pReader->iLastPos = 0;
    pReader->p += getVarint32(pReader->p, &i);
    assert( i>=POS_BASE );
  }
  pReader->iLastPos += ((int) i)-POS_BASE;  /* positions are delta-encoded */
  if( iType>=DL_POSITIONS_OFFSETS ){
    /* Skip over offsets, ignoring them for now. */
    int iStart, iEnd;
    pReader->p += getVarint32(pReader->p, &iStart);
    pReader->p += getVarint32(pReader->p, &iEnd);
  }
  *iColumn = pReader->iLastColumn;
  return pReader->iLastPos;
}
453
454/* Skip past the end of a position list. */
455static void skipPositionList(DocListReader *pReader){
456 DocList *p = pReader->pDoclist;
457 if( p && p->iType>=DL_POSITIONS ){
458 int iColumn;
459 while( readPosition(pReader, &iColumn)!=-1 ){}
460 }
461}
462
463/* Skip over a docid, including its position list if the doclist has
464 * positions. */
465static void skipDocument(DocListReader *pReader){
466 readDocid(pReader);
467 skipPositionList(pReader);
468}
469
/* Skip past all docids which are less than [iDocid].  Returns 1 if a docid
 * matching [iDocid] was found, leaving the reader positioned on that
 * document; otherwise returns 0 with the reader on the first larger
 * docid (or at end). */
static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
  sqlite_int64 d = 0;
  while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
    skipDocument(pReader);
  }
  return !atEnd(pReader) && d==iDocid;
}

/* Return the first document in a document list.
** Requires a non-empty doclist (readDocid asserts !atEnd).
*/
static sqlite_int64 firstDocid(DocList *d){
  DocListReader r;
  readerInit(&r, d);
  return readDocid(&r);
}
487
#ifdef SQLITE_DEBUG
/*
** This routine is used for debugging purposes only.
**
** Write the content of a doclist to standard output, one document as
** "docid(col:pos:col:pos...)", comma separated.
*/
static void printDoclist(DocList *p){
  DocListReader r;
  const char *zSep = "";

  readerInit(&r, p);
  while( !atEnd(&r) ){
    sqlite_int64 docid = readDocid(&r);
    if( docid==0 ){
      /* docid 0 is not printable with this format; skip it */
      skipPositionList(&r);
      continue;
    }
    printf("%s%lld", zSep, docid);
    zSep = ",";
    if( p->iType>=DL_POSITIONS ){
      int iPos, iCol;
      const char *zDiv = "";
      printf("(");
      while( (iPos = readPosition(&r, &iCol))>=0 ){
        printf("%s%d:%d", zDiv, iCol, iPos);
        zDiv = ":";
      }
      printf(")");
    }
  }
  printf("\n");
  fflush(stdout);
}
#endif /* SQLITE_DEBUG */
522
/* Trim the given doclist in place to contain only positions in column
 * [iRestrictColumn].  Docids are retained even when they end up with
 * no positions in that column; see docListDiscardEmpty() for dropping
 * them. */
static void docListRestrictColumn(DocList *in, int iRestrictColumn){
  DocListReader r;
  DocList out;

  assert( in->iType>=DL_POSITIONS );
  readerInit(&r, in);
  docListInit(&out, DL_POSITIONS, NULL, 0);

  while( !atEnd(&r) ){
    sqlite_int64 iDocid = readDocid(&r);
    int iPos, iColumn;

    docListAddDocid(&out, iDocid);
    while( (iPos = readPosition(&r, &iColumn)) != -1 ){
      if( iColumn==iRestrictColumn ){
        docListAddPos(&out, iColumn, iPos);
      }
    }
  }

  /* Replace *in with the filtered copy. */
  docListDestroy(in);
  *in = out;
}
548
/* Trim the given doclist in place by discarding any docids without any
 * remaining positions. */
static void docListDiscardEmpty(DocList *in) {
  DocListReader r;
  DocList out;

  /* TODO: It would be nice to implement this operation in place; that
   * could save a significant amount of memory in queries with long doclists. */
  assert( in->iType>=DL_POSITIONS );
  readerInit(&r, in);
  docListInit(&out, DL_POSITIONS, NULL, 0);

  while( !atEnd(&r) ){
    sqlite_int64 iDocid = readDocid(&r);
    int match = 0;
    int iPos, iColumn;
    while( (iPos = readPosition(&r, &iColumn)) != -1 ){
      if( !match ){
        /* First surviving position: the docid is worth keeping. */
        docListAddDocid(&out, iDocid);
        match = 1;
      }
      docListAddPos(&out, iColumn, iPos);
    }
  }

  docListDestroy(in);
  *in = out;
}
577
/* Helper function for docListUpdate() and docListAccumulate().
** Splices a doclist element into the doclist represented by r,
** leaving r pointing after the newly spliced element.  If an element
** with docid iDocid already exists it is replaced in place; otherwise
** pSource/nSource is inserted at the docid-ordered insertion point.
**
** NOTE(review): the realloc() result below is unchecked, consistent
** with the rest of this legacy module; an OOM here would crash.
*/
static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
                                 const char *pSource, int nSource){
  DocList *d = r->pDoclist;
  char *pTarget;
  int nTarget, found;

  found = skipToDocid(r, iDocid);

  /* Describe slice in d to place pSource/nSource. */
  pTarget = r->p;
  if( found ){
    skipDocument(r);
    nTarget = r->p-pTarget;   /* replace the existing element */
  }else{
    nTarget = 0;              /* pure insertion before r's element */
  }

  /* The sense of the following is that there are three possibilities.
  ** If nTarget==nSource, we should not move any memory nor realloc.
  ** If nTarget>nSource, trim target and realloc.
  ** If nTarget<nSource, realloc then expand target.
  */
  if( nTarget>nSource ){
    memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
  }
  if( nTarget!=nSource ){
    /* pData may move in realloc; recompute pTarget from its offset. */
    int iDoclist = pTarget-d->pData;
    d->pData = realloc(d->pData, d->nData+nSource-nTarget);
    pTarget = d->pData+iDoclist;
  }
  if( nTarget<nSource ){
    memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
  }

  memcpy(pTarget, pSource, nSource);
  d->nData += nSource-nTarget;
  r->p = pTarget+nSource;
}
620
/* Insert/update pUpdate (a doclist whose first docid identifies the
** element) into d, keeping d sorted by docid.  Both doclists must
** have the same encoding type. */
static void docListUpdate(DocList *d, DocList *pUpdate){
  DocListReader reader;

  assert( d!=NULL && pUpdate!=NULL );
  assert( d->iType==pUpdate->iType);

  readerInit(&reader, d);
  docListSpliceElement(&reader, firstDocid(pUpdate),
                       pUpdate->pData, pUpdate->nData);
}
632
633/* Propagate elements from pUpdate to pAcc, overwriting elements with
634** matching docids.
635*/
636static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
637 DocListReader accReader, updateReader;
638
639 /* Handle edge cases where one doclist is empty. */
640 assert( pAcc!=NULL );
641 if( pUpdate==NULL || pUpdate->nData==0 ) return;
642 if( pAcc->nData==0 ){
643 pAcc->pData = malloc(pUpdate->nData);
644 memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
645 pAcc->nData = pUpdate->nData;
646 return;
647 }
648
649 readerInit(&accReader, pAcc);
650 readerInit(&updateReader, pUpdate);
651
652 while( !atEnd(&updateReader) ){
653 char *pSource = updateReader.p;
654 sqlite_int64 iDocid = readDocid(&updateReader);
655 skipPositionList(&updateReader);
656 docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
657 }
658}
659
660/*
661** Read the next docid off of pIn. Return 0 if we reach the end.
662*
663* TODO: This assumes that docids are never 0, but they may actually be 0 since
664* users can choose docids when inserting into a full-text table. Fix this.
665*/
666static sqlite_int64 nextDocid(DocListReader *pIn){
667 skipPositionList(pIn);
668 return atEnd(pIn) ? 0 : readDocid(pIn);
669}
670
/*
** pLeft and pRight are two DocListReaders that are pointing to the
** position lists of the same document: iDocid.
**
** If there are no instances in pLeft or pRight where the position
** of pLeft is one less than the position of pRight, then this
** routine adds nothing to pOut.
**
** If there are one or more instances where positions from pLeft
** are exactly one less than positions from pRight, then add a new
** document record to pOut.  If pOut wants to hold positions, then
** include the positions from pRight that are one more than a
** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1.
**
** pLeft and pRight are left pointing at the next document record.
*/
static void mergePosList(
  DocListReader *pLeft,    /* Left position list */
  DocListReader *pRight,   /* Right position list */
  sqlite_int64 iDocid,     /* The docid from pLeft and pRight */
  DocList *pOut            /* Write the merged document record here */
){
  int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
  int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
  int match = 0;

  /* Loop until we've reached the end of both position lists. */
  while( iLeftPos!=-1 && iRightPos!=-1 ){
    if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
      /* Adjacent pair: the phrase continues here. */
      if( !match ){
        docListAddDocid(pOut, iDocid);   /* emit docid on first hit only */
        match = 1;
      }
      if( pOut->iType>=DL_POSITIONS ){
        docListAddPos(pOut, iRightCol, iRightPos);
      }
      iLeftPos = readPosition(pLeft, &iLeftCol);
      iRightPos = readPosition(pRight, &iRightCol);
    }else if( iRightCol<iLeftCol ||
              (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
      /* Right side is behind: advance it. */
      iRightPos = readPosition(pRight, &iRightCol);
    }else{
      /* Left side is behind: advance it. */
      iLeftPos = readPosition(pLeft, &iLeftCol);
    }
  }
  /* Drain whichever reader has not yet consumed its POS_END marker so
  ** both are left positioned at the next document record. */
  if( iLeftPos>=0 ) skipPositionList(pLeft);
  if( iRightPos>=0 ) skipPositionList(pRight);
}
719
/* We have two doclists:  pLeft and pRight.
** Write the phrase intersection of these two doclists into pOut.
**
** A phrase intersection means that two documents only match
** if pLeft.iPos+1==pRight.iPos.
**
** The output pOut may or may not contain positions.  If pOut
** does contain positions, they are the positions of pRight.
*/
static void docListPhraseMerge(
  DocList *pLeft,    /* Doclist resulting from the words on the left */
  DocList *pRight,   /* Doclist for the next word to the right */
  DocList *pOut      /* Write the combined doclist here */
){
  DocListReader left, right;
  sqlite_int64 docidLeft, docidRight;

  readerInit(&left, pLeft);
  readerInit(&right, pRight);
  docidLeft = nextDocid(&left);
  docidRight = nextDocid(&right);

  while( docidLeft>0 && docidRight>0 ){
    if( docidLeft<docidRight ){
      docidLeft = nextDocid(&left);
    }else if( docidRight<docidLeft ){
      docidRight = nextDocid(&right);
    }else{
      /* Same document on both sides: check for adjacent positions. */
      mergePosList(&left, &right, docidLeft, pOut);
      docidLeft = nextDocid(&left);
      docidRight = nextDocid(&right);
    }
  }
}
754
755/* We have two doclists: pLeft and pRight.
756** Write the intersection of these two doclists into pOut.
757** Only docids are matched. Position information is ignored.
758**
759** The output pOut never holds positions.
760*/
761static void docListAndMerge(
762 DocList *pLeft, /* Doclist resulting from the words on the left */
763 DocList *pRight, /* Doclist for the next word to the right */
764 DocList *pOut /* Write the combined doclist here */
765){
766 DocListReader left, right;
767 sqlite_int64 docidLeft, docidRight;
768
769 assert( pOut->iType<DL_POSITIONS );
770
771 readerInit(&left, pLeft);
772 readerInit(&right, pRight);
773 docidLeft = nextDocid(&left);
774 docidRight = nextDocid(&right);
775
776 while( docidLeft>0 && docidRight>0 ){
777 if( docidLeft<docidRight ){
778 docidLeft = nextDocid(&left);
779 }else if( docidRight<docidLeft ){
780 docidRight = nextDocid(&right);
781 }else{
782 docListAddDocid(pOut, docidLeft);
783 docidLeft = nextDocid(&left);
784 docidRight = nextDocid(&right);
785 }
786 }
787}
788
/* We have two doclists:  pLeft and pRight.
** Write the union of these two doclists into pOut.
** Only docids are matched.  Position information is ignored.
**
** The output pOut never holds positions.
*/
static void docListOrMerge(
  DocList *pLeft,    /* Doclist resulting from the words on the left */
  DocList *pRight,   /* Doclist for the next word to the right */
  DocList *pOut      /* Write the combined doclist here */
){
  DocListReader left, right;
  sqlite_int64 docidLeft, docidRight, priorLeft;

  readerInit(&left, pLeft);
  readerInit(&right, pRight);
  docidLeft = nextDocid(&left);
  docidRight = nextDocid(&right);

  while( docidLeft>0 && docidRight>0 ){
    /* Emit the smaller docid (only once when both sides are equal). */
    if( docidLeft<=docidRight ){
      docListAddDocid(pOut, docidLeft);
    }else{
      docListAddDocid(pOut, docidRight);
    }
    /* Remember the left docid before advancing, so an equal right
    ** docid is also consumed (avoiding a duplicate emission). */
    priorLeft = docidLeft;
    if( docidLeft<=docidRight ){
      docidLeft = nextDocid(&left);
    }
    if( docidRight>0 && docidRight<=priorLeft ){
      docidRight = nextDocid(&right);
    }
  }
  /* Copy whatever remains from the side that has not run out. */
  while( docidLeft>0 ){
    docListAddDocid(pOut, docidLeft);
    docidLeft = nextDocid(&left);
  }
  while( docidRight>0 ){
    docListAddDocid(pOut, docidRight);
    docidRight = nextDocid(&right);
  }
}
831
/* We have two doclists:  pLeft and pRight.
** Write into pOut all documents that occur in pLeft but not
** in pRight.
**
** Only docids are matched.  Position information is ignored.
**
** The output pOut never holds positions.
*/
static void docListExceptMerge(
  DocList *pLeft,    /* Doclist resulting from the words on the left */
  DocList *pRight,   /* Doclist for the next word to the right */
  DocList *pOut      /* Write the combined doclist here */
){
  DocListReader left, right;
  sqlite_int64 docidLeft, docidRight, priorLeft;

  readerInit(&left, pLeft);
  readerInit(&right, pRight);
  docidLeft = nextDocid(&left);
  docidRight = nextDocid(&right);

  while( docidLeft>0 && docidRight>0 ){
    priorLeft = docidLeft;
    /* Keep a left docid only when the right list has already passed it. */
    if( docidLeft<docidRight ){
      docListAddDocid(pOut, docidLeft);
    }
    if( docidLeft<=docidRight ){
      docidLeft = nextDocid(&left);
    }
    if( docidRight>0 && docidRight<=priorLeft ){
      docidRight = nextDocid(&right);
    }
  }
  /* Anything remaining on the left has no counterpart on the right. */
  while( docidLeft>0 ){
    docListAddDocid(pOut, docidLeft);
    docidLeft = nextDocid(&left);
  }
}
870
/* Return a malloced, nul-terminated copy of the first n bytes of s.
** Returns NULL on malloc failure (the original crashed in memcpy).
** The caller must free() the result. */
static char *string_dup_n(const char *s, int n){
  char *str = malloc(n + 1);
  if( str==NULL ) return NULL;
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}

/* Duplicate a string; the caller must free() the returned string.
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
static char *string_dup(const char *s){
  return string_dup_n(s, strlen(s));
}
884
/* Format a string, replacing each occurrence of the % character with
 * "zDb.zName".  This may be more convenient than sqlite_mprintf()
 * when one string is used repeatedly in a format string.
 * Returns NULL on malloc failure (the original crashed writing the
 * result buffer).  The caller must free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* "db.name" */
  char *result;
  char *r;

  /* First pass: compute the output length. */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;                  /* for null terminator */

  r = result = malloc(len);
  if( result==NULL ) return NULL;
  /* Second pass: copy, expanding each '%'. */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );   /* both passes must agree */
  return result;
}
921
922static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
923 const char *zFormat){
924 char *zCommand = string_format(zFormat, zDb, zName);
925 int rc;
926 TRACE(("FTS1 sql: %s\n", zCommand));
927 rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
928 free(zCommand);
929 return rc;
930}
931
932static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
933 sqlite3_stmt **ppStmt, const char *zFormat){
934 char *zCommand = string_format(zFormat, zDb, zName);
935 int rc;
936 TRACE(("FTS1 prepare: %s\n", zCommand));
937 rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
938 free(zCommand);
939 return rc;
940}
941
942/* end utility functions */
943
/* Forward reference */
typedef struct fulltext_vtab fulltext_vtab;

/* A single term in a query is represented by an instance of
** the following structure.  A quoted phrase is stored as a run of
** consecutive QueryTerms: nPhrase on the first term gives the number
** of terms that follow it in the same phrase, and iPhrase records
** each term's position within that phrase.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;
959
960
961/* A query string is parsed into a Query structure.
962 *
963 * We could, in theory, allow query strings to be complicated
964 * nested expressions with precedence determined by parentheses.
965 * But none of the major search engines do this. (Perhaps the
 * feeling is that a parenthesized expression is too complex of
967 * an idea for the average user to grasp.) Taking our lead from
968 * the major search engines, we will allow queries to be a list
969 * of terms (with an implied AND operator) or phrases in double-quotes,
970 * with a single optional "-" before each non-phrase term to designate
971 * negation and an optional OR connector.
972 *
973 * OR binds more tightly than the implied AND, which is what the
974 * major search engines seem to do. So, for example:
975 *
976 * [one two OR three] ==> one AND (two OR three)
977 * [one OR two three] ==> (one OR two) AND three
978 *
979 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
981 * space. This is how the search engines do it.
982 *
983 * A NOT term cannot be the right-hand operand of an OR. If this
984 * occurs in the query string, the NOT is ignored:
985 *
986 * [one OR -two] ==> one OR two
987 *
988 */
/* Parsed representation of a full-text query string.  See the block
** comment above for the grammar that the parser accepts. */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;
997
998
/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.  aMatch[] is a
** malloc'd array that grows to nAlloc entries, one per match.
*/
typedef struct Snippet {
  int nMatch;             /* Total number of matches */
  int nAlloc;             /* Space allocated for aMatch[] */
  struct snippetMatch {   /* One entry for each matching term */
    char snStatus;        /* Status flag for use while constructing snippets */
    short int iCol;       /* The column that contains the match */
    short int iTerm;      /* The index in Query.pTerms[] of the matching term */
    short int nByte;      /* Number of bytes in the term */
    int iStart;           /* The offset to the first character of the term */
  } *aMatch;              /* Points to space obtained from malloc */
  char *zOffset;          /* Text rendering of aMatch[] */
  int nOffset;            /* strlen(zOffset) */
  char *zSnippet;         /* Snippet text */
  int nSnippet;           /* strlen(zSnippet) */
} Snippet;
1018
1019
/* The scan strategy a cursor is using; a copy of
** sqlite3_index_info.idxNum is stored in fulltext_cursor.iCursorType. */
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;

/* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
** before we start aggregating into larger segments.  Lower CHUNK_MAX
** means that for a given input we have more individual segments per
** term, which means more rows in the table and a bigger index (due to
** both more rows and bigger rowids).  But it also reduces the average
** cost of adding new elements to the segment 0 doclist, and it seems
** to reduce the number of pages read and written during inserts.  256
** was chosen by measuring insertion times for a certain input (first
** 10k documents of Enron corpus), though including query performance
** in the decision may argue for a larger value.
*/
#define CHUNK_MAX 256
1038
/* Indices into fulltext_vtab.pFulltextStatements[], the per-table
** cache of prepared statements.  The order here must exactly match
** the fulltext_zStatement[] array that follows. */
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,

  TERM_SELECT_STMT,
  TERM_SELECT_ALL_STMT,
  TERM_INSERT_STMT,
  TERM_UPDATE_STMT,
  TERM_DELETE_STMT,

  MAX_STMT /* Always at end! */
} fulltext_statement;
1053
/* SQL text for each cached statement; '%' expands to "db.table" via
** string_format().  These must exactly match the enum above.  NULL
** entries are generated at prepare time because their column lists
** depend on the table's schema. */
/* TODO(adam): Is there some risk that a statement (in particular,
** pTermSelectStmt) will be used in two cursors at once, e.g.  if a
** query joins a virtual table to itself?  If so perhaps we should
** move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where rowid = ?",

  /* TERM_SELECT */
  "select rowid, doclist from %_term where term = ? and segment = ?",
  /* TERM_SELECT_ALL */
  "select doclist from %_term where term = ? order by segment",
  /* TERM_INSERT */
  "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
  /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
  /* TERM_DELETE */ "delete from %_term where rowid = ?",
};
1075
/*
** A connection to a fulltext index is an instance of the following
** structure.  The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
** arguments.
*/
struct fulltext_vtab {
  sqlite3_vtab base;               /* Base class used by SQLite core */
  sqlite3 *db;                     /* The database connection */
  const char *zDb;                 /* logical database name */
  const char *zName;               /* virtual table name */
  int nColumn;                     /* number of columns in virtual table */
  char **azColumn;                 /* column names.  malloced */
  char **azContentColumn;          /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open.  Indexed by fulltext_statement; entries are created lazily
  ** by sql_get_statement() and finalized in fulltext_vtab_destroy().
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];
};
1098
/*
** When the core wants to do a query, it creates a cursor using a
** call to xOpen.  This structure is an instance of a cursor.  It
** is destroyed by xClose.
*/
typedef struct fulltext_cursor {
  sqlite3_vtab_cursor base;   /* Base class used by SQLite core */
  QueryType iCursorType;      /* Copy of sqlite3_index_info.idxNum */
  sqlite3_stmt *pStmt;        /* Prepared statement in use by the cursor */
  int eof;                    /* True if at End Of Results */
  Query q;                    /* Parsed query string */
  Snippet snippet;            /* Cached snippet for the current row */
  int iColumn;                /* Column being searched */
  DocListReader result;       /* used when iCursorType == QUERY_FULLTEXT */
} fulltext_cursor;
1114
1115static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
1116 return (fulltext_vtab *) c->base.pVtab;
1117}
1118
1119static const sqlite3_module fulltextModule; /* forward declaration */
1120
1121/* Append a list of strings separated by commas to a StringBuffer. */
1122static void appendList(StringBuffer *sb, int nString, char **azString){
1123 int i;
1124 for(i=0; i<nString; ++i){
1125 if( i>0 ) append(sb, ", ");
1126 append(sb, azString[i]);
1127 }
1128}
1129
1130/* Return a dynamically generated statement of the form
1131 * insert into %_content (rowid, ...) values (?, ...)
1132 */
1133static const char *contentInsertStatement(fulltext_vtab *v){
1134 StringBuffer sb;
1135 int i;
1136
1137 initStringBuffer(&sb);
1138 append(&sb, "insert into %_content (rowid, ");
1139 appendList(&sb, v->nColumn, v->azContentColumn);
1140 append(&sb, ") values (?");
1141 for(i=0; i<v->nColumn; ++i)
1142 append(&sb, ", ?");
1143 append(&sb, ")");
1144 return sb.s;
1145}
1146
1147/* Return a dynamically generated statement of the form
1148 * update %_content set [col_0] = ?, [col_1] = ?, ...
1149 * where rowid = ?
1150 */
1151static const char *contentUpdateStatement(fulltext_vtab *v){
1152 StringBuffer sb;
1153 int i;
1154
1155 initStringBuffer(&sb);
1156 append(&sb, "update %_content set ");
1157 for(i=0; i<v->nColumn; ++i) {
1158 if( i>0 ){
1159 append(&sb, ", ");
1160 }
1161 append(&sb, v->azContentColumn[i]);
1162 append(&sb, " = ?");
1163 }
1164 append(&sb, " where rowid = ?");
1165 return sb.s;
1166}
1167
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
**
** Returns SQLITE_OK on success; on failure *ppStmt is not written.
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    switch( iStmt ){
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                         zStmt);
    /* The INSERT/UPDATE texts are generated (malloced) per call; the
    ** rest point into the static fulltext_zStatement[] table and must
    ** not be freed.  The pointer comparison distinguishes the two. */
    if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}
1198
1199/* Step the indicated statement, handling errors SQLITE_BUSY (by
1200** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
1201** bindings to the new statement).
1202** TODO(adam): We should extend this function so that it can work with
1203** statements declared locally, not only globally cached statements.
1204*/
1205static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
1206 sqlite3_stmt **ppStmt){
1207 int rc;
1208 sqlite3_stmt *s = *ppStmt;
1209 assert( iStmt<MAX_STMT );
1210 assert( s==v->pFulltextStatements[iStmt] );
1211
1212 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
1213 if( rc==SQLITE_BUSY ) continue;
1214 if( rc!=SQLITE_ERROR ) return rc;
1215
1216 /* If an SQLITE_SCHEMA error has occured, then finalizing this
1217 * statement is going to delete the fulltext_vtab structure. If
1218 * the statement just executed is in the pFulltextStatements[]
1219 * array, it will be finalized twice. So remove it before
1220 * calling sqlite3_finalize().
1221 */
1222 v->pFulltextStatements[iStmt] = NULL;
1223 rc = sqlite3_finalize(s);
1224 break;
1225 }
1226 return rc;
1227
1228 err:
1229 sqlite3_finalize(s);
1230 return rc;
1231}
1232
1233/* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
1234** Useful for statements like UPDATE, where we expect no results.
1235*/
1236static int sql_single_step_statement(fulltext_vtab *v,
1237 fulltext_statement iStmt,
1238 sqlite3_stmt **ppStmt){
1239 int rc = sql_step_statement(v, iStmt, ppStmt);
1240 return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1241}
1242
/* insert into %_content (rowid, ...) values ([rowid], [pValues])
**
** Binds rowid to parameter 1 and the v->nColumn values to parameters
** 2..nColumn+1 of the cached CONTENT_INSERT statement, then steps it
** to completion.  Returns SQLITE_OK on success, else an SQLite error
** code.
*/
static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
                          sqlite3_value **pValues){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_value(s, 1, rowid);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 2+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
}
1261
/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
 *                  where rowid = [iRowid]
 *
 * Binds the v->nColumn new values to parameters 1..nColumn and the
 * rowid to parameter nColumn+1, then steps the cached statement to
 * completion.  Returns SQLITE_OK on success.
 */
static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
                          sqlite_int64 iRowid){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 1+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
}
1281
/* Free every string in pString[0..nString-1], then the array itself.
** NULL entries are fine: free(NULL) is defined as a no-op by the C
** standard, so the previous per-element NULL guard was redundant and
** has been dropped. */
static void freeStringArray(int nString, const char **pString){
  int i;

  for (i=0 ; i < nString ; ++i) {
    free((void *) pString[i]);
  }
  free((void *) pString);
}
1290
1291/* select * from %_content where rowid = [iRow]
1292 * The caller must delete the returned array and all strings in it.
1293 * null fields will be NULL in the returned array.
1294 *
1295 * TODO: Perhaps we should return pointer/length strings here for consistency
1296 * with other code which uses pointer/length. */
1297static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
1298 const char ***pValues){
1299 sqlite3_stmt *s;
1300 const char **values;
1301 int i;
1302 int rc;
1303
1304 *pValues = NULL;
1305
1306 rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
1307 if( rc!=SQLITE_OK ) return rc;
1308
1309 rc = sqlite3_bind_int64(s, 1, iRow);
1310 if( rc!=SQLITE_OK ) return rc;
1311
1312 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
1313 if( rc!=SQLITE_ROW ) return rc;
1314
1315 values = (const char **) malloc(v->nColumn * sizeof(const char *));
1316 for(i=0; i<v->nColumn; ++i){
1317 if( sqlite3_column_type(s, i)==SQLITE_NULL ){
1318 values[i] = NULL;
1319 }else{
1320 values[i] = string_dup((char*)sqlite3_column_text(s, i));
1321 }
1322 }
1323
1324 /* We expect only one row. We must execute another sqlite3_step()
1325 * to complete the iteration; otherwise the table will remain locked. */
1326 rc = sqlite3_step(s);
1327 if( rc==SQLITE_DONE ){
1328 *pValues = values;
1329 return SQLITE_OK;
1330 }
1331
1332 freeStringArray(v->nColumn, values);
1333 return rc;
1334}
1335
1336/* delete from %_content where rowid = [iRow ] */
1337static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
1338 sqlite3_stmt *s;
1339 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
1340 if( rc!=SQLITE_OK ) return rc;
1341
1342 rc = sqlite3_bind_int64(s, 1, iRow);
1343 if( rc!=SQLITE_OK ) return rc;
1344
1345 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
1346}
1347
/* select rowid, doclist from %_term
 *  where term = [pTerm] and segment = [iSegment]
 * If found, returns SQLITE_ROW; the caller must free the
 * returned doclist.  If no rows found, returns SQLITE_DONE. */
static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
                       int iSegment,
                       sqlite_int64 *rowid, DocList *out){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 2, iSegment);
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
  if( rc!=SQLITE_ROW ) return rc;

  /* Copy the row's data out before stepping again: the next step
  ** invalidates the column blob/int pointers. */
  *rowid = sqlite3_column_int64(s, 0);
  docListInit(out, DL_DEFAULT,
              sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  return rc==SQLITE_DONE ? SQLITE_ROW : rc;
}
1377
/* Load the segment doclists for term pTerm and merge them in
** appropriate order into out.  Returns SQLITE_OK if successful.  If
** there are no segments for pTerm, successfully returns an empty
** doclist in out.
**
** Each document consists of 1 or more "columns".  The number of
** columns is v->nColumn.  If iColumn==v->nColumn, then return
** position information about all columns.  If iColumn<v->nColumn,
** then only return position information about the iColumn-th column
** (where the first column is 0).
*/
static int term_select_all(
  fulltext_vtab *v,     /* The fulltext index we are querying against */
  int iColumn,          /* If <nColumn, only look at the iColumn-th column */
  const char *pTerm,    /* The term whose posting lists we want */
  int nTerm,            /* Number of bytes in pTerm */
  DocList *out          /* Write the resulting doclist here */
){
  DocList doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  docListInit(&doclist, DL_DEFAULT, 0, 0);

  /* TODO(shess) Handle schema and busy errors. */
  /* Rows arrive ordered by ascending segment number, i.e. segment 0
  ** (where new data lands -- see the CHUNK_MAX notes above) first;
  ** doclist accumulates the merge of everything seen so far. */
  while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
    DocList old;

    /* TODO(shess) If we processed doclists from oldest to newest, we
    ** could skip the malloc() involved with the following call.  For
    ** now, I'd rather keep this logic similar to index_insert_term().
    ** We could additionally drop elements when we see deletes, but
    ** that would require a distinct version of docListAccumulate().
    */
    docListInit(&old, DL_DEFAULT,
                sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));

    if( iColumn<v->nColumn ){   /* querying a single column */
      docListRestrictColumn(&old, iColumn);
    }

    /* doclist contains the newer data, so write it over old.  Then
    ** steal accumulated result for doclist.
    */
    docListAccumulate(&old, &doclist);
    docListDestroy(&doclist);
    doclist = old;
  }
  if( rc!=SQLITE_DONE ){
    docListDestroy(&doclist);
    return rc;
  }

  docListDiscardEmpty(&doclist);
  *out = doclist;
  return SQLITE_OK;
}
1439
/* insert into %_term (rowid, term, segment, doclist)
               values ([piRowid], [pTerm], [iSegment], [doclist])
** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
**
** NOTE(shess) piRowid is IN, with values of "space of int64" plus
** null, it is not used to pass data back to the caller.
**
** All bindings use SQLITE_STATIC: the statement is stepped to
** completion before this function returns, so the caller's buffers
** only need to outlive the call.
*/
static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
                       const char *pTerm, int nTerm,
                       int iSegment, DocList *doclist){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  if( piRowid==NULL ){
    rc = sqlite3_bind_null(s, 1);
  }else{
    rc = sqlite3_bind_int64(s, 1, *piRowid);
  }
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 3, iSegment);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
}
1472
1473/* update %_term set doclist = [doclist] where rowid = [rowid] */
1474static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
1475 DocList *doclist){
1476 sqlite3_stmt *s;
1477 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
1478 if( rc!=SQLITE_OK ) return rc;
1479
1480 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
1481 if( rc!=SQLITE_OK ) return rc;
1482
1483 rc = sqlite3_bind_int64(s, 2, rowid);
1484 if( rc!=SQLITE_OK ) return rc;
1485
1486 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
1487}
1488
1489static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
1490 sqlite3_stmt *s;
1491 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
1492 if( rc!=SQLITE_OK ) return rc;
1493
1494 rc = sqlite3_bind_int64(s, 1, rowid);
1495 if( rc!=SQLITE_OK ) return rc;
1496
1497 return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
1498}
1499
/*
** Free the memory used to contain a fulltext_vtab structure:
** finalize all cached statements, destroy the tokenizer, and release
** the column-name arrays.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
  int iStmt, i;

  TRACE(("FTS1 Destroy %p\n", v));
  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
    if( v->pFulltextStatements[iStmt]!=NULL ){
      sqlite3_finalize(v->pFulltextStatements[iStmt]);
      v->pFulltextStatements[iStmt] = NULL;
    }
  }

  if( v->pTokenizer!=NULL ){
    /* The tokenizer module is responsible for its own teardown. */
    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
    v->pTokenizer = NULL;
  }

  free(v->azColumn);
  for(i = 0; i < v->nColumn; ++i) {
    /* azContentColumn[i] comes from sqlite3_mprintf() (see parseSpec()),
    ** so it must be released with sqlite3_free(), not free(). */
    sqlite3_free(v->azContentColumn[i]);
  }
  free(v->azContentColumn);
  free(v);
}
1526
/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */

/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters,
** sqlite3IsIdChar[X] must be 1.
**
** Ticket #1066.  the SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* NOTE: the macro assigns to a variable `c` that must be declared at
** every use site (see getToken()).  The table covers 0x20..0x7f only;
** high-bit characters are accepted without a table lookup. */
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
1559
1560
/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;   /* NOTE: c is also assigned to inside the IdChar() macro */
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      /* Collapse an entire run of whitespace into one TOKEN_SPACE. */
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      /* Quoted string; a doubled delimiter inside is an escape. */
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;
          }else{
            break;
          }
        }
      }
      *tokenType = TOKEN_STRING;
      /* Count the closing delimiter only if the string was terminated
      ** (c!=0 means the loop broke on the delimiter, not on NUL). */
      return i + (c!=0);
    }
    case '[': {
      /* [bracketed identifier] -- scan to ']' or end of input.
      ** c starts as '[' so the loop body always runs at least once. */
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;    /* fall through to TOKEN_PUNCT below */
      }
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  *tokenType = TOKEN_PUNCT;
  return 1;
}
1610
/*
** A token extracted from a string is an instance of the following
** structure.  z points into the original input, so it remains valid
** only as long as that input does.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes. */
} Token;
1619
1620/*
1621** Given a input string (which is really one of the argv[] parameters
1622** passed into xConnect or xCreate) split the string up into tokens.
1623** Return an array of pointers to '\000' terminated strings, one string
1624** for each non-whitespace token.
1625**
1626** The returned array is terminated by a single NULL pointer.
1627**
1628** Space to hold the returned array is obtained from a single
1629** malloc and should be freed by passing the return value to free().
1630** The individual strings within the token list are all a part of
1631** the single memory allocation and will all be freed at once.
1632*/
1633static char **tokenizeString(const char *z, int *pnToken){
1634 int nToken = 0;
1635 Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
1636 int n = 1;
1637 int e, i;
1638 int totalSize = 0;
1639 char **azToken;
1640 char *zCopy;
1641 while( n>0 ){
1642 n = getToken(z, &e);
1643 if( e!=TOKEN_SPACE ){
1644 aToken[nToken].z = z;
1645 aToken[nToken].n = n;
1646 nToken++;
1647 totalSize += n+1;
1648 }
1649 z += n;
1650 }
1651 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
1652 zCopy = (char*)&azToken[nToken];
1653 nToken--;
1654 for(i=0; i<nToken; i++){
1655 azToken[i] = zCopy;
1656 n = aToken[i].n;
1657 memcpy(zCopy, aToken[i].z, n);
1658 zCopy[n] = 0;
1659 zCopy += n+1;
1660 }
1661 azToken[nToken] = 0;
1662 free(aToken);
1663 *pnToken = nToken;
1664 return azToken;
1665}
1666
/*
** Strip SQL-style quoting from z in place.  Recognized forms:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
**
** A doubled closing-quote character inside the string collapses to a
** single literal quote.  If z does not begin with a quote character
** this routine is a no-op.
*/
static void dequoteString(char *z){
  int cQuote;
  int iIn, iOut;
  if( z==0 ) return;
  switch( z[0] ){
    case '\'':
    case '"':
      cQuote = z[0];
      break;
    case '`':                  /* For MySQL compatibility */
      cQuote = '`';
      break;
    case '[':                  /* For MS SqlServer compatibility */
      cQuote = ']';
      break;
    default:
      return;
  }
  iOut = 0;
  for(iIn=1; z[iIn]; iIn++){
    if( z[iIn]!=cQuote ){
      z[iOut++] = z[iIn];      /* ordinary character: shift left */
    }else if( z[iIn+1]==cQuote ){
      z[iOut++] = cQuote;      /* doubled quote: emit one */
      iIn++;
    }else{
      z[iOut++] = 0;           /* closing quote: terminate and stop */
      break;
    }
  }
}
1706
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.
**
** Example:
**
**     input:      tokenize chinese ( 'simplifed' , 'mixed' )
**     output:     chinese simplifed mixed
**
** Another example:
**
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    /* A token survives if its first char is alphanumeric OR it is more
    ** than one character long (single punctuation chars are dropped).
    ** j lags one behind the surviving tokens; starting it at -1 is
    ** what discards the first surviving token (the leading keyword).
    ** NOTE(review): if NO token survived the filter, the final azIn[j]
    ** store would write azIn[-1].  The lists passed in here begin with
    ** an identifier, so this looks unreachable -- confirm. */
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    azIn[j] = 0;
  }
}
1737
1738
/*
** Find the first alphanumeric token in the string zIn.  Null-terminate
** this token.  Remove any quotation marks.  And return a pointer to
** the result.  Returns NULL if zIn holds nothing but whitespace.
*/
static char *firstToken(char *zIn, char **pzTail){
  int n, ttype;
  while(1){
    n = getToken(zIn, &ttype);
    if( ttype==TOKEN_SPACE ){
      zIn += n;               /* skip leading whitespace */
    }else if( ttype==TOKEN_EOF ){
      *pzTail = zIn;
      return 0;
    }else{
      zIn[n] = 0;
      /* NOTE(review): the tail is set one character past the START of
      ** the token rather than past its end (&zIn[n]).  The visible
      ** caller (parseSpec) ignores the tail, so this is harmless here,
      ** but confirm before relying on *pzTail elsewhere. */
      *pzTail = &zIn[1];
      dequoteString(zIn);
      return zIn;
    }
  }
  /*NOTREACHED*/
}
1762
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not alphanumeric (or '_')
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  for(; safe_isspace(*s); s++){}
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ) return 0;
  }
  /* t fully matched; require a token boundary in s next. */
  return *s!='_' && !safe_isalnum(*s);
}
1781
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
1795
/*
** Reclaim all of the memory used by a TableSpec.
**
** NOTE(review): only the three top-level arrays are freed.  azColumn
** is the single azArg allocation from parseSpec() (the token strings
** live inside it), so one free() covers it.  The azContentColumn[i]
** strings come from sqlite3_mprintf() and are NOT released here --
** presumably ownership passes to the fulltext_vtab, whose destructor
** sqlite3_free()s them; confirm against constructVtab().
*/
static void clearTableSpec(TableSpec *p) {
  free(p->azColumn);
  free(p->azContentColumn);
  free(p->azTokenizer);
}
1804
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 *
 * CREATE VIRTUAL TABLE email
 *        USING fts1(subject, body, tokenize mytokenizer(myarg))
 *
 * We return parsed information in a TableSpec structure.
 *
 * Returns SQLITE_OK, or SQLITE_NOMEM on allocation failure.
 */
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                     char**pzErr){
  int i, n;
  char *z, *zDummy;
  char **azArg;
  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */

  assert( argc>=3 );
  /* Current interface:
  ** argv[0] - module name
  ** argv[1] - database name
  ** argv[2] - table name
  ** argv[3..] - columns, optionally followed by tokenizer specification
  **             and snippet delimiters specification.
  */

  /* Make a copy of the complete argv[][] array in a single allocation.
  ** The argv[][] array is read-only and transient.  We can write to the
  ** copy in order to modify things and the copy is persistent.
  */
  memset(pSpec, 0, sizeof(*pSpec));
  for(i=n=0; i<argc; i++){
    n += strlen(argv[i]) + 1;
  }
  azArg = malloc( sizeof(char*)*argc + n );
  if( azArg==0 ){
    return SQLITE_NOMEM;
  }
  z = (char*)&azArg[argc];   /* string data follows the pointer array */
  for(i=0; i<argc; i++){
    azArg[i] = z;
    strcpy(z, argv[i]);      /* bounded: n was sized from these strings */
    z += strlen(z)+1;
  }

  /* Identify the column names and the tokenizer and delimiter arguments
  ** in the argv[][] array.
  */
  pSpec->zDb = azArg[1];
  pSpec->zName = azArg[2];
  pSpec->nColumn = 0;
  pSpec->azColumn = azArg;
  zTokenizer = "tokenize simple";   /* default if no "tokenize" arg given */
  for(i=3; i<argc; ++i){
    if( startsWith(azArg[i],"tokenize") ){
      zTokenizer = azArg[i];
    }else{
      /* Column args are compacted to the front of azArg[], each reduced
      ** to its first (dequoted) token; any trailing type name in the
      ** declaration is discarded. */
      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
      pSpec->nColumn++;
    }
  }
  if( pSpec->nColumn==0 ){
    azArg[0] = "content";   /* no columns declared: one default column */
    pSpec->nColumn = 1;
  }

  /*
  ** Construct the list of content column names.
  **
  ** Each content column name will be of the form cNNAAAA
  ** where NN is the column number and AAAA is the sanitized
  ** column name.  "sanitized" means that special characters are
  ** converted to "_".  The cNN prefix guarantees that all column
  ** names are unique.
  **
  ** The AAAA suffix is not strictly necessary.  It is included
  ** for the convenience of people who might examine the generated
  ** %_content table and wonder what the columns are used for.
  */
  pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
  if( pSpec->azContentColumn==0 ){
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    /* NOTE(review): sqlite3_mprintf() can return NULL on OOM; the
    ** result is dereferenced unchecked in the loop below -- confirm
    ** or handle upstream. */
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  tokenListToIdList(pSpec->azTokenizer);

  return SQLITE_OK;
}
1903
/*
** Generate a CREATE TABLE statement describing the schema of the
** virtual table: one column per user column, plus a final hidden
** column named after the table itself.
**
** The returned string is obtained from sqlite3_mprintf() and must be
** released with sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int iCol;
  char *zCur = sqlite3_mprintf("CREATE TABLE x");
  char *zTmp;
  for(iCol=0; iCol<nColumn; iCol++){
    /* First column opens the parenthesis; the rest are comma-separated. */
    zTmp = sqlite3_mprintf("%s%s%Q", zCur, iCol==0 ? "(" : ",",
                           azColumn[iCol]);
    sqlite3_free(zCur);
    zCur = zTmp;
  }
  /* Append the hidden table-named column and close the list. */
  zTmp = sqlite3_mprintf("%s,%Q)", zCur, zTableName);
  sqlite3_free(zCur);
  return zTmp;
}
1930
1931/*
1932** Build a new sqlite3_vtab structure that will describe the
1933** fulltext index defined by spec.
1934*/
1935static int constructVtab(
1936 sqlite3 *db, /* The SQLite database connection */
1937 TableSpec *spec, /* Parsed spec information from parseSpec() */
1938 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
1939 char **pzErr /* Write any error message here */
1940){
1941 int rc;
1942 int n;
1943 fulltext_vtab *v = 0;
1944 const sqlite3_tokenizer_module *m = NULL;
1945 char *schema;
1946
1947 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
1948 if( v==0 ) return SQLITE_NOMEM;
1949 memset(v, 0, sizeof(*v));
1950 /* sqlite will initialize v->base */
1951 v->db = db;
1952 v->zDb = spec->zDb; /* Freed when azColumn is freed */
1953 v->zName = spec->zName; /* Freed when azColumn is freed */
1954 v->nColumn = spec->nColumn;
1955 v->azContentColumn = spec->azContentColumn;
1956 spec->azContentColumn = 0;
1957 v->azColumn = spec->azColumn;
1958 spec->azColumn = 0;
1959
1960 if( spec->azTokenizer==0 ){
1961 return SQLITE_NOMEM;
1962 }
1963 /* TODO(shess) For now, add new tokenizers as else if clauses. */
1964 if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
1965 sqlite3Fts1SimpleTokenizerModule(&m);
1966 }else if( startsWith(spec->azTokenizer[0], "porter") ){
1967 sqlite3Fts1PorterTokenizerModule(&m);
1968 }else{
1969 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
1970 rc = SQLITE_ERROR;
1971 goto err;
1972 }
1973 for(n=0; spec->azTokenizer[n]; n++){}
1974 if( n ){
1975 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
1976 &v->pTokenizer);
1977 }else{
1978 rc = m->xCreate(0, 0, &v->pTokenizer);
1979 }
1980 if( rc!=SQLITE_OK ) goto err;
1981 v->pTokenizer->pModule = m;
1982
1983 /* TODO: verify the existence of backing tables foo_content, foo_term */
1984
1985 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
1986 spec->zName);
1987 rc = sqlite3_declare_vtab(db, schema);
1988 sqlite3_free(schema);
1989 if( rc!=SQLITE_OK ) goto err;
1990
1991 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
1992
1993 *ppVTab = &v->base;
1994 TRACE(("FTS1 Connect %p\n", v));
1995
1996 return rc;
1997
1998err:
1999 fulltext_vtab_destroy(v);
2000 return rc;
2001}
2002
2003static int fulltextConnect(
2004 sqlite3 *db,
2005 void *pAux,
2006 int argc, const char *const*argv,
2007 sqlite3_vtab **ppVTab,
2008 char **pzErr
2009){
2010 TableSpec spec;
2011 int rc = parseSpec(&spec, argc, argv, pzErr);
2012 if( rc!=SQLITE_OK ) return rc;
2013
2014 rc = constructVtab(db, &spec, ppVTab, pzErr);
2015 clearTableSpec(&spec);
2016 return rc;
2017}
2018
2019 /* The %_content table holds the text of each document, with
2020 ** the rowid used as the docid.
2021 **
2022 ** The %_term table maps each term to a document list blob
2023 ** containing elements sorted by ascending docid, each element
2024 ** encoded as:
2025 **
2026 ** docid varint-encoded
2027 ** token elements:
2028 ** position+1 varint-encoded as delta from previous position
2029 ** start offset varint-encoded as delta from previous start offset
2030 ** end offset varint-encoded as delta from start offset
2031 **
2032 ** The sentinel position of 0 indicates the end of the token list.
2033 **
2034 ** Additionally, doclist blobs are chunked into multiple segments,
2035 ** using segment to order the segments. New elements are added to
2036 ** the segment at segment 0, until it exceeds CHUNK_MAX. Then
2037 ** segment 0 is deleted, and the doclist is inserted at segment 1.
2038 ** If there is already a doclist at segment 1, the segment 0 doclist
2039 ** is merged with it, the segment 1 doclist is deleted, and the
2040 ** merged doclist is inserted at segment 2, repeating those
2041 ** operations until an insert succeeds.
2042 **
2043 ** Since this structure doesn't allow us to update elements in place
2044 ** in case of deletion or update, these are simply written to
2045 ** segment 0 (with an empty token list in case of deletion), with
2046 ** docListAccumulate() taking care to retain lower-segment
2047 ** information in preference to higher-segment information.
2048 */
2049 /* TODO(shess) Provide a VACUUM type operation which both removes
2050 ** deleted elements which are no longer necessary, and duplicated
2051 ** elements. I suspect this will probably not be necessary in
2052 ** practice, though.
2053 */
2054static int fulltextCreate(sqlite3 *db, void *pAux,
2055 int argc, const char * const *argv,
2056 sqlite3_vtab **ppVTab, char **pzErr){
2057 int rc;
2058 TableSpec spec;
2059 StringBuffer schema;
2060 TRACE(("FTS1 Create\n"));
2061
2062 rc = parseSpec(&spec, argc, argv, pzErr);
2063 if( rc!=SQLITE_OK ) return rc;
2064
2065 initStringBuffer(&schema);
2066 append(&schema, "CREATE TABLE %_content(");
2067 appendList(&schema, spec.nColumn, spec.azContentColumn);
2068 append(&schema, ")");
2069 rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
2070 free(schema.s);
2071 if( rc!=SQLITE_OK ) goto out;
2072
2073 rc = sql_exec(db, spec.zDb, spec.zName,
2074 "create table %_term(term text, segment integer, doclist blob, "
2075 "primary key(term, segment));");
2076 if( rc!=SQLITE_OK ) goto out;
2077
2078 rc = constructVtab(db, &spec, ppVTab, pzErr);
2079
2080out:
2081 clearTableSpec(&spec);
2082 return rc;
2083}
2084
2085/* Decide how to handle an SQL query. */
2086static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2087 int i;
2088 TRACE(("FTS1 BestIndex\n"));
2089
2090 for(i=0; i<pInfo->nConstraint; ++i){
2091 const struct sqlite3_index_constraint *pConstraint;
2092 pConstraint = &pInfo->aConstraint[i];
2093 if( pConstraint->usable ) {
2094 if( pConstraint->iColumn==-1 &&
2095 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2096 pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
2097 TRACE(("FTS1 QUERY_ROWID\n"));
2098 } else if( pConstraint->iColumn>=0 &&
2099 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2100 /* full-text search */
2101 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2102 TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2103 } else continue;
2104
2105 pInfo->aConstraintUsage[i].argvIndex = 1;
2106 pInfo->aConstraintUsage[i].omit = 1;
2107
2108 /* An arbitrary value for now.
2109 * TODO: Perhaps rowid matches should be considered cheaper than
2110 * full-text searches. */
2111 pInfo->estimatedCost = 1.0;
2112
2113 return SQLITE_OK;
2114 }
2115 }
2116 pInfo->idxNum = QUERY_GENERIC;
2117 return SQLITE_OK;
2118}
2119
2120static int fulltextDisconnect(sqlite3_vtab *pVTab){
2121 TRACE(("FTS1 Disconnect %p\n", pVTab));
2122 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2123 return SQLITE_OK;
2124}
2125
2126static int fulltextDestroy(sqlite3_vtab *pVTab){
2127 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2128 int rc;
2129
2130 TRACE(("FTS1 Destroy %p\n", pVTab));
2131 rc = sql_exec(v->db, v->zDb, v->zName,
2132 "drop table if exists %_content;"
2133 "drop table if exists %_term;"
2134 );
2135 if( rc!=SQLITE_OK ) return rc;
2136
2137 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2138 return SQLITE_OK;
2139}
2140
2141static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2142 fulltext_cursor *c;
2143
2144 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
2145 /* sqlite will initialize c->base */
2146 *ppCursor = &c->base;
2147 TRACE(("FTS1 Open %p: %p\n", pVTab, c));
2148
2149 return SQLITE_OK;
2150}
2151
2152
2153/* Free all of the dynamically allocated memory held by *q
2154*/
2155static void queryClear(Query *q){
2156 int i;
2157 for(i = 0; i < q->nTerms; ++i){
2158 free(q->pTerms[i].pTerm);
2159 }
2160 free(q->pTerms);
2161 memset(q, 0, sizeof(*q));
2162}
2163
2164/* Free all of the dynamically allocated memory held by the
2165** Snippet
2166*/
2167static void snippetClear(Snippet *p){
2168 free(p->aMatch);
2169 free(p->zOffset);
2170 free(p->zSnippet);
2171 memset(p, 0, sizeof(*p));
2172}
2173/*
2174** Append a single entry to the p->aMatch[] log.
2175*/
2176static void snippetAppendMatch(
2177 Snippet *p, /* Append the entry to this snippet */
2178 int iCol, int iTerm, /* The column and query term */
2179 int iStart, int nByte /* Offset and size of the match */
2180){
2181 int i;
2182 struct snippetMatch *pMatch;
2183 if( p->nMatch+1>=p->nAlloc ){
2184 p->nAlloc = p->nAlloc*2 + 10;
2185 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2186 if( p->aMatch==0 ){
2187 p->nMatch = 0;
2188 p->nAlloc = 0;
2189 return;
2190 }
2191 }
2192 i = p->nMatch++;
2193 pMatch = &p->aMatch[i];
2194 pMatch->iCol = iCol;
2195 pMatch->iTerm = iTerm;
2196 pMatch->iStart = iStart;
2197 pMatch->nByte = nByte;
2198}
2199
/*
** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
*/
#define FTS1_ROTOR_SZ   (32)
#define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)

/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
**
** The document is re-tokenized with the table's own tokenizer.  A
** circular buffer of the last FTS1_ROTOR_SZ token offsets is kept so
** that when the final term of a multi-token phrase matches, the
** offsets of the earlier phrase tokens can still be recovered.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  const QueryTerm *aTerm;              /* Query string terms */
  int nTerm;                           /* Number of query string terms */
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS1_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS1_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;   /* tokenizer failure: silently produce no matches */
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  /* Each term occupies one bit in the match bitmasks, so only the first
  ** FTS1_ROTOR_SZ-1 terms can be tracked. */
  if( nTerm>=FTS1_ROTOR_SZ ){
    nTerm = FTS1_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;
    /* Record this token's offset and length in the circular buffer. */
    iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      /* Skip terms restricted to some other column. */
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm!=nToken ) continue;
      if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
      /* A non-leading phrase term only matches if the preceding phrase
      ** term matched at the previous token position. */
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      /* On the last term of a phrase (or a standalone term), log the
      ** offsets of every token in the phrase, reading the earlier
      ** ones back out of the rotor. */
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS1_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    /* Shift so that term i's match bit lines up with term i+1's test
    ** on the next token. */
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
2277
2278
2279/*
2280** Compute all offsets for the current row of the query.
2281** If the offsets have already been computed, this routine is a no-op.
2282*/
2283static void snippetAllOffsets(fulltext_cursor *p){
2284 int nColumn;
2285 int iColumn, i;
2286 int iFirst, iLast;
2287 fulltext_vtab *pFts;
2288
2289 if( p->snippet.nMatch ) return;
2290 if( p->q.nTerms==0 ) return;
2291 pFts = p->q.pFts;
2292 nColumn = pFts->nColumn;
2293 iColumn = p->iCursorType - QUERY_FULLTEXT;
2294 if( iColumn<0 || iColumn>=nColumn ){
2295 iFirst = 0;
2296 iLast = nColumn-1;
2297 }else{
2298 iFirst = iColumn;
2299 iLast = iColumn;
2300 }
2301 for(i=iFirst; i<=iLast; i++){
2302 const char *zDoc;
2303 int nDoc;
2304 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
2305 nDoc = sqlite3_column_bytes(p->pStmt, i+1);
2306 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
2307 }
2308}
2309
2310/*
2311** Convert the information in the aMatch[] array of the snippet
2312** into the string zOffset[0..nOffset-1].
2313*/
2314static void snippetOffsetText(Snippet *p){
2315 int i;
2316 int cnt = 0;
2317 StringBuffer sb;
2318 char zBuf[200];
2319 if( p->zOffset ) return;
2320 initStringBuffer(&sb);
2321 for(i=0; i<p->nMatch; i++){
2322 struct snippetMatch *pMatch = &p->aMatch[i];
2323 zBuf[0] = ' ';
2324 sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
2325 pMatch->iTerm, pMatch->iStart, pMatch->nByte);
2326 append(&sb, zBuf);
2327 cnt++;
2328 }
2329 p->zOffset = sb.s;
2330 p->nOffset = sb.len;
2331}
2332
/*
** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  /* Within 10 bytes of either end of the document: snap to that end. */
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  /* Skip to the first match in this column, then to the first match
  ** ending at or after iBreak.  NOTE(review): assumes aMatch[] is
  ** ordered by column then offset — confirm against the caller. */
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    /* Prefer to break exactly at the start of a nearby match so the
    ** excerpt never cuts a matched word in half. */
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  /* Otherwise look up to 10 bytes to either side for white space and
  ** break just after it. */
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  /* No better spot found; accept the suggestion as-is. */
  return iBreak;
}
2377
2378/*
2379** If the StringBuffer does not end in white space, add a single
2380** space character to the end.
2381*/
2382static void appendWhiteSpace(StringBuffer *p){
2383 if( p->len==0 ) return;
2384 if( safe_isspace(p->s[p->len-1]) ) return;
2385 append(p, " ");
2386}
2387
2388/*
2389** Remove white space from teh end of the StringBuffer
2390*/
2391static void trimWhiteSpace(StringBuffer *p){
2392 while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
2393 p->len--;
2394 }
2395}
2396
2397
2398
/*
** Allowed values for Snippet.aMatch[].snStatus
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */

/*
** Generate the text of a snippet for the current cursor row.
**
** The result is left in pCursor->snippet.zSnippet/nSnippet.  One match
** per distinct query term is marked DESIRED; for each desired match an
** excerpt of roughly 80 bytes around it is copied from the document,
** with every query-term hit inside the excerpt wrapped in
** zStartMark/zEndMark and discontinuities joined with zEllipsis.
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;   /* All matches for the current row */
  int nMatch;                    /* Number of entries in aMatch[] */
  int nDesired;                  /* DESIRED matches not yet emitted */
  StringBuffer sb;               /* Snippet text accumulates here */
  int tailCol;                   /* Column of the previous excerpt */
  int tailOffset;                /* Byte just past the previous excerpt */
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;              /* Bounds of the current excerpt */
  int tailEllipsis = 0;          /* True if a trailing ellipsis is needed */
  int iMatch;


  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  /* Mark the first match of each query term DESIRED so every term is
  ** represented at least once in the snippet. */
  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    /* Column iCol of the vtab is statement column iCol+1 (0 is rowid). */
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    /* Start the excerpt about 40 bytes before the match, adjusted to a
    ** word boundary. */
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    /* Merge with the previous excerpt if they nearly touch. */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    /* Any gap between excerpts is marked with an ellipsis. */
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    /* End the excerpt about 40 bytes past the match, also adjusted. */
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    /* Copy zDoc[iStart..iEnd-1] into the snippet, wrapping every match
    ** that falls inside the excerpt with zStartMark/zEndMark. */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
          && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* Later DESIRED matches for the same term are now redundant. */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  pCursor->snippet.zSnippet = sb.s;
  pCursor->snippet.nSnippet = sb.len;
}
2517
2518
2519/*
2520** Close the cursor. For additional information see the documentation
2521** on the xClose method of the virtual table interface.
2522*/
2523static int fulltextClose(sqlite3_vtab_cursor *pCursor){
2524 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2525 TRACE(("FTS1 Close %p\n", c));
2526 sqlite3_finalize(c->pStmt);
2527 queryClear(&c->q);
2528 snippetClear(&c->snippet);
2529 if( c->result.pDoclist!=NULL ){
2530 docListDelete(c->result.pDoclist);
2531 }
2532 free(c);
2533 return SQLITE_OK;
2534}
2535
/* Advance the cursor to the next row of results, setting c->eof when
** the result set is exhausted.
**
** For QUERY_GENERIC and QUERY_ROWID cursors the prepared statement is
** simply stepped.  For full-text cursors the next docid comes from the
** in-memory doclist and the matching %_content row is fetched by
** binding that docid into the statement.
*/
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  sqlite_int64 iDocid;
  int rc;

  TRACE(("FTS1 Next %p\n", pCursor));
  /* Cached offsets/snippet belong to the previous row; invalidate. */
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    /* A docid of 0 marks the end of the doclist. */
    iDocid = nextDocid(&c->result);
    if( iDocid==0 ){
      c->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){  /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* An error occurred; abort.  SQLITE_DONE here means the index holds
    ** a docid with no %_content row — treat as corruption. */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}
2578
2579
2580/* Return a DocList corresponding to the query term *pTerm. If *pTerm
2581** is the first term of a phrase query, go ahead and evaluate the phrase
2582** query and return the doclist for the entire phrase query.
2583**
2584** The result is stored in pTerm->doclist.
2585*/
2586static int docListOfTerm(
2587 fulltext_vtab *v, /* The full text index */
2588 int iColumn, /* column to restrict to. No restrition if >=nColumn */
2589 QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
2590 DocList **ppResult /* Write the result here */
2591){
2592 DocList *pLeft, *pRight, *pNew;
2593 int i, rc;
2594
2595 pLeft = docListNew(DL_POSITIONS);
2596 rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
2597 if( rc ){
2598 docListDelete(pLeft);
2599 return rc;
2600 }
2601 for(i=1; i<=pQTerm->nPhrase; i++){
2602 pRight = docListNew(DL_POSITIONS);
2603 rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
2604 if( rc ){
2605 docListDelete(pLeft);
2606 return rc;
2607 }
2608 pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
2609 docListPhraseMerge(pLeft, pRight, pNew);
2610 docListDelete(pLeft);
2611 docListDelete(pRight);
2612 pLeft = pNew;
2613 }
2614 *ppResult = pLeft;
2615 return SQLITE_OK;
2616}
2617
2618/* Add a new term pTerm[0..nTerm-1] to the query *q.
2619*/
2620static void queryAdd(Query *q, const char *pTerm, int nTerm){
2621 QueryTerm *t;
2622 ++q->nTerms;
2623 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
2624 if( q->pTerms==0 ){
2625 q->nTerms = 0;
2626 return;
2627 }
2628 t = &q->pTerms[q->nTerms - 1];
2629 memset(t, 0, sizeof(*t));
2630 t->pTerm = malloc(nTerm+1);
2631 memcpy(t->pTerm, pTerm, nTerm);
2632 t->pTerm[nTerm] = 0;
2633 t->nTerm = nTerm;
2634 t->isOr = q->nextIsOr;
2635 q->nextIsOr = 0;
2636 t->iColumn = q->nextColumn;
2637 q->nextColumn = q->dfltColumn;
2638}
2639
2640/*
2641** Check to see if the string zToken[0...nToken-1] matches any
2642** column name in the virtual table. If it does,
2643** return the zero-indexed column number. If not, return -1.
2644*/
2645static int checkColumnSpecifier(
2646 fulltext_vtab *pVtab, /* The virtual table */
2647 const char *zToken, /* Text of the token */
2648 int nToken /* Number of characters in the token */
2649){
2650 int i;
2651 for(i=0; i<pVtab->nColumn; i++){
2652 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
2653 && pVtab->azColumn[i][nToken]==0 ){
2654 return i;
2655 }
2656 }
2657 return -1;
2658}
2659
/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assembled in pQuery.
**
** inPhrase is true if pSegment[0..nSegment-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;   /* Index of this segment's first term */
  int iCol;
  int nTerm = 1;                     /* 1-based position within the phrase */
  
  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    /* "column:" prefix redirects the next term to a specific column.
    ** NOTE(review): this reads pSegment[iEnd], one byte past the token;
    ** assumes the segment is a slice of a larger NUL-terminated
    ** string — confirm against parseQuery(). */
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    /* A bare uppercase "OR" (outside a phrase, not the first term)
    ** flags the following term as OR-connected. */
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    /* A "-" immediately before the term negates it. */
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  /* Record on the phrase's first term how many further terms follow. */
  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
2721
/* Parse a query string, yielding a Query object pQuery.
**
** The input is split at double-quote characters; segments inside
** quotes are tokenized as phrases, segments outside as plain terms.
**
** The calling function will need to queryClear() to clean up
** the dynamically allocated memory held by pQuery.
*/
static int parseQuery(
  fulltext_vtab *v,        /* The fulltext index */
  const char *zInput,      /* Input text of the query string */
  int nInput,              /* Size of the input text */
  int dfltColumn,          /* Default column of the index to match against */
  Query *pQuery            /* Write the parse results here. */
){
  int iInput, inPhrase = 0;

  if( zInput==0 ) nInput = 0;
  /* Negative length means zInput is NUL-terminated. */
  if( nInput<0 ) nInput = strlen(zInput);
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->nextColumn = dfltColumn;
  pQuery->dfltColumn = dfltColumn;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    /* Advance i to the next double-quote or the end of the input. */
    for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
    if( i>iInput ){
      /* NOTE(review): the return code of tokenizeSegment() is ignored
      ** here — parse failures in a segment are silently dropped. */
      tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                       pQuery);
    }
    iInput = i;
    if( i<nInput ){
      assert( zInput[i]=='"' );
      inPhrase = !inPhrase;   /* each quote toggles phrase mode */
    }
  }

  if( inPhrase ){
    /* unmatched quote */
    queryClear(pQuery);
    return SQLITE_ERROR;
  }
  return SQLITE_OK;
}
2766
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
**
** Evaluation order: each non-NOT term (or phrase) yields a doclist;
** runs of OR-connected terms are merged by union, the resulting groups
** are intersected (implicit AND), and finally NOT terms are subtracted
** from the accumulated result.
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DocList **pResult,     /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DocList *pLeft = NULL;        /* Accumulated AND result */
  DocList *pRight, *pNew, *pOr;
  int nNot = 0;                 /* Count of NOT terms seen */
  QueryTerm *aTerm;

  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Merge AND terms. */
  aTerm = pQuery->pTerms;
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    /* Step over the whole phrase (first term plus nPhrase followers). */
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
    if( rc ){
      queryClear(pQuery);
      return rc;
    }
    /* Union in any directly following OR-connected terms. */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        queryClear(pQuery);
        return rc;
      }
      pNew = docListNew(DL_DOCIDS);
      docListOrMerge(pRight, pOr, pNew);
      docListDelete(pRight);
      docListDelete(pOr);
      pRight = pNew;
    }
    /* Intersect the group with the running result. */
    if( pLeft==0 ){
      pLeft = pRight;
    }else{
      pNew = docListNew(DL_DOCIDS);
      docListAndMerge(pLeft, pRight, pNew);
      docListDelete(pRight);
      docListDelete(pLeft);
      pLeft = pNew;
    }
  }

  if( nNot && pLeft==0 ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
    if( rc ){
      queryClear(pQuery);
      docListDelete(pLeft);
      return rc;
    }
    pNew = docListNew(DL_DOCIDS);
    docListExceptMerge(pLeft, pRight, pNew);
    docListDelete(pRight);
    docListDelete(pLeft);
    pLeft = pNew;
  }

  /* NOTE(review): if the query parsed to zero terms, pLeft is still
  ** NULL here — confirm that callers tolerate a NULL *pResult. */
  *pResult = pLeft;
  return rc;
}
2854
/*
** This is the xFilter interface for the virtual table.  See
** the virtual table xFilter method documentation for additional
** information.
**
** If idxNum==QUERY_GENERIC then do a full table scan against
** the %_content table.
**
** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
** in the %_content table.
**
** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
** column on the left-hand side of the MATCH operator is column
** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
** side of the MATCH operator.
*/
/* TODO(shess) Upgrade the cursor initialization and destruction to
** account for fulltextFilter() being called multiple times on the
** same cursor.  The current solution is very fragile.  Apply fix to
** fts2 as appropriate.
*/
static int fulltextFilter(
  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);
  int rc;
  char *zSql;

  TRACE(("FTS1 Filter %p\n",pCursor));

  /* The "%%" survives formatting as a literal "%_content", which
  ** sql_prepare() later expands with the real table name. */
  zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
                          idxNum==QUERY_GENERIC ? "" : "where rowid=?");
  /* Discard any statement left over from a previous xFilter call. */
  sqlite3_finalize(c->pStmt);
  rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
  sqlite3_free(zSql);
  if( rc!=SQLITE_OK ) return rc;

  c->iCursorType = idxNum;
  switch( idxNum ){
    case QUERY_GENERIC:
      break;

    case QUERY_ROWID:
      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
      if( rc!=SQLITE_OK ) return rc;
      break;

    default:   /* full-text search */
    {
      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
      DocList *pResult;
      assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
      assert( argc==1 );
      queryClear(&c->q);
      /* Evaluate the entire query up front; fulltextNext() then walks
      ** the resulting doclist one docid at a time. */
      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
      if( rc!=SQLITE_OK ) return rc;
      if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
      readerInit(&c->result, pResult);
      break;
    }
  }

  /* Position the cursor on the first row of results. */
  return fulltextNext(pCursor);
}
2922
2923/* This is the xEof method of the virtual table. The SQLite core
2924** calls this routine to find out if it has reached the end of
2925** a query's results set.
2926*/
2927static int fulltextEof(sqlite3_vtab_cursor *pCursor){
2928 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2929 return c->eof;
2930}
2931
2932/* This is the xColumn method of the virtual table. The SQLite
2933** core calls this method during a query when it needs the value
2934** of a column from the virtual table. This method needs to use
2935** one of the sqlite3_result_*() routines to store the requested
2936** value back in the pContext.
2937*/
static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
                          sqlite3_context *pContext, int idxCol){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);

  if( idxCol<v->nColumn ){
    /* Ordinary content column: vtab column idxCol is column idxCol+1
    ** of the prepared %_content statement (column 0 is the rowid). */
    sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
    sqlite3_result_value(pContext, pVal);
  }else if( idxCol==v->nColumn ){
    /* The extra column whose name is the same as the table.
    ** Return a blob which is a pointer to the cursor; snippet() and
    ** offsets() decode this blob to recover the cursor.
    */
    sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
  }
  return SQLITE_OK;
}
2954
2955/* This is the xRowid method. The SQLite core calls this routine to
 2956** retrieve the rowid for the current row of the result set. The
2957** rowid should be written to *pRowid.
2958*/
2959static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
2960 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2961
2962 *pRowid = sqlite3_column_int64(c->pStmt, 0);
2963 return SQLITE_OK;
2964}
2965
2966/* Add all terms in [zText] to the given hash table. If [iColumn] > 0,
2967 * we also store positions and offsets in the hash table using the given
2968 * column number. */
static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DocList *p;

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    /* One doclist per distinct token, keyed by the token bytes; the
    ** doclist opens with this docid on first sight of the token. */
    p = fts1HashFind(terms, pToken, nTokenBytes);
    if( p==NULL ){
      p = docListNew(DL_DEFAULT);
      docListAddDocid(p, iDocid);
      fts1HashInsert(terms, pToken, nTokenBytes, p);
    }
    /* iColumn<0 is the "delete" pass: tokens are collected but no
    ** position/offset data is recorded (see header comment). */
    if( iColumn>=0 ){
      docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  return rc;
}
3013
3014/* Update the %_terms table to map the term [pTerm] to the given rowid. */
static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
                             DocList *d){
  sqlite_int64 iIndexRow;
  DocList doclist;
  int iSegment = 0, rc;

  rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
  if( rc==SQLITE_DONE ){
    /* Term not yet present in segment 0: start a fresh doclist and
    ** insert a brand-new row. */
    docListInit(&doclist, DL_DEFAULT, 0, 0);
    docListUpdate(&doclist, d);
    /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
    rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
    goto err;
  }
  if( rc!=SQLITE_ROW ) return SQLITE_ERROR;

  /* Merge the new data into the existing segment-0 doclist; if the
  ** result still fits in one chunk, update the row in place. */
  docListUpdate(&doclist, d);
  if( doclist.nData<=CHUNK_MAX ){
    rc = term_update(v, iIndexRow, &doclist);
    goto err;
  }

  /* Doclist doesn't fit, delete what's there, and accumulate
  ** forward.
  */
  rc = term_delete(v, iIndexRow);
  if( rc!=SQLITE_OK ) goto err;

  /* Try to insert the doclist into a higher segment bucket. On
  ** failure, accumulate existing doclist with the doclist from that
  ** bucket, and put results in the next bucket.
  */
  iSegment++;
  while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
                         &doclist))!=SQLITE_OK ){
    sqlite_int64 iSegmentRow;
    DocList old;
    int rc2;

    /* Retain old error in case the term_insert() error was really an
    ** error rather than a bounced insert.
    */
    rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
    if( rc2!=SQLITE_ROW ) goto err;

    rc = term_delete(v, iSegmentRow);
    if( rc!=SQLITE_OK ) goto err;

    /* Reusing lowest-number deleted row keeps the index smaller. */
    if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;

    /* doclist contains the newer data, so accumulate it over old.
    ** Then steal accumulated data for doclist.
    */
    docListAccumulate(&old, &doclist);
    docListDestroy(&doclist);
    doclist = old;

    iSegment++;
  }

 err:
  docListDestroy(&doclist);
  return rc;
}
3080
3081/* Add doclists for all terms in [pValues] to the hash table [terms]. */
3082static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid,
3083 sqlite3_value **pValues){
3084 int i;
3085 for(i = 0; i < v->nColumn ; ++i){
3086 char *zText = (char*)sqlite3_value_text(pValues[i]);
3087 int rc = buildTerms(v, terms, iRowid, zText, i);
3088 if( rc!=SQLITE_OK ) return rc;
3089 }
3090 return SQLITE_OK;
3091}
3092
3093/* Add empty doclists for all terms in the given row's content to the hash
3094 * table [pTerms]. */
3095static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
3096 const char **pValues;
3097 int i;
3098
3099 int rc = content_select(v, iRowid, &pValues);
3100 if( rc!=SQLITE_OK ) return rc;
3101
3102 for(i = 0 ; i < v->nColumn; ++i) {
3103 rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
3104 if( rc!=SQLITE_OK ) break;
3105 }
3106
3107 freeStringArray(v->nColumn, pValues);
3108 return SQLITE_OK;
3109}
3110
3111/* Insert a row into the %_content table; set *piRowid to be the ID of the
3112 * new row. Fill [pTerms] with new doclists for the %_term table. */
static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
                        sqlite3_value **pValues,
                        sqlite_int64 *piRowid, fts1Hash *pTerms){
  int rc;

  rc = content_insert(v, pRequestRowid, pValues);  /* execute an SQL INSERT */
  if( rc!=SQLITE_OK ) return rc;
  /* Report the rowid the INSERT actually used, then index the new
  ** row's terms under that rowid. */
  *piRowid = sqlite3_last_insert_rowid(v->db);
  return insertTerms(v, pTerms, *piRowid, pValues);
}
3123
3124/* Delete a row from the %_content table; fill [pTerms] with empty doclists
3125 * to be written to the %_term table. */
3126static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
3127 int rc = deleteTerms(v, pTerms, iRow);
3128 if( rc!=SQLITE_OK ) return rc;
3129 return content_delete(v, iRow); /* execute an SQL DELETE */
3130}
3131
3132/* Update a row in the %_content table; fill [pTerms] with new doclists for the
3133 * %_term table. */
static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
                        sqlite3_value **pValues, fts1Hash *pTerms){
  /* Generate an empty doclist for each term that previously appeared in this
   * row.  Terms absent from the new content keep their empty doclist,
   * which removes this rowid from them in the %_term table. */
  int rc = deleteTerms(v, pTerms, iRow);
  if( rc!=SQLITE_OK ) return rc;

  rc = content_update(v, pValues, iRow);  /* execute an SQL UPDATE */
  if( rc!=SQLITE_OK ) return rc;

  /* Now add positions for terms which appear in the updated row. */
  return insertTerms(v, pTerms, iRow, pValues);
}
3147
3148/* This function implements the xUpdate callback; it's the top-level entry
3149 * point for inserting, deleting or updating a row in a full-text table. */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                   sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  fts1Hash terms;   /* maps term string -> PosList */
  int rc;
  fts1HashElem *e;

  TRACE(("FTS1 Update %p\n", pVtab));

  fts1HashInit(&terms, FTS1_HASH_STRING, 1);

  if( nArg<2 ){
    /* A delete: ppArg[0] is the rowid to remove. */
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
        sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    } else {
      assert( nArg==2+v->nColumn+1);
      rc = index_update(v, rowid, &ppArg[2], &terms);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    assert( nArg==2+v->nColumn+1);
    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
  }

  if( rc==SQLITE_OK ){
    /* Write updated doclists to disk. */
    for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
      DocList *p = fts1HashData(e);
      rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
      if( rc!=SQLITE_OK ) break;
    }
  }

  /* clean up: the doclists in the hash were heap-allocated by
  ** buildTerms(), and are freed even if an earlier step failed. */
  for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
    DocList *p = fts1HashData(e);
    docListDelete(p);
  }
  fts1HashClear(&terms);

  return rc;
}
3206
3207/*
3208** Implementation of the snippet() function for FTS1
3209*/
3210static void snippetFunc(
3211 sqlite3_context *pContext,
3212 int argc,
3213 sqlite3_value **argv
3214){
3215 fulltext_cursor *pCursor;
3216 if( argc<1 ) return;
3217 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
3218 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
3219 sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
3220 }else{
3221 const char *zStart = "<b>";
3222 const char *zEnd = "</b>";
3223 const char *zEllipsis = "<b>...</b>";
3224 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
3225 if( argc>=2 ){
3226 zStart = (const char*)sqlite3_value_text(argv[1]);
3227 if( argc>=3 ){
3228 zEnd = (const char*)sqlite3_value_text(argv[2]);
3229 if( argc>=4 ){
3230 zEllipsis = (const char*)sqlite3_value_text(argv[3]);
3231 }
3232 }
3233 }
3234 snippetAllOffsets(pCursor);
3235 snippetText(pCursor, zStart, zEnd, zEllipsis);
3236 sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
3237 pCursor->snippet.nSnippet, SQLITE_STATIC);
3238 }
3239}
3240
3241/*
3242** Implementation of the offsets() function for FTS1
3243*/
static void snippetOffsetsFunc(
  sqlite3_context *pContext,
  int argc,
  sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  /* The first argument must be the cursor-pointer blob produced by
  ** fulltextColumn() for the table's magic column. */
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
  }else{
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    snippetAllOffsets(pCursor);
    snippetOffsetText(&pCursor->snippet);
    /* zOffset is owned by the cursor's snippet, not a local buffer,
    ** hence SQLITE_STATIC rather than SQLITE_TRANSIENT. */
    sqlite3_result_text(pContext,
                        pCursor->snippet.zOffset, pCursor->snippet.nOffset,
                        SQLITE_STATIC);
  }
}
3263
3264/*
3265** This routine implements the xFindFunction method for the FTS1
3266** virtual table.
3267*/
3268static int fulltextFindFunction(
3269 sqlite3_vtab *pVtab,
3270 int nArg,
3271 const char *zName,
3272 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
3273 void **ppArg
3274){
3275 if( strcmp(zName,"snippet")==0 ){
3276 *pxFunc = snippetFunc;
3277 return 1;
3278 }else if( strcmp(zName,"offsets")==0 ){
3279 *pxFunc = snippetOffsetsFunc;
3280 return 1;
3281 }
3282 return 0;
3283}
3284
3285/*
3286** Rename an fts1 table.
3287*/
static int fulltextRename(
  sqlite3_vtab *pVtab,
  const char *zName
){
  fulltext_vtab *p = (fulltext_vtab *)pVtab;
  int rc = SQLITE_NOMEM;   /* the result if the mprintf() below fails */
  /* Rename both backing tables (%_content and %_term) to match the
  ** virtual table's new name. */
  char *zSql = sqlite3_mprintf(
    "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
    "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
    , p->zDb, p->zName, zName
    , p->zDb, p->zName, zName
  );
  if( zSql ){
    rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
    sqlite3_free(zSql);
  }
  return rc;
}
3306
/*
** The method table ("module") through which SQLite invokes this
** virtual table implementation.  The transaction hooks (xBegin,
** xSync, xCommit, xRollback) are unused; snippet() and offsets()
** are resolved at query time via xFindFunction.
*/
static const sqlite3_module fulltextModule = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ 0,
  /* xSync         */ 0,
  /* xCommit       */ 0,
  /* xRollback     */ 0,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename       */ fulltextRename,
};
3329
/*
** Register the fts1 module with database connection db.  snippet()
** and offsets() are registered as overloadable functions so that
** xFindFunction can supply the real implementations when they are
** applied to an FTS1 table.
*/
int sqlite3Fts1Init(sqlite3 *db){
  sqlite3_overload_function(db, "snippet", -1);
  sqlite3_overload_function(db, "offsets", -1);
  return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
}
3335
#if !SQLITE_CORE
/* Loadable-extension entry point, used when FTS1 is built as a shared
** library rather than compiled into the SQLite core. */
int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
                           const sqlite3_api_routines *pApi){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts1Init(db);
}
#endif
3343
3344#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h
deleted file mode 100644
index d55e689..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h
+++ /dev/null
@@ -1,11 +0,0 @@
1#include "sqlite3.h"
2
3#ifdef __cplusplus
4extern "C" {
5#endif /* __cplusplus */
6
7int sqlite3Fts1Init(sqlite3 *db);
8
9#ifdef __cplusplus
10} /* extern "C" */
11#endif /* __cplusplus */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c
deleted file mode 100644
index 463a52b..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c
+++ /dev/null
@@ -1,369 +0,0 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the implementation of generic hash-tables used in SQLite.
13** We've modified it slightly to serve as a standalone hash table
14** implementation for the full-text indexing module.
15*/
16#include <assert.h>
17#include <stdlib.h>
18#include <string.h>
19
20/*
21** The code in this file is only compiled if:
22**
23** * The FTS1 module is being built as an extension
24** (in which case SQLITE_CORE is not defined), or
25**
26** * The FTS1 module is being built into the core of
27** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
28*/
29#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
30
31
32#include "fts1_hash.h"
33
/* Allocate an n-byte block initialized to zero, returning NULL on
** allocation failure.  Uses calloc(), which zeroes for us (and checks
** the size multiplication for overflow), instead of malloc+memset. */
static void *malloc_and_zero(int n){
  return calloc(1, n);
}
41
42/* Turn bulk memory into a hash table object by initializing the
43** fields of the Hash structure.
44**
45** "pNew" is a pointer to the hash table that is to be initialized.
46** keyClass is one of the constants
47** FTS1_HASH_BINARY or FTS1_HASH_STRING. The value of keyClass
48** determines what kind of key the hash table will use. "copyKey" is
49** true if the hash table should make its own private copy of keys and
50** false if it should just use the supplied pointer.
51*/
52void sqlite3Fts1HashInit(fts1Hash *pNew, int keyClass, int copyKey){
53 assert( pNew!=0 );
54 assert( keyClass>=FTS1_HASH_STRING && keyClass<=FTS1_HASH_BINARY );
55 pNew->keyClass = keyClass;
56 pNew->copyKey = copyKey;
57 pNew->first = 0;
58 pNew->count = 0;
59 pNew->htsize = 0;
60 pNew->ht = 0;
61 pNew->xMalloc = malloc_and_zero;
62 pNew->xFree = free;
63}
64
65/* Remove all entries from a hash table. Reclaim all memory.
66** Call this routine to delete a hash table or to reset a hash table
67** to the empty state.
68*/
void sqlite3Fts1HashClear(fts1Hash *pH){
  fts1HashElem *elem;   /* For looping over all elements of the table */

  assert( pH!=0 );
  /* Detach the element list, free the bucket array, then walk the
  ** detached list freeing each element (and its key copy when this
  ** table owns the keys).  The table is left valid and empty. */
  elem = pH->first;
  pH->first = 0;
  if( pH->ht ) pH->xFree(pH->ht);
  pH->ht = 0;
  pH->htsize = 0;
  while( elem ){
    fts1HashElem *next_elem = elem->next;
    if( pH->copyKey && elem->pKey ){
      pH->xFree(elem->pKey);
    }
    pH->xFree(elem);
    elem = next_elem;
  }
  pH->count = 0;
}
88
89/*
90** Hash and comparison functions when the mode is FTS1_HASH_STRING
91*/
/* Hash function for FTS1_HASH_STRING keys.  A nKey of zero or less
** means "use strlen(pKey)".  The result is always non-negative. */
static int strHash(const void *pKey, int nKey){
  const char *z = (const char *)pKey;
  int h = 0;
  int i;
  if( nKey<=0 ) nKey = (int)strlen(z);
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ z[i];
  }
  return h & 0x7fffffff;
}
/* Comparison function for FTS1_HASH_STRING keys: keys are equal only
** when both length and bytes match.  Returns 0 on equality. */
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1==n2) ? strncmp((const char*)pKey1, (const char*)pKey2, n1) : 1;
}
106
107/*
108** Hash and comparison functions when the mode is FTS1_HASH_BINARY
109*/
/* Hash function for FTS1_HASH_BINARY keys: the same mixing step as
** strHash(), but the length is always taken from nKey (no strlen). */
static int binHash(const void *pKey, int nKey){
  const char *z = (const char *)pKey;
  int h = 0;
  int i;
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ z[i];
  }
  return h & 0x7fffffff;
}
/* Comparison function for FTS1_HASH_BINARY keys: memcmp() when the
** lengths match, otherwise "not equal".  Returns 0 on equality. */
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1==n2) ? memcmp(pKey1, pKey2, n1) : 1;
}
122
123/*
124** Return a pointer to the appropriate hash function given the key class.
125**
 126** The C syntax in this function definition may be unfamiliar to some
127** programmers, so we provide the following additional explanation:
128**
129** The name of the function is "hashFunction". The function takes a
130** single parameter "keyClass". The return value of hashFunction()
131** is a pointer to another function. Specifically, the return value
132** of hashFunction() is a pointer to a function that takes two parameters
133** with types "const void*" and "int" and returns an "int".
134*/
135static int (*hashFunction(int keyClass))(const void*,int){
136 if( keyClass==FTS1_HASH_STRING ){
137 return &strHash;
138 }else{
139 assert( keyClass==FTS1_HASH_BINARY );
140 return &binHash;
141 }
142}
143
144/*
145** Return a pointer to the appropriate hash function given the key class.
146**
 147** For help in interpreting the obscure C code in the function definition,
148** see the header comment on the previous function.
149*/
150static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
151 if( keyClass==FTS1_HASH_STRING ){
152 return &strCompare;
153 }else{
154 assert( keyClass==FTS1_HASH_BINARY );
155 return &binCompare;
156 }
157}
158
159/* Link an element into the hash table
160*/
static void insertElement(
  fts1Hash *pH,            /* The complete hash table */
  struct _fts1ht *pEntry,  /* The entry into which pNew is inserted */
  fts1HashElem *pNew       /* The element to be inserted */
){
  fts1HashElem *pHead;     /* First element already in pEntry */
  pHead = pEntry->chain;
  if( pHead ){
    /* Splice pNew into the global doubly-linked list immediately
    ** before the bucket's current head, keeping each bucket's members
    ** contiguous within the global list. */
    pNew->next = pHead;
    pNew->prev = pHead->prev;
    if( pHead->prev ){ pHead->prev->next = pNew; }
    else             { pH->first = pNew; }
    pHead->prev = pNew;
  }else{
    /* Bucket was empty: push pNew onto the front of the global list. */
    pNew->next = pH->first;
    if( pH->first ){ pH->first->prev = pNew; }
    pNew->prev = 0;
    pH->first = pNew;
  }
  pEntry->count++;
  pEntry->chain = pNew;   /* pNew becomes the bucket's new head */
}
183
184
 185/* Resize the hash table so that it contains "new_size" buckets.
186** "new_size" must be a power of 2. The hash table might fail
187** to resize if sqliteMalloc() fails.
188*/
static void rehash(fts1Hash *pH, int new_size){
  struct _fts1ht *new_ht;          /* The new hash table */
  fts1HashElem *elem, *next_elem;  /* For looping over existing elements */
  int (*xHash)(const void*,int);   /* The hash function */

  assert( (new_size & (new_size-1))==0 );  /* new_size must be a power of 2 */
  new_ht = (struct _fts1ht *)pH->xMalloc( new_size*sizeof(struct _fts1ht) );
  if( new_ht==0 ) return;   /* on OOM, silently keep the old table */
  if( pH->ht ) pH->xFree(pH->ht);
  pH->ht = new_ht;
  pH->htsize = new_size;
  xHash = hashFunction(pH->keyClass);
  /* Re-bucket every element.  insertElement() rebuilds the global
  ** element list, so pH->first is emptied before the loop starts. */
  for(elem=pH->first, pH->first=0; elem; elem = next_elem){
    int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
    next_elem = elem->next;
    insertElement(pH, &new_ht[h], elem);
  }
}
207
208/* This function (for internal use only) locates an element in an
209** hash table that matches the given key. The hash for this key has
210** already been computed and is passed as the 4th parameter.
211*/
static fts1HashElem *findElementGivenHash(
  const fts1Hash *pH,  /* The pH to be searched */
  const void *pKey,    /* The key we are searching for */
  int nKey,            /* Number of bytes in the key */
  int h                /* The hash for this key (already reduced to a bucket) */
){
  fts1HashElem *elem;  /* Used to loop thru the element list */
  int count;           /* Number of elements left to test */
  int (*xCompare)(const void*,int,const void*,int);  /* comparison function */

  if( pH->ht ){
    struct _fts1ht *pEntry = &pH->ht[h];
    /* Bucket members sit contiguously in the global list starting at
    ** chain; count bounds the walk to this bucket's own elements. */
    elem = pEntry->chain;
    count = pEntry->count;
    xCompare = compareFunction(pH->keyClass);
    while( count-- && elem ){
      if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
        return elem;
      }
      elem = elem->next;
    }
  }
  return 0;   /* not found (or table has no buckets yet) */
}
236
237/* Remove a single entry from the hash table given a pointer to that
238** element and a hash on the element's key.
239*/
static void removeElementGivenHash(
  fts1Hash *pH,        /* The pH containing "elem" */
  fts1HashElem* elem,  /* The element to be removed from the pH */
  int h                /* Hash value for the element */
){
  struct _fts1ht *pEntry;
  /* Unlink elem from the global doubly-linked list. */
  if( elem->prev ){
    elem->prev->next = elem->next;
  }else{
    pH->first = elem->next;
  }
  if( elem->next ){
    elem->next->prev = elem->prev;
  }
  /* Fix up the bucket: advance its head if elem was the head. */
  pEntry = &pH->ht[h];
  if( pEntry->chain==elem ){
    pEntry->chain = elem->next;
  }
  pEntry->count--;
  if( pEntry->count<=0 ){
    pEntry->chain = 0;
  }
  if( pH->copyKey && elem->pKey ){
    pH->xFree(elem->pKey);
  }
  pH->xFree( elem );
  pH->count--;
  if( pH->count<=0 ){
    /* Last element removed: release the bucket array too. */
    assert( pH->first==0 );
    assert( pH->count==0 );
    fts1HashClear(pH);
  }
}
273
274/* Attempt to locate an element of the hash table pH with a key
275** that matches pKey,nKey. Return the data for this element if it is
276** found, or NULL if there is no match.
277*/
void *sqlite3Fts1HashFind(const fts1Hash *pH, const void *pKey, int nKey){
  int h;                          /* A hash on key */
  fts1HashElem *elem;             /* The element that matches key */
  int (*xHash)(const void*,int);  /* The hash function */

  if( pH==0 || pH->ht==0 ) return 0;   /* empty table: nothing to find */
  xHash = hashFunction(pH->keyClass);
  assert( xHash!=0 );
  h = (*xHash)(pKey,nKey);
  assert( (pH->htsize & (pH->htsize-1))==0 );  /* htsize is a power of 2 */
  elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
  return elem ? elem->data : 0;
}
291
292/* Insert an element into the hash table pH. The key is pKey,nKey
293** and the data is "data".
294**
295** If no element exists with a matching key, then a new
296** element is created. A copy of the key is made if the copyKey
297** flag is set. NULL is returned.
298**
299** If another element already exists with the same key, then the
300** new data replaces the old data and the old data is returned.
301** The key is not copied in this instance. If a malloc fails, then
302** the new data is returned and the hash table is unchanged.
303**
304** If the "data" parameter to this function is NULL, then the
305** element corresponding to "key" is removed from the hash table.
306*/
void *sqlite3Fts1HashInsert(
  fts1Hash *pH,      /* The hash table to insert into */
  const void *pKey,  /* The key */
  int nKey,          /* Number of bytes in the key */
  void *data         /* The data */
){
  int hraw;                /* Raw hash value of the key */
  int h;                   /* the hash of the key modulo hash table size */
  fts1HashElem *elem;      /* Used to loop thru the element list */
  fts1HashElem *new_elem;  /* New element added to the pH */
  int (*xHash)(const void*,int);  /* The hash function */

  assert( pH!=0 );
  xHash = hashFunction(pH->keyClass);
  assert( xHash!=0 );
  hraw = (*xHash)(pKey, nKey);
  assert( (pH->htsize & (pH->htsize-1))==0 );
  h = hraw & (pH->htsize-1);
  elem = findElementGivenHash(pH,pKey,nKey,h);
  if( elem ){
    /* Key already present: NULL data means delete, otherwise replace.
    ** Either way the previous data pointer goes back to the caller. */
    void *old_data = elem->data;
    if( data==0 ){
      removeElementGivenHash(pH,elem,h);
    }else{
      elem->data = data;
    }
    return old_data;
  }
  if( data==0 ) return 0;   /* deleting a key that isn't there: no-op */
  new_elem = (fts1HashElem*)pH->xMalloc( sizeof(fts1HashElem) );
  if( new_elem==0 ) return data;   /* OOM: hand data back, table unchanged */
  if( pH->copyKey && pKey!=0 ){
    new_elem->pKey = pH->xMalloc( nKey );
    if( new_elem->pKey==0 ){
      pH->xFree(new_elem);
      return data;
    }
    memcpy((void*)new_elem->pKey, pKey, nKey);
  }else{
    new_elem->pKey = (void*)pKey;
  }
  new_elem->nKey = nKey;
  pH->count++;
  /* First insert allocates the initial 8 buckets; thereafter the
  ** table doubles whenever the load factor exceeds 1. */
  if( pH->htsize==0 ){
    rehash(pH,8);
    if( pH->htsize==0 ){
      pH->count = 0;
      pH->xFree(new_elem);
      return data;
    }
  }
  if( pH->count > pH->htsize ){
    rehash(pH,pH->htsize*2);
  }
  assert( pH->htsize>0 );
  assert( (pH->htsize & (pH->htsize-1))==0 );
  h = hraw & (pH->htsize-1);   /* re-reduce: htsize may have just changed */
  insertElement(pH, &pH->ht[h], new_elem);
  new_elem->data = data;
  return 0;
}
368
369#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h
deleted file mode 100644
index c31c430..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h
+++ /dev/null
@@ -1,112 +0,0 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
 12** This is the header file for the generic hash-table implementation
13** used in SQLite. We've modified it slightly to serve as a standalone
14** hash table implementation for the full-text indexing module.
15**
16*/
17#ifndef _FTS1_HASH_H_
18#define _FTS1_HASH_H_
19
20/* Forward declarations of structures. */
21typedef struct fts1Hash fts1Hash;
22typedef struct fts1HashElem fts1HashElem;
23
24/* A complete hash table is an instance of the following structure.
25** The internals of this structure are intended to be opaque -- client
26** code should not attempt to access or modify the fields of this structure
27** directly. Change this structure only by using the routines below.
28** However, many of the "procedures" and "functions" for modifying and
29** accessing this structure are really macros, so we can't really make
30** this structure opaque.
31*/
32struct fts1Hash {
33 char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
34 char copyKey; /* True if copy of key made on insert */
35 int count; /* Number of entries in this table */
36 fts1HashElem *first; /* The first element of the array */
37 void *(*xMalloc)(int); /* malloc() function to use */
38 void (*xFree)(void *); /* free() function to use */
39 int htsize; /* Number of buckets in the hash table */
40 struct _fts1ht { /* the hash table */
41 int count; /* Number of entries with this hash */
42 fts1HashElem *chain; /* Pointer to first entry with this hash */
43 } *ht;
44};
45
46/* Each element in the hash table is an instance of the following
47** structure. All elements are stored on a single doubly-linked list.
48**
49** Again, this structure is intended to be opaque, but it can't really
50** be opaque because it is used by macros.
51*/
52struct fts1HashElem {
53 fts1HashElem *next, *prev; /* Next and previous elements in the table */
54 void *data; /* Data associated with this element */
55 void *pKey; int nKey; /* Key associated with this element */
56};
57
58/*
59** There are 2 different modes of operation for a hash table:
60**
61** FTS1_HASH_STRING pKey points to a string that is nKey bytes long
62** (including the null-terminator, if any). Case
63** is respected in comparisons.
64**
65** FTS1_HASH_BINARY pKey points to binary data nKey bytes long.
66** memcmp() is used to compare keys.
67**
68** A copy of the key is made if the copyKey parameter to fts1HashInit is 1.
69*/
70#define FTS1_HASH_STRING 1
71#define FTS1_HASH_BINARY 2
72
73/*
74** Access routines. To delete, insert a NULL pointer.
75*/
76void sqlite3Fts1HashInit(fts1Hash*, int keytype, int copyKey);
77void *sqlite3Fts1HashInsert(fts1Hash*, const void *pKey, int nKey, void *pData);
78void *sqlite3Fts1HashFind(const fts1Hash*, const void *pKey, int nKey);
79void sqlite3Fts1HashClear(fts1Hash*);
80
81/*
82** Shorthand for the functions above
83*/
84#define fts1HashInit sqlite3Fts1HashInit
85#define fts1HashInsert sqlite3Fts1HashInsert
86#define fts1HashFind sqlite3Fts1HashFind
87#define fts1HashClear sqlite3Fts1HashClear
88
89/*
90** Macros for looping over all elements of a hash table. The idiom is
91** like this:
92**
93** fts1Hash h;
94** fts1HashElem *p;
95** ...
96** for(p=fts1HashFirst(&h); p; p=fts1HashNext(p)){
97** SomeStructure *pData = fts1HashData(p);
98** // do something with pData
99** }
100*/
101#define fts1HashFirst(H) ((H)->first)
102#define fts1HashNext(E) ((E)->next)
103#define fts1HashData(E) ((E)->data)
104#define fts1HashKey(E) ((E)->pKey)
105#define fts1HashKeysize(E) ((E)->nKey)
106
107/*
108** Number of entries in a hash table
109*/
110#define fts1HashCount(H) ((H)->count)
111
112#endif /* _FTS1_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c
deleted file mode 100644
index 1d26236..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c
+++ /dev/null
@@ -1,643 +0,0 @@
1/*
2** 2006 September 30
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** Implementation of the full-text-search tokenizer that implements
13** a Porter stemmer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS1 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS1 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts1_tokenizer.h"
35
/*
** Class derived from sqlite3_tokenizer.  The porter tokenizer keeps no
** per-instance state beyond the base class.
*/
typedef struct porter_tokenizer {
  sqlite3_tokenizer base;      /* Base class */
} porter_tokenizer;

/*
** Class derived from sqlite3_tokenizer_cursor
*/
typedef struct porter_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *zInput;          /* input we are tokenizing */
  int nInput;                  /* size of the input */
  int iOffset;                 /* current position in zInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token (grown on demand) */
  int nAllocated;              /* space allocated to zToken buffer */
} porter_tokenizer_cursor;


/* Forward declaration */
static const sqlite3_tokenizer_module porterTokenizerModule;
59
60
61/*
62** Create a new tokenizer instance.
63*/
64static int porterCreate(
65 int argc, const char * const *argv,
66 sqlite3_tokenizer **ppTokenizer
67){
68 porter_tokenizer *t;
69 t = (porter_tokenizer *) calloc(sizeof(*t), 1);
70 if( t==NULL ) return SQLITE_NOMEM;
71
72 *ppTokenizer = &t->base;
73 return SQLITE_OK;
74}
75
76/*
77** Destroy a tokenizer
78*/
79static int porterDestroy(sqlite3_tokenizer *pTokenizer){
80 free(pTokenizer);
81 return SQLITE_OK;
82}
83
84/*
85** Prepare to begin tokenizing a particular string. The input
86** string to be tokenized is zInput[0..nInput-1]. A cursor
87** used to incrementally tokenize this string is returned in
88** *ppCursor.
89*/
90static int porterOpen(
91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
92 const char *zInput, int nInput, /* String to be tokenized */
93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
94){
95 porter_tokenizer_cursor *c;
96
97 c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
98 if( c==NULL ) return SQLITE_NOMEM;
99
100 c->zInput = zInput;
101 if( zInput==0 ){
102 c->nInput = 0;
103 }else if( nInput<0 ){
104 c->nInput = (int)strlen(zInput);
105 }else{
106 c->nInput = nInput;
107 }
108 c->iOffset = 0; /* start tokenizing at the beginning */
109 c->iToken = 0;
110 c->zToken = NULL; /* no space allocated, yet. */
111 c->nAllocated = 0;
112
113 *ppCursor = &c->base;
114 return SQLITE_OK;
115}
116
117/*
118** Close a tokenization cursor previously opened by a call to
119** porterOpen() above.
120*/
121static int porterClose(sqlite3_tokenizer_cursor *pCursor){
122 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
123 free(c->zToken);
124 free(c);
125 return SQLITE_OK;
126}
/*
** Classification table for 'a'..'z':
**   0 - always a vowel (a, e, i, o, u)
**   1 - always a consonant
**   2 - 'y': vowel or consonant depending on context
*/
static const char cType[] = {
   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
   1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() determine whether the first character of
** the string they point to is a consonant or a vowel, according to
** Porter rules.
**
** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'Y' is a consonant unless it follows another consonant, in which
** case it is a vowel.
**
** In these routines the letters are in reverse order, so the 'y' rule
** becomes: 'y' is a consonant unless it is followed (in the reversed
** text) by another consonant.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  int eType;
  char c = *z;

  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  eType = cType[c-'a'];
  if( eType<2 ) return eType;
  /* 'y': consonant iff at end of word or followed by a vowel-side 'y' chain */
  return z[1]==0 || isVowel(&z[1]);
}
static int isVowel(const char *z){
  int eType;
  char c = *z;

  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  eType = cType[c-'a'];
  if( eType<2 ) return 1-eType;
  return isConsonant(&z[1]);
}
167
/*
** Let any sequence of one or more vowels be represented by V and let
** C be a sequence of one or more consonants.  Then every word can be
** represented as:
**
**           [C] (VC){m} [V]
**
** In prose:  a word is an optional consonant run followed by m
** vowel-consonant pairs followed by an optional vowel run.  This
** routine returns true if the m-value for z is 1 or more, i.e. if z
** contains at least one vowel that is followed by a consonant.
**
** z[] is in reverse order, so we are really looking for an instance
** of a consonant followed by a vowel.
*/
static int m_gt_0(const char *z){
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  return *z!=0;
}
193
/* Like m_gt_0 above except we are looking for a value of m which is
** exactly 1.
*/
static int m_eq_1(const char *z){
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  if( *z==0 ) return 0;
  for(; isVowel(z); z++){}
  if( *z==0 ) return 1;
  for(; isConsonant(z); z++){}
  return *z==0;
}
207
/* Like m_gt_0 above except we are looking for a value of m>1 instead
** of m>0.
*/
static int m_gt_1(const char *z){
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  if( *z==0 ) return 0;
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  return *z!=0;
}
221
/*
** Return TRUE if there is a vowel anywhere within the nul-terminated
** (reversed) string z.
*/
static int hasVowel(const char *z){
  for(; isConsonant(z); z++){}
  return *z!=0;
}
229
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here, so we are really looking at the first
** two characters of z[].  Note that both positions must be checked
** with isConsonant(): a 'y' classifies differently depending on what
** follows it.
*/
static int doubleConsonant(const char *z){
  if( !isConsonant(z) ) return 0;
  if( z[0]!=z[1] ) return 0;
  return isConsonant(z+1);
}
239
/*
** Return TRUE if the word ends with three letters which are
** consonant-vowel-consonant and where the final consonant is not
** 'w', 'x', or 'y'.
**
** The word is reversed here, so we are really checking the first
** three letters, and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
  if( z[0]==0 || !isConsonant(z) ) return 0;
  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
  if( z[1]==0 || !isVowel(z+1) ) return 0;
  return z[2]!=0 && isConsonant(z+2);
}
255
/*
** If the word ends with zFrom and xCond() is true for the stem of the
** word that precedes the zFrom ending, then change the ending to zTo.
**
** The input word *pz and zFrom are both in reverse order.  zTo is in
** normal order.
**
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
** match.  Note that TRUE is returned even if xCond() fails and no
** substitution occurs.
*/
static int stem(
  char **pz,                /* The word being stemmed (Reversed) */
  const char *zFrom,        /* If the ending matches this... (Reversed) */
  const char *zTo,          /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*) /* Condition that must be true */
){
  char *zStem = *pz;

  /* Match the (reversed) suffix character by character. */
  for(; *zFrom; zFrom++, zStem++){
    if( *zFrom!=*zStem ) return 0;
  }

  /* Suffix matched.  Rewrite only when the condition holds. */
  if( xCond && !xCond(zStem) ) return 1;

  /* Write zTo backwards, immediately in front of the stem. */
  while( *zTo ){
    *(--zStem) = *(zTo++);
  }
  *pz = zStem;
  return 1;
}
284
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate.  The input word is copied into the output with
** US-ASCII case folding.  If the input word is too long (more than
** 20 bytes if it contains no digits, or more than 6 bytes if it
** contains digits) then the word is truncated to 20 or 6 bytes by
** taking 10 or 3 bytes from the beginning and end.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i;
  int nKeep;          /* bytes kept from each end when truncating */
  int seenDigit = 0;

  /* Copy with A-Z folded to a-z, noting whether any digit occurs. */
  for(i=0; i<nIn; i++){
    char c = zIn[i];
    if( c>='A' && c<='Z' ){
      c += 'a' - 'A';
    }else if( c>='0' && c<='9' ){
      seenDigit = 1;
    }
    zOut[i] = c;
  }

  nKeep = seenDigit ? 3 : 10;
  if( nIn > nKeep*2 ){
    /* Keep the first nKeep bytes, then pull the last nKeep bytes up
    ** next to them.  Regions may overlap, so use memmove(). */
    memmove(&zOut[nKeep], &zOut[nIn-nKeep], nKeep);
    i = nKeep*2;
  }
  zOut[i] = 0;
  *pnOut = i;
}
315
316
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
**
** If the input word contains no digits but does contain characters
** not in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];   /* the word, lower-cased and reversed, right-aligned */
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word into zReverse, reversed, lower-cased, ending at
  ** index sizeof(zReverse)-6.  j ends up just before the first byte. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* Zero the tail so the reversed word is nul-terminated. */
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  z = &zReverse[j+1];


  /* Step 1a: plural endings (reversed: "sess"="sses", "sei"="ies") */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;   /* bare trailing 's': drop it */
    }
  }

  /* Step 1b: "-eed", "-ed", "-ing" endings */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;   /* single out the doubled consonant */
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c: terminal 'y' -> 'i' when the stem contains a vowel */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2: map double suffixes to single ones.  The switch key z[1]
  ** is the second character of the reversed word (penultimate letter
  ** of the suffix in normal order). */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3: further suffix reduction ("-icate", "-ative", ...) */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4: strip residual suffixes when m>1 */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a: drop a terminal 'e' */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b: "-ll" -> "-l" when m>1 */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
556
/*
** Characters that can be part of a token.  We assume any character
** whose value is greater than 0x80 (any UTF character) can be
** part of a token.  In other words, delimiters all must have
** values of 0x7f or lower.
**
** The table covers byte values 0x30..0x7f; values below 0x30 are
** handled by the range test inside the macros below.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* NOTE: both macros assign their argument to a variable named "ch",
** which must be declared in the calling scope (see porterNext()). */
#define idChar(C)  (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
573
574/*
575** Extract the next token from a tokenization cursor. The cursor must
576** have been opened by a prior call to porterOpen().
577*/
578static int porterNext(
579 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
580 const char **pzToken, /* OUT: *pzToken is the token text */
581 int *pnBytes, /* OUT: Number of bytes in token */
582 int *piStartOffset, /* OUT: Starting offset of token */
583 int *piEndOffset, /* OUT: Ending offset of token */
584 int *piPosition /* OUT: Position integer of token */
585){
586 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
587 const char *z = c->zInput;
588
589 while( c->iOffset<c->nInput ){
590 int iStartOffset, ch;
591
592 /* Scan past delimiter characters */
593 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
594 c->iOffset++;
595 }
596
597 /* Count non-delimiter characters. */
598 iStartOffset = c->iOffset;
599 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
600 c->iOffset++;
601 }
602
603 if( c->iOffset>iStartOffset ){
604 int n = c->iOffset-iStartOffset;
605 if( n>c->nAllocated ){
606 c->nAllocated = n+20;
607 c->zToken = realloc(c->zToken, c->nAllocated);
608 if( c->zToken==NULL ) return SQLITE_NOMEM;
609 }
610 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
611 *pzToken = c->zToken;
612 *piStartOffset = iStartOffset;
613 *piEndOffset = c->iOffset;
614 *piPosition = c->iToken++;
615 return SQLITE_OK;
616 }
617 }
618 return SQLITE_DONE;
619}
620
/*
** The set of routines that implement the porter-stemmer tokenizer
*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
  0,                 /* iVersion */
  porterCreate,
  porterDestroy,
  porterOpen,
  porterClose,
  porterNext,
};

/*
** Return a pointer to the porter tokenizer module in *ppModule.
** (The module itself is a static singleton; nothing is allocated.)
*/
void sqlite3Fts1PorterTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &porterTokenizerModule;
}
642
643#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h
deleted file mode 100644
index a48cb74..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h
+++ /dev/null
@@ -1,90 +0,0 @@
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search.  There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions.  This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS1_TOKENIZER_H_
#define _FTS1_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"

/*
** Structures used by the tokenizer interface.
*/
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;

struct sqlite3_tokenizer_module {
  int iVersion;                  /* currently 0 */

  /*
  ** Create and destroy a tokenizer.  argc/argv are passed down from
  ** the fulltext virtual table creation to allow customization.
  */
  int (*xCreate)(int argc, const char *const*argv,
                 sqlite3_tokenizer **ppTokenizer);
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Tokenize a particular input.  Call xOpen() to prepare to
  ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
  ** xClose() to free any internal state.  The pInput passed to
  ** xOpen() must exist until the cursor is closed.  The ppToken
  ** result from xNext() is only valid until the next call to xNext()
  ** or until xClose() is called.
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xOpen)(sqlite3_tokenizer *pTokenizer,
               const char *pInput, int nBytes,
               sqlite3_tokenizer_cursor **ppCursor);
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
  int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
               const char **ppToken, int *pnBytes,
               int *piStartOffset, int *piEndOffset, int *piPosition);
};

struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

/*
** Get the module for a tokenizer which generates tokens based on a
** set of non-token characters.  The default is to break tokens at any
** non-alnum character, though the set of delimiters can also be
** specified by the first argv argument to xCreate().
*/
/* TODO(shess) This doesn't belong here.  Need some sort of
** registration process.
*/
void sqlite3Fts1SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts1PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);

#endif /* _FTS1_TOKENIZER_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c
deleted file mode 100644
index f58fba8..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c
+++ /dev/null
@@ -1,221 +0,0 @@
1/*
2** The author disclaims copyright to this source code.
3**
4*************************************************************************
5** Implementation of the "simple" full-text-search tokenizer.
6*/
7
8/*
9** The code in this file is only compiled if:
10**
11** * The FTS1 module is being built as an extension
12** (in which case SQLITE_CORE is not defined), or
13**
14** * The FTS1 module is being built into the core of
15** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
16*/
17#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
18
19
20#include <assert.h>
21#include <stdlib.h>
22#include <stdio.h>
23#include <string.h>
24#include <ctype.h>
25
26#include "fts1_tokenizer.h"
27
/* Tokenizer instance: breaks input on a configurable set of ASCII
** delimiter bytes (see simpleCreate()). */
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to pToken buffer */
} simple_tokenizer_cursor;


/* Forward declaration */
static const sqlite3_tokenizer_module simpleTokenizerModule;
46
47static int isDelim(simple_tokenizer *t, unsigned char c){
48 return c<0x80 && t->delim[c];
49}
50
51/*
52** Create a new tokenizer instance.
53*/
54static int simpleCreate(
55 int argc, const char * const *argv,
56 sqlite3_tokenizer **ppTokenizer
57){
58 simple_tokenizer *t;
59
60 t = (simple_tokenizer *) calloc(sizeof(*t), 1);
61 if( t==NULL ) return SQLITE_NOMEM;
62
63 /* TODO(shess) Delimiters need to remain the same from run to run,
64 ** else we need to reindex. One solution would be a meta-table to
65 ** track such information in the database, then we'd only want this
66 ** information on the initial create.
67 */
68 if( argc>1 ){
69 int i, n = strlen(argv[1]);
70 for(i=0; i<n; i++){
71 unsigned char ch = argv[1][i];
72 /* We explicitly don't support UTF-8 delimiters for now. */
73 if( ch>=0x80 ){
74 free(t);
75 return SQLITE_ERROR;
76 }
77 t->delim[ch] = 1;
78 }
79 } else {
80 /* Mark non-alphanumeric ASCII characters as delimiters */
81 int i;
82 for(i=1; i<0x80; i++){
83 t->delim[i] = !isalnum(i);
84 }
85 }
86
87 *ppTokenizer = &t->base;
88 return SQLITE_OK;
89}
90
91/*
92** Destroy a tokenizer
93*/
94static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
95 free(pTokenizer);
96 return SQLITE_OK;
97}
98
99/*
100** Prepare to begin tokenizing a particular string. The input
101** string to be tokenized is pInput[0..nBytes-1]. A cursor
102** used to incrementally tokenize this string is returned in
103** *ppCursor.
104*/
105static int simpleOpen(
106 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
107 const char *pInput, int nBytes, /* String to be tokenized */
108 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
109){
110 simple_tokenizer_cursor *c;
111
112 c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
113 if( c==NULL ) return SQLITE_NOMEM;
114
115 c->pInput = pInput;
116 if( pInput==0 ){
117 c->nBytes = 0;
118 }else if( nBytes<0 ){
119 c->nBytes = (int)strlen(pInput);
120 }else{
121 c->nBytes = nBytes;
122 }
123 c->iOffset = 0; /* start tokenizing at the beginning */
124 c->iToken = 0;
125 c->pToken = NULL; /* no space allocated, yet. */
126 c->nTokenAllocated = 0;
127
128 *ppCursor = &c->base;
129 return SQLITE_OK;
130}
131
132/*
133** Close a tokenization cursor previously opened by a call to
134** simpleOpen() above.
135*/
136static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
137 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
138 free(c->pToken);
139 free(c);
140 return SQLITE_OK;
141}
142
143/*
144** Extract the next token from a tokenization cursor. The cursor must
145** have been opened by a prior call to simpleOpen().
146*/
147static int simpleNext(
148 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
149 const char **ppToken, /* OUT: *ppToken is the token text */
150 int *pnBytes, /* OUT: Number of bytes in token */
151 int *piStartOffset, /* OUT: Starting offset of token */
152 int *piEndOffset, /* OUT: Ending offset of token */
153 int *piPosition /* OUT: Position integer of token */
154){
155 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
156 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
157 unsigned char *p = (unsigned char *)c->pInput;
158
159 while( c->iOffset<c->nBytes ){
160 int iStartOffset;
161
162 /* Scan past delimiter characters */
163 while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
164 c->iOffset++;
165 }
166
167 /* Count non-delimiter characters. */
168 iStartOffset = c->iOffset;
169 while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
170 c->iOffset++;
171 }
172
173 if( c->iOffset>iStartOffset ){
174 int i, n = c->iOffset-iStartOffset;
175 if( n>c->nTokenAllocated ){
176 c->nTokenAllocated = n+20;
177 c->pToken = realloc(c->pToken, c->nTokenAllocated);
178 if( c->pToken==NULL ) return SQLITE_NOMEM;
179 }
180 for(i=0; i<n; i++){
181 /* TODO(shess) This needs expansion to handle UTF-8
182 ** case-insensitivity.
183 */
184 unsigned char ch = p[iStartOffset+i];
185 c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
186 }
187 *ppToken = c->pToken;
188 *pnBytes = n;
189 *piStartOffset = iStartOffset;
190 *piEndOffset = c->iOffset;
191 *piPosition = c->iToken++;
192
193 return SQLITE_OK;
194 }
195 }
196 return SQLITE_DONE;
197}
198
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,                /* iVersion */
  simpleCreate,
  simpleDestroy,
  simpleOpen,
  simpleClose,
  simpleNext,
};

/*
** Return a pointer to the simple tokenizer module in *ppModule.
** (The module itself is a static singleton; nothing is allocated.)
*/
void sqlite3Fts1SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &simpleTokenizerModule;
}
220
221#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers
deleted file mode 100644
index 6625b31..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers
+++ /dev/null
@@ -1,134 +0,0 @@
1
21. FTS2 Tokenizers
3
4 When creating a new full-text table, FTS2 allows the user to select
5 the text tokenizer implementation to be used when indexing text
6 by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
7 statement:
8
9 CREATE VIRTUAL TABLE <table-name> USING fts2(
10 <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
11 );
12
13 The built-in tokenizers (valid values to pass as <tokenizer name>) are
14 "simple" and "porter".
15
16 <tokenizer-args> should consist of zero or more white-space separated
17 arguments to pass to the selected tokenizer implementation. The
18 interpretation of the arguments, if any, depends on the individual
19 tokenizer.
20
212. Custom Tokenizers
22
23 FTS2 allows users to provide custom tokenizer implementations. The
24 interface used to create a new tokenizer is defined and described in
25 the fts2_tokenizer.h source file.
26
27 Registering a new FTS2 tokenizer is similar to registering a new
28 virtual table module with SQLite. The user passes a pointer to a
29 structure containing pointers to various callback functions that
30 make up the implementation of the new tokenizer type. For tokenizers,
31 the structure (defined in fts2_tokenizer.h) is called
32 "sqlite3_tokenizer_module".
33
34 FTS2 does not expose a C-function that users call to register new
35 tokenizer types with a database handle. Instead, the pointer must
36 be encoded as an SQL blob value and passed to FTS2 through the SQL
37 engine by evaluating a special scalar function, "fts2_tokenizer()".
38 The fts2_tokenizer() function may be called with one or two arguments,
39 as follows:
40
41 SELECT fts2_tokenizer(<tokenizer-name>);
42 SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
43
44 Where <tokenizer-name> is a string identifying the tokenizer and
45 <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
46 structure encoded as an SQL blob. If the second argument is present,
47 it is registered as tokenizer <tokenizer-name> and a copy of it
48 returned. If only one argument is passed, a pointer to the tokenizer
49 implementation currently registered as <tokenizer-name> is returned,
50 encoded as a blob. Or, if no such tokenizer exists, an SQL exception
51 (error) is raised.
52
53 SECURITY: If the fts2 extension is used in an environment where potentially
54 malicious users may execute arbitrary SQL (i.e. gears), they should be
55 prevented from invoking the fts2_tokenizer() function, possibly using the
56 authorisation callback.
57
58 See "Sample code" below for an example of calling the fts2_tokenizer()
59 function from C code.
60
613. ICU Library Tokenizers
62
63 If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
64 symbol defined, then there exists a built-in tokenizer named "icu"
65 implemented using the ICU library. The first argument passed to the
66 xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
67 an ICU locale identifier. For example "tr_TR" for Turkish as used
68 in Turkey, or "en_AU" for English as used in Australia. For example:
69
70 "CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
71
72 The ICU tokenizer implementation is very simple. It splits the input
73 text according to the ICU rules for finding word boundaries and discards
74 any tokens that consist entirely of white-space. This may be suitable
75 for some applications in some locales, but not all. If more complex
76 processing is required, for example to implement stemming or
77 discard punctuation, this can be done by creating a tokenizer
   implementation that uses the ICU tokenizer as part of its implementation.
79
80 When using the ICU tokenizer this way, it is safe to overwrite the
81 contents of the strings returned by the xNext() method (see
82 fts2_tokenizer.h).
83
844. Sample code.
85
86 The following two code samples illustrate the way C code should invoke
87 the fts2_tokenizer() scalar function:
88
89 int registerTokenizer(
90 sqlite3 *db,
91 char *zName,
92 const sqlite3_tokenizer_module *p
93 ){
94 int rc;
95 sqlite3_stmt *pStmt;
96 const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
97
98 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
99 if( rc!=SQLITE_OK ){
100 return rc;
101 }
102
103 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
104 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
105 sqlite3_step(pStmt);
106
107 return sqlite3_finalize(pStmt);
108 }
109
110 int queryTokenizer(
111 sqlite3 *db,
112 char *zName,
113 const sqlite3_tokenizer_module **pp
114 ){
115 int rc;
116 sqlite3_stmt *pStmt;
117 const char zSql[] = "SELECT fts2_tokenizer(?)";
118
119 *pp = 0;
120 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
121 if( rc!=SQLITE_OK ){
122 return rc;
123 }
124
125 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
126 if( SQLITE_ROW==sqlite3_step(pStmt) ){
127 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
128 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
129 }
130 }
131
132 return sqlite3_finalize(pStmt);
133 }
134
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt
deleted file mode 100644
index 517a2a0..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
1This folder contains source code to the second full-text search
2extension for SQLite. While the API is the same, this version uses a
3substantially different storage schema from fts1, so tables will need
4to be rebuilt.
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c
deleted file mode 100644
index 65ad173..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c
+++ /dev/null
@@ -1,5936 +0,0 @@
1/* fts2 has a design flaw which can lead to database corruption (see
2** below). It is recommended not to use it any longer, instead use
3** fts3 (or higher). If you believe that your use of fts2 is safe,
4** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
5*/
6#ifndef SQLITE_ENABLE_BROKEN_FTS2
7#error fts2 has a design flaw and has been deprecated.
8#endif
9/* The flaw is that fts2 uses the content table's unaliased rowid as
10** the unique docid. fts2 embeds the rowid in the index it builds,
11** and expects the rowid to not change. The SQLite VACUUM operation
12** will renumber such rowids, thereby breaking fts2. If you are using
13** fts2 in a system which has disabled VACUUM, then you can continue
14** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
15** VACUUM, though systems using auto_vacuum are unlikely to invoke
16** VACUUM.
17**
18** Unlike fts1, which is safe across VACUUM if you never delete
19** documents, fts2 has a second exposure to this flaw, in the segments
20** table. So fts2 should be considered unsafe across VACUUM in all
21** cases.
22*/
23
24/*
25** 2006 Oct 10
26**
27** The author disclaims copyright to this source code. In place of
28** a legal notice, here is a blessing:
29**
30** May you do good and not evil.
31** May you find forgiveness for yourself and forgive others.
32** May you share freely, never taking more than you give.
33**
34******************************************************************************
35**
36** This is an SQLite module implementing full-text search.
37*/
38
39/*
40** The code in this file is only compiled if:
41**
42** * The FTS2 module is being built as an extension
43** (in which case SQLITE_CORE is not defined), or
44**
45** * The FTS2 module is being built into the core of
46** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
47*/
48
49/* TODO(shess) Consider exporting this comment to an HTML file or the
50** wiki.
51*/
52/* The full-text index is stored in a series of b+tree (-like)
53** structures called segments which map terms to doclists. The
54** structures are like b+trees in layout, but are constructed from the
55** bottom up in optimal fashion and are not updatable. Since trees
56** are built from the bottom up, things will be described from the
57** bottom up.
58**
59**
60**** Varints ****
61** The basic unit of encoding is a variable-length integer called a
62** varint. We encode variable-length integers in little-endian order
** using seven bits per byte as follows:
64**
65** KEY:
66** A = 0xxxxxxx 7 bits of data and one flag bit
67** B = 1xxxxxxx 7 bits of data and one flag bit
68**
69** 7 bits - A
70** 14 bits - BA
71** 21 bits - BBA
72** and so on.
73**
74** This is identical to how sqlite encodes varints (see util.c).
75**
76**
77**** Document lists ****
78** A doclist (document list) holds a docid-sorted list of hits for a
79** given term. Doclists hold docids, and can optionally associate
80** token positions and offsets with docids.
81**
82** A DL_POSITIONS_OFFSETS doclist is stored like this:
83**
84** array {
85** varint docid;
86** array { (position list for column 0)
87** varint position; (delta from previous position plus POS_BASE)
88** varint startOffset; (delta from previous startOffset)
89** varint endOffset; (delta from startOffset)
90** }
91** array {
92** varint POS_COLUMN; (marks start of position list for new column)
93** varint column; (index of new column)
94** array {
95** varint position; (delta from previous position plus POS_BASE)
96** varint startOffset;(delta from previous startOffset)
97** varint endOffset; (delta from startOffset)
98** }
99** }
**     varint POS_END;           (marks end of positions for this document.)
101** }
102**
103** Here, array { X } means zero or more occurrences of X, adjacent in
104** memory. A "position" is an index of a token in the token stream
105** generated by the tokenizer, while an "offset" is a byte offset,
106** both based at 0. Note that POS_END and POS_COLUMN occur in the
** same logical place as the position element, and act as sentinels
108** ending a position list array.
109**
110** A DL_POSITIONS doclist omits the startOffset and endOffset
111** information. A DL_DOCIDS doclist omits both the position and
112** offset information, becoming an array of varint-encoded docids.
113**
114** On-disk data is stored as type DL_DEFAULT, so we don't serialize
115** the type. Due to how deletion is implemented in the segmentation
116** system, on-disk doclists MUST store at least positions.
117**
118**
119**** Segment leaf nodes ****
120** Segment leaf nodes store terms and doclists, ordered by term. Leaf
121** nodes are written using LeafWriter, and read using LeafReader (to
122** iterate through a single leaf node's data) and LeavesReader (to
123** iterate through a segment's entire leaf layer). Leaf nodes have
124** the format:
125**
126** varint iHeight; (height from leaf level, always 0)
127** varint nTerm; (length of first term)
128** char pTerm[nTerm]; (content of first term)
129** varint nDoclist; (length of term's associated doclist)
130** char pDoclist[nDoclist]; (content of doclist)
131** array {
132** (further terms are delta-encoded)
133** varint nPrefix; (length of prefix shared with previous term)
134** varint nSuffix; (length of unshared suffix)
135** char pTermSuffix[nSuffix];(unshared suffix of next term)
136** varint nDoclist; (length of term's associated doclist)
137** char pDoclist[nDoclist]; (content of doclist)
138** }
139**
140** Here, array { X } means zero or more occurrences of X, adjacent in
141** memory.
142**
143** Leaf nodes are broken into blocks which are stored contiguously in
144** the %_segments table in sorted order. This means that when the end
145** of a node is reached, the next term is in the node with the next
146** greater node id.
147**
148** New data is spilled to a new leaf node when the current node
149** exceeds LEAF_MAX bytes (default 2048). New data which itself is
150** larger than STANDALONE_MIN (default 1024) is placed in a standalone
151** node (a leaf node with a single term and doclist). The goal of
152** these settings is to pack together groups of small doclists while
153** making it efficient to directly access large doclists. The
154** assumption is that large doclists represent terms which are more
155** likely to be query targets.
156**
157** TODO(shess) It may be useful for blocking decisions to be more
158** dynamic. For instance, it may make more sense to have a 2.5k leaf
159** node rather than splitting into 2k and .5k nodes. My intuition is
160** that this might extend through 2x or 4x the pagesize.
161**
162**
163**** Segment interior nodes ****
164** Segment interior nodes store blockids for subtree nodes and terms
** to describe what data is stored by each subtree.  Interior
166** nodes are written using InteriorWriter, and read using
167** InteriorReader. InteriorWriters are created as needed when
168** SegmentWriter creates new leaf nodes, or when an interior node
169** itself grows too big and must be split. The format of interior
170** nodes:
171**
172** varint iHeight; (height from leaf level, always >0)
173** varint iBlockid; (block id of node's leftmost subtree)
174** optional {
175** varint nTerm; (length of first term)
176** char pTerm[nTerm]; (content of first term)
177** array {
178** (further terms are delta-encoded)
179** varint nPrefix; (length of shared prefix with previous term)
180** varint nSuffix; (length of unshared suffix)
181** char pTermSuffix[nSuffix]; (unshared suffix of next term)
182** }
183** }
184**
185** Here, optional { X } means an optional element, while array { X }
186** means zero or more occurrences of X, adjacent in memory.
187**
188** An interior node encodes n terms separating n+1 subtrees. The
189** subtree blocks are contiguous, so only the first subtree's blockid
190** is encoded. The subtree at iBlockid will contain all terms less
191** than the first term encoded (or all terms if no term is encoded).
192** Otherwise, for terms greater than or equal to pTerm[i] but less
193** than pTerm[i+1], the subtree for that term will be rooted at
194** iBlockid+i. Interior nodes only store enough term data to
195** distinguish adjacent children (if the rightmost term of the left
196** child is "something", and the leftmost term of the right child is
197** "wicked", only "w" is stored).
198**
199** New data is spilled to a new interior node at the same height when
200** the current node exceeds INTERIOR_MAX bytes (default 2048).
201** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
202** interior nodes and making the tree too skinny. The interior nodes
203** at a given height are naturally tracked by interior nodes at
204** height+1, and so on.
205**
206**
207**** Segment directory ****
208** The segment directory in table %_segdir stores meta-information for
209** merging and deleting segments, and also the root node of the
210** segment's tree.
211**
212** The root node is the top node of the segment's tree after encoding
213** the entire segment, restricted to ROOT_MAX bytes (default 1024).
214** This could be either a leaf node or an interior node. If the top
215** node requires more than ROOT_MAX bytes, it is flushed to %_segments
216** and a new root interior node is generated (which should always fit
217** within ROOT_MAX because it only needs space for 2 varints, the
218** height and the blockid of the previous root).
219**
220** The meta-information in the segment directory is:
221** level - segment level (see below)
222** idx - index within level
223** - (level,idx uniquely identify a segment)
224** start_block - first leaf node
225** leaves_end_block - last leaf node
226** end_block - last block (including interior nodes)
227** root - contents of root node
228**
229** If the root node is a leaf node, then start_block,
230** leaves_end_block, and end_block are all 0.
231**
232**
233**** Segment merging ****
** To amortize update costs, segments are grouped into levels and
** merged in batches.  Each increase in level represents exponentially
236** more documents.
237**
238** New documents (actually, document updates) are tokenized and
239** written individually (using LeafWriter) to a level 0 segment, with
240** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
241** level 0 segments are merged into a single level 1 segment. Level 1
242** is populated like level 0, and eventually MERGE_COUNT level 1
243** segments are merged to a single level 2 segment (representing
244** MERGE_COUNT^2 updates), and so on.
245**
246** A segment merge traverses all segments at a given level in
247** parallel, performing a straightforward sorted merge. Since segment
** leaf nodes are written into the %_segments table in order, this
249** merge traverses the underlying sqlite disk structures efficiently.
250** After the merge, all segment blocks from the merged level are
251** deleted.
252**
253** MERGE_COUNT controls how often we merge segments. 16 seems to be
254** somewhat of a sweet spot for insertion performance. 32 and 64 show
255** very similar performance numbers to 16 on insertion, though they're
256** a tiny bit slower (perhaps due to more overhead in merge-time
257** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
258** 16, 2 about 66% slower than 16.
259**
260** At query time, high MERGE_COUNT increases the number of segments
261** which need to be scanned and merged. For instance, with 100k docs
262** inserted:
263**
264** MERGE_COUNT segments
265** 16 25
266** 8 12
267** 4 10
268** 2 6
269**
270** This appears to have only a moderate impact on queries for very
271** frequent terms (which are somewhat dominated by segment merge
272** costs), and infrequent and non-existent terms still seem to be fast
273** even with many segments.
274**
275** TODO(shess) That said, it would be nice to have a better query-side
276** argument for MERGE_COUNT of 16. Also, it's possible/likely that
277** optimizations to things like doclist merging will swing the sweet
278** spot around.
279**
280**
281**
282**** Handling of deletions and updates ****
283** Since we're using a segmented structure, with no docid-oriented
284** index into the term index, we clearly cannot simply update the term
285** index when a document is deleted or updated. For deletions, we
286** write an empty doclist (varint(docid) varint(POS_END)), for updates
287** we simply write the new doclist. Segment merges overwrite older
288** data for a particular docid with newer data, so deletes or updates
289** will eventually overtake the earlier data and knock it out. The
290** query logic likewise merges doclists so that newer data knocks out
291** older data.
292**
293** TODO(shess) Provide a VACUUM type operation to clear out all
294** deletions and duplications. This would basically be a forced merge
295** into a single segment.
296*/
297
298#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
299
300#if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
301# define SQLITE_CORE 1
302#endif
303
304#include <assert.h>
305#include <stdlib.h>
306#include <stdio.h>
307#include <string.h>
308#include <ctype.h>
309
310#include "fts2.h"
311#include "fts2_hash.h"
312#include "fts2_tokenizer.h"
313#include "sqlite3.h"
314#include "sqlite3ext.h"
315SQLITE_EXTENSION_INIT1
316
317
318/* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
319** would be nice to order the file better, perhaps something along the
320** lines of:
321**
322** - utility functions
323** - table setup functions
324** - table update functions
325** - table query functions
326**
327** Put the query functions last because they're likely to reference
328** typedefs or functions from the table update section.
329*/
330
331#if 0
332# define TRACE(A) printf A; fflush(stdout)
333#else
334# define TRACE(A)
335#endif
336
337/* It is not safe to call isspace(), tolower(), or isalnum() on
338** hi-bit-set characters. This is the same solution used in the
339** tokenizer.
340*/
341/* TODO(shess) The snippet-generation code should be using the
342** tokenizer-generated tokens rather than doing its own local
343** tokenization.
344*/
345/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
/* ASCII-only wrappers around <ctype.h>.  Passing a hi-bit-set (and
** hence possibly negative) char to isspace()/tolower()/isalnum() is
** undefined behavior, so bytes outside 7-bit ASCII are handled
** explicitly: never space or alphanumeric, unchanged by tolower.
*/
static int safe_isspace(char c){
  if( c&0x80 ) return 0;
  return isspace(c);
}
static int safe_tolower(char c){
  if( c&0x80 ) return c;
  return tolower(c);
}
static int safe_isalnum(char c){
  if( c&0x80 ) return 0;
  return isalnum(c);
}
355
/* The three doclist encodings, ordered from least to most detailed.
** The ordering matters: readers compare with e.g. iType>=DL_POSITIONS.
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;
361
362/*
363** By default, only positions and not offsets are stored in the doclists.
364** To change this so that offsets are stored too, compile with
365**
366** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
367**
368** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
369** into (no deletes or updates).
370*/
371#ifndef DL_DEFAULT
372# define DL_DEFAULT DL_POSITIONS
373#endif
374
/* Sentinel values embedded in position lists.  POS_END and POS_COLUMN
** occupy the same varint slot as real positions, so encoded positions
** are biased by POS_BASE to stay clear of the sentinels.
*/
enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};
380
381/* MERGE_COUNT controls how often we merge segments (see comment at
382** top of file).
383*/
384#define MERGE_COUNT 16
385
386/* utility functions */
387
388/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
389** record to prevent errors of the form:
390**
391** my_function(SomeType *b){
392** memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
393** }
394*/
395/* TODO(shess) Obvious candidates for a header file. */
396#define CLEAR(b) memset(b, '\0', sizeof(*(b)))
397
398#ifndef NDEBUG
399# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
400#else
401# define SCRAMBLE(b)
402#endif
403
404/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
405#define VARINT_MAX 10
406
407/* Write a 64-bit variable-length integer to memory starting at p[0].
408 * The length of data written will be between 1 and VARINT_MAX bytes.
409 * The number of bytes written is returned. */
410static int putVarint(char *p, sqlite_int64 v){
411 unsigned char *q = (unsigned char *) p;
412 sqlite_uint64 vu = v;
413 do{
414 *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
415 vu >>= 7;
416 }while( vu!=0 );
417 q[-1] &= 0x7f; /* turn off high bit in final byte */
418 assert( q - (unsigned char *)p <= VARINT_MAX );
419 return (int) (q - (unsigned char *)p);
420}
421
422/* Read a 64-bit variable-length integer from memory starting at p[0].
423 * Return the number of bytes read, or 0 on error.
424 * The value is stored in *v. */
425static int getVarint(const char *p, sqlite_int64 *v){
426 const unsigned char *q = (const unsigned char *) p;
427 sqlite_uint64 x = 0, y = 1;
428 while( (*q & 0x80) == 0x80 ){
429 x += y * (*q++ & 0x7f);
430 y <<= 7;
431 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
432 assert( 0 );
433 return 0;
434 }
435 }
436 x += y * (*q++);
437 *v = (sqlite_int64) x;
438 return (int) (q - (unsigned char *)p);
439}
440
441static int getVarint32(const char *p, int *pi){
442 sqlite_int64 i;
443 int ret = getVarint(p, &i);
444 *pi = (int) i;
445 assert( *pi==i );
446 return ret;
447}
448
449/*******************************************************************/
450/* DataBuffer is used to collect data into a buffer in piecemeal
451** fashion. It implements the usual distinction between amount of
452** data currently stored (nData) and buffer capacity (nCapacity).
453**
454** dataBufferInit - create a buffer with given initial capacity.
455** dataBufferReset - forget buffer's data, retaining capacity.
456** dataBufferDestroy - free buffer's data.
457** dataBufferExpand - expand capacity without adding data.
458** dataBufferAppend - append data.
459** dataBufferAppend2 - append two pieces of data at once.
460** dataBufferReplace - replace buffer's data.
461*/
/* See the dataBuffer* notes above.  Invariant: 0 <= nData <= nCapacity. */
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;
467
468static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
469 assert( nCapacity>=0 );
470 pBuffer->nData = 0;
471 pBuffer->nCapacity = nCapacity;
472 pBuffer->pData = nCapacity==0 ? NULL : malloc(nCapacity);
473}
474static void dataBufferReset(DataBuffer *pBuffer){
475 pBuffer->nData = 0;
476}
477static void dataBufferDestroy(DataBuffer *pBuffer){
478 if( pBuffer->pData!=NULL ) free(pBuffer->pData);
479 SCRAMBLE(pBuffer);
480}
/* Ensure the buffer can hold at least nAddCapacity bytes beyond the
** data currently stored, growing the allocation exactly as needed.
*/
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  /* NOTE(review): realloc() failure is unhandled — on OOM the old
  ** buffer leaks and pData becomes NULL; confirm callers tolerate
  ** this before reuse.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
492static void dataBufferAppend(DataBuffer *pBuffer,
493 const char *pSource, int nSource){
494 assert( nSource>0 && pSource!=NULL );
495 dataBufferExpand(pBuffer, nSource);
496 memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
497 pBuffer->nData += nSource;
498}
499static void dataBufferAppend2(DataBuffer *pBuffer,
500 const char *pSource1, int nSource1,
501 const char *pSource2, int nSource2){
502 assert( nSource1>0 && pSource1!=NULL );
503 assert( nSource2>0 && pSource2!=NULL );
504 dataBufferExpand(pBuffer, nSource1+nSource2);
505 memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
506 memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
507 pBuffer->nData += nSource1+nSource2;
508}
509static void dataBufferReplace(DataBuffer *pBuffer,
510 const char *pSource, int nSource){
511 dataBufferReset(pBuffer);
512 dataBufferAppend(pBuffer, pSource, nSource);
513}
514
/* StringBuffer layers NUL-terminated string semantics on top of
** DataBuffer; b always holds the terminator, so b.nData>=1.
*/
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;

/* Initialize sb to the empty string (a single '\0'). */
static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
/* strlen() of the contents, excluding the terminator. */
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
/* Borrowed pointer to the NUL-terminated contents. */
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
/* Release the underlying DataBuffer. */
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}
533
534static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
535 assert( sb->b.nData>0 );
536 if( nFrom>0 ){
537 sb->b.nData--;
538 dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
539 }
540}
541static void append(StringBuffer *sb, const char *zFrom){
542 nappend(sb, zFrom, strlen(zFrom));
543}
544
545/* Append a list of strings separated by commas. */
546static void appendList(StringBuffer *sb, int nString, char **azString){
547 int i;
548 for(i=0; i<nString; ++i){
549 if( i>0 ) append(sb, ", ");
550 append(sb, azString[i]);
551 }
552}
553
554static int endsInWhiteSpace(StringBuffer *p){
555 return stringBufferLength(p)>0 &&
556 safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
557}
558
559/* If the StringBuffer ends in something other than white space, add a
560** single space character to the end.
561*/
562static void appendWhiteSpace(StringBuffer *p){
563 if( stringBufferLength(p)==0 ) return;
564 if( !endsInWhiteSpace(p) ) append(p, " ");
565}
566
567/* Remove white space from the end of the StringBuffer */
568static void trimWhiteSpace(StringBuffer *p){
569 while( endsInWhiteSpace(p) ){
570 p->b.pData[--p->b.nData-1] = '\0';
571 }
572}
573
574/*******************************************************************/
575/* DLReader is used to read document elements from a doclist. The
576** current docid is cached, so dlrDocid() is fast. DLReader does not
577** own the doclist buffer.
578**
579** dlrAtEnd - true if there's no more data to read.
580** dlrDocid - docid of current document.
581** dlrDocData - doclist data for current document (including docid).
582** dlrDocDataBytes - length of same.
583** dlrAllDataBytes - length of all remaining data.
584** dlrPosData - position data for current document.
585** dlrPosDataLen - length of pos data for current document (incl POS_END).
586** dlrStep - step to current document.
** dlrInit - initialize for a doclist of given type against given data.
588** dlrDestroy - clean up.
589**
590** Expected usage is something like:
591**
592** DLReader reader;
593** dlrInit(&reader, pData, nData);
594** while( !dlrAtEnd(&reader) ){
595** // calls to dlrDocid() and kin.
596** dlrStep(&reader);
597** }
598** dlrDestroy(&reader);
599*/
/* Cursor state for reading a doclist; see the dlr* notes above. */
typedef struct DLReader {
  DocListType iType;    /* encoding of the doclist being read */
  const char *pData;    /* current element (buffer is not owned) */
  int nData;            /* bytes remaining, current element included */

  sqlite_int64 iDocid;  /* docid of current element (delta-decoded) */
  int nElement;         /* byte length of current element */
} DLReader;
608
/* True when every element of the doclist has been consumed. */
static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
/* Docid of the current element. */
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
/* Raw data of the current element, beginning with its docid varint. */
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
/* Byte length of the current element. */
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
/* Bytes remaining in the doclist, current element included. */
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
/* TODO(shess) Consider adding a field to track iDocid varint length
** to make these two functions faster.  This might matter (a tiny bit)
** for queries.
*/
/* Position-list data of the current element (skips the docid varint). */
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
/* Length of that position-list data, including the POS_END terminator. */
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
/* Advance past the current element and decode the next one, if any.
** Decoding computes the element's total byte length into nElement
** (the docid varint plus, for positional doclists, the position list
** up to and including POS_END) and applies the docid delta to iDocid.
*/
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          /* Skip the column number that follows POS_COLUMN. */
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          /* Skip the startOffset and endOffset deltas. */
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
/* Initialize pReader over the doclist pData[0..nData-1] encoded with
** type iType, and decode its first element.  The doclist must be
** non-empty; the buffer is borrowed, not owned.
*/
static void dlrInit(DLReader *pReader, DocListType iType,
                    const char *pData, int nData){
  assert( pData!=NULL && nData!=0 );
  pReader->iType = iType;
  pReader->pData = pData;
  pReader->nData = nData;
  pReader->nElement = 0;
  pReader->iDocid = 0;

  /* Load the first element's data.  There must be a first element. */
  dlrStep(pReader);
}
/* Poison the reader (debug builds) so stale use is conspicuous.  The
** underlying doclist buffer is not owned and is not freed.
*/
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}
693
#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it's convenient in other assertions for
** DLWriter.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        /* Walk the position list to POS_END, skipping the column
        ** number after POS_COLUMN and the two offset deltas when
        ** offsets are stored.
        */
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif
733
734/*******************************************************************/
735/* DLWriter is used to write doclist data to a DataBuffer. DLWriter
736** always appends to the buffer and does not own it.
737**
** dlwInit - initialize to write a doclist of a given type to a buffer.
739** dlwDestroy - clear the writer's memory. Does not free buffer.
740** dlwAppend - append raw doclist data to buffer.
741** dlwCopy - copy next doclist from reader to writer.
742** dlwAdd - construct doclist element and append to buffer.
743** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
744*/
/* Writer state for appending doclist data; see the dlw* notes above. */
typedef struct DLWriter {
  DocListType iType;        /* encoding being written */
  DataBuffer *b;            /* output buffer (borrowed, never freed here) */
  sqlite_int64 iPrevDocid;  /* last docid written; used for delta-encoding */
#ifndef NDEBUG
  int has_iPrevDocid;       /* set once dlwAdd() has written a docid */
#endif
} DLWriter;
753
754static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
755 pWriter->b = b;
756 pWriter->iType = iType;
757 pWriter->iPrevDocid = 0;
758#ifndef NDEBUG
759 pWriter->has_iPrevDocid = 0;
760#endif
761}
/* Poison the writer (debug builds).  The output buffer is not freed. */
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
/* Append the doclist in pData[0..nData-1] to the writer's buffer,
** re-encoding only the first docid as a delta against iPrevDocid.
**
** iFirstDocid is the first docid in the doclist in pData.  It is
** needed because pData may point within a larger doclist, in which
** case the first item would be delta-encoded.
**
** iLastDocid is the final docid in the doclist in pData.  It is
** needed to create the new iPrevDocid for future delta-encoding.  The
** code could decode the passed doclist to recreate iLastDocid, but
** the only current user (docListMerge) already has decoded this
** information.
*/
/* TODO(shess) This has become just a helper for docListMerge.
** Consider a refactor to make this cleaner.
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  /* A doclist that is only a docid varint is legal only for DL_DOCIDS. */
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
/* Copy the reader's current element to the writer, re-encoding its
** docid delta.  The element's docid serves as both first and last
** docid because exactly one element is copied.
*/
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
815static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
816 char c[VARINT_MAX];
817 int n = putVarint(c, iDocid-pWriter->iPrevDocid);
818
819 /* Docids must ascend. */
820 assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
821 assert( pWriter->iType==DL_DOCIDS );
822
823 dataBufferAppend(pWriter->b, c, n);
824 pWriter->iPrevDocid = iDocid;
825#ifndef NDEBUG
826 pWriter->has_iPrevDocid = 1;
827#endif
828}
829
830/*******************************************************************/
831/* PLReader is used to read data from a document's position list. As
832** the caller steps through the list, data is cached so that varints
833** only need to be decoded once.
834**
835** plrInit, plrDestroy - create/destroy a reader.
836** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
837** plrAtEnd - at end of stream, only call plrDestroy once true.
838** plrStep - step to the next element.
839*/
/* PLReader decodes one document's position list incrementally.  The
** iColumn/iPosition/iStartOffset/iEndOffset fields cache the values
** most recently decoded by plrStep().
*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;      /* encoding of the enclosing doclist */
  int iColumn;            /* the last column read */
  int iPosition;          /* the last position read */
  int iStartOffset;       /* the last start offset read */
  int iEndOffset;         /* the last end offset read */
} PLReader;
854
/* True once the position list is exhausted; only plrDestroy() may be
** called after this returns true.
*/
static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
/* Column of the current position. */
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
/* Token index of the current position. */
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
/* Start byte offset of the current token (only updated when the
** doclist type is DL_POSITIONS_OFFSETS).
*/
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
/* End byte offset of the current token (only updated when the
** doclist type is DL_POSITIONS_OFFSETS).
*/
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
/* Decode the next position element into the reader's cached fields.
** Column numbers, positions, and start offsets are delta-encoded
** varints; end offsets are encoded relative to the start offset.
*/
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  /* The previous step consumed the last element; signal EOF by
  ** clearing pData (see plrAtEnd()).
  */
  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  /* A POS_COLUMN marker is followed by the new column number; the
  ** position and start-offset deltas restart at 0 for the new column.
  */
  if( i==POS_COLUMN ){
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  /* Position deltas are biased by POS_BASE (the same value checked
  ** against the POS_COLUMN/POS_END markers above).
  */
  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}
911
912static void plrInit(PLReader *pReader, DLReader *pDLReader){
913 pReader->pData = dlrPosData(pDLReader);
914 pReader->nData = dlrPosDataLen(pDLReader);
915 pReader->iType = pDLReader->iType;
916 pReader->iColumn = 0;
917 pReader->iPosition = 0;
918 pReader->iStartOffset = 0;
919 pReader->iEndOffset = 0;
920 plrStep(pReader);
921}
static void plrDestroy(PLReader *pReader){
  /* Overwrite the struct (presumably to catch use-after-destroy in
  ** debug builds -- see SCRAMBLE's definition). No memory is freed.
  */
  SCRAMBLE(pReader);
}
925
926/*******************************************************************/
927/* PLWriter is used in constructing a document's position list. As a
928** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
929** PLWriter writes to the associated DLWriter's buffer.
930**
931** plwInit - init for writing a document's poslist.
932** plwDestroy - clear a writer.
933** plwAdd - append position and offset information.
934** plwCopy - copy next position's data from reader to writer.
935** plwTerminate - add any necessary doclist terminator.
936**
937** Calling plwAdd() after plwTerminate() may result in a corrupt
938** doclist.
939*/
940/* TODO(shess) Until we've written the second item, we can cache the
941** first item's information. Then we'd have three states:
942**
943** - initialized with docid, no positions.
944** - docid and one position.
945** - docid and multiple positions.
946**
947** Only the last state needs to actually write to dlw->b, which would
948** be an improvement in the DLCollector case.
949*/
typedef struct PLWriter {
  DLWriter *dlw;   /* output is appended to dlw->b */

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written; -1 after plwTerminate()
                  ** in debug builds (see assert in plwAdd()) */
  int iOffset;    /* the last start offset written */
} PLWriter;
957
958/* TODO(shess) In the case where the parent is reading these values
959** from a PLReader, we could optimize to a copy if that PLReader has
960** the same type as pWriter.
961*/
/* Append one position (and, for DL_POSITIONS_OFFSETS, its offsets) to
** the writer's doclist. The encoding mirrors plrStep(): column changes
** are marked with POS_COLUMN, positions and start offsets are
** delta-encoded, and the end offset is relative to the start offset.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  /* DL_DOCIDS doclists carry no position data at all. */
  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  /* On a column change, emit a POS_COLUMN marker and restart the
  ** position/offset deltas from 0.
  */
  if( iColumn!=pWriter->iColumn ){
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  /* Positions within a column must be added in ascending order. */
  assert( iPos>=pWriter->iPos );
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
994static void plwCopy(PLWriter *pWriter, PLReader *pReader){
995 plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
996 plrStartOffset(pReader), plrEndOffset(pReader));
997}
/* Begin a new document in dlw's doclist: write the delta-encoded
** docid, then reset the position-list delta state for plwAdd().
*/
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  /* Docids are delta-encoded against the writer's previous docid. */
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  /* Fresh delta state for this document's position list. */
  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
1017/* TODO(shess) Should plwDestroy() also terminate the doclist? But
1018** then plwDestroy() would no longer be just a destructor, it would
1019** also be doing work, which isn't consistent with the overall idiom.
1020** Another option would be for plwAdd() to always append any necessary
1021** terminator, so that the output is always correct. But that would
1022** add incremental work to the common case with the only benefit being
1023** API elegance. Punt for now.
1024*/
1025static void plwTerminate(PLWriter *pWriter){
1026 if( pWriter->dlw->iType>DL_DOCIDS ){
1027 char c[VARINT_MAX];
1028 int n = putVarint(c, POS_END);
1029 dataBufferAppend(pWriter->dlw->b, c, n);
1030 }
1031#ifndef NDEBUG
1032 /* Mark as terminated for assert in plwAdd(). */
1033 pWriter->iPos = -1;
1034#endif
1035}
static void plwDestroy(PLWriter *pWriter){
  /* Overwrite the struct (presumably a debug-build poison -- see
  ** SCRAMBLE's definition). No memory is freed; the associated
  ** DLWriter and its buffer are untouched.
  */
  SCRAMBLE(pWriter);
}
1039
1040/*******************************************************************/
1041/* DLCollector wraps PLWriter and DLWriter to provide a
1042** dynamically-allocated doclist area to use during tokenization.
1043**
1044** dlcNew - malloc up and initialize a collector.
1045** dlcDelete - destroy a collector and all contained items.
1046** dlcAddPos - append position and offset information.
1047** dlcAddDoclist - add the collected doclist to the given buffer.
1048** dlcNext - terminate the current document and open another.
1049*/
typedef struct DLCollector {
  DataBuffer b;    /* the doclist under construction */
  DLWriter dlw;    /* writes doclist data into b (see dlcNew()) */
  PLWriter plw;    /* writes the current document's positions via dlw */
} DLCollector;
1055
1056/* TODO(shess) This could also be done by calling plwTerminate() and
1057** dataBufferAppend(). I tried that, expecting nominal performance
1058** differences, but it seemed to pretty reliably be worth 1% to code
1059** it this way. I suspect it's the incremental malloc overhead (some
1060** percentage of the plwTerminate() calls will cause a realloc), so
1061** this might be worth revisiting if the DataBuffer implementation
1062** changes.
1063*/
1064static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
1065 if( pCollector->dlw.iType>DL_DOCIDS ){
1066 char c[VARINT_MAX];
1067 int n = putVarint(c, POS_END);
1068 dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
1069 }else{
1070 dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
1071 }
1072}
/* Terminate the current document and open a new one for iDocid.
** Docids must ascend (asserted in plwInit()).
*/
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
/* Append position/offset information for the current document. */
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}
1082
1083static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
1084 DLCollector *pCollector = malloc(sizeof(DLCollector));
1085 dataBufferInit(&pCollector->b, 0);
1086 dlwInit(&pCollector->dlw, iType, &pCollector->b);
1087 plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
1088 return pCollector;
1089}
/* Destroy a collector: tear down members in reverse order of
** construction (see dlcNew()), then free the collector itself.
*/
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  free(pCollector);
}
1097
1098
1099/* Copy the doclist data of iType in pData/nData into *out, trimming
1100** unnecessary data as we go. Only columns matching iColumn are
1101** copied, all columns copied if iColumn is -1. Elements with no
1102** matching columns are dropped. The output is an iOutType doclist.
1103*/
1104/* NOTE(shess) This code is only valid after all doclists are merged.
1105** If this is run before merges, then doclist items which represent
1106** deletion will be trimmed, and will thus not effect a deletion
1107** during the merge.
1108*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  /* Trimming can only discard information, never add it. */
  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;   /* true once this document has a position in iColumn */

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        /* Open the output document lazily on the first matching
        ** position, so documents with no matches are dropped entirely.
        */
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
}
1148
1149/* Used by docListMerge() to keep doclists in the ascending order by
1150** docid, then ascending order by age (so the newest comes first).
1151*/
typedef struct OrderedDLReader {
  DLReader *pReader;

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;   /* index of pReader in the original pReaders array; larger
             ** idx means newer (see docListMerge()) */
} OrderedDLReader;
1161
1162/* Order eof to end, then by docid asc, idx desc. */
1163static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
1164 if( dlrAtEnd(r1->pReader) ){
1165 if( dlrAtEnd(r2->pReader) ) return 0; /* Both atEnd(). */
1166 return 1; /* Only r1 atEnd(). */
1167 }
1168 if( dlrAtEnd(r2->pReader) ) return -1; /* Only r2 atEnd(). */
1169
1170 if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
1171 if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
1172
1173 /* Descending on idx. */
1174 return r2->idx-r1->idx;
1175}
1176
1177/* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that
1178** p[1..n-1] is already sorted.
1179*/
1180/* TODO(shess) Is this frequent enough to warrant a binary search?
1181** Before implementing that, instrument the code to check. In most
1182** current usage, I expect that p[0] will be less than p[1] a very
1183** high proportion of the time.
1184*/
1185static void orderedDLReaderReorder(OrderedDLReader *p, int n){
1186 while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
1187 OrderedDLReader tmp = p[0];
1188 p[0] = p[1];
1189 p[1] = tmp;
1190 n--;
1191 p++;
1192 }
1193}
1194
1195/* Given an array of doclist readers, merge their doclist elements
1196** into out in sorted order (by docid), dropping elements from older
1197** readers when there is a duplicate docid. pReaders is assumed to be
1198** ordered by age, oldest first.
1199*/
1200/* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably
1201** be fixed.
1202*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  /* Run of contiguous input being coalesced into one dlwAppend(). */
  const char *pStart = 0;
  int nStart = 0;
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    /* Single input: the merge is a straight copy. */
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs. Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. (i==nReaders here, so this
  ** sorts the array from the back forward.)
  */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer. memcpy() seems to be more efficient if it has
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
1274
1275/* Helper function for posListUnion(). Compares the current position
1276** between left and right, returning as standard C idiom of <0 if
1277** left<right, >0 if left>right, and 0 if left==right. "End" always
1278** compares greater.
1279*/
1280static int posListCmp(PLReader *pLeft, PLReader *pRight){
1281 assert( pLeft->iType==pRight->iType );
1282 if( pLeft->iType==DL_DOCIDS ) return 0;
1283
1284 if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
1285 if( plrAtEnd(pRight) ) return -1;
1286
1287 if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
1288 if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
1289
1290 if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
1291 if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
1292 if( pLeft->iType==DL_POSITIONS ) return 0;
1293
1294 if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
1295 if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
1296
1297 if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
1298 if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
1299
1300 return 0;
1301}
1302
1303/* Write the union of position lists in pLeft and pRight to pOut.
1304** "Union" in this case meaning "All unique position tuples". Should
1305** work with any doclist type, though both inputs and the output
1306** should be the same type.
1307*/
1308static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
1309 PLReader left, right;
1310 PLWriter writer;
1311
1312 assert( dlrDocid(pLeft)==dlrDocid(pRight) );
1313 assert( pLeft->iType==pRight->iType );
1314 assert( pLeft->iType==pOut->iType );
1315
1316 plrInit(&left, pLeft);
1317 plrInit(&right, pRight);
1318 plwInit(&writer, pOut, dlrDocid(pLeft));
1319
1320 while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
1321 int c = posListCmp(&left, &right);
1322 if( c<0 ){
1323 plwCopy(&writer, &left);
1324 plrStep(&left);
1325 }else if( c>0 ){
1326 plwCopy(&writer, &right);
1327 plrStep(&right);
1328 }else{
1329 plwCopy(&writer, &left);
1330 plrStep(&left);
1331 plrStep(&right);
1332 }
1333 }
1334
1335 plwTerminate(&writer);
1336 plwDestroy(&writer);
1337 plrDestroy(&left);
1338 plrDestroy(&right);
1339}
1340
1341/* Write the union of doclists in pLeft and pRight to pOut. For
1342** docids in common between the inputs, the union of the position
1343** lists is written. Inputs and outputs are always type DL_DEFAULT.
1344*/
1345static void docListUnion(
1346 const char *pLeft, int nLeft,
1347 const char *pRight, int nRight,
1348 DataBuffer *pOut /* Write the combined doclist here */
1349){
1350 DLReader left, right;
1351 DLWriter writer;
1352
1353 if( nLeft==0 ){
1354 dataBufferAppend(pOut, pRight, nRight);
1355 return;
1356 }
1357 if( nRight==0 ){
1358 dataBufferAppend(pOut, pLeft, nLeft);
1359 return;
1360 }
1361
1362 dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
1363 dlrInit(&right, DL_DEFAULT, pRight, nRight);
1364 dlwInit(&writer, DL_DEFAULT, pOut);
1365
1366 while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
1367 if( dlrAtEnd(&right) ){
1368 dlwCopy(&writer, &left);
1369 dlrStep(&left);
1370 }else if( dlrAtEnd(&left) ){
1371 dlwCopy(&writer, &right);
1372 dlrStep(&right);
1373 }else if( dlrDocid(&left)<dlrDocid(&right) ){
1374 dlwCopy(&writer, &left);
1375 dlrStep(&left);
1376 }else if( dlrDocid(&left)>dlrDocid(&right) ){
1377 dlwCopy(&writer, &right);
1378 dlrStep(&right);
1379 }else{
1380 posListUnion(&left, &right, &writer);
1381 dlrStep(&left);
1382 dlrStep(&right);
1383 }
1384 }
1385
1386 dlrDestroy(&left);
1387 dlrDestroy(&right);
1388 dlwDestroy(&writer);
1389}
1390
1391/* pLeft and pRight are DLReaders positioned to the same docid.
1392**
1393** If there are no instances in pLeft or pRight where the position
1394** of pLeft is one less than the position of pRight, then this
1395** routine adds nothing to pOut.
1396**
1397** If there are one or more instances where positions from pLeft
1398** are exactly one less than positions from pRight, then add a new
1399** document record to pOut. If pOut wants to hold positions, then
1400** include the positions from pRight that are one more than a
1401** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
1402*/
static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;
  int match = 0;   /* true once an adjacent pair has been found */

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pOut->iType!=DL_POSITIONS_OFFSETS );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);

  /* Walk both position lists in step, advancing whichever side is
  ** behind, looking for a right-hand position exactly one past a
  ** left-hand position in the same column.
  */
  while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
    if( plrColumn(&left)<plrColumn(&right) ){
      plrStep(&left);
    }else if( plrColumn(&left)>plrColumn(&right) ){
      plrStep(&right);
    }else if( plrPosition(&left)+1<plrPosition(&right) ){
      plrStep(&left);
    }else if( plrPosition(&left)+1>plrPosition(&right) ){
      plrStep(&right);
    }else{
      /* Adjacent pair found. Open the output document lazily so
      ** documents without a phrase match contribute nothing to pOut.
      */
      if( !match ){
        plwInit(&writer, pOut, dlrDocid(pLeft));
        match = 1;
      }
      /* Record the position of the phrase's trailing term; offsets
      ** are not propagated (pOut is never DL_POSITIONS_OFFSETS).
      */
      plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
      plrStep(&left);
      plrStep(&right);
    }
  }

  if( match ){
    plwTerminate(&writer);
    plwDestroy(&writer);
  }

  plrDestroy(&left);
  plrDestroy(&right);
}
1443
1444/* We have two doclists with positions: pLeft and pRight.
1445** Write the phrase intersection of these two doclists into pOut.
1446**
1447** A phrase intersection means that two documents only match
1448** if pLeft.iPos+1==pRight.iPos.
1449**
1450** iType controls the type of data written to pOut. If iType is
1451** DL_POSITIONS, the positions are those from pRight.
1452*/
1453static void docListPhraseMerge(
1454 const char *pLeft, int nLeft,
1455 const char *pRight, int nRight,
1456 DocListType iType,
1457 DataBuffer *pOut /* Write the combined doclist here */
1458){
1459 DLReader left, right;
1460 DLWriter writer;
1461
1462 if( nLeft==0 || nRight==0 ) return;
1463
1464 assert( iType!=DL_POSITIONS_OFFSETS );
1465
1466 dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
1467 dlrInit(&right, DL_POSITIONS, pRight, nRight);
1468 dlwInit(&writer, iType, pOut);
1469
1470 while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
1471 if( dlrDocid(&left)<dlrDocid(&right) ){
1472 dlrStep(&left);
1473 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1474 dlrStep(&right);
1475 }else{
1476 posListPhraseMerge(&left, &right, &writer);
1477 dlrStep(&left);
1478 dlrStep(&right);
1479 }
1480 }
1481
1482 dlrDestroy(&left);
1483 dlrDestroy(&right);
1484 dlwDestroy(&writer);
1485}
1486
1487/* We have two DL_DOCIDS doclists: pLeft and pRight.
1488** Write the intersection of these two doclists into pOut as a
1489** DL_DOCIDS doclist.
1490*/
1491static void docListAndMerge(
1492 const char *pLeft, int nLeft,
1493 const char *pRight, int nRight,
1494 DataBuffer *pOut /* Write the combined doclist here */
1495){
1496 DLReader left, right;
1497 DLWriter writer;
1498
1499 if( nLeft==0 || nRight==0 ) return;
1500
1501 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1502 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1503 dlwInit(&writer, DL_DOCIDS, pOut);
1504
1505 while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
1506 if( dlrDocid(&left)<dlrDocid(&right) ){
1507 dlrStep(&left);
1508 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1509 dlrStep(&right);
1510 }else{
1511 dlwAdd(&writer, dlrDocid(&left));
1512 dlrStep(&left);
1513 dlrStep(&right);
1514 }
1515 }
1516
1517 dlrDestroy(&left);
1518 dlrDestroy(&right);
1519 dlwDestroy(&writer);
1520}
1521
1522/* We have two DL_DOCIDS doclists: pLeft and pRight.
1523** Write the union of these two doclists into pOut as a
1524** DL_DOCIDS doclist.
1525*/
1526static void docListOrMerge(
1527 const char *pLeft, int nLeft,
1528 const char *pRight, int nRight,
1529 DataBuffer *pOut /* Write the combined doclist here */
1530){
1531 DLReader left, right;
1532 DLWriter writer;
1533
1534 if( nLeft==0 ){
1535 dataBufferAppend(pOut, pRight, nRight);
1536 return;
1537 }
1538 if( nRight==0 ){
1539 dataBufferAppend(pOut, pLeft, nLeft);
1540 return;
1541 }
1542
1543 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1544 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1545 dlwInit(&writer, DL_DOCIDS, pOut);
1546
1547 while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
1548 if( dlrAtEnd(&right) ){
1549 dlwAdd(&writer, dlrDocid(&left));
1550 dlrStep(&left);
1551 }else if( dlrAtEnd(&left) ){
1552 dlwAdd(&writer, dlrDocid(&right));
1553 dlrStep(&right);
1554 }else if( dlrDocid(&left)<dlrDocid(&right) ){
1555 dlwAdd(&writer, dlrDocid(&left));
1556 dlrStep(&left);
1557 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1558 dlwAdd(&writer, dlrDocid(&right));
1559 dlrStep(&right);
1560 }else{
1561 dlwAdd(&writer, dlrDocid(&left));
1562 dlrStep(&left);
1563 dlrStep(&right);
1564 }
1565 }
1566
1567 dlrDestroy(&left);
1568 dlrDestroy(&right);
1569 dlwDestroy(&writer);
1570}
1571
1572/* We have two DL_DOCIDS doclists: pLeft and pRight.
1573** Write into pOut as DL_DOCIDS doclist containing all documents that
1574** occur in pLeft but not in pRight.
1575*/
1576static void docListExceptMerge(
1577 const char *pLeft, int nLeft,
1578 const char *pRight, int nRight,
1579 DataBuffer *pOut /* Write the combined doclist here */
1580){
1581 DLReader left, right;
1582 DLWriter writer;
1583
1584 if( nLeft==0 ) return;
1585 if( nRight==0 ){
1586 dataBufferAppend(pOut, pLeft, nLeft);
1587 return;
1588 }
1589
1590 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1591 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1592 dlwInit(&writer, DL_DOCIDS, pOut);
1593
1594 while( !dlrAtEnd(&left) ){
1595 while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
1596 dlrStep(&right);
1597 }
1598 if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
1599 dlwAdd(&writer, dlrDocid(&left));
1600 }
1601 dlrStep(&left);
1602 }
1603
1604 dlrDestroy(&left);
1605 dlrDestroy(&right);
1606 dlwDestroy(&writer);
1607}
1608
/* Return a malloced, NUL-terminated copy of the first n bytes of s.
** Returns NULL if the allocation fails (previously a failed malloc()
** was passed straight to memcpy() -- undefined behavior). The caller
** must free() the result.
*/
static char *string_dup_n(const char *s, int n){
  char *str = malloc(n + 1);
  if( str==NULL ) return NULL;   /* OOM: propagate to caller */
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
1615
/* Duplicate a NUL-terminated string into malloced memory; the caller
** must free() the result. strdup() is deliberately avoided because it
** is not part of the standard C library and may be unavailable on
** some platforms.
*/
static char *string_dup(const char *s){
  size_t n = strlen(s);
  return string_dup_n(s, n);
}
1622
/* Format a string, replacing each occurrence of the % character with
 * zDb.zName. This may be more convenient than sqlite_mprintf()
 * when one string is used repeatedly in a format string.
 * Returns NULL if the allocation fails (previously a failed malloc()
 * was written through unconditionally -- undefined behavior).
 * The caller must free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* "zDb" + "." + "zName" */
  char *result;
  char *r;

  /* first compute length needed */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = malloc(len);
  if( result==NULL ) return NULL;   /* OOM: propagate to caller */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
1659
1660static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
1661 const char *zFormat){
1662 char *zCommand = string_format(zFormat, zDb, zName);
1663 int rc;
1664 TRACE(("FTS2 sql: %s\n", zCommand));
1665 rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
1666 free(zCommand);
1667 return rc;
1668}
1669
1670static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
1671 sqlite3_stmt **ppStmt, const char *zFormat){
1672 char *zCommand = string_format(zFormat, zDb, zName);
1673 int rc;
1674 TRACE(("FTS2 prepare: %s\n", zCommand));
1675 rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
1676 free(zCommand);
1677 return rc;
1678}
1679
1680/* end utility functions */
1681
1682/* Forward reference */
1683typedef struct fulltext_vtab fulltext_vtab;
1684
/* A single term in a query is represented by an instance of
1686** the following structure.
1687*/
typedef struct QueryTerm {
  /* Terms live in Query.pTerms[] (see the Query struct below). */
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr; /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm; /* text of the term. '\000' terminated. malloced */
  int nTerm; /* Number of bytes in pTerm[] */
} QueryTerm;
1698
1699
1700/* A query string is parsed into a Query structure.
1701 *
1702 * We could, in theory, allow query strings to be complicated
1703 * nested expressions with precedence determined by parentheses.
1704 * But none of the major search engines do this. (Perhaps the
 * feeling is that a parenthesized expression is too complex of
1706 * an idea for the average user to grasp.) Taking our lead from
1707 * the major search engines, we will allow queries to be a list
1708 * of terms (with an implied AND operator) or phrases in double-quotes,
1709 * with a single optional "-" before each non-phrase term to designate
1710 * negation and an optional OR connector.
1711 *
1712 * OR binds more tightly than the implied AND, which is what the
1713 * major search engines seem to do. So, for example:
1714 *
1715 * [one two OR three] ==> one AND (two OR three)
1716 * [one OR two three] ==> (one OR two) AND three
1717 *
1718 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
1720 * space. This is how the search engines do it.
1721 *
1722 * A NOT term cannot be the right-hand operand of an OR. If this
1723 * occurs in the query string, the NOT is ignored:
1724 *
1725 * [one OR -two] ==> one OR two
1726 *
1727 */
typedef struct Query {
  fulltext_vtab *pFts; /* The full text index */
  int nTerms; /* Number of terms in the query */
  QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
  /* nextIsOr/nextColumn are transient state consumed as the next term
  ** is parsed (see their comments). */
  int nextIsOr; /* Set the isOr flag on the next inserted term */
  int nextColumn; /* Next word parsed must be in this column */
  int dfltColumn; /* The default column */
} Query;
1736
1737
1738/*
1739** An instance of the following structure keeps track of generated
1740** matching-word offset information and snippets.
1741*/
typedef struct Snippet {
  /* aMatch[] holds nMatch entries; nAlloc is its allocated capacity. */
  int nMatch; /* Total number of matches */
  int nAlloc; /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus; /* Status flag for use while constructing snippets */
    short int iCol; /* The column that contains the match */
    short int iTerm; /* The index in Query.pTerms[] of the matching term */
    short int nByte; /* Number of bytes in the term */
    int iStart; /* The offset to the first character of the term */
  } *aMatch; /* Points to space obtained from malloc */
  char *zOffset; /* Text rendering of aMatch[] */
  int nOffset; /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet; /* strlen(zSnippet) */
} Snippet;
1757
1758
/* Scan strategy for a cursor; stored in fulltext_cursor.iCursorType,
** which is a copy of sqlite3_index_info.idxNum.
*/
typedef enum QueryType {
  QUERY_GENERIC, /* table scan */
  QUERY_ROWID, /* lookup by rowid */
  QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
1764
/* Indexes into fulltext_vtab.pFulltextStatements[]. The order must
** exactly match fulltext_zStatement[] below.
*/
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,

  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,

  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_ALL_STMT,

  MAX_STMT /* Always at end! */
} fulltext_statement;
1784
1785/* These must exactly match the enum above. */
1786/* TODO(shess): Is there some risk that a statement will be used in two
1787** cursors at once, e.g. if a query joins a virtual table to itself?
1788** If so perhaps we should move some of these to the cursor object.
1789*/
/* The "%" in each statement expands to "<zDb>.<zName>" via
** string_format() (see sql_prepare()/sql_exec()).
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where rowid = ?",

  /* BLOCK_INSERT */ "insert into %_segments values (?)",
  /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
  /* SEGDIR_SELECT_ALL */
  "select root, leaves_end_block from %_segdir order by level desc, idx",
};
1812
1813/*
1814** A connection to a fulltext index is an instance of the following
1815** structure. The xCreate and xConnect methods create an instance
1816** of this structure and xDestroy and xDisconnect free that instance.
1817** All other methods receive a pointer to the structure as one of their
1818** arguments.
1819*/
struct fulltext_vtab {
  sqlite3_vtab base; /* Base class used by SQLite core */
  sqlite3 *db; /* The database connection */
  const char *zDb; /* logical database name */
  const char *zName; /* virtual table name */
  int nColumn; /* number of columns in virtual table */
  char **azColumn; /* column names. malloced */
  char **azContentColumn; /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open. Indexed by the fulltext_statement enum above.
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];

  /* Precompiled statements used for segment merges. We run a
  ** separate select across the leaf level of each tree being merged.
  */
  sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
  /* The statement used to prepare pLeafSelectStmts. */
#define LEAF_SELECT \
  "select block from %_segments where rowid between ? and ? order by rowid"

  /* These buffer pending index updates during transactions.
  ** nPendingData estimates the memory size of the pending data. It
  ** doesn't include the hash-bucket overhead, nor any malloc
  ** overhead. When nPendingData exceeds kPendingThreshold, the
  ** buffer is flushed even before the transaction closes.
  ** pendingTerms stores the data, and is only valid when nPendingData
  ** is >=0 (nPendingData<0 means pendingTerms has not been
  ** initialized). iPrevDocid is the last docid written, used to make
  ** certain we're inserting in sorted order.
  */
  int nPendingData;
#define kPendingThreshold (1*1024*1024)
  sqlite_int64 iPrevDocid;
  fts2Hash pendingTerms; /* NOTE(review): values look like per-term
                         ** doclist data (see DLCollector) -- confirm
                         ** against the insert path, not shown here. */
};
1858
1859/*
1860** When the core wants to do a query, it create a cursor using a
1861** call to xOpen. This structure is an instance of a cursor. It
1862** is destroyed by xClose.
1863*/
1864typedef struct fulltext_cursor {
1865 sqlite3_vtab_cursor base; /* Base class used by SQLite core */
1866 QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
1867 sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
1868 int eof; /* True if at End Of Results */
1869 Query q; /* Parsed query string */
1870 Snippet snippet; /* Cached snippet for the current row */
1871 int iColumn; /* Column being searched */
1872 DataBuffer result; /* Doclist results from fulltextQuery */
1873 DLReader reader; /* Result reader if result not empty */
1874} fulltext_cursor;
1875
/* Recover the fulltext_vtab that owns cursor c (the core stores it in
** the cursor's base class when the cursor is opened). */
static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
  return (fulltext_vtab *) c->base.pVtab;
}
1879
1880static const sqlite3_module fts2Module; /* forward declaration */
1881
1882/* Return a dynamically generated statement of the form
1883 * insert into %_content (rowid, ...) values (?, ...)
1884 */
1885static const char *contentInsertStatement(fulltext_vtab *v){
1886 StringBuffer sb;
1887 int i;
1888
1889 initStringBuffer(&sb);
1890 append(&sb, "insert into %_content (rowid, ");
1891 appendList(&sb, v->nColumn, v->azContentColumn);
1892 append(&sb, ") values (?");
1893 for(i=0; i<v->nColumn; ++i)
1894 append(&sb, ", ?");
1895 append(&sb, ")");
1896 return stringBufferData(&sb);
1897}
1898
1899/* Return a dynamically generated statement of the form
1900 * update %_content set [col_0] = ?, [col_1] = ?, ...
1901 * where rowid = ?
1902 */
1903static const char *contentUpdateStatement(fulltext_vtab *v){
1904 StringBuffer sb;
1905 int i;
1906
1907 initStringBuffer(&sb);
1908 append(&sb, "update %_content set ");
1909 for(i=0; i<v->nColumn; ++i) {
1910 if( i>0 ){
1911 append(&sb, ", ");
1912 }
1913 append(&sb, v->azContentColumn[i]);
1914 append(&sb, " = ?");
1915 }
1916 append(&sb, " where rowid = ?");
1917 return stringBufferData(&sb);
1918}
1919
1920/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
1921** If the indicated statement has never been prepared, it is prepared
1922** and cached, otherwise the cached version is reset.
1923*/
1924static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
1925 sqlite3_stmt **ppStmt){
1926 assert( iStmt<MAX_STMT );
1927 if( v->pFulltextStatements[iStmt]==NULL ){
1928 const char *zStmt;
1929 int rc;
1930 switch( iStmt ){
1931 case CONTENT_INSERT_STMT:
1932 zStmt = contentInsertStatement(v); break;
1933 case CONTENT_UPDATE_STMT:
1934 zStmt = contentUpdateStatement(v); break;
1935 default:
1936 zStmt = fulltext_zStatement[iStmt];
1937 }
1938 rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
1939 zStmt);
1940 if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
1941 if( rc!=SQLITE_OK ) return rc;
1942 } else {
1943 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
1944 if( rc!=SQLITE_OK ) return rc;
1945 }
1946
1947 *ppStmt = v->pFulltextStatements[iStmt];
1948 return SQLITE_OK;
1949}
1950
1951/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
1952** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE,
1953** where we expect no results.
1954*/
1955static int sql_single_step(sqlite3_stmt *s){
1956 int rc = sqlite3_step(s);
1957 return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1958}
1959
1960/* Like sql_get_statement(), but for special replicated LEAF_SELECT
1961** statements.
1962*/
1963/* TODO(shess) Write version for generic statements and then share
1964** that between the cached-statement functions.
1965*/
1966static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
1967 sqlite3_stmt **ppStmt){
1968 assert( idx>=0 && idx<MERGE_COUNT );
1969 if( v->pLeafSelectStmts[idx]==NULL ){
1970 int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
1971 LEAF_SELECT);
1972 if( rc!=SQLITE_OK ) return rc;
1973 }else{
1974 int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
1975 if( rc!=SQLITE_OK ) return rc;
1976 }
1977
1978 *ppStmt = v->pLeafSelectStmts[idx];
1979 return SQLITE_OK;
1980}
1981
1982/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
1983static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
1984 sqlite3_value **pValues){
1985 sqlite3_stmt *s;
1986 int i;
1987 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
1988 if( rc!=SQLITE_OK ) return rc;
1989
1990 rc = sqlite3_bind_value(s, 1, rowid);
1991 if( rc!=SQLITE_OK ) return rc;
1992
1993 for(i=0; i<v->nColumn; ++i){
1994 rc = sqlite3_bind_value(s, 2+i, pValues[i]);
1995 if( rc!=SQLITE_OK ) return rc;
1996 }
1997
1998 return sql_single_step(s);
1999}
2000
2001/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
2002 * where rowid = [iRowid] */
2003static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
2004 sqlite_int64 iRowid){
2005 sqlite3_stmt *s;
2006 int i;
2007 int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
2008 if( rc!=SQLITE_OK ) return rc;
2009
2010 for(i=0; i<v->nColumn; ++i){
2011 rc = sqlite3_bind_value(s, 1+i, pValues[i]);
2012 if( rc!=SQLITE_OK ) return rc;
2013 }
2014
2015 rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
2016 if( rc!=SQLITE_OK ) return rc;
2017
2018 return sql_single_step(s);
2019}
2020
/* Free an array of nString malloced strings along with the array
** itself.  NULL entries are permitted; free(NULL) is defined as a
** no-op, so no per-entry guard is needed (the previous version
** carried a redundant NULL check).
*/
static void freeStringArray(int nString, const char **pString){
  int i;

  for(i=0; i<nString; ++i){
    free((void *) pString[i]);
  }
  free((void *) pString);
}
2029
2030/* select * from %_content where rowid = [iRow]
2031 * The caller must delete the returned array and all strings in it.
2032 * null fields will be NULL in the returned array.
2033 *
2034 * TODO: Perhaps we should return pointer/length strings here for consistency
2035 * with other code which uses pointer/length. */
2036static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
2037 const char ***pValues){
2038 sqlite3_stmt *s;
2039 const char **values;
2040 int i;
2041 int rc;
2042
2043 *pValues = NULL;
2044
2045 rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
2046 if( rc!=SQLITE_OK ) return rc;
2047
2048 rc = sqlite3_bind_int64(s, 1, iRow);
2049 if( rc!=SQLITE_OK ) return rc;
2050
2051 rc = sqlite3_step(s);
2052 if( rc!=SQLITE_ROW ) return rc;
2053
2054 values = (const char **) malloc(v->nColumn * sizeof(const char *));
2055 for(i=0; i<v->nColumn; ++i){
2056 if( sqlite3_column_type(s, i)==SQLITE_NULL ){
2057 values[i] = NULL;
2058 }else{
2059 values[i] = string_dup((char*)sqlite3_column_text(s, i));
2060 }
2061 }
2062
2063 /* We expect only one row. We must execute another sqlite3_step()
2064 * to complete the iteration; otherwise the table will remain locked. */
2065 rc = sqlite3_step(s);
2066 if( rc==SQLITE_DONE ){
2067 *pValues = values;
2068 return SQLITE_OK;
2069 }
2070
2071 freeStringArray(v->nColumn, values);
2072 return rc;
2073}
2074
2075/* delete from %_content where rowid = [iRow ] */
2076static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
2077 sqlite3_stmt *s;
2078 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
2079 if( rc!=SQLITE_OK ) return rc;
2080
2081 rc = sqlite3_bind_int64(s, 1, iRow);
2082 if( rc!=SQLITE_OK ) return rc;
2083
2084 return sql_single_step(s);
2085}
2086
2087/* insert into %_segments values ([pData])
2088** returns assigned rowid in *piBlockid
2089*/
2090static int block_insert(fulltext_vtab *v, const char *pData, int nData,
2091 sqlite_int64 *piBlockid){
2092 sqlite3_stmt *s;
2093 int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
2094 if( rc!=SQLITE_OK ) return rc;
2095
2096 rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
2097 if( rc!=SQLITE_OK ) return rc;
2098
2099 rc = sqlite3_step(s);
2100 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2101 if( rc!=SQLITE_DONE ) return rc;
2102
2103 *piBlockid = sqlite3_last_insert_rowid(v->db);
2104 return SQLITE_OK;
2105}
2106
2107/* delete from %_segments
2108** where rowid between [iStartBlockid] and [iEndBlockid]
2109**
2110** Deletes the range of blocks, inclusive, used to delete the blocks
2111** which form a segment.
2112*/
2113static int block_delete(fulltext_vtab *v,
2114 sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
2115 sqlite3_stmt *s;
2116 int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
2117 if( rc!=SQLITE_OK ) return rc;
2118
2119 rc = sqlite3_bind_int64(s, 1, iStartBlockid);
2120 if( rc!=SQLITE_OK ) return rc;
2121
2122 rc = sqlite3_bind_int64(s, 2, iEndBlockid);
2123 if( rc!=SQLITE_OK ) return rc;
2124
2125 return sql_single_step(s);
2126}
2127
/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
** at iLevel.  Returns SQLITE_DONE if there are no segments at
** iLevel.  Otherwise returns an error.
*/
static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* Should always get at least one row due to how max() works. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;
  if( rc!=SQLITE_ROW ) return rc;

  /* NULL means that there were no inputs to max(). */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* Drain the statement so the table lock is released, and report
    ** "no segments" as SQLITE_DONE (any extra row is an error). */
    rc = sqlite3_step(s);
    if( rc==SQLITE_ROW ) return SQLITE_ERROR;
    return rc;
  }

  *pidx = sqlite3_column_int(s, 0);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2161
2162/* insert into %_segdir values (
2163** [iLevel], [idx],
2164** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
2165** [pRootData]
2166** )
2167*/
2168static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
2169 sqlite_int64 iStartBlockid,
2170 sqlite_int64 iLeavesEndBlockid,
2171 sqlite_int64 iEndBlockid,
2172 const char *pRootData, int nRootData){
2173 sqlite3_stmt *s;
2174 int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
2175 if( rc!=SQLITE_OK ) return rc;
2176
2177 rc = sqlite3_bind_int(s, 1, iLevel);
2178 if( rc!=SQLITE_OK ) return rc;
2179
2180 rc = sqlite3_bind_int(s, 2, idx);
2181 if( rc!=SQLITE_OK ) return rc;
2182
2183 rc = sqlite3_bind_int64(s, 3, iStartBlockid);
2184 if( rc!=SQLITE_OK ) return rc;
2185
2186 rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
2187 if( rc!=SQLITE_OK ) return rc;
2188
2189 rc = sqlite3_bind_int64(s, 5, iEndBlockid);
2190 if( rc!=SQLITE_OK ) return rc;
2191
2192 rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
2193 if( rc!=SQLITE_OK ) return rc;
2194
2195 return sql_single_step(s);
2196}
2197
/* Queries %_segdir for the block span of the segments in level
** iLevel.  Returns SQLITE_DONE if there are no blocks for iLevel,
** SQLITE_ROW if there are blocks, else an error.
*/
static int segdir_span(fulltext_vtab *v, int iLevel,
                       sqlite_int64 *piStartBlockid,
                       sqlite_int64 *piEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* min()/max() always produce a row, even if it is all-NULL. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
  if( rc!=SQLITE_ROW ) return rc;

  /* This happens if all segments at this level are entirely inline. */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* We expect only one row.  We must execute another sqlite3_step()
     * to complete the iteration; otherwise the table will remain locked. */
    int rc2 = sqlite3_step(s);
    if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
    return rc2;  /* SQLITE_DONE here means "no blocks" to the caller */
  }

  *piStartBlockid = sqlite3_column_int64(s, 0);
  *piEndBlockid = sqlite3_column_int64(s, 1);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2235
2236/* Delete the segment blocks and segment directory records for all
2237** segments at iLevel.
2238*/
2239static int segdir_delete(fulltext_vtab *v, int iLevel){
2240 sqlite3_stmt *s;
2241 sqlite_int64 iStartBlockid, iEndBlockid;
2242 int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
2243 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
2244
2245 if( rc==SQLITE_ROW ){
2246 rc = block_delete(v, iStartBlockid, iEndBlockid);
2247 if( rc!=SQLITE_OK ) return rc;
2248 }
2249
2250 /* Delete the segment directory itself. */
2251 rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
2252 if( rc!=SQLITE_OK ) return rc;
2253
2254 rc = sqlite3_bind_int64(s, 1, iLevel);
2255 if( rc!=SQLITE_OK ) return rc;
2256
2257 return sql_single_step(s);
2258}
2259
2260/* TODO(shess) clearPendingTerms() is far down the file because
2261** writeZeroSegment() is far down the file because LeafWriter is far
2262** down the file. Consider refactoring the code to move the non-vtab
2263** code above the vtab code so that we don't need this forward
2264** reference.
2265*/
2266static int clearPendingTerms(fulltext_vtab *v);
2267
2268/*
2269** Free the memory used to contain a fulltext_vtab structure.
2270*/
2271static void fulltext_vtab_destroy(fulltext_vtab *v){
2272 int iStmt, i;
2273
2274 TRACE(("FTS2 Destroy %p\n", v));
2275 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
2276 if( v->pFulltextStatements[iStmt]!=NULL ){
2277 sqlite3_finalize(v->pFulltextStatements[iStmt]);
2278 v->pFulltextStatements[iStmt] = NULL;
2279 }
2280 }
2281
2282 for( i=0; i<MERGE_COUNT; i++ ){
2283 if( v->pLeafSelectStmts[i]!=NULL ){
2284 sqlite3_finalize(v->pLeafSelectStmts[i]);
2285 v->pLeafSelectStmts[i] = NULL;
2286 }
2287 }
2288
2289 if( v->pTokenizer!=NULL ){
2290 v->pTokenizer->pModule->xDestroy(v->pTokenizer);
2291 v->pTokenizer = NULL;
2292 }
2293
2294 clearPendingTerms(v);
2295
2296 free(v->azColumn);
2297 for(i = 0; i < v->nColumn; ++i) {
2298 sqlite3_free(v->azContentColumn[i]);
2299 }
2300 free(v->azContentColumn);
2301 free(v);
2302}
2303
2304/*
2305** Token types for parsing the arguments to xConnect or xCreate.
2306*/
2307#define TOKEN_EOF 0 /* End of file */
2308#define TOKEN_SPACE 1 /* Any kind of whitespace */
2309#define TOKEN_ID 2 /* An identifier */
2310#define TOKEN_STRING 3 /* A string literal */
2311#define TOKEN_PUNCT 4 /* A single punctuation character */
2312
2313/*
2314** If X is a character that can be used in an identifier then
2315** IdChar(X) will be true. Otherwise it is false.
2316**
2317** For ASCII, any character with the high-order bit set is
2318** allowed in an identifier. For 7-bit characters,
2319** sqlite3IsIdChar[X] must be 1.
2320**
2321** Ticket #1066. the SQL standard does not allow '$' in the
2322** middle of identfiers. But many SQL implementations do.
2323** SQLite will allow '$' in identifiers for compatibility.
2324** But the feature is undocumented.
2325*/
2326static const char isIdChar[] = {
2327/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
2328 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
2329 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
2330 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
2331 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
2332 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
2333 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
2334};
2335#define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
2336
2337
2338/*
2339** Return the length of the token that begins at z[0].
2340** Store the token type in *tokenType before returning.
2341*/
2342static int getToken(const char *z, int *tokenType){
2343 int i, c;
2344 switch( *z ){
2345 case 0: {
2346 *tokenType = TOKEN_EOF;
2347 return 0;
2348 }
2349 case ' ': case '\t': case '\n': case '\f': case '\r': {
2350 for(i=1; safe_isspace(z[i]); i++){}
2351 *tokenType = TOKEN_SPACE;
2352 return i;
2353 }
2354 case '`':
2355 case '\'':
2356 case '"': {
2357 int delim = z[0];
2358 for(i=1; (c=z[i])!=0; i++){
2359 if( c==delim ){
2360 if( z[i+1]==delim ){
2361 i++;
2362 }else{
2363 break;
2364 }
2365 }
2366 }
2367 *tokenType = TOKEN_STRING;
2368 return i + (c!=0);
2369 }
2370 case '[': {
2371 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
2372 *tokenType = TOKEN_ID;
2373 return i;
2374 }
2375 default: {
2376 if( !IdChar(*z) ){
2377 break;
2378 }
2379 for(i=1; IdChar(z[i]); i++){}
2380 *tokenType = TOKEN_ID;
2381 return i;
2382 }
2383 }
2384 *tokenType = TOKEN_PUNCT;
2385 return 1;
2386}
2387
2388/*
2389** A token extracted from a string is an instance of the following
2390** structure.
2391*/
2392typedef struct Token {
2393 const char *z; /* Pointer to token text. Not '\000' terminated */
2394 short int n; /* Length of the token text in bytes. */
2395} Token;
2396
2397/*
2398** Given a input string (which is really one of the argv[] parameters
2399** passed into xConnect or xCreate) split the string up into tokens.
2400** Return an array of pointers to '\000' terminated strings, one string
2401** for each non-whitespace token.
2402**
2403** The returned array is terminated by a single NULL pointer.
2404**
2405** Space to hold the returned array is obtained from a single
2406** malloc and should be freed by passing the return value to free().
2407** The individual strings within the token list are all a part of
2408** the single memory allocation and will all be freed at once.
2409*/
2410static char **tokenizeString(const char *z, int *pnToken){
2411 int nToken = 0;
2412 Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
2413 int n = 1;
2414 int e, i;
2415 int totalSize = 0;
2416 char **azToken;
2417 char *zCopy;
2418 while( n>0 ){
2419 n = getToken(z, &e);
2420 if( e!=TOKEN_SPACE ){
2421 aToken[nToken].z = z;
2422 aToken[nToken].n = n;
2423 nToken++;
2424 totalSize += n+1;
2425 }
2426 z += n;
2427 }
2428 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
2429 zCopy = (char*)&azToken[nToken];
2430 nToken--;
2431 for(i=0; i<nToken; i++){
2432 azToken[i] = zCopy;
2433 n = aToken[i].n;
2434 memcpy(zCopy, aToken[i].z, n);
2435 zCopy[n] = 0;
2436 zCopy += n+1;
2437 }
2438 azToken[nToken] = 0;
2439 free(aToken);
2440 *pnToken = nToken;
2441 return azToken;
2442}
2443
2444/*
2445** Convert an SQL-style quoted string into a normal string by removing
2446** the quote characters. The conversion is done in-place. If the
2447** input does not begin with a quote character, then this routine
2448** is a no-op.
2449**
2450** Examples:
2451**
2452** "abc" becomes abc
2453** 'xyz' becomes xyz
2454** [pqr] becomes pqr
2455** `mno` becomes mno
2456*/
2457static void dequoteString(char *z){
2458 int quote;
2459 int i, j;
2460 if( z==0 ) return;
2461 quote = z[0];
2462 switch( quote ){
2463 case '\'': break;
2464 case '"': break;
2465 case '`': break; /* For MySQL compatibility */
2466 case '[': quote = ']'; break; /* For MS SqlServer compatibility */
2467 default: return;
2468 }
2469 for(i=1, j=0; z[i]; i++){
2470 if( z[i]==quote ){
2471 if( z[i+1]==quote ){
2472 z[j++] = quote;
2473 i++;
2474 }else{
2475 z[j++] = 0;
2476 break;
2477 }
2478 }else{
2479 z[j++] = z[i];
2480 }
2481 }
2482}
2483
2484/*
2485** The input azIn is a NULL-terminated list of tokens. Remove the first
2486** token and all punctuation tokens. Remove the quotes from
2487** around string literal tokens.
2488**
2489** Example:
2490**
2491** input: tokenize chinese ( 'simplifed' , 'mixed' )
2492** output: chinese simplifed mixed
2493**
2494** Another example:
2495**
2496** input: delimiters ( '[' , ']' , '...' )
2497** output: [ ] ...
2498*/
2499static void tokenListToIdList(char **azIn){
2500 int i, j;
2501 if( azIn ){
2502 for(i=0, j=-1; azIn[i]; i++){
2503 if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
2504 dequoteString(azIn[i]);
2505 if( j>=0 ){
2506 azIn[j] = azIn[i];
2507 }
2508 j++;
2509 }
2510 }
2511 azIn[j] = 0;
2512 }
2513}
2514
2515
2516/*
2517** Find the first alphanumeric token in the string zIn. Null-terminate
2518** this token. Remove any quotation marks. And return a pointer to
2519** the result.
2520*/
2521static char *firstToken(char *zIn, char **pzTail){
2522 int n, ttype;
2523 while(1){
2524 n = getToken(zIn, &ttype);
2525 if( ttype==TOKEN_SPACE ){
2526 zIn += n;
2527 }else if( ttype==TOKEN_EOF ){
2528 *pzTail = zIn;
2529 return 0;
2530 }else{
2531 zIn[n] = 0;
2532 *pzTail = &zIn[1];
2533 dequoteString(zIn);
2534 return zIn;
2535 }
2536 }
2537 /*NOTREACHED*/
2538}
2539
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not a alphanumeric
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  /* Skip leading whitespace in s. */
  for(; safe_isspace(*s); s++){}
  /* Case-insensitive comparison over the whole of t. */
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ) return 0;
  }
  /* Match only counts if the prefix ends at a token boundary. */
  return *s!='_' && !safe_isalnum(*s);
}
2558
2559/*
2560** An instance of this structure defines the "spec" of a
2561** full text index. This structure is populated by parseSpec
2562** and use by fulltextConnect and fulltextCreate.
2563*/
2564typedef struct TableSpec {
2565 const char *zDb; /* Logical database name */
2566 const char *zName; /* Name of the full-text index */
2567 int nColumn; /* Number of columns to be indexed */
2568 char **azColumn; /* Original names of columns to be indexed */
2569 char **azContentColumn; /* Column names for %_content */
2570 char **azTokenizer; /* Name of tokenizer and its arguments */
2571} TableSpec;
2572
2573/*
2574** Reclaim all of the memory used by a TableSpec
2575*/
2576static void clearTableSpec(TableSpec *p) {
2577 free(p->azColumn);
2578 free(p->azContentColumn);
2579 free(p->azTokenizer);
2580}
2581
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 *
 * CREATE VIRTUAL TABLE email
 *        USING fts2(subject, body, tokenize mytokenizer(myarg))
 *
 * We return parsed information in a TableSpec structure.
 *
 */
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                     char**pzErr){
  int i, n;
  char *z, *zDummy;
  char **azArg;
  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */

  assert( argc>=3 );
  /* Current interface:
  ** argv[0] - module name
  ** argv[1] - database name
  ** argv[2] - table name
  ** argv[3..] - columns, optionally followed by tokenizer specification
  **             and snippet delimiters specification.
  */

  /* Make a copy of the complete argv[][] array in a single allocation.
  ** The argv[][] array is read-only and transient.  We can write to the
  ** copy in order to modify things and the copy is persistent.
  */
  CLEAR(pSpec);
  for(i=n=0; i<argc; i++){
    n += strlen(argv[i]) + 1;
  }
  /* One block holds the pointer array followed by all string copies;
  ** freeing pSpec->azColumn later releases everything at once. */
  azArg = malloc( sizeof(char*)*argc + n );
  if( azArg==0 ){
    return SQLITE_NOMEM;
  }
  z = (char*)&azArg[argc];
  for(i=0; i<argc; i++){
    azArg[i] = z;
    strcpy(z, argv[i]);
    z += strlen(z)+1;
  }

  /* Identify the column names and the tokenizer and delimiter arguments
  ** in the argv[][] array.
  */
  pSpec->zDb = azArg[1];
  pSpec->zName = azArg[2];
  pSpec->nColumn = 0;
  pSpec->azColumn = azArg;
  zTokenizer = "tokenize simple";    /* default if no "tokenize" arg given */
  for(i=3; i<argc; ++i){
    if( startsWith(azArg[i],"tokenize") ){
      zTokenizer = azArg[i];
    }else{
      /* Column args are compacted to the front of azArg; only the
      ** first token of each (dequoted) is kept as the column name. */
      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
      pSpec->nColumn++;
    }
  }
  if( pSpec->nColumn==0 ){
    /* No columns listed: default to a single column named "content". */
    azArg[0] = "content";
    pSpec->nColumn = 1;
  }

  /*
  ** Construct the list of content column names.
  **
  ** Each content column name will be of the form cNNAAAA
  ** where NN is the column number and AAAA is the sanitized
  ** column name.  "sanitized" means that special characters are
  ** converted to "_".  The cNN prefix guarantees that all column
  ** names are unique.
  **
  ** The AAAA suffix is not strictly necessary.  It is included
  ** for the convenience of people who might examine the generated
  ** %_content table and wonder what the columns are used for.
  */
  pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
  if( pSpec->azContentColumn==0 ){
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    /* NOTE(review): sqlite3_mprintf() can return NULL on OOM; the
    ** result is dereferenced unchecked here -- confirm upstream
    ** handling before depending on this path under memory pressure. */
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  tokenListToIdList(pSpec->azTokenizer);

  return SQLITE_OK;
}
2680
2681/*
2682** Generate a CREATE TABLE statement that describes the schema of
2683** the virtual table. Return a pointer to this schema string.
2684**
2685** Space is obtained from sqlite3_mprintf() and should be freed
2686** using sqlite3_free().
2687*/
2688static char *fulltextSchema(
2689 int nColumn, /* Number of columns */
2690 const char *const* azColumn, /* List of columns */
2691 const char *zTableName /* Name of the table */
2692){
2693 int i;
2694 char *zSchema, *zNext;
2695 const char *zSep = "(";
2696 zSchema = sqlite3_mprintf("CREATE TABLE x");
2697 for(i=0; i<nColumn; i++){
2698 zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
2699 sqlite3_free(zSchema);
2700 zSchema = zNext;
2701 zSep = ",";
2702 }
2703 zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
2704 sqlite3_free(zSchema);
2705 return zNext;
2706}
2707
2708/*
2709** Build a new sqlite3_vtab structure that will describe the
2710** fulltext index defined by spec.
2711*/
2712static int constructVtab(
2713 sqlite3 *db, /* The SQLite database connection */
2714 fts2Hash *pHash, /* Hash table containing tokenizers */
2715 TableSpec *spec, /* Parsed spec information from parseSpec() */
2716 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
2717 char **pzErr /* Write any error message here */
2718){
2719 int rc;
2720 int n;
2721 fulltext_vtab *v = 0;
2722 const sqlite3_tokenizer_module *m = NULL;
2723 char *schema;
2724
2725 char const *zTok; /* Name of tokenizer to use for this fts table */
2726 int nTok; /* Length of zTok, including nul terminator */
2727
2728 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
2729 if( v==0 ) return SQLITE_NOMEM;
2730 CLEAR(v);
2731 /* sqlite will initialize v->base */
2732 v->db = db;
2733 v->zDb = spec->zDb; /* Freed when azColumn is freed */
2734 v->zName = spec->zName; /* Freed when azColumn is freed */
2735 v->nColumn = spec->nColumn;
2736 v->azContentColumn = spec->azContentColumn;
2737 spec->azContentColumn = 0;
2738 v->azColumn = spec->azColumn;
2739 spec->azColumn = 0;
2740
2741 if( spec->azTokenizer==0 ){
2742 return SQLITE_NOMEM;
2743 }
2744
2745 zTok = spec->azTokenizer[0];
2746 if( !zTok ){
2747 zTok = "simple";
2748 }
2749 nTok = strlen(zTok)+1;
2750
2751 m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok);
2752 if( !m ){
2753 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
2754 rc = SQLITE_ERROR;
2755 goto err;
2756 }
2757
2758 for(n=0; spec->azTokenizer[n]; n++){}
2759 if( n ){
2760 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
2761 &v->pTokenizer);
2762 }else{
2763 rc = m->xCreate(0, 0, &v->pTokenizer);
2764 }
2765 if( rc!=SQLITE_OK ) goto err;
2766 v->pTokenizer->pModule = m;
2767
2768 /* TODO: verify the existence of backing tables foo_content, foo_term */
2769
2770 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
2771 spec->zName);
2772 rc = sqlite3_declare_vtab(db, schema);
2773 sqlite3_free(schema);
2774 if( rc!=SQLITE_OK ) goto err;
2775
2776 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
2777
2778 /* Indicate that the buffer is not live. */
2779 v->nPendingData = -1;
2780
2781 *ppVTab = &v->base;
2782 TRACE(("FTS2 Connect %p\n", v));
2783
2784 return rc;
2785
2786err:
2787 fulltext_vtab_destroy(v);
2788 return rc;
2789}
2790
2791static int fulltextConnect(
2792 sqlite3 *db,
2793 void *pAux,
2794 int argc, const char *const*argv,
2795 sqlite3_vtab **ppVTab,
2796 char **pzErr
2797){
2798 TableSpec spec;
2799 int rc = parseSpec(&spec, argc, argv, pzErr);
2800 if( rc!=SQLITE_OK ) return rc;
2801
2802 rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
2803 clearTableSpec(&spec);
2804 return rc;
2805}
2806
2807/* The %_content table holds the text of each document, with
2808** the rowid used as the docid.
2809*/
2810/* TODO(shess) This comment needs elaboration to match the updated
2811** code. Work it into the top-of-file comment at that time.
2812*/
2813static int fulltextCreate(sqlite3 *db, void *pAux,
2814 int argc, const char * const *argv,
2815 sqlite3_vtab **ppVTab, char **pzErr){
2816 int rc;
2817 TableSpec spec;
2818 StringBuffer schema;
2819 TRACE(("FTS2 Create\n"));
2820
2821 rc = parseSpec(&spec, argc, argv, pzErr);
2822 if( rc!=SQLITE_OK ) return rc;
2823
2824 initStringBuffer(&schema);
2825 append(&schema, "CREATE TABLE %_content(");
2826 appendList(&schema, spec.nColumn, spec.azContentColumn);
2827 append(&schema, ")");
2828 rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
2829 stringBufferDestroy(&schema);
2830 if( rc!=SQLITE_OK ) goto out;
2831
2832 rc = sql_exec(db, spec.zDb, spec.zName,
2833 "create table %_segments(block blob);");
2834 if( rc!=SQLITE_OK ) goto out;
2835
2836 rc = sql_exec(db, spec.zDb, spec.zName,
2837 "create table %_segdir("
2838 " level integer,"
2839 " idx integer,"
2840 " start_block integer,"
2841 " leaves_end_block integer,"
2842 " end_block integer,"
2843 " root blob,"
2844 " primary key(level, idx)"
2845 ");");
2846 if( rc!=SQLITE_OK ) goto out;
2847
2848 rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
2849
2850out:
2851 clearTableSpec(&spec);
2852 return rc;
2853}
2854
2855/* Decide how to handle an SQL query. */
2856static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2857 int i;
2858 TRACE(("FTS2 BestIndex\n"));
2859
2860 for(i=0; i<pInfo->nConstraint; ++i){
2861 const struct sqlite3_index_constraint *pConstraint;
2862 pConstraint = &pInfo->aConstraint[i];
2863 if( pConstraint->usable ) {
2864 if( pConstraint->iColumn==-1 &&
2865 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2866 pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
2867 TRACE(("FTS2 QUERY_ROWID\n"));
2868 } else if( pConstraint->iColumn>=0 &&
2869 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2870 /* full-text search */
2871 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2872 TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2873 } else continue;
2874
2875 pInfo->aConstraintUsage[i].argvIndex = 1;
2876 pInfo->aConstraintUsage[i].omit = 1;
2877
2878 /* An arbitrary value for now.
2879 * TODO: Perhaps rowid matches should be considered cheaper than
2880 * full-text searches. */
2881 pInfo->estimatedCost = 1.0;
2882
2883 return SQLITE_OK;
2884 }
2885 }
2886 pInfo->idxNum = QUERY_GENERIC;
2887 return SQLITE_OK;
2888}
2889
2890static int fulltextDisconnect(sqlite3_vtab *pVTab){
2891 TRACE(("FTS2 Disconnect %p\n", pVTab));
2892 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2893 return SQLITE_OK;
2894}
2895
2896static int fulltextDestroy(sqlite3_vtab *pVTab){
2897 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2898 int rc;
2899
2900 TRACE(("FTS2 Destroy %p\n", pVTab));
2901 rc = sql_exec(v->db, v->zDb, v->zName,
2902 "drop table if exists %_content;"
2903 "drop table if exists %_segments;"
2904 "drop table if exists %_segdir;"
2905 );
2906 if( rc!=SQLITE_OK ) return rc;
2907
2908 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2909 return SQLITE_OK;
2910}
2911
2912static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2913 fulltext_cursor *c;
2914
2915 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
2916 /* sqlite will initialize c->base */
2917 *ppCursor = &c->base;
2918 TRACE(("FTS2 Open %p: %p\n", pVTab, c));
2919
2920 return SQLITE_OK;
2921}
2922
2923
2924/* Free all of the dynamically allocated memory held by *q
2925*/
2926static void queryClear(Query *q){
2927 int i;
2928 for(i = 0; i < q->nTerms; ++i){
2929 free(q->pTerms[i].pTerm);
2930 }
2931 free(q->pTerms);
2932 CLEAR(q);
2933}
2934
2935/* Free all of the dynamically allocated memory held by the
2936** Snippet
2937*/
2938static void snippetClear(Snippet *p){
2939 free(p->aMatch);
2940 free(p->zOffset);
2941 free(p->zSnippet);
2942 CLEAR(p);
2943}
2944/*
2945** Append a single entry to the p->aMatch[] log.
2946*/
2947static void snippetAppendMatch(
2948 Snippet *p, /* Append the entry to this snippet */
2949 int iCol, int iTerm, /* The column and query term */
2950 int iStart, int nByte /* Offset and size of the match */
2951){
2952 int i;
2953 struct snippetMatch *pMatch;
2954 if( p->nMatch+1>=p->nAlloc ){
2955 p->nAlloc = p->nAlloc*2 + 10;
2956 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2957 if( p->aMatch==0 ){
2958 p->nMatch = 0;
2959 p->nAlloc = 0;
2960 return;
2961 }
2962 }
2963 i = p->nMatch++;
2964 pMatch = &p->aMatch[i];
2965 pMatch->iCol = iCol;
2966 pMatch->iTerm = iTerm;
2967 pMatch->iStart = iStart;
2968 pMatch->nByte = nByte;
2969}
2970
2971/*
2972** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
2973*/
2974#define FTS2_ROTOR_SZ (32)
2975#define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
2976
2977/*
2978** Add entries to pSnippet->aMatch[] for every match that occurs against
2979** document zDoc[0..nDoc-1] which is stored in column iColumn.
2980*/
2981static void snippetOffsetsOfColumn(
2982 Query *pQuery,
2983 Snippet *pSnippet,
2984 int iColumn,
2985 const char *zDoc,
2986 int nDoc
2987){
2988 const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
2989 sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
2990 sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
2991 fulltext_vtab *pVtab; /* The full text index */
2992 int nColumn; /* Number of columns in the index */
2993 const QueryTerm *aTerm; /* Query string terms */
2994 int nTerm; /* Number of query string terms */
2995 int i, j; /* Loop counters */
2996 int rc; /* Return code */
2997 unsigned int match, prevMatch; /* Phrase search bitmasks */
2998 const char *zToken; /* Next token from the tokenizer */
2999 int nToken; /* Size of zToken */
3000 int iBegin, iEnd, iPos; /* Offsets of beginning and end */
3001
3002 /* The following variables keep a circular buffer of the last
3003 ** few tokens */
3004 unsigned int iRotor = 0; /* Index of current token */
3005 int iRotorBegin[FTS2_ROTOR_SZ]; /* Beginning offset of token */
3006 int iRotorLen[FTS2_ROTOR_SZ]; /* Length of token */
3007
3008 pVtab = pQuery->pFts;
3009 nColumn = pVtab->nColumn;
3010 pTokenizer = pVtab->pTokenizer;
3011 pTModule = pTokenizer->pModule;
3012 rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
3013 if( rc ) return;
3014 pTCursor->pTokenizer = pTokenizer;
3015 aTerm = pQuery->pTerms;
3016 nTerm = pQuery->nTerms;
3017 if( nTerm>=FTS2_ROTOR_SZ ){
3018 nTerm = FTS2_ROTOR_SZ - 1;
3019 }
3020 prevMatch = 0;
3021 while(1){
3022 rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
3023 if( rc ) break;
3024 iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
3025 iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
3026 match = 0;
3027 for(i=0; i<nTerm; i++){
3028 int iCol;
3029 iCol = aTerm[i].iColumn;
3030 if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
3031 if( aTerm[i].nTerm>nToken ) continue;
3032 if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
3033 assert( aTerm[i].nTerm<=nToken );
3034 if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
3035 if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
3036 match |= 1<<i;
3037 if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
3038 for(j=aTerm[i].iPhrase-1; j>=0; j--){
3039 int k = (iRotor-j) & FTS2_ROTOR_MASK;
3040 snippetAppendMatch(pSnippet, iColumn, i-j,
3041 iRotorBegin[k], iRotorLen[k]);
3042 }
3043 }
3044 }
3045 prevMatch = match<<1;
3046 iRotor++;
3047 }
3048 pTModule->xClose(pTCursor);
3049}
3050
3051
3052/*
3053** Compute all offsets for the current row of the query.
3054** If the offsets have already been computed, this routine is a no-op.
3055*/
3056static void snippetAllOffsets(fulltext_cursor *p){
3057 int nColumn;
3058 int iColumn, i;
3059 int iFirst, iLast;
3060 fulltext_vtab *pFts;
3061
3062 if( p->snippet.nMatch ) return;
3063 if( p->q.nTerms==0 ) return;
3064 pFts = p->q.pFts;
3065 nColumn = pFts->nColumn;
3066 iColumn = (p->iCursorType - QUERY_FULLTEXT);
3067 if( iColumn<0 || iColumn>=nColumn ){
3068 iFirst = 0;
3069 iLast = nColumn-1;
3070 }else{
3071 iFirst = iColumn;
3072 iLast = iColumn;
3073 }
3074 for(i=iFirst; i<=iLast; i++){
3075 const char *zDoc;
3076 int nDoc;
3077 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
3078 nDoc = sqlite3_column_bytes(p->pStmt, i+1);
3079 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
3080 }
3081}
3082
3083/*
3084** Convert the information in the aMatch[] array of the snippet
3085** into the string zOffset[0..nOffset-1].
3086*/
3087static void snippetOffsetText(Snippet *p){
3088 int i;
3089 int cnt = 0;
3090 StringBuffer sb;
3091 char zBuf[200];
3092 if( p->zOffset ) return;
3093 initStringBuffer(&sb);
3094 for(i=0; i<p->nMatch; i++){
3095 struct snippetMatch *pMatch = &p->aMatch[i];
3096 zBuf[0] = ' ';
3097 sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
3098 pMatch->iTerm, pMatch->iStart, pMatch->nByte);
3099 append(&sb, zBuf);
3100 cnt++;
3101 }
3102 p->zOffset = stringBufferData(&sb);
3103 p->nOffset = stringBufferLength(&sb);
3104}
3105
3106/*
3107** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
3108** of matching words some of which might be in zDoc. zDoc is column
3109** number iCol.
3110**
3111** iBreak is suggested spot in zDoc where we could begin or end an
3112** excerpt. Return a value similar to iBreak but possibly adjusted
3113** to be a little left or right so that the break point is better.
3114*/
3115static int wordBoundary(
3116 int iBreak, /* The suggested break point */
3117 const char *zDoc, /* Document text */
3118 int nDoc, /* Number of bytes in zDoc[] */
3119 struct snippetMatch *aMatch, /* Matching words */
3120 int nMatch, /* Number of entries in aMatch[] */
3121 int iCol /* The column number for zDoc[] */
3122){
3123 int i;
3124 if( iBreak<=10 ){
3125 return 0;
3126 }
3127 if( iBreak>=nDoc-10 ){
3128 return nDoc;
3129 }
3130 for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
3131 while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
3132 if( i<nMatch ){
3133 if( aMatch[i].iStart<iBreak+10 ){
3134 return aMatch[i].iStart;
3135 }
3136 if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
3137 return aMatch[i-1].iStart;
3138 }
3139 }
3140 for(i=1; i<=10; i++){
3141 if( safe_isspace(zDoc[iBreak-i]) ){
3142 return iBreak - i + 1;
3143 }
3144 if( safe_isspace(zDoc[iBreak+i]) ){
3145 return iBreak + i + 1;
3146 }
3147 }
3148 return iBreak;
3149}
3150
3151
3152
3153/*
3154** Allowed values for Snippet.aMatch[].snStatus
3155*/
3156#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
3157#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
3158
3159/*
3160** Generate the text of a snippet.
3161*/
3162static void snippetText(
3163 fulltext_cursor *pCursor, /* The cursor we need the snippet for */
3164 const char *zStartMark, /* Markup to appear before each match */
3165 const char *zEndMark, /* Markup to appear after each match */
3166 const char *zEllipsis /* Ellipsis mark */
3167){
3168 int i, j;
3169 struct snippetMatch *aMatch;
3170 int nMatch;
3171 int nDesired;
3172 StringBuffer sb;
3173 int tailCol;
3174 int tailOffset;
3175 int iCol;
3176 int nDoc;
3177 const char *zDoc;
3178 int iStart, iEnd;
3179 int tailEllipsis = 0;
3180 int iMatch;
3181
3182
3183 free(pCursor->snippet.zSnippet);
3184 pCursor->snippet.zSnippet = 0;
3185 aMatch = pCursor->snippet.aMatch;
3186 nMatch = pCursor->snippet.nMatch;
3187 initStringBuffer(&sb);
3188
3189 for(i=0; i<nMatch; i++){
3190 aMatch[i].snStatus = SNIPPET_IGNORE;
3191 }
3192 nDesired = 0;
3193 for(i=0; i<pCursor->q.nTerms; i++){
3194 for(j=0; j<nMatch; j++){
3195 if( aMatch[j].iTerm==i ){
3196 aMatch[j].snStatus = SNIPPET_DESIRED;
3197 nDesired++;
3198 break;
3199 }
3200 }
3201 }
3202
3203 iMatch = 0;
3204 tailCol = -1;
3205 tailOffset = 0;
3206 for(i=0; i<nMatch && nDesired>0; i++){
3207 if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
3208 nDesired--;
3209 iCol = aMatch[i].iCol;
3210 zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
3211 nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
3212 iStart = aMatch[i].iStart - 40;
3213 iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
3214 if( iStart<=10 ){
3215 iStart = 0;
3216 }
3217 if( iCol==tailCol && iStart<=tailOffset+20 ){
3218 iStart = tailOffset;
3219 }
3220 if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
3221 trimWhiteSpace(&sb);
3222 appendWhiteSpace(&sb);
3223 append(&sb, zEllipsis);
3224 appendWhiteSpace(&sb);
3225 }
3226 iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
3227 iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
3228 if( iEnd>=nDoc-10 ){
3229 iEnd = nDoc;
3230 tailEllipsis = 0;
3231 }else{
3232 tailEllipsis = 1;
3233 }
3234 while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
3235 while( iStart<iEnd ){
3236 while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
3237 && aMatch[iMatch].iCol<=iCol ){
3238 iMatch++;
3239 }
3240 if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
3241 && aMatch[iMatch].iCol==iCol ){
3242 nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
3243 iStart = aMatch[iMatch].iStart;
3244 append(&sb, zStartMark);
3245 nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
3246 append(&sb, zEndMark);
3247 iStart += aMatch[iMatch].nByte;
3248 for(j=iMatch+1; j<nMatch; j++){
3249 if( aMatch[j].iTerm==aMatch[iMatch].iTerm
3250 && aMatch[j].snStatus==SNIPPET_DESIRED ){
3251 nDesired--;
3252 aMatch[j].snStatus = SNIPPET_IGNORE;
3253 }
3254 }
3255 }else{
3256 nappend(&sb, &zDoc[iStart], iEnd - iStart);
3257 iStart = iEnd;
3258 }
3259 }
3260 tailCol = iCol;
3261 tailOffset = iEnd;
3262 }
3263 trimWhiteSpace(&sb);
3264 if( tailEllipsis ){
3265 appendWhiteSpace(&sb);
3266 append(&sb, zEllipsis);
3267 }
3268 pCursor->snippet.zSnippet = stringBufferData(&sb);
3269 pCursor->snippet.nSnippet = stringBufferLength(&sb);
3270}
3271
3272
3273/*
3274** Close the cursor. For additional information see the documentation
3275** on the xClose method of the virtual table interface.
3276*/
3277static int fulltextClose(sqlite3_vtab_cursor *pCursor){
3278 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3279 TRACE(("FTS2 Close %p\n", c));
3280 sqlite3_finalize(c->pStmt);
3281 queryClear(&c->q);
3282 snippetClear(&c->snippet);
3283 if( c->result.nData!=0 ) dlrDestroy(&c->reader);
3284 dataBufferDestroy(&c->result);
3285 free(c);
3286 return SQLITE_OK;
3287}
3288
/* xNext: advance the cursor to the next result row.
**
** For table scans and rowid lookups the prepared statement drives the
** iteration directly.  For full-text queries the statement is reset
** and re-bound to each successive docid drawn from the cursor's
** doclist reader.  Sets c->eof when the results are exhausted.
*/
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  int rc;

  TRACE(("FTS2 Next %p\n", pCursor));
  /* Snippet data describes the outgoing row; it is recomputed lazily
  ** for the new row when requested. */
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    /* No result data at all, or the doclist is exhausted: EOF. */
    if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
      c->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
    dlrStep(&c->reader);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){  /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* An error occurred; abort.  SQLITE_DONE here means the index
    ** produced a docid with no matching %_content row, which is an
    ** inconsistency, so it is mapped to SQLITE_ERROR. */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}
3330
3331
3332/* TODO(shess) If we pushed LeafReader to the top of the file, or to
3333** another file, term_select() could be pushed above
3334** docListOfTerm().
3335*/
3336static int termSelect(fulltext_vtab *v, int iColumn,
3337 const char *pTerm, int nTerm, int isPrefix,
3338 DocListType iType, DataBuffer *out);
3339
/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** The resulting DL_DOCIDS doclist is stored in pResult, which is
** overwritten.
**
** Phrases are evaluated pairwise left-to-right: the accumulated
** doclist (with positions) is phrase-merged against each successive
** term's positional doclist; only the final merge drops positions.
*/
static int docListOfTerm(
  fulltext_vtab *v,    /* The full text index */
  int iColumn,         /* column to restrict to. No restriction if >=nColumn */
  QueryTerm *pQTerm,   /* Term we are looking for, or 1st term of a phrase */
  DataBuffer *pResult  /* Write the result here */
){
  DataBuffer left, right, new;
  int i, rc;

  /* No phrase search if no position info. */
  assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&left, 0);
  /* Request positions only when a phrase merge will need them. */
  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                  0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
  if( rc ) return rc;
  /* Early exit when the accumulated doclist becomes empty: the phrase
  ** cannot match any document. */
  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
    dataBufferInit(&right, 0);
    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                    pQTerm[i].isPrefix, DL_POSITIONS, &right);
    if( rc ){
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                       i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
    dataBufferDestroy(&left);
    dataBufferDestroy(&right);
    left = new;
  }
  /* Ownership of the final buffer transfers to the caller. */
  *pResult = left;
  return SQLITE_OK;
}
3384
3385/* Add a new term pTerm[0..nTerm-1] to the query *q.
3386*/
3387static void queryAdd(Query *q, const char *pTerm, int nTerm){
3388 QueryTerm *t;
3389 ++q->nTerms;
3390 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
3391 if( q->pTerms==0 ){
3392 q->nTerms = 0;
3393 return;
3394 }
3395 t = &q->pTerms[q->nTerms - 1];
3396 CLEAR(t);
3397 t->pTerm = malloc(nTerm+1);
3398 memcpy(t->pTerm, pTerm, nTerm);
3399 t->pTerm[nTerm] = 0;
3400 t->nTerm = nTerm;
3401 t->isOr = q->nextIsOr;
3402 t->isPrefix = 0;
3403 q->nextIsOr = 0;
3404 t->iColumn = q->nextColumn;
3405 q->nextColumn = q->dfltColumn;
3406}
3407
3408/*
3409** Check to see if the string zToken[0...nToken-1] matches any
3410** column name in the virtual table. If it does,
3411** return the zero-indexed column number. If not, return -1.
3412*/
3413static int checkColumnSpecifier(
3414 fulltext_vtab *pVtab, /* The virtual table */
3415 const char *zToken, /* Text of the token */
3416 int nToken /* Number of characters in the token */
3417){
3418 int i;
3419 for(i=0; i<pVtab->nColumn; i++){
3420 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
3421 && pVtab->azColumn[i][nToken]==0 ){
3422 return i;
3423 }
3424 }
3425 return -1;
3426}
3427
3428/*
3429** Parse the text at pSegment[0..nSegment-1]. Add additional terms
3430** to the query being assemblied in pQuery.
3431**
3432** inPhrase is true if pSegment[0..nSegement-1] is contained within
3433** double-quotes. If inPhrase is true, then the first term
3434** is marked with the number of terms in the phrase less one and
3435** OR and "-" syntax is ignored. If inPhrase is false, then every
3436** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
3437*/
3438static int tokenizeSegment(
3439 sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
3440 const char *pSegment, int nSegment, /* Query expression being parsed */
3441 int inPhrase, /* True if within "..." */
3442 Query *pQuery /* Append results here */
3443){
3444 const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
3445 sqlite3_tokenizer_cursor *pCursor;
3446 int firstIndex = pQuery->nTerms;
3447 int iCol;
3448 int nTerm = 1;
3449
3450 int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
3451 if( rc!=SQLITE_OK ) return rc;
3452 pCursor->pTokenizer = pTokenizer;
3453
3454 while( 1 ){
3455 const char *pToken;
3456 int nToken, iBegin, iEnd, iPos;
3457
3458 rc = pModule->xNext(pCursor,
3459 &pToken, &nToken,
3460 &iBegin, &iEnd, &iPos);
3461 if( rc!=SQLITE_OK ) break;
3462 if( !inPhrase &&
3463 pSegment[iEnd]==':' &&
3464 (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
3465 pQuery->nextColumn = iCol;
3466 continue;
3467 }
3468 if( !inPhrase && pQuery->nTerms>0 && nToken==2
3469 && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
3470 pQuery->nextIsOr = 1;
3471 continue;
3472 }
3473 queryAdd(pQuery, pToken, nToken);
3474 if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
3475 pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
3476 }
3477 if( iEnd<nSegment && pSegment[iEnd]=='*' ){
3478 pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
3479 }
3480 pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
3481 if( inPhrase ){
3482 nTerm++;
3483 }
3484 }
3485
3486 if( inPhrase && pQuery->nTerms>firstIndex ){
3487 pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
3488 }
3489
3490 return pModule->xClose(pCursor);
3491}
3492
/* Parse a query string, yielding a Query object pQuery.
**
** The calling function will need to queryClear() to clean up
** the dynamically allocated memory held by pQuery.
**
** The input is split on double-quote characters; each segment between
** quotes is tokenized with inPhrase toggled so that quoted text forms
** phrases.  An unmatched quote is an error.
*/
static int parseQuery(
  fulltext_vtab *v,        /* The fulltext index */
  const char *zInput,      /* Input text of the query string */
  int nInput,              /* Size of the input text */
  int dfltColumn,          /* Default column of the index to match against */
  Query *pQuery            /* Write the parse results here. */
){
  int iInput, inPhrase = 0;

  /* NULL input is treated as empty; negative nInput means NUL-terminated. */
  if( zInput==0 ) nInput = 0;
  if( nInput<0 ) nInput = strlen(zInput);
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->nextColumn = dfltColumn;
  pQuery->dfltColumn = dfltColumn;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    /* Advance i to the next double-quote (or end of input). */
    for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
    if( i>iInput ){
      tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                       pQuery);
    }
    /* The loop's ++iInput then steps past the quote character. */
    iInput = i;
    if( i<nInput ){
      assert( zInput[i]=='"' );
      inPhrase = !inPhrase;
    }
  }

  if( inPhrase ){
    /* unmatched quote */
    queryClear(pQuery);
    return SQLITE_ERROR;
  }
  return SQLITE_OK;
}
3537
3538/* TODO(shess) Refactor the code to remove this forward decl. */
3539static int flushPendingTerms(fulltext_vtab *v);
3540
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
**
** Processing happens in three stages: buffered updates are flushed,
** AND/OR terms are merged left-to-right into one doclist, then NOT
** terms are subtracted from that result in a second pass.
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;
  int nNot = 0;
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */

  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  aTerm = pQuery->pTerms;
  /* iNext always skips a whole phrase (nPhrase follow-on terms). */
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      /* i==nNot means no non-NOT term has been merged yet, so "left"
      ** is uninitialized and must not be destroyed. */
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    /* OR-connected terms are merged into "right" before the AND step. */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){           /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  /* Ownership of the merged doclist transfers to the caller. */
  *pResult = left;
  return rc;
}
3652
3653/*
3654** This is the xFilter interface for the virtual table. See
3655** the virtual table xFilter method documentation for additional
3656** information.
3657**
3658** If idxNum==QUERY_GENERIC then do a full table scan against
3659** the %_content table.
3660**
3661** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
3662** in the %_content table.
3663**
3664** If idxNum>=QUERY_FULLTEXT then use the full text index. The
3665** column on the left-hand side of the MATCH operator is column
3666** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
3667** side of the MATCH operator.
3668*/
3669/* TODO(shess) Upgrade the cursor initialization and destruction to
3670** account for fulltextFilter() being called multiple times on the
3671** same cursor. The current solution is very fragile. Apply fix to
3672** fts2 as appropriate.
3673*/
3674static int fulltextFilter(
3675 sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
3676 int idxNum, const char *idxStr, /* Which indexing scheme to use */
3677 int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
3678){
3679 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3680 fulltext_vtab *v = cursor_vtab(c);
3681 int rc;
3682 char *zSql;
3683
3684 TRACE(("FTS2 Filter %p\n",pCursor));
3685
3686 zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
3687 idxNum==QUERY_GENERIC ? "" : "where rowid=?");
3688 sqlite3_finalize(c->pStmt);
3689 rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
3690 sqlite3_free(zSql);
3691 if( rc!=SQLITE_OK ) return rc;
3692
3693 c->iCursorType = idxNum;
3694 switch( idxNum ){
3695 case QUERY_GENERIC:
3696 break;
3697
3698 case QUERY_ROWID:
3699 rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
3700 if( rc!=SQLITE_OK ) return rc;
3701 break;
3702
3703 default: /* full-text search */
3704 {
3705 const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
3706 assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
3707 assert( argc==1 );
3708 queryClear(&c->q);
3709 if( c->result.nData!=0 ){
3710 /* This case happens if the same cursor is used repeatedly. */
3711 dlrDestroy(&c->reader);
3712 dataBufferReset(&c->result);
3713 }else{
3714 dataBufferInit(&c->result, 0);
3715 }
3716 rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
3717 if( rc!=SQLITE_OK ) return rc;
3718 if( c->result.nData!=0 ){
3719 dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
3720 }
3721 break;
3722 }
3723 }
3724
3725 return fulltextNext(pCursor);
3726}
3727
3728/* This is the xEof method of the virtual table. The SQLite core
3729** calls this routine to find out if it has reached the end of
3730** a query's results set.
3731*/
3732static int fulltextEof(sqlite3_vtab_cursor *pCursor){
3733 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3734 return c->eof;
3735}
3736
3737/* This is the xColumn method of the virtual table. The SQLite
3738** core calls this method during a query when it needs the value
3739** of a column from the virtual table. This method needs to use
3740** one of the sqlite3_result_*() routines to store the requested
3741** value back in the pContext.
3742*/
3743static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
3744 sqlite3_context *pContext, int idxCol){
3745 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3746 fulltext_vtab *v = cursor_vtab(c);
3747
3748 if( idxCol<v->nColumn ){
3749 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
3750 sqlite3_result_value(pContext, pVal);
3751 }else if( idxCol==v->nColumn ){
3752 /* The extra column whose name is the same as the table.
3753 ** Return a blob which is a pointer to the cursor
3754 */
3755 sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
3756 }
3757 return SQLITE_OK;
3758}
3759
3760/* This is the xRowid method. The SQLite core calls this routine to
3761** retrive the rowid for the current row of the result set. The
3762** rowid should be written to *pRowid.
3763*/
3764static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
3765 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3766
3767 *pRowid = sqlite3_column_int64(c->pStmt, 0);
3768 return SQLITE_OK;
3769}
3770
/* Add all terms in [zText] to pendingTerms table.  If [iColumn] > 0,
** we also store positions and offsets in the hash table using that
** column number.
**
** NOTE(review): the code actually tests iColumn>=0, so positions are
** stored for column 0 as well; a negative iColumn (see deleteTerms)
** records empty doclists used as delete markers.
*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DLCollector *p;
    int nData;                   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    /* Find (or create) the pending-terms collector for this token. */
    p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      /* Start a new docid in the collector if this is a new document. */
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  return rc;
}
3829
3830/* Add doclists for all terms in [pValues] to pendingTerms table. */
3831static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid,
3832 sqlite3_value **pValues){
3833 int i;
3834 for(i = 0; i < v->nColumn ; ++i){
3835 char *zText = (char*)sqlite3_value_text(pValues[i]);
3836 int rc = buildTerms(v, iRowid, zText, i);
3837 if( rc!=SQLITE_OK ) return rc;
3838 }
3839 return SQLITE_OK;
3840}
3841
3842/* Add empty doclists for all terms in the given row's content to
3843** pendingTerms.
3844*/
3845static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){
3846 const char **pValues;
3847 int i, rc;
3848
3849 /* TODO(shess) Should we allow such tables at all? */
3850 if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
3851
3852 rc = content_select(v, iRowid, &pValues);
3853 if( rc!=SQLITE_OK ) return rc;
3854
3855 for(i = 0 ; i < v->nColumn; ++i) {
3856 rc = buildTerms(v, iRowid, pValues[i], -1);
3857 if( rc!=SQLITE_OK ) break;
3858 }
3859
3860 freeStringArray(v->nColumn, pValues);
3861 return SQLITE_OK;
3862}
3863
3864/* TODO(shess) Refactor the code to remove this forward decl. */
3865static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
3866
3867/* Insert a row into the %_content table; set *piRowid to be the ID of the
3868** new row. Add doclists for terms to pendingTerms.
3869*/
3870static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
3871 sqlite3_value **pValues, sqlite_int64 *piRowid){
3872 int rc;
3873
3874 rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
3875 if( rc!=SQLITE_OK ) return rc;
3876
3877 *piRowid = sqlite3_last_insert_rowid(v->db);
3878 rc = initPendingTerms(v, *piRowid);
3879 if( rc!=SQLITE_OK ) return rc;
3880
3881 return insertTerms(v, *piRowid, pValues);
3882}
3883
3884/* Delete a row from the %_content table; add empty doclists for terms
3885** to pendingTerms.
3886*/
3887static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
3888 int rc = initPendingTerms(v, iRow);
3889 if( rc!=SQLITE_OK ) return rc;
3890
3891 rc = deleteTerms(v, iRow);
3892 if( rc!=SQLITE_OK ) return rc;
3893
3894 return content_delete(v, iRow); /* execute an SQL DELETE */
3895}
3896
3897/* Update a row in the %_content table; add delete doclists to
3898** pendingTerms for old terms not in the new data, add insert doclists
3899** to pendingTerms for terms in the new data.
3900*/
3901static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
3902 sqlite3_value **pValues){
3903 int rc = initPendingTerms(v, iRow);
3904 if( rc!=SQLITE_OK ) return rc;
3905
3906 /* Generate an empty doclist for each term that previously appeared in this
3907 * row. */
3908 rc = deleteTerms(v, iRow);
3909 if( rc!=SQLITE_OK ) return rc;
3910
3911 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
3912 if( rc!=SQLITE_OK ) return rc;
3913
3914 /* Now add positions for terms which appear in the updated row. */
3915 return insertTerms(v, iRow, pValues);
3916}
3917
3918/*******************************************************************/
3919/* InteriorWriter is used to collect terms and block references into
3920** interior nodes in %_segments. See commentary at top of file for
3921** format.
3922*/
3923
3924/* How large interior nodes can grow. */
3925#define INTERIOR_MAX 2048
3926
3927/* Minimum number of terms per interior node (except the root). This
3928** prevents large terms from making the tree too skinny - must be >0
3929** so that the tree always makes progress. Note that the min tree
3930** fanout will be INTERIOR_MIN_TERMS+1.
3931*/
3932#define INTERIOR_MIN_TERMS 7
3933#if INTERIOR_MIN_TERMS<1
3934# error INTERIOR_MIN_TERMS must be greater than 0.
3935#endif
3936
3937/* ROOT_MAX controls how much data is stored inline in the segment
3938** directory.
3939*/
3940/* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's
3941** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
3942** can both see it, but if the caller passed it in, we wouldn't even
3943** need a define.
3944*/
3945#define ROOT_MAX 1024
3946#if ROOT_MAX<VARINT_MAX*2
3947# error ROOT_MAX must have enough space for a header.
3948#endif
3949
/* InteriorBlock stores a linked-list of interior blocks while a lower
** layer is being constructed.  Each node in the list holds the
** serialized data for one interior node plus the leftmost term of the
** subtree it roots (needed when building the parent level).
*/
typedef struct InteriorBlock {
  DataBuffer term;           /* Leftmost term in block's subtree. */
  DataBuffer data;           /* Accumulated data for the block. */
  struct InteriorBlock *next;  /* Next block at this level, or NULL. */
} InteriorBlock;
3958
3959static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
3960 const char *pTerm, int nTerm){
3961 InteriorBlock *block = calloc(1, sizeof(InteriorBlock));
3962 char c[VARINT_MAX+VARINT_MAX];
3963 int n;
3964
3965 dataBufferInit(&block->term, 0);
3966 dataBufferReplace(&block->term, pTerm, nTerm);
3967
3968 n = putVarint(c, iHeight);
3969 n += putVarint(c+n, iChildBlock);
3970 dataBufferInit(&block->data, INTERIOR_MAX);
3971 dataBufferReplace(&block->data, c, n);
3972
3973 return block;
3974}
3975
#ifndef NDEBUG
/* Verify that the data is readable as an interior node.  Debug-only:
** walks the encoded node front to back, asserting that the height,
** child blockid, leading term, and each delta-encoded term fit
** exactly within the buffer.
*/
static void interiorBlockValidate(InteriorBlock *pBlock){
  const char *pData = pBlock->data.pData;
  int nData = pBlock->data.nData;
  int n, iDummy;
  sqlite_int64 iBlockid;

  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with height of node as a varint(n), n>0 */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Must contain iBlockid. */
  n = getVarint(pData, &iBlockid);
  assert( n>0 );
  assert( n<=nData );
  pData += n;
  nData -= n;

  /* Zero or more terms of positive length */
  if( nData!=0 ){
    /* First term is not delta-encoded. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0);
    assert( n+iDummy<=nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Following terms delta-encoded. */
    while( nData!=0 ){
      /* Length of shared prefix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>=0 );
      assert( n<nData );
      pData += n;
      nData -= n;

      /* Length and data of distinct suffix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>0 );
      assert( n+iDummy>0);
      assert( n+iDummy<=nData );
      pData += n+iDummy;
      nData -= n+iDummy;
    }
  }
}
#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
#else
#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
#endif
4039
/* InteriorWriter accumulates terms and child-block references for one
** level of interior nodes.  When its blocks must themselves be flushed,
** a parentWriter is lazily created one level up (see
** interiorWriterRootInfo()).
*/
typedef struct InteriorWriter {
  int iHeight;                   /* from 0 at leaves. */
  InteriorBlock *first, *last;   /* linked list of blocks at this level. */
  struct InteriorWriter *parentWriter;  /* next level up, or NULL. */

  DataBuffer term;               /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;  /* for consistency checks. */
#endif
} InteriorWriter;
4051
4052/* Initialize an interior node where pTerm[nTerm] marks the leftmost
4053** term in the tree. iChildBlock is the leftmost child block at the
4054** next level down the tree.
4055*/
4056static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
4057 sqlite_int64 iChildBlock,
4058 InteriorWriter *pWriter){
4059 InteriorBlock *block;
4060 assert( iHeight>0 );
4061 CLEAR(pWriter);
4062
4063 pWriter->iHeight = iHeight;
4064 pWriter->iOpeningChildBlock = iChildBlock;
4065#ifndef NDEBUG
4066 pWriter->iLastChildBlock = iChildBlock;
4067#endif
4068 block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
4069 pWriter->last = pWriter->first = block;
4070 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4071 dataBufferInit(&pWriter->term, 0);
4072}
4073
/* Append the child node rooted at iChildBlock to the interior node,
** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
** Terms are delta-encoded against the previously-appended term; if the
** current block would grow past INTERIOR_MAX (and holds at least
** INTERIOR_MIN_TERMS terms), a new block is started instead.
*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function).  That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    n = putVarint(c, nTerm);
  }else{
    /* Measure the prefix shared with the previous term so only the
    ** distinct suffix needs to be stored.
    */
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  /* Children must be appended in consecutive-blockid order. */
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    /* New block has no appended terms yet; reset the delta base. */
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
4125
4126/* Free the space used by pWriter, including the linked-list of
4127** InteriorBlocks, and parentWriter, if present.
4128*/
4129static int interiorWriterDestroy(InteriorWriter *pWriter){
4130 InteriorBlock *block = pWriter->first;
4131
4132 while( block!=NULL ){
4133 InteriorBlock *b = block;
4134 block = block->next;
4135 dataBufferDestroy(&b->term);
4136 dataBufferDestroy(&b->data);
4137 free(b);
4138 }
4139 if( pWriter->parentWriter!=NULL ){
4140 interiorWriterDestroy(pWriter->parentWriter);
4141 free(pWriter->parentWriter);
4142 }
4143 dataBufferDestroy(&pWriter->term);
4144 SCRAMBLE(pWriter);
4145 return SQLITE_OK;
4146}
4147
4148/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
4149** directly, leaving *piEndBlockid unchanged. Otherwise, flush
4150** pWriter to %_segments, building a new layer of interior nodes, and
4151** recursively ask for their root into.
4152*/
4153static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
4154 char **ppRootInfo, int *pnRootInfo,
4155 sqlite_int64 *piEndBlockid){
4156 InteriorBlock *block = pWriter->first;
4157 sqlite_int64 iBlockid = 0;
4158 int rc;
4159
4160 /* If we can fit the segment inline */
4161 if( block==pWriter->last && block->data.nData<ROOT_MAX ){
4162 *ppRootInfo = block->data.pData;
4163 *pnRootInfo = block->data.nData;
4164 return SQLITE_OK;
4165 }
4166
4167 /* Flush the first block to %_segments, and create a new level of
4168 ** interior node.
4169 */
4170 ASSERT_VALID_INTERIOR_BLOCK(block);
4171 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4172 if( rc!=SQLITE_OK ) return rc;
4173 *piEndBlockid = iBlockid;
4174
4175 pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
4176 interiorWriterInit(pWriter->iHeight+1,
4177 block->term.pData, block->term.nData,
4178 iBlockid, pWriter->parentWriter);
4179
4180 /* Flush additional blocks and append to the higher interior
4181 ** node.
4182 */
4183 for(block=block->next; block!=NULL; block=block->next){
4184 ASSERT_VALID_INTERIOR_BLOCK(block);
4185 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4186 if( rc!=SQLITE_OK ) return rc;
4187 *piEndBlockid = iBlockid;
4188
4189 interiorWriterAppend(pWriter->parentWriter,
4190 block->term.pData, block->term.nData, iBlockid);
4191 }
4192
4193 /* Parent node gets the chance to be the root. */
4194 return interiorWriterRootInfo(v, pWriter->parentWriter,
4195 ppRootInfo, pnRootInfo, piEndBlockid);
4196}
4197
4198/****************************************************************/
4199/* InteriorReader is used to read off the data from an interior node
4200** (see comment at top of file for the format).
4201*/
4202typedef struct InteriorReader {
4203 const char *pData;
4204 int nData;
4205
4206 DataBuffer term; /* previous term, for decoding term delta. */
4207
4208 sqlite_int64 iBlockid;
4209} InteriorReader;
4210
4211static void interiorReaderDestroy(InteriorReader *pReader){
4212 dataBufferDestroy(&pReader->term);
4213 SCRAMBLE(pReader);
4214}
4215
/* TODO(shess) The assertions are great, but what if we're in NDEBUG
** and the blob is empty or otherwise contains suspect data?
*/
/* Initialize pReader over an encoded interior node: skip the height
** byte, decode the leftmost child blockid, then (if present) decode
** the first term into pReader->term.
*/
static void interiorReaderInit(const char *pData, int nData,
                               InteriorReader *pReader){
  int n, nTerm;

  /* Require at least the leading flag byte */
  assert( nData>0 );
  /* Interior nodes have a nonzero height byte (leaves lead with 0). */
  assert( pData[0]!='\0' );

  CLEAR(pReader);

  /* Decode the base blockid, and set the cursor to the first term. */
  n = getVarint(pData+1, &pReader->iBlockid);
  assert( 1+n<=nData );
  pReader->pData = pData+1+n;
  pReader->nData = nData-(1+n);

  /* A single-child interior node (such as when a leaf node was too
  ** large for the segment directory) won't have any terms.
  ** Otherwise, decode the first term.
  */
  if( pReader->nData==0 ){
    dataBufferInit(&pReader->term, 0);
  }else{
    n = getVarint32(pReader->pData, &nTerm);
    dataBufferInit(&pReader->term, nTerm);
    dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
    assert( n+nTerm<=pReader->nData );
    pReader->pData += n+nTerm;
    pReader->nData -= n+nTerm;
  }
}
4250
4251static int interiorReaderAtEnd(InteriorReader *pReader){
4252 return pReader->term.nData==0;
4253}
4254
4255static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
4256 return pReader->iBlockid;
4257}
4258
4259static int interiorReaderTermBytes(InteriorReader *pReader){
4260 assert( !interiorReaderAtEnd(pReader) );
4261 return pReader->term.nData;
4262}
4263static const char *interiorReaderTerm(InteriorReader *pReader){
4264 assert( !interiorReaderAtEnd(pReader) );
4265 return pReader->term.pData;
4266}
4267
4268/* Step forward to the next term in the node. */
4269static void interiorReaderStep(InteriorReader *pReader){
4270 assert( !interiorReaderAtEnd(pReader) );
4271
4272 /* If the last term has been read, signal eof, else construct the
4273 ** next term.
4274 */
4275 if( pReader->nData==0 ){
4276 dataBufferReset(&pReader->term);
4277 }else{
4278 int n, nPrefix, nSuffix;
4279
4280 n = getVarint32(pReader->pData, &nPrefix);
4281 n += getVarint32(pReader->pData+n, &nSuffix);
4282
4283 /* Truncate the current term and append suffix data. */
4284 pReader->term.nData = nPrefix;
4285 dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
4286
4287 assert( n+nSuffix<=pReader->nData );
4288 pReader->pData += n+nSuffix;
4289 pReader->nData -= n+nSuffix;
4290 }
4291 pReader->iBlockid++;
4292}
4293
4294/* Compare the current term to pTerm[nTerm], returning strcmp-style
4295** results. If isPrefix, equality means equal through nTerm bytes.
4296*/
4297static int interiorReaderTermCmp(InteriorReader *pReader,
4298 const char *pTerm, int nTerm, int isPrefix){
4299 const char *pReaderTerm = interiorReaderTerm(pReader);
4300 int nReaderTerm = interiorReaderTermBytes(pReader);
4301 int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
4302
4303 if( n==0 ){
4304 if( nReaderTerm>0 ) return -1;
4305 if( nTerm>0 ) return 1;
4306 return 0;
4307 }
4308
4309 c = memcmp(pReaderTerm, pTerm, n);
4310 if( c!=0 ) return c;
4311 if( isPrefix && n==nTerm ) return 0;
4312 return nReaderTerm - nTerm;
4313}
4314
4315/****************************************************************/
4316/* LeafWriter is used to collect terms and associated doclist data
4317** into leaf blocks in %_segments (see top of file for format info).
4318** Expected usage is:
4319**
4320** LeafWriter writer;
4321** leafWriterInit(0, 0, &writer);
4322** while( sorted_terms_left_to_process ){
4323** // data is doclist data for that term.
4324** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
4325** if( rc!=SQLITE_OK ) goto err;
4326** }
4327** rc = leafWriterFinalize(v, &writer);
4328**err:
4329** leafWriterDestroy(&writer);
4330** return rc;
4331**
4332** leafWriterStep() may write a collected leaf out to %_segments.
4333** leafWriterFinalize() finishes writing any buffered data and stores
4334** a root node in %_segdir. leafWriterDestroy() frees all buffers and
4335** InteriorWriters allocated as part of writing this segment.
4336**
4337** TODO(shess) Document leafWriterStepMerge().
4338*/
4339
4340/* Put terms with data this big in their own block. */
4341#define STANDALONE_MIN 1024
4342
4343/* Keep leaf blocks below this size. */
4344#define LEAF_MAX 2048
4345
/* LeafWriter state: accumulates one leaf node's worth of encoded
** term/doclist data, flushing to %_segments and to parentWriter when
** size limits are hit.
*/
typedef struct LeafWriter {
  int iLevel;                  /* segment level in %_segdir. */
  int idx;                     /* segment index within the level. */
  sqlite_int64 iStartBlockid;  /* needed to create the root info */
  sqlite_int64 iEndBlockid;    /* when we're done writing. */

  DataBuffer term;             /* previous encoded term */
  DataBuffer data;             /* encoding buffer */

  /* bytes of first term in the current node which distinguishes that
  ** term from the last term of the previous node.
  */
  int nTermDistinct;

  InteriorWriter parentWriter; /* if we overflow */
  int has_parent;              /* nonzero once parentWriter is live. */
} LeafWriter;
4363
4364static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
4365 CLEAR(pWriter);
4366 pWriter->iLevel = iLevel;
4367 pWriter->idx = idx;
4368
4369 dataBufferInit(&pWriter->term, 32);
4370
4371 /* Start out with a reasonably sized block, though it can grow. */
4372 dataBufferInit(&pWriter->data, LEAF_MAX);
4373}
4374
#ifndef NDEBUG
/* Verify that the data is readable as a leaf node.  Debug-only: walks
** the encoded node front to back, asserting that the header, leading
** term+doclist, and every delta-encoded term+doclist fit exactly.
*/
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = getVarint32(pData, &iDummy);
  assert( iDummy==0 );
  assert( n>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Leading term length and data must fit in buffer. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    /* varint(nPrefix) for the delta-encoded term. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;
    /* varint(nSuffix) plus suffix bytes. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Doclist length plus doclist bytes. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif
4442
/* Flush the current leaf node to %_segments, adding the resulting
** blockid and the starting term to the interior node which will
** contain it.  iData/nData select the slice of pWriter->data to write.
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  /* Only the distinguishing prefix of the first term is needed to
  ** route searches; truncate to it.
  */
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    /* First flush: create the interior level above the leaves. */
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes. */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    pWriter->iEndBlockid++;
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
4494static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
4495 int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
4496 if( rc!=SQLITE_OK ) return rc;
4497
4498 /* Re-initialize the output buffer. */
4499 dataBufferReset(&pWriter->data);
4500
4501 return SQLITE_OK;
4502}
4503
/* Fetch the root info for the segment.  If the entire leaf fits
** within ROOT_MAX, then it will be returned directly, otherwise it
** will be flushed and the root info will be returned from the
** interior node.  *piEndBlockid is set to the blockid of the last
** interior or leaf node written to disk (0 if none are written at
** all).
*/
static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                              char **ppRootInfo, int *pnRootInfo,
                              sqlite_int64 *piEndBlockid){
  /* we can fit the segment entirely inline */
  if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
    *ppRootInfo = pWriter->data.pData;
    *pnRootInfo = pWriter->data.nData;
    *piEndBlockid = 0;
    return SQLITE_OK;
  }

  /* Flush remaining leaf data. */
  if( pWriter->data.nData>0 ){
    int rc = leafWriterFlush(v, pWriter);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* We must have flushed a leaf at some point. */
  assert( pWriter->has_parent );

  /* Tentatively set the end leaf blockid as the end blockid.  If the
  ** interior node can be returned inline, this will be the final
  ** blockid, otherwise it will be overwritten by
  ** interiorWriterRootInfo().
  */
  *piEndBlockid = pWriter->iEndBlockid;

  return interiorWriterRootInfo(v, &pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
4541
4542/* Collect the rootInfo data and store it into the segment directory.
4543** This has the effect of flushing the segment's leaf data to
4544** %_segments, and also flushing any interior nodes to %_segments.
4545*/
4546static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
4547 sqlite_int64 iEndBlockid;
4548 char *pRootInfo;
4549 int rc, nRootInfo;
4550
4551 rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
4552 if( rc!=SQLITE_OK ) return rc;
4553
4554 /* Don't bother storing an entirely empty segment. */
4555 if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
4556
4557 return segdir_set(v, pWriter->iLevel, pWriter->idx,
4558 pWriter->iStartBlockid, pWriter->iEndBlockid,
4559 iEndBlockid, pRootInfo, nRootInfo);
4560}
4561
4562static void leafWriterDestroy(LeafWriter *pWriter){
4563 if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
4564 dataBufferDestroy(&pWriter->term);
4565 dataBufferDestroy(&pWriter->data);
4566}
4567
/* Encode a term into the leafWriter, delta-encoding as appropriate.
** Returns the length of the new term which distinguishes it from the
** previous term, which can be used to set nTermDistinct when a node
** boundary is crossed.
*/
static int leafWriterEncodeTerm(LeafWriter *pWriter,
                                const char *pTerm, int nTerm){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  assert( nTerm>0 );
  /* Count the prefix bytes shared with the previously-written term. */
  while( nPrefix<pWriter->term.nData &&
         pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
    nPrefix++;
    /* Failing this implies that the terms weren't in order. */
    assert( nPrefix<nTerm );
  }

  if( pWriter->data.nData==0 ){
    /* Encode the node header and leading term as:
    **  varint(0)
    **  varint(nTerm)
    **  char pTerm[nTerm]
    */
    n = putVarint(c, '\0');
    n += putVarint(c+n, nTerm);
    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  }else{
    /* Delta-encode the term as:
    **  varint(nPrefix)
    **  varint(nSuffix)
    **  char pTermSuffix[nSuffix]
    */
    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
    dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  }
  /* This term becomes the delta base for the next one. */
  dataBufferReplace(&pWriter->term, pTerm, nTerm);

  return nPrefix+1;
}
4609
/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer.  This constructs a node and term header before
** iDoclistData and flushes the resulting complete node using
** leafWriterInternalFlush().
*/
static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 int iDoclistData){
  char c[VARINT_MAX+VARINT_MAX];
  /* Header is varint(0) followed by varint(nTerm). */
  int iData, n = putVarint(c, 0);
  n += putVarint(c+n, nTerm);

  /* There should always be room for the header.  Even if pTerm shared
  ** a substantial prefix with the previous term, the entire prefix
  ** could be constructed from earlier data in the doclist, so there
  ** should be room.
  */
  assert( iDoclistData>=n+nTerm );

  /* Overwrite the bytes just before the doclist with header + term,
  ** forming a complete standalone leaf node in place.
  */
  iData = iDoclistData-(n+nTerm);
  memcpy(pWriter->data.pData+iData, c, n);
  memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);

  return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
}
4635
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.  The doclist is produced by merging nReaders readers;
** oversized results are flushed as standalone leaves, and the buffer
** is split when it exceeds LEAF_MAX.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = putVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded.  Additionally, the space required to
  ** encode this length could be smaller.  For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = putVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough to force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
4760
4761/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
4762** %_segments.
4763*/
4764/* TODO(shess) Revise writeZeroSegment() so that doclists are
4765** constructed directly in pWriter->data.
4766*/
4767static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
4768 const char *pTerm, int nTerm,
4769 const char *pData, int nData){
4770 int rc;
4771 DLReader reader;
4772
4773 dlrInit(&reader, DL_DEFAULT, pData, nData);
4774 rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
4775 dlrDestroy(&reader);
4776
4777 return rc;
4778}
4779
4780
4781/****************************************************************/
4782/* LeafReader is used to iterate over an individual leaf node. */
4783typedef struct LeafReader {
4784 DataBuffer term; /* copy of current term. */
4785
4786 const char *pData; /* data for current term. */
4787 int nData;
4788} LeafReader;
4789
4790static void leafReaderDestroy(LeafReader *pReader){
4791 dataBufferDestroy(&pReader->term);
4792 SCRAMBLE(pReader);
4793}
4794
4795static int leafReaderAtEnd(LeafReader *pReader){
4796 return pReader->nData<=0;
4797}
4798
4799/* Access the current term. */
4800static int leafReaderTermBytes(LeafReader *pReader){
4801 return pReader->term.nData;
4802}
4803static const char *leafReaderTerm(LeafReader *pReader){
4804 assert( pReader->term.nData>0 );
4805 return pReader->term.pData;
4806}
4807
4808/* Access the doclist data for the current term. */
4809static int leafReaderDataBytes(LeafReader *pReader){
4810 int nData;
4811 assert( pReader->term.nData>0 );
4812 getVarint32(pReader->pData, &nData);
4813 return nData;
4814}
4815static const char *leafReaderData(LeafReader *pReader){
4816 int n, nData;
4817 assert( pReader->term.nData>0 );
4818 n = getVarint32(pReader->pData, &nData);
4819 return pReader->pData+n;
4820}
4821
4822static void leafReaderInit(const char *pData, int nData,
4823 LeafReader *pReader){
4824 int nTerm, n;
4825
4826 assert( nData>0 );
4827 assert( pData[0]=='\0' );
4828
4829 CLEAR(pReader);
4830
4831 /* Read the first term, skipping the header byte. */
4832 n = getVarint32(pData+1, &nTerm);
4833 dataBufferInit(&pReader->term, nTerm);
4834 dataBufferReplace(&pReader->term, pData+1+n, nTerm);
4835
4836 /* Position after the first term. */
4837 assert( 1+n+nTerm<nData );
4838 pReader->pData = pData+1+n+nTerm;
4839 pReader->nData = nData-1-n-nTerm;
4840}
4841
/* Step the reader forward to the next term.  The caller must not be
** at end.  On return, pReader is either positioned on the next term
** or at end (leafReaderAtEnd() true).
*/
static void leafReaderStep(LeafReader *pReader){
  int n, nData, nPrefix, nSuffix;
  assert( !leafReaderAtEnd(pReader) );

  /* Skip previous entry's data block (varint length + payload). */
  n = getVarint32(pReader->pData, &nData);
  assert( n+nData<=pReader->nData );
  pReader->pData += n+nData;
  pReader->nData -= n+nData;

  if( !leafReaderAtEnd(pReader) ){
    /* Construct the new term using a prefix from the old term plus a
    ** suffix from the leaf data.  Truncating term.nData to nPrefix
    ** keeps the shared prefix in place; the suffix bytes are then
    ** appended after it.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);
    assert( n+nSuffix<pReader->nData );
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
}
4867
4868/* strcmp-style comparison of pReader's current term against pTerm.
4869** If isPrefix, equality means equal through nTerm bytes.
4870*/
4871static int leafReaderTermCmp(LeafReader *pReader,
4872 const char *pTerm, int nTerm, int isPrefix){
4873 int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
4874 if( n==0 ){
4875 if( pReader->term.nData>0 ) return -1;
4876 if(nTerm>0 ) return 1;
4877 return 0;
4878 }
4879
4880 c = memcmp(pReader->term.pData, pTerm, n);
4881 if( c!=0 ) return c;
4882 if( isPrefix && n==nTerm ) return 0;
4883 return pReader->term.nData - nTerm;
4884}
4885
4886
4887/****************************************************************/
4888/* LeavesReader wraps LeafReader to allow iterating over the entire
4889** leaf layer of the tree.
4890*/
typedef struct LeavesReader {
  int idx;                  /* Index within the segment. */

  sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from;
                            ** NULL when reading from rootData instead. */
  int eof;                  /* we've seen SQLITE_DONE from pStmt. */

  LeafReader leafReader;    /* reader for the current leaf. */
  DataBuffer rootData;      /* root data for inline (non-NULL pData
                            ** means the whole segment fit in the root). */
} LeavesReader;
4900
4901/* Access the current term. */
4902static int leavesReaderTermBytes(LeavesReader *pReader){
4903 assert( !pReader->eof );
4904 return leafReaderTermBytes(&pReader->leafReader);
4905}
4906static const char *leavesReaderTerm(LeavesReader *pReader){
4907 assert( !pReader->eof );
4908 return leafReaderTerm(&pReader->leafReader);
4909}
4910
4911/* Access the doclist data for the current term. */
4912static int leavesReaderDataBytes(LeavesReader *pReader){
4913 assert( !pReader->eof );
4914 return leafReaderDataBytes(&pReader->leafReader);
4915}
4916static const char *leavesReaderData(LeavesReader *pReader){
4917 assert( !pReader->eof );
4918 return leafReaderData(&pReader->leafReader);
4919}
4920
4921static int leavesReaderAtEnd(LeavesReader *pReader){
4922 return pReader->eof;
4923}
4924
4925/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
4926** leaving the statement handle open, which locks the table.
4927*/
4928/* TODO(shess) This "solution" is not satisfactory. Really, there
4929** should be check-in function for all statement handles which
4930** arranges to call sqlite3_reset(). This most likely will require
4931** modification to control flow all over the place, though, so for now
4932** just punt.
4933**
4934** Note the the current system assumes that segment merges will run to
4935** completion, which is why this particular probably hasn't arisen in
4936** this case. Probably a brittle assumption.
4937*/
4938static int leavesReaderReset(LeavesReader *pReader){
4939 return sqlite3_reset(pReader->pStmt);
4940}
4941
/* Tear down the nested leaf reader and the root-data buffer, then
** poison *pReader against reuse.  Does not finalize pStmt (cached
** statements are managed elsewhere; see leavesReaderReset()).
*/
static void leavesReaderDestroy(LeavesReader *pReader){
  leafReaderDestroy(&pReader->leafReader);
  dataBufferDestroy(&pReader->rootData);
  SCRAMBLE(pReader);
}
4947
/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
**
** Returns SQLITE_OK on success (including the empty case, which sets
** pReader->eof), or an error code from statement prep/bind/step.
**
** NOTE(review): the bind/step error paths return without resetting
** the statement obtained from sql_get_leaf_statement() -- presumably
** acceptable because statements are cached and reset elsewhere;
** confirm against the statement-cache code.
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    /* Position on the first leaf block; an empty range is a valid
    ** reader that is immediately at eof.
    */
    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
4992
/* Step the current leaf forward to the next term.  If we reach the
** end of the current leaf, step forward to the next leaf block.
** Returns SQLITE_OK (possibly having set pReader->eof) or an error
** from sqlite3_step().
*/
static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
  assert( !leavesReaderAtEnd(pReader) );
  leafReaderStep(&pReader->leafReader);

  if( leafReaderAtEnd(&pReader->leafReader) ){
    int rc;
    /* Non-NULL rootData.pData means the whole segment was inline in
    ** the root, so there are no further blocks to stream.
    */
    if( pReader->rootData.pData ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_step(pReader->pStmt);
    if( rc!=SQLITE_ROW ){
      pReader->eof = 1;
      return rc==SQLITE_DONE ? SQLITE_OK : rc;
    }
    /* Re-seat the leaf reader on the freshly fetched block. */
    leafReaderDestroy(&pReader->leafReader);
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
5018
5019/* Order LeavesReaders by their term, ignoring idx. Readers at eof
5020** always sort to the end.
5021*/
5022static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
5023 if( leavesReaderAtEnd(lr1) ){
5024 if( leavesReaderAtEnd(lr2) ) return 0;
5025 return 1;
5026 }
5027 if( leavesReaderAtEnd(lr2) ) return -1;
5028
5029 return leafReaderTermCmp(&lr1->leafReader,
5030 leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
5031 0);
5032}
5033
5034/* Similar to leavesReaderTermCmp(), with additional ordering by idx
5035** so that older segments sort before newer segments.
5036*/
5037static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
5038 int c = leavesReaderTermCmp(lr1, lr2);
5039 if( c!=0 ) return c;
5040 return lr1->idx-lr2->idx;
5041}
5042
5043/* Assume that pLr[1]..pLr[nLr] are sorted. Bubble pLr[0] into its
5044** sorted position.
5045*/
5046static void leavesReaderReorder(LeavesReader *pLr, int nLr){
5047 while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
5048 LeavesReader tmp = pLr[0];
5049 pLr[0] = pLr[1];
5050 pLr[1] = tmp;
5051 nLr--;
5052 pLr++;
5053 }
5054}
5055
/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order (by term, then by age).  On error, any readers initialized so
** far are destroyed before returning.
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  /* One reader per %_segdir row at this level. */
  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  /* Anything other than SQLITE_DONE is a failure; unwind the readers
  ** created so far.
  */
  if( rc!=SQLITE_DONE ){
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age.  Bubbling from the
  ** back establishes the full sort because each pass inserts one
  ** element into an already-sorted suffix.
  */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}
5098
/* Merge doclists from pReaders[nReaders] into a single doclist, which
** is written to pWriter.  Assumes pReaders is ordered oldest to
** newest, and that all nReaders readers are positioned on the same
** term (the caller establishes this in segmentMerge()).
*/
/* TODO(shess) Consider putting this inline in segmentMerge(). */
static int leavesReadersMerge(fulltext_vtab *v,
                              LeavesReader *pReaders, int nReaders,
                              LeafWriter *pWriter){
  DLReader dlReaders[MERGE_COUNT];
  const char *pTerm = leavesReaderTerm(pReaders);
  int i, nTerm = leavesReaderTermBytes(pReaders);

  assert( nReaders<=MERGE_COUNT );

  /* Wrap each reader's doclist for the shared term.  NOTE(review):
  ** dlrDestroy() is never called on these readers -- presumably
  ** DLReader does not own the underlying buffer, so nothing leaks;
  ** confirm against dlrInit()/dlrDestroy().
  */
  for(i=0; i<nReaders; i++){
    dlrInit(&dlReaders[i], DL_DEFAULT,
            leavesReaderData(pReaders+i),
            leavesReaderDataBytes(pReaders+i));
  }

  return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
}
5121
5122/* Forward ref due to mutual recursion with segdirNextIndex(). */
5123static int segmentMerge(fulltext_vtab *v, int iLevel);
5124
5125/* Put the next available index at iLevel into *pidx. If iLevel
5126** already has MERGE_COUNT segments, they are merged to a higher
5127** level to make room.
5128*/
5129static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
5130 int rc = segdir_max_index(v, iLevel, pidx);
5131 if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
5132 *pidx = 0;
5133 }else if( rc==SQLITE_ROW ){
5134 if( *pidx==(MERGE_COUNT-1) ){
5135 rc = segmentMerge(v, iLevel);
5136 if( rc!=SQLITE_OK ) return rc;
5137 *pidx = 0;
5138 }else{
5139 (*pidx)++;
5140 }
5141 }else{
5142 return rc;
5143 }
5144 return SQLITE_OK;
5145}
5146
/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged to make room (via segdirNextIndex(), which recurses back
** into segmentMerge()).  On success the source segments' directory
** entries are deleted.
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term.  lrs is
    ** sorted, so equal terms are adjacent starting at lrs[0].
    */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged.  Re-sorting from the back
    ** keeps the suffix sorted as each stepped reader is re-inserted.
    */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}
5214
/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  assert( nTerm>0 );

  /* Process while the prefix matches. */
  while( !leavesReaderAtEnd(pReader) ){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int rc;
    /* c<0: still before the term; c==0: a match; c>0: past it. */
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int nData = leavesReaderDataBytes(pReader);
      if( out->nData==0 ){
        /* First match; just take the doclist wholesale. */
        dataBufferReplace(out, pData, nData);
      }else{
        /* Union the new doclist into the accumulator. */
        DataBuffer result;
        dataBufferInit(&result, out->nData+nData);
        docListUnion(out->pData, out->nData, pData, nData, &result);
        dataBufferDestroy(out);
        *out = result;
        /* TODO(shess) Rather than destroy out, we could retain it for
        ** later reuse.
        */
      }
    }
    if( c>0 ) break;      /* Past any possible matches. */

    rc = leavesReaderStep(v, pReader);
    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}
5256
5257/* Call loadSegmentLeavesInt() with pData/nData as input. */
5258static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
5259 const char *pTerm, int nTerm, int isPrefix,
5260 DataBuffer *out){
5261 LeavesReader reader;
5262 int rc;
5263
5264 assert( nData>1 );
5265 assert( *pData=='\0' );
5266 rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
5267 if( rc!=SQLITE_OK ) return rc;
5268
5269 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5270 leavesReaderReset(&reader);
5271 leavesReaderDestroy(&reader);
5272 return rc;
5273}
5274
5275/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
5276** iEndLeaf (inclusive) as input, and merge the resulting doclist into
5277** out.
5278*/
5279static int loadSegmentLeaves(fulltext_vtab *v,
5280 sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
5281 const char *pTerm, int nTerm, int isPrefix,
5282 DataBuffer *out){
5283 int rc;
5284 LeavesReader reader;
5285
5286 assert( iStartLeaf<=iEndLeaf );
5287 rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
5288 if( rc!=SQLITE_OK ) return rc;
5289
5290 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5291 leavesReaderReset(&reader);
5292 leavesReaderDestroy(&reader);
5293 return rc;
5294}
5295
5296/* Taking pData/nData as an interior node, find the sequence of child
5297** nodes which could include pTerm/nTerm/isPrefix. Note that the
5298** interior node terms logically come between the blocks, so there is
5299** one more blockid than there are terms (that block contains terms >=
5300** the last interior-node term).
5301*/
5302/* TODO(shess) The calling code may already know that the end child is
5303** not worth calculating, because the end may be in a later sibling
5304** node. Consider whether breaking symmetry is worthwhile. I suspect
5305** it's not worthwhile.
5306*/
5307static void getChildrenContaining(const char *pData, int nData,
5308 const char *pTerm, int nTerm, int isPrefix,
5309 sqlite_int64 *piStartChild,
5310 sqlite_int64 *piEndChild){
5311 InteriorReader reader;
5312
5313 assert( nData>1 );
5314 assert( *pData!='\0' );
5315 interiorReaderInit(pData, nData, &reader);
5316
5317 /* Scan for the first child which could contain pTerm/nTerm. */
5318 while( !interiorReaderAtEnd(&reader) ){
5319 if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
5320 interiorReaderStep(&reader);
5321 }
5322 *piStartChild = interiorReaderCurrentBlockid(&reader);
5323
5324 /* Keep scanning to find a term greater than our term, using prefix
5325 ** comparison if indicated. If isPrefix is false, this will be the
5326 ** same blockid as the starting block.
5327 */
5328 while( !interiorReaderAtEnd(&reader) ){
5329 if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
5330 interiorReaderStep(&reader);
5331 }
5332 *piEndChild = interiorReaderCurrentBlockid(&reader);
5333
5334 interiorReaderDestroy(&reader);
5335
5336 /* Children must ascend, and if !prefix, both must be the same. */
5337 assert( *piEndChild>=*piStartChild );
5338 assert( isPrefix || *piStartChild==*piEndChild );
5339}
5340
/* Read block at iBlockid and pass it with other params to
** getChildrenContaining().  Returns SQLITE_OK on success,
** SQLITE_ERROR if the block is missing or the query unexpectedly
** yields more than one row, or another sqlite error code.
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  /* A missing block is a corruption-style error, not "no data". */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
  if( rc!=SQLITE_ROW ) return rc;

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain
   * locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}
5381
/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf (leading '\0' marks a leaf node,
  ** see leafReaderInit()).
  */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    ** Blockids above iLeavesEnd are interior nodes; at or below are
    ** leaves.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too.  Only its end
      ** blockid matters; the start comes from the start branch.
      */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}
5428
/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge its doclist over *out (any duplicate doclists read from the
** segment rooted at pData will overwrite those in *out).
*/
/* TODO(shess) Consider changing this to determine the depth of the
** leaves using either the first characters of interior nodes (when
** ==1, we're one level above the leaves), or the first character of
** the root (which will describe the height of the tree directly).
** Either feels somewhat tricky to me.
*/
/* TODO(shess) The current merge is likely to be slow for large
** doclists (though it should process from newest/smallest to
** oldest/largest, so it may not be that bad).  It might be useful to
** modify things to allow for N-way merging.  This could either be
** within a segment, with pairwise merges across segments, or across
** all segments at once.
*/
static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
                       DataBuffer *out){
  DataBuffer result;
  int rc;

  assert( nData>1 );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&result, 0);
  rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                      pTerm, nTerm, isPrefix, &result);
  if( rc==SQLITE_OK && result.nData>0 ){
    if( out->nData==0 ){
      /* Nothing accumulated yet; swap buffers instead of copying. */
      DataBuffer tmp = *out;
      *out = result;
      result = tmp;
    }else{
      /* Merge this segment's doclist over the accumulator; readers[1]
      ** (this segment, newer) takes precedence in docListMerge().
      */
      DataBuffer merged;
      DLReader readers[2];

      dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
      dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
      dataBufferInit(&merged, out->nData+result.nData);
      docListMerge(&merged, readers, 2);
      dataBufferDestroy(out);
      *out = merged;
      dlrDestroy(&readers[0]);
      dlrDestroy(&readers[1]);
    }
  }
  dataBufferDestroy(&result);
  return rc;
}
5483
/* Scan the database and merge together the posting lists for the term
** into *out.  iColumn restricts the result to a single column
** (iColumn==v->nColumn is mapped to -1, i.e. no restriction --
** presumably the magic "all columns" index; TODO confirm against the
** schema code).
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out){
  DataBuffer doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&doclist, 0);

  /* Traverse the segments from oldest to newest so that newer doclist
  ** elements for given docids overwrite older elements.
  */
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    const char *pData = sqlite3_column_blob(s, 0);
    const int nData = sqlite3_column_bytes(s, 0);
    const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
    rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                     &doclist);
    if( rc!=SQLITE_OK ) goto err;
  }
  if( rc==SQLITE_DONE ){
    if( doclist.nData!=0 ){
      /* TODO(shess) The old term_select_all() code applied the column
      ** restrict as we merged segments, leading to smaller buffers.
      ** This is probably worthwhile to bring back, once the new storage
      ** system is checked in.
      */
      if( iColumn==v->nColumn) iColumn = -1;
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  iColumn, iType, out);
    }
    rc = SQLITE_OK;
  }

  /* NOTE(review): on error we fall through without resetting s --
  ** presumably acceptable because statements are cached and reset
  ** elsewhere; confirm against sql_get_statement().
  */
 err:
  dataBufferDestroy(&doclist);
  return rc;
}
5529
5530/****************************************************************/
5531/* Used to hold hashtable data for sorting. */
5532typedef struct TermData {
5533 const char *pTerm;
5534 int nTerm;
5535 DLCollector *pCollector;
5536} TermData;
5537
5538/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
5539** for equal, >0 for greater-than).
5540*/
5541static int termDataCmp(const void *av, const void *bv){
5542 const TermData *a = (const TermData *)av;
5543 const TermData *b = (const TermData *)bv;
5544 int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
5545 int c = memcmp(a->pTerm, b->pTerm, n);
5546 if( c!=0 ) return c;
5547 return a->nTerm-b->nTerm;
5548}
5549
5550/* Order pTerms data by term, then write a new level 0 segment using
5551** LeafWriter.
5552*/
5553static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
5554 fts2HashElem *e;
5555 int idx, rc, i, n;
5556 TermData *pData;
5557 LeafWriter writer;
5558 DataBuffer dl;
5559
5560 /* Determine the next index at level 0, merging as necessary. */
5561 rc = segdirNextIndex(v, 0, &idx);
5562 if( rc!=SQLITE_OK ) return rc;
5563
5564 n = fts2HashCount(pTerms);
5565 pData = malloc(n*sizeof(TermData));
5566
5567 for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
5568 assert( i<n );
5569 pData[i].pTerm = fts2HashKey(e);
5570 pData[i].nTerm = fts2HashKeysize(e);
5571 pData[i].pCollector = fts2HashData(e);
5572 }
5573 assert( i==n );
5574
5575 /* TODO(shess) Should we allow user-defined collation sequences,
5576 ** here? I think we only need that once we support prefix searches.
5577 */
5578 if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
5579
5580 /* TODO(shess) Refactor so that we can write directly to the segment
5581 ** DataBuffer, as happens for segment merges.
5582 */
5583 leafWriterInit(0, idx, &writer);
5584 dataBufferInit(&dl, 0);
5585 for(i=0; i<n; i++){
5586 dataBufferReset(&dl);
5587 dlcAddDoclist(pData[i].pCollector, &dl);
5588 rc = leafWriterStep(v, &writer,
5589 pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
5590 if( rc!=SQLITE_OK ) goto err;
5591 }
5592 rc = leafWriterFinalize(v, &writer);
5593
5594 err:
5595 dataBufferDestroy(&dl);
5596 free(pData);
5597 leafWriterDestroy(&writer);
5598 return rc;
5599}
5600
5601/* If pendingTerms has data, free it. */
5602static int clearPendingTerms(fulltext_vtab *v){
5603 if( v->nPendingData>=0 ){
5604 fts2HashElem *e;
5605 for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){
5606 dlcDelete(fts2HashData(e));
5607 }
5608 fts2HashClear(&v->pendingTerms);
5609 v->nPendingData = -1;
5610 }
5611 return SQLITE_OK;
5612}
5613
5614/* If pendingTerms has data, flush it to a level-zero segment, and
5615** free it.
5616*/
5617static int flushPendingTerms(fulltext_vtab *v){
5618 if( v->nPendingData>=0 ){
5619 int rc = writeZeroSegment(v, &v->pendingTerms);
5620 if( rc==SQLITE_OK ) clearPendingTerms(v);
5621 return rc;
5622 }
5623 return SQLITE_OK;
5624}
5625
5626/* If pendingTerms is "too big", or docid is out of order, flush it.
5627** Regardless, be certain that pendingTerms is initialized for use.
5628*/
5629static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
5630 /* TODO(shess) Explore whether partially flushing the buffer on
5631 ** forced-flush would provide better performance. I suspect that if
5632 ** we ordered the doclists by size and flushed the largest until the
5633 ** buffer was half empty, that would let the less frequent terms
5634 ** generate longer doclists.
5635 */
5636 if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
5637 int rc = flushPendingTerms(v);
5638 if( rc!=SQLITE_OK ) return rc;
5639 }
5640 if( v->nPendingData<0 ){
5641 fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1);
5642 v->nPendingData = 0;
5643 }
5644 v->iPrevDocid = iDocid;
5645 return SQLITE_OK;
5646}
5647
/* This function implements the xUpdate callback; it's the top-level entry
 * point for inserting, deleting or updating a row in a full-text table.
 * nArg<2 is a pure delete; otherwise ppArg[0] distinguishes update
 * (non-NULL old rowid) from insert (NULL). */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                          sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  int rc;

  TRACE(("FTS2 Update %p\n", pVtab));

  if( nArg<2 ){
    /* A delete: ppArg[0] = rowid to delete. */
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
      sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    } else {
      assert( nArg==2+v->nColumn+1);
      rc = index_update(v, rowid, &ppArg[2]);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    assert( nArg==2+v->nColumn+1);
    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid);
  }

  return rc;
}
5686
5687static int fulltextSync(sqlite3_vtab *pVtab){
5688 TRACE(("FTS2 xSync()\n"));
5689 return flushPendingTerms((fulltext_vtab *)pVtab);
5690}
5691
5692static int fulltextBegin(sqlite3_vtab *pVtab){
5693 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5694 TRACE(("FTS2 xBegin()\n"));
5695
5696 /* Any buffered updates should have been cleared by the previous
5697 ** transaction.
5698 */
5699 assert( v->nPendingData<0 );
5700 return clearPendingTerms(v);
5701}
5702
5703static int fulltextCommit(sqlite3_vtab *pVtab){
5704 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5705 TRACE(("FTS2 xCommit()\n"));
5706
5707 /* Buffered updates should have been cleared by fulltextSync(). */
5708 assert( v->nPendingData<0 );
5709 return clearPendingTerms(v);
5710}
5711
5712static int fulltextRollback(sqlite3_vtab *pVtab){
5713 TRACE(("FTS2 xRollback()\n"));
5714 return clearPendingTerms((fulltext_vtab *)pVtab);
5715}
5716
5717/*
5718** Implementation of the snippet() function for FTS2
5719*/
5720static void snippetFunc(
5721 sqlite3_context *pContext,
5722 int argc,
5723 sqlite3_value **argv
5724){
5725 fulltext_cursor *pCursor;
5726 if( argc<1 ) return;
5727 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5728 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5729 sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
5730 }else{
5731 const char *zStart = "<b>";
5732 const char *zEnd = "</b>";
5733 const char *zEllipsis = "<b>...</b>";
5734 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5735 if( argc>=2 ){
5736 zStart = (const char*)sqlite3_value_text(argv[1]);
5737 if( argc>=3 ){
5738 zEnd = (const char*)sqlite3_value_text(argv[2]);
5739 if( argc>=4 ){
5740 zEllipsis = (const char*)sqlite3_value_text(argv[3]);
5741 }
5742 }
5743 }
5744 snippetAllOffsets(pCursor);
5745 snippetText(pCursor, zStart, zEnd, zEllipsis);
5746 sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
5747 pCursor->snippet.nSnippet, SQLITE_STATIC);
5748 }
5749}
5750
5751/*
5752** Implementation of the offsets() function for FTS2
5753*/
5754static void snippetOffsetsFunc(
5755 sqlite3_context *pContext,
5756 int argc,
5757 sqlite3_value **argv
5758){
5759 fulltext_cursor *pCursor;
5760 if( argc<1 ) return;
5761 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5762 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5763 sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
5764 }else{
5765 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5766 snippetAllOffsets(pCursor);
5767 snippetOffsetText(&pCursor->snippet);
5768 sqlite3_result_text(pContext,
5769 pCursor->snippet.zOffset, pCursor->snippet.nOffset,
5770 SQLITE_STATIC);
5771 }
5772}
5773
5774/*
5775** This routine implements the xFindFunction method for the FTS2
5776** virtual table.
5777*/
5778static int fulltextFindFunction(
5779 sqlite3_vtab *pVtab,
5780 int nArg,
5781 const char *zName,
5782 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
5783 void **ppArg
5784){
5785 if( strcmp(zName,"snippet")==0 ){
5786 *pxFunc = snippetFunc;
5787 return 1;
5788 }else if( strcmp(zName,"offsets")==0 ){
5789 *pxFunc = snippetOffsetsFunc;
5790 return 1;
5791 }
5792 return 0;
5793}
5794
5795/*
5796** Rename an fts2 table.
5797*/
5798static int fulltextRename(
5799 sqlite3_vtab *pVtab,
5800 const char *zName
5801){
5802 fulltext_vtab *p = (fulltext_vtab *)pVtab;
5803 int rc = SQLITE_NOMEM;
5804 char *zSql = sqlite3_mprintf(
5805 "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
5806 "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
5807 "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';"
5808 , p->zDb, p->zName, zName
5809 , p->zDb, p->zName, zName
5810 , p->zDb, p->zName, zName
5811 );
5812 if( zSql ){
5813 rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
5814 sqlite3_free(zSql);
5815 }
5816 return rc;
5817}
5818
/*
** The virtual-table method dispatch table for fts2.  A pointer to this
** structure is passed to sqlite3_create_module_v2() in sqlite3Fts2Init()
** below.  xFindFunction overloads the snippet() and offsets() SQL
** functions when they are applied to an fts2 table.
*/
static const sqlite3_module fts2Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename       */ fulltextRename,
};
5841
5842static void hashDestroy(void *p){
5843 fts2Hash *pHash = (fts2Hash *)p;
5844 sqlite3Fts2HashClear(pHash);
5845 sqlite3_free(pHash);
5846}
5847
5848/*
5849** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
5850** in files fts2_tokenizer1.c and fts2_porter.c respectively. The following
5851** two forward declarations are for functions declared in these files
5852** used to retrieve the respective implementations.
5853**
5854** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
5856** Function ...PorterTokenizerModule() sets *pModule to point to the
5857** porter tokenizer/stemmer implementation.
5858*/
5859void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5860void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5861void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5862
5863int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
5864
5865/*
5866** Initialise the fts2 extension. If this extension is built as part
5867** of the sqlite library, then this function is called directly by
5868** SQLite. If fts2 is built as a dynamically loadable extension, this
5869** function is called by the sqlite3_extension_init() entry point.
5870*/
int sqlite3Fts2Init(sqlite3 *db){
  int rc = SQLITE_OK;
  fts2Hash *pHash = 0;
  const sqlite3_tokenizer_module *pSimple = 0;
  const sqlite3_tokenizer_module *pPorter = 0;
  const sqlite3_tokenizer_module *pIcu = 0;

  /* Fetch pointers to the built-in tokenizer implementations.  The ICU
  ** tokenizer is only compiled in when SQLITE_ENABLE_ICU is defined, so
  ** pIcu may legitimately remain 0. */
  sqlite3Fts2SimpleTokenizerModule(&pSimple);
  sqlite3Fts2PorterTokenizerModule(&pPorter);
#ifdef SQLITE_ENABLE_ICU
  sqlite3Fts2IcuTokenizerModule(&pIcu);
#endif

  /* Allocate and initialise the hash-table used to store tokenizers. */
  pHash = sqlite3_malloc(sizeof(fts2Hash));
  if( !pHash ){
    rc = SQLITE_NOMEM;
  }else{
    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
  }

  /* Load the built-in tokenizers into the hash table.  Each key is fresh,
  ** so a non-zero return from sqlite3Fts2HashInsert() here is treated as
  ** a malloc failure. */
  if( rc==SQLITE_OK ){
    if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
     || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter)
     || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
    ){
      rc = SQLITE_NOMEM;
    }
  }

  /* Create the virtual table wrapper around the hash-table and overload
  ** the two scalar functions. If this is successful, register the
  ** module with sqlite.  On success, ownership of pHash passes to the
  ** module (hashDestroy frees it when the module is unloaded).
  */
  if( SQLITE_OK==rc
   && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
  ){
    return sqlite3_create_module_v2(
        db, "fts2", &fts2Module, (void *)pHash, hashDestroy
    );
  }

  /* An error has occurred. Delete the hash table and return the error code. */
  assert( rc!=SQLITE_OK );
  if( pHash ){
    sqlite3Fts2HashClear(pHash);
    sqlite3_free(pHash);
  }
  return rc;
}
5924
5925#if !SQLITE_CORE
5926int sqlite3_extension_init(
5927 sqlite3 *db,
5928 char **pzErrMsg,
5929 const sqlite3_api_routines *pApi
5930){
5931 SQLITE_EXTENSION_INIT2(pApi)
5932 return sqlite3Fts2Init(db);
5933}
5934#endif
5935
5936#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h
deleted file mode 100644
index 4da4c38..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This header file is used by programs that want to link against the
14** FTS2 library. All it does is declare the sqlite3Fts2Init() interface.
15*/
16#include "sqlite3.h"
17
18#ifdef __cplusplus
19extern "C" {
20#endif /* __cplusplus */
21
22int sqlite3Fts2Init(sqlite3 *db);
23
24#ifdef __cplusplus
25} /* extern "C" */
26#endif /* __cplusplus */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c
deleted file mode 100644
index fcd5cc2..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c
+++ /dev/null
@@ -1,369 +0,0 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the implementation of generic hash-tables used in SQLite.
13** We've modified it slightly to serve as a standalone hash table
14** implementation for the full-text indexing module.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS2 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS2 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
27
28#include <assert.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include "fts2_hash.h"
33
/*
** Allocation wrapper installed as fts2Hash.xMalloc: return n bytes of
** zero-filled memory, or NULL if the allocation fails.
*/
static void *malloc_and_zero(int n){
  return calloc(1, n);
}
41
42/* Turn bulk memory into a hash table object by initializing the
43** fields of the Hash structure.
44**
45** "pNew" is a pointer to the hash table that is to be initialized.
46** keyClass is one of the constants
47** FTS2_HASH_BINARY or FTS2_HASH_STRING. The value of keyClass
48** determines what kind of key the hash table will use. "copyKey" is
49** true if the hash table should make its own private copy of keys and
50** false if it should just use the supplied pointer.
51*/
52void sqlite3Fts2HashInit(fts2Hash *pNew, int keyClass, int copyKey){
53 assert( pNew!=0 );
54 assert( keyClass>=FTS2_HASH_STRING && keyClass<=FTS2_HASH_BINARY );
55 pNew->keyClass = keyClass;
56 pNew->copyKey = copyKey;
57 pNew->first = 0;
58 pNew->count = 0;
59 pNew->htsize = 0;
60 pNew->ht = 0;
61 pNew->xMalloc = malloc_and_zero;
62 pNew->xFree = free;
63}
64
65/* Remove all entries from a hash table. Reclaim all memory.
66** Call this routine to delete a hash table or to reset a hash table
67** to the empty state.
68*/
69void sqlite3Fts2HashClear(fts2Hash *pH){
70 fts2HashElem *elem; /* For looping over all elements of the table */
71
72 assert( pH!=0 );
73 elem = pH->first;
74 pH->first = 0;
75 if( pH->ht ) pH->xFree(pH->ht);
76 pH->ht = 0;
77 pH->htsize = 0;
78 while( elem ){
79 fts2HashElem *next_elem = elem->next;
80 if( pH->copyKey && elem->pKey ){
81 pH->xFree(elem->pKey);
82 }
83 pH->xFree(elem);
84 elem = next_elem;
85 }
86 pH->count = 0;
87}
88
89/*
90** Hash and comparison functions when the mode is FTS2_HASH_STRING
91*/
static int strHash(const void *pKey, int nKey){
  const char *z = (const char *)pKey;
  int h = 0;
  int i;
  /* A non-positive nKey means "use the nul-terminated length". */
  if( nKey<=0 ) nKey = (int) strlen(z);
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ z[i];
  }
  return h & 0x7fffffff;
}
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  /* Keys of different lengths can never be equal. */
  return (n1==n2) ? strncmp((const char*)pKey1,(const char*)pKey2,n1) : 1;
}
106
107/*
108** Hash and comparison functions when the mode is FTS2_HASH_BINARY
109*/
static int binHash(const void *pKey, int nKey){
  const char *z = (const char *)pKey;
  int h = 0;
  int i;
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ z[i];
  }
  return h & 0x7fffffff;
}
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  /* Keys of different lengths can never be equal. */
  return (n1==n2) ? memcmp(pKey1,pKey2,n1) : 1;
}
122
123/*
124** Return a pointer to the appropriate hash function given the key class.
125**
126** The C syntax in this function definition may be unfamilar to some
127** programmers, so we provide the following additional explanation:
128**
129** The name of the function is "hashFunction". The function takes a
130** single parameter "keyClass". The return value of hashFunction()
131** is a pointer to another function. Specifically, the return value
132** of hashFunction() is a pointer to a function that takes two parameters
133** with types "const void*" and "int" and returns an "int".
134*/
135static int (*hashFunction(int keyClass))(const void*,int){
136 if( keyClass==FTS2_HASH_STRING ){
137 return &strHash;
138 }else{
139 assert( keyClass==FTS2_HASH_BINARY );
140 return &binHash;
141 }
142}
143
144/*
145** Return a pointer to the appropriate hash function given the key class.
146**
** For help in interpreting the obscure C code in the function definition,
148** see the header comment on the previous function.
149*/
150static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
151 if( keyClass==FTS2_HASH_STRING ){
152 return &strCompare;
153 }else{
154 assert( keyClass==FTS2_HASH_BINARY );
155 return &binCompare;
156 }
157}
158
/* Link an element into the hash table.
**
** pNew is added to bucket pEntry and also spliced into the table-wide
** doubly-linked element list: if the bucket already has a head element,
** pNew is inserted into the list directly before it (keeping all of a
** bucket's elements contiguous in the list); otherwise pNew becomes the
** new head of the list.  The bucket's chain pointer always ends up
** pointing at pNew.
*/
static void insertElement(
  fts2Hash *pH,            /* The complete hash table */
  struct _fts2ht *pEntry,  /* The entry into which pNew is inserted */
  fts2HashElem *pNew       /* The element to be inserted */
){
  fts2HashElem *pHead;     /* First element already in pEntry */
  pHead = pEntry->chain;
  if( pHead ){
    /* Splice pNew into the global list just before pHead. */
    pNew->next = pHead;
    pNew->prev = pHead->prev;
    if( pHead->prev ){ pHead->prev->next = pNew; }
    else             { pH->first = pNew; }
    pHead->prev = pNew;
  }else{
    /* Bucket was empty: push pNew onto the front of the global list. */
    pNew->next = pH->first;
    if( pH->first ){ pH->first->prev = pNew; }
    pNew->prev = 0;
    pH->first = pNew;
  }
  pEntry->count++;
  pEntry->chain = pNew;
}
183
184
/* Resize the hash table so that it contains "new_size" buckets.
** "new_size" must be a power of 2.  The resize silently fails (leaving
** the table exactly as it was) if pH->xMalloc() cannot provide the new
** bucket array.
*/
static void rehash(fts2Hash *pH, int new_size){
  struct _fts2ht *new_ht;          /* The new hash table */
  fts2HashElem *elem, *next_elem;  /* For looping over existing elements */
  int (*xHash)(const void*,int);   /* The hash function */

  assert( (new_size & (new_size-1))==0 );   /* power-of-2 check */
  /* xMalloc (malloc_and_zero) returns zeroed memory, so every new
  ** bucket starts out with count==0 and chain==NULL. */
  new_ht = (struct _fts2ht *)pH->xMalloc( new_size*sizeof(struct _fts2ht) );
  if( new_ht==0 ) return;
  if( pH->ht ) pH->xFree(pH->ht);
  pH->ht = new_ht;
  pH->htsize = new_size;
  xHash = hashFunction(pH->keyClass);
  /* Re-bucket every element.  insertElement() rebuilds the global
  ** element list, so it is emptied (pH->first=0) before the loop. */
  for(elem=pH->first, pH->first=0; elem; elem = next_elem){
    int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
    next_elem = elem->next;
    insertElement(pH, &new_ht[h], elem);
  }
}
207
208/* This function (for internal use only) locates an element in an
209** hash table that matches the given key. The hash for this key has
210** already been computed and is passed as the 4th parameter.
211*/
212static fts2HashElem *findElementGivenHash(
213 const fts2Hash *pH, /* The pH to be searched */
214 const void *pKey, /* The key we are searching for */
215 int nKey,
216 int h /* The hash for this key. */
217){
218 fts2HashElem *elem; /* Used to loop thru the element list */
219 int count; /* Number of elements left to test */
220 int (*xCompare)(const void*,int,const void*,int); /* comparison function */
221
222 if( pH->ht ){
223 struct _fts2ht *pEntry = &pH->ht[h];
224 elem = pEntry->chain;
225 count = pEntry->count;
226 xCompare = compareFunction(pH->keyClass);
227 while( count-- && elem ){
228 if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
229 return elem;
230 }
231 elem = elem->next;
232 }
233 }
234 return 0;
235}
236
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.
*/
static void removeElementGivenHash(
  fts2Hash *pH,         /* The pH containing "elem" */
  fts2HashElem* elem,   /* The element to be removed from the pH */
  int h                 /* Hash value for the element */
){
  struct _fts2ht *pEntry;
  /* Unlink elem from the table-wide doubly-linked element list. */
  if( elem->prev ){
    elem->prev->next = elem->next;
  }else{
    pH->first = elem->next;
  }
  if( elem->next ){
    elem->next->prev = elem->prev;
  }
  /* Fix up the bucket.  If elem was the chain head, the next element in
  ** the global list is the bucket's next member (members of a bucket
  ** are kept contiguous by insertElement()). */
  pEntry = &pH->ht[h];
  if( pEntry->chain==elem ){
    pEntry->chain = elem->next;
  }
  pEntry->count--;
  if( pEntry->count<=0 ){
    pEntry->chain = 0;
  }
  if( pH->copyKey && elem->pKey ){
    pH->xFree(elem->pKey);   /* Key was privately copied on insert */
  }
  pH->xFree( elem );
  pH->count--;
  if( pH->count<=0 ){
    /* The table is now empty: release the bucket array as well, so an
    ** empty table holds no memory at all. */
    assert( pH->first==0 );
    assert( pH->count==0 );
    fts2HashClear(pH);
  }
}
273
274/* Attempt to locate an element of the hash table pH with a key
275** that matches pKey,nKey. Return the data for this element if it is
276** found, or NULL if there is no match.
277*/
278void *sqlite3Fts2HashFind(const fts2Hash *pH, const void *pKey, int nKey){
279 int h; /* A hash on key */
280 fts2HashElem *elem; /* The element that matches key */
281 int (*xHash)(const void*,int); /* The hash function */
282
283 if( pH==0 || pH->ht==0 ) return 0;
284 xHash = hashFunction(pH->keyClass);
285 assert( xHash!=0 );
286 h = (*xHash)(pKey,nKey);
287 assert( (pH->htsize & (pH->htsize-1))==0 );
288 elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
289 return elem ? elem->data : 0;
290}
291
/* Insert an element into the hash table pH.  The key is pKey,nKey
** and the data is "data".
**
** If no element exists with a matching key, then a new
** element is created. A copy of the key is made if the copyKey
** flag is set. NULL is returned.
**
** If another element already exists with the same key, then the
** new data replaces the old data and the old data is returned.
** The key is not copied in this instance. If a malloc fails, then
** the new data is returned and the hash table is unchanged.
**
** If the "data" parameter to this function is NULL, then the
** element corresponding to "key" is removed from the hash table.
*/
void *sqlite3Fts2HashInsert(
  fts2Hash *pH,        /* The hash table to insert into */
  const void *pKey,    /* The key */
  int nKey,            /* Number of bytes in the key */
  void *data           /* The data */
){
  int hraw;                 /* Raw hash value of the key */
  int h;                    /* the hash of the key modulo hash table size */
  fts2HashElem *elem;       /* Used to loop thru the element list */
  fts2HashElem *new_elem;   /* New element added to the pH */
  int (*xHash)(const void*,int);  /* The hash function */

  assert( pH!=0 );
  xHash = hashFunction(pH->keyClass);
  assert( xHash!=0 );
  hraw = (*xHash)(pKey, nKey);
  assert( (pH->htsize & (pH->htsize-1))==0 );
  h = hraw & (pH->htsize-1);
  elem = findElementGivenHash(pH,pKey,nKey,h);
  if( elem ){
    /* Key already present: replace the data, or remove the entry
    ** entirely when data==0. */
    void *old_data = elem->data;
    if( data==0 ){
      removeElementGivenHash(pH,elem,h);
    }else{
      elem->data = data;
    }
    return old_data;
  }
  if( data==0 ) return 0;   /* Deleting a non-existent key is a no-op */
  new_elem = (fts2HashElem*)pH->xMalloc( sizeof(fts2HashElem) );
  if( new_elem==0 ) return data;
  if( pH->copyKey && pKey!=0 ){
    new_elem->pKey = pH->xMalloc( nKey );
    if( new_elem->pKey==0 ){
      pH->xFree(new_elem);
      return data;
    }
    memcpy((void*)new_elem->pKey, pKey, nKey);
  }else{
    new_elem->pKey = (void*)pKey;
  }
  new_elem->nKey = nKey;
  pH->count++;
  if( pH->htsize==0 ){
    /* First ever insert: create an initial table of 8 buckets. */
    rehash(pH,8);
    if( pH->htsize==0 ){
      /* rehash() could not allocate the bucket array: undo the insert.
      ** NOTE(review): if copyKey is set, the key copy made above is not
      ** freed here — looks like a small leak on this OOM path; confirm. */
      pH->count = 0;
      pH->xFree(new_elem);
      return data;
    }
  }
  if( pH->count > pH->htsize ){
    /* Keep the load factor at or below 1 by doubling the bucket count.
    ** A failed rehash() is harmless; the table just stays smaller. */
    rehash(pH,pH->htsize*2);
  }
  assert( pH->htsize>0 );
  assert( (pH->htsize & (pH->htsize-1))==0 );
  h = hraw & (pH->htsize-1);   /* Recompute: htsize may have changed */
  insertElement(pH, &pH->ht[h], new_elem);
  new_elem->data = data;
  return 0;
}
368
369#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h
deleted file mode 100644
index 97f3529..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h
+++ /dev/null
@@ -1,112 +0,0 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
** This is the header file for the generic hash-table implementation
13** used in SQLite. We've modified it slightly to serve as a standalone
14** hash table implementation for the full-text indexing module.
15**
16*/
17#ifndef _FTS2_HASH_H_
18#define _FTS2_HASH_H_
19
20/* Forward declarations of structures. */
21typedef struct fts2Hash fts2Hash;
22typedef struct fts2HashElem fts2HashElem;
23
24/* A complete hash table is an instance of the following structure.
25** The internals of this structure are intended to be opaque -- client
26** code should not attempt to access or modify the fields of this structure
27** directly. Change this structure only by using the routines below.
28** However, many of the "procedures" and "functions" for modifying and
29** accessing this structure are really macros, so we can't really make
30** this structure opaque.
31*/
32struct fts2Hash {
33 char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
34 char copyKey; /* True if copy of key made on insert */
35 int count; /* Number of entries in this table */
36 fts2HashElem *first; /* The first element of the array */
37 void *(*xMalloc)(int); /* malloc() function to use */
38 void (*xFree)(void *); /* free() function to use */
39 int htsize; /* Number of buckets in the hash table */
40 struct _fts2ht { /* the hash table */
41 int count; /* Number of entries with this hash */
42 fts2HashElem *chain; /* Pointer to first entry with this hash */
43 } *ht;
44};
45
46/* Each element in the hash table is an instance of the following
47** structure. All elements are stored on a single doubly-linked list.
48**
49** Again, this structure is intended to be opaque, but it can't really
50** be opaque because it is used by macros.
51*/
52struct fts2HashElem {
53 fts2HashElem *next, *prev; /* Next and previous elements in the table */
54 void *data; /* Data associated with this element */
55 void *pKey; int nKey; /* Key associated with this element */
56};
57
58/*
59** There are 2 different modes of operation for a hash table:
60**
61** FTS2_HASH_STRING pKey points to a string that is nKey bytes long
62** (including the null-terminator, if any). Case
63** is respected in comparisons.
64**
65** FTS2_HASH_BINARY pKey points to binary data nKey bytes long.
66** memcmp() is used to compare keys.
67**
68** A copy of the key is made if the copyKey parameter to fts2HashInit is 1.
69*/
70#define FTS2_HASH_STRING 1
71#define FTS2_HASH_BINARY 2
72
73/*
74** Access routines. To delete, insert a NULL pointer.
75*/
76void sqlite3Fts2HashInit(fts2Hash*, int keytype, int copyKey);
77void *sqlite3Fts2HashInsert(fts2Hash*, const void *pKey, int nKey, void *pData);
78void *sqlite3Fts2HashFind(const fts2Hash*, const void *pKey, int nKey);
79void sqlite3Fts2HashClear(fts2Hash*);
80
81/*
82** Shorthand for the functions above
83*/
84#define fts2HashInit sqlite3Fts2HashInit
85#define fts2HashInsert sqlite3Fts2HashInsert
86#define fts2HashFind sqlite3Fts2HashFind
87#define fts2HashClear sqlite3Fts2HashClear
88
89/*
90** Macros for looping over all elements of a hash table. The idiom is
91** like this:
92**
93** fts2Hash h;
94** fts2HashElem *p;
95** ...
96** for(p=fts2HashFirst(&h); p; p=fts2HashNext(p)){
97** SomeStructure *pData = fts2HashData(p);
98** // do something with pData
99** }
100*/
101#define fts2HashFirst(H) ((H)->first)
102#define fts2HashNext(E) ((E)->next)
103#define fts2HashData(E) ((E)->data)
104#define fts2HashKey(E) ((E)->pKey)
105#define fts2HashKeysize(E) ((E)->nKey)
106
107/*
108** Number of entries in a hash table
109*/
110#define fts2HashCount(H) ((H)->count)
111
112#endif /* _FTS2_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c
deleted file mode 100644
index ed15f33..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c
+++ /dev/null
@@ -1,257 +0,0 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file implements a tokenizer for fts2 based on the ICU library.
13**
14** $Id: fts2_icu.c,v 1.1 2007/06/22 15:21:16 danielk1977 Exp $
15*/
16
17#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
18#ifdef SQLITE_ENABLE_ICU
19
20#include <assert.h>
21#include <string.h>
22#include "fts2_tokenizer.h"
23
24#include <unicode/ubrk.h>
25#include <unicode/ucol.h>
26#include <unicode/ustring.h>
27#include <unicode/utf16.h>
28
29typedef struct IcuTokenizer IcuTokenizer;
30typedef struct IcuCursor IcuCursor;
31
32struct IcuTokenizer {
33 sqlite3_tokenizer base;
34 char *zLocale;
35};
36
37struct IcuCursor {
38 sqlite3_tokenizer_cursor base;
39
40 UBreakIterator *pIter; /* ICU break-iterator object */
41 int nChar; /* Number of UChar elements in pInput */
42 UChar *aChar; /* Copy of input using utf-16 encoding */
43 int *aOffset; /* Offsets of each character in utf-8 input */
44
45 int nBuffer;
46 char *zBuffer;
47
48 int iToken;
49};
50
51/*
52** Create a new tokenizer instance.
53*/
54static int icuCreate(
55 int argc, /* Number of entries in argv[] */
56 const char * const *argv, /* Tokenizer creation arguments */
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
58){
59 IcuTokenizer *p;
60 int n = 0;
61
62 if( argc>0 ){
63 n = strlen(argv[0])+1;
64 }
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66 if( !p ){
67 return SQLITE_NOMEM;
68 }
69 memset(p, 0, sizeof(IcuTokenizer));
70
71 if( n ){
72 p->zLocale = (char *)&p[1];
73 memcpy(p->zLocale, argv[0], n);
74 }
75
76 *ppTokenizer = (sqlite3_tokenizer *)p;
77
78 return SQLITE_OK;
79}
80
81/*
82** Destroy a tokenizer
83*/
84static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
86 sqlite3_free(p);
87 return SQLITE_OK;
88}
89
90/*
91** Prepare to begin tokenizing a particular string. The input
92** string to be tokenized is pInput[0..nBytes-1]. A cursor
93** used to incrementally tokenize this string is returned in
94** *ppCursor.
95*/
static int icuOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *zInput,                    /* Input string */
  int nInput,                            /* Length of zInput in bytes */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  IcuCursor *pCsr;

  const int32_t opt = U_FOLD_CASE_DEFAULT;
  UErrorCode status = U_ZERO_ERROR;
  int nChar;

  UChar32 c;
  int iInput = 0;   /* Byte offset into the UTF-8 input */
  int iOut = 0;     /* UChar offset into the UTF-16 copy */

  *ppCursor = 0;

  /* nInput+1 UChars is an upper bound on the UTF-16 length: each UTF-8
  ** byte produces at most one UTF-16 code unit. */
  nChar = nInput+1;
  /* One allocation holds the cursor, the UTF-16 copy of the input, and
  ** the map from each UTF-16 position back to a UTF-8 byte offset. */
  pCsr = (IcuCursor *)sqlite3_malloc(
      sizeof(IcuCursor) +                /* IcuCursor */
      nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
      (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
  );
  if( !pCsr ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(IcuCursor));
  pCsr->aChar = (UChar *)&pCsr[1];
  pCsr->aOffset = (int *)&pCsr->aChar[nChar];

  /* Transcode the input to case-folded UTF-16, recording after each
  ** appended character the corresponding UTF-8 byte offset.  The loop
  ** stops at the first nul or invalid sequence (U8_NEXT yields c<=0). */
  pCsr->aOffset[iOut] = iInput;
  U8_NEXT(zInput, iInput, nInput, c);
  while( c>0 ){
    int isError = 0;
    c = u_foldCase(c, opt);
    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
    if( isError ){
      sqlite3_free(pCsr);
      return SQLITE_ERROR;
    }
    pCsr->aOffset[iOut] = iInput;

    if( iInput<nInput ){
      U8_NEXT(zInput, iInput, nInput, c);
    }else{
      c = 0;
    }
  }

  /* Hand the UTF-16 text to an ICU word break-iterator, using the
  ** locale configured at tokenizer-creation time (may be NULL). */
  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_free(pCsr);
    return SQLITE_ERROR;
  }
  pCsr->nChar = iOut;

  ubrk_first(pCsr->pIter);
  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return SQLITE_OK;
}
158
159/*
160** Close a tokenization cursor previously opened by a call to icuOpen().
161*/
162static int icuClose(sqlite3_tokenizer_cursor *pCursor){
163 IcuCursor *pCsr = (IcuCursor *)pCursor;
164 ubrk_close(pCsr->pIter);
165 sqlite3_free(pCsr->zBuffer);
166 sqlite3_free(pCsr);
167 return SQLITE_OK;
168}
169
170/*
171** Extract the next token from a tokenization cursor.
172*/
173static int icuNext(
174 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
175 const char **ppToken, /* OUT: *ppToken is the token text */
176 int *pnBytes, /* OUT: Number of bytes in token */
177 int *piStartOffset, /* OUT: Starting offset of token */
178 int *piEndOffset, /* OUT: Ending offset of token */
179 int *piPosition /* OUT: Position integer of token */
180){
181 IcuCursor *pCsr = (IcuCursor *)pCursor;
182
183 int iStart = 0;
184 int iEnd = 0;
185 int nByte = 0;
186
187 while( iStart==iEnd ){
188 UChar32 c;
189
190 iStart = ubrk_current(pCsr->pIter);
191 iEnd = ubrk_next(pCsr->pIter);
192 if( iEnd==UBRK_DONE ){
193 return SQLITE_DONE;
194 }
195
196 while( iStart<iEnd ){
197 int iWhite = iStart;
198 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
199 if( u_isspace(c) ){
200 iStart = iWhite;
201 }else{
202 break;
203 }
204 }
205 assert(iStart<=iEnd);
206 }
207
208 do {
209 UErrorCode status = U_ZERO_ERROR;
210 if( nByte ){
211 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
212 if( !zNew ){
213 return SQLITE_NOMEM;
214 }
215 pCsr->zBuffer = zNew;
216 pCsr->nBuffer = nByte;
217 }
218
219 u_strToUTF8(
220 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
221 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
222 &status /* Output success/failure */
223 );
224 } while( nByte>pCsr->nBuffer );
225
226 *ppToken = pCsr->zBuffer;
227 *pnBytes = nByte;
228 *piStartOffset = pCsr->aOffset[iStart];
229 *piEndOffset = pCsr->aOffset[iEnd];
230 *piPosition = pCsr->iToken++;
231
232 return SQLITE_OK;
233}
234
/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion  */
  icuCreate,                   /* xCreate   */
  icuDestroy,                  /* xDestroy  */
  icuOpen,                     /* xOpen     */
  icuClose,                    /* xClose    */
  icuNext,                     /* xNext     */
};
246
/*
** Set *ppModule to point at the implementation of the ICU tokenizer.
** Called from sqlite3Fts2Init() when SQLITE_ENABLE_ICU is defined.
*/
void sqlite3Fts2IcuTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &icuTokenizerModule;
}
255
256#endif /* defined(SQLITE_ENABLE_ICU) */
257#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c
deleted file mode 100644
index dab1849..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c
+++ /dev/null
@@ -1,642 +0,0 @@
1/*
2** 2006 September 30
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** Implementation of the full-text-search tokenizer that implements
13** a Porter stemmer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS2 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS2 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts2_tokenizer.h"
35
/*
** Class derived from sqlite3_tokenizer.  The porter tokenizer needs no
** per-instance configuration, so only the base class is present.
*/
typedef struct porter_tokenizer {
  sqlite3_tokenizer base;      /* Base class */
} porter_tokenizer;

/*
** Class derived from sqlite3_tokenizer_cursor
*/
typedef struct porter_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *zInput;          /* input we are tokenizing */
  int nInput;                  /* size of the input */
  int iOffset;                 /* current position in zInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  int nAllocated;              /* space allocated to zToken buffer */
} porter_tokenizer_cursor;
55
56
57/* Forward declaration */
58static const sqlite3_tokenizer_module porterTokenizerModule;
59
60
61/*
62** Create a new tokenizer instance.
63*/
64static int porterCreate(
65 int argc, const char * const *argv,
66 sqlite3_tokenizer **ppTokenizer
67){
68 porter_tokenizer *t;
69 t = (porter_tokenizer *) calloc(sizeof(*t), 1);
70 if( t==NULL ) return SQLITE_NOMEM;
71
72 *ppTokenizer = &t->base;
73 return SQLITE_OK;
74}
75
76/*
77** Destroy a tokenizer
78*/
79static int porterDestroy(sqlite3_tokenizer *pTokenizer){
80 free(pTokenizer);
81 return SQLITE_OK;
82}
83
84/*
85** Prepare to begin tokenizing a particular string. The input
86** string to be tokenized is zInput[0..nInput-1]. A cursor
87** used to incrementally tokenize this string is returned in
88** *ppCursor.
89*/
90static int porterOpen(
91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
92 const char *zInput, int nInput, /* String to be tokenized */
93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
94){
95 porter_tokenizer_cursor *c;
96
97 c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
98 if( c==NULL ) return SQLITE_NOMEM;
99
100 c->zInput = zInput;
101 if( zInput==0 ){
102 c->nInput = 0;
103 }else if( nInput<0 ){
104 c->nInput = (int)strlen(zInput);
105 }else{
106 c->nInput = nInput;
107 }
108 c->iOffset = 0; /* start tokenizing at the beginning */
109 c->iToken = 0;
110 c->zToken = NULL; /* no space allocated, yet. */
111 c->nAllocated = 0;
112
113 *ppCursor = &c->base;
114 return SQLITE_OK;
115}
116
117/*
118** Close a tokenization cursor previously opened by a call to
119** porterOpen() above.
120*/
121static int porterClose(sqlite3_tokenizer_cursor *pCursor){
122 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
123 free(c->zToken);
124 free(c);
125 return SQLITE_OK;
126}
/*
** Letter category table for 'a'..'z': 0 means vowel, 1 means consonant,
** 2 means 'y' (vowel or consonant depending on context).
*/
static const char cType[] = {
   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
   1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() determine whether the first character of
** the string they point to is a consonant or a vowel, according to
** Porter's rules.
**
** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'y' is a consonant unless it follows another consonant, in which
** case it is a vowel.
**
** In these routines the letters are stored in reverse order, so the
** 'y' rule becomes: 'y' is a consonant unless it is FOLLOWED (in the
** buffer) by another consonant.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  char c = z[0];
  int cat;
  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cat = cType[c-'a'];
  if( cat==0 || cat==1 ) return cat;
  /* cat==2: the letter is 'y'.  A trailing 'y' counts as a consonant;
  ** otherwise it is a consonant exactly when the next letter is a vowel. */
  if( z[1]==0 ) return 1;
  return isVowel(&z[1]);
}
static int isVowel(const char *z){
  char c = z[0];
  int cat;
  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cat = cType[c-'a'];
  if( cat==0 ) return 1;
  if( cat==1 ) return 0;
  /* 'y' is a vowel exactly when the next letter is a consonant. */
  return isConsonant(&z[1]);
}
167
/*
** Let any sequence of one or more vowels be represented by V and let
** C be a sequence of one or more consonants.  Then every word can be
** represented as:
**
**   [C] (VC){m} [V]
**
** In prose: a word is an optional consonant run followed by m
** vowel-consonant pairs followed by an optional vowel run.  This
** routine returns true if the m-value for z is 1 or more, i.e. if z
** contains at least one vowel that is followed by a consonant.
**
** z[] is in reverse order here, so we actually look for a consonant
** run followed by a vowel run with something after it.
*/
static int m_gt_0(const char *z){
  for(; isVowel(z); z++){}
  if( !*z ) return 0;
  for(; isConsonant(z); z++){}
  return *z ? 1 : 0;
}
193
/* Like m_gt_0 above except we test for a value of m that is exactly 1.
** (z[] is in reverse order, as in m_gt_0.)
*/
static int m_eq_1(const char *z){
  for(; isVowel(z); z++){}
  if( !*z ) return 0;
  for(; isConsonant(z); z++){}
  if( !*z ) return 0;
  for(; isVowel(z); z++){}
  if( !*z ) return 1;
  for(; isConsonant(z); z++){}
  return *z==0;
}
207
/* Like m_gt_0 above except we test for a value of m greater than 1
** instead of greater than 0.
*/
static int m_gt_1(const char *z){
  for(; isVowel(z); z++){}
  if( !*z ) return 0;
  for(; isConsonant(z); z++){}
  if( !*z ) return 0;
  for(; isVowel(z); z++){}
  if( !*z ) return 0;
  for(; isConsonant(z); z++){}
  return *z ? 1 : 0;
}
221
/*
** Return TRUE if there is a vowel anywhere within the nul-terminated
** (reversed) string z.
*/
static int hasVowel(const char *z){
  for(; isConsonant(z); z++){}
  return *z ? 1 : 0;
}
229
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here, so we are really looking at the first
** two characters of z[].
*/
static int doubleConsonant(const char *z){
  if( !isConsonant(z) ) return 0;
  if( z[0]!=z[1] ) return 0;
  return isConsonant(&z[1]);
}
239
/*
** Return TRUE if the word ends with three letters that are
** consonant-vowel-consonant, where the final consonant is not
** 'w', 'x', or 'y'.
**
** The word is reversed here, so we are really checking the first
** three letters, and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
  if( z[0]==0 || !isConsonant(z) ) return 0;
  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
  if( z[1]==0 || !isVowel(&z[1]) ) return 0;
  return z[2]!=0 && isConsonant(&z[2]);
}
255
/*
** If the word ends with zFrom and xCond() is true for the stem
** that precedes the zFrom ending, then change the ending to zTo.
**
** The input word *pz and zFrom are both in reverse order.  zTo
** is in normal order.
**
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
** match.  Note that TRUE is returned even if xCond() fails and
** no substitution occurs.
*/
static int stem(
  char **pz,                 /* The word being stemmed (Reversed) */
  const char *zFrom,         /* If the ending matches this... (Reversed) */
  const char *zTo,           /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*)  /* Condition that must be true (may be NULL) */
){
  char *zStem = *pz;

  /* Verify the (reversed) word starts with the (reversed) suffix. */
  for(; *zFrom; zFrom++, zStem++){
    if( *zFrom!=*zStem ) return 0;
  }

  /* Suffix matched.  Substitute only if the stem satisfies xCond(). */
  if( xCond!=0 && !xCond(zStem) ) return 1;

  /* Write zTo (given forward) immediately before the stem, so that the
  ** reversed word now ends with zTo. */
  for(; *zTo; zTo++){
    zStem--;
    *zStem = *zTo;
  }
  *pz = zStem;
  return 1;
}
284
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate.  The input word is copied into the output with
** US-ASCII case folding.  If the input word is too long (more
** than 20 bytes if it contains no digits, or more than 6 bytes if
** it contains digits) then the word is truncated to 20 or 6 bytes
** by taking 10 or 3 bytes from the beginning and end.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i;
  int seenDigit = 0;
  int nKeep;

  /* Lower-case copy; note whether any digit appears. */
  for(i=0; i<nIn; i++){
    int c = zIn[i];
    if( c>='A' && c<='Z' ){
      c += 'a' - 'A';
    }else if( c>='0' && c<='9' ){
      seenDigit = 1;
    }
    zOut[i] = c;
  }

  /* Over-long words are shortened to nKeep bytes from each end. */
  nKeep = seenDigit ? 3 : 10;
  i = nIn;
  if( nIn > nKeep*2 ){
    int k;
    for(k=0; k<nKeep; k++){
      zOut[nKeep+k] = zOut[nIn-nKeep+k];
    }
    i = nKeep*2;
  }
  zOut[i] = 0;
  *pnOut = i;
}
315
316
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
**
** If the input word contains no digits but does contain characters not
** in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word into zReverse[] reversed and lower-cased.  It is
  ** placed toward the high end of the buffer; substitutions made by
  ** stem() grow the word toward index 0. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  z = &zReverse[j+1];

  /* In all of the stem() calls below, both the pattern argument and z
  ** itself are in reverse order (e.g. "lanoita" matches words that end
  ** in "ational"). */

  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0)  ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2 */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4 */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
556
/*
** Characters that can be part of a token.  We assume any character
** whose value is greater than 0x80 (any UTF character) can be
** part of a token.  In other words, delimiters all must have
** values of 0x7f or lower.
**
** The table covers characters 0x30..0x7f; everything below 0x30 is
** treated as a delimiter by the isDelim() macro below.
*/
static const char porterIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* True if C is a delimiter character.  NOTE: the macro assigns to a
** local variable named "ch", which must be in scope at every use. */
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
572
573/*
574** Extract the next token from a tokenization cursor. The cursor must
575** have been opened by a prior call to porterOpen().
576*/
577static int porterNext(
578 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
579 const char **pzToken, /* OUT: *pzToken is the token text */
580 int *pnBytes, /* OUT: Number of bytes in token */
581 int *piStartOffset, /* OUT: Starting offset of token */
582 int *piEndOffset, /* OUT: Ending offset of token */
583 int *piPosition /* OUT: Position integer of token */
584){
585 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
586 const char *z = c->zInput;
587
588 while( c->iOffset<c->nInput ){
589 int iStartOffset, ch;
590
591 /* Scan past delimiter characters */
592 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
593 c->iOffset++;
594 }
595
596 /* Count non-delimiter characters. */
597 iStartOffset = c->iOffset;
598 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
599 c->iOffset++;
600 }
601
602 if( c->iOffset>iStartOffset ){
603 int n = c->iOffset-iStartOffset;
604 if( n>c->nAllocated ){
605 c->nAllocated = n+20;
606 c->zToken = realloc(c->zToken, c->nAllocated);
607 if( c->zToken==NULL ) return SQLITE_NOMEM;
608 }
609 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
610 *pzToken = c->zToken;
611 *piStartOffset = iStartOffset;
612 *piEndOffset = c->iOffset;
613 *piPosition = c->iToken++;
614 return SQLITE_OK;
615 }
616 }
617 return SQLITE_DONE;
618}
619
/*
** The set of routines that implement the porter-stemmer tokenizer
*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
  0,                           /* iVersion */
  porterCreate,                /* xCreate */
  porterDestroy,               /* xDestroy */
  porterOpen,                  /* xOpen */
  porterClose,                 /* xClose */
  porterNext,                  /* xNext */
};
631
/*
** Allocate a new porter tokenizer.  Return a pointer to the new
** tokenizer in *ppModule.  Called during FTS2 initialization to
** register the module.
*/
void sqlite3Fts2PorterTokenizerModule(
  sqlite3_tokenizer_module const**ppModule  /* OUT: the porter module */
){
  *ppModule = &porterTokenizerModule;
}
641
642#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c
deleted file mode 100644
index cbf771b..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c
+++ /dev/null
@@ -1,371 +0,0 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This is part of an SQLite module implementing full-text search.
14** This particular file implements the generic tokenizer interface.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS2 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS2 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
27
28
29#include "sqlite3.h"
30#include "sqlite3ext.h"
31SQLITE_EXTENSION_INIT1
32
33#include "fts2_hash.h"
34#include "fts2_tokenizer.h"
35#include <assert.h>
36
37/*
38** Implementation of the SQL scalar function for accessing the underlying
39** hash table. This function may be called as follows:
40**
41** SELECT <function-name>(<key-name>);
42** SELECT <function-name>(<key-name>, <pointer>);
43**
44** where <function-name> is the name passed as the second argument
45** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
46**
47** If the <pointer> argument is specified, it must be a blob value
48** containing a pointer to be stored as the hash data corresponding
49** to the string <key-name>. If <pointer> is not specified, then
** the string <key-name> must already exist in the hash table. Otherwise,
51** an error is returned.
52**
53** Whether or not the <pointer> argument is specified, the value returned
54** is a blob containing the pointer stored as the hash data corresponding
55** to string <key-name> (after the hash-table is updated, if applicable).
56*/
57static void scalarFunc(
58 sqlite3_context *context,
59 int argc,
60 sqlite3_value **argv
61){
62 fts2Hash *pHash;
63 void *pPtr = 0;
64 const unsigned char *zName;
65 int nName;
66
67 assert( argc==1 || argc==2 );
68
69 pHash = (fts2Hash *)sqlite3_user_data(context);
70
71 zName = sqlite3_value_text(argv[0]);
72 nName = sqlite3_value_bytes(argv[0])+1;
73
74 if( argc==2 ){
75 void *pOld;
76 int n = sqlite3_value_bytes(argv[1]);
77 if( n!=sizeof(pPtr) ){
78 sqlite3_result_error(context, "argument type mismatch", -1);
79 return;
80 }
81 pPtr = *(void **)sqlite3_value_blob(argv[1]);
82 pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
83 if( pOld==pPtr ){
84 sqlite3_result_error(context, "out of memory", -1);
85 return;
86 }
87 }else{
88 pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
89 if( !pPtr ){
90 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
91 sqlite3_result_error(context, zErr, -1);
92 sqlite3_free(zErr);
93 return;
94 }
95 }
96
97 sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
98}
99
100#ifdef SQLITE_TEST
101
102#include <tcl.h>
103#include <string.h>
104
105/*
106** Implementation of a special SQL scalar function for testing tokenizers
107** designed to be used in concert with the Tcl testing framework. This
108** function must be called with two arguments:
109**
110** SELECT <function-name>(<key-name>, <input-string>);
111** SELECT <function-name>(<key-name>, <pointer>);
112**
113** where <function-name> is the name passed as the second argument
114** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
115** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
116**
117** The return value is a string that may be interpreted as a Tcl
118** list. For each token in the <input-string>, three elements are
119** added to the returned list. The first is the token position, the
120** second is the token text (folded, stemmed, etc.) and the third is the
121** substring of <input-string> associated with the token. For example,
122** using the built-in "simple" tokenizer:
123**
124** SELECT fts_tokenizer_test('simple', 'I don't see how');
125**
126** will return the string:
127**
128** "{0 i I 1 dont don't 2 see see 3 how how}"
129**
130*/
static void testFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts2Hash *pHash;
  sqlite3_tokenizer_module *p;
  sqlite3_tokenizer *pTokenizer = 0;
  sqlite3_tokenizer_cursor *pCsr = 0;

  const char *zErr = 0;

  const char *zName;
  int nName;
  const char *zInput;
  int nInput;

  const char *zArg = 0;

  const char *zToken;
  int nToken;
  int iStart;
  int iEnd;
  int iPos;

  Tcl_Obj *pRet;

  assert( argc==2 || argc==3 );

  /* First argument is the tokenizer name, last is the text to tokenize.
  ** The optional middle argument is forwarded to xCreate(). */
  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[argc-1]);
  zInput = (const char *)sqlite3_value_text(argv[argc-1]);

  if( argc==3 ){
    zArg = (const char *)sqlite3_value_text(argv[1]);
  }

  /* Look up the named tokenizer module in the hash table (key includes
  ** the nul terminator, hence nName+1). */
  pHash = (fts2Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);

  if( !p ){
    char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    sqlite3_result_error(context, zErr, -1);
    sqlite3_free(zErr);
    return;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
  }
  pCsr->pTokenizer = pTokenizer;

  /* For each token append three list elements: position, processed
  ** token text, and the raw input substring the token came from. */
  while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    zToken = &zInput[iStart];
    nToken = iEnd-iStart;
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  }

  if( SQLITE_OK!=p->xClose(pCsr) ){
    zErr = "error in xClose()";
    goto finish;
  }
  if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
    zErr = "error in xDestroy()";
    goto finish;
  }

finish:
  if( zErr ){
    sqlite3_result_error(context, zErr, -1);
  }else{
    sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
  }
  Tcl_DecrRefCount(pRet);
}
218
219static
220int registerTokenizer(
221 sqlite3 *db,
222 char *zName,
223 const sqlite3_tokenizer_module *p
224){
225 int rc;
226 sqlite3_stmt *pStmt;
227 const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
228
229 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
230 if( rc!=SQLITE_OK ){
231 return rc;
232 }
233
234 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
235 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
236 sqlite3_step(pStmt);
237
238 return sqlite3_finalize(pStmt);
239}
240
241static
242int queryTokenizer(
243 sqlite3 *db,
244 char *zName,
245 const sqlite3_tokenizer_module **pp
246){
247 int rc;
248 sqlite3_stmt *pStmt;
249 const char zSql[] = "SELECT fts2_tokenizer(?)";
250
251 *pp = 0;
252 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
253 if( rc!=SQLITE_OK ){
254 return rc;
255 }
256
257 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
258 if( SQLITE_ROW==sqlite3_step(pStmt) ){
259 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
260 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
261 }
262 }
263
264 return sqlite3_finalize(pStmt);
265}
266
267void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
268
269/*
270** Implementation of the scalar function fts2_tokenizer_internal_test().
271** This function is used for testing only, it is not included in the
272** build unless SQLITE_TEST is defined.
273**
274** The purpose of this is to test that the fts2_tokenizer() function
275** can be used as designed by the C-code in the queryTokenizer and
276** registerTokenizer() functions above. These two functions are repeated
277** in the README.tokenizer file as an example, so it is important to
278** test them.
279**
280** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
281** function with no arguments. An assert() will fail if a problem is
282** detected. i.e.:
283**
284** SELECT fts2_tokenizer_internal_test();
285**
286*/
static void intTestFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  int rc;
  const sqlite3_tokenizer_module *p1;
  const sqlite3_tokenizer_module *p2;
  sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);

  /* Test the query function: querying a registered name must return
  ** the registered module; querying an unknown name must fail with the
  ** expected error message and leave the output pointer zeroed. */
  sqlite3Fts2SimpleTokenizerModule(&p1);
  rc = queryTokenizer(db, "simple", &p2);
  assert( rc==SQLITE_OK );
  assert( p1==p2 );
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_ERROR );
  assert( p2==0 );
  assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );

  /* Test the storage function: after registration the previously
  ** unknown name must be queryable.  (Order matters: this relies on
  ** the failed query above having run first.) */
  rc = registerTokenizer(db, "nosuchtokenizer", p1);
  assert( rc==SQLITE_OK );
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_OK );
  assert( p2==p1 );

  sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
}
316
317#endif
318
319/*
320** Set up SQL objects in database db used to access the contents of
321** the hash table pointed to by argument pHash. The hash table must
** have been initialised to use string keys, and to take a private copy
323** of the key when a value is inserted. i.e. by a call similar to:
324**
325** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
326**
327** This function adds a scalar function (see header comment above
328** scalarFunc() in this file for details) and, if ENABLE_TABLE is
329** defined at compilation time, a temporary virtual table (see header
330** comment above struct HashTableVtab) to the database schema. Both
331** provide read/write access to the contents of *pHash.
332**
333** The third argument to this function, zName, is used as the name
334** of both the scalar and, if created, the virtual table.
335*/
336int sqlite3Fts2InitHashTable(
337 sqlite3 *db,
338 fts2Hash *pHash,
339 const char *zName
340){
341 int rc = SQLITE_OK;
342 void *p = (void *)pHash;
343 const int any = SQLITE_ANY;
344 char *zTest = 0;
345 char *zTest2 = 0;
346
347#ifdef SQLITE_TEST
348 void *pdb = (void *)db;
349 zTest = sqlite3_mprintf("%s_test", zName);
350 zTest2 = sqlite3_mprintf("%s_internal_test", zName);
351 if( !zTest || !zTest2 ){
352 rc = SQLITE_NOMEM;
353 }
354#endif
355
356 if( rc!=SQLITE_OK
357 || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
358 || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
359#ifdef SQLITE_TEST
360 || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
361 || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
362 || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
363#endif
364 );
365
366 sqlite3_free(zTest);
367 sqlite3_free(zTest2);
368 return rc;
369}
370
371#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h
deleted file mode 100644
index 8c256b2..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2** 2006 July 10
3**
4** The author disclaims copyright to this source code.
5**
6*************************************************************************
7** Defines the interface to tokenizers used by fulltext-search. There
8** are three basic components:
9**
10** sqlite3_tokenizer_module is a singleton defining the tokenizer
11** interface functions. This is essentially the class structure for
12** tokenizers.
13**
14** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
15** including customization information defined at creation time.
16**
17** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
18** tokens from a particular input.
19*/
#ifndef _FTS2_TOKENIZER_H_
#define _FTS2_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"

/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts2 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an 
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts2 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

/* "Class" structure for a tokenizer implementation: a vtable of the
** five callbacks plus a version number. */
struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts2 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts2( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error 
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,             /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts2 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method). 
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts2 module calls this 
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the 
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose(). 
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};

/* Base "instance" structure; implementations embed this as their
** first member and extend it with private state. */
struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

/* Base cursor structure; embedded as the first member of an
** implementation's cursor type. */
struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

#endif /* _FTS2_TOKENIZER_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c
deleted file mode 100644
index 540ba27..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c
+++ /dev/null
@@ -1,229 +0,0 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** Implementation of the "simple" full-text-search tokenizer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS2 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS2 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts2_tokenizer.h"
35
36typedef struct simple_tokenizer {
37 sqlite3_tokenizer base;
38 char delim[128]; /* flag ASCII delimiters */
39} simple_tokenizer;
40
41typedef struct simple_tokenizer_cursor {
42 sqlite3_tokenizer_cursor base;
43 const char *pInput; /* input we are tokenizing */
44 int nBytes; /* size of the input */
45 int iOffset; /* current position in pInput */
46 int iToken; /* index of next token to be returned */
47 char *pToken; /* storage for current token */
48 int nTokenAllocated; /* space allocated to zToken buffer */
49} simple_tokenizer_cursor;
50
51
52/* Forward declaration */
53static const sqlite3_tokenizer_module simpleTokenizerModule;
54
55static int simpleDelim(simple_tokenizer *t, unsigned char c){
56 return c<0x80 && t->delim[c];
57}
58
59/*
60** Create a new tokenizer instance.
61*/
62static int simpleCreate(
63 int argc, const char * const *argv,
64 sqlite3_tokenizer **ppTokenizer
65){
66 simple_tokenizer *t;
67
68 t = (simple_tokenizer *) calloc(sizeof(*t), 1);
69 if( t==NULL ) return SQLITE_NOMEM;
70
71 /* TODO(shess) Delimiters need to remain the same from run to run,
72 ** else we need to reindex. One solution would be a meta-table to
73 ** track such information in the database, then we'd only want this
74 ** information on the initial create.
75 */
76 if( argc>1 ){
77 int i, n = strlen(argv[1]);
78 for(i=0; i<n; i++){
79 unsigned char ch = argv[1][i];
80 /* We explicitly don't support UTF-8 delimiters for now. */
81 if( ch>=0x80 ){
82 free(t);
83 return SQLITE_ERROR;
84 }
85 t->delim[ch] = 1;
86 }
87 } else {
88 /* Mark non-alphanumeric ASCII characters as delimiters */
89 int i;
90 for(i=1; i<0x80; i++){
91 t->delim[i] = !isalnum(i);
92 }
93 }
94
95 *ppTokenizer = &t->base;
96 return SQLITE_OK;
97}
98
99/*
100** Destroy a tokenizer
101*/
102static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
103 free(pTokenizer);
104 return SQLITE_OK;
105}
106
107/*
108** Prepare to begin tokenizing a particular string. The input
109** string to be tokenized is pInput[0..nBytes-1]. A cursor
110** used to incrementally tokenize this string is returned in
111** *ppCursor.
112*/
113static int simpleOpen(
114 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
115 const char *pInput, int nBytes, /* String to be tokenized */
116 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
117){
118 simple_tokenizer_cursor *c;
119
120 c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
121 if( c==NULL ) return SQLITE_NOMEM;
122
123 c->pInput = pInput;
124 if( pInput==0 ){
125 c->nBytes = 0;
126 }else if( nBytes<0 ){
127 c->nBytes = (int)strlen(pInput);
128 }else{
129 c->nBytes = nBytes;
130 }
131 c->iOffset = 0; /* start tokenizing at the beginning */
132 c->iToken = 0;
133 c->pToken = NULL; /* no space allocated, yet. */
134 c->nTokenAllocated = 0;
135
136 *ppCursor = &c->base;
137 return SQLITE_OK;
138}
139
140/*
141** Close a tokenization cursor previously opened by a call to
142** simpleOpen() above.
143*/
144static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
145 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
146 free(c->pToken);
147 free(c);
148 return SQLITE_OK;
149}
150
151/*
152** Extract the next token from a tokenization cursor. The cursor must
153** have been opened by a prior call to simpleOpen().
154*/
155static int simpleNext(
156 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
157 const char **ppToken, /* OUT: *ppToken is the token text */
158 int *pnBytes, /* OUT: Number of bytes in token */
159 int *piStartOffset, /* OUT: Starting offset of token */
160 int *piEndOffset, /* OUT: Ending offset of token */
161 int *piPosition /* OUT: Position integer of token */
162){
163 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
164 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
165 unsigned char *p = (unsigned char *)c->pInput;
166
167 while( c->iOffset<c->nBytes ){
168 int iStartOffset;
169
170 /* Scan past delimiter characters */
171 while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
172 c->iOffset++;
173 }
174
175 /* Count non-delimiter characters. */
176 iStartOffset = c->iOffset;
177 while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
178 c->iOffset++;
179 }
180
181 if( c->iOffset>iStartOffset ){
182 int i, n = c->iOffset-iStartOffset;
183 if( n>c->nTokenAllocated ){
184 c->nTokenAllocated = n+20;
185 c->pToken = realloc(c->pToken, c->nTokenAllocated);
186 if( c->pToken==NULL ) return SQLITE_NOMEM;
187 }
188 for(i=0; i<n; i++){
189 /* TODO(shess) This needs expansion to handle UTF-8
190 ** case-insensitivity.
191 */
192 unsigned char ch = p[iStartOffset+i];
193 c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
194 }
195 *ppToken = c->pToken;
196 *pnBytes = n;
197 *piStartOffset = iStartOffset;
198 *piEndOffset = c->iOffset;
199 *piPosition = c->iToken++;
200
201 return SQLITE_OK;
202 }
203 }
204 return SQLITE_DONE;
205}
206
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,                 /* iVersion */
  simpleCreate,      /* xCreate */
  simpleDestroy,     /* xDestroy */
  simpleOpen,        /* xOpen */
  simpleClose,       /* xClose */
  simpleNext,        /* xNext */
};
218
219/*
220** Allocate a new simple tokenizer. Return a pointer to the new
221** tokenizer in *ppModule
222*/
223void sqlite3Fts2SimpleTokenizerModule(
224 sqlite3_tokenizer_module const**ppModule
225){
226 *ppModule = &simpleTokenizerModule;
227}
228
229#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl
deleted file mode 100644
index 5c8d1e9..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl
+++ /dev/null
@@ -1,116 +0,0 @@
1#!/usr/bin/tclsh
2#
3# This script builds a single C code file holding all of FTS2 code.
4# The name of the output file is fts2amal.c. To build this file,
5# first do:
6#
7# make target_source
8#
9# The make target above moves all of the source code files into
10# a subdirectory named "tsrc". (This script expects to find the files
11# there and will not work if they are not found.)
12#
13# After the "tsrc" directory has been created and populated, run
14# this script:
15#
16# tclsh mkfts2amal.tcl
17#
18# The amalgamated FTS2 code will be written into fts2amal.c
19#
20
# Open the output file and write a header comment at the beginning
# of the file.
#
set out [open fts2amal.c w]
set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1]
puts $out [subst \
{/******************************************************************************
** This file is an amalgamation of separate C source files from the SQLite
** Full Text Search extension 2 (fts2).  By combining all the individual C
** code files into this single large file, the entire code can be compiled
** as a one translation unit.  This allows many compilers to do optimizations
** that would not be possible if the files were compiled separately.  It also
** makes the code easier to import into other projects.
**
** This amalgamation was generated on $today.
*/}]

# These are the header files used by FTS2.  The first time any of these
# files are seen in a #include statement in the C code, include the complete
# text of the file in-line.  The file only needs to be included once.
#
foreach hdr {
   fts2.h
   fts2_hash.h
   fts2_tokenizer.h
   sqlite3.h
   sqlite3ext.h
} {
  set available_hdr($hdr) 1
}

# 78 stars used for comment formatting.
set s78 \
{*****************************************************************************}
55
# Write a C block-comment divider containing $text into the output
# file, padded with stars so the divider has a fixed overall width.
#
proc section_comment {text} {
  global out s78
  set pad [string range $s78 0 [expr {60 - [string length $text]}]]
  puts $out "/************** $text $pad/"
}
65
# Read the source file named $filename and write it into the
# fts2amal.c output file.  If any #include statements are seen,
# process them appropriately: in-line known FTS2 headers once,
# emit other headers' #include lines once, and skip duplicates.
#
# NOTE(review): the signature line was corrupted to
# "proc copy_file (unknown) {" in this copy; the body uses
# $filename, so the parameter list is restored here.
proc copy_file {filename} {
  global seen_hdr available_hdr out
  set tail [file tail $filename]
  section_comment "Begin file $tail"
  set in [open $filename r]
  while {![eof $in]} {
    set line [gets $in]
    if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
      if {[info exists available_hdr($hdr)]} {
        if {$available_hdr($hdr)} {
          section_comment "Include $hdr in the middle of $tail"
          copy_file tsrc/$hdr
          section_comment "Continuing where we left off in $tail"
        }
      } elseif {![info exists seen_hdr($hdr)]} {
        set seen_hdr($hdr) 1
        puts $out $line
      }
    } elseif {[regexp {^#ifdef __cplusplus} $line]} {
      puts $out "#if 0"
    } elseif {[regexp {^#line} $line]} {
      # Skip #line directives.
    } else {
      puts $out $line
    }
  }
  close $in
  section_comment "End of $tail"
}
99
100
# Process the source files.  Process files containing commonly
# used subroutines first in order to help the compiler find
# inlining opportunities.
#
foreach file {
   fts2.c
   fts2_hash.c
   fts2_porter.c
   fts2_tokenizer.c
   fts2_tokenizer1.c
   fts2_icu.c
} {
  copy_file tsrc/$file
}

# Flush and close the amalgamation output file.
close $out
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers
deleted file mode 100644
index f214b24..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers
+++ /dev/null
@@ -1,134 +0,0 @@
1
21. FTS3 Tokenizers
3
4 When creating a new full-text table, FTS3 allows the user to select
5 the text tokenizer implementation to be used when indexing text
6 by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
7 statement:
8
9 CREATE VIRTUAL TABLE <table-name> USING fts3(
10 <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
11 );
12
13 The built-in tokenizers (valid values to pass as <tokenizer name>) are
14 "simple" and "porter".
15
16 <tokenizer-args> should consist of zero or more white-space separated
17 arguments to pass to the selected tokenizer implementation. The
18 interpretation of the arguments, if any, depends on the individual
19 tokenizer.
20
212. Custom Tokenizers
22
23 FTS3 allows users to provide custom tokenizer implementations. The
24 interface used to create a new tokenizer is defined and described in
25 the fts3_tokenizer.h source file.
26
27 Registering a new FTS3 tokenizer is similar to registering a new
28 virtual table module with SQLite. The user passes a pointer to a
29 structure containing pointers to various callback functions that
30 make up the implementation of the new tokenizer type. For tokenizers,
31 the structure (defined in fts3_tokenizer.h) is called
32 "sqlite3_tokenizer_module".
33
34 FTS3 does not expose a C-function that users call to register new
35 tokenizer types with a database handle. Instead, the pointer must
36 be encoded as an SQL blob value and passed to FTS3 through the SQL
37 engine by evaluating a special scalar function, "fts3_tokenizer()".
38 The fts3_tokenizer() function may be called with one or two arguments,
39 as follows:
40
41 SELECT fts3_tokenizer(<tokenizer-name>);
42 SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
43
44 Where <tokenizer-name> is a string identifying the tokenizer and
45 <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
46 structure encoded as an SQL blob. If the second argument is present,
47 it is registered as tokenizer <tokenizer-name> and a copy of it
48 returned. If only one argument is passed, a pointer to the tokenizer
49 implementation currently registered as <tokenizer-name> is returned,
50 encoded as a blob. Or, if no such tokenizer exists, an SQL exception
51 (error) is raised.
52
53 SECURITY: If the fts3 extension is used in an environment where potentially
54 malicious users may execute arbitrary SQL (i.e. gears), they should be
55 prevented from invoking the fts3_tokenizer() function, possibly using the
56 authorisation callback.
57
58 See "Sample code" below for an example of calling the fts3_tokenizer()
59 function from C code.
60
613. ICU Library Tokenizers
62
63 If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
64 symbol defined, then there exists a built-in tokenizer named "icu"
65 implemented using the ICU library. The first argument passed to the
66 xCreate() method (see fts3_tokenizer.h) of this tokenizer may be
67 an ICU locale identifier. For example "tr_TR" for Turkish as used
68 in Turkey, or "en_AU" for English as used in Australia. For example:
69
70 "CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenizer icu th_TH)"
71
72 The ICU tokenizer implementation is very simple. It splits the input
73 text according to the ICU rules for finding word boundaries and discards
74 any tokens that consist entirely of white-space. This may be suitable
75 for some applications in some locales, but not all. If more complex
76 processing is required, for example to implement stemming or
77 discard punctuation, this can be done by creating a tokenizer
  implementation that uses the ICU tokenizer as part of its implementation.
79
80 When using the ICU tokenizer this way, it is safe to overwrite the
81 contents of the strings returned by the xNext() method (see
82 fts3_tokenizer.h).
83
844. Sample code.
85
86 The following two code samples illustrate the way C code should invoke
87 the fts3_tokenizer() scalar function:
88
89 int registerTokenizer(
90 sqlite3 *db,
91 char *zName,
92 const sqlite3_tokenizer_module *p
93 ){
94 int rc;
95 sqlite3_stmt *pStmt;
96 const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
97
98 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
99 if( rc!=SQLITE_OK ){
100 return rc;
101 }
102
103 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
104 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
105 sqlite3_step(pStmt);
106
107 return sqlite3_finalize(pStmt);
108 }
109
110 int queryTokenizer(
111 sqlite3 *db,
112 char *zName,
113 const sqlite3_tokenizer_module **pp
114 ){
115 int rc;
116 sqlite3_stmt *pStmt;
117 const char zSql[] = "SELECT fts3_tokenizer(?)";
118
119 *pp = 0;
120 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
121 if( rc!=SQLITE_OK ){
122 return rc;
123 }
124
125 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
126 if( SQLITE_ROW==sqlite3_step(pStmt) ){
127 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
128 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
129 }
130 }
131
132 return sqlite3_finalize(pStmt);
133 }
134
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt
deleted file mode 100644
index 517a2a0..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
1This folder contains source code to the second full-text search
2extension for SQLite. While the API is the same, this version uses a
3substantially different storage schema from fts1, so tables will need
4to be rebuilt.
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c
deleted file mode 100644
index b392919..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c
+++ /dev/null
@@ -1,5971 +0,0 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This is an SQLite module implementing full-text search.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS3 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS3 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
24*/
25
26/* TODO(shess) Consider exporting this comment to an HTML file or the
27** wiki.
28*/
29/* The full-text index is stored in a series of b+tree (-like)
30** structures called segments which map terms to doclists. The
31** structures are like b+trees in layout, but are constructed from the
32** bottom up in optimal fashion and are not updatable. Since trees
33** are built from the bottom up, things will be described from the
34** bottom up.
35**
36**
37**** Varints ****
38** The basic unit of encoding is a variable-length integer called a
39** varint. We encode variable-length integers in little-endian order
** using seven bits per byte as follows:
41**
42** KEY:
43** A = 0xxxxxxx 7 bits of data and one flag bit
44** B = 1xxxxxxx 7 bits of data and one flag bit
45**
46** 7 bits - A
47** 14 bits - BA
48** 21 bits - BBA
49** and so on.
50**
51** This is identical to how sqlite encodes varints (see util.c).
52**
53**
54**** Document lists ****
55** A doclist (document list) holds a docid-sorted list of hits for a
56** given term. Doclists hold docids, and can optionally associate
57** token positions and offsets with docids.
58**
59** A DL_POSITIONS_OFFSETS doclist is stored like this:
60**
61** array {
62** varint docid;
63** array { (position list for column 0)
64** varint position; (delta from previous position plus POS_BASE)
65** varint startOffset; (delta from previous startOffset)
66** varint endOffset; (delta from startOffset)
67** }
68** array {
69** varint POS_COLUMN; (marks start of position list for new column)
70** varint column; (index of new column)
71** array {
72** varint position; (delta from previous position plus POS_BASE)
73** varint startOffset;(delta from previous startOffset)
74** varint endOffset; (delta from startOffset)
75** }
76** }
77** varint POS_END; (marks end of positions for this document.
78** }
79**
80** Here, array { X } means zero or more occurrences of X, adjacent in
81** memory. A "position" is an index of a token in the token stream
82** generated by the tokenizer, while an "offset" is a byte offset,
83** both based at 0. Note that POS_END and POS_COLUMN occur in the
** same logical place as the position element, and act as sentinels
85** ending a position list array.
86**
87** A DL_POSITIONS doclist omits the startOffset and endOffset
88** information. A DL_DOCIDS doclist omits both the position and
89** offset information, becoming an array of varint-encoded docids.
90**
91** On-disk data is stored as type DL_DEFAULT, so we don't serialize
92** the type. Due to how deletion is implemented in the segmentation
93** system, on-disk doclists MUST store at least positions.
94**
95**
96**** Segment leaf nodes ****
97** Segment leaf nodes store terms and doclists, ordered by term. Leaf
98** nodes are written using LeafWriter, and read using LeafReader (to
99** iterate through a single leaf node's data) and LeavesReader (to
100** iterate through a segment's entire leaf layer). Leaf nodes have
101** the format:
102**
103** varint iHeight; (height from leaf level, always 0)
104** varint nTerm; (length of first term)
105** char pTerm[nTerm]; (content of first term)
106** varint nDoclist; (length of term's associated doclist)
107** char pDoclist[nDoclist]; (content of doclist)
108** array {
109** (further terms are delta-encoded)
110** varint nPrefix; (length of prefix shared with previous term)
111** varint nSuffix; (length of unshared suffix)
112** char pTermSuffix[nSuffix];(unshared suffix of next term)
113** varint nDoclist; (length of term's associated doclist)
114** char pDoclist[nDoclist]; (content of doclist)
115** }
116**
117** Here, array { X } means zero or more occurrences of X, adjacent in
118** memory.
119**
120** Leaf nodes are broken into blocks which are stored contiguously in
121** the %_segments table in sorted order. This means that when the end
122** of a node is reached, the next term is in the node with the next
123** greater node id.
124**
125** New data is spilled to a new leaf node when the current node
126** exceeds LEAF_MAX bytes (default 2048). New data which itself is
127** larger than STANDALONE_MIN (default 1024) is placed in a standalone
128** node (a leaf node with a single term and doclist). The goal of
129** these settings is to pack together groups of small doclists while
130** making it efficient to directly access large doclists. The
131** assumption is that large doclists represent terms which are more
132** likely to be query targets.
133**
134** TODO(shess) It may be useful for blocking decisions to be more
135** dynamic. For instance, it may make more sense to have a 2.5k leaf
136** node rather than splitting into 2k and .5k nodes. My intuition is
137** that this might extend through 2x or 4x the pagesize.
138**
139**
140**** Segment interior nodes ****
141** Segment interior nodes store blockids for subtree nodes and terms
142** to describe what data is stored by the each subtree. Interior
143** nodes are written using InteriorWriter, and read using
144** InteriorReader. InteriorWriters are created as needed when
145** SegmentWriter creates new leaf nodes, or when an interior node
146** itself grows too big and must be split. The format of interior
147** nodes:
148**
149** varint iHeight; (height from leaf level, always >0)
150** varint iBlockid; (block id of node's leftmost subtree)
151** optional {
152** varint nTerm; (length of first term)
153** char pTerm[nTerm]; (content of first term)
154** array {
155** (further terms are delta-encoded)
156** varint nPrefix; (length of shared prefix with previous term)
157** varint nSuffix; (length of unshared suffix)
158** char pTermSuffix[nSuffix]; (unshared suffix of next term)
159** }
160** }
161**
162** Here, optional { X } means an optional element, while array { X }
163** means zero or more occurrences of X, adjacent in memory.
164**
165** An interior node encodes n terms separating n+1 subtrees. The
166** subtree blocks are contiguous, so only the first subtree's blockid
167** is encoded. The subtree at iBlockid will contain all terms less
168** than the first term encoded (or all terms if no term is encoded).
169** Otherwise, for terms greater than or equal to pTerm[i] but less
170** than pTerm[i+1], the subtree for that term will be rooted at
171** iBlockid+i. Interior nodes only store enough term data to
172** distinguish adjacent children (if the rightmost term of the left
173** child is "something", and the leftmost term of the right child is
174** "wicked", only "w" is stored).
175**
176** New data is spilled to a new interior node at the same height when
177** the current node exceeds INTERIOR_MAX bytes (default 2048).
178** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
179** interior nodes and making the tree too skinny. The interior nodes
180** at a given height are naturally tracked by interior nodes at
181** height+1, and so on.
182**
183**
184**** Segment directory ****
185** The segment directory in table %_segdir stores meta-information for
186** merging and deleting segments, and also the root node of the
187** segment's tree.
188**
189** The root node is the top node of the segment's tree after encoding
190** the entire segment, restricted to ROOT_MAX bytes (default 1024).
191** This could be either a leaf node or an interior node. If the top
192** node requires more than ROOT_MAX bytes, it is flushed to %_segments
193** and a new root interior node is generated (which should always fit
194** within ROOT_MAX because it only needs space for 2 varints, the
195** height and the blockid of the previous root).
196**
197** The meta-information in the segment directory is:
198** level - segment level (see below)
199** idx - index within level
200** - (level,idx uniquely identify a segment)
201** start_block - first leaf node
202** leaves_end_block - last leaf node
203** end_block - last block (including interior nodes)
204** root - contents of root node
205**
206** If the root node is a leaf node, then start_block,
207** leaves_end_block, and end_block are all 0.
208**
209**
210**** Segment merging ****
** To amortize update costs, segments are grouped into levels and
** merged in batches.  Each increase in level represents exponentially
213** more documents.
214**
215** New documents (actually, document updates) are tokenized and
216** written individually (using LeafWriter) to a level 0 segment, with
217** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
218** level 0 segments are merged into a single level 1 segment. Level 1
219** is populated like level 0, and eventually MERGE_COUNT level 1
220** segments are merged to a single level 2 segment (representing
221** MERGE_COUNT^2 updates), and so on.
222**
223** A segment merge traverses all segments at a given level in
224** parallel, performing a straightforward sorted merge. Since segment
** leaf nodes are written into the %_segments table in order, this
226** merge traverses the underlying sqlite disk structures efficiently.
227** After the merge, all segment blocks from the merged level are
228** deleted.
229**
230** MERGE_COUNT controls how often we merge segments. 16 seems to be
231** somewhat of a sweet spot for insertion performance. 32 and 64 show
232** very similar performance numbers to 16 on insertion, though they're
233** a tiny bit slower (perhaps due to more overhead in merge-time
234** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
235** 16, 2 about 66% slower than 16.
236**
237** At query time, high MERGE_COUNT increases the number of segments
238** which need to be scanned and merged. For instance, with 100k docs
239** inserted:
240**
241** MERGE_COUNT segments
242** 16 25
243** 8 12
244** 4 10
245** 2 6
246**
247** This appears to have only a moderate impact on queries for very
248** frequent terms (which are somewhat dominated by segment merge
249** costs), and infrequent and non-existent terms still seem to be fast
250** even with many segments.
251**
252** TODO(shess) That said, it would be nice to have a better query-side
253** argument for MERGE_COUNT of 16. Also, it's possible/likely that
254** optimizations to things like doclist merging will swing the sweet
255** spot around.
256**
257**
258**
259**** Handling of deletions and updates ****
260** Since we're using a segmented structure, with no docid-oriented
261** index into the term index, we clearly cannot simply update the term
262** index when a document is deleted or updated. For deletions, we
263** write an empty doclist (varint(docid) varint(POS_END)), for updates
264** we simply write the new doclist. Segment merges overwrite older
265** data for a particular docid with newer data, so deletes or updates
266** will eventually overtake the earlier data and knock it out. The
267** query logic likewise merges doclists so that newer data knocks out
268** older data.
269**
270** TODO(shess) Provide a VACUUM type operation to clear out all
271** deletions and duplications. This would basically be a forced merge
272** into a single segment.
273*/
274
275#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
276
277#if defined(SQLITE_ENABLE_FTS3) && !defined(SQLITE_CORE)
278# define SQLITE_CORE 1
279#endif
280
281#include <assert.h>
282#include <stdlib.h>
283#include <stdio.h>
284#include <string.h>
285#include <ctype.h>
286
287#include "fts3.h"
288#include "fts3_hash.h"
289#include "fts3_tokenizer.h"
290#include "sqlite3.h"
291#include "sqlite3ext.h"
292SQLITE_EXTENSION_INIT1
293
294
295/* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
296** would be nice to order the file better, perhaps something along the
297** lines of:
298**
299** - utility functions
300** - table setup functions
301** - table update functions
302** - table query functions
303**
304** Put the query functions last because they're likely to reference
305** typedefs or functions from the table update section.
306*/
307
308#if 0
309# define TRACE(A) printf A; fflush(stdout)
310#else
311# define TRACE(A)
312#endif
313
314/* It is not safe to call isspace(), tolower(), or isalnum() on
315** hi-bit-set characters. This is the same solution used in the
316** tokenizer.
317*/
318/* TODO(shess) The snippet-generation code should be using the
319** tokenizer-generated tokens rather than doing its own local
320** tokenization.
321*/
322/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
/* isspace() restricted to 7-bit ASCII.  Bytes with the high bit set
** are never treated as whitespace; passing them to isspace() directly
** would be undefined behavior when plain char is signed.
*/
static int safe_isspace(char c){
  if( c&0x80 ) return 0;
  return isspace(c);
}
/* tolower() restricted to 7-bit ASCII.  Bytes with the high bit set
** are returned unchanged rather than being passed to tolower(), which
** is undefined for such values when plain char is signed.
*/
static int safe_tolower(char c){
  if( c&0x80 ) return c;
  return tolower(c);
}
/* isalnum() restricted to 7-bit ASCII.  Bytes with the high bit set
** are never considered alphanumeric, avoiding undefined behavior in
** isalnum() for negative char values.
*/
static int safe_isalnum(char c){
  if( c&0x80 ) return 0;
  return isalnum(c);
}
332
/* Amount of detail stored per docid in a doclist.  The values are
** ordered by increasing detail, so relational comparisons on iType
** (e.g. iType>=DL_POSITIONS) are meaningful throughout this file.
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;
338
339/*
340** By default, only positions and not offsets are stored in the doclists.
341** To change this so that offsets are stored too, compile with
342**
343** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
344**
345** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
346** into (no deletes or updates).
347*/
348#ifndef DL_DEFAULT
349# define DL_DEFAULT DL_POSITIONS
350#endif
351
/* Markers used inside an encoded position list.  Position deltas are
** stored as (delta+POS_BASE) so the values 0 and 1 remain free to
** mark end-of-list and a column change, respectively.
*/
enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};
357
358/* MERGE_COUNT controls how often we merge segments (see comment at
359** top of file).
360*/
361#define MERGE_COUNT 16
362
363/* utility functions */
364
365/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
366** record to prevent errors of the form:
367**
368** my_function(SomeType *b){
369** memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
370** }
371*/
372/* TODO(shess) Obvious candidates for a header file. */
373#define CLEAR(b) memset(b, '\0', sizeof(*(b)))
374
375#ifndef NDEBUG
376# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
377#else
378# define SCRAMBLE(b)
379#endif
380
381/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
382#define VARINT_MAX 10
383
384/* Write a 64-bit variable-length integer to memory starting at p[0].
385 * The length of data written will be between 1 and VARINT_MAX bytes.
386 * The number of bytes written is returned. */
static int putVarint(char *p, sqlite_int64 v){
  unsigned char *q = (unsigned char *) p;
  sqlite_uint64 vu = v;  /* unsigned so >>7 shifts in zero bits */
  /* Emit 7 bits per byte, least-significant group first, with the
  ** high bit of each byte set as a continuation marker.
  */
  do{
    *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    vu >>= 7;
  }while( vu!=0 );
  q[-1] &= 0x7f;  /* turn off high bit in final byte */
  assert( q - (unsigned char *)p <= VARINT_MAX );
  return (int) (q - (unsigned char *)p);
}
398
399/* Read a 64-bit variable-length integer from memory starting at p[0].
400 * Return the number of bytes read, or 0 on error.
401 * The value is stored in *v. */
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;  /* y is the place value 2^(7*i) */
  /* Accumulate 7-bit groups while the continuation bit is set. */
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    /* Reject encodings longer than VARINT_MAX bytes.  Returning 0
    ** signals corrupt data to the caller (debug builds assert).
    */
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);  /* final byte has the high bit clear */
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}
417
/* Read a varint as above but narrow the result to int.
** NOTE(review): truncation to int is only checked by the assert, so a
** release build would silently truncate values that do not fit — the
** callers rely on 32-bit-safe data here.
*/
static int getVarint32(const char *p, int *pi){
 sqlite_int64 i;
 int ret = getVarint(p, &i);
 *pi = (int) i;
 assert( *pi==i );
 return ret;
}
425
426/*******************************************************************/
427/* DataBuffer is used to collect data into a buffer in piecemeal
428** fashion. It implements the usual distinction between amount of
429** data currently stored (nData) and buffer capacity (nCapacity).
430**
431** dataBufferInit - create a buffer with given initial capacity.
432** dataBufferReset - forget buffer's data, retaining capacity.
433** dataBufferDestroy - free buffer's data.
434** dataBufferExpand - expand capacity without adding data.
435** dataBufferAppend - append data.
436** dataBufferAppend2 - append two pieces of data at once.
437** dataBufferReplace - replace buffer's data.
438*/
/* Invariant: 0<=nData<=nCapacity, and pData is NULL iff nCapacity==0. */
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;
444
/* Initialize an empty buffer with nCapacity bytes preallocated.
** NOTE(review): the malloc() result is not checked; on OOM the first
** use of pData will fault.  This matches the rest of this file.
*/
static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  assert( nCapacity>=0 );
  pBuffer->nData = 0;
  pBuffer->nCapacity = nCapacity;
  pBuffer->pData = nCapacity==0 ? NULL : malloc(nCapacity);
}
/* Discard the buffer's contents while keeping its allocation. */
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
454static void dataBufferDestroy(DataBuffer *pBuffer){
455 if( pBuffer->pData!=NULL ) free(pBuffer->pData);
456 SCRAMBLE(pBuffer);
457}
/* Ensure there is room to append nAddCapacity more bytes.  Grows the
** allocation exactly to the needed size (no over-allocation).
** NOTE(review): realloc() overwrites pData directly, so on OOM the
** original pointer is lost (leak) and NULL is stored; callers assume
** allocation never fails, consistent with the rest of this file.
*/
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
/* Append nSource bytes from pSource to the end of the buffer,
** expanding the allocation as needed.
*/
static void dataBufferAppend(DataBuffer *pBuffer,
                             const char *pSource, int nSource){
  assert( nSource>0 && pSource!=NULL );
  dataBufferExpand(pBuffer, nSource);
  memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
  pBuffer->nData += nSource;
}
/* Append two spans in one call; a single dataBufferExpand() covers
** both, saving a possible realloc over two dataBufferAppend() calls.
*/
static void dataBufferAppend2(DataBuffer *pBuffer,
                              const char *pSource1, int nSource1,
                              const char *pSource2, int nSource2){
  assert( nSource1>0 && pSource1!=NULL );
  assert( nSource2>0 && pSource2!=NULL );
  dataBufferExpand(pBuffer, nSource1+nSource2);
  memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
  memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
  pBuffer->nData += nSource1+nSource2;
}
/* Replace the buffer's contents with the given span (capacity kept). */
static void dataBufferReplace(DataBuffer *pBuffer,
                              const char *pSource, int nSource){
  dataBufferReset(pBuffer);
  dataBufferAppend(pBuffer, pSource, nSource);
}
491
492/* StringBuffer is a null-terminated version of DataBuffer. */
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator, so nData>=1. */
} StringBuffer;
496
/* Initialize to the empty string "" with ~100 bytes preallocated. */
static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
/* strlen() of the contents; excludes the stored terminator. */
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
/* NUL-terminated contents; owned by the buffer, not the caller. */
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
/* Release the underlying DataBuffer's storage. */
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}
510
/* Append nFrom bytes of zFrom, keeping the result NUL-terminated. */
static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  assert( sb->b.nData>0 );
  if( nFrom>0 ){
    /* Drop the old terminator, then append the data plus a fresh
    ** "" (one byte) to re-terminate in a single buffer operation.
    */
    sb->b.nData--;
    dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  }
}
/* Append the NUL-terminated string zFrom. */
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}
521
522/* Append a list of strings separated by commas. */
523static void appendList(StringBuffer *sb, int nString, char **azString){
524 int i;
525 for(i=0; i<nString; ++i){
526 if( i>0 ) append(sb, ", ");
527 append(sb, azString[i]);
528 }
529}
530
/* True if the buffer is non-empty and its last character is ASCII
** whitespace.
*/
static int endsInWhiteSpace(StringBuffer *p){
  return stringBufferLength(p)>0 &&
    safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
  if( !endsInWhiteSpace(p) ) append(p, " ");
}

/* Remove white space from the end of the StringBuffer */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    /* Shrink by one byte and overwrite the new last character (the
    ** trailing whitespace) with the terminator.
    */
    p->b.pData[--p->b.nData-1] = '\0';
  }
}
550
551/*******************************************************************/
552/* DLReader is used to read document elements from a doclist. The
553** current docid is cached, so dlrDocid() is fast. DLReader does not
554** own the doclist buffer.
555**
556** dlrAtEnd - true if there's no more data to read.
557** dlrDocid - docid of current document.
558** dlrDocData - doclist data for current document (including docid).
559** dlrDocDataBytes - length of same.
560** dlrAllDataBytes - length of all remaining data.
561** dlrPosData - position data for current document.
562** dlrPosDataLen - length of pos data for current document (incl POS_END).
563** dlrStep - step to current document.
** dlrInit - initialize for a doclist of the given type over the given data.
565** dlrDestroy - clean up.
566**
567** Expected usage is something like:
568**
569** DLReader reader;
570** dlrInit(&reader, pData, nData);
571** while( !dlrAtEnd(&reader) ){
572** // calls to dlrDocid() and kin.
573** dlrStep(&reader);
574** }
575** dlrDestroy(&reader);
576*/
typedef struct DLReader {
  DocListType iType;    /* detail level of the doclist being read */
  const char *pData;    /* remaining data, starts at current element */
  int nData;            /* bytes remaining, 0 at end-of-doclist */

  sqlite_int64 iDocid;  /* docid of current element (delta-decoded) */
  int nElement;         /* total bytes of current element, incl docid */
} DLReader;
585
/* True once the entire doclist has been consumed. */
static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
/* Docid of the current element (already delta-decoded by dlrStep). */
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
/* Raw bytes of the current element, starting at its docid varint. */
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
/* Length of the current element's raw bytes. */
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
/* Length of all remaining data, current element included. */
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
606/* TODO(shess) Consider adding a field to track iDocid varint length
607** to make these two functions faster. This might matter (a tiny bit)
608** for queries.
609*/
/* Position-list data of the current element: the element's bytes with
** the leading docid varint skipped.
*/
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
/* Length of the current element's position-list data (incl POS_END). */
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
/* Advance past the current element and decode the next one, updating
** the cached docid and measuring the next element's byte length.
*/
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;   /* docids are delta-encoded */
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      /* Walk the position list to find where this element ends.  The
      ** varints are not retained here; PLReader re-decodes them when
      ** the caller wants position data.
      */
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          /* Skip the start- and end-offset varints. */
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
/* Set up a reader over pData/nData.  The doclist must be non-empty;
** the first element is decoded immediately.  The reader does not own
** or copy the data, which must outlive it.
*/
static void dlrInit(DLReader *pReader, DocListType iType,
                    const char *pData, int nData){
  assert( pData!=NULL && nData!=0 );
  pReader->iType = iType;
  pReader->pData = pData;
  pReader->nData = nData;
  pReader->nElement = 0;   /* so the dlrStep() below skips nothing */
  pReader->iDocid = 0;

  /* Load the first element's data.  There must be a first element. */
  dlrStep(pReader);
}
/* Invalidate a reader (scrambled in debug builds; frees nothing). */
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}
670
671#ifndef NDEBUG
672/* Verify that the doclist can be validly decoded. Also returns the
673** last docid found because it's convenient in other assertions for
674** DLWriter.
675*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );   /* guard against pointer wraparound */
  /* Decode every element; any structural problem trips an assert in
  ** getVarint/getVarint32 or the bounds asserts below.
  */
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;   /* accumulate delta-encoded docids */
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          /* start offset and end offset */
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
706#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
707#else
708#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
709#endif
710
711/*******************************************************************/
712/* DLWriter is used to write doclist data to a DataBuffer. DLWriter
713** always appends to the buffer and does not own it.
714**
** dlwInit - initialize to write a given type doclist to a buffer.
716** dlwDestroy - clear the writer's memory. Does not free buffer.
717** dlwAppend - append raw doclist data to buffer.
718** dlwCopy - copy next doclist from reader to writer.
719** dlwAdd - construct doclist element and append to buffer.
720** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
721*/
typedef struct DLWriter {
  DocListType iType;          /* detail level being written */
  DataBuffer *b;              /* output buffer; not owned by writer */
  sqlite_int64 iPrevDocid;    /* last docid written, for delta-encoding */
#ifndef NDEBUG
  int has_iPrevDocid;         /* true once any docid has been written */
#endif
} DLWriter;
730
/* Prepare to append iType doclist data to buffer b (not owned). */
static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
  pWriter->b = b;
  pWriter->iType = iType;
  pWriter->iPrevDocid = 0;   /* first docid is encoded absolute */
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 0;
#endif
}
/* Invalidate a writer (scrambled in debug builds; buffer survives). */
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
742/* iFirstDocid is the first docid in the doclist in pData. It is
743** needed because pData may point within a larger doclist, in which
744** case the first item would be delta-encoded.
745**
746** iLastDocid is the final docid in the doclist in pData. It is
747** needed to create the new iPrevDocid for future delta-encoding. The
748** code could decode the passed doclist to recreate iLastDocid, but
749** the only current user (docListMerge) already has decoded this
750** information.
751*/
752/* TODO(shess) This has become just a helper for docListMerge.
753** Consider a refactor to make this cleaner.
754*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  /* A docid-only doclist may consist of exactly one varint. */
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
/* Copy the reader's current element into the writer.  First and last
** docid coincide because exactly one element is copied.
*/
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
/* Append a bare docid element.  Only valid for DL_DOCIDS doclists;
** position-bearing doclists must go through PLWriter instead.
*/
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}
806
807/*******************************************************************/
808/* PLReader is used to read data from a document's position list. As
809** the caller steps through the list, data is cached so that varints
810** only need to be decoded once.
811**
812** plrInit, plrDestroy - create/destroy a reader.
813** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
814** plrAtEnd - at end of stream, only call plrDestroy once true.
815** plrStep - step to the next element.
816*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;    /* detail level of the enclosing doclist */
  int iColumn;          /* the last column read */
  int iPosition;        /* the last position read */
  int iStartOffset;     /* the last start offset read */
  int iEndOffset;       /* the last end offset read */
} PLReader;
831
/* True once the position list is exhausted (pData nulled by plrStep). */
static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
/* Column number of the current position. */
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
/* Token position (word offset) of the current entry. */
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
/* Byte offset where the current token starts (DL_POSITIONS_OFFSETS). */
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
/* Byte offset where the current token ends (DL_POSITIONS_OFFSETS). */
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
/* Decode the next position entry, updating the cached column,
** position, and offset values.  Signals EOF by nulling pData.
*/
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  /* All data consumed on a previous step: now report EOF. */
  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    /* Column change: read the new column, reset the per-column
    ** delta bases, then read the following position varint.
    */
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  /* Positions are delta-encoded and biased by POS_BASE. */
  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    /* Start offset is delta-encoded; end offset is relative to start. */
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}
888
/* Initialize over the position data of pDLReader's current element and
** decode the first entry.  Does not own the underlying data.
*/
static void plrInit(PLReader *pReader, DLReader *pDLReader){
  pReader->pData = dlrPosData(pDLReader);
  pReader->nData = dlrPosDataLen(pDLReader);
  pReader->iType = pDLReader->iType;
  pReader->iColumn = 0;
  pReader->iPosition = 0;
  pReader->iStartOffset = 0;
  pReader->iEndOffset = 0;
  plrStep(pReader);
}
/* Invalidate a reader (scrambled in debug builds; frees nothing). */
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}
902
903/*******************************************************************/
904/* PLWriter is used in constructing a document's position list. As a
905** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
906** PLWriter writes to the associated DLWriter's buffer.
907**
908** plwInit - init for writing a document's poslist.
909** plwDestroy - clear a writer.
910** plwAdd - append position and offset information.
911** plwCopy - copy next position's data from reader to writer.
912** plwTerminate - add any necessary doclist terminator.
913**
914** Calling plwAdd() after plwTerminate() may result in a corrupt
915** doclist.
916*/
917/* TODO(shess) Until we've written the second item, we can cache the
918** first item's information. Then we'd have three states:
919**
920** - initialized with docid, no positions.
921** - docid and one position.
922** - docid and multiple positions.
923**
924** Only the last state needs to actually write to dlw->b, which would
925** be an improvement in the DLCollector case.
926*/
typedef struct PLWriter {
  DLWriter *dlw;        /* parent writer; output goes to dlw->b */

  int iColumn;          /* the last column written */
  int iPos;             /* the last position written (-1 = terminated) */
  int iOffset;          /* the last start offset written */
} PLWriter;
934
935/* TODO(shess) In the case where the parent is reading these values
936** from a PLReader, we could optimize to a copy if that PLReader has
937** the same type as pWriter.
938*/
/* Encode one position entry (column change if needed, then position
** delta, then offset deltas for DL_POSITIONS_OFFSETS) and append it
** to the parent DLWriter's buffer.  A no-op for DL_DOCIDS doclists.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    /* Column change resets the position and offset delta bases. */
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  /* Bias by POS_BASE so the delta never collides with the markers. */
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    /* End offset is encoded relative to its own start offset. */
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
/* Re-encode the reader's current position entry into the writer. */
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
/* Begin a new document: write its (delta-encoded) docid to the parent
** DLWriter's buffer and reset the per-document delta bases.
*/
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
994/* TODO(shess) Should plwDestroy() also terminate the doclist? But
995** then plwDestroy() would no longer be just a destructor, it would
996** also be doing work, which isn't consistent with the overall idiom.
997** Another option would be for plwAdd() to always append any necessary
998** terminator, so that the output is always correct. But that would
999** add incremental work to the common case with the only benefit being
1000** API elegance. Punt for now.
1001*/
/* Close the current document's position list with POS_END (no
** terminator is needed for docid-only doclists).
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
/* Invalidate a writer (scrambled in debug builds; buffer survives). */
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}
1016
1017/*******************************************************************/
1018/* DLCollector wraps PLWriter and DLWriter to provide a
1019** dynamically-allocated doclist area to use during tokenization.
1020**
1021** dlcNew - malloc up and initialize a collector.
1022** dlcDelete - destroy a collector and all contained items.
1023** dlcAddPos - append position and offset information.
1024** dlcAddDoclist - add the collected doclist to the given buffer.
1025** dlcNext - terminate the current document and open another.
1026*/
typedef struct DLCollector {
  DataBuffer b;      /* owned storage the writers append into */
  DLWriter dlw;      /* doclist writer targeting b */
  PLWriter plw;      /* position writer layered on dlw */
} DLCollector;
1032
1033/* TODO(shess) This could also be done by calling plwTerminate() and
1034** dataBufferAppend(). I tried that, expecting nominal performance
1035** differences, but it seemed to pretty reliably be worth 1% to code
1036** it this way. I suspect it's the incremental malloc overhead (some
1037** percentage of the plwTerminate() calls will cause a realloc), so
1038** this might be worth revisiting if the DataBuffer implementation
1039** changes.
1040*/
/* Append the collected doclist to b, adding the trailing POS_END that
** the collector's buffer does not yet contain (position-bearing types
** only; docid-only doclists need no terminator).
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  if( pCollector->dlw.iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
  }else{
    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
  }
}
/* Terminate the current document's position list and start collecting
** a new document with the given docid.
*/
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
/* Record one position/offset entry for the current document. */
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}
1059
/* Allocate and initialize a collector, opening the first document at
** iDocid.  Caller frees with dlcDelete().
** NOTE(review): the malloc() result is not checked; on OOM the
** dataBufferInit() call below will fault.  Matches file convention.
*/
static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
  DLCollector *pCollector = malloc(sizeof(DLCollector));
  dataBufferInit(&pCollector->b, 0);
  dlwInit(&pCollector->dlw, iType, &pCollector->b);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
  return pCollector;
}
/* Tear down all contained writers and buffers, then free pCollector. */
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  free(pCollector);
}
1074
1075
1076/* Copy the doclist data of iType in pData/nData into *out, trimming
1077** unnecessary data as we go. Only columns matching iColumn are
1078** copied, all columns copied if iColumn is -1. Elements with no
1079** matching columns are dropped. The output is an iOutType doclist.
1080*/
1081/* NOTE(shess) This code is only valid after all doclists are merged.
1082** If this is run before merges, then doclist items which represent
1083** deletion will be trimmed, and will thus not effect a deletion
1084** during the merge.
1085*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  /* Output can carry equal or less detail than the input. */
  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;   /* true once this docid has a matching position */

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        /* Lazily open the output element so that docids with no
        ** matching column are dropped entirely.
        */
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}
1125
1126/* Used by docListMerge() to keep doclists in the ascending order by
1127** docid, then ascending order by age (so the newest comes first).
1128*/
typedef struct OrderedDLReader {
  DLReader *pReader;    /* underlying reader; not owned */

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;              /* age rank; larger idx means newer data */
} OrderedDLReader;
1138
1139/* Order eof to end, then by docid asc, idx desc. */
1140static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
1141 if( dlrAtEnd(r1->pReader) ){
1142 if( dlrAtEnd(r2->pReader) ) return 0; /* Both atEnd(). */
1143 return 1; /* Only r1 atEnd(). */
1144 }
1145 if( dlrAtEnd(r2->pReader) ) return -1; /* Only r2 atEnd(). */
1146
1147 if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
1148 if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
1149
1150 /* Descending on idx. */
1151 return r2->idx-r1->idx;
1152}
1153
1154/* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that
1155** p[1..n-1] is already sorted.
1156*/
1157/* TODO(shess) Is this frequent enough to warrant a binary search?
1158** Before implementing that, instrument the code to check. In most
1159** current usage, I expect that p[0] will be less than p[1] a very
1160** high proportion of the time.
1161*/
1162static void orderedDLReaderReorder(OrderedDLReader *p, int n){
1163 while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
1164 OrderedDLReader tmp = p[0];
1165 p[0] = p[1];
1166 p[1] = tmp;
1167 n--;
1168 p++;
1169 }
1170}
1171
/* Given an array of doclist readers, merge their doclist elements
** into out in sorted order (by docid), dropping elements from older
** readers when there is a duplicate docid.  pReaders is assumed to be
** ordered by age, oldest first.
*/
/* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
** be fixed.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;     /* Start of pending contiguous run to copy. */
  int nStart = 0;             /* Byte length of the pending run. */
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;  /* Docid range of run. */

  assert( nReaders>0 );
  if( nReaders==1 ){
    /* Single input: its data can be copied through verbatim. */
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;       /* Larger idx == newer reader. */
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order.  Note that i==nReaders here
  ** (left over from the loop above); this insertion-sorts the array
  ** from back to front.
  */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  /* readers[0] always holds the smallest docid (newest on ties). */
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has a
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order.  i is the index just past the
    ** last reader that was stepped, so only readers[0..i-1] can be
    ** out of place.
    */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
1251
1252/* Helper function for posListUnion(). Compares the current position
1253** between left and right, returning as standard C idiom of <0 if
1254** left<right, >0 if left>right, and 0 if left==right. "End" always
1255** compares greater.
1256*/
1257static int posListCmp(PLReader *pLeft, PLReader *pRight){
1258 assert( pLeft->iType==pRight->iType );
1259 if( pLeft->iType==DL_DOCIDS ) return 0;
1260
1261 if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
1262 if( plrAtEnd(pRight) ) return -1;
1263
1264 if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
1265 if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
1266
1267 if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
1268 if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
1269 if( pLeft->iType==DL_POSITIONS ) return 0;
1270
1271 if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
1272 if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
1273
1274 if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
1275 if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
1276
1277 return 0;
1278}
1279
1280/* Write the union of position lists in pLeft and pRight to pOut.
1281** "Union" in this case meaning "All unique position tuples". Should
1282** work with any doclist type, though both inputs and the output
1283** should be the same type.
1284*/
1285static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
1286 PLReader left, right;
1287 PLWriter writer;
1288
1289 assert( dlrDocid(pLeft)==dlrDocid(pRight) );
1290 assert( pLeft->iType==pRight->iType );
1291 assert( pLeft->iType==pOut->iType );
1292
1293 plrInit(&left, pLeft);
1294 plrInit(&right, pRight);
1295 plwInit(&writer, pOut, dlrDocid(pLeft));
1296
1297 while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
1298 int c = posListCmp(&left, &right);
1299 if( c<0 ){
1300 plwCopy(&writer, &left);
1301 plrStep(&left);
1302 }else if( c>0 ){
1303 plwCopy(&writer, &right);
1304 plrStep(&right);
1305 }else{
1306 plwCopy(&writer, &left);
1307 plrStep(&left);
1308 plrStep(&right);
1309 }
1310 }
1311
1312 plwTerminate(&writer);
1313 plwDestroy(&writer);
1314 plrDestroy(&left);
1315 plrDestroy(&right);
1316}
1317
1318/* Write the union of doclists in pLeft and pRight to pOut. For
1319** docids in common between the inputs, the union of the position
1320** lists is written. Inputs and outputs are always type DL_DEFAULT.
1321*/
1322static void docListUnion(
1323 const char *pLeft, int nLeft,
1324 const char *pRight, int nRight,
1325 DataBuffer *pOut /* Write the combined doclist here */
1326){
1327 DLReader left, right;
1328 DLWriter writer;
1329
1330 if( nLeft==0 ){
1331 dataBufferAppend(pOut, pRight, nRight);
1332 return;
1333 }
1334 if( nRight==0 ){
1335 dataBufferAppend(pOut, pLeft, nLeft);
1336 return;
1337 }
1338
1339 dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
1340 dlrInit(&right, DL_DEFAULT, pRight, nRight);
1341 dlwInit(&writer, DL_DEFAULT, pOut);
1342
1343 while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
1344 if( dlrAtEnd(&right) ){
1345 dlwCopy(&writer, &left);
1346 dlrStep(&left);
1347 }else if( dlrAtEnd(&left) ){
1348 dlwCopy(&writer, &right);
1349 dlrStep(&right);
1350 }else if( dlrDocid(&left)<dlrDocid(&right) ){
1351 dlwCopy(&writer, &left);
1352 dlrStep(&left);
1353 }else if( dlrDocid(&left)>dlrDocid(&right) ){
1354 dlwCopy(&writer, &right);
1355 dlrStep(&right);
1356 }else{
1357 posListUnion(&left, &right, &writer);
1358 dlrStep(&left);
1359 dlrStep(&right);
1360 }
1361 }
1362
1363 dlrDestroy(&left);
1364 dlrDestroy(&right);
1365 dlwDestroy(&writer);
1366}
1367
1368/* pLeft and pRight are DLReaders positioned to the same docid.
1369**
1370** If there are no instances in pLeft or pRight where the position
1371** of pLeft is one less than the position of pRight, then this
1372** routine adds nothing to pOut.
1373**
1374** If there are one or more instances where positions from pLeft
1375** are exactly one less than positions from pRight, then add a new
1376** document record to pOut. If pOut wants to hold positions, then
1377** include the positions from pRight that are one more than a
1378** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
1379*/
1380static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
1381 DLWriter *pOut){
1382 PLReader left, right;
1383 PLWriter writer;
1384 int match = 0;
1385
1386 assert( dlrDocid(pLeft)==dlrDocid(pRight) );
1387 assert( pOut->iType!=DL_POSITIONS_OFFSETS );
1388
1389 plrInit(&left, pLeft);
1390 plrInit(&right, pRight);
1391
1392 while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
1393 if( plrColumn(&left)<plrColumn(&right) ){
1394 plrStep(&left);
1395 }else if( plrColumn(&left)>plrColumn(&right) ){
1396 plrStep(&right);
1397 }else if( plrPosition(&left)+1<plrPosition(&right) ){
1398 plrStep(&left);
1399 }else if( plrPosition(&left)+1>plrPosition(&right) ){
1400 plrStep(&right);
1401 }else{
1402 if( !match ){
1403 plwInit(&writer, pOut, dlrDocid(pLeft));
1404 match = 1;
1405 }
1406 plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
1407 plrStep(&left);
1408 plrStep(&right);
1409 }
1410 }
1411
1412 if( match ){
1413 plwTerminate(&writer);
1414 plwDestroy(&writer);
1415 }
1416
1417 plrDestroy(&left);
1418 plrDestroy(&right);
1419}
1420
1421/* We have two doclists with positions: pLeft and pRight.
1422** Write the phrase intersection of these two doclists into pOut.
1423**
1424** A phrase intersection means that two documents only match
1425** if pLeft.iPos+1==pRight.iPos.
1426**
1427** iType controls the type of data written to pOut. If iType is
1428** DL_POSITIONS, the positions are those from pRight.
1429*/
1430static void docListPhraseMerge(
1431 const char *pLeft, int nLeft,
1432 const char *pRight, int nRight,
1433 DocListType iType,
1434 DataBuffer *pOut /* Write the combined doclist here */
1435){
1436 DLReader left, right;
1437 DLWriter writer;
1438
1439 if( nLeft==0 || nRight==0 ) return;
1440
1441 assert( iType!=DL_POSITIONS_OFFSETS );
1442
1443 dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
1444 dlrInit(&right, DL_POSITIONS, pRight, nRight);
1445 dlwInit(&writer, iType, pOut);
1446
1447 while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
1448 if( dlrDocid(&left)<dlrDocid(&right) ){
1449 dlrStep(&left);
1450 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1451 dlrStep(&right);
1452 }else{
1453 posListPhraseMerge(&left, &right, &writer);
1454 dlrStep(&left);
1455 dlrStep(&right);
1456 }
1457 }
1458
1459 dlrDestroy(&left);
1460 dlrDestroy(&right);
1461 dlwDestroy(&writer);
1462}
1463
1464/* We have two DL_DOCIDS doclists: pLeft and pRight.
1465** Write the intersection of these two doclists into pOut as a
1466** DL_DOCIDS doclist.
1467*/
1468static void docListAndMerge(
1469 const char *pLeft, int nLeft,
1470 const char *pRight, int nRight,
1471 DataBuffer *pOut /* Write the combined doclist here */
1472){
1473 DLReader left, right;
1474 DLWriter writer;
1475
1476 if( nLeft==0 || nRight==0 ) return;
1477
1478 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1479 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1480 dlwInit(&writer, DL_DOCIDS, pOut);
1481
1482 while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
1483 if( dlrDocid(&left)<dlrDocid(&right) ){
1484 dlrStep(&left);
1485 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1486 dlrStep(&right);
1487 }else{
1488 dlwAdd(&writer, dlrDocid(&left));
1489 dlrStep(&left);
1490 dlrStep(&right);
1491 }
1492 }
1493
1494 dlrDestroy(&left);
1495 dlrDestroy(&right);
1496 dlwDestroy(&writer);
1497}
1498
1499/* We have two DL_DOCIDS doclists: pLeft and pRight.
1500** Write the union of these two doclists into pOut as a
1501** DL_DOCIDS doclist.
1502*/
1503static void docListOrMerge(
1504 const char *pLeft, int nLeft,
1505 const char *pRight, int nRight,
1506 DataBuffer *pOut /* Write the combined doclist here */
1507){
1508 DLReader left, right;
1509 DLWriter writer;
1510
1511 if( nLeft==0 ){
1512 dataBufferAppend(pOut, pRight, nRight);
1513 return;
1514 }
1515 if( nRight==0 ){
1516 dataBufferAppend(pOut, pLeft, nLeft);
1517 return;
1518 }
1519
1520 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1521 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1522 dlwInit(&writer, DL_DOCIDS, pOut);
1523
1524 while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
1525 if( dlrAtEnd(&right) ){
1526 dlwAdd(&writer, dlrDocid(&left));
1527 dlrStep(&left);
1528 }else if( dlrAtEnd(&left) ){
1529 dlwAdd(&writer, dlrDocid(&right));
1530 dlrStep(&right);
1531 }else if( dlrDocid(&left)<dlrDocid(&right) ){
1532 dlwAdd(&writer, dlrDocid(&left));
1533 dlrStep(&left);
1534 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1535 dlwAdd(&writer, dlrDocid(&right));
1536 dlrStep(&right);
1537 }else{
1538 dlwAdd(&writer, dlrDocid(&left));
1539 dlrStep(&left);
1540 dlrStep(&right);
1541 }
1542 }
1543
1544 dlrDestroy(&left);
1545 dlrDestroy(&right);
1546 dlwDestroy(&writer);
1547}
1548
1549/* We have two DL_DOCIDS doclists: pLeft and pRight.
1550** Write into pOut as DL_DOCIDS doclist containing all documents that
1551** occur in pLeft but not in pRight.
1552*/
1553static void docListExceptMerge(
1554 const char *pLeft, int nLeft,
1555 const char *pRight, int nRight,
1556 DataBuffer *pOut /* Write the combined doclist here */
1557){
1558 DLReader left, right;
1559 DLWriter writer;
1560
1561 if( nLeft==0 ) return;
1562 if( nRight==0 ){
1563 dataBufferAppend(pOut, pLeft, nLeft);
1564 return;
1565 }
1566
1567 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1568 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1569 dlwInit(&writer, DL_DOCIDS, pOut);
1570
1571 while( !dlrAtEnd(&left) ){
1572 while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
1573 dlrStep(&right);
1574 }
1575 if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
1576 dlwAdd(&writer, dlrDocid(&left));
1577 }
1578 dlrStep(&left);
1579 }
1580
1581 dlrDestroy(&left);
1582 dlrDestroy(&right);
1583 dlwDestroy(&writer);
1584}
1585
/* Return a malloc()ed copy of the first n bytes of s with a '\0'
** terminator appended.  Returns NULL if the allocation fails (the
** original dereferenced the unchecked malloc() result, which is
** undefined behavior on OOM).  The caller must free() the result.
*/
static char *string_dup_n(const char *s, int n){
  char *str = malloc(n + 1);
  if( str==NULL ) return NULL;   /* OOM: fail cleanly instead of UB */
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
1592
/* Duplicate a NUL-terminated string; the caller must free() the copy.
 * (strdup() is avoided because it is not part of the standard C
 * library and may be unavailable on some platforms.) */
static char *string_dup(const char *s){
  size_t nBytes = strlen(s);
  return string_dup_n(s, (int)nBytes);
}
1599
/* Format a string, replacing each occurrence of the % character with
 * zDb.zName.  This may be more convenient than sqlite_mprintf()
 * when one string is used repeatedly in a format string.
 * Returns NULL if the allocation fails (the original wrote through
 * the unchecked malloc() result, which is undefined behavior on OOM).
 * The caller must free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* "db" + "." + "name" */
  char *result;
  char *r;

  /* First pass: compute the output length. */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = malloc(len);
  if( result==NULL ) return NULL;   /* OOM: fail cleanly instead of UB */

  /* Second pass: copy, expanding each '%'. */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
1636
1637static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
1638 const char *zFormat){
1639 char *zCommand = string_format(zFormat, zDb, zName);
1640 int rc;
1641 TRACE(("FTS3 sql: %s\n", zCommand));
1642 rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
1643 free(zCommand);
1644 return rc;
1645}
1646
1647static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
1648 sqlite3_stmt **ppStmt, const char *zFormat){
1649 char *zCommand = string_format(zFormat, zDb, zName);
1650 int rc;
1651 TRACE(("FTS3 prepare: %s\n", zCommand));
1652 rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
1653 free(zCommand);
1654 return rc;
1655}
1656
1657/* end utility functions */
1658
1659/* Forward reference */
1660typedef struct fulltext_vtab fulltext_vtab;
1661
/* A single term in a query is represented by an instance of
** the following structure.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;
1675
1676
/* A query string is parsed into a Query structure.
 *
 * We could, in theory, allow query strings to be complicated
 * nested expressions with precedence determined by parentheses.
 * But none of the major search engines do this.  (Perhaps the
 * feeling is that a parenthesized expression is too complex of
 * an idea for the average user to grasp.)  Taking our lead from
 * the major search engines, we will allow queries to be a list
 * of terms (with an implied AND operator) or phrases in double-quotes,
 * with a single optional "-" before each non-phrase term to designate
 * negation and an optional OR connector.
 *
 * OR binds more tightly than the implied AND, which is what the
 * major search engines seem to do.  So, for example:
 *
 *    [one two OR three]     ==>    one AND (two OR three)
 *    [one OR two three]     ==>    (one OR two) AND three
 *
 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
 * space.  This is how the search engines do it.
 *
 * A NOT term cannot be the right-hand operand of an OR.  If this
 * occurs in the query string, the NOT is ignored:
 *
 *    [one OR -two]          ==>    one OR two
 *
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;
1713
1714
1715/*
1716** An instance of the following structure keeps track of generated
1717** matching-word offset information and snippets.
1718*/
1719typedef struct Snippet {
1720 int nMatch; /* Total number of matches */
1721 int nAlloc; /* Space allocated for aMatch[] */
1722 struct snippetMatch { /* One entry for each matching term */
1723 char snStatus; /* Status flag for use while constructing snippets */
1724 short int iCol; /* The column that contains the match */
1725 short int iTerm; /* The index in Query.pTerms[] of the matching term */
1726 short int nByte; /* Number of bytes in the term */
1727 int iStart; /* The offset to the first character of the term */
1728 } *aMatch; /* Points to space obtained from malloc */
1729 char *zOffset; /* Text rendering of aMatch[] */
1730 int nOffset; /* strlen(zOffset) */
1731 char *zSnippet; /* Snippet text */
1732 int nSnippet; /* strlen(zSnippet) */
1733} Snippet;
1734
1735
/* Cursor strategies, stored in sqlite3_index_info.idxNum. */
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_DOCID,     /* lookup by docid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
1741
/* Indexes into fulltext_vtab.pFulltextStatements[]; each value must
** match the corresponding entry of fulltext_zStatement[] below.
*/
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,

  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,

  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_ALL_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;
1761
/* SQL text for each cached statement.  These must exactly match the
** fulltext_statement enum above; NULL entries are generated lazily by
** the content*Statement() helpers because they depend on the table's
** column list.
*/
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g.  if a query joins a virtual table to itself?
** If so perhaps we should move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ NULL,  /* generated in contentSelectStatement() */
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where docid = ?",

  /* BLOCK_INSERT */
  "insert into %_segments (blockid, block) values (null, ?)",
  /* BLOCK_SELECT */ "select block from %_segments where blockid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where blockid between ? and ?",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
  /* SEGDIR_SELECT_ALL */
  "select root, leaves_end_block from %_segdir order by level desc, idx",
};
1790
1791/*
1792** A connection to a fulltext index is an instance of the following
1793** structure. The xCreate and xConnect methods create an instance
1794** of this structure and xDestroy and xDisconnect free that instance.
1795** All other methods receive a pointer to the structure as one of their
1796** arguments.
1797*/
1798struct fulltext_vtab {
1799 sqlite3_vtab base; /* Base class used by SQLite core */
1800 sqlite3 *db; /* The database connection */
1801 const char *zDb; /* logical database name */
1802 const char *zName; /* virtual table name */
1803 int nColumn; /* number of columns in virtual table */
1804 char **azColumn; /* column names. malloced */
1805 char **azContentColumn; /* column names in content table; malloced */
1806 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
1807
1808 /* Precompiled statements which we keep as long as the table is
1809 ** open.
1810 */
1811 sqlite3_stmt *pFulltextStatements[MAX_STMT];
1812
1813 /* Precompiled statements used for segment merges. We run a
1814 ** separate select across the leaf level of each tree being merged.
1815 */
1816 sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
1817 /* The statement used to prepare pLeafSelectStmts. */
1818#define LEAF_SELECT \
1819 "select block from %_segments where blockid between ? and ? order by blockid"
1820
1821 /* These buffer pending index updates during transactions.
1822 ** nPendingData estimates the memory size of the pending data. It
1823 ** doesn't include the hash-bucket overhead, nor any malloc
1824 ** overhead. When nPendingData exceeds kPendingThreshold, the
1825 ** buffer is flushed even before the transaction closes.
1826 ** pendingTerms stores the data, and is only valid when nPendingData
1827 ** is >=0 (nPendingData<0 means pendingTerms has not been
1828 ** initialized). iPrevDocid is the last docid written, used to make
1829 ** certain we're inserting in sorted order.
1830 */
1831 int nPendingData;
1832#define kPendingThreshold (1*1024*1024)
1833 sqlite_int64 iPrevDocid;
1834 fts3Hash pendingTerms;
1835};
1836
1837/*
1838** When the core wants to do a query, it create a cursor using a
1839** call to xOpen. This structure is an instance of a cursor. It
1840** is destroyed by xClose.
1841*/
1842typedef struct fulltext_cursor {
1843 sqlite3_vtab_cursor base; /* Base class used by SQLite core */
1844 QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
1845 sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
1846 int eof; /* True if at End Of Results */
1847 Query q; /* Parsed query string */
1848 Snippet snippet; /* Cached snippet for the current row */
1849 int iColumn; /* Column being searched */
1850 DataBuffer result; /* Doclist results from fulltextQuery */
1851 DLReader reader; /* Result reader if result not empty */
1852} fulltext_cursor;
1853
1854static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
1855 return (fulltext_vtab *) c->base.pVtab;
1856}
1857
1858static const sqlite3_module fts3Module; /* forward declaration */
1859
1860/* Return a dynamically generated statement of the form
1861 * insert into %_content (docid, ...) values (?, ...)
1862 */
1863static const char *contentInsertStatement(fulltext_vtab *v){
1864 StringBuffer sb;
1865 int i;
1866
1867 initStringBuffer(&sb);
1868 append(&sb, "insert into %_content (docid, ");
1869 appendList(&sb, v->nColumn, v->azContentColumn);
1870 append(&sb, ") values (?");
1871 for(i=0; i<v->nColumn; ++i)
1872 append(&sb, ", ?");
1873 append(&sb, ")");
1874 return stringBufferData(&sb);
1875}
1876
1877/* Return a dynamically generated statement of the form
1878 * select <content columns> from %_content where docid = ?
1879 */
1880static const char *contentSelectStatement(fulltext_vtab *v){
1881 StringBuffer sb;
1882 initStringBuffer(&sb);
1883 append(&sb, "SELECT ");
1884 appendList(&sb, v->nColumn, v->azContentColumn);
1885 append(&sb, " FROM %_content WHERE docid = ?");
1886 return stringBufferData(&sb);
1887}
1888
1889/* Return a dynamically generated statement of the form
1890 * update %_content set [col_0] = ?, [col_1] = ?, ...
1891 * where docid = ?
1892 */
1893static const char *contentUpdateStatement(fulltext_vtab *v){
1894 StringBuffer sb;
1895 int i;
1896
1897 initStringBuffer(&sb);
1898 append(&sb, "update %_content set ");
1899 for(i=0; i<v->nColumn; ++i) {
1900 if( i>0 ){
1901 append(&sb, ", ");
1902 }
1903 append(&sb, v->azContentColumn[i]);
1904 append(&sb, " = ?");
1905 }
1906 append(&sb, " where docid = ?");
1907 return stringBufferData(&sb);
1908}
1909
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
**
** Returns SQLITE_OK on success, or the error from preparation/reset.
** The returned statement remains owned by the vtab's cache.
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    switch( iStmt ){
      /* These three depend on the table's column list, so their SQL
      ** is generated on demand rather than taken from the static
      ** fulltext_zStatement[] table. */
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_SELECT_STMT:
        zStmt = contentSelectStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                         zStmt);
    /* Only the generated statements were malloced; the static table
    ** entries must not be freed, hence the pointer comparison. */
    if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}
1942
1943/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
1944** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE,
1945** where we expect no results.
1946*/
1947static int sql_single_step(sqlite3_stmt *s){
1948 int rc = sqlite3_step(s);
1949 return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1950}
1951
1952/* Like sql_get_statement(), but for special replicated LEAF_SELECT
1953** statements.
1954*/
1955/* TODO(shess) Write version for generic statements and then share
1956** that between the cached-statement functions.
1957*/
1958static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
1959 sqlite3_stmt **ppStmt){
1960 assert( idx>=0 && idx<MERGE_COUNT );
1961 if( v->pLeafSelectStmts[idx]==NULL ){
1962 int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
1963 LEAF_SELECT);
1964 if( rc!=SQLITE_OK ) return rc;
1965 }else{
1966 int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
1967 if( rc!=SQLITE_OK ) return rc;
1968 }
1969
1970 *ppStmt = v->pLeafSelectStmts[idx];
1971 return SQLITE_OK;
1972}
1973
1974/* insert into %_content (docid, ...) values ([docid], [pValues])
1975** If the docid contains SQL NULL, then a unique docid will be
1976** generated.
1977*/
1978static int content_insert(fulltext_vtab *v, sqlite3_value *docid,
1979 sqlite3_value **pValues){
1980 sqlite3_stmt *s;
1981 int i;
1982 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
1983 if( rc!=SQLITE_OK ) return rc;
1984
1985 rc = sqlite3_bind_value(s, 1, docid);
1986 if( rc!=SQLITE_OK ) return rc;
1987
1988 for(i=0; i<v->nColumn; ++i){
1989 rc = sqlite3_bind_value(s, 2+i, pValues[i]);
1990 if( rc!=SQLITE_OK ) return rc;
1991 }
1992
1993 return sql_single_step(s);
1994}
1995
1996/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
1997 * where docid = [iDocid] */
1998static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
1999 sqlite_int64 iDocid){
2000 sqlite3_stmt *s;
2001 int i;
2002 int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
2003 if( rc!=SQLITE_OK ) return rc;
2004
2005 for(i=0; i<v->nColumn; ++i){
2006 rc = sqlite3_bind_value(s, 1+i, pValues[i]);
2007 if( rc!=SQLITE_OK ) return rc;
2008 }
2009
2010 rc = sqlite3_bind_int64(s, 1+v->nColumn, iDocid);
2011 if( rc!=SQLITE_OK ) return rc;
2012
2013 return sql_single_step(s);
2014}
2015
/* Free an array of nString malloc()ed strings along with the array
** itself.  NULL entries are fine: free(NULL) is defined as a no-op,
** so the per-entry NULL guard in the original was redundant.
*/
static void freeStringArray(int nString, const char **pString){
  int i;

  for(i=0; i<nString; ++i){
    free((void *) pString[i]);
  }
  free((void *) pString);
}
2024
/* select * from %_content where docid = [iDocid]
 * The caller must delete the returned array and all strings in it
 * (freeStringArray() does both).  null fields will be NULL in the
 * returned array.  Returns SQLITE_OK on success, SQLITE_DONE if the
 * docid does not exist, or another error code.
 *
 * TODO: Perhaps we should return pointer/length strings here for consistency
 * with other code which uses pointer/length. */
static int content_select(fulltext_vtab *v, sqlite_int64 iDocid,
                          const char ***pValues){
  sqlite3_stmt *s;
  const char **values;
  int i;
  int rc;

  *pValues = NULL;

  rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iDocid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc!=SQLITE_ROW ) return rc;

  /* NOTE(review): this malloc() (and string_dup() inside the loop)
  ** is not checked; an allocation failure here dereferences NULL.
  ** Consider returning SQLITE_NOMEM instead. */
  values = (const char **) malloc(v->nColumn * sizeof(const char *));
  for(i=0; i<v->nColumn; ++i){
    if( sqlite3_column_type(s, i)==SQLITE_NULL ){
      values[i] = NULL;
    }else{
      values[i] = string_dup((char*)sqlite3_column_text(s, i));
    }
  }

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ){
    *pValues = values;
    return SQLITE_OK;
  }

  /* A second row (or error) is unexpected: discard and report. */
  freeStringArray(v->nColumn, values);
  return rc;
}
2069
2070/* delete from %_content where docid = [iDocid ] */
2071static int content_delete(fulltext_vtab *v, sqlite_int64 iDocid){
2072 sqlite3_stmt *s;
2073 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
2074 if( rc!=SQLITE_OK ) return rc;
2075
2076 rc = sqlite3_bind_int64(s, 1, iDocid);
2077 if( rc!=SQLITE_OK ) return rc;
2078
2079 return sql_single_step(s);
2080}
2081
2082/* insert into %_segments values ([pData])
2083** returns assigned blockid in *piBlockid
2084*/
2085static int block_insert(fulltext_vtab *v, const char *pData, int nData,
2086 sqlite_int64 *piBlockid){
2087 sqlite3_stmt *s;
2088 int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
2089 if( rc!=SQLITE_OK ) return rc;
2090
2091 rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
2092 if( rc!=SQLITE_OK ) return rc;
2093
2094 rc = sqlite3_step(s);
2095 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2096 if( rc!=SQLITE_DONE ) return rc;
2097
2098 /* blockid column is an alias for rowid. */
2099 *piBlockid = sqlite3_last_insert_rowid(v->db);
2100 return SQLITE_OK;
2101}
2102
2103/* delete from %_segments
2104** where blockid between [iStartBlockid] and [iEndBlockid]
2105**
2106** Deletes the range of blocks, inclusive, used to delete the blocks
2107** which form a segment.
2108*/
2109static int block_delete(fulltext_vtab *v,
2110 sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
2111 sqlite3_stmt *s;
2112 int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
2113 if( rc!=SQLITE_OK ) return rc;
2114
2115 rc = sqlite3_bind_int64(s, 1, iStartBlockid);
2116 if( rc!=SQLITE_OK ) return rc;
2117
2118 rc = sqlite3_bind_int64(s, 2, iEndBlockid);
2119 if( rc!=SQLITE_OK ) return rc;
2120
2121 return sql_single_step(s);
2122}
2123
/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
** at iLevel.  Returns SQLITE_DONE if there are no segments at
** iLevel.  Otherwise returns an error.  Note the unusual protocol:
** SQLITE_ROW/SQLITE_DONE are "success" codes here, not just step
** results.
*/
static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* Should always get at least one row due to how max() works. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;
  if( rc!=SQLITE_ROW ) return rc;

  /* NULL means that there were no inputs to max(), i.e. no segments
  ** exist at this level.  Drain the statement and report DONE. */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    rc = sqlite3_step(s);
    if( rc==SQLITE_ROW ) return SQLITE_ERROR;
    return rc;
  }

  *pidx = sqlite3_column_int(s, 0);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2157
2158/* insert into %_segdir values (
2159** [iLevel], [idx],
2160** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
2161** [pRootData]
2162** )
2163*/
2164static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
2165 sqlite_int64 iStartBlockid,
2166 sqlite_int64 iLeavesEndBlockid,
2167 sqlite_int64 iEndBlockid,
2168 const char *pRootData, int nRootData){
2169 sqlite3_stmt *s;
2170 int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
2171 if( rc!=SQLITE_OK ) return rc;
2172
2173 rc = sqlite3_bind_int(s, 1, iLevel);
2174 if( rc!=SQLITE_OK ) return rc;
2175
2176 rc = sqlite3_bind_int(s, 2, idx);
2177 if( rc!=SQLITE_OK ) return rc;
2178
2179 rc = sqlite3_bind_int64(s, 3, iStartBlockid);
2180 if( rc!=SQLITE_OK ) return rc;
2181
2182 rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
2183 if( rc!=SQLITE_OK ) return rc;
2184
2185 rc = sqlite3_bind_int64(s, 5, iEndBlockid);
2186 if( rc!=SQLITE_OK ) return rc;
2187
2188 rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
2189 if( rc!=SQLITE_OK ) return rc;
2190
2191 return sql_single_step(s);
2192}
2193
/* Queries %_segdir for the block span of the segments in level
** iLevel.  Returns SQLITE_DONE if there are no blocks for iLevel,
** SQLITE_ROW if there are blocks, else an error.
**
** On SQLITE_ROW, *piStartBlockid/*piEndBlockid receive the inclusive
** blockid range covered by segments at iLevel.
*/
static int segdir_span(fulltext_vtab *v, int iLevel,
                       sqlite_int64 *piStartBlockid,
                       sqlite_int64 *piEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
  if( rc!=SQLITE_ROW ) return rc;

  /* This happens if all segments at this level are entirely inline
  ** (stored in the segdir root rather than in %_segments blocks).
  */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* We expect only one row.  We must execute another sqlite3_step()
     * to complete the iteration; otherwise the table will remain locked. */
    int rc2 = sqlite3_step(s);
    if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
    return rc2;
  }

  *piStartBlockid = sqlite3_column_int64(s, 0);
  *piEndBlockid = sqlite3_column_int64(s, 1);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
2231
2232/* Delete the segment blocks and segment directory records for all
2233** segments at iLevel.
2234*/
2235static int segdir_delete(fulltext_vtab *v, int iLevel){
2236 sqlite3_stmt *s;
2237 sqlite_int64 iStartBlockid, iEndBlockid;
2238 int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
2239 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
2240
2241 if( rc==SQLITE_ROW ){
2242 rc = block_delete(v, iStartBlockid, iEndBlockid);
2243 if( rc!=SQLITE_OK ) return rc;
2244 }
2245
2246 /* Delete the segment directory itself. */
2247 rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
2248 if( rc!=SQLITE_OK ) return rc;
2249
2250 rc = sqlite3_bind_int64(s, 1, iLevel);
2251 if( rc!=SQLITE_OK ) return rc;
2252
2253 return sql_single_step(s);
2254}
2255
/* TODO(shess) clearPendingTerms() is far down the file because
** writeZeroSegment() is far down the file because LeafWriter is far
** down the file.  Consider refactoring the code to move the non-vtab
** code above the vtab code so that we don't need this forward
** reference.
*/
/* Forward reference: discards the in-memory pending-terms buffer. */
static int clearPendingTerms(fulltext_vtab *v);
2263
/*
** Free the memory used to contain a fulltext_vtab structure.
**
** Finalizes every cached prepared statement, destroys the tokenizer
** via its module's xDestroy, discards pending terms, and releases the
** column-name arrays.  Note that v->zDb and v->zName point into the
** azColumn allocation (see constructVtab), so freeing azColumn also
** releases them.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
  int iStmt, i;

  TRACE(("FTS3 Destroy %p\n", v));
  /* Finalize the fixed set of cached statements. */
  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
    if( v->pFulltextStatements[iStmt]!=NULL ){
      sqlite3_finalize(v->pFulltextStatements[iStmt]);
      v->pFulltextStatements[iStmt] = NULL;
    }
  }

  /* Finalize the per-merge-level leaf select statements. */
  for( i=0; i<MERGE_COUNT; i++ ){
    if( v->pLeafSelectStmts[i]!=NULL ){
      sqlite3_finalize(v->pLeafSelectStmts[i]);
      v->pLeafSelectStmts[i] = NULL;
    }
  }

  if( v->pTokenizer!=NULL ){
    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
    v->pTokenizer = NULL;
  }

  clearPendingTerms(v);

  /* azColumn comes from malloc; azContentColumn entries come from
  ** sqlite3_mprintf and must be released with sqlite3_free. */
  free(v->azColumn);
  for(i = 0; i < v->nColumn; ++i) {
    sqlite3_free(v->azContentColumn[i]);
  }
  free(v->azContentColumn);
  free(v);
}
2299
/*
** Token types for parsing the arguments to xConnect or xCreate.
** Produced by getToken() below.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */
2308
/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters,
** sqlite3IsIdChar[X] must be 1.
**
** Ticket #1066.  the SQL standard does not allow '$' in the
** middle of identfiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
**
** Table rows cover characters 0x20..0x7f (comment columns show the
** high nibble).  NOTE: the IdChar() macro assigns to a variable named
** `c` that must be declared as an int in the calling scope (see
** getToken()); it also evaluates its argument exactly once.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
2332
2333
/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
**
** Token types are the TOKEN_* constants above.  The local `c` is
** required by the IdChar() macro, which assigns to it.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;
  switch( *z ){
    case 0: {
      /* End of input. */
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      /* Collapse a run of whitespace into a single TOKEN_SPACE. */
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      /* Quoted string; a doubled delimiter is an escaped delimiter. */
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;
          }else{
            break;
          }
        }
      }
      *tokenType = TOKEN_STRING;
      /* Include the closing delimiter only if one was found (c!=0). */
      return i + (c!=0);
    }
    case '[': {
      /* Bracket-quoted identifier (MS SQL Server style). */
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;
      }
      /* Unquoted identifier: consume the run of identifier chars. */
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  /* Anything else is a single punctuation character. */
  *tokenType = TOKEN_PUNCT;
  return 1;
}
2383
/*
** A token extracted from a string is an instance of the following
** structure.  It borrows (does not own) the underlying text.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes. */
} Token;
2392
2393/*
2394** Given a input string (which is really one of the argv[] parameters
2395** passed into xConnect or xCreate) split the string up into tokens.
2396** Return an array of pointers to '\000' terminated strings, one string
2397** for each non-whitespace token.
2398**
2399** The returned array is terminated by a single NULL pointer.
2400**
2401** Space to hold the returned array is obtained from a single
2402** malloc and should be freed by passing the return value to free().
2403** The individual strings within the token list are all a part of
2404** the single memory allocation and will all be freed at once.
2405*/
2406static char **tokenizeString(const char *z, int *pnToken){
2407 int nToken = 0;
2408 Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
2409 int n = 1;
2410 int e, i;
2411 int totalSize = 0;
2412 char **azToken;
2413 char *zCopy;
2414 while( n>0 ){
2415 n = getToken(z, &e);
2416 if( e!=TOKEN_SPACE ){
2417 aToken[nToken].z = z;
2418 aToken[nToken].n = n;
2419 nToken++;
2420 totalSize += n+1;
2421 }
2422 z += n;
2423 }
2424 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
2425 zCopy = (char*)&azToken[nToken];
2426 nToken--;
2427 for(i=0; i<nToken; i++){
2428 azToken[i] = zCopy;
2429 n = aToken[i].n;
2430 memcpy(zCopy, aToken[i].z, n);
2431 zCopy[n] = 0;
2432 zCopy += n+1;
2433 }
2434 azToken[nToken] = 0;
2435 free(aToken);
2436 *pnToken = nToken;
2437 return azToken;
2438}
2439
/*
** Convert an SQL-style quoted string into a normal string by removing
** the quote characters.  The conversion is done in-place.  If the
** input does not begin with a quote character, then this routine
** is a no-op.
**
** A doubled quote inside the string collapses to a single quote
** character.  Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
*/
static void dequoteString(char *z){
  int iQuote;        /* The closing quote character to look for */
  int iIn, iOut;     /* Read and write cursors into z[] */
  if( z==0 ) return;
  iQuote = z[0];
  switch( iQuote ){
    case '\'': break;
    case '"':  break;
    case '`':  break;                   /* For MySQL compatibility */
    case '[':  iQuote = ']';  break;    /* For MS SqlServer compatibility */
    default:   return;                  /* Not quoted: leave unchanged */
  }
  for(iIn=1, iOut=0; z[iIn]; iIn++){
    if( z[iIn]!=iQuote ){
      z[iOut++] = z[iIn];
    }else if( z[iIn+1]==iQuote ){
      /* Doubled quote: emit a single literal quote and skip ahead. */
      z[iOut++] = iQuote;
      iIn++;
    }else{
      /* Closing quote: terminate the result and stop. */
      z[iOut++] = 0;
      break;
    }
  }
}
2479
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.
**
** A token is kept if its first character is alphanumeric or if it is
** longer than one character (so multi-char punctuation like '...'
** survives, single punctuation like '(' does not).
**
** Example:
**
**     input:      tokenize chinese ( 'simplifed' , 'mixed' )
**     output:     chinese simplifed mixed
**
** Another example:
**
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    /* j starts at -1 so the first kept token is dropped rather than
    ** copied (it is the keyword, e.g. "tokenize"). */
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    /* If no token survived at all, j is still -1 and azIn[j] would
    ** write one slot before the allocation.  Terminate at slot 0. */
    if( j<0 ) j = 0;
    azIn[j] = 0;
  }
}
2510
2511
/*
** Find the first alphanumeric token in the string zIn.  Null-terminate
** this token.  Remove any quotation marks.  And return a pointer to
** the result.
**
** Returns NULL if zIn contains only whitespace (with *pzTail left at
** the end of the input).
*/
static char *firstToken(char *zIn, char **pzTail){
  int n, ttype;
  while(1){
    n = getToken(zIn, &ttype);
    if( ttype==TOKEN_SPACE ){
      zIn += n;
    }else if( ttype==TOKEN_EOF ){
      *pzTail = zIn;
      return 0;
    }else{
      zIn[n] = 0;
      /* NOTE(review): this sets the tail just past the token's FIRST
      ** character, not past the whole token (&zIn[n+1]); the only
      ** caller (parseSpec) discards the tail, so it is harmless, but
      ** confirm before reusing this helper. */
      *pzTail = &zIn[1];
      dequoteString(zIn);
      return zIn;
    }
  }
  /*NOTREACHED*/
}
2535
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not a alphanumeric
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  /* Skip leading whitespace in s. */
  while( safe_isspace(*s) ) s++;
  /* Case-insensitive prefix comparison against t. */
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ) return 0;
  }
  /* The prefix matches; require a token boundary after it. */
  return *s!='_' && !safe_isalnum(*s);
}
2554
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and use by fulltextConnect and fulltextCreate.
**
** The string arrays come from a small number of allocations (see
** parseSpec); clearTableSpec releases whatever has not been handed
** off to the fulltext_vtab by constructVtab.
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
2568
2569/*
2570** Reclaim all of the memory used by a TableSpec
2571*/
2572static void clearTableSpec(TableSpec *p) {
2573 free(p->azColumn);
2574 free(p->azContentColumn);
2575 free(p->azTokenizer);
2576}
2577
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 *
 * CREATE VIRTUAL TABLE email
 *        USING fts3(subject, body, tokenize mytokenizer(myarg))
 *
 * We return parsed information in a TableSpec structure.
 *
 * Returns SQLITE_OK on success, SQLITE_NOMEM on allocation failure.
 * On failure the partially-filled spec has already been cleared.
 */
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                     char**pzErr){
  int i, n;
  char *z, *zDummy;
  char **azArg;
  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */

  assert( argc>=3 );
  /* Current interface:
  ** argv[0] - module name
  ** argv[1] - database name
  ** argv[2] - table name
  ** argv[3..] - columns, optionally followed by tokenizer specification
  **             and snippet delimiters specification.
  */

  /* Make a copy of the complete argv[][] array in a single allocation.
  ** The argv[][] array is read-only and transient.  We can write to the
  ** copy in order to modify things and the copy is persistent.
  */
  CLEAR(pSpec);
  for(i=n=0; i<argc; i++){
    n += strlen(argv[i]) + 1;
  }
  azArg = malloc( sizeof(char*)*argc + n );
  if( azArg==0 ){
    return SQLITE_NOMEM;
  }
  z = (char*)&azArg[argc];
  for(i=0; i<argc; i++){
    azArg[i] = z;
    strcpy(z, argv[i]);
    z += strlen(z)+1;
  }

  /* Identify the column names and the tokenizer and delimiter arguments
  ** in the argv[][] array.
  */
  pSpec->zDb = azArg[1];
  pSpec->zName = azArg[2];
  pSpec->nColumn = 0;
  pSpec->azColumn = azArg;
  zTokenizer = "tokenize simple";     /* default when none specified */
  for(i=3; i<argc; ++i){
    if( startsWith(azArg[i],"tokenize") ){
      zTokenizer = azArg[i];
    }else{
      /* firstToken() null-terminates the column name in place within
      ** the azArg copy; the tail (zDummy) is not needed. */
      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
      pSpec->nColumn++;
    }
  }
  if( pSpec->nColumn==0 ){
    /* No columns given: index a single column named "content". */
    azArg[0] = "content";
    pSpec->nColumn = 1;
  }

  /*
  ** Construct the list of content column names.
  **
  ** Each content column name will be of the form cNNAAAA
  ** where NN is the column number and AAAA is the sanitized
  ** column name.  "sanitized" means that special characters are
  ** converted to "_".  The cNN prefix guarantees that all column
  ** names are unique.
  **
  ** The AAAA suffix is not strictly necessary.  It is included
  ** for the convenience of people who might examine the generated
  ** %_content table and wonder what the columns are used for.
  */
  pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
  if( pSpec->azContentColumn==0 ){
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  tokenListToIdList(pSpec->azTokenizer);

  return SQLITE_OK;
}
2676
/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table.  Return a pointer to this schema string.
**
** The schema is "CREATE TABLE x(<cols>,<table> HIDDEN,docid HIDDEN)"
** with each user column %Q-quoted.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int iCol;
  char *zCur, *zTmp;
  const char *zDelim = "(";   /* "(" before the first column, "," after */

  zCur = sqlite3_mprintf("CREATE TABLE x");
  for(iCol=0; iCol<nColumn; iCol++){
    zTmp = sqlite3_mprintf("%s%s%Q", zCur, zDelim, azColumn[iCol]);
    sqlite3_free(zCur);
    zCur = zTmp;
    zDelim = ",";
  }
  /* Hidden column named after the table itself (used for MATCH). */
  zTmp = sqlite3_mprintf("%s,%Q HIDDEN", zCur, zTableName);
  sqlite3_free(zCur);
  zCur = zTmp;
  /* Hidden docid column, aliasing the rowid. */
  zTmp = sqlite3_mprintf("%s,docid HIDDEN)", zCur);
  sqlite3_free(zCur);
  return zTmp;
}
2706
2707/*
2708** Build a new sqlite3_vtab structure that will describe the
2709** fulltext index defined by spec.
2710*/
2711static int constructVtab(
2712 sqlite3 *db, /* The SQLite database connection */
2713 fts3Hash *pHash, /* Hash table containing tokenizers */
2714 TableSpec *spec, /* Parsed spec information from parseSpec() */
2715 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
2716 char **pzErr /* Write any error message here */
2717){
2718 int rc;
2719 int n;
2720 fulltext_vtab *v = 0;
2721 const sqlite3_tokenizer_module *m = NULL;
2722 char *schema;
2723
2724 char const *zTok; /* Name of tokenizer to use for this fts table */
2725 int nTok; /* Length of zTok, including nul terminator */
2726
2727 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
2728 if( v==0 ) return SQLITE_NOMEM;
2729 CLEAR(v);
2730 /* sqlite will initialize v->base */
2731 v->db = db;
2732 v->zDb = spec->zDb; /* Freed when azColumn is freed */
2733 v->zName = spec->zName; /* Freed when azColumn is freed */
2734 v->nColumn = spec->nColumn;
2735 v->azContentColumn = spec->azContentColumn;
2736 spec->azContentColumn = 0;
2737 v->azColumn = spec->azColumn;
2738 spec->azColumn = 0;
2739
2740 if( spec->azTokenizer==0 ){
2741 return SQLITE_NOMEM;
2742 }
2743
2744 zTok = spec->azTokenizer[0];
2745 if( !zTok ){
2746 zTok = "simple";
2747 }
2748 nTok = strlen(zTok)+1;
2749
2750 m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zTok, nTok);
2751 if( !m ){
2752 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
2753 rc = SQLITE_ERROR;
2754 goto err;
2755 }
2756
2757 for(n=0; spec->azTokenizer[n]; n++){}
2758 if( n ){
2759 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
2760 &v->pTokenizer);
2761 }else{
2762 rc = m->xCreate(0, 0, &v->pTokenizer);
2763 }
2764 if( rc!=SQLITE_OK ) goto err;
2765 v->pTokenizer->pModule = m;
2766
2767 /* TODO: verify the existence of backing tables foo_content, foo_term */
2768
2769 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
2770 spec->zName);
2771 rc = sqlite3_declare_vtab(db, schema);
2772 sqlite3_free(schema);
2773 if( rc!=SQLITE_OK ) goto err;
2774
2775 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
2776
2777 /* Indicate that the buffer is not live. */
2778 v->nPendingData = -1;
2779
2780 *ppVTab = &v->base;
2781 TRACE(("FTS3 Connect %p\n", v));
2782
2783 return rc;
2784
2785err:
2786 fulltext_vtab_destroy(v);
2787 return rc;
2788}
2789
2790static int fulltextConnect(
2791 sqlite3 *db,
2792 void *pAux,
2793 int argc, const char *const*argv,
2794 sqlite3_vtab **ppVTab,
2795 char **pzErr
2796){
2797 TableSpec spec;
2798 int rc = parseSpec(&spec, argc, argv, pzErr);
2799 if( rc!=SQLITE_OK ) return rc;
2800
2801 rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);
2802 clearTableSpec(&spec);
2803 return rc;
2804}
2805
/* The %_content table holds the text of each document, with
** the docid column exposed as the SQLite rowid for the table.
*/
/* TODO(shess) This comment needs elaboration to match the updated
** code.  Work it into the top-of-file comment at that time.
*/
/* xCreate method: create the backing tables (%_content, %_segments,
** %_segdir) and then build the vtab structure as for xConnect.
*/
static int fulltextCreate(sqlite3 *db, void *pAux,
                          int argc, const char * const *argv,
                          sqlite3_vtab **ppVTab, char **pzErr){
  int rc;
  TableSpec spec;
  StringBuffer schema;
  TRACE(("FTS3 Create\n"));

  rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  /* %_content has one sanitized cNNAAAA column per indexed column. */
  initStringBuffer(&schema);
  append(&schema, "CREATE TABLE %_content(");
  append(&schema, "  docid INTEGER PRIMARY KEY,");
  appendList(&schema, spec.nColumn, spec.azContentColumn);
  append(&schema, ")");
  rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
  stringBufferDestroy(&schema);
  if( rc!=SQLITE_OK ) goto out;

  /* %_segments stores serialized segment nodes keyed by blockid. */
  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segments("
                "  blockid INTEGER PRIMARY KEY,"
                "  block blob"
                ");"
                );
  if( rc!=SQLITE_OK ) goto out;

  /* %_segdir records each segment's level, index, block span, and
  ** root node. */
  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segdir("
                "  level integer,"
                "  idx integer,"
                "  start_block integer,"
                "  leaves_end_block integer,"
                "  end_block integer,"
                "  root blob,"
                "  primary key(level, idx)"
                ");");
  if( rc!=SQLITE_OK ) goto out;

  rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);

out:
  clearTableSpec(&spec);
  return rc;
}
2858
2859/* Decide how to handle an SQL query. */
2860static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2861 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2862 int i;
2863 TRACE(("FTS3 BestIndex\n"));
2864
2865 for(i=0; i<pInfo->nConstraint; ++i){
2866 const struct sqlite3_index_constraint *pConstraint;
2867 pConstraint = &pInfo->aConstraint[i];
2868 if( pConstraint->usable ) {
2869 if( (pConstraint->iColumn==-1 || pConstraint->iColumn==v->nColumn+1) &&
2870 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2871 pInfo->idxNum = QUERY_DOCID; /* lookup by docid */
2872 TRACE(("FTS3 QUERY_DOCID\n"));
2873 } else if( pConstraint->iColumn>=0 && pConstraint->iColumn<=v->nColumn &&
2874 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2875 /* full-text search */
2876 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2877 TRACE(("FTS3 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2878 } else continue;
2879
2880 pInfo->aConstraintUsage[i].argvIndex = 1;
2881 pInfo->aConstraintUsage[i].omit = 1;
2882
2883 /* An arbitrary value for now.
2884 * TODO: Perhaps docid matches should be considered cheaper than
2885 * full-text searches. */
2886 pInfo->estimatedCost = 1.0;
2887
2888 return SQLITE_OK;
2889 }
2890 }
2891 pInfo->idxNum = QUERY_GENERIC;
2892 return SQLITE_OK;
2893}
2894
2895static int fulltextDisconnect(sqlite3_vtab *pVTab){
2896 TRACE(("FTS3 Disconnect %p\n", pVTab));
2897 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2898 return SQLITE_OK;
2899}
2900
2901static int fulltextDestroy(sqlite3_vtab *pVTab){
2902 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2903 int rc;
2904
2905 TRACE(("FTS3 Destroy %p\n", pVTab));
2906 rc = sql_exec(v->db, v->zDb, v->zName,
2907 "drop table if exists %_content;"
2908 "drop table if exists %_segments;"
2909 "drop table if exists %_segdir;"
2910 );
2911 if( rc!=SQLITE_OK ) return rc;
2912
2913 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2914 return SQLITE_OK;
2915}
2916
2917static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2918 fulltext_cursor *c;
2919
2920 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
2921 /* sqlite will initialize c->base */
2922 *ppCursor = &c->base;
2923 TRACE(("FTS3 Open %p: %p\n", pVTab, c));
2924
2925 return SQLITE_OK;
2926}
2927
2928
2929/* Free all of the dynamically allocated memory held by *q
2930*/
2931static void queryClear(Query *q){
2932 int i;
2933 for(i = 0; i < q->nTerms; ++i){
2934 free(q->pTerms[i].pTerm);
2935 }
2936 free(q->pTerms);
2937 CLEAR(q);
2938}
2939
/* Free all of the dynamically allocated memory held by the
** Snippet: the match array, the offsets string, and the snippet
** text.  The structure itself is then zeroed for reuse.
*/
static void snippetClear(Snippet *p){
  free(p->aMatch);
  free(p->zOffset);
  free(p->zSnippet);
  CLEAR(p);
}
2949/*
2950** Append a single entry to the p->aMatch[] log.
2951*/
2952static void snippetAppendMatch(
2953 Snippet *p, /* Append the entry to this snippet */
2954 int iCol, int iTerm, /* The column and query term */
2955 int iStart, int nByte /* Offset and size of the match */
2956){
2957 int i;
2958 struct snippetMatch *pMatch;
2959 if( p->nMatch+1>=p->nAlloc ){
2960 p->nAlloc = p->nAlloc*2 + 10;
2961 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2962 if( p->aMatch==0 ){
2963 p->nMatch = 0;
2964 p->nAlloc = 0;
2965 return;
2966 }
2967 }
2968 i = p->nMatch++;
2969 pMatch = &p->aMatch[i];
2970 pMatch->iCol = iCol;
2971 pMatch->iTerm = iTerm;
2972 pMatch->iStart = iStart;
2973 pMatch->nByte = nByte;
2974}
2975
/*
** Sizing information for the circular buffer used in snippetOffsetsOfColumn().
** Must be a power of two so the mask arithmetic below works.
*/
#define FTS3_ROTOR_SZ   (32)
#define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1)
2981
/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
**
** The document is re-tokenized with the table's tokenizer.  Phrase
** queries are handled with the `match`/`prevMatch` bitmasks: bit i of
** prevMatch is set when term i matched the previous token, which is
** required for a term whose iPhrase>1 (i.e. not the first word of its
** phrase).  When the last term of a phrase matches, the preceding
** token positions are recovered from the rotor (circular buffer) and
** logged.  Errors from the tokenizer silently end the scan.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                      /* The full text index */
  int nColumn;                               /* Number of columns in the index */
  const QueryTerm *aTerm;                    /* Query string terms */
  int nTerm;                                 /* Number of query string terms */
  int i, j;                                  /* Loop counters */
  int rc;                                    /* Return code */
  unsigned int match, prevMatch;             /* Phrase search bitmasks */
  const char *zToken;                        /* Next token from the tokenizer */
  int nToken;                                /* Size of zToken */
  int iBegin, iEnd, iPos;                    /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS3_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS3_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  /* Clamp so each term maps to one bit of the 32-bit masks. */
  if( nTerm>=FTS3_ROTOR_SZ ){
    nTerm = FTS3_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;
    /* Record this token's position in the rotor. */
    iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      /* Skip terms restricted to a different column. */
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      /* Non-prefix terms must match the token's full length. */
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      /* Non-initial phrase terms require the previous term to have
      ** matched the previous token. */
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      /* End of a phrase (or a single-term query): log every word of
      ** the phrase using positions saved in the rotor. */
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS3_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
3055
3056
3057/*
3058** Compute all offsets for the current row of the query.
3059** If the offsets have already been computed, this routine is a no-op.
3060*/
3061static void snippetAllOffsets(fulltext_cursor *p){
3062 int nColumn;
3063 int iColumn, i;
3064 int iFirst, iLast;
3065 fulltext_vtab *pFts;
3066
3067 if( p->snippet.nMatch ) return;
3068 if( p->q.nTerms==0 ) return;
3069 pFts = p->q.pFts;
3070 nColumn = pFts->nColumn;
3071 iColumn = (p->iCursorType - QUERY_FULLTEXT);
3072 if( iColumn<0 || iColumn>=nColumn ){
3073 iFirst = 0;
3074 iLast = nColumn-1;
3075 }else{
3076 iFirst = iColumn;
3077 iLast = iColumn;
3078 }
3079 for(i=iFirst; i<=iLast; i++){
3080 const char *zDoc;
3081 int nDoc;
3082 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
3083 nDoc = sqlite3_column_bytes(p->pStmt, i+1);
3084 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
3085 }
3086}
3087
/*
** Convert the information in the aMatch[] array of the snippet
** into the string zOffset[0..nOffset-1].
**
** The format is space-separated quadruples
** "<col> <term> <start> <nbyte>", one per match.  A no-op if the
** text has already been generated.
*/
static void snippetOffsetText(Snippet *p){
  int i;
  int cnt = 0;
  StringBuffer sb;
  char zBuf[200];
  if( p->zOffset ) return;
  initStringBuffer(&sb);
  for(i=0; i<p->nMatch; i++){
    struct snippetMatch *pMatch = &p->aMatch[i];
    /* zBuf[0] holds a separating space; &zBuf[cnt>0] skips it for the
    ** first quadruple only.  Four ints fit comfortably in zBuf[200]. */
    zBuf[0] = ' ';
    sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
        pMatch->iTerm, pMatch->iStart, pMatch->nByte);
    append(&sb, zBuf);
    cnt++;
  }
  p->zOffset = stringBufferData(&sb);
  p->nOffset = stringBufferLength(&sb);
}
3110
/*
** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
**
** Preference order: snap to the document edge if within 10 bytes;
** else snap to the start of a nearby match; else snap to whitespace
** within 10 bytes; else return iBreak unchanged.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  /* Advance to the first match in this column at or after iBreak. */
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  /* No convenient match: look for whitespace within 10 bytes in
  ** either direction. */
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}
3155
3156
3157
/*
** Allowed values for Snippet.aMatch[].snStatus, set by snippetText()
** when choosing which matches to include in the rendered snippet.
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
3163
3164/*
3165** Generate the text of a snippet.
3166*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;  /* Match list computed for this row */
  int nMatch;                   /* Number of entries in aMatch[] */
  int nDesired;                 /* Number of terms still to be shown */
  StringBuffer sb;              /* Accumulates the snippet text */
  int tailCol;                  /* Column of the previous excerpt, -1 if none */
  int tailOffset;               /* Byte offset where previous excerpt ended */
  int iCol;                     /* Column of the current match */
  int nDoc;                     /* Size of the current column's text */
  const char *zDoc;             /* Text of the current column */
  int iStart, iEnd;             /* Byte bounds of the current excerpt */
  int tailEllipsis = 0;         /* True to append zEllipsis at the very end */
  int iMatch;                   /* Cursor into aMatch[] while copying text */


  /* Discard any previously generated snippet for this cursor. */
  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  /* Mark the first occurrence of each distinct query term DESIRED;
  ** everything else starts out IGNOREd. */
  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  /* Emit one excerpt per DESIRED match until every term is covered. */
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    /* Open the excerpt about 40 bytes before the match, snapped to a
    ** word boundary.  A start near the document head snaps to 0. */
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    /* Fuse with the previous excerpt when they nearly touch in the
    ** same column; otherwise separate excerpts with an ellipsis. */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    /* Close the excerpt about 40 bytes past the match, snapped to a
    ** word boundary; note whether a trailing ellipsis will be needed. */
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    /* Copy zDoc[iStart..iEnd), wrapping every match encountered along
    ** the way in zStartMark/zEndMark. */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
          && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* A later DESIRED match for the same term is now redundant. */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  /* Ownership of the buffer passes to pCursor->snippet. */
  pCursor->snippet.zSnippet = stringBufferData(&sb);
  pCursor->snippet.nSnippet = stringBufferLength(&sb);
}
3276
3277
3278/*
3279** Close the cursor. For additional information see the documentation
3280** on the xClose method of the virtual table interface.
3281*/
3282static int fulltextClose(sqlite3_vtab_cursor *pCursor){
3283 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3284 TRACE(("FTS3 Close %p\n", c));
3285 sqlite3_finalize(c->pStmt);
3286 queryClear(&c->q);
3287 snippetClear(&c->snippet);
3288 if( c->result.nData!=0 ) dlrDestroy(&c->reader);
3289 dataBufferDestroy(&c->result);
3290 free(c);
3291 return SQLITE_OK;
3292}
3293
3294static int fulltextNext(sqlite3_vtab_cursor *pCursor){
3295 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3296 int rc;
3297
3298 TRACE(("FTS3 Next %p\n", pCursor));
3299 snippetClear(&c->snippet);
3300 if( c->iCursorType < QUERY_FULLTEXT ){
3301 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
3302 rc = sqlite3_step(c->pStmt);
3303 switch( rc ){
3304 case SQLITE_ROW:
3305 c->eof = 0;
3306 return SQLITE_OK;
3307 case SQLITE_DONE:
3308 c->eof = 1;
3309 return SQLITE_OK;
3310 default:
3311 c->eof = 1;
3312 return rc;
3313 }
3314 } else { /* full-text query */
3315 rc = sqlite3_reset(c->pStmt);
3316 if( rc!=SQLITE_OK ) return rc;
3317
3318 if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
3319 c->eof = 1;
3320 return SQLITE_OK;
3321 }
3322 rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
3323 dlrStep(&c->reader);
3324 if( rc!=SQLITE_OK ) return rc;
3325 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
3326 rc = sqlite3_step(c->pStmt);
3327 if( rc==SQLITE_ROW ){ /* the case we expect */
3328 c->eof = 0;
3329 return SQLITE_OK;
3330 }
3331 /* an error occurred; abort */
3332 return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
3333 }
3334}
3335
3336
3337/* TODO(shess) If we pushed LeafReader to the top of the file, or to
3338** another file, term_select() could be pushed above
3339** docListOfTerm().
3340*/
3341static int termSelect(fulltext_vtab *v, int iColumn,
3342 const char *pTerm, int nTerm, int isPrefix,
3343 DocListType iType, DataBuffer *out);
3344
3345/* Return a DocList corresponding to the query term *pTerm. If *pTerm
3346** is the first term of a phrase query, go ahead and evaluate the phrase
3347** query and return the doclist for the entire phrase query.
3348**
3349** The resulting DL_DOCIDS doclist is stored in pResult, which is
3350** overwritten.
3351*/
static int docListOfTerm(
  fulltext_vtab *v,    /* The full text index */
  int iColumn,         /* column to restrict to.  No restriction if >=nColumn */
  QueryTerm *pQTerm,   /* Term we are looking for, or 1st term of a phrase */
  DataBuffer *pResult  /* Write the result here */
){
  DataBuffer left, right, new;
  int i, rc;

  /* No phrase search if no position info. */
  assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  /* Fetch the doclist for the first term.  Position data is only
  ** needed if further phrase terms must be merged against it. */
  dataBufferInit(&left, 0);
  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                  0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
  if( rc ) return rc;
  /* Merge each subsequent phrase term into the accumulator; stop early
  ** if the intersection becomes empty. */
  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
    dataBufferInit(&right, 0);
    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                    pQTerm[i].isPrefix, DL_POSITIONS, &right);
    if( rc ){
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    /* The final merge may drop down to docids only; intermediate
    ** merges must keep positions for the next phrase step. */
    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                       i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
    dataBufferDestroy(&left);
    dataBufferDestroy(&right);
    left = new;
  }
  /* Ownership of left's buffer transfers to the caller via pResult. */
  *pResult = left;
  return SQLITE_OK;
}
3389
3390/* Add a new term pTerm[0..nTerm-1] to the query *q.
3391*/
3392static void queryAdd(Query *q, const char *pTerm, int nTerm){
3393 QueryTerm *t;
3394 ++q->nTerms;
3395 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
3396 if( q->pTerms==0 ){
3397 q->nTerms = 0;
3398 return;
3399 }
3400 t = &q->pTerms[q->nTerms - 1];
3401 CLEAR(t);
3402 t->pTerm = malloc(nTerm+1);
3403 memcpy(t->pTerm, pTerm, nTerm);
3404 t->pTerm[nTerm] = 0;
3405 t->nTerm = nTerm;
3406 t->isOr = q->nextIsOr;
3407 t->isPrefix = 0;
3408 q->nextIsOr = 0;
3409 t->iColumn = q->nextColumn;
3410 q->nextColumn = q->dfltColumn;
3411}
3412
3413/*
3414** Check to see if the string zToken[0...nToken-1] matches any
3415** column name in the virtual table. If it does,
3416** return the zero-indexed column number. If not, return -1.
3417*/
3418static int checkColumnSpecifier(
3419 fulltext_vtab *pVtab, /* The virtual table */
3420 const char *zToken, /* Text of the token */
3421 int nToken /* Number of characters in the token */
3422){
3423 int i;
3424 for(i=0; i<pVtab->nColumn; i++){
3425 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
3426 && pVtab->azColumn[i][nToken]==0 ){
3427 return i;
3428 }
3429 }
3430 return -1;
3431}
3432
/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assembled in pQuery.
**
** inPhrase is true if pSegment[0..nSegment-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,       /* The tokenizer to use */
  const char *pSegment, int nSegment,  /* Query expression being parsed */
  int inPhrase,                        /* True if within "..." */
  Query *pQuery                        /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;  /* Index of the first term added here */
  int iCol;                         /* Column named by a "col:" specifier */
  int nTerm = 1;                    /* 1-based position within a phrase */

  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  /* Pull tokens until the tokenizer reports anything other than OK
  ** (normally SQLITE_DONE at end of input). */
  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    /* "column:" prefix restricts the next term to one column; only
    ** recognized outside of a quoted phrase. */
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    /* A bare "OR" connective.  Checked against the raw input bytes so
    ** the tokenizer's case folding does not matter; not valid as the
    ** first token or inside a phrase. */
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    /* A '-' immediately before the token marks it as an exclusion. */
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    /* A '*' immediately after the token requests prefix matching. */
    if( iEnd<nSegment && pSegment[iEnd]=='*' ){
      pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  /* Record on the phrase's first term how many more terms follow it. */
  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
3497
3498/* Parse a query string, yielding a Query object pQuery.
3499**
3500** The calling function will need to queryClear() to clean up
3501** the dynamically allocated memory held by pQuery.
3502*/
3503static int parseQuery(
3504 fulltext_vtab *v, /* The fulltext index */
3505 const char *zInput, /* Input text of the query string */
3506 int nInput, /* Size of the input text */
3507 int dfltColumn, /* Default column of the index to match against */
3508 Query *pQuery /* Write the parse results here. */
3509){
3510 int iInput, inPhrase = 0;
3511
3512 if( zInput==0 ) nInput = 0;
3513 if( nInput<0 ) nInput = strlen(zInput);
3514 pQuery->nTerms = 0;
3515 pQuery->pTerms = NULL;
3516 pQuery->nextIsOr = 0;
3517 pQuery->nextColumn = dfltColumn;
3518 pQuery->dfltColumn = dfltColumn;
3519 pQuery->pFts = v;
3520
3521 for(iInput=0; iInput<nInput; ++iInput){
3522 int i;
3523 for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
3524 if( i>iInput ){
3525 tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
3526 pQuery);
3527 }
3528 iInput = i;
3529 if( i<nInput ){
3530 assert( zInput[i]=='"' );
3531 inPhrase = !inPhrase;
3532 }
3533 }
3534
3535 if( inPhrase ){
3536 /* unmatched quote */
3537 queryClear(pQuery);
3538 return SQLITE_ERROR;
3539 }
3540 return SQLITE_OK;
3541}
3542
3543/* TODO(shess) Refactor the code to remove this forward decl. */
3544static int flushPendingTerms(fulltext_vtab *v);
3545
3546/* Perform a full-text query using the search expression in
3547** zInput[0..nInput-1]. Return a list of matching documents
3548** in pResult.
3549**
3550** Queries must match column iColumn. Or if iColumn>=nColumn
3551** they are allowed to match against any column.
3552*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;
  int nNot = 0;          /* Number of NOT terms seen while merging */
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */

  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  aTerm = pQuery->pTerms;
  /* iNext steps over the extra QueryTerm entries of a phrase. */
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      /* i==nNot means "left" has not been created yet. */
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    /* OR-fold any immediately following OR terms into "right" before
    ** ANDing it into the accumulator. */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){           /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  /* Ownership of the accumulated doclist passes to the caller. */
  *pResult = left;
  return rc;
}
3657
3658/*
3659** This is the xFilter interface for the virtual table. See
3660** the virtual table xFilter method documentation for additional
3661** information.
3662**
3663** If idxNum==QUERY_GENERIC then do a full table scan against
3664** the %_content table.
3665**
3666** If idxNum==QUERY_DOCID then do a docid lookup for a single entry
3667** in the %_content table.
3668**
3669** If idxNum>=QUERY_FULLTEXT then use the full text index. The
3670** column on the left-hand side of the MATCH operator is column
3671** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
3672** side of the MATCH operator.
3673*/
3674/* TODO(shess) Upgrade the cursor initialization and destruction to
3675** account for fulltextFilter() being called multiple times on the
3676** same cursor. The current solution is very fragile. Apply fix to
3677** fts3 as appropriate.
3678*/
static int fulltextFilter(
  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);
  int rc;
  StringBuffer sb;                  /* Builds the %_content SELECT text */

  TRACE(("FTS3 Filter %p\n",pCursor));

  /* Prepare the statement that fetches content rows: a full scan for
  ** QUERY_GENERIC, otherwise a lookup by bound docid. */
  initStringBuffer(&sb);
  append(&sb, "SELECT docid, ");
  appendList(&sb, v->nColumn, v->azContentColumn);
  append(&sb, " FROM %_content");
  if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?");
  sqlite3_finalize(c->pStmt);
  rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, stringBufferData(&sb));
  stringBufferDestroy(&sb);
  if( rc!=SQLITE_OK ) return rc;

  c->iCursorType = idxNum;
  switch( idxNum ){
    case QUERY_GENERIC:
      break;

    case QUERY_DOCID:
      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
      if( rc!=SQLITE_OK ) return rc;
      break;

    default:   /* full-text search */
    {
      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
      assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
      assert( argc==1 );
      queryClear(&c->q);
      if( c->result.nData!=0 ){
        /* This case happens if the same cursor is used repeatedly. */
        dlrDestroy(&c->reader);
        dataBufferReset(&c->result);
      }else{
        dataBufferInit(&c->result, 0);
      }
      /* Evaluate the MATCH expression into c->result, then set up a
      ** doclist reader over it; fulltextNext() consumes the docids. */
      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
      if( rc!=SQLITE_OK ) return rc;
      if( c->result.nData!=0 ){
        dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
      }
      break;
    }
  }

  /* Position the cursor on the first row of results. */
  return fulltextNext(pCursor);
}
3735
3736/* This is the xEof method of the virtual table. The SQLite core
3737** calls this routine to find out if it has reached the end of
3738** a query's results set.
3739*/
3740static int fulltextEof(sqlite3_vtab_cursor *pCursor){
3741 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3742 return c->eof;
3743}
3744
3745/* This is the xColumn method of the virtual table. The SQLite
3746** core calls this method during a query when it needs the value
3747** of a column from the virtual table. This method needs to use
3748** one of the sqlite3_result_*() routines to store the requested
3749** value back in the pContext.
3750*/
3751static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
3752 sqlite3_context *pContext, int idxCol){
3753 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3754 fulltext_vtab *v = cursor_vtab(c);
3755
3756 if( idxCol<v->nColumn ){
3757 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
3758 sqlite3_result_value(pContext, pVal);
3759 }else if( idxCol==v->nColumn ){
3760 /* The extra column whose name is the same as the table.
3761 ** Return a blob which is a pointer to the cursor
3762 */
3763 sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
3764 }else if( idxCol==v->nColumn+1 ){
3765 /* The docid column, which is an alias for rowid. */
3766 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, 0);
3767 sqlite3_result_value(pContext, pVal);
3768 }
3769 return SQLITE_OK;
3770}
3771
3772/* This is the xRowid method. The SQLite core calls this routine to
3773** retrieve the rowid for the current row of the result set. fts3
3774** exposes %_content.docid as the rowid for the virtual table. The
3775** rowid should be written to *pRowid.
3776*/
3777static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
3778 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3779
3780 *pRowid = sqlite3_column_int64(c->pStmt, 0);
3781 return SQLITE_OK;
3782}
3783
3784/* Add all terms in [zText] to pendingTerms table. If [iColumn] > 0,
3785** we also store positions and offsets in the hash table using that
3786** column number.
3787*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  /* Feed every token of zText into the pendingTerms hash. */
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DLCollector *p;
    int nData;                   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    /* Find, or create, the in-memory doclist collector for this term. */
    p = fts3HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts3HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts3HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      /* Open a new docid entry unless this token continues the same
      ** document already being collected. */
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    /* A negative iColumn means "record an empty doclist for this term"
    ** (used when deleting a row); no position data is added. */
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  return rc;
}
3842
3843/* Add doclists for all terms in [pValues] to pendingTerms table. */
3844static int insertTerms(fulltext_vtab *v, sqlite_int64 iDocid,
3845 sqlite3_value **pValues){
3846 int i;
3847 for(i = 0; i < v->nColumn ; ++i){
3848 char *zText = (char*)sqlite3_value_text(pValues[i]);
3849 int rc = buildTerms(v, iDocid, zText, i);
3850 if( rc!=SQLITE_OK ) return rc;
3851 }
3852 return SQLITE_OK;
3853}
3854
3855/* Add empty doclists for all terms in the given row's content to
3856** pendingTerms.
3857*/
3858static int deleteTerms(fulltext_vtab *v, sqlite_int64 iDocid){
3859 const char **pValues;
3860 int i, rc;
3861
3862 /* TODO(shess) Should we allow such tables at all? */
3863 if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
3864
3865 rc = content_select(v, iDocid, &pValues);
3866 if( rc!=SQLITE_OK ) return rc;
3867
3868 for(i = 0 ; i < v->nColumn; ++i) {
3869 rc = buildTerms(v, iDocid, pValues[i], -1);
3870 if( rc!=SQLITE_OK ) break;
3871 }
3872
3873 freeStringArray(v->nColumn, pValues);
3874 return SQLITE_OK;
3875}
3876
3877/* TODO(shess) Refactor the code to remove this forward decl. */
3878static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
3879
3880/* Insert a row into the %_content table; set *piDocid to be the ID of the
3881** new row. Add doclists for terms to pendingTerms.
3882*/
3883static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestDocid,
3884 sqlite3_value **pValues, sqlite_int64 *piDocid){
3885 int rc;
3886
3887 rc = content_insert(v, pRequestDocid, pValues); /* execute an SQL INSERT */
3888 if( rc!=SQLITE_OK ) return rc;
3889
3890 /* docid column is an alias for rowid. */
3891 *piDocid = sqlite3_last_insert_rowid(v->db);
3892 rc = initPendingTerms(v, *piDocid);
3893 if( rc!=SQLITE_OK ) return rc;
3894
3895 return insertTerms(v, *piDocid, pValues);
3896}
3897
3898/* Delete a row from the %_content table; add empty doclists for terms
3899** to pendingTerms.
3900*/
3901static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
3902 int rc = initPendingTerms(v, iRow);
3903 if( rc!=SQLITE_OK ) return rc;
3904
3905 rc = deleteTerms(v, iRow);
3906 if( rc!=SQLITE_OK ) return rc;
3907
3908 return content_delete(v, iRow); /* execute an SQL DELETE */
3909}
3910
3911/* Update a row in the %_content table; add delete doclists to
3912** pendingTerms for old terms not in the new data, add insert doclists
3913** to pendingTerms for terms in the new data.
3914*/
3915static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
3916 sqlite3_value **pValues){
3917 int rc = initPendingTerms(v, iRow);
3918 if( rc!=SQLITE_OK ) return rc;
3919
3920 /* Generate an empty doclist for each term that previously appeared in this
3921 * row. */
3922 rc = deleteTerms(v, iRow);
3923 if( rc!=SQLITE_OK ) return rc;
3924
3925 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
3926 if( rc!=SQLITE_OK ) return rc;
3927
3928 /* Now add positions for terms which appear in the updated row. */
3929 return insertTerms(v, iRow, pValues);
3930}
3931
3932/*******************************************************************/
3933/* InteriorWriter is used to collect terms and block references into
3934** interior nodes in %_segments. See commentary at top of file for
3935** format.
3936*/
3937
3938/* How large interior nodes can grow. */
3939#define INTERIOR_MAX 2048
3940
3941/* Minimum number of terms per interior node (except the root). This
3942** prevents large terms from making the tree too skinny - must be >0
3943** so that the tree always makes progress. Note that the min tree
3944** fanout will be INTERIOR_MIN_TERMS+1.
3945*/
3946#define INTERIOR_MIN_TERMS 7
3947#if INTERIOR_MIN_TERMS<1
3948# error INTERIOR_MIN_TERMS must be greater than 0.
3949#endif
3950
3951/* ROOT_MAX controls how much data is stored inline in the segment
3952** directory.
3953*/
3954/* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's
3955** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
3956** can both see it, but if the caller passed it in, we wouldn't even
3957** need a define.
3958*/
3959#define ROOT_MAX 1024
3960#if ROOT_MAX<VARINT_MAX*2
3961# error ROOT_MAX must have enough space for a header.
3962#endif
3963
3964/* InteriorBlock stores a linked-list of interior blocks while a lower
3965** layer is being constructed.
3966*/
/* One node in the singly-linked list of in-progress interior blocks. */
typedef struct InteriorBlock {
  DataBuffer term;             /* Leftmost term in block's subtree. */
  DataBuffer data;             /* Accumulated data for the block. */
  struct InteriorBlock *next;  /* Next block at the same tree height. */
} InteriorBlock;
3972
3973static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
3974 const char *pTerm, int nTerm){
3975 InteriorBlock *block = calloc(1, sizeof(InteriorBlock));
3976 char c[VARINT_MAX+VARINT_MAX];
3977 int n;
3978
3979 dataBufferInit(&block->term, 0);
3980 dataBufferReplace(&block->term, pTerm, nTerm);
3981
3982 n = putVarint(c, iHeight);
3983 n += putVarint(c+n, iChildBlock);
3984 dataBufferInit(&block->data, INTERIOR_MAX);
3985 dataBufferReplace(&block->data, c, n);
3986
3987 return block;
3988}
3989
3990#ifndef NDEBUG
/* Verify that the data is readable as an interior node.  Walks the
** whole block asserting each structural invariant; this function is
** only compiled when NDEBUG is not defined. */
static void interiorBlockValidate(InteriorBlock *pBlock){
  const char *pData = pBlock->data.pData;
  int nData = pBlock->data.nData;
  int n, iDummy;
  sqlite_int64 iBlockid;

  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with height of node as a varint(n), n>0 */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Must contain iBlockid. */
  n = getVarint(pData, &iBlockid);
  assert( n>0 );
  assert( n<=nData );
  pData += n;
  nData -= n;

  /* Zero or more terms of positive length */
  if( nData!=0 ){
    /* First term is not delta-encoded. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0);
    assert( n+iDummy<=nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Following terms delta-encoded. */
    while( nData!=0 ){
      /* Length of shared prefix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>=0 );
      assert( n<nData );
      pData += n;
      nData -= n;

      /* Length and data of distinct suffix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>0 );
      assert( n+iDummy>0);
      assert( n+iDummy<=nData );
      pData += n+iDummy;
      nData -= n+iDummy;
    }
  }
}
4049#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
4050#else
4051#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
4052#endif
4053
/* State for writing one level of interior nodes. */
typedef struct InteriorWriter {
  int iHeight;                     /* from 0 at leaves. */
  InteriorBlock *first, *last;     /* Linked list of blocks at this height. */
  struct InteriorWriter *parentWriter;  /* Writer for the level above, if any. */

  DataBuffer term;                 /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;    /* for consistency checks. */
#endif
} InteriorWriter;
4065
4066/* Initialize an interior node where pTerm[nTerm] marks the leftmost
4067** term in the tree. iChildBlock is the leftmost child block at the
4068** next level down the tree.
4069*/
4070static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
4071 sqlite_int64 iChildBlock,
4072 InteriorWriter *pWriter){
4073 InteriorBlock *block;
4074 assert( iHeight>0 );
4075 CLEAR(pWriter);
4076
4077 pWriter->iHeight = iHeight;
4078 pWriter->iOpeningChildBlock = iChildBlock;
4079#ifndef NDEBUG
4080 pWriter->iLastChildBlock = iChildBlock;
4081#endif
4082 block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
4083 pWriter->last = pWriter->first = block;
4084 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4085 dataBufferInit(&pWriter->term, 0);
4086}
4087
4088/* Append the child node rooted at iChildBlock to the interior node,
4089** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
4090*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];  /* Encoded term-length header */
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function).  That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    n = putVarint(c, nTerm);
  }else{
    /* Delta-encode against the previous term: varint length of the
    ** shared prefix, then varint length of the distinct suffix. */
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  /* Children must be appended in consecutive-blockid order. */
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    /* Reset the delta base so the new block's first appended term is
    ** stored without delta-encoding. */
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
4139
4140/* Free the space used by pWriter, including the linked-list of
4141** InteriorBlocks, and parentWriter, if present.
4142*/
4143static int interiorWriterDestroy(InteriorWriter *pWriter){
4144 InteriorBlock *block = pWriter->first;
4145
4146 while( block!=NULL ){
4147 InteriorBlock *b = block;
4148 block = block->next;
4149 dataBufferDestroy(&b->term);
4150 dataBufferDestroy(&b->data);
4151 free(b);
4152 }
4153 if( pWriter->parentWriter!=NULL ){
4154 interiorWriterDestroy(pWriter->parentWriter);
4155 free(pWriter->parentWriter);
4156 }
4157 dataBufferDestroy(&pWriter->term);
4158 SCRAMBLE(pWriter);
4159 return SQLITE_OK;
4160}
4161
/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
** directly, leaving *piEndBlockid unchanged.  Otherwise, flush
** pWriter to %_segments, building a new layer of interior nodes, and
** recursively ask for their root info.
*/
4167static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
4168 char **ppRootInfo, int *pnRootInfo,
4169 sqlite_int64 *piEndBlockid){
4170 InteriorBlock *block = pWriter->first;
4171 sqlite_int64 iBlockid = 0;
4172 int rc;
4173
4174 /* If we can fit the segment inline */
4175 if( block==pWriter->last && block->data.nData<ROOT_MAX ){
4176 *ppRootInfo = block->data.pData;
4177 *pnRootInfo = block->data.nData;
4178 return SQLITE_OK;
4179 }
4180
4181 /* Flush the first block to %_segments, and create a new level of
4182 ** interior node.
4183 */
4184 ASSERT_VALID_INTERIOR_BLOCK(block);
4185 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4186 if( rc!=SQLITE_OK ) return rc;
4187 *piEndBlockid = iBlockid;
4188
4189 pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
4190 interiorWriterInit(pWriter->iHeight+1,
4191 block->term.pData, block->term.nData,
4192 iBlockid, pWriter->parentWriter);
4193
4194 /* Flush additional blocks and append to the higher interior
4195 ** node.
4196 */
4197 for(block=block->next; block!=NULL; block=block->next){
4198 ASSERT_VALID_INTERIOR_BLOCK(block);
4199 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4200 if( rc!=SQLITE_OK ) return rc;
4201 *piEndBlockid = iBlockid;
4202
4203 interiorWriterAppend(pWriter->parentWriter,
4204 block->term.pData, block->term.nData, iBlockid);
4205 }
4206
4207 /* Parent node gets the chance to be the root. */
4208 return interiorWriterRootInfo(v, pWriter->parentWriter,
4209 ppRootInfo, pnRootInfo, piEndBlockid);
4210}
4211
4212/****************************************************************/
4213/* InteriorReader is used to read off the data from an interior node
4214** (see comment at top of file for the format).
4215*/
4216typedef struct InteriorReader {
4217 const char *pData;
4218 int nData;
4219
4220 DataBuffer term; /* previous term, for decoding term delta. */
4221
4222 sqlite_int64 iBlockid;
4223} InteriorReader;
4224
4225static void interiorReaderDestroy(InteriorReader *pReader){
4226 dataBufferDestroy(&pReader->term);
4227 SCRAMBLE(pReader);
4228}
4229
4230/* TODO(shess) The assertions are great, but what if we're in NDEBUG
4231** and the blob is empty or otherwise contains suspect data?
4232*/
4233static void interiorReaderInit(const char *pData, int nData,
4234 InteriorReader *pReader){
4235 int n, nTerm;
4236
4237 /* Require at least the leading flag byte */
4238 assert( nData>0 );
4239 assert( pData[0]!='\0' );
4240
4241 CLEAR(pReader);
4242
4243 /* Decode the base blockid, and set the cursor to the first term. */
4244 n = getVarint(pData+1, &pReader->iBlockid);
4245 assert( 1+n<=nData );
4246 pReader->pData = pData+1+n;
4247 pReader->nData = nData-(1+n);
4248
4249 /* A single-child interior node (such as when a leaf node was too
4250 ** large for the segment directory) won't have any terms.
4251 ** Otherwise, decode the first term.
4252 */
4253 if( pReader->nData==0 ){
4254 dataBufferInit(&pReader->term, 0);
4255 }else{
4256 n = getVarint32(pReader->pData, &nTerm);
4257 dataBufferInit(&pReader->term, nTerm);
4258 dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
4259 assert( n+nTerm<=pReader->nData );
4260 pReader->pData += n+nTerm;
4261 pReader->nData -= n+nTerm;
4262 }
4263}
4264
/* True once every term in the node has been consumed; an empty term
** buffer is the eof signal (set by interiorReaderStep()).
*/
static int interiorReaderAtEnd(InteriorReader *pReader){
  return pReader->term.nData==0;
}

/* Blockid of the child node the cursor currently refers to. */
static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
  return pReader->iBlockid;
}

/* Length in bytes of the current term. */
static int interiorReaderTermBytes(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.nData;
}
/* Pointer to the current term's bytes. */
static const char *interiorReaderTerm(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.pData;
}
4281
4282/* Step forward to the next term in the node. */
4283static void interiorReaderStep(InteriorReader *pReader){
4284 assert( !interiorReaderAtEnd(pReader) );
4285
4286 /* If the last term has been read, signal eof, else construct the
4287 ** next term.
4288 */
4289 if( pReader->nData==0 ){
4290 dataBufferReset(&pReader->term);
4291 }else{
4292 int n, nPrefix, nSuffix;
4293
4294 n = getVarint32(pReader->pData, &nPrefix);
4295 n += getVarint32(pReader->pData+n, &nSuffix);
4296
4297 /* Truncate the current term and append suffix data. */
4298 pReader->term.nData = nPrefix;
4299 dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
4300
4301 assert( n+nSuffix<=pReader->nData );
4302 pReader->pData += n+nSuffix;
4303 pReader->nData -= n+nSuffix;
4304 }
4305 pReader->iBlockid++;
4306}
4307
4308/* Compare the current term to pTerm[nTerm], returning strcmp-style
4309** results. If isPrefix, equality means equal through nTerm bytes.
4310*/
4311static int interiorReaderTermCmp(InteriorReader *pReader,
4312 const char *pTerm, int nTerm, int isPrefix){
4313 const char *pReaderTerm = interiorReaderTerm(pReader);
4314 int nReaderTerm = interiorReaderTermBytes(pReader);
4315 int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
4316
4317 if( n==0 ){
4318 if( nReaderTerm>0 ) return -1;
4319 if( nTerm>0 ) return 1;
4320 return 0;
4321 }
4322
4323 c = memcmp(pReaderTerm, pTerm, n);
4324 if( c!=0 ) return c;
4325 if( isPrefix && n==nTerm ) return 0;
4326 return nReaderTerm - nTerm;
4327}
4328
4329/****************************************************************/
4330/* LeafWriter is used to collect terms and associated doclist data
4331** into leaf blocks in %_segments (see top of file for format info).
4332** Expected usage is:
4333**
4334** LeafWriter writer;
4335** leafWriterInit(0, 0, &writer);
4336** while( sorted_terms_left_to_process ){
4337** // data is doclist data for that term.
4338** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
4339** if( rc!=SQLITE_OK ) goto err;
4340** }
4341** rc = leafWriterFinalize(v, &writer);
4342**err:
4343** leafWriterDestroy(&writer);
4344** return rc;
4345**
4346** leafWriterStep() may write a collected leaf out to %_segments.
4347** leafWriterFinalize() finishes writing any buffered data and stores
4348** a root node in %_segdir. leafWriterDestroy() frees all buffers and
4349** InteriorWriters allocated as part of writing this segment.
4350**
4351** TODO(shess) Document leafWriterStepMerge().
4352*/
4353
4354/* Put terms with data this big in their own block. */
4355#define STANDALONE_MIN 1024
4356
4357/* Keep leaf blocks below this size. */
4358#define LEAF_MAX 2048
4359
4360typedef struct LeafWriter {
4361 int iLevel;
4362 int idx;
4363 sqlite_int64 iStartBlockid; /* needed to create the root info */
4364 sqlite_int64 iEndBlockid; /* when we're done writing. */
4365
4366 DataBuffer term; /* previous encoded term */
4367 DataBuffer data; /* encoding buffer */
4368
4369 /* bytes of first term in the current node which distinguishes that
4370 ** term from the last term of the previous node.
4371 */
4372 int nTermDistinct;
4373
4374 InteriorWriter parentWriter; /* if we overflow */
4375 int has_parent;
4376} LeafWriter;
4377
4378static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
4379 CLEAR(pWriter);
4380 pWriter->iLevel = iLevel;
4381 pWriter->idx = idx;
4382
4383 dataBufferInit(&pWriter->term, 32);
4384
4385 /* Start out with a reasonably sized block, though it can grow. */
4386 dataBufferInit(&pWriter->data, LEAF_MAX);
4387}
4388
#ifndef NDEBUG
/* Verify that the data is readable as a leaf node.  Walks the encoded
** node, asserting that the header byte, the leading term, and every
** trailing (prefix, suffix, doclist) triple fit within the buffer.
** Debug-only; compiled out under NDEBUG via the macro below.
*/
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = getVarint32(pData, &iDummy);
  assert( iDummy==0 );
  assert( n>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Leading term length and data must fit in buffer. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    /* Shared-prefix length (may be 0). */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;
    /* Suffix length and bytes. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Doclist length and bytes; may end exactly at the buffer edge. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif
4456
/* Flush the current leaf node to %_segments, adding the resulting
** blockid and the starting term to the interior node which will
** contain it.  iData/nData select the slice of pWriter->data to
** write out as the node.
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  /* Only the distinguishing prefix of the term is needed upstream. */
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    /* First flush: create the parent interior writer lazily. */
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes. */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    /* Leaf blockids are expected to be consecutive. */
    pWriter->iEndBlockid++;
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
4508static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
4509 int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
4510 if( rc!=SQLITE_OK ) return rc;
4511
4512 /* Re-initialize the output buffer. */
4513 dataBufferReset(&pWriter->data);
4514
4515 return SQLITE_OK;
4516}
4517
/* Fetch the root info for the segment.  If the entire leaf fits
** within ROOT_MAX, then it will be returned directly, otherwise it
** will be flushed and the root info will be returned from the
** interior node.  *piEndBlockid is set to the blockid of the last
** interior or leaf node written to disk (0 if none are written at
** all).
*/
static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                              char **ppRootInfo, int *pnRootInfo,
                              sqlite_int64 *piEndBlockid){
  /* we can fit the segment entirely inline */
  if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
    *ppRootInfo = pWriter->data.pData;
    *pnRootInfo = pWriter->data.nData;
    *piEndBlockid = 0;
    return SQLITE_OK;
  }

  /* Flush remaining leaf data. */
  if( pWriter->data.nData>0 ){
    int rc = leafWriterFlush(v, pWriter);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* We must have flushed a leaf at some point. */
  assert( pWriter->has_parent );

  /* Tentatively set the end leaf blockid as the end blockid.  If the
  ** interior node can be returned inline, this will be the final
  ** blockid, otherwise it will be overwritten by
  ** interiorWriterRootInfo().
  */
  *piEndBlockid = pWriter->iEndBlockid;

  return interiorWriterRootInfo(v, &pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
4555
4556/* Collect the rootInfo data and store it into the segment directory.
4557** This has the effect of flushing the segment's leaf data to
4558** %_segments, and also flushing any interior nodes to %_segments.
4559*/
4560static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
4561 sqlite_int64 iEndBlockid;
4562 char *pRootInfo;
4563 int rc, nRootInfo;
4564
4565 rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
4566 if( rc!=SQLITE_OK ) return rc;
4567
4568 /* Don't bother storing an entirely empty segment. */
4569 if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
4570
4571 return segdir_set(v, pWriter->iLevel, pWriter->idx,
4572 pWriter->iStartBlockid, pWriter->iEndBlockid,
4573 iEndBlockid, pRootInfo, nRootInfo);
4574}
4575
4576static void leafWriterDestroy(LeafWriter *pWriter){
4577 if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
4578 dataBufferDestroy(&pWriter->term);
4579 dataBufferDestroy(&pWriter->data);
4580}
4581
4582/* Encode a term into the leafWriter, delta-encoding as appropriate.
4583** Returns the length of the new term which distinguishes it from the
4584** previous term, which can be used to set nTermDistinct when a node
4585** boundary is crossed.
4586*/
4587static int leafWriterEncodeTerm(LeafWriter *pWriter,
4588 const char *pTerm, int nTerm){
4589 char c[VARINT_MAX+VARINT_MAX];
4590 int n, nPrefix = 0;
4591
4592 assert( nTerm>0 );
4593 while( nPrefix<pWriter->term.nData &&
4594 pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
4595 nPrefix++;
4596 /* Failing this implies that the terms weren't in order. */
4597 assert( nPrefix<nTerm );
4598 }
4599
4600 if( pWriter->data.nData==0 ){
4601 /* Encode the node header and leading term as:
4602 ** varint(0)
4603 ** varint(nTerm)
4604 ** char pTerm[nTerm]
4605 */
4606 n = putVarint(c, '\0');
4607 n += putVarint(c+n, nTerm);
4608 dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
4609 }else{
4610 /* Delta-encode the term as:
4611 ** varint(nPrefix)
4612 ** varint(nSuffix)
4613 ** char pTermSuffix[nSuffix]
4614 */
4615 n = putVarint(c, nPrefix);
4616 n += putVarint(c+n, nTerm-nPrefix);
4617 dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
4618 }
4619 dataBufferReplace(&pWriter->term, pTerm, nTerm);
4620
4621 return nPrefix+1;
4622}
4623
4624/* Used to avoid a memmove when a large amount of doclist data is in
4625** the buffer. This constructs a node and term header before
4626** iDoclistData and flushes the resulting complete node using
4627** leafWriterInternalFlush().
4628*/
4629static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
4630 const char *pTerm, int nTerm,
4631 int iDoclistData){
4632 char c[VARINT_MAX+VARINT_MAX];
4633 int iData, n = putVarint(c, 0);
4634 n += putVarint(c+n, nTerm);
4635
4636 /* There should always be room for the header. Even if pTerm shared
4637 ** a substantial prefix with the previous term, the entire prefix
4638 ** could be constructed from earlier data in the doclist, so there
4639 ** should be room.
4640 */
4641 assert( iDoclistData>=n+nTerm );
4642
4643 iData = iDoclistData-(n+nTerm);
4644 memcpy(pWriter->data.pData+iData, c, n);
4645 memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);
4646
4647 return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
4648}
4649
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.  The doclist is produced by merging the nReaders
** doclists in pReaders via docListMerge().  May flush one or two
** complete leaf nodes (the node accumulated before this term, and/or
** a standalone node for a large doclist).
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = putVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded.  Additionally, the space required to
  ** encode this length could be smaller.  For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = putVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough to force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
4774
4775/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
4776** %_segments.
4777*/
4778/* TODO(shess) Revise writeZeroSegment() so that doclists are
4779** constructed directly in pWriter->data.
4780*/
4781static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
4782 const char *pTerm, int nTerm,
4783 const char *pData, int nData){
4784 int rc;
4785 DLReader reader;
4786
4787 dlrInit(&reader, DL_DEFAULT, pData, nData);
4788 rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
4789 dlrDestroy(&reader);
4790
4791 return rc;
4792}
4793
4794
4795/****************************************************************/
4796/* LeafReader is used to iterate over an individual leaf node. */
4797typedef struct LeafReader {
4798 DataBuffer term; /* copy of current term. */
4799
4800 const char *pData; /* data for current term. */
4801 int nData;
4802} LeafReader;
4803
4804static void leafReaderDestroy(LeafReader *pReader){
4805 dataBufferDestroy(&pReader->term);
4806 SCRAMBLE(pReader);
4807}
4808
4809static int leafReaderAtEnd(LeafReader *pReader){
4810 return pReader->nData<=0;
4811}
4812
4813/* Access the current term. */
4814static int leafReaderTermBytes(LeafReader *pReader){
4815 return pReader->term.nData;
4816}
4817static const char *leafReaderTerm(LeafReader *pReader){
4818 assert( pReader->term.nData>0 );
4819 return pReader->term.pData;
4820}
4821
4822/* Access the doclist data for the current term. */
4823static int leafReaderDataBytes(LeafReader *pReader){
4824 int nData;
4825 assert( pReader->term.nData>0 );
4826 getVarint32(pReader->pData, &nData);
4827 return nData;
4828}
4829static const char *leafReaderData(LeafReader *pReader){
4830 int n, nData;
4831 assert( pReader->term.nData>0 );
4832 n = getVarint32(pReader->pData, &nData);
4833 return pReader->pData+n;
4834}
4835
4836static void leafReaderInit(const char *pData, int nData,
4837 LeafReader *pReader){
4838 int nTerm, n;
4839
4840 assert( nData>0 );
4841 assert( pData[0]=='\0' );
4842
4843 CLEAR(pReader);
4844
4845 /* Read the first term, skipping the header byte. */
4846 n = getVarint32(pData+1, &nTerm);
4847 dataBufferInit(&pReader->term, nTerm);
4848 dataBufferReplace(&pReader->term, pData+1+n, nTerm);
4849
4850 /* Position after the first term. */
4851 assert( 1+n+nTerm<nData );
4852 pReader->pData = pData+1+n+nTerm;
4853 pReader->nData = nData-1-n-nTerm;
4854}
4855
4856/* Step the reader forward to the next term. */
4857static void leafReaderStep(LeafReader *pReader){
4858 int n, nData, nPrefix, nSuffix;
4859 assert( !leafReaderAtEnd(pReader) );
4860
4861 /* Skip previous entry's data block. */
4862 n = getVarint32(pReader->pData, &nData);
4863 assert( n+nData<=pReader->nData );
4864 pReader->pData += n+nData;
4865 pReader->nData -= n+nData;
4866
4867 if( !leafReaderAtEnd(pReader) ){
4868 /* Construct the new term using a prefix from the old term plus a
4869 ** suffix from the leaf data.
4870 */
4871 n = getVarint32(pReader->pData, &nPrefix);
4872 n += getVarint32(pReader->pData+n, &nSuffix);
4873 assert( n+nSuffix<pReader->nData );
4874 pReader->term.nData = nPrefix;
4875 dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
4876
4877 pReader->pData += n+nSuffix;
4878 pReader->nData -= n+nSuffix;
4879 }
4880}
4881
4882/* strcmp-style comparison of pReader's current term against pTerm.
4883** If isPrefix, equality means equal through nTerm bytes.
4884*/
4885static int leafReaderTermCmp(LeafReader *pReader,
4886 const char *pTerm, int nTerm, int isPrefix){
4887 int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
4888 if( n==0 ){
4889 if( pReader->term.nData>0 ) return -1;
4890 if(nTerm>0 ) return 1;
4891 return 0;
4892 }
4893
4894 c = memcmp(pReader->term.pData, pTerm, n);
4895 if( c!=0 ) return c;
4896 if( isPrefix && n==nTerm ) return 0;
4897 return pReader->term.nData - nTerm;
4898}
4899
4900
4901/****************************************************************/
4902/* LeavesReader wraps LeafReader to allow iterating over the entire
4903** leaf layer of the tree.
4904*/
4905typedef struct LeavesReader {
4906 int idx; /* Index within the segment. */
4907
4908 sqlite3_stmt *pStmt; /* Statement we're streaming leaves from. */
4909 int eof; /* we've seen SQLITE_DONE from pStmt. */
4910
4911 LeafReader leafReader; /* reader for the current leaf. */
4912 DataBuffer rootData; /* root data for inline. */
4913} LeavesReader;
4914
4915/* Access the current term. */
4916static int leavesReaderTermBytes(LeavesReader *pReader){
4917 assert( !pReader->eof );
4918 return leafReaderTermBytes(&pReader->leafReader);
4919}
4920static const char *leavesReaderTerm(LeavesReader *pReader){
4921 assert( !pReader->eof );
4922 return leafReaderTerm(&pReader->leafReader);
4923}
4924
4925/* Access the doclist data for the current term. */
4926static int leavesReaderDataBytes(LeavesReader *pReader){
4927 assert( !pReader->eof );
4928 return leafReaderDataBytes(&pReader->leafReader);
4929}
4930static const char *leavesReaderData(LeavesReader *pReader){
4931 assert( !pReader->eof );
4932 return leafReaderData(&pReader->leafReader);
4933}
4934
4935static int leavesReaderAtEnd(LeavesReader *pReader){
4936 return pReader->eof;
4937}
4938
/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
** leaving the statement handle open, which locks the table.
*/
/* TODO(shess) This "solution" is not satisfactory.  Really, there
** should be a check-in function for all statement handles which
** arranges to call sqlite3_reset().  This most likely will require
** modification to control flow all over the place, though, so for now
** just punt.
**
** Note that the current system assumes that segment merges will run to
** completion, which is why this particular problem hasn't arisen in
** this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}

/* Free the reader's leaf reader and root-data buffer, and poison the
** struct (debug builds).
*/
static void leavesReaderDestroy(LeavesReader *pReader){
  leafReaderDestroy(&pReader->leafReader);
  dataBufferDestroy(&pReader->rootData);
  SCRAMBLE(pReader);
}
4961
/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
** Returns an SQLite error code; on success the reader is positioned
** on the first term (or marked eof if the block range is empty).
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    /* NOTE(review): on the error returns below the statement is left
    ** unreset; presumably the cached-statement machinery recovers it.
    */
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      /* No leaf blocks in the requested range. */
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    /* Keep the statement; later rows are pulled by leavesReaderStep(). */
    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
5006
5007/* Step the current leaf forward to the next term. If we reach the
5008** end of the current leaf, step forward to the next leaf block.
5009*/
5010static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
5011 assert( !leavesReaderAtEnd(pReader) );
5012 leafReaderStep(&pReader->leafReader);
5013
5014 if( leafReaderAtEnd(&pReader->leafReader) ){
5015 int rc;
5016 if( pReader->rootData.pData ){
5017 pReader->eof = 1;
5018 return SQLITE_OK;
5019 }
5020 rc = sqlite3_step(pReader->pStmt);
5021 if( rc!=SQLITE_ROW ){
5022 pReader->eof = 1;
5023 return rc==SQLITE_DONE ? SQLITE_OK : rc;
5024 }
5025 leafReaderDestroy(&pReader->leafReader);
5026 leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
5027 sqlite3_column_bytes(pReader->pStmt, 0),
5028 &pReader->leafReader);
5029 }
5030 return SQLITE_OK;
5031}
5032
5033/* Order LeavesReaders by their term, ignoring idx. Readers at eof
5034** always sort to the end.
5035*/
5036static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
5037 if( leavesReaderAtEnd(lr1) ){
5038 if( leavesReaderAtEnd(lr2) ) return 0;
5039 return 1;
5040 }
5041 if( leavesReaderAtEnd(lr2) ) return -1;
5042
5043 return leafReaderTermCmp(&lr1->leafReader,
5044 leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
5045 0);
5046}
5047
5048/* Similar to leavesReaderTermCmp(), with additional ordering by idx
5049** so that older segments sort before newer segments.
5050*/
5051static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
5052 int c = leavesReaderTermCmp(lr1, lr2);
5053 if( c!=0 ) return c;
5054 return lr1->idx-lr2->idx;
5055}
5056
5057/* Assume that pLr[1]..pLr[nLr] are sorted. Bubble pLr[0] into its
5058** sorted position.
5059*/
5060static void leavesReaderReorder(LeavesReader *pLr, int nLr){
5061 while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
5062 LeavesReader tmp = pLr[0];
5063 pLr[0] = pLr[1];
5064 pLr[1] = tmp;
5065 nLr--;
5066 pLr++;
5067 }
5068}
5069
/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order (by term, then by segment age via idx).  On error, any
** readers initialized so far are destroyed before returning.
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  /* One reader per %_segdir row at this level. */
  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  if( rc!=SQLITE_DONE ){
    /* Unwind the readers created before the failure. */
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age. */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}
5112
5113/* Merge doclists from pReaders[nReaders] into a single doclist, which
5114** is written to pWriter. Assumes pReaders is ordered oldest to
5115** newest.
5116*/
5117/* TODO(shess) Consider putting this inline in segmentMerge(). */
5118static int leavesReadersMerge(fulltext_vtab *v,
5119 LeavesReader *pReaders, int nReaders,
5120 LeafWriter *pWriter){
5121 DLReader dlReaders[MERGE_COUNT];
5122 const char *pTerm = leavesReaderTerm(pReaders);
5123 int i, nTerm = leavesReaderTermBytes(pReaders);
5124
5125 assert( nReaders<=MERGE_COUNT );
5126
5127 for(i=0; i<nReaders; i++){
5128 dlrInit(&dlReaders[i], DL_DEFAULT,
5129 leavesReaderData(pReaders+i),
5130 leavesReaderDataBytes(pReaders+i));
5131 }
5132
5133 return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
5134}
5135
5136/* Forward ref due to mutual recursion with segdirNextIndex(). */
5137static int segmentMerge(fulltext_vtab *v, int iLevel);
5138
5139/* Put the next available index at iLevel into *pidx. If iLevel
5140** already has MERGE_COUNT segments, they are merged to a higher
5141** level to make room.
5142*/
5143static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
5144 int rc = segdir_max_index(v, iLevel, pidx);
5145 if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
5146 *pidx = 0;
5147 }else if( rc==SQLITE_ROW ){
5148 if( *pidx==(MERGE_COUNT-1) ){
5149 rc = segmentMerge(v, iLevel);
5150 if( rc!=SQLITE_OK ) return rc;
5151 *pidx = 0;
5152 }else{
5153 (*pidx)++;
5154 }
5155 }else{
5156 return rc;
5157 }
5158 return SQLITE_OK;
5159}
5160
/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged to make room (via segdirNextIndex(), which recurses back
** into this function for higher levels).
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term.  lrs[0]
    ** holds the smallest term; i ends up counting the readers that
    ** are positioned at that same term.
    */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    /* Merge the doclists of the i readers sharing the term. */
    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged. */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

 err:
  /* Common cleanup for any failure after the readers were set up. */
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}
5228
/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  assert( nTerm>0 );

  /* Process while the prefix matches. */
  while( !leavesReaderAtEnd(pReader) ){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int rc;
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c==0 ){
      /* Match: fold this term's doclist into the accumulator. */
      const char *pData = leavesReaderData(pReader);
      int nData = leavesReaderDataBytes(pReader);
      if( out->nData==0 ){
        /* First match: just copy the doclist. */
        dataBufferReplace(out, pData, nData);
      }else{
        /* Union with what has accumulated so far, then swap the
        ** result into *out.
        */
        DataBuffer result;
        dataBufferInit(&result, out->nData+nData);
        docListUnion(out->pData, out->nData, pData, nData, &result);
        dataBufferDestroy(out);
        *out = result;
        /* TODO(shess) Rather than destroy out, we could retain it for
        ** later reuse.
        */
      }
    }
    if( c>0 ) break;      /* Past any possible matches. */

    rc = leavesReaderStep(v, pReader);
    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}
5270
5271/* Call loadSegmentLeavesInt() with pData/nData as input. */
5272static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
5273 const char *pTerm, int nTerm, int isPrefix,
5274 DataBuffer *out){
5275 LeavesReader reader;
5276 int rc;
5277
5278 assert( nData>1 );
5279 assert( *pData=='\0' );
5280 rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
5281 if( rc!=SQLITE_OK ) return rc;
5282
5283 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5284 leavesReaderReset(&reader);
5285 leavesReaderDestroy(&reader);
5286 return rc;
5287}
5288
5289/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
5290** iEndLeaf (inclusive) as input, and merge the resulting doclist into
5291** out.
5292*/
5293static int loadSegmentLeaves(fulltext_vtab *v,
5294 sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
5295 const char *pTerm, int nTerm, int isPrefix,
5296 DataBuffer *out){
5297 int rc;
5298 LeavesReader reader;
5299
5300 assert( iStartLeaf<=iEndLeaf );
5301 rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
5302 if( rc!=SQLITE_OK ) return rc;
5303
5304 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5305 leavesReaderReset(&reader);
5306 leavesReaderDestroy(&reader);
5307 return rc;
5308}
5309
/* Taking pData/nData as an interior node, find the sequence of child
** nodes which could include pTerm/nTerm/isPrefix.  Note that the
** interior node terms logically come between the blocks, so there is
** one more blockid than there are terms (that block contains terms >=
** the last interior-node term).
*/
/* TODO(shess) The calling code may already know that the end child is
** not worth calculating, because the end may be in a later sibling
** node.  Consider whether breaking symmetry is worthwhile.  I suspect
** it's not worthwhile.
*/
static void getChildrenContaining(const char *pData, int nData,
                                  const char *pTerm, int nTerm, int isPrefix,
                                  sqlite_int64 *piStartChild,
                                  sqlite_int64 *piEndChild){
  InteriorReader reader;

  assert( nData>1 );
  assert( *pData!='\0' );   /* Interior nodes never start with 0x00. */
  interiorReaderInit(pData, nData, &reader);

  /* Scan for the first child which could contain pTerm/nTerm.
  ** Note: exact comparison here (isPrefix==0), even for prefix
  ** searches — the start child is the same either way.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piStartChild = interiorReaderCurrentBlockid(&reader);

  /* Keep scanning to find a term greater than our term, using prefix
  ** comparison if indicated.  If isPrefix is false, this will be the
  ** same blockid as the starting block.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piEndChild = interiorReaderCurrentBlockid(&reader);

  interiorReaderDestroy(&reader);

  /* Children must ascend, and if !prefix, both must be the same. */
  assert( *piEndChild>=*piStartChild );
  assert( isPrefix || *piStartChild==*piEndChild );
}
5354
/* Read block at iBlockid and pass it with other params to
** getChildrenContaining().  Returns SQLITE_ERROR if the block is
** missing or if the query unexpectedly yields more than one row.
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* No row for iBlockid at all: report as a generic error. */
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
  if( rc!=SQLITE_ROW ) return rc;

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain
   * locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}
5395
/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf (leaf data begins with 0x00). */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    ** Blockids greater than iLeavesEnd belong to interior nodes, so
    ** the loop descends until the candidate range lies in the leaves.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too; only its end
      ** child matters (iDummy discards its start).
      */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}
5442
/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge its doclist over *out (any duplicate doclists read from the
** segment rooted at pData will overwrite those in *out).
*/
/* TODO(shess) Consider changing this to determine the depth of the
** leaves using either the first characters of interior nodes (when
** ==1, we're one level above the leaves), or the first character of
** the root (which will describe the height of the tree directly).
** Either feels somewhat tricky to me.
*/
/* TODO(shess) The current merge is likely to be slow for large
** doclists (though it should process from newest/smallest to
** oldest/largest, so it may not be that bad).  It might be useful to
** modify things to allow for N-way merging.  This could either be
** within a segment, with pairwise merges across segments, or across
** all segments at once.
*/
static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
                       DataBuffer *out){
  DataBuffer result;
  int rc;

  assert( nData>1 );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&result, 0);
  rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                      pTerm, nTerm, isPrefix, &result);
  if( rc==SQLITE_OK && result.nData>0 ){
    if( out->nData==0 ){
      /* Nothing accumulated yet: swap buffers so result becomes *out
      ** and the (empty) old *out is destroyed below.
      */
      DataBuffer tmp = *out;
      *out = result;
      result = tmp;
    }else{
      /* Two-way merge of the existing accumulation with this
      ** segment's doclist; the merged buffer replaces *out.
      */
      DataBuffer merged;
      DLReader readers[2];

      dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
      dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
      dataBufferInit(&merged, out->nData+result.nData);
      docListMerge(&merged, readers, 2);
      dataBufferDestroy(out);
      *out = merged;
      dlrDestroy(&readers[0]);
      dlrDestroy(&readers[1]);
    }
  }
  dataBufferDestroy(&result);
  return rc;
}
5497
/* Scan the database and merge together the posting lists for the term
** into *out.  iColumn restricts the result to one column (passing
** v->nColumn means "all columns"); iType selects the doclist format
** produced by docListTrim().
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out){
  DataBuffer doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&doclist, 0);

  /* Traverse the segments from oldest to newest so that newer doclist
  ** elements for given docids overwrite older elements.
  */
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    const char *pData = sqlite3_column_blob(s, 0);
    const int nData = sqlite3_column_bytes(s, 0);
    const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
    rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                     &doclist);
    if( rc!=SQLITE_OK ) goto err;
  }
  if( rc==SQLITE_DONE ){
    if( doclist.nData!=0 ){
      /* TODO(shess) The old term_select_all() code applied the column
      ** restrict as we merged segments, leading to smaller buffers.
      ** This is probably worthwhile to bring back, once the new storage
      ** system is checked in.
      */
      if( iColumn==v->nColumn) iColumn = -1;
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  iColumn, iType, out);
    }
    rc = SQLITE_OK;
  }

 err:
  /* doclist is always released, whether we succeeded or bailed out. */
  dataBufferDestroy(&doclist);
  return rc;
}
5543
5544/****************************************************************/
5545/* Used to hold hashtable data for sorting. */
5546typedef struct TermData {
5547 const char *pTerm;
5548 int nTerm;
5549 DLCollector *pCollector;
5550} TermData;
5551
5552/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
5553** for equal, >0 for greater-than).
5554*/
5555static int termDataCmp(const void *av, const void *bv){
5556 const TermData *a = (const TermData *)av;
5557 const TermData *b = (const TermData *)bv;
5558 int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
5559 int c = memcmp(a->pTerm, b->pTerm, n);
5560 if( c!=0 ) return c;
5561 return a->nTerm-b->nTerm;
5562}
5563
5564/* Order pTerms data by term, then write a new level 0 segment using
5565** LeafWriter.
5566*/
5567static int writeZeroSegment(fulltext_vtab *v, fts3Hash *pTerms){
5568 fts3HashElem *e;
5569 int idx, rc, i, n;
5570 TermData *pData;
5571 LeafWriter writer;
5572 DataBuffer dl;
5573
5574 /* Determine the next index at level 0, merging as necessary. */
5575 rc = segdirNextIndex(v, 0, &idx);
5576 if( rc!=SQLITE_OK ) return rc;
5577
5578 n = fts3HashCount(pTerms);
5579 pData = malloc(n*sizeof(TermData));
5580
5581 for(i = 0, e = fts3HashFirst(pTerms); e; i++, e = fts3HashNext(e)){
5582 assert( i<n );
5583 pData[i].pTerm = fts3HashKey(e);
5584 pData[i].nTerm = fts3HashKeysize(e);
5585 pData[i].pCollector = fts3HashData(e);
5586 }
5587 assert( i==n );
5588
5589 /* TODO(shess) Should we allow user-defined collation sequences,
5590 ** here? I think we only need that once we support prefix searches.
5591 */
5592 if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
5593
5594 /* TODO(shess) Refactor so that we can write directly to the segment
5595 ** DataBuffer, as happens for segment merges.
5596 */
5597 leafWriterInit(0, idx, &writer);
5598 dataBufferInit(&dl, 0);
5599 for(i=0; i<n; i++){
5600 dataBufferReset(&dl);
5601 dlcAddDoclist(pData[i].pCollector, &dl);
5602 rc = leafWriterStep(v, &writer,
5603 pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
5604 if( rc!=SQLITE_OK ) goto err;
5605 }
5606 rc = leafWriterFinalize(v, &writer);
5607
5608 err:
5609 dataBufferDestroy(&dl);
5610 free(pData);
5611 leafWriterDestroy(&writer);
5612 return rc;
5613}
5614
5615/* If pendingTerms has data, free it. */
5616static int clearPendingTerms(fulltext_vtab *v){
5617 if( v->nPendingData>=0 ){
5618 fts3HashElem *e;
5619 for(e=fts3HashFirst(&v->pendingTerms); e; e=fts3HashNext(e)){
5620 dlcDelete(fts3HashData(e));
5621 }
5622 fts3HashClear(&v->pendingTerms);
5623 v->nPendingData = -1;
5624 }
5625 return SQLITE_OK;
5626}
5627
5628/* If pendingTerms has data, flush it to a level-zero segment, and
5629** free it.
5630*/
5631static int flushPendingTerms(fulltext_vtab *v){
5632 if( v->nPendingData>=0 ){
5633 int rc = writeZeroSegment(v, &v->pendingTerms);
5634 if( rc==SQLITE_OK ) clearPendingTerms(v);
5635 return rc;
5636 }
5637 return SQLITE_OK;
5638}
5639
5640/* If pendingTerms is "too big", or docid is out of order, flush it.
5641** Regardless, be certain that pendingTerms is initialized for use.
5642*/
5643static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
5644 /* TODO(shess) Explore whether partially flushing the buffer on
5645 ** forced-flush would provide better performance. I suspect that if
5646 ** we ordered the doclists by size and flushed the largest until the
5647 ** buffer was half empty, that would let the less frequent terms
5648 ** generate longer doclists.
5649 */
5650 if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
5651 int rc = flushPendingTerms(v);
5652 if( rc!=SQLITE_OK ) return rc;
5653 }
5654 if( v->nPendingData<0 ){
5655 fts3HashInit(&v->pendingTerms, FTS3_HASH_STRING, 1);
5656 v->nPendingData = 0;
5657 }
5658 v->iPrevDocid = iDocid;
5659 return SQLITE_OK;
5660}
5661
/* This function implements the xUpdate callback; it's the top-level entry
 * point for inserting, deleting or updating a row in a full-text table.
 *
 * Per the sqlite3 xUpdate convention as decoded below:
 *   nArg<2                     -> DELETE of rowid ppArg[0].
 *   ppArg[0] of type NULL      -> INSERT (requested rowid in ppArg[1]).
 *   otherwise                  -> UPDATE of rowid ppArg[0].
 */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                          sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  int rc;

  TRACE(("FTS3 Update %p\n", pVtab));

  if( nArg<2 ){
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     * ppArg[2+v->nColumn+1] = value for docid
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
        sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    }else if( sqlite3_value_type(ppArg[2+v->nColumn+1]) != SQLITE_INTEGER ||
              sqlite3_value_int64(ppArg[2+v->nColumn+1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the docid */
    }else{
      assert( nArg==2+v->nColumn+2);
      rc = index_update(v, rowid, &ppArg[2]);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     * ppArg[2+v->nColumn+1] = value for docid
     */
    sqlite3_value *pRequestDocid = ppArg[2+v->nColumn+1];
    assert( nArg==2+v->nColumn+2);
    if( SQLITE_NULL != sqlite3_value_type(pRequestDocid) &&
        SQLITE_NULL != sqlite3_value_type(ppArg[1]) ){
      /* TODO(shess) Consider allowing this to work if the values are
      ** identical.  I'm inclined to discourage that usage, though,
      ** given that both rowid and docid are special columns.  Better
      ** would be to define one or the other as the default winner,
      ** but should it be fts3-centric (docid) or SQLite-centric
      ** (rowid)?
      */
      rc = SQLITE_ERROR;
    }else{
      /* If no docid was supplied, fall back to the requested rowid. */
      if( SQLITE_NULL == sqlite3_value_type(pRequestDocid) ){
        pRequestDocid = ppArg[1];
      }
      rc = index_insert(v, pRequestDocid, &ppArg[2], pRowid);
    }
  }

  return rc;
}
5721
5722static int fulltextSync(sqlite3_vtab *pVtab){
5723 TRACE(("FTS3 xSync()\n"));
5724 return flushPendingTerms((fulltext_vtab *)pVtab);
5725}
5726
5727static int fulltextBegin(sqlite3_vtab *pVtab){
5728 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5729 TRACE(("FTS3 xBegin()\n"));
5730
5731 /* Any buffered updates should have been cleared by the previous
5732 ** transaction.
5733 */
5734 assert( v->nPendingData<0 );
5735 return clearPendingTerms(v);
5736}
5737
5738static int fulltextCommit(sqlite3_vtab *pVtab){
5739 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5740 TRACE(("FTS3 xCommit()\n"));
5741
5742 /* Buffered updates should have been cleared by fulltextSync(). */
5743 assert( v->nPendingData<0 );
5744 return clearPendingTerms(v);
5745}
5746
5747static int fulltextRollback(sqlite3_vtab *pVtab){
5748 TRACE(("FTS3 xRollback()\n"));
5749 return clearPendingTerms((fulltext_vtab *)pVtab);
5750}
5751
5752/*
5753** Implementation of the snippet() function for FTS3
5754*/
5755static void snippetFunc(
5756 sqlite3_context *pContext,
5757 int argc,
5758 sqlite3_value **argv
5759){
5760 fulltext_cursor *pCursor;
5761 if( argc<1 ) return;
5762 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5763 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5764 sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
5765 }else{
5766 const char *zStart = "<b>";
5767 const char *zEnd = "</b>";
5768 const char *zEllipsis = "<b>...</b>";
5769 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5770 if( argc>=2 ){
5771 zStart = (const char*)sqlite3_value_text(argv[1]);
5772 if( argc>=3 ){
5773 zEnd = (const char*)sqlite3_value_text(argv[2]);
5774 if( argc>=4 ){
5775 zEllipsis = (const char*)sqlite3_value_text(argv[3]);
5776 }
5777 }
5778 }
5779 snippetAllOffsets(pCursor);
5780 snippetText(pCursor, zStart, zEnd, zEllipsis);
5781 sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
5782 pCursor->snippet.nSnippet, SQLITE_STATIC);
5783 }
5784}
5785
5786/*
5787** Implementation of the offsets() function for FTS3
5788*/
5789static void snippetOffsetsFunc(
5790 sqlite3_context *pContext,
5791 int argc,
5792 sqlite3_value **argv
5793){
5794 fulltext_cursor *pCursor;
5795 if( argc<1 ) return;
5796 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5797 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5798 sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
5799 }else{
5800 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5801 snippetAllOffsets(pCursor);
5802 snippetOffsetText(&pCursor->snippet);
5803 sqlite3_result_text(pContext,
5804 pCursor->snippet.zOffset, pCursor->snippet.nOffset,
5805 SQLITE_STATIC);
5806 }
5807}
5808
5809/*
5810** This routine implements the xFindFunction method for the FTS3
5811** virtual table.
5812*/
5813static int fulltextFindFunction(
5814 sqlite3_vtab *pVtab,
5815 int nArg,
5816 const char *zName,
5817 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
5818 void **ppArg
5819){
5820 if( strcmp(zName,"snippet")==0 ){
5821 *pxFunc = snippetFunc;
5822 return 1;
5823 }else if( strcmp(zName,"offsets")==0 ){
5824 *pxFunc = snippetOffsetsFunc;
5825 return 1;
5826 }
5827 return 0;
5828}
5829
5830/*
5831** Rename an fts3 table.
5832*/
5833static int fulltextRename(
5834 sqlite3_vtab *pVtab,
5835 const char *zName
5836){
5837 fulltext_vtab *p = (fulltext_vtab *)pVtab;
5838 int rc = SQLITE_NOMEM;
5839 char *zSql = sqlite3_mprintf(
5840 "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
5841 "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
5842 "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';"
5843 , p->zDb, p->zName, zName
5844 , p->zDb, p->zName, zName
5845 , p->zDb, p->zName, zName
5846 );
5847 if( zSql ){
5848 rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
5849 sqlite3_free(zSql);
5850 }
5851 return rc;
5852}
5853
/* Virtual-table method dispatch table registered for the "fts3"
** module.  Slot order is fixed by the sqlite3_module structure.
*/
static const sqlite3_module fts3Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename */ fulltextRename,
};
5876
5877static void hashDestroy(void *p){
5878 fts3Hash *pHash = (fts3Hash *)p;
5879 sqlite3Fts3HashClear(pHash);
5880 sqlite3_free(pHash);
5881}
5882
5883/*
5884** The fts3 built-in tokenizers - "simple" and "porter" - are implemented
5885** in files fts3_tokenizer1.c and fts3_porter.c respectively. The following
5886** two forward declarations are for functions declared in these files
5887** used to retrieve the respective implementations.
5888**
5889** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed
5890** to by the argument to point a the "simple" tokenizer implementation.
5891** Function ...PorterTokenizerModule() sets *pModule to point to the
5892** porter tokenizer/stemmer implementation.
5893*/
5894void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5895void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5896void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5897
5898int sqlite3Fts3InitHashTable(sqlite3 *, fts3Hash *, const char *);
5899
5900/*
5901** Initialise the fts3 extension. If this extension is built as part
5902** of the sqlite library, then this function is called directly by
5903** SQLite. If fts3 is built as a dynamically loadable extension, this
5904** function is called by the sqlite3_extension_init() entry point.
5905*/
5906int sqlite3Fts3Init(sqlite3 *db){
5907 int rc = SQLITE_OK;
5908 fts3Hash *pHash = 0;
5909 const sqlite3_tokenizer_module *pSimple = 0;
5910 const sqlite3_tokenizer_module *pPorter = 0;
5911 const sqlite3_tokenizer_module *pIcu = 0;
5912
5913 sqlite3Fts3SimpleTokenizerModule(&pSimple);
5914 sqlite3Fts3PorterTokenizerModule(&pPorter);
5915#ifdef SQLITE_ENABLE_ICU
5916 sqlite3Fts3IcuTokenizerModule(&pIcu);
5917#endif
5918
5919 /* Allocate and initialise the hash-table used to store tokenizers. */
5920 pHash = sqlite3_malloc(sizeof(fts3Hash));
5921 if( !pHash ){
5922 rc = SQLITE_NOMEM;
5923 }else{
5924 sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
5925 }
5926
5927 /* Load the built-in tokenizers into the hash table */
5928 if( rc==SQLITE_OK ){
5929 if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
5930 || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
5931 || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
5932 ){
5933 rc = SQLITE_NOMEM;
5934 }
5935 }
5936
5937 /* Create the virtual table wrapper around the hash-table and overload
5938 ** the two scalar functions. If this is successful, register the
5939 ** module with sqlite.
5940 */
5941 if( SQLITE_OK==rc
5942 && SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer"))
5943 && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
5944 && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
5945 ){
5946 return sqlite3_create_module_v2(
5947 db, "fts3", &fts3Module, (void *)pHash, hashDestroy
5948 );
5949 }
5950
5951 /* An error has occured. Delete the hash table and return the error code. */
5952 assert( rc!=SQLITE_OK );
5953 if( pHash ){
5954 sqlite3Fts3HashClear(pHash);
5955 sqlite3_free(pHash);
5956 }
5957 return rc;
5958}
5959
5960#if !SQLITE_CORE
/* Loadable-extension entry point; simply forwards to sqlite3Fts3Init(). */
int sqlite3_extension_init(
  sqlite3 *db,                       /* Database handle loading the extension */
  char **pzErrMsg,                   /* OUT: error message (unused here) */
  const sqlite3_api_routines *pApi   /* SQLite API vector for extensions */
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts3Init(db);
}
5969#endif
5970
5971#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h
deleted file mode 100644
index c1aa8ca..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This header file is used by programs that want to link against the
14** FTS3 library. All it does is declare the sqlite3Fts3Init() interface.
15*/
16#include "sqlite3.h"
17
18#ifdef __cplusplus
19extern "C" {
20#endif /* __cplusplus */
21
22int sqlite3Fts3Init(sqlite3 *db);
23
24#ifdef __cplusplus
25} /* extern "C" */
26#endif /* __cplusplus */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c
deleted file mode 100644
index b14511a..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c
+++ /dev/null
@@ -1,373 +0,0 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the implementation of generic hash-tables used in SQLite.
13** We've modified it slightly to serve as a standalone hash table
14** implementation for the full-text indexing module.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS3 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS3 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
27
28#include <assert.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include "fts3_hash.h"
33
34/*
35** Malloc and Free functions
36*/
/*
** Allocate n bytes with sqlite3_malloc() and zero-fill the block.
** Returns NULL if the allocation fails.
*/
static void *fts3HashMalloc(int n){
  void *pMem = sqlite3_malloc(n);
  if( pMem==0 ) return 0;
  memset(pMem, 0, n);
  return pMem;
}
/* Release memory obtained from fts3HashMalloc() via sqlite3_free(). */
static void fts3HashFree(void *p){
  sqlite3_free(p);
}
47
48/* Turn bulk memory into a hash table object by initializing the
49** fields of the Hash structure.
50**
51** "pNew" is a pointer to the hash table that is to be initialized.
52** keyClass is one of the constants
53** FTS3_HASH_BINARY or FTS3_HASH_STRING. The value of keyClass
54** determines what kind of key the hash table will use. "copyKey" is
55** true if the hash table should make its own private copy of keys and
56** false if it should just use the supplied pointer.
57*/
58void sqlite3Fts3HashInit(fts3Hash *pNew, int keyClass, int copyKey){
59 assert( pNew!=0 );
60 assert( keyClass>=FTS3_HASH_STRING && keyClass<=FTS3_HASH_BINARY );
61 pNew->keyClass = keyClass;
62 pNew->copyKey = copyKey;
63 pNew->first = 0;
64 pNew->count = 0;
65 pNew->htsize = 0;
66 pNew->ht = 0;
67}
68
69/* Remove all entries from a hash table. Reclaim all memory.
70** Call this routine to delete a hash table or to reset a hash table
71** to the empty state.
72*/
73void sqlite3Fts3HashClear(fts3Hash *pH){
74 fts3HashElem *elem; /* For looping over all elements of the table */
75
76 assert( pH!=0 );
77 elem = pH->first;
78 pH->first = 0;
79 fts3HashFree(pH->ht);
80 pH->ht = 0;
81 pH->htsize = 0;
82 while( elem ){
83 fts3HashElem *next_elem = elem->next;
84 if( pH->copyKey && elem->pKey ){
85 fts3HashFree(elem->pKey);
86 }
87 fts3HashFree(elem);
88 elem = next_elem;
89 }
90 pH->count = 0;
91}
92
93/*
94** Hash and comparison functions when the mode is FTS3_HASH_STRING
95*/
/* String hash: fold each byte into the accumulator with h*9 ^ byte
** (expressed as (h<<3)^h^byte).  A non-positive nKey means the key is
** NUL-terminated and its length is computed with strlen().  The result
** is masked to a non-negative int.
*/
static int fts3StrHash(const void *pKey, int nKey){
  const char *zStr = (const char *)pKey;
  int h = 0;
  if( nKey<=0 ) nKey = (int) strlen(zStr);
  while( nKey-- > 0 ){
    h = (h<<3) ^ h ^ *zStr++;
  }
  return h & 0x7fffffff;
}
/* String key comparison: equal only when lengths match and the first
** n1 bytes compare equal.  Returns 0 on match, non-zero otherwise. */
static int fts3StrCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  if( n1==n2 ){
    return strncmp((const char*)pKey1, (const char*)pKey2, n1);
  }
  return 1;
}
110
111/*
112** Hash and comparison functions when the mode is FTS3_HASH_BINARY
113*/
/* Binary hash: same (h<<3)^h^byte recurrence as fts3StrHash, but the
** length is always taken from nKey (no NUL-termination assumed). */
static int fts3BinHash(const void *pKey, int nKey){
  const char *p = (const char *)pKey;
  const char *pEnd = p + nKey;
  int h = 0;
  while( p<pEnd ){
    h = (h<<3) ^ h ^ *(p++);
  }
  return h & 0x7fffffff;
}
/* Binary key comparison: lengths must match, then bytes via memcmp().
** Returns 0 on match, non-zero otherwise. */
static int fts3BinCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1!=n2) ? 1 : memcmp(pKey1, pKey2, n1);
}
126
127/*
128** Return a pointer to the appropriate hash function given the key class.
129**
130** The C syntax in this function definition may be unfamilar to some
131** programmers, so we provide the following additional explanation:
132**
133** The name of the function is "hashFunction". The function takes a
134** single parameter "keyClass". The return value of hashFunction()
135** is a pointer to another function. Specifically, the return value
136** of hashFunction() is a pointer to a function that takes two parameters
137** with types "const void*" and "int" and returns an "int".
138*/
139static int (*hashFunction(int keyClass))(const void*,int){
140 if( keyClass==FTS3_HASH_STRING ){
141 return &fts3StrHash;
142 }else{
143 assert( keyClass==FTS3_HASH_BINARY );
144 return &fts3BinHash;
145 }
146}
147
148/*
149** Return a pointer to the appropriate hash function given the key class.
150**
151** For help in interpreted the obscure C code in the function definition,
152** see the header comment on the previous function.
153*/
154static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
155 if( keyClass==FTS3_HASH_STRING ){
156 return &fts3StrCompare;
157 }else{
158 assert( keyClass==FTS3_HASH_BINARY );
159 return &fts3BinCompare;
160 }
161}
162
/* Link an element into the hash table.
**
** pNew becomes the new chain head of bucket pEntry.  It is also
** spliced into the table-wide doubly-linked list: when the bucket
** already has elements, pNew is inserted immediately before the old
** chain head (keeping one bucket's elements adjacent on the global
** list); otherwise pNew is simply prepended to the global list.
*/
static void fts3HashInsertElement(
  fts3Hash *pH,            /* The complete hash table */
  struct _fts3ht *pEntry,  /* The entry into which pNew is inserted */
  fts3HashElem *pNew       /* The element to be inserted */
){
  fts3HashElem *pHead;     /* First element already in pEntry */
  pHead = pEntry->chain;
  if( pHead ){
    /* Splice pNew in just before pHead on the global list */
    pNew->next = pHead;
    pNew->prev = pHead->prev;
    if( pHead->prev ){ pHead->prev->next = pNew; }
    else { pH->first = pNew; }
    pHead->prev = pNew;
  }else{
    /* Empty bucket: prepend pNew to the global list */
    pNew->next = pH->first;
    if( pH->first ){ pH->first->prev = pNew; }
    pNew->prev = 0;
    pH->first = pNew;
  }
  pEntry->count++;
  pEntry->chain = pNew;
}
187
188
/* Resize the hash table so that it contains "new_size" buckets.
** "new_size" must be a power of 2.  The hash table might fail
** to resize if fts3HashMalloc() fails; in that case the table is
** left unchanged.
*/
static void fts3Rehash(fts3Hash *pH, int new_size){
  struct _fts3ht *new_ht;          /* The new hash table */
  fts3HashElem *elem, *next_elem;  /* For looping over existing elements */
  int (*xHash)(const void*,int);   /* The hash function */

  assert( (new_size & (new_size-1))==0 );
  new_ht = (struct _fts3ht *)fts3HashMalloc( new_size*sizeof(struct _fts3ht) );
  if( new_ht==0 ) return;
  fts3HashFree(pH->ht);
  pH->ht = new_ht;
  pH->htsize = new_size;
  xHash = hashFunction(pH->keyClass);
  /* Walk the global list, re-bucketing every element.  pH->first is
  ** reset to 0 first because fts3HashInsertElement rebuilds the list. */
  for(elem=pH->first, pH->first=0; elem; elem = next_elem){
    int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
    next_elem = elem->next;
    fts3HashInsertElement(pH, &new_ht[h], elem);
  }
}
211
212/* This function (for internal use only) locates an element in an
213** hash table that matches the given key. The hash for this key has
214** already been computed and is passed as the 4th parameter.
215*/
216static fts3HashElem *fts3FindElementByHash(
217 const fts3Hash *pH, /* The pH to be searched */
218 const void *pKey, /* The key we are searching for */
219 int nKey,
220 int h /* The hash for this key. */
221){
222 fts3HashElem *elem; /* Used to loop thru the element list */
223 int count; /* Number of elements left to test */
224 int (*xCompare)(const void*,int,const void*,int); /* comparison function */
225
226 if( pH->ht ){
227 struct _fts3ht *pEntry = &pH->ht[h];
228 elem = pEntry->chain;
229 count = pEntry->count;
230 xCompare = compareFunction(pH->keyClass);
231 while( count-- && elem ){
232 if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
233 return elem;
234 }
235 elem = elem->next;
236 }
237 }
238 return 0;
239}
240
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.
*/
static void fts3RemoveElementByHash(
  fts3Hash *pH,         /* The pH containing "elem" */
  fts3HashElem* elem,   /* The element to be removed from the pH */
  int h                 /* Hash value for the element */
){
  struct _fts3ht *pEntry;
  /* Unlink elem from the table-wide doubly-linked list */
  if( elem->prev ){
    elem->prev->next = elem->next;
  }else{
    pH->first = elem->next;
  }
  if( elem->next ){
    elem->next->prev = elem->prev;
  }
  /* Fix up the bucket: advance the chain head if elem was at the front */
  pEntry = &pH->ht[h];
  if( pEntry->chain==elem ){
    pEntry->chain = elem->next;
  }
  pEntry->count--;
  if( pEntry->count<=0 ){
    pEntry->chain = 0;
  }
  /* Free the key copy only if the table owns it */
  if( pH->copyKey && elem->pKey ){
    fts3HashFree(elem->pKey);
  }
  fts3HashFree( elem );
  pH->count--;
  if( pH->count<=0 ){
    /* Table is now empty: release the bucket array too */
    assert( pH->first==0 );
    assert( pH->count==0 );
    fts3HashClear(pH);
  }
}
277
278/* Attempt to locate an element of the hash table pH with a key
279** that matches pKey,nKey. Return the data for this element if it is
280** found, or NULL if there is no match.
281*/
282void *sqlite3Fts3HashFind(const fts3Hash *pH, const void *pKey, int nKey){
283 int h; /* A hash on key */
284 fts3HashElem *elem; /* The element that matches key */
285 int (*xHash)(const void*,int); /* The hash function */
286
287 if( pH==0 || pH->ht==0 ) return 0;
288 xHash = hashFunction(pH->keyClass);
289 assert( xHash!=0 );
290 h = (*xHash)(pKey,nKey);
291 assert( (pH->htsize & (pH->htsize-1))==0 );
292 elem = fts3FindElementByHash(pH,pKey,nKey, h & (pH->htsize-1));
293 return elem ? elem->data : 0;
294}
295
296/* Insert an element into the hash table pH. The key is pKey,nKey
297** and the data is "data".
298**
299** If no element exists with a matching key, then a new
300** element is created. A copy of the key is made if the copyKey
301** flag is set. NULL is returned.
302**
303** If another element already exists with the same key, then the
304** new data replaces the old data and the old data is returned.
305** The key is not copied in this instance. If a malloc fails, then
306** the new data is returned and the hash table is unchanged.
307**
308** If the "data" parameter to this function is NULL, then the
309** element corresponding to "key" is removed from the hash table.
310*/
void *sqlite3Fts3HashInsert(
  fts3Hash *pH,        /* The hash table to insert into */
  const void *pKey,    /* The key */
  int nKey,            /* Number of bytes in the key */
  void *data           /* The data */
){
  int hraw;                 /* Raw hash value of the key */
  int h;                    /* the hash of the key modulo hash table size */
  fts3HashElem *elem;       /* Used to loop thru the element list */
  fts3HashElem *new_elem;   /* New element added to the pH */
  int (*xHash)(const void*,int);  /* The hash function */

  assert( pH!=0 );
  xHash = hashFunction(pH->keyClass);
  assert( xHash!=0 );
  hraw = (*xHash)(pKey, nKey);
  assert( (pH->htsize & (pH->htsize-1))==0 );  /* htsize is a power of 2 */
  h = hraw & (pH->htsize-1);
  elem = fts3FindElementByHash(pH,pKey,nKey,h);
  if( elem ){
    /* Key already present: data==0 deletes the entry, otherwise the
    ** data is replaced.  Either way the old data is returned. */
    void *old_data = elem->data;
    if( data==0 ){
      fts3RemoveElementByHash(pH,elem,h);
    }else{
      elem->data = data;
    }
    return old_data;
  }
  if( data==0 ) return 0;   /* Deleting a key that isn't present: no-op */
  new_elem = (fts3HashElem*)fts3HashMalloc( sizeof(fts3HashElem) );
  if( new_elem==0 ) return data;   /* OOM: table unchanged, data returned */
  if( pH->copyKey && pKey!=0 ){
    /* Table owns its keys: make a private copy */
    new_elem->pKey = fts3HashMalloc( nKey );
    if( new_elem->pKey==0 ){
      fts3HashFree(new_elem);
      return data;
    }
    memcpy((void*)new_elem->pKey, pKey, nKey);
  }else{
    new_elem->pKey = (void*)pKey;
  }
  new_elem->nKey = nKey;
  pH->count++;
  if( pH->htsize==0 ){
    /* First insert: create the initial 8-bucket array */
    fts3Rehash(pH,8);
    if( pH->htsize==0 ){
      /* Rehash failed (OOM): undo the insert */
      pH->count = 0;
      fts3HashFree(new_elem);
      return data;
    }
  }
  if( pH->count > pH->htsize ){
    fts3Rehash(pH,pH->htsize*2);
  }
  assert( pH->htsize>0 );
  assert( (pH->htsize & (pH->htsize-1))==0 );
  /* Recompute the bucket index: htsize may have changed above */
  h = hraw & (pH->htsize-1);
  fts3HashInsertElement(pH, &pH->ht[h], new_elem);
  new_elem->data = data;
  return 0;
}
372
373#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h
deleted file mode 100644
index e01954e..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h
+++ /dev/null
@@ -1,110 +0,0 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the header file for the generic hash-table implemenation
13** used in SQLite. We've modified it slightly to serve as a standalone
14** hash table implementation for the full-text indexing module.
15**
16*/
17#ifndef _FTS3_HASH_H_
18#define _FTS3_HASH_H_
19
20/* Forward declarations of structures. */
21typedef struct fts3Hash fts3Hash;
22typedef struct fts3HashElem fts3HashElem;
23
/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly.  Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct fts3Hash {
  char keyClass;          /* FTS3_HASH_STRING or FTS3_HASH_BINARY */
  char copyKey;           /* True if copy of key made on insert */
  int count;              /* Number of entries in this table */
  fts3HashElem *first;    /* The first element of the array */
  int htsize;             /* Number of buckets in the hash table */
  struct _fts3ht {        /* the hash table */
    int count;            /* Number of entries with this hash */
    fts3HashElem *chain;  /* Pointer to first entry with this hash */
  } *ht;
};
43
/* Each element in the hash table is an instance of the following
** structure.  All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct fts3HashElem {
  fts3HashElem *next, *prev; /* Next and previous elements in the table */
  void *data;                /* Data associated with this element */
  void *pKey; int nKey;      /* Key; owned by the table iff copyKey is set */
};
55
56/*
57** There are 2 different modes of operation for a hash table:
58**
59** FTS3_HASH_STRING pKey points to a string that is nKey bytes long
60** (including the null-terminator, if any). Case
61** is respected in comparisons.
62**
63** FTS3_HASH_BINARY pKey points to binary data nKey bytes long.
64** memcmp() is used to compare keys.
65**
66** A copy of the key is made if the copyKey parameter to fts3HashInit is 1.
67*/
68#define FTS3_HASH_STRING 1
69#define FTS3_HASH_BINARY 2
70
71/*
72** Access routines. To delete, insert a NULL pointer.
73*/
74void sqlite3Fts3HashInit(fts3Hash*, int keytype, int copyKey);
75void *sqlite3Fts3HashInsert(fts3Hash*, const void *pKey, int nKey, void *pData);
76void *sqlite3Fts3HashFind(const fts3Hash*, const void *pKey, int nKey);
77void sqlite3Fts3HashClear(fts3Hash*);
78
79/*
80** Shorthand for the functions above
81*/
82#define fts3HashInit sqlite3Fts3HashInit
83#define fts3HashInsert sqlite3Fts3HashInsert
84#define fts3HashFind sqlite3Fts3HashFind
85#define fts3HashClear sqlite3Fts3HashClear
86
87/*
88** Macros for looping over all elements of a hash table. The idiom is
89** like this:
90**
91** fts3Hash h;
92** fts3HashElem *p;
93** ...
94** for(p=fts3HashFirst(&h); p; p=fts3HashNext(p)){
95** SomeStructure *pData = fts3HashData(p);
96** // do something with pData
97** }
98*/
99#define fts3HashFirst(H) ((H)->first)
100#define fts3HashNext(E) ((E)->next)
101#define fts3HashData(E) ((E)->data)
102#define fts3HashKey(E) ((E)->pKey)
103#define fts3HashKeysize(E) ((E)->nKey)
104
105/*
106** Number of entries in a hash table
107*/
108#define fts3HashCount(H) ((H)->count)
109
110#endif /* _FTS3_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c
deleted file mode 100644
index 86a9a50..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c
+++ /dev/null
@@ -1,257 +0,0 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file implements a tokenizer for fts3 based on the ICU library.
13**
14** $Id: fts3_icu.c,v 1.1 2007/08/20 17:37:04 shess Exp $
15*/
16
17#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
18#ifdef SQLITE_ENABLE_ICU
19
20#include <assert.h>
21#include <string.h>
22#include "fts3_tokenizer.h"
23
24#include <unicode/ubrk.h>
25#include <unicode/ucol.h>
26#include <unicode/ustring.h>
27#include <unicode/utf16.h>
28
29typedef struct IcuTokenizer IcuTokenizer;
30typedef struct IcuCursor IcuCursor;
31
/* ICU tokenizer instance: base class plus the locale used when opening
** the word break-iterator. */
struct IcuTokenizer {
  sqlite3_tokenizer base;
  char *zLocale;           /* Locale string, or NULL (stored after struct) */
};

/* Cursor for one tokenization pass over a single input string. */
struct IcuCursor {
  sqlite3_tokenizer_cursor base;

  UBreakIterator *pIter;    /* ICU break-iterator object */
  int nChar;                /* Number of UChar elements in pInput */
  UChar *aChar;             /* Copy of input using utf-16 encoding */
  int *aOffset;             /* Offsets of each character in utf-8 input */

  int nBuffer;              /* Allocated size of zBuffer, in bytes */
  char *zBuffer;            /* Current token converted back to utf-8 */

  int iToken;               /* Position counter for the next token */
};
50
51/*
52** Create a new tokenizer instance.
53*/
54static int icuCreate(
55 int argc, /* Number of entries in argv[] */
56 const char * const *argv, /* Tokenizer creation arguments */
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
58){
59 IcuTokenizer *p;
60 int n = 0;
61
62 if( argc>0 ){
63 n = strlen(argv[0])+1;
64 }
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66 if( !p ){
67 return SQLITE_NOMEM;
68 }
69 memset(p, 0, sizeof(IcuTokenizer));
70
71 if( n ){
72 p->zLocale = (char *)&p[1];
73 memcpy(p->zLocale, argv[0], n);
74 }
75
76 *ppTokenizer = (sqlite3_tokenizer *)p;
77
78 return SQLITE_OK;
79}
80
/*
** Destroy a tokenizer created by icuCreate().  The locale string (if
** any) lives inside the same allocation, so a single free suffices.
*/
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}
89
/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is pInput[0..nBytes-1].  A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
**
** The input is case-folded and transcoded to UTF-16 (the encoding the
** ICU break-iterator operates on).  aOffset[] maps positions in the
** UTF-16 copy back to byte offsets in the original UTF-8 input.
*/
static int icuOpen(
  sqlite3_tokenizer *pTokenizer, /* The tokenizer */
  const char *zInput, /* Input string */
  int nInput, /* Length of zInput in bytes */
  sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  IcuCursor *pCsr;

  const int32_t opt = U_FOLD_CASE_DEFAULT;
  UErrorCode status = U_ZERO_ERROR;
  int nChar;

  UChar32 c;
  int iInput = 0;
  int iOut = 0;

  *ppCursor = 0;

  /* One allocation holds the cursor struct, the UTF-16 copy of the
  ** input, and the offset map. */
  nChar = nInput+1;
  pCsr = (IcuCursor *)sqlite3_malloc(
      sizeof(IcuCursor) + /* IcuCursor */
      nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
      (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
  );
  if( !pCsr ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(IcuCursor));
  pCsr->aChar = (UChar *)&pCsr[1];
  pCsr->aOffset = (int *)&pCsr->aChar[nChar];

  pCsr->aOffset[iOut] = iInput;
  U8_NEXT(zInput, iInput, nInput, c);
  while( c>0 ){
    int isError = 0;
    c = u_foldCase(c, opt);
    /* U16_APPEND advances iOut by 1 or 2 UChars; the offset map entry
    ** is written at the post-append index.  NOTE(review): entries for
    ** the second unit of a surrogate pair appear to be skipped --
    ** confirm this is harmless for ubrk boundary indexes. */
    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
    if( isError ){
      sqlite3_free(pCsr);
      return SQLITE_ERROR;
    }
    pCsr->aOffset[iOut] = iInput;

    if( iInput<nInput ){
      U8_NEXT(zInput, iInput, nInput, c);
    }else{
      c = 0;
    }
  }

  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_free(pCsr);
    return SQLITE_ERROR;
  }
  pCsr->nChar = iOut;

  ubrk_first(pCsr->pIter);
  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return SQLITE_OK;
}
158
/*
** Close a tokenization cursor previously opened by a call to icuOpen().
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pCsr = (IcuCursor *)pCursor;
  ubrk_close(pCsr->pIter);
  sqlite3_free(pCsr->zBuffer);   /* token buffer; may be NULL */
  sqlite3_free(pCsr);            /* aChar/aOffset live inside this block */
  return SQLITE_OK;
}
169
/*
** Extract the next token from a tokenization cursor.
**
** Boundaries come from the ICU break iterator.  Leading whitespace is
** trimmed from each candidate, the token is converted back to UTF-8
** into pCsr->zBuffer (grown on demand), and byte offsets into the
** original UTF-8 input are reported via the aOffset[] map.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by icuOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  /* Advance until a non-empty candidate token remains after trimming */
  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    /* Skip leading whitespace.  NOTE(review): U8_NEXT is a UTF-8 macro
    ** but aChar[] holds UTF-16 -- looks like U16_NEXT was intended;
    ** confirm against upstream fts3_icu.c fixes. */
    while( iStart<iEnd ){
      int iWhite = iStart;
      U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  /* Convert to UTF-8.  The first pass may only measure (zBuffer starts
  ** NULL/too small); grow the buffer and retry until the result fits. */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
234
/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};
246
/*
** Set *ppModule to point at the implementation of the ICU tokenizer.
*/
void sqlite3Fts3IcuTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &icuTokenizerModule;
}
255
256#endif /* defined(SQLITE_ENABLE_ICU) */
257#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c
deleted file mode 100644
index 14e129f..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c
+++ /dev/null
@@ -1,642 +0,0 @@
1/*
2** 2006 September 30
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** Implementation of the full-text-search tokenizer that implements
13** a Porter stemmer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS3 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS3 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts3_tokenizer.h"
35
/*
** Class derived from sqlite3_tokenizer
*/
typedef struct porter_tokenizer {
  sqlite3_tokenizer base;      /* Base class */
} porter_tokenizer;

/*
** Class derived from sqlite3_tokenizer_cursor
*/
typedef struct porter_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *zInput;          /* input we are tokenizing */
  int nInput;                  /* size of the input */
  int iOffset;                 /* current position in zInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  int nAllocated;              /* space allocated to zToken buffer */
} porter_tokenizer_cursor;
55
56
57/* Forward declaration */
58static const sqlite3_tokenizer_module porterTokenizerModule;
59
60
61/*
62** Create a new tokenizer instance.
63*/
64static int porterCreate(
65 int argc, const char * const *argv,
66 sqlite3_tokenizer **ppTokenizer
67){
68 porter_tokenizer *t;
69 t = (porter_tokenizer *) calloc(sizeof(*t), 1);
70 if( t==NULL ) return SQLITE_NOMEM;
71
72 *ppTokenizer = &t->base;
73 return SQLITE_OK;
74}
75
/*
** Destroy a tokenizer allocated by porterCreate().
*/
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
  free(pTokenizer);
  return SQLITE_OK;
}
83
84/*
85** Prepare to begin tokenizing a particular string. The input
86** string to be tokenized is zInput[0..nInput-1]. A cursor
87** used to incrementally tokenize this string is returned in
88** *ppCursor.
89*/
90static int porterOpen(
91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
92 const char *zInput, int nInput, /* String to be tokenized */
93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
94){
95 porter_tokenizer_cursor *c;
96
97 c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
98 if( c==NULL ) return SQLITE_NOMEM;
99
100 c->zInput = zInput;
101 if( zInput==0 ){
102 c->nInput = 0;
103 }else if( nInput<0 ){
104 c->nInput = (int)strlen(zInput);
105 }else{
106 c->nInput = nInput;
107 }
108 c->iOffset = 0; /* start tokenizing at the beginning */
109 c->iToken = 0;
110 c->zToken = NULL; /* no space allocated, yet. */
111 c->nAllocated = 0;
112
113 *ppCursor = &c->base;
114 return SQLITE_OK;
115}
116
/*
** Close a tokenization cursor previously opened by a call to
** porterOpen() above.
*/
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
  free(c->zToken);   /* may be NULL if no token was ever returned */
  free(c);
  return SQLITE_OK;
}
/*
** Vowel or consonant classification for 'a'..'z':
** 0 = vowel (a, e, i, o, u); 1 = consonant;
** 2 = 'y', whose class depends on context (see isConsonant/isVowel).
*/
static const char cType[] = {
   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
   1, 1, 1, 2, 1
};
134
/*
** isConsonant() and isVowel() determine if their first character in
** the string they point to is a consonant or a vowel, according
** to Porter rules.
**
** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'Y' is a consonant unless it follows another consonant,
** in which case it is a vowel.
**
** In these routines, the letters are in reverse order.  So the 'y' rule
** is that 'y' is a consonant unless it is followed by another
** consonant.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  int j;
  char x = *z;
  if( x==0 ) return 0;
  assert( x>='a' && x<='z' );
  j = cType[x-'a'];
  if( j<2 ) return j;   /* 0 = vowel, 1 = consonant, decided directly */
  /* x=='y': consonant if at end of (reversed) string or followed by a vowel */
  return z[1]==0 || isVowel(z + 1);
}
static int isVowel(const char *z){
  int j;
  char x = *z;
  if( x==0 ) return 0;
  assert( x>='a' && x<='z' );
  j = cType[x-'a'];
  if( j<2 ) return 1-j;
  /* x=='y': vowel if followed (in reversed order) by a consonant */
  return isConsonant(z + 1);
}
167
/*
** Let any sequence of one or more vowels be represented by V and let
** C be sequence of one or more consonants.  Then every word can be
** represented as:
**
**           [C] (VC){m} [V]
**
** In prose:  A word is an optional consonant followed by zero or
** vowel-consonant pairs followed by an optional vowel.  "m" is the
** number of vowel consonant pairs.  This routine computes the value
** of m for the first i bytes of a word.
**
** Return true if the m-value for z is 1 or more.  In other words,
** return true if z contains at least one vowel that is followed
** by a consonant.
**
** In this routine z[] is in reverse order.  So we are really looking
** for an instance of a consonant followed by a vowel.
*/
static int m_gt_0(const char *z){
  for(; isVowel(z); z++){}
  if( *z==0 ) return 0;
  for(; isConsonant(z); z++){}
  return *z!=0;
}
193
/* Like m_gt_0 above except we are looking for a value of m which is
** exactly 1.  The (reversed) word must be exactly [V] C V [C]:
** after skipping one optional vowel run and one consonant run, a
** vowel run must follow, and then at most one final consonant run.
*/
static int m_eq_1(const char *z){
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isConsonant(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 1;
  while( isConsonant(z) ){ z++; }
  return *z==0;
}
207
/* Like m_gt_0 above except we are looking for a value of m>1 instead
** of m>0: two consonant-followed-by-vowel transitions must be present
** in the reversed word.
*/
static int m_gt_1(const char *z){
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isConsonant(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isVowel(z) ){ z++; }
  if( *z==0 ) return 0;
  while( isConsonant(z) ){ z++; }
  return *z!=0;
}
221
/*
** Return TRUE if there is a vowel anywhere within the NUL-terminated
** (reversed) string z.
*/
static int hasVowel(const char *z){
  for(; isConsonant(z); z++){}
  return *z!=0;
}
229
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here. So we are really looking at
** the first two characters of z[]: they must be the same letter
** and both classify as consonants.
*/
static int doubleConsonant(const char *z){
  return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
}
239
/*
** Return TRUE if the word ends with three letters which
** are consonant-vowel-consonant and where the final consonant
** is not 'w', 'x', or 'y'.
**
** The word is reversed here.  So we are really checking the
** first three letters and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
  return
    z[0]!=0 && isConsonant(z) &&
    z[0]!='w' && z[0]!='x' && z[0]!='y' &&
    z[1]!=0 && isVowel(z+1) &&
    z[2]!=0 && isConsonant(z+2);
}
255
/*
** If the word ends with zFrom and xCond() is true for the stem
** of the word that precedes the zFrom ending, then change the
** ending to zTo.
**
** The input word *pz and zFrom are both in reverse order.  zTo
** is in normal order.
**
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
** match.  Note that TRUE is returned even if xCond() fails and
** no substitution occurs.
*/
static int stem(
  char **pz,             /* The word being stemmed (Reversed) */
  const char *zFrom,     /* If the ending matches this... (Reversed) */
  const char *zTo,       /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*)   /* Condition that must be true */
){
  char *z = *pz;
  /* Match the (reversed) suffix zFrom against the front of z */
  while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
  if( *zFrom!=0 ) return 0;
  if( xCond && !xCond(z) ) return 1;
  /* Write zTo backwards in front of the remaining stem */
  while( *zTo ){
    *(--z) = *(zTo++);
  }
  *pz = z;
  return 1;
}
284
/*
** Fallback stemmer used when the porter stemmer is inappropriate.
** The input word is copied into the output with US-ASCII case folding.
** If the word is too long (more than 20 bytes when digit-free, more
** than 6 bytes otherwise) it is truncated by keeping 10 (or 3) bytes
** from each end.  The output is NUL-terminated and its length is
** written to *pnOut.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i;
  int nOut = nIn;        /* Final output length */
  int hasDigit = 0;      /* True if any byte of zIn is '0'..'9' */

  /* Case-fold while copying, and note whether any digits appear */
  for(i=0; i<nIn; i++){
    char c = zIn[i];
    if( c>='A' && c<='Z' ){
      c = c - 'A' + 'a';
    }else if( c>='0' && c<='9' ){
      hasDigit = 1;
    }
    zOut[i] = c;
  }

  /* Truncate long words: keep mx bytes from each end */
  {
    int mx = hasDigit ? 3 : 10;
    if( nIn>mx*2 ){
      int j;
      for(i=nIn-mx, j=mx; i<nIn; i++, j++){
        zOut[j] = zOut[i];
      }
      nOut = j;
    }
  }
  zOut[nOut] = 0;
  *pnOut = nOut;
}
315
316
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
**
** If the input word contains any character outside of [a-zA-Z]
** then no stemming is attempted and this routine just copies the
** input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];   /* The word, reversed and lower-cased, with padding */
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word into zReverse[] backwards, folding to lower case.
  ** Storing the word reversed lets every suffix test in stem() run as
  ** a simple forward prefix comparison. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);  /* nul padding after word */
  z = &zReverse[j+1];   /* z now points at the reversed word */


  /* Step 1a */
  if( z[0]=='s' ){
    if(
      !stem(&z, "sess", "ss", 0) &&
      !stem(&z, "sei", "i", 0) &&
      !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
    (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
    && z!=z2
  ){
    if( stem(&z, "ta", "ate", 0) ||
        stem(&z, "lb", "ble", 0) ||
        stem(&z, "zi", "ize", 0) ){
      /* Do nothing.  The work was all in the test */
    }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
      z++;
    }else if( m_eq_1(z) && star_oh(z) ){
      *(--z) = 'e';
    }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2
  **
  ** Each || chain below relies on short-circuit evaluation: the first
  ** suffix that matches (even if its condition fails) stops the chain.
  ** All suffix strings are reversed, matching the reversed word in z[].
  */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4
  **
  ** Suffixes that are simply dropped are handled by advancing z
  ** directly instead of calling stem() with an empty replacement.
  */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
556
/*
** Characters that can be part of a token.  We assume any character
** whose value is greater than 0x80 (any UTF character) can be
** part of a token.  In other words, delimiters all must have
** values of 0x7f or lower.
**
** The table is indexed by (c - 0x30) and covers codes 0x30..0x7f;
** codes below 0x30 are always treated as delimiters by isDelim().
*/
static const char porterIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* NOTE: the macro stores its argument into an int variable named "ch"
** that must be declared at the point of use (see porterNext()). */
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
572
573/*
574** Extract the next token from a tokenization cursor. The cursor must
575** have been opened by a prior call to porterOpen().
576*/
577static int porterNext(
578 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
579 const char **pzToken, /* OUT: *pzToken is the token text */
580 int *pnBytes, /* OUT: Number of bytes in token */
581 int *piStartOffset, /* OUT: Starting offset of token */
582 int *piEndOffset, /* OUT: Ending offset of token */
583 int *piPosition /* OUT: Position integer of token */
584){
585 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
586 const char *z = c->zInput;
587
588 while( c->iOffset<c->nInput ){
589 int iStartOffset, ch;
590
591 /* Scan past delimiter characters */
592 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
593 c->iOffset++;
594 }
595
596 /* Count non-delimiter characters. */
597 iStartOffset = c->iOffset;
598 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
599 c->iOffset++;
600 }
601
602 if( c->iOffset>iStartOffset ){
603 int n = c->iOffset-iStartOffset;
604 if( n>c->nAllocated ){
605 c->nAllocated = n+20;
606 c->zToken = realloc(c->zToken, c->nAllocated);
607 if( c->zToken==NULL ) return SQLITE_NOMEM;
608 }
609 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
610 *pzToken = c->zToken;
611 *piStartOffset = iStartOffset;
612 *piEndOffset = c->iOffset;
613 *piPosition = c->iToken++;
614 return SQLITE_OK;
615 }
616 }
617 return SQLITE_DONE;
618}
619
/*
** The set of routines that implement the porter-stemmer tokenizer
*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
  0,                /* iVersion */
  porterCreate,     /* xCreate */
  porterDestroy,    /* xDestroy */
  porterOpen,       /* xOpen */
  porterClose,      /* xClose */
  porterNext,       /* xNext */
};
631
/*
** Allocate a new porter tokenizer.  Return a pointer to the new
** tokenizer in *ppModule
**
** (No allocation actually occurs: the module is a static singleton
** and the caller must not free it.)
*/
void sqlite3Fts3PorterTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &porterTokenizerModule;
}
641
642#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c
deleted file mode 100644
index 7398227..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c
+++ /dev/null
@@ -1,371 +0,0 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This is part of an SQLite module implementing full-text search.
14** This particular file implements the generic tokenizer interface.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS3 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS3 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
27
28
29#include "sqlite3.h"
30#include "sqlite3ext.h"
31SQLITE_EXTENSION_INIT1
32
33#include "fts3_hash.h"
34#include "fts3_tokenizer.h"
35#include <assert.h>
36
/*
** Implementation of the SQL scalar function for accessing the underlying
** hash table.  This function may be called as follows:
**
**   SELECT <function-name>(<key-name>);
**   SELECT <function-name>(<key-name>, <pointer>);
**
** where <function-name> is the name passed as the second argument
** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer').
**
** If the <pointer> argument is specified, it must be a blob value
** containing a pointer to be stored as the hash data corresponding
** to the string <key-name>.  If <pointer> is not specified, then
** the string <key-name> must already exist in the hash table.  Otherwise,
** an error is returned.
**
** Whether or not the <pointer> argument is specified, the value returned
** is a blob containing the pointer stored as the hash data corresponding
** to string <key-name> (after the hash-table is updated, if applicable).
*/
static void scalarFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts3Hash *pHash;
  void *pPtr = 0;
  const unsigned char *zName;
  int nName;

  assert( argc==1 || argc==2 );

  pHash = (fts3Hash *)sqlite3_user_data(context);

  zName = sqlite3_value_text(argv[0]);
  nName = sqlite3_value_bytes(argv[0])+1;  /* +1: the nul terminator is part of the key */

  if( argc==2 ){
    void *pOld;
    int n = sqlite3_value_bytes(argv[1]);
    /* The second argument must be a blob exactly the size of a pointer. */
    if( n!=sizeof(pPtr) ){
      sqlite3_result_error(context, "argument type mismatch", -1);
      return;
    }
    pPtr = *(void **)sqlite3_value_blob(argv[1]);
    pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
    /* NOTE(review): this treats "insert returned the data pointer itself"
    ** as the hash table's out-of-memory signal — confirm against
    ** sqlite3Fts3HashInsert() in fts3_hash.c. */
    if( pOld==pPtr ){
      sqlite3_result_error(context, "out of memory", -1);
      return;
    }
  }else{
    pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
    if( !pPtr ){
      char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
      sqlite3_result_error(context, zErr, -1);
      sqlite3_free(zErr);
      return;
    }
  }

  /* Return the stored pointer wrapped in a blob. */
  sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
}
99
100#ifdef SQLITE_TEST
101
102#include <tcl.h>
103#include <string.h>
104
/*
** Implementation of a special SQL scalar function for testing tokenizers
** designed to be used in concert with the Tcl testing framework.  This
** function must be called with two or three arguments:
**
**   SELECT <function-name>(<key-name>, <input-string>);
**   SELECT <function-name>(<key-name>, <arg>, <input-string>);
**
** where <function-name> is the name passed as the second argument
** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test').
** The optional middle argument is passed through to the tokenizer's
** xCreate() method.
**
** The return value is a string that may be interpreted as a Tcl
** list.  For each token in the <input-string>, three elements are
** added to the returned list.  The first is the token position, the
** second is the token text (folded, stemmed, etc.) and the third is the
** substring of <input-string> associated with the token.  For example,
** using the built-in "simple" tokenizer:
**
**   SELECT fts_tokenizer_test('simple', 'I don't see how');
**
** will return the string:
**
**   "{0 i I 1 dont don't 2 see see 3 how how}"
**
*/
static void testFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts3Hash *pHash;
  sqlite3_tokenizer_module *p;
  sqlite3_tokenizer *pTokenizer = 0;
  sqlite3_tokenizer_cursor *pCsr = 0;

  const char *zErr = 0;          /* Static error message, or 0 for success */

  const char *zName;             /* Tokenizer name (hash key) */
  int nName;
  const char *zInput;            /* Text to tokenize */
  int nInput;

  const char *zArg = 0;          /* Optional argument for xCreate() */

  const char *zToken;
  int nToken;
  int iStart;
  int iEnd;
  int iPos;

  Tcl_Obj *pRet;                 /* Tcl list accumulating the result */

  assert( argc==2 || argc==3 );

  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[argc-1]);
  zInput = (const char *)sqlite3_value_text(argv[argc-1]);

  if( argc==3 ){
    zArg = (const char *)sqlite3_value_text(argv[1]);
  }

  /* Look the tokenizer module up in the hash table by name. */
  pHash = (fts3Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);

  if( !p ){
    /* This zErr intentionally shadows the outer one: it is dynamically
    ** allocated and freed before returning. */
    char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    sqlite3_result_error(context, zErr, -1);
    sqlite3_free(zErr);
    return;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
  }
  pCsr->pTokenizer = pTokenizer;

  /* Append {position, normalized-token, source-substring} triples. */
  while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    zToken = &zInput[iStart];
    nToken = iEnd-iStart;
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  }

  if( SQLITE_OK!=p->xClose(pCsr) ){
    zErr = "error in xClose()";
    goto finish;
  }
  if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
    zErr = "error in xDestroy()";
    goto finish;
  }

finish:
  if( zErr ){
    sqlite3_result_error(context, zErr, -1);
  }else{
    sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
  }
  Tcl_DecrRefCount(pRet);
}
218
/*
** Register tokenizer module p under the name zName by invoking the
** "fts3_tokenizer(?, ?)" scalar function.  The module pointer is
** passed as a blob containing the raw pointer value.
**
** Returns an SQLite error code (SQLITE_OK on success).
*/
static
int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  int rc;
  sqlite3_stmt *pStmt;
  const char zSql[] = "SELECT fts3_tokenizer(?, ?)";

  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  sqlite3_step(pStmt);

  /* Any error from the bind or step calls is reported by finalize(). */
  return sqlite3_finalize(pStmt);
}
240
/*
** Look up the tokenizer module registered under zName by invoking the
** "fts3_tokenizer(?)" scalar function and decoding the pointer-sized
** blob it returns.  *pp is set to the module pointer, or left at 0 if
** the lookup fails.
**
** Returns an SQLite error code (SQLITE_OK on success).
*/
static
int queryTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module **pp
){
  int rc;
  sqlite3_stmt *pStmt;
  const char zSql[] = "SELECT fts3_tokenizer(?)";

  *pp = 0;
  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  if( SQLITE_ROW==sqlite3_step(pStmt) ){
    if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
      /* The blob holds the module pointer itself. */
      memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
    }
  }

  return sqlite3_finalize(pStmt);
}
266
267void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
268
/*
** Implementation of the scalar function fts3_tokenizer_internal_test().
** This function is used for testing only, it is not included in the
** build unless SQLITE_TEST is defined.
**
** The purpose of this is to test that the fts3_tokenizer() function
** can be used as designed by the C-code in the queryTokenizer and
** registerTokenizer() functions above.  These two functions are repeated
** in the README.tokenizer file as an example, so it is important to
** test them.
**
** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
** function with no arguments.  An assert() will fail if a problem is
** detected.  i.e.:
**
**   SELECT fts3_tokenizer_internal_test();
**
*/
static void intTestFunc(
  sqlite3_context *context,
  int argc,            /* Unused: registered with 0 arguments */
  sqlite3_value **argv /* Unused */
){
  int rc;
  const sqlite3_tokenizer_module *p1;
  const sqlite3_tokenizer_module *p2;
  sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);

  /* Test the query function */
  sqlite3Fts3SimpleTokenizerModule(&p1);
  rc = queryTokenizer(db, "simple", &p2);
  assert( rc==SQLITE_OK );
  assert( p1==p2 );
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_ERROR );
  assert( p2==0 );
  assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );

  /* Test the storage function */
  rc = registerTokenizer(db, "nosuchtokenizer", p1);
  assert( rc==SQLITE_OK );
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_OK );
  assert( p2==p1 );

  sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
}
316
317#endif
318
/*
** Set up SQL objects in database db used to access the contents of
** the hash table pointed to by argument pHash.  The hash table must
** have been initialised to use string keys, and to take a private copy
** of the key when a value is inserted.  i.e. by a call similar to:
**
**    sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
**
** This function adds a scalar function (see header comment above
** scalarFunc() in this file for details) and, if ENABLE_TABLE is
** defined at compilation time, a temporary virtual table (see header
** comment above struct HashTableVtab) to the database schema.  Both
** provide read/write access to the contents of *pHash.
**
** The third argument to this function, zName, is used as the name
** of both the scalar and, if created, the virtual table.
*/
int sqlite3Fts3InitHashTable(
  sqlite3 *db,
  fts3Hash *pHash,
  const char *zName
){
  int rc = SQLITE_OK;
  void *p = (void *)pHash;
  const int any = SQLITE_ANY;
  char *zTest = 0;
  char *zTest2 = 0;

#ifdef SQLITE_TEST
  void *pdb = (void *)db;
  zTest = sqlite3_mprintf("%s_test", zName);
  zTest2 = sqlite3_mprintf("%s_internal_test", zName);
  if( !zTest || !zTest2 ){
    rc = SQLITE_NOMEM;
  }
#endif

  /* The if(...) below intentionally has an empty body.  It relies on
  ** ||-short-circuit evaluation to stop registering functions at the
  ** first failure, leaving the error code in rc. */
  if( rc!=SQLITE_OK
   || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
#ifdef SQLITE_TEST
   || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
#endif
  );

  /* sqlite3_free(0) is a harmless no-op in non-SQLITE_TEST builds. */
  sqlite3_free(zTest);
  sqlite3_free(zTest2);
  return rc;
}
370
371#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h
deleted file mode 100644
index 4faef56..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2** 2006 July 10
3**
4** The author disclaims copyright to this source code.
5**
6*************************************************************************
7** Defines the interface to tokenizers used by fulltext-search. There
8** are three basic components:
9**
10** sqlite3_tokenizer_module is a singleton defining the tokenizer
11** interface functions. This is essentially the class structure for
12** tokenizers.
13**
14** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
15** including customization information defined at creation time.
16**
17** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
18** tokens from a particular input.
19*/
20#ifndef _FTS3_TOKENIZER_H_
21#define _FTS3_TOKENIZER_H_
22
23/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
24** If tokenizers are to be allowed to call sqlite3_*() functions, then
25** we will need a way to register the API consistently.
26*/
27#include "sqlite3.h"
28
/*
** Structures used by the tokenizer interface.  When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation.  The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called.  It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
**
** All methods return SQLITE_OK on success or an SQLite error code.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

struct sqlite3_tokenizer_module {

  /*
  ** Structure version.  Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer.  The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table.  For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code.  If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure.  The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback.  The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,             /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer.  The fts3 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer.  The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor.  The fts3 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor.  This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed).  *pnBytes should be set to the length
  ** of this buffer in bytes.  The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation.  It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};
134
/*
** Base "class" for a tokenizer instance returned by xCreate().  The
** tokenizer implementations in this extension embed this structure as
** their first member so that pointers may be cast between the base
** and the concrete type.
*/
struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

/*
** Base "class" for a tokenization cursor returned by xOpen().
*/
struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};
144
145#endif /* _FTS3_TOKENIZER_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c
deleted file mode 100644
index f53cc1d..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c
+++ /dev/null
@@ -1,229 +0,0 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** Implementation of the "simple" full-text-search tokenizer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS3 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS3 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts3_tokenizer.h"
35
/*
** The "simple" tokenizer: splits input on a configurable set of ASCII
** delimiter characters and lower-cases ASCII token text.
*/
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;  /* Base class; first member so casts work */
  char delim[128];         /* flag ASCII delimiters */
} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;  /* Base class; first member so casts work */
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;
50
51
52/* Forward declaration */
53static const sqlite3_tokenizer_module simpleTokenizerModule;
54
55static int simpleDelim(simple_tokenizer *t, unsigned char c){
56 return c<0x80 && t->delim[c];
57}
58
59/*
60** Create a new tokenizer instance.
61*/
62static int simpleCreate(
63 int argc, const char * const *argv,
64 sqlite3_tokenizer **ppTokenizer
65){
66 simple_tokenizer *t;
67
68 t = (simple_tokenizer *) calloc(sizeof(*t), 1);
69 if( t==NULL ) return SQLITE_NOMEM;
70
71 /* TODO(shess) Delimiters need to remain the same from run to run,
72 ** else we need to reindex. One solution would be a meta-table to
73 ** track such information in the database, then we'd only want this
74 ** information on the initial create.
75 */
76 if( argc>1 ){
77 int i, n = strlen(argv[1]);
78 for(i=0; i<n; i++){
79 unsigned char ch = argv[1][i];
80 /* We explicitly don't support UTF-8 delimiters for now. */
81 if( ch>=0x80 ){
82 free(t);
83 return SQLITE_ERROR;
84 }
85 t->delim[ch] = 1;
86 }
87 } else {
88 /* Mark non-alphanumeric ASCII characters as delimiters */
89 int i;
90 for(i=1; i<0x80; i++){
91 t->delim[i] = !isalnum(i);
92 }
93 }
94
95 *ppTokenizer = &t->base;
96 return SQLITE_OK;
97}
98
/*
** Destroy a tokenizer
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  /* pTokenizer points at the embedded base member of a calloc()'d
  ** simple_tokenizer, so a single free() releases the whole object. */
  free(pTokenizer);
  return SQLITE_OK;
}
106
107/*
108** Prepare to begin tokenizing a particular string. The input
109** string to be tokenized is pInput[0..nBytes-1]. A cursor
110** used to incrementally tokenize this string is returned in
111** *ppCursor.
112*/
113static int simpleOpen(
114 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
115 const char *pInput, int nBytes, /* String to be tokenized */
116 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
117){
118 simple_tokenizer_cursor *c;
119
120 c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
121 if( c==NULL ) return SQLITE_NOMEM;
122
123 c->pInput = pInput;
124 if( pInput==0 ){
125 c->nBytes = 0;
126 }else if( nBytes<0 ){
127 c->nBytes = (int)strlen(pInput);
128 }else{
129 c->nBytes = nBytes;
130 }
131 c->iOffset = 0; /* start tokenizing at the beginning */
132 c->iToken = 0;
133 c->pToken = NULL; /* no space allocated, yet. */
134 c->nTokenAllocated = 0;
135
136 *ppCursor = &c->base;
137 return SQLITE_OK;
138}
139
/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.  Releases the token buffer and the cursor
** itself; free(NULL) is a harmless no-op when no token was returned.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  free(c->pToken);
  free(c);
  return SQLITE_OK;
}
150
151/*
152** Extract the next token from a tokenization cursor. The cursor must
153** have been opened by a prior call to simpleOpen().
154*/
155static int simpleNext(
156 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
157 const char **ppToken, /* OUT: *ppToken is the token text */
158 int *pnBytes, /* OUT: Number of bytes in token */
159 int *piStartOffset, /* OUT: Starting offset of token */
160 int *piEndOffset, /* OUT: Ending offset of token */
161 int *piPosition /* OUT: Position integer of token */
162){
163 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
164 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
165 unsigned char *p = (unsigned char *)c->pInput;
166
167 while( c->iOffset<c->nBytes ){
168 int iStartOffset;
169
170 /* Scan past delimiter characters */
171 while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
172 c->iOffset++;
173 }
174
175 /* Count non-delimiter characters. */
176 iStartOffset = c->iOffset;
177 while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
178 c->iOffset++;
179 }
180
181 if( c->iOffset>iStartOffset ){
182 int i, n = c->iOffset-iStartOffset;
183 if( n>c->nTokenAllocated ){
184 c->nTokenAllocated = n+20;
185 c->pToken = realloc(c->pToken, c->nTokenAllocated);
186 if( c->pToken==NULL ) return SQLITE_NOMEM;
187 }
188 for(i=0; i<n; i++){
189 /* TODO(shess) This needs expansion to handle UTF-8
190 ** case-insensitivity.
191 */
192 unsigned char ch = p[iStartOffset+i];
193 c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
194 }
195 *ppToken = c->pToken;
196 *pnBytes = n;
197 *piStartOffset = iStartOffset;
198 *piEndOffset = c->iOffset;
199 *piPosition = c->iToken++;
200
201 return SQLITE_OK;
202 }
203 }
204 return SQLITE_DONE;
205}
206
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,                /* iVersion */
  simpleCreate,     /* xCreate */
  simpleDestroy,    /* xDestroy */
  simpleOpen,       /* xOpen */
  simpleClose,      /* xClose */
  simpleNext,       /* xNext */
};
218
/*
** Allocate a new simple tokenizer.  Return a pointer to the new
** tokenizer in *ppModule
**
** (No allocation actually occurs: the module is a static singleton
** and the caller must not free it.)
*/
void sqlite3Fts3SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &simpleTokenizerModule;
}
228
229#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl
deleted file mode 100644
index cfea5d2..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl
+++ /dev/null
@@ -1,116 +0,0 @@
#!/usr/bin/tclsh
#
# This script builds a single C code file holding all of FTS3 code.
# The name of the output file is fts3amal.c.  To build this file,
# first do:
#
#      make target_source
#
# The make target above moves all of the source code files into
# a subdirectory named "tsrc".  (This script expects to find the files
# there and will not work if they are not found.)
#
# After the "tsrc" directory has been created and populated, run
# this script:
#
#      tclsh mkfts3amal.tcl
#
# The amalgamated FTS3 code will be written into fts3amal.c
#

# Open the output file and write a header comment at the beginning
# of the file.
#
# NOTE(review): the banner below says "extension 2 (fts3)" -- the "2"
# looks like a leftover from the fts2 version of this script; confirm
# before editing the generated text.
set out [open fts3amal.c w]
set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1]
puts $out [subst \
{/******************************************************************************
** This file is an amalgamation of separate C source files from the SQLite
** Full Text Search extension 2 (fts3).  By combining all the individual C
** code  files into this single large file, the entire code can be compiled
** as a one translation unit.  This allows many compilers to do optimizations
** that would not be possible if the files were compiled separately.  It also
** makes the code easier to import into other projects.
**
** This amalgamation was generated on $today.
*/}]

# These are the header files used by FTS3.  The first time any of these
# files are seen in a #include statement in the C code, include the complete
# text of the file in-line.  The file only needs to be included once.
#
foreach hdr {
  fts3.h
  fts3_hash.h
  fts3_tokenizer.h
  sqlite3.h
  sqlite3ext.h
} {
  set available_hdr($hdr) 1
}

# 78 stars used for comment formatting.
set s78 \
{*****************************************************************************}
57#
58proc section_comment {text} {
59 global out s78
60 set n [string length $text]
61 set nstar [expr {60 - $n}]
62 set stars [string range $s78 0 $nstar]
63 puts $out "/************** $text $stars/"
64}
65
# Read the source file named $filename and write it into the
# amalgamation output file.  If any #include statements are seen,
# process them appropriately:
#
#   * A header listed in available_hdr is inlined, once, at the point
#     of its first #include.  The entry is cleared afterwards so the
#     header is never inlined a second time.
#   * Any other header line is emitted verbatim the first time it is
#     seen and suppressed on subsequent sightings.
#
proc copy_file {filename} {
  global seen_hdr available_hdr out
  set tail [file tail $filename]
  section_comment "Begin file $tail"
  set in [open $filename r]
  while {![eof $in]} {
    set line [gets $in]
    if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
      if {[info exists available_hdr($hdr)]} {
        if {$available_hdr($hdr)} {
          # Inline the header once only.
          set available_hdr($hdr) 0
          section_comment "Include $hdr in the middle of $tail"
          copy_file tsrc/$hdr
          section_comment "Continuing where we left off in $tail"
        }
      } elseif {![info exists seen_hdr($hdr)]} {
        set seen_hdr($hdr) 1
        puts $out $line
      }
    } elseif {[regexp {^#ifdef __cplusplus} $line]} {
      # The amalgamation is compiled as C only.
      puts $out "#if 0"
    } elseif {[regexp {^#line} $line]} {
      # Skip #line directives.
    } else {
      puts $out $line
    }
  }
  close $in
  section_comment "End of $tail"
}
99
100
# Process the source files.  Process files containing commonly
# used subroutines first in order to help the compiler find
# inlining opportunities.
#
foreach file {
  fts3.c
  fts3_hash.c
  fts3_porter.c
  fts3_tokenizer.c
  fts3_tokenizer1.c
  fts3_icu.c
} {
  copy_file tsrc/$file
}

# Flush and close the amalgamation output file.
close $out
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/README.txt
deleted file mode 100644
index 5c995cc..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/README.txt
+++ /dev/null
@@ -1,170 +0,0 @@
1
2This directory contains source code for the SQLite "ICU" extension, an
3integration of the "International Components for Unicode" library with
4SQLite. Documentation follows.
5
6 1. Features
7
8 1.1 SQL Scalars upper() and lower()
9 1.2 Unicode Aware LIKE Operator
10 1.3 ICU Collation Sequences
11 1.4 SQL REGEXP Operator
12
13 2. Compilation and Usage
14
15 3. Bugs, Problems and Security Issues
16
17 3.1 The "case_sensitive_like" Pragma
18 3.2 The SQLITE_MAX_LIKE_PATTERN_LENGTH Macro
19 3.3 Collation Sequence Security Issue
20
21
221. FEATURES
23
24 1.1 SQL Scalars upper() and lower()
25
26 SQLite's built-in implementations of these two functions only
27 provide case mapping for the 26 letters used in the English
28 language. The ICU based functions provided by this extension
29 provide case mapping, where defined, for the full range of
30 unicode characters.
31
32 ICU provides two types of case mapping, "general" case mapping and
33 "language specific". Refer to ICU documentation for the differences
34 between the two. Specifically:
35
36 http://www.icu-project.org/userguide/caseMappings.html
37 http://www.icu-project.org/userguide/posix.html#case_mappings
38
39 To utilise "general" case mapping, the upper() or lower() scalar
40 functions are invoked with one argument:
41
42         upper('abc') -> 'ABC'
43         lower('ABC') -> 'abc'
44
45 To access ICU "language specific" case mapping, upper() or lower()
46 should be invoked with two arguments. The second argument is the name
47 of the locale to use. Passing an empty string ("") or SQL NULL value
48 as the second argument is the same as invoking the 1 argument version
49 of upper() or lower():
50
51 lower('I', 'en_us') -> 'i'
52 lower('I', 'tr_tr') -> 'ı' (small dotless i)
53
54 1.2 Unicode Aware LIKE Operator
55
56 Similarly to the upper() and lower() functions, the built-in SQLite LIKE
57 operator understands case equivalence for the 26 letters of the English
58 language alphabet. The implementation of LIKE included in this
59 extension uses the ICU function u_foldCase() to provide case
60 independent comparisons for the full range of unicode characters.
61
62 The U_FOLD_CASE_DEFAULT flag is passed to u_foldCase(), meaning the
63 dotless 'I' character used in the Turkish language is considered
64 to be in the same equivalence class as the dotted 'I' character
65 used by many languages (including English).
66
67 1.3 ICU Collation Sequences
68
69 A special SQL scalar function, icu_load_collation() is provided that
70 may be used to register ICU collation sequences with SQLite. It
71 is always called with exactly two arguments, the ICU locale
72 identifying the collation sequence to ICU, and the name of the
73 SQLite collation sequence to create. For example, to create an
74 SQLite collation sequence named "turkish" using Turkish language
75 sorting rules, the SQL statement:
76
77 SELECT icu_load_collation('tr_TR', 'turkish');
78
79 Or, for Australian English:
80
81 SELECT icu_load_collation('en_AU', 'australian');
82
83 The identifiers "turkish" and "australian" may then be used
84 as collation sequence identifiers in SQL statements:
85
86 CREATE TABLE aust_turkish_penpals(
87 australian_penpal_name TEXT COLLATE australian,
88 turkish_penpal_name TEXT COLLATE turkish
89 );
90
91 1.4 SQL REGEXP Operator
92
93 This extension provides an implementation of the SQL binary
94   comparison operator "REGEXP", based on the regular expression functions
95 provided by the ICU library. The syntax of the operator is as described
96 in SQLite documentation:
97
98 <string> REGEXP <re-pattern>
99
100 This extension uses the ICU defaults for regular expression matching
101 behaviour. Specifically, this means that:
102
103 * Matching is case-sensitive,
104 * Regular expression comments are not allowed within patterns, and
105 * The '^' and '$' characters match the beginning and end of the
106 <string> argument, not the beginning and end of lines within
107 the <string> argument.
108
109 Even more specifically, the value passed to the "flags" parameter
110 of ICU C function uregex_open() is 0.
111
112
1132 COMPILATION AND USAGE
114
115 The easiest way to compile and use the ICU extension is to build
116 and use it as a dynamically loadable SQLite extension. To do this
117 using gcc on *nix:
118
119 gcc -shared icu.c `icu-config --ldflags` -o libSqliteIcu.so
120
121 You may need to add "-I" flags so that gcc can find sqlite3ext.h
122 and sqlite3.h. The resulting shared lib, libSqliteIcu.so, may be
123 loaded into sqlite in the same way as any other dynamically loadable
124 extension.
125
126
1273 BUGS, PROBLEMS AND SECURITY ISSUES
128
129 3.1 The "case_sensitive_like" Pragma
130
131 This extension does not work well with the "case_sensitive_like"
132 pragma. If this pragma is used before the ICU extension is loaded,
133 then the pragma has no effect. If the pragma is used after the ICU
134 extension is loaded, then SQLite ignores the ICU implementation and
135 always uses the built-in LIKE operator.
136
137 The ICU extension LIKE operator is always case insensitive.
138
139 3.2 The SQLITE_MAX_LIKE_PATTERN_LENGTH Macro
140
141 Passing very long patterns to the built-in SQLite LIKE operator can
142 cause a stack overflow. To curb this problem, SQLite defines the
143 SQLITE_MAX_LIKE_PATTERN_LENGTH macro as the maximum length of a
144 pattern in bytes (irrespective of encoding). The default value is
145 defined in internal header file "limits.h".
146
147 The ICU extension LIKE implementation suffers from the same
148 problem and uses the same solution. However, since the ICU extension
149 code does not include the SQLite file "limits.h", modifying
150 the default value therein does not affect the ICU extension.
151 The default value of SQLITE_MAX_LIKE_PATTERN_LENGTH used by
152 the ICU extension LIKE operator is 50000, defined in source
153 file "icu.c".
154
155 3.3 Collation Sequence Security Issue
156
157 Internally, SQLite assumes that indices stored in database files
158 are sorted according to the collation sequence indicated by the
159 SQL schema. Changing the definition of a collation sequence after
160 an index has been built is therefore equivalent to database
161 corruption. The SQLite library is not very well tested under
162 these conditions, and may contain potential buffer overruns
163 or other programming errors that could be exploited by a malicious
164 programmer.
165
166 If the ICU extension is used in an environment where potentially
167 malicious users may execute arbitrary SQL (i.e. gears), they
168 should be prevented from invoking the icu_load_collation() function,
169 possibly using the authorisation callback.
170
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/icu.c
deleted file mode 100644
index 11bb116..0000000
--- a/libraries/sqlite/unix/sqlite-3.5.1/ext/icu/icu.c
+++ /dev/null
@@ -1,499 +0,0 @@
1/*
2** 2007 May 6
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** $Id: icu.c,v 1.6 2007/06/22 15:21:16 danielk1977 Exp $
13**
14** This file implements an integration between the ICU library
15** ("International Components for Unicode", an open-source library
16** for handling unicode data) and SQLite. The integration uses
17** ICU to provide the following to SQLite:
18**
19** * An implementation of the SQL regexp() function (and hence REGEXP
20** operator) using the ICU uregex_XX() APIs.
21**
22** * Implementations of the SQL scalar upper() and lower() functions
23** for case mapping.
24**
25** * Integration of ICU and SQLite collation seqences.
26**
27** * An implementation of the LIKE operator that uses ICU to
28** provide case-independent matching.
29*/
30
31#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
32
33/* Include ICU headers */
34#include <unicode/utypes.h>
35#include <unicode/uregex.h>
36#include <unicode/ustring.h>
37#include <unicode/ucol.h>
38
39#include <assert.h>
40
41#ifndef SQLITE_CORE
42 #include "sqlite3ext.h"
43 SQLITE_EXTENSION_INIT1
44#else
45 #include "sqlite3.h"
46#endif
47
48/*
49** Maximum length (in bytes) of the pattern in a LIKE or GLOB
50** operator.
51*/
52#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
53# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
54#endif
55
/*
** Version of sqlite3_free() that is always a function, never a macro.
** Used as a destructor callback wherever SQLite requires a real
** function pointer.
*/
static void xFree(void *ptr){
  sqlite3_free(ptr);
}
62
/*
** Compare two UTF-8 strings for equality where the first string is
** a "LIKE" expression. Return true (1) if they are the same and
** false (0) if they are different.
**
** Matching is case-independent: both sides are folded with ICU's
** u_foldCase() before ordinary characters are compared.  The
** match-all wildcard ('%') is handled by recursing on successively
** shorter suffixes of zString.
*/
static int icuLikeCompare(
  const uint8_t *zPattern,    /* LIKE pattern */
  const uint8_t *zString,     /* The UTF-8 string to compare against */
  const UChar32 uEsc          /* The escape character */
){
  static const int MATCH_ONE = (UChar32)'_';
  static const int MATCH_ALL = (UChar32)'%';

  int iPattern = 0;       /* Current byte index in zPattern */
  int iString = 0;        /* Current byte index in zString */

  int prevEscape = 0;     /* True if the previous character was uEsc */

  while( zPattern[iPattern]!=0 ){

    /* Read (and consume) the next character from the input pattern. */
    UChar32 uPattern;
    U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
    assert(uPattern!=0);

    /* There are now 4 possibilities:
    **
    **     1. uPattern is an unescaped match-all character "%",
    **     2. uPattern is an unescaped match-one character "_",
    **     3. uPattern is an unescaped escape character, or
    **     4. uPattern is to be handled as an ordinary character
    */
    if( !prevEscape && uPattern==MATCH_ALL ){
      /* Case 1. */
      uint8_t c;

      /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
      ** MATCH_ALL. For each MATCH_ONE, skip one character in the
      ** test string.
      */
      while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
        if( c==MATCH_ONE ){
          if( zString[iString]==0 ) return 0;
          U8_FWD_1_UNSAFE(zString, iString);
        }
        iPattern++;
      }

      /* A trailing '%' matches any remainder of the string. */
      if( zPattern[iPattern]==0 ) return 1;

      /* Recursively try the rest of the pattern at each remaining
      ** character position of the string. */
      while( zString[iString] ){
        if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
          return 1;
        }
        U8_FWD_1_UNSAFE(zString, iString);
      }
      return 0;

    }else if( !prevEscape && uPattern==MATCH_ONE ){
      /* Case 2. */
      if( zString[iString]==0 ) return 0;
      U8_FWD_1_UNSAFE(zString, iString);

    }else if( !prevEscape && uPattern==uEsc){
      /* Case 3. */
      prevEscape = 1;

    }else{
      /* Case 4: compare one character, case-folded on both sides. */
      UChar32 uString;
      U8_NEXT_UNSAFE(zString, iString, uString);
      uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
      uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
      if( uString!=uPattern ){
        return 0;
      }
      prevEscape = 0;
    }
  }

  /* The pattern is exhausted; the match succeeds only if the string
  ** is exhausted too. */
  return zString[iString]==0;
}
145
/*
** Implementation of the like() SQL function. This function implements
** the built-in LIKE operator. The first argument to the function is the
** pattern and the second argument is the string. So, the SQL statements:
**
**       A LIKE B
**
** is implemented as like(B, A). If there is an escape character E,
**
**       A LIKE B ESCAPE E
**
** is mapped to like(B, A, E).
**
** If either the pattern or the string is an SQL NULL, no result is
** set, so the function result is NULL.
*/
static void icuLikeFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  const unsigned char *zA = sqlite3_value_text(argv[0]);  /* pattern */
  const unsigned char *zB = sqlite3_value_text(argv[1]);  /* string */
  UChar32 uEsc = 0;   /* Escape character; 0 when no ESCAPE clause */

  /* Limit the length of the LIKE or GLOB pattern to avoid problems
  ** of deep recursion and N*N behavior in patternCompare().
  */
  if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
    sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
    return;
  }


  if( argc==3 ){
    /* The escape character string must consist of a single UTF-8 character.
    ** Otherwise, return an error.
    */
    int nE= sqlite3_value_bytes(argv[2]);
    const unsigned char *zE = sqlite3_value_text(argv[2]);
    int i = 0;
    if( zE==0 ) return;
    U8_NEXT(zE, i, nE, uEsc);
    /* U8_NEXT consumed i bytes; the ESCAPE value is valid only if that
    ** was all of them. */
    if( i!=nE){
      sqlite3_result_error(context,
          "ESCAPE expression must be a single character", -1);
      return;
    }
  }

  if( zA && zB ){
    sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
  }
}
197
198/*
199** This function is called when an ICU function called from within
200** the implementation of an SQL scalar function returns an error.
201**
202** The scalar function context passed as the first argument is
203** loaded with an error message based on the following two args.
204*/
205static void icuFunctionError(
206 sqlite3_context *pCtx, /* SQLite scalar function context */
207 const char *zName, /* Name of ICU function that failed */
208 UErrorCode e /* Error code returned by ICU function */
209){
210 char zBuf[128];
211 sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
212 zBuf[127] = '\0';
213 sqlite3_result_error(pCtx, zBuf, -1);
214}
215
216/*
217** Function to delete compiled regexp objects. Registered as
218** a destructor function with sqlite3_set_auxdata().
219*/
220static void icuRegexpDelete(void *p){
221 URegularExpression *pExpr = (URegularExpression *)p;
222 uregex_close(pExpr);
223}
224
/*
** Implementation of SQLite REGEXP operator. This scalar function takes
** two arguments. The first is a regular expression pattern to compile
** the second is a string to match against that pattern. If either
** argument is an SQL NULL, then NULL is returned. Otherwise, the result
** is 1 if the string matches the pattern, or 0 otherwise.
**
** SQLite maps the regexp() function to the regexp() operator such
** that the following two are equivalent:
**
**     zString REGEXP zPattern
**     regexp(zPattern, zString)
**
** Uses the following ICU regexp APIs:
**
**     uregex_open()
**     uregex_matches()
**     uregex_close()
**
** The compiled pattern is cached via sqlite3_set_auxdata() so that
** repeated calls with the same pattern (the usual case for the REGEXP
** operator) do not recompile it; icuRegexpDelete() releases the
** cached object when SQLite discards the auxiliary data.
*/
static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
  UErrorCode status = U_ZERO_ERROR;
  URegularExpression *pExpr;
  UBool res;
  const UChar *zString = sqlite3_value_text16(apArg[1]);

  /* If the left hand side of the regexp operator is NULL,
  ** then the result is also NULL.
  */
  if( !zString ){
    return;
  }

  /* Reuse a previously compiled pattern if one is cached; otherwise
  ** compile the pattern now and cache it for later calls. */
  pExpr = sqlite3_get_auxdata(p, 0);
  if( !pExpr ){
    const UChar *zPattern = sqlite3_value_text16(apArg[0]);
    if( !zPattern ){
      return;
    }
    pExpr = uregex_open(zPattern, -1, 0, 0, &status);

    if( U_SUCCESS(status) ){
      sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
    }else{
      /* uregex_open() returns NULL on failure. */
      assert(!pExpr);
      icuFunctionError(p, "uregex_open", status);
      return;
    }
  }

  /* Configure the text that the regular expression operates on. */
  uregex_setText(pExpr, zString, -1, &status);
  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "uregex_setText", status);
    return;
  }

  /* Attempt the match */
  res = uregex_matches(pExpr, 0, &status);
  if( !U_SUCCESS(status) ){
    icuFunctionError(p, "uregex_matches", status);
    return;
  }

  /* Set the text that the regular expression operates on to a NULL
  ** pointer. This is not really necessary, but it is tidier than
  ** leaving the regular expression object configured with an invalid
  ** pointer after this function returns.
  */
  uregex_setText(pExpr, 0, 0, &status);

  /* Return 1 or 0. */
  sqlite3_result_int(p, res ? 1 : 0);
}
298
299/*
300** Implementations of scalar functions for case mapping - upper() and
301** lower(). Function upper() converts it's input to upper-case (ABC).
302** Function lower() converts to lower-case (abc).
303**
304** ICU provides two types of case mapping, "general" case mapping and
305** "language specific". Refer to ICU documentation for the differences
306** between the two.
307**
308** To utilise "general" case mapping, the upper() or lower() scalar
309** functions are invoked with one argument:
310**
311** upper('ABC') -> 'abc'
312** lower('abc') -> 'ABC'
313**
314** To access ICU "language specific" case mapping, upper() or lower()
315** should be invoked with two arguments. The second argument is the name
316** of the locale to use. Passing an empty string ("") or SQL NULL value
317** as the second argument is the same as invoking the 1 argument version
318** of upper() or lower().
319**
320** lower('I', 'en_us') -> 'i'
321** lower('I', 'tr_tr') -> 'ı' (small dotless i)
322**
323** http://www.icu-project.org/userguide/posix.html#case_mappings
324*/
325static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
326 const UChar *zInput;
327 UChar *zOutput;
328 int nInput;
329 int nOutput;
330
331 UErrorCode status = U_ZERO_ERROR;
332 const char *zLocale = 0;
333
334 assert(nArg==1 || nArg==2);
335 if( nArg==2 ){
336 zLocale = (const char *)sqlite3_value_text(apArg[1]);
337 }
338
339 zInput = sqlite3_value_text16(apArg[0]);
340 if( !zInput ){
341 return;
342 }
343 nInput = sqlite3_value_bytes16(apArg[0]);
344
345 nOutput = nInput * 2 + 2;
346 zOutput = sqlite3_malloc(nOutput);
347 if( !zOutput ){
348 return;
349 }
350
351 if( sqlite3_user_data(p) ){
352 u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
353 }else{
354 u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
355 }
356
357 if( !U_SUCCESS(status) ){
358 icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
359 return;
360 }
361
362 sqlite3_result_text16(p, zOutput, -1, xFree);
363}
364
365/*
366** Collation sequence destructor function. The pCtx argument points to
367** a UCollator structure previously allocated using ucol_open().
368*/
369static void icuCollationDel(void *pCtx){
370 UCollator *p = (UCollator *)pCtx;
371 ucol_close(p);
372}
373
374/*
375** Collation sequence comparison function. The pCtx argument points to
376** a UCollator structure previously allocated using ucol_open().
377*/
378static int icuCollationColl(
379 void *pCtx,
380 int nLeft,
381 const void *zLeft,
382 int nRight,
383 const void *zRight
384){
385 UCollationResult res;
386 UCollator *p = (UCollator *)pCtx;
387 res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
388 switch( res ){
389 case UCOL_LESS: return -1;
390 case UCOL_GREATER: return +1;
391 case UCOL_EQUAL: return 0;
392 }
393 assert(!"Unexpected return value from ucol_strcoll()");
394 return 0;
395}
396
397/*
398** Implementation of the scalar function icu_load_collation().
399**
400** This scalar function is used to add ICU collation based collation
401** types to an SQLite database connection. It is intended to be called
402** as follows:
403**
404** SELECT icu_load_collation(<locale>, <collation-name>);
405**
406** Where <locale> is a string containing an ICU locale identifier (i.e.
407** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
408** collation sequence to create.
409*/
410static void icuLoadCollation(
411 sqlite3_context *p,
412 int nArg,
413 sqlite3_value **apArg
414){
415 sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
416 UErrorCode status = U_ZERO_ERROR;
417 const char *zLocale; /* Locale identifier - (eg. "jp_JP") */
418 const char *zName; /* SQL Collation sequence name (eg. "japanese") */
419 UCollator *pUCollator; /* ICU library collation object */
420 int rc; /* Return code from sqlite3_create_collation_x() */
421
422 assert(nArg==2);
423 zLocale = (const char *)sqlite3_value_text(apArg[0]);
424 zName = (const char *)sqlite3_value_text(apArg[1]);
425
426 if( !zLocale || !zName ){
427 return;
428 }
429
430 pUCollator = ucol_open(zLocale, &status);
431 if( !U_SUCCESS(status) ){
432 icuFunctionError(p, "ucol_open", status);
433 return;
434 }
435 assert(p);
436
437 rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
438 icuCollationColl, icuCollationDel
439 );
440 if( rc!=SQLITE_OK ){
441 ucol_close(pUCollator);
442 sqlite3_result_error(p, "Error registering collation function", -1);
443 }
444}
445
446/*
447** Register the ICU extension functions with database db.
448*/
449int sqlite3IcuInit(sqlite3 *db){
450 struct IcuScalar {
451 const char *zName; /* Function name */
452 int nArg; /* Number of arguments */
453 int enc; /* Optimal text encoding */
454 void *pContext; /* sqlite3_user_data() context */
455 void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
456 } scalars[] = {
457 {"regexp",-1, SQLITE_ANY, 0, icuRegexpFunc},
458
459 {"lower", 1, SQLITE_UTF16, 0, icuCaseFunc16},
460 {"lower", 2, SQLITE_UTF16, 0, icuCaseFunc16},
461 {"upper", 1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
462 {"upper", 2, SQLITE_UTF16, (void*)1, icuCaseFunc16},
463
464 {"lower", 1, SQLITE_UTF8, 0, icuCaseFunc16},
465 {"lower", 2, SQLITE_UTF8, 0, icuCaseFunc16},
466 {"upper", 1, SQLITE_UTF8, (void*)1, icuCaseFunc16},
467 {"upper", 2, SQLITE_UTF8, (void*)1, icuCaseFunc16},
468
469 {"like", 2, SQLITE_UTF8, 0, icuLikeFunc},
470 {"like", 3, SQLITE_UTF8, 0, icuLikeFunc},
471
472 {"icu_load_collation", 2, SQLITE_UTF8, (void*)db, icuLoadCollation},
473 };
474
475 int rc = SQLITE_OK;
476 int i;
477
478 for(i=0; rc==SQLITE_OK && i<(sizeof(scalars)/sizeof(struct IcuScalar)); i++){
479 struct IcuScalar *p = &scalars[i];
480 rc = sqlite3_create_function(
481 db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0
482 );
483 }
484
485 return rc;
486}
487
#if !SQLITE_CORE
/*
** Loadable-extension entry point, compiled only when this file is
** built as a standalone extension rather than into the SQLite core.
** Registers the ICU functions on the supplied connection.
*/
int sqlite3_extension_init(
  sqlite3 *db,                      /* Database connection */
  char **pzErrMsg,                  /* OUT: error message (not set here) */
  const sqlite3_api_routines *pApi  /* SQLite API routine vector */
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3IcuInit(db);
}
#endif
498
499#endif