aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1
diff options
context:
space:
mode:
Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext/fts1')
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt2
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c3344
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h11
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c369
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h112
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c643
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h90
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c221
8 files changed, 4792 insertions, 0 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt
new file mode 100644
index 0000000..292b7da
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/README.txt
@@ -0,0 +1,2 @@
1This folder contains source code to the first full-text search
2extension for SQLite.
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c
new file mode 100644
index 0000000..5a69965
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.c
@@ -0,0 +1,3344 @@
1/* fts1 has a design flaw which can lead to database corruption (see
2** below). It is recommended not to use it any longer, instead use
3** fts3 (or higher). If you believe that your use of fts1 is safe,
4** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
5*/
6#ifndef SQLITE_ENABLE_BROKEN_FTS1
7#error fts1 has a design flaw and has been deprecated.
8#endif
9/* The flaw is that fts1 uses the content table's unaliased rowid as
10** the unique docid. fts1 embeds the rowid in the index it builds,
11** and expects the rowid to not change. The SQLite VACUUM operation
12** will renumber such rowids, thereby breaking fts1. If you are using
13** fts1 in a system which has disabled VACUUM, then you can continue
14** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
15** VACUUM, though systems using auto_vacuum are unlikely to invoke
16** VACUUM.
17**
18** fts1 should be safe even across VACUUM if you only insert documents
19** and never delete.
20*/
21
22/* The author disclaims copyright to this source code.
23 *
24 * This is an SQLite module implementing full-text search.
25 */
26
27/*
28** The code in this file is only compiled if:
29**
30** * The FTS1 module is being built as an extension
31** (in which case SQLITE_CORE is not defined), or
32**
33** * The FTS1 module is being built into the core of
34** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
35*/
36#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
37
38#if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
39# define SQLITE_CORE 1
40#endif
41
42#include <assert.h>
43#include <stdlib.h>
44#include <stdio.h>
45#include <string.h>
46#include <ctype.h>
47
48#include "fts1.h"
49#include "fts1_hash.h"
50#include "fts1_tokenizer.h"
51#include "sqlite3.h"
52#include "sqlite3ext.h"
53SQLITE_EXTENSION_INIT1
54
55
56#if 0
57# define TRACE(A) printf A; fflush(stdout)
58#else
59# define TRACE(A)
60#endif
61
62/* utility functions */
63
64typedef struct StringBuffer {
65 int len; /* length, not including null terminator */
66 int alloced; /* Space allocated for s[] */
67 char *s; /* Content of the string */
68} StringBuffer;
69
70static void initStringBuffer(StringBuffer *sb){
71 sb->len = 0;
72 sb->alloced = 100;
73 sb->s = malloc(100);
74 sb->s[0] = '\0';
75}
76
77static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
78 if( sb->len + nFrom >= sb->alloced ){
79 sb->alloced = sb->len + nFrom + 100;
80 sb->s = realloc(sb->s, sb->alloced+1);
81 if( sb->s==0 ){
82 initStringBuffer(sb);
83 return;
84 }
85 }
86 memcpy(sb->s + sb->len, zFrom, nFrom);
87 sb->len += nFrom;
88 sb->s[sb->len] = 0;
89}
90static void append(StringBuffer *sb, const char *zFrom){
91 nappend(sb, zFrom, strlen(zFrom));
92}
93
94/* We encode variable-length integers in little-endian order using seven bits
95 * per byte as follows:
96**
97** KEY:
98** A = 0xxxxxxx 7 bits of data and one flag bit
99** B = 1xxxxxxx 7 bits of data and one flag bit
100**
101** 7 bits - A
102** 14 bits - BA
103** 21 bits - BBA
104** and so on.
105*/
106
107/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
108#define VARINT_MAX 10
109
110/* Write a 64-bit variable-length integer to memory starting at p[0].
111 * The length of data written will be between 1 and VARINT_MAX bytes.
112 * The number of bytes written is returned. */
113static int putVarint(char *p, sqlite_int64 v){
114 unsigned char *q = (unsigned char *) p;
115 sqlite_uint64 vu = v;
116 do{
117 *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
118 vu >>= 7;
119 }while( vu!=0 );
120 q[-1] &= 0x7f; /* turn off high bit in final byte */
121 assert( q - (unsigned char *)p <= VARINT_MAX );
122 return (int) (q - (unsigned char *)p);
123}
124
125/* Read a 64-bit variable-length integer from memory starting at p[0].
126 * Return the number of bytes read, or 0 on error.
127 * The value is stored in *v. */
128static int getVarint(const char *p, sqlite_int64 *v){
129 const unsigned char *q = (const unsigned char *) p;
130 sqlite_uint64 x = 0, y = 1;
131 while( (*q & 0x80) == 0x80 ){
132 x += y * (*q++ & 0x7f);
133 y <<= 7;
134 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
135 assert( 0 );
136 return 0;
137 }
138 }
139 x += y * (*q++);
140 *v = (sqlite_int64) x;
141 return (int) (q - (unsigned char *)p);
142}
143
144static int getVarint32(const char *p, int *pi){
145 sqlite_int64 i;
146 int ret = getVarint(p, &i);
147 *pi = (int) i;
148 assert( *pi==i );
149 return ret;
150}
151
152/*** Document lists ***
153 *
154 * A document list holds a sorted list of varint-encoded document IDs.
155 *
156 * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
157 *
158 * array {
159 * varint docid;
160 * array {
161 * varint position; (delta from previous position plus POS_BASE)
162 * varint startOffset; (delta from previous startOffset)
163 * varint endOffset; (delta from startOffset)
164 * }
165 * }
166 *
167 * Here, array { X } means zero or more occurrences of X, adjacent in memory.
168 *
169 * A position list may hold positions for text in multiple columns. A position
170 * POS_COLUMN is followed by a varint containing the index of the column for
171 * following positions in the list. Any positions appearing before any
172 * occurrences of POS_COLUMN are for column 0.
173 *
174 * A doclist with type DL_POSITIONS is like the above, but holds only docids
175 * and positions without offset information.
176 *
177 * A doclist with type DL_DOCIDS is like the above, but holds only docids
178 * without positions or offset information.
179 *
180 * On disk, every document list has positions and offsets, so we don't bother
181 * to serialize a doclist's type.
182 *
183 * We don't yet delta-encode document IDs; doing so will probably be a
184 * modest win.
185 *
186 * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
187 * After the first offset, estimate the next offset by using the
188 * current token position and the previous token position and offset,
189 * offset to handle some variance. So the estimate would be
190 * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
191 * as normal. Offsets more than 64 chars from the estimate are
192 * encoded as the delta to the previous start offset + 128. An
193 * additional tiny increment can be gained by using the end offset of
194 * the previous token to make the estimate a tiny bit more precise.
195*/
196
197/* It is not safe to call isspace(), tolower(), or isalnum() on
198** hi-bit-set characters. This is the same solution used in the
199** tokenizer.
200*/
201/* TODO(shess) The snippet-generation code should be using the
202** tokenizer-generated tokens rather than doing its own local
203** tokenization.
204*/
205/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
206static int safe_isspace(char c){
207 return (c&0x80)==0 ? isspace(c) : 0;
208}
209static int safe_tolower(char c){
210 return (c&0x80)==0 ? tolower(c) : c;
211}
212static int safe_isalnum(char c){
213 return (c&0x80)==0 ? isalnum(c) : 0;
214}
215
216typedef enum DocListType {
217 DL_DOCIDS, /* docids only */
218 DL_POSITIONS, /* docids + positions */
219 DL_POSITIONS_OFFSETS /* docids + positions + offsets */
220} DocListType;
221
222/*
223** By default, only positions and not offsets are stored in the doclists.
224** To change this so that offsets are stored too, compile with
225**
226** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
227**
228*/
229#ifndef DL_DEFAULT
230# define DL_DEFAULT DL_POSITIONS
231#endif
232
233typedef struct DocList {
234 char *pData;
235 int nData;
236 DocListType iType;
237 int iLastColumn; /* the last column written */
238 int iLastPos; /* the last position written */
239 int iLastOffset; /* the last start offset written */
240} DocList;
241
242enum {
243 POS_END = 0, /* end of this position list */
244 POS_COLUMN, /* followed by new column number */
245 POS_BASE
246};
247
248/* Initialize a new DocList to hold the given data. */
249static void docListInit(DocList *d, DocListType iType,
250 const char *pData, int nData){
251 d->nData = nData;
252 if( nData>0 ){
253 d->pData = malloc(nData);
254 memcpy(d->pData, pData, nData);
255 } else {
256 d->pData = NULL;
257 }
258 d->iType = iType;
259 d->iLastColumn = 0;
260 d->iLastPos = d->iLastOffset = 0;
261}
262
263/* Create a new dynamically-allocated DocList. */
264static DocList *docListNew(DocListType iType){
265 DocList *d = (DocList *) malloc(sizeof(DocList));
266 docListInit(d, iType, 0, 0);
267 return d;
268}
269
270static void docListDestroy(DocList *d){
271 free(d->pData);
272#ifndef NDEBUG
273 memset(d, 0x55, sizeof(*d));
274#endif
275}
276
277static void docListDelete(DocList *d){
278 docListDestroy(d);
279 free(d);
280}
281
282static char *docListEnd(DocList *d){
283 return d->pData + d->nData;
284}
285
286/* Append a varint to a DocList's data. */
287static void appendVarint(DocList *d, sqlite_int64 i){
288 char c[VARINT_MAX];
289 int n = putVarint(c, i);
290 d->pData = realloc(d->pData, d->nData + n);
291 memcpy(d->pData + d->nData, c, n);
292 d->nData += n;
293}
294
295static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
296 appendVarint(d, iDocid);
297 if( d->iType>=DL_POSITIONS ){
298 appendVarint(d, POS_END); /* initially empty position list */
299 d->iLastColumn = 0;
300 d->iLastPos = d->iLastOffset = 0;
301 }
302}
303
304/* helper function for docListAddPos and docListAddPosOffset */
305static void addPos(DocList *d, int iColumn, int iPos){
306 assert( d->nData>0 );
307 --d->nData; /* remove previous terminator */
308 if( iColumn!=d->iLastColumn ){
309 assert( iColumn>d->iLastColumn );
310 appendVarint(d, POS_COLUMN);
311 appendVarint(d, iColumn);
312 d->iLastColumn = iColumn;
313 d->iLastPos = d->iLastOffset = 0;
314 }
315 assert( iPos>=d->iLastPos );
316 appendVarint(d, iPos-d->iLastPos+POS_BASE);
317 d->iLastPos = iPos;
318}
319
320/* Add a position to the last position list in a doclist. */
321static void docListAddPos(DocList *d, int iColumn, int iPos){
322 assert( d->iType==DL_POSITIONS );
323 addPos(d, iColumn, iPos);
324 appendVarint(d, POS_END); /* add new terminator */
325}
326
327/*
328** Add a position and starting and ending offsets to a doclist.
329**
330** If the doclist is setup to handle only positions, then insert
331** the position only and ignore the offsets.
332*/
333static void docListAddPosOffset(
334 DocList *d, /* Doclist under construction */
335 int iColumn, /* Column the inserted term is part of */
336 int iPos, /* Position of the inserted term */
337 int iStartOffset, /* Starting offset of inserted term */
338 int iEndOffset /* Ending offset of inserted term */
339){
340 assert( d->iType>=DL_POSITIONS );
341 addPos(d, iColumn, iPos);
342 if( d->iType==DL_POSITIONS_OFFSETS ){
343 assert( iStartOffset>=d->iLastOffset );
344 appendVarint(d, iStartOffset-d->iLastOffset);
345 d->iLastOffset = iStartOffset;
346 assert( iEndOffset>=iStartOffset );
347 appendVarint(d, iEndOffset-iStartOffset);
348 }
349 appendVarint(d, POS_END); /* add new terminator */
350}
351
352/*
353** A DocListReader object is a cursor into a doclist. Initialize
354** the cursor to the beginning of the doclist by calling readerInit().
355** Then use routines
356**
357** peekDocid()
358** readDocid()
359** readPosition()
360** skipPositionList()
361** and so forth...
362**
363** to read information out of the doclist. When we reach the end
364** of the doclist, atEnd() returns TRUE.
365*/
366typedef struct DocListReader {
367 DocList *pDoclist; /* The document list we are stepping through */
368 char *p; /* Pointer to next unread byte in the doclist */
369 int iLastColumn;
370 int iLastPos; /* the last position read, or -1 when not in a position list */
371} DocListReader;
372
373/*
374** Initialize the DocListReader r to point to the beginning of pDoclist.
375*/
376static void readerInit(DocListReader *r, DocList *pDoclist){
377 r->pDoclist = pDoclist;
378 if( pDoclist!=NULL ){
379 r->p = pDoclist->pData;
380 }
381 r->iLastColumn = -1;
382 r->iLastPos = -1;
383}
384
385/*
386** Return TRUE if we have reached then end of pReader and there is
387** nothing else left to read.
388*/
389static int atEnd(DocListReader *pReader){
390 return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
391}
392
393/* Peek at the next docid without advancing the read pointer.
394*/
395static sqlite_int64 peekDocid(DocListReader *pReader){
396 sqlite_int64 ret;
397 assert( !atEnd(pReader) );
398 assert( pReader->iLastPos==-1 );
399 getVarint(pReader->p, &ret);
400 return ret;
401}
402
403/* Read the next docid. See also nextDocid().
404*/
405static sqlite_int64 readDocid(DocListReader *pReader){
406 sqlite_int64 ret;
407 assert( !atEnd(pReader) );
408 assert( pReader->iLastPos==-1 );
409 pReader->p += getVarint(pReader->p, &ret);
410 if( pReader->pDoclist->iType>=DL_POSITIONS ){
411 pReader->iLastColumn = 0;
412 pReader->iLastPos = 0;
413 }
414 return ret;
415}
416
417/* Read the next position and column index from a position list.
418 * Returns the position, or -1 at the end of the list. */
419static int readPosition(DocListReader *pReader, int *iColumn){
420 int i;
421 int iType = pReader->pDoclist->iType;
422
423 if( pReader->iLastPos==-1 ){
424 return -1;
425 }
426 assert( !atEnd(pReader) );
427
428 if( iType<DL_POSITIONS ){
429 return -1;
430 }
431 pReader->p += getVarint32(pReader->p, &i);
432 if( i==POS_END ){
433 pReader->iLastColumn = pReader->iLastPos = -1;
434 *iColumn = -1;
435 return -1;
436 }
437 if( i==POS_COLUMN ){
438 pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
439 pReader->iLastPos = 0;
440 pReader->p += getVarint32(pReader->p, &i);
441 assert( i>=POS_BASE );
442 }
443 pReader->iLastPos += ((int) i)-POS_BASE;
444 if( iType>=DL_POSITIONS_OFFSETS ){
445 /* Skip over offsets, ignoring them for now. */
446 int iStart, iEnd;
447 pReader->p += getVarint32(pReader->p, &iStart);
448 pReader->p += getVarint32(pReader->p, &iEnd);
449 }
450 *iColumn = pReader->iLastColumn;
451 return pReader->iLastPos;
452}
453
454/* Skip past the end of a position list. */
455static void skipPositionList(DocListReader *pReader){
456 DocList *p = pReader->pDoclist;
457 if( p && p->iType>=DL_POSITIONS ){
458 int iColumn;
459 while( readPosition(pReader, &iColumn)!=-1 ){}
460 }
461}
462
463/* Skip over a docid, including its position list if the doclist has
464 * positions. */
465static void skipDocument(DocListReader *pReader){
466 readDocid(pReader);
467 skipPositionList(pReader);
468}
469
470/* Skip past all docids which are less than [iDocid]. Returns 1 if a docid
471 * matching [iDocid] was found. */
472static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
473 sqlite_int64 d = 0;
474 while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
475 skipDocument(pReader);
476 }
477 return !atEnd(pReader) && d==iDocid;
478}
479
480/* Return the first document in a document list.
481*/
482static sqlite_int64 firstDocid(DocList *d){
483 DocListReader r;
484 readerInit(&r, d);
485 return readDocid(&r);
486}
487
488#ifdef SQLITE_DEBUG
489/*
490** This routine is used for debugging purpose only.
491**
492** Write the content of a doclist to standard output.
493*/
494static void printDoclist(DocList *p){
495 DocListReader r;
496 const char *zSep = "";
497
498 readerInit(&r, p);
499 while( !atEnd(&r) ){
500 sqlite_int64 docid = readDocid(&r);
501 if( docid==0 ){
502 skipPositionList(&r);
503 continue;
504 }
505 printf("%s%lld", zSep, docid);
506 zSep = ",";
507 if( p->iType>=DL_POSITIONS ){
508 int iPos, iCol;
509 const char *zDiv = "";
510 printf("(");
511 while( (iPos = readPosition(&r, &iCol))>=0 ){
512 printf("%s%d:%d", zDiv, iCol, iPos);
513 zDiv = ":";
514 }
515 printf(")");
516 }
517 }
518 printf("\n");
519 fflush(stdout);
520}
521#endif /* SQLITE_DEBUG */
522
523/* Trim the given doclist to contain only positions in column
524 * [iRestrictColumn]. */
525static void docListRestrictColumn(DocList *in, int iRestrictColumn){
526 DocListReader r;
527 DocList out;
528
529 assert( in->iType>=DL_POSITIONS );
530 readerInit(&r, in);
531 docListInit(&out, DL_POSITIONS, NULL, 0);
532
533 while( !atEnd(&r) ){
534 sqlite_int64 iDocid = readDocid(&r);
535 int iPos, iColumn;
536
537 docListAddDocid(&out, iDocid);
538 while( (iPos = readPosition(&r, &iColumn)) != -1 ){
539 if( iColumn==iRestrictColumn ){
540 docListAddPos(&out, iColumn, iPos);
541 }
542 }
543 }
544
545 docListDestroy(in);
546 *in = out;
547}
548
549/* Trim the given doclist by discarding any docids without any remaining
550 * positions. */
551static void docListDiscardEmpty(DocList *in) {
552 DocListReader r;
553 DocList out;
554
555 /* TODO: It would be nice to implement this operation in place; that
556 * could save a significant amount of memory in queries with long doclists. */
557 assert( in->iType>=DL_POSITIONS );
558 readerInit(&r, in);
559 docListInit(&out, DL_POSITIONS, NULL, 0);
560
561 while( !atEnd(&r) ){
562 sqlite_int64 iDocid = readDocid(&r);
563 int match = 0;
564 int iPos, iColumn;
565 while( (iPos = readPosition(&r, &iColumn)) != -1 ){
566 if( !match ){
567 docListAddDocid(&out, iDocid);
568 match = 1;
569 }
570 docListAddPos(&out, iColumn, iPos);
571 }
572 }
573
574 docListDestroy(in);
575 *in = out;
576}
577
578/* Helper function for docListUpdate() and docListAccumulate().
579** Splices a doclist element into the doclist represented by r,
580** leaving r pointing after the newly spliced element.
581*/
582static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
583 const char *pSource, int nSource){
584 DocList *d = r->pDoclist;
585 char *pTarget;
586 int nTarget, found;
587
588 found = skipToDocid(r, iDocid);
589
590 /* Describe slice in d to place pSource/nSource. */
591 pTarget = r->p;
592 if( found ){
593 skipDocument(r);
594 nTarget = r->p-pTarget;
595 }else{
596 nTarget = 0;
597 }
598
599 /* The sense of the following is that there are three possibilities.
600 ** If nTarget==nSource, we should not move any memory nor realloc.
601 ** If nTarget>nSource, trim target and realloc.
602 ** If nTarget<nSource, realloc then expand target.
603 */
604 if( nTarget>nSource ){
605 memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
606 }
607 if( nTarget!=nSource ){
608 int iDoclist = pTarget-d->pData;
609 d->pData = realloc(d->pData, d->nData+nSource-nTarget);
610 pTarget = d->pData+iDoclist;
611 }
612 if( nTarget<nSource ){
613 memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
614 }
615
616 memcpy(pTarget, pSource, nSource);
617 d->nData += nSource-nTarget;
618 r->p = pTarget+nSource;
619}
620
621/* Insert/update pUpdate into the doclist. */
622static void docListUpdate(DocList *d, DocList *pUpdate){
623 DocListReader reader;
624
625 assert( d!=NULL && pUpdate!=NULL );
626 assert( d->iType==pUpdate->iType);
627
628 readerInit(&reader, d);
629 docListSpliceElement(&reader, firstDocid(pUpdate),
630 pUpdate->pData, pUpdate->nData);
631}
632
633/* Propagate elements from pUpdate to pAcc, overwriting elements with
634** matching docids.
635*/
636static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
637 DocListReader accReader, updateReader;
638
639 /* Handle edge cases where one doclist is empty. */
640 assert( pAcc!=NULL );
641 if( pUpdate==NULL || pUpdate->nData==0 ) return;
642 if( pAcc->nData==0 ){
643 pAcc->pData = malloc(pUpdate->nData);
644 memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
645 pAcc->nData = pUpdate->nData;
646 return;
647 }
648
649 readerInit(&accReader, pAcc);
650 readerInit(&updateReader, pUpdate);
651
652 while( !atEnd(&updateReader) ){
653 char *pSource = updateReader.p;
654 sqlite_int64 iDocid = readDocid(&updateReader);
655 skipPositionList(&updateReader);
656 docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
657 }
658}
659
660/*
661** Read the next docid off of pIn. Return 0 if we reach the end.
662*
663* TODO: This assumes that docids are never 0, but they may actually be 0 since
664* users can choose docids when inserting into a full-text table. Fix this.
665*/
666static sqlite_int64 nextDocid(DocListReader *pIn){
667 skipPositionList(pIn);
668 return atEnd(pIn) ? 0 : readDocid(pIn);
669}
670
671/*
672** pLeft and pRight are two DocListReaders that are pointing to
673** positions lists of the same document: iDocid.
674**
675** If there are no instances in pLeft or pRight where the position
676** of pLeft is one less than the position of pRight, then this
677** routine adds nothing to pOut.
678**
679** If there are one or more instances where positions from pLeft
680** are exactly one less than positions from pRight, then add a new
681** document record to pOut. If pOut wants to hold positions, then
682** include the positions from pRight that are one more than a
683** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
684**
685** pLeft and pRight are left pointing at the next document record.
686*/
687static void mergePosList(
688 DocListReader *pLeft, /* Left position list */
689 DocListReader *pRight, /* Right position list */
690 sqlite_int64 iDocid, /* The docid from pLeft and pRight */
691 DocList *pOut /* Write the merged document record here */
692){
693 int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
694 int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
695 int match = 0;
696
697 /* Loop until we've reached the end of both position lists. */
698 while( iLeftPos!=-1 && iRightPos!=-1 ){
699 if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
700 if( !match ){
701 docListAddDocid(pOut, iDocid);
702 match = 1;
703 }
704 if( pOut->iType>=DL_POSITIONS ){
705 docListAddPos(pOut, iRightCol, iRightPos);
706 }
707 iLeftPos = readPosition(pLeft, &iLeftCol);
708 iRightPos = readPosition(pRight, &iRightCol);
709 }else if( iRightCol<iLeftCol ||
710 (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
711 iRightPos = readPosition(pRight, &iRightCol);
712 }else{
713 iLeftPos = readPosition(pLeft, &iLeftCol);
714 }
715 }
716 if( iLeftPos>=0 ) skipPositionList(pLeft);
717 if( iRightPos>=0 ) skipPositionList(pRight);
718}
719
720/* We have two doclists: pLeft and pRight.
721** Write the phrase intersection of these two doclists into pOut.
722**
723** A phrase intersection means that two documents only match
724** if pLeft.iPos+1==pRight.iPos.
725**
726** The output pOut may or may not contain positions. If pOut
727** does contain positions, they are the positions of pRight.
728*/
729static void docListPhraseMerge(
730 DocList *pLeft, /* Doclist resulting from the words on the left */
731 DocList *pRight, /* Doclist for the next word to the right */
732 DocList *pOut /* Write the combined doclist here */
733){
734 DocListReader left, right;
735 sqlite_int64 docidLeft, docidRight;
736
737 readerInit(&left, pLeft);
738 readerInit(&right, pRight);
739 docidLeft = nextDocid(&left);
740 docidRight = nextDocid(&right);
741
742 while( docidLeft>0 && docidRight>0 ){
743 if( docidLeft<docidRight ){
744 docidLeft = nextDocid(&left);
745 }else if( docidRight<docidLeft ){
746 docidRight = nextDocid(&right);
747 }else{
748 mergePosList(&left, &right, docidLeft, pOut);
749 docidLeft = nextDocid(&left);
750 docidRight = nextDocid(&right);
751 }
752 }
753}
754
755/* We have two doclists: pLeft and pRight.
756** Write the intersection of these two doclists into pOut.
757** Only docids are matched. Position information is ignored.
758**
759** The output pOut never holds positions.
760*/
761static void docListAndMerge(
762 DocList *pLeft, /* Doclist resulting from the words on the left */
763 DocList *pRight, /* Doclist for the next word to the right */
764 DocList *pOut /* Write the combined doclist here */
765){
766 DocListReader left, right;
767 sqlite_int64 docidLeft, docidRight;
768
769 assert( pOut->iType<DL_POSITIONS );
770
771 readerInit(&left, pLeft);
772 readerInit(&right, pRight);
773 docidLeft = nextDocid(&left);
774 docidRight = nextDocid(&right);
775
776 while( docidLeft>0 && docidRight>0 ){
777 if( docidLeft<docidRight ){
778 docidLeft = nextDocid(&left);
779 }else if( docidRight<docidLeft ){
780 docidRight = nextDocid(&right);
781 }else{
782 docListAddDocid(pOut, docidLeft);
783 docidLeft = nextDocid(&left);
784 docidRight = nextDocid(&right);
785 }
786 }
787}
788
789/* We have two doclists: pLeft and pRight.
790** Write the union of these two doclists into pOut.
791** Only docids are matched. Position information is ignored.
792**
793** The output pOut never holds positions.
794*/
795static void docListOrMerge(
796 DocList *pLeft, /* Doclist resulting from the words on the left */
797 DocList *pRight, /* Doclist for the next word to the right */
798 DocList *pOut /* Write the combined doclist here */
799){
800 DocListReader left, right;
801 sqlite_int64 docidLeft, docidRight, priorLeft;
802
803 readerInit(&left, pLeft);
804 readerInit(&right, pRight);
805 docidLeft = nextDocid(&left);
806 docidRight = nextDocid(&right);
807
808 while( docidLeft>0 && docidRight>0 ){
809 if( docidLeft<=docidRight ){
810 docListAddDocid(pOut, docidLeft);
811 }else{
812 docListAddDocid(pOut, docidRight);
813 }
814 priorLeft = docidLeft;
815 if( docidLeft<=docidRight ){
816 docidLeft = nextDocid(&left);
817 }
818 if( docidRight>0 && docidRight<=priorLeft ){
819 docidRight = nextDocid(&right);
820 }
821 }
822 while( docidLeft>0 ){
823 docListAddDocid(pOut, docidLeft);
824 docidLeft = nextDocid(&left);
825 }
826 while( docidRight>0 ){
827 docListAddDocid(pOut, docidRight);
828 docidRight = nextDocid(&right);
829 }
830}
831
832/* We have two doclists: pLeft and pRight.
833** Write into pOut all documents that occur in pLeft but not
834** in pRight.
835**
836** Only docids are matched. Position information is ignored.
837**
838** The output pOut never holds positions.
839*/
840static void docListExceptMerge(
841 DocList *pLeft, /* Doclist resulting from the words on the left */
842 DocList *pRight, /* Doclist for the next word to the right */
843 DocList *pOut /* Write the combined doclist here */
844){
845 DocListReader left, right;
846 sqlite_int64 docidLeft, docidRight, priorLeft;
847
848 readerInit(&left, pLeft);
849 readerInit(&right, pRight);
850 docidLeft = nextDocid(&left);
851 docidRight = nextDocid(&right);
852
853 while( docidLeft>0 && docidRight>0 ){
854 priorLeft = docidLeft;
855 if( docidLeft<docidRight ){
856 docListAddDocid(pOut, docidLeft);
857 }
858 if( docidLeft<=docidRight ){
859 docidLeft = nextDocid(&left);
860 }
861 if( docidRight>0 && docidRight<=priorLeft ){
862 docidRight = nextDocid(&right);
863 }
864 }
865 while( docidLeft>0 ){
866 docListAddDocid(pOut, docidLeft);
867 docidLeft = nextDocid(&left);
868 }
869}
870
871static char *string_dup_n(const char *s, int n){
872 char *str = malloc(n + 1);
873 memcpy(str, s, n);
874 str[n] = '\0';
875 return str;
876}
877
878/* Duplicate a string; the caller must free() the returned string.
879 * (We don't use strdup() since it's not part of the standard C library and
880 * may not be available everywhere.) */
881static char *string_dup(const char *s){
882 return string_dup_n(s, strlen(s));
883}
884
885/* Format a string, replacing each occurrence of the % character with
886 * zDb.zName. This may be more convenient than sqlite_mprintf()
887 * when one string is used repeatedly in a format string.
888 * The caller must free() the returned string. */
889static char *string_format(const char *zFormat,
890 const char *zDb, const char *zName){
891 const char *p;
892 size_t len = 0;
893 size_t nDb = strlen(zDb);
894 size_t nName = strlen(zName);
895 size_t nFullTableName = nDb+1+nName;
896 char *result;
897 char *r;
898
899 /* first compute length needed */
900 for(p = zFormat ; *p ; ++p){
901 len += (*p=='%' ? nFullTableName : 1);
902 }
903 len += 1; /* for null terminator */
904
905 r = result = malloc(len);
906 for(p = zFormat; *p; ++p){
907 if( *p=='%' ){
908 memcpy(r, zDb, nDb);
909 r += nDb;
910 *r++ = '.';
911 memcpy(r, zName, nName);
912 r += nName;
913 } else {
914 *r++ = *p;
915 }
916 }
917 *r++ = '\0';
918 assert( r == result + len );
919 return result;
920}
921
922static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
923 const char *zFormat){
924 char *zCommand = string_format(zFormat, zDb, zName);
925 int rc;
926 TRACE(("FTS1 sql: %s\n", zCommand));
927 rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
928 free(zCommand);
929 return rc;
930}
931
932static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
933 sqlite3_stmt **ppStmt, const char *zFormat){
934 char *zCommand = string_format(zFormat, zDb, zName);
935 int rc;
936 TRACE(("FTS1 prepare: %s\n", zCommand));
937 rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
938 free(zCommand);
939 return rc;
940}
941
942/* end utility functions */
943
944/* Forward reference */
945typedef struct fulltext_vtab fulltext_vtab;
946
947/* A single term in a query is represented by an instances of
948** the following structure.
949*/
950typedef struct QueryTerm {
951 short int nPhrase; /* How many following terms are part of the same phrase */
952 short int iPhrase; /* This is the i-th term of a phrase. */
953 short int iColumn; /* Column of the index that must match this term */
954 signed char isOr; /* this term is preceded by "OR" */
955 signed char isNot; /* this term is preceded by "-" */
956 char *pTerm; /* text of the term. '\000' terminated. malloced */
957 int nTerm; /* Number of bytes in pTerm[] */
958} QueryTerm;
959
960
961/* A query string is parsed into a Query structure.
962 *
963 * We could, in theory, allow query strings to be complicated
964 * nested expressions with precedence determined by parentheses.
965 * But none of the major search engines do this. (Perhaps the
966 * feeling is that an parenthesized expression is two complex of
967 * an idea for the average user to grasp.) Taking our lead from
968 * the major search engines, we will allow queries to be a list
969 * of terms (with an implied AND operator) or phrases in double-quotes,
970 * with a single optional "-" before each non-phrase term to designate
971 * negation and an optional OR connector.
972 *
973 * OR binds more tightly than the implied AND, which is what the
974 * major search engines seem to do. So, for example:
975 *
976 * [one two OR three] ==> one AND (two OR three)
977 * [one OR two three] ==> (one OR two) AND three
978 *
979 * A "-" before a term matches all entries that lack that term.
980 * The "-" must occur immediately before the term with in intervening
981 * space. This is how the search engines do it.
982 *
983 * A NOT term cannot be the right-hand operand of an OR. If this
984 * occurs in the query string, the NOT is ignored:
985 *
986 * [one OR -two] ==> one OR two
987 *
988 */
989typedef struct Query {
990 fulltext_vtab *pFts; /* The full text index */
991 int nTerms; /* Number of terms in the query */
992 QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
993 int nextIsOr; /* Set the isOr flag on the next inserted term */
994 int nextColumn; /* Next word parsed must be in this column */
995 int dfltColumn; /* The default column */
996} Query;
997
998
999/*
1000** An instance of the following structure keeps track of generated
1001** matching-word offset information and snippets.
1002*/
1003typedef struct Snippet {
1004 int nMatch; /* Total number of matches */
1005 int nAlloc; /* Space allocated for aMatch[] */
1006 struct snippetMatch { /* One entry for each matching term */
1007 char snStatus; /* Status flag for use while constructing snippets */
1008 short int iCol; /* The column that contains the match */
1009 short int iTerm; /* The index in Query.pTerms[] of the matching term */
1010 short int nByte; /* Number of bytes in the term */
1011 int iStart; /* The offset to the first character of the term */
1012 } *aMatch; /* Points to space obtained from malloc */
1013 char *zOffset; /* Text rendering of aMatch[] */
1014 int nOffset; /* strlen(zOffset) */
1015 char *zSnippet; /* Snippet text */
1016 int nSnippet; /* strlen(zSnippet) */
1017} Snippet;
1018
1019
1020typedef enum QueryType {
1021 QUERY_GENERIC, /* table scan */
1022 QUERY_ROWID, /* lookup by rowid */
1023 QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
1024} QueryType;
1025
1026/* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
1027** before we start aggregating into larger segments. Lower CHUNK_MAX
1028** means that for a given input we have more individual segments per
1029** term, which means more rows in the table and a bigger index (due to
1030** both more rows and bigger rowids). But it also reduces the average
1031** cost of adding new elements to the segment 0 doclist, and it seems
1032** to reduce the number of pages read and written during inserts. 256
1033** was chosen by measuring insertion times for a certain input (first
1034** 10k documents of Enron corpus), though including query performance
1035** in the decision may argue for a larger value.
1036*/
1037#define CHUNK_MAX 256
1038
1039typedef enum fulltext_statement {
1040 CONTENT_INSERT_STMT,
1041 CONTENT_SELECT_STMT,
1042 CONTENT_UPDATE_STMT,
1043 CONTENT_DELETE_STMT,
1044
1045 TERM_SELECT_STMT,
1046 TERM_SELECT_ALL_STMT,
1047 TERM_INSERT_STMT,
1048 TERM_UPDATE_STMT,
1049 TERM_DELETE_STMT,
1050
1051 MAX_STMT /* Always at end! */
1052} fulltext_statement;
1053
1054/* These must exactly match the enum above. */
1055/* TODO(adam): Is there some risk that a statement (in particular,
1056** pTermSelectStmt) will be used in two cursors at once, e.g. if a
1057** query joins a virtual table to itself? If so perhaps we should
1058** move some of these to the cursor object.
1059*/
1060static const char *const fulltext_zStatement[MAX_STMT] = {
1061 /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
1062 /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
1063 /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
1064 /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
1065
1066 /* TERM_SELECT */
1067 "select rowid, doclist from %_term where term = ? and segment = ?",
1068 /* TERM_SELECT_ALL */
1069 "select doclist from %_term where term = ? order by segment",
1070 /* TERM_INSERT */
1071 "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
1072 /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
1073 /* TERM_DELETE */ "delete from %_term where rowid = ?",
1074};
1075
1076/*
1077** A connection to a fulltext index is an instance of the following
1078** structure. The xCreate and xConnect methods create an instance
1079** of this structure and xDestroy and xDisconnect free that instance.
1080** All other methods receive a pointer to the structure as one of their
1081** arguments.
1082*/
1083struct fulltext_vtab {
1084 sqlite3_vtab base; /* Base class used by SQLite core */
1085 sqlite3 *db; /* The database connection */
1086 const char *zDb; /* logical database name */
1087 const char *zName; /* virtual table name */
1088 int nColumn; /* number of columns in virtual table */
1089 char **azColumn; /* column names. malloced */
1090 char **azContentColumn; /* column names in content table; malloced */
1091 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
1092
1093 /* Precompiled statements which we keep as long as the table is
1094 ** open.
1095 */
1096 sqlite3_stmt *pFulltextStatements[MAX_STMT];
1097};
1098
1099/*
1100** When the core wants to do a query, it create a cursor using a
1101** call to xOpen. This structure is an instance of a cursor. It
1102** is destroyed by xClose.
1103*/
1104typedef struct fulltext_cursor {
1105 sqlite3_vtab_cursor base; /* Base class used by SQLite core */
1106 QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
1107 sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
1108 int eof; /* True if at End Of Results */
1109 Query q; /* Parsed query string */
1110 Snippet snippet; /* Cached snippet for the current row */
1111 int iColumn; /* Column being searched */
1112 DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */
1113} fulltext_cursor;
1114
1115static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
1116 return (fulltext_vtab *) c->base.pVtab;
1117}
1118
1119static const sqlite3_module fulltextModule; /* forward declaration */
1120
1121/* Append a list of strings separated by commas to a StringBuffer. */
1122static void appendList(StringBuffer *sb, int nString, char **azString){
1123 int i;
1124 for(i=0; i<nString; ++i){
1125 if( i>0 ) append(sb, ", ");
1126 append(sb, azString[i]);
1127 }
1128}
1129
1130/* Return a dynamically generated statement of the form
1131 * insert into %_content (rowid, ...) values (?, ...)
1132 */
1133static const char *contentInsertStatement(fulltext_vtab *v){
1134 StringBuffer sb;
1135 int i;
1136
1137 initStringBuffer(&sb);
1138 append(&sb, "insert into %_content (rowid, ");
1139 appendList(&sb, v->nColumn, v->azContentColumn);
1140 append(&sb, ") values (?");
1141 for(i=0; i<v->nColumn; ++i)
1142 append(&sb, ", ?");
1143 append(&sb, ")");
1144 return sb.s;
1145}
1146
1147/* Return a dynamically generated statement of the form
1148 * update %_content set [col_0] = ?, [col_1] = ?, ...
1149 * where rowid = ?
1150 */
1151static const char *contentUpdateStatement(fulltext_vtab *v){
1152 StringBuffer sb;
1153 int i;
1154
1155 initStringBuffer(&sb);
1156 append(&sb, "update %_content set ");
1157 for(i=0; i<v->nColumn; ++i) {
1158 if( i>0 ){
1159 append(&sb, ", ");
1160 }
1161 append(&sb, v->azContentColumn[i]);
1162 append(&sb, " = ?");
1163 }
1164 append(&sb, " where rowid = ?");
1165 return sb.s;
1166}
1167
1168/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
1169** If the indicated statement has never been prepared, it is prepared
1170** and cached, otherwise the cached version is reset.
1171*/
1172static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
1173 sqlite3_stmt **ppStmt){
1174 assert( iStmt<MAX_STMT );
1175 if( v->pFulltextStatements[iStmt]==NULL ){
1176 const char *zStmt;
1177 int rc;
1178 switch( iStmt ){
1179 case CONTENT_INSERT_STMT:
1180 zStmt = contentInsertStatement(v); break;
1181 case CONTENT_UPDATE_STMT:
1182 zStmt = contentUpdateStatement(v); break;
1183 default:
1184 zStmt = fulltext_zStatement[iStmt];
1185 }
1186 rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
1187 zStmt);
1188 if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
1189 if( rc!=SQLITE_OK ) return rc;
1190 } else {
1191 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
1192 if( rc!=SQLITE_OK ) return rc;
1193 }
1194
1195 *ppStmt = v->pFulltextStatements[iStmt];
1196 return SQLITE_OK;
1197}
1198
1199/* Step the indicated statement, handling errors SQLITE_BUSY (by
1200** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
1201** bindings to the new statement).
1202** TODO(adam): We should extend this function so that it can work with
1203** statements declared locally, not only globally cached statements.
1204*/
1205static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
1206 sqlite3_stmt **ppStmt){
1207 int rc;
1208 sqlite3_stmt *s = *ppStmt;
1209 assert( iStmt<MAX_STMT );
1210 assert( s==v->pFulltextStatements[iStmt] );
1211
1212 while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
1213 if( rc==SQLITE_BUSY ) continue;
1214 if( rc!=SQLITE_ERROR ) return rc;
1215
1216 /* If an SQLITE_SCHEMA error has occured, then finalizing this
1217 * statement is going to delete the fulltext_vtab structure. If
1218 * the statement just executed is in the pFulltextStatements[]
1219 * array, it will be finalized twice. So remove it before
1220 * calling sqlite3_finalize().
1221 */
1222 v->pFulltextStatements[iStmt] = NULL;
1223 rc = sqlite3_finalize(s);
1224 break;
1225 }
1226 return rc;
1227
1228 err:
1229 sqlite3_finalize(s);
1230 return rc;
1231}
1232
1233/* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
1234** Useful for statements like UPDATE, where we expect no results.
1235*/
1236static int sql_single_step_statement(fulltext_vtab *v,
1237 fulltext_statement iStmt,
1238 sqlite3_stmt **ppStmt){
1239 int rc = sql_step_statement(v, iStmt, ppStmt);
1240 return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1241}
1242
1243/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
1244static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
1245 sqlite3_value **pValues){
1246 sqlite3_stmt *s;
1247 int i;
1248 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
1249 if( rc!=SQLITE_OK ) return rc;
1250
1251 rc = sqlite3_bind_value(s, 1, rowid);
1252 if( rc!=SQLITE_OK ) return rc;
1253
1254 for(i=0; i<v->nColumn; ++i){
1255 rc = sqlite3_bind_value(s, 2+i, pValues[i]);
1256 if( rc!=SQLITE_OK ) return rc;
1257 }
1258
1259 return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
1260}
1261
1262/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
1263 * where rowid = [iRowid] */
1264static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
1265 sqlite_int64 iRowid){
1266 sqlite3_stmt *s;
1267 int i;
1268 int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
1269 if( rc!=SQLITE_OK ) return rc;
1270
1271 for(i=0; i<v->nColumn; ++i){
1272 rc = sqlite3_bind_value(s, 1+i, pValues[i]);
1273 if( rc!=SQLITE_OK ) return rc;
1274 }
1275
1276 rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
1277 if( rc!=SQLITE_OK ) return rc;
1278
1279 return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
1280}
1281
1282static void freeStringArray(int nString, const char **pString){
1283 int i;
1284
1285 for (i=0 ; i < nString ; ++i) {
1286 if( pString[i]!=NULL ) free((void *) pString[i]);
1287 }
1288 free((void *) pString);
1289}
1290
1291/* select * from %_content where rowid = [iRow]
1292 * The caller must delete the returned array and all strings in it.
1293 * null fields will be NULL in the returned array.
1294 *
1295 * TODO: Perhaps we should return pointer/length strings here for consistency
1296 * with other code which uses pointer/length. */
1297static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
1298 const char ***pValues){
1299 sqlite3_stmt *s;
1300 const char **values;
1301 int i;
1302 int rc;
1303
1304 *pValues = NULL;
1305
1306 rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
1307 if( rc!=SQLITE_OK ) return rc;
1308
1309 rc = sqlite3_bind_int64(s, 1, iRow);
1310 if( rc!=SQLITE_OK ) return rc;
1311
1312 rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
1313 if( rc!=SQLITE_ROW ) return rc;
1314
1315 values = (const char **) malloc(v->nColumn * sizeof(const char *));
1316 for(i=0; i<v->nColumn; ++i){
1317 if( sqlite3_column_type(s, i)==SQLITE_NULL ){
1318 values[i] = NULL;
1319 }else{
1320 values[i] = string_dup((char*)sqlite3_column_text(s, i));
1321 }
1322 }
1323
1324 /* We expect only one row. We must execute another sqlite3_step()
1325 * to complete the iteration; otherwise the table will remain locked. */
1326 rc = sqlite3_step(s);
1327 if( rc==SQLITE_DONE ){
1328 *pValues = values;
1329 return SQLITE_OK;
1330 }
1331
1332 freeStringArray(v->nColumn, values);
1333 return rc;
1334}
1335
1336/* delete from %_content where rowid = [iRow ] */
1337static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
1338 sqlite3_stmt *s;
1339 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
1340 if( rc!=SQLITE_OK ) return rc;
1341
1342 rc = sqlite3_bind_int64(s, 1, iRow);
1343 if( rc!=SQLITE_OK ) return rc;
1344
1345 return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
1346}
1347
1348/* select rowid, doclist from %_term
1349 * where term = [pTerm] and segment = [iSegment]
1350 * If found, returns SQLITE_ROW; the caller must free the
1351 * returned doclist. If no rows found, returns SQLITE_DONE. */
1352static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
1353 int iSegment,
1354 sqlite_int64 *rowid, DocList *out){
1355 sqlite3_stmt *s;
1356 int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
1357 if( rc!=SQLITE_OK ) return rc;
1358
1359 rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
1360 if( rc!=SQLITE_OK ) return rc;
1361
1362 rc = sqlite3_bind_int(s, 2, iSegment);
1363 if( rc!=SQLITE_OK ) return rc;
1364
1365 rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
1366 if( rc!=SQLITE_ROW ) return rc;
1367
1368 *rowid = sqlite3_column_int64(s, 0);
1369 docListInit(out, DL_DEFAULT,
1370 sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
1371
1372 /* We expect only one row. We must execute another sqlite3_step()
1373 * to complete the iteration; otherwise the table will remain locked. */
1374 rc = sqlite3_step(s);
1375 return rc==SQLITE_DONE ? SQLITE_ROW : rc;
1376}
1377
1378/* Load the segment doclists for term pTerm and merge them in
1379** appropriate order into out. Returns SQLITE_OK if successful. If
1380** there are no segments for pTerm, successfully returns an empty
1381** doclist in out.
1382**
1383** Each document consists of 1 or more "columns". The number of
1384** columns is v->nColumn. If iColumn==v->nColumn, then return
1385** position information about all columns. If iColumn<v->nColumn,
1386** then only return position information about the iColumn-th column
1387** (where the first column is 0).
1388*/
1389static int term_select_all(
1390 fulltext_vtab *v, /* The fulltext index we are querying against */
1391 int iColumn, /* If <nColumn, only look at the iColumn-th column */
1392 const char *pTerm, /* The term whose posting lists we want */
1393 int nTerm, /* Number of bytes in pTerm */
1394 DocList *out /* Write the resulting doclist here */
1395){
1396 DocList doclist;
1397 sqlite3_stmt *s;
1398 int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
1399 if( rc!=SQLITE_OK ) return rc;
1400
1401 rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
1402 if( rc!=SQLITE_OK ) return rc;
1403
1404 docListInit(&doclist, DL_DEFAULT, 0, 0);
1405
1406 /* TODO(shess) Handle schema and busy errors. */
1407 while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
1408 DocList old;
1409
1410 /* TODO(shess) If we processed doclists from oldest to newest, we
1411 ** could skip the malloc() involved with the following call. For
1412 ** now, I'd rather keep this logic similar to index_insert_term().
1413 ** We could additionally drop elements when we see deletes, but
1414 ** that would require a distinct version of docListAccumulate().
1415 */
1416 docListInit(&old, DL_DEFAULT,
1417 sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
1418
1419 if( iColumn<v->nColumn ){ /* querying a single column */
1420 docListRestrictColumn(&old, iColumn);
1421 }
1422
1423 /* doclist contains the newer data, so write it over old. Then
1424 ** steal accumulated result for doclist.
1425 */
1426 docListAccumulate(&old, &doclist);
1427 docListDestroy(&doclist);
1428 doclist = old;
1429 }
1430 if( rc!=SQLITE_DONE ){
1431 docListDestroy(&doclist);
1432 return rc;
1433 }
1434
1435 docListDiscardEmpty(&doclist);
1436 *out = doclist;
1437 return SQLITE_OK;
1438}
1439
1440/* insert into %_term (rowid, term, segment, doclist)
1441 values ([piRowid], [pTerm], [iSegment], [doclist])
1442** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
1443**
1444** NOTE(shess) piRowid is IN, with values of "space of int64" plus
1445** null, it is not used to pass data back to the caller.
1446*/
1447static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
1448 const char *pTerm, int nTerm,
1449 int iSegment, DocList *doclist){
1450 sqlite3_stmt *s;
1451 int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
1452 if( rc!=SQLITE_OK ) return rc;
1453
1454 if( piRowid==NULL ){
1455 rc = sqlite3_bind_null(s, 1);
1456 }else{
1457 rc = sqlite3_bind_int64(s, 1, *piRowid);
1458 }
1459 if( rc!=SQLITE_OK ) return rc;
1460
1461 rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
1462 if( rc!=SQLITE_OK ) return rc;
1463
1464 rc = sqlite3_bind_int(s, 3, iSegment);
1465 if( rc!=SQLITE_OK ) return rc;
1466
1467 rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
1468 if( rc!=SQLITE_OK ) return rc;
1469
1470 return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
1471}
1472
1473/* update %_term set doclist = [doclist] where rowid = [rowid] */
1474static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
1475 DocList *doclist){
1476 sqlite3_stmt *s;
1477 int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
1478 if( rc!=SQLITE_OK ) return rc;
1479
1480 rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
1481 if( rc!=SQLITE_OK ) return rc;
1482
1483 rc = sqlite3_bind_int64(s, 2, rowid);
1484 if( rc!=SQLITE_OK ) return rc;
1485
1486 return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
1487}
1488
1489static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
1490 sqlite3_stmt *s;
1491 int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
1492 if( rc!=SQLITE_OK ) return rc;
1493
1494 rc = sqlite3_bind_int64(s, 1, rowid);
1495 if( rc!=SQLITE_OK ) return rc;
1496
1497 return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
1498}
1499
1500/*
1501** Free the memory used to contain a fulltext_vtab structure.
1502*/
1503static void fulltext_vtab_destroy(fulltext_vtab *v){
1504 int iStmt, i;
1505
1506 TRACE(("FTS1 Destroy %p\n", v));
1507 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
1508 if( v->pFulltextStatements[iStmt]!=NULL ){
1509 sqlite3_finalize(v->pFulltextStatements[iStmt]);
1510 v->pFulltextStatements[iStmt] = NULL;
1511 }
1512 }
1513
1514 if( v->pTokenizer!=NULL ){
1515 v->pTokenizer->pModule->xDestroy(v->pTokenizer);
1516 v->pTokenizer = NULL;
1517 }
1518
1519 free(v->azColumn);
1520 for(i = 0; i < v->nColumn; ++i) {
1521 sqlite3_free(v->azContentColumn[i]);
1522 }
1523 free(v->azContentColumn);
1524 free(v);
1525}
1526
1527/*
1528** Token types for parsing the arguments to xConnect or xCreate.
1529*/
1530#define TOKEN_EOF 0 /* End of file */
1531#define TOKEN_SPACE 1 /* Any kind of whitespace */
1532#define TOKEN_ID 2 /* An identifier */
1533#define TOKEN_STRING 3 /* A string literal */
1534#define TOKEN_PUNCT 4 /* A single punctuation character */
1535
1536/*
1537** If X is a character that can be used in an identifier then
1538** IdChar(X) will be true. Otherwise it is false.
1539**
1540** For ASCII, any character with the high-order bit set is
1541** allowed in an identifier. For 7-bit characters,
1542** sqlite3IsIdChar[X] must be 1.
1543**
1544** Ticket #1066. the SQL standard does not allow '$' in the
1545** middle of identfiers. But many SQL implementations do.
1546** SQLite will allow '$' in identifiers for compatibility.
1547** But the feature is undocumented.
1548*/
1549static const char isIdChar[] = {
1550/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
1551 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
1552 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
1553 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
1554 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
1555 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
1556 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
1557};
1558#define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
1559
1560
1561/*
1562** Return the length of the token that begins at z[0].
1563** Store the token type in *tokenType before returning.
1564*/
1565static int getToken(const char *z, int *tokenType){
1566 int i, c;
1567 switch( *z ){
1568 case 0: {
1569 *tokenType = TOKEN_EOF;
1570 return 0;
1571 }
1572 case ' ': case '\t': case '\n': case '\f': case '\r': {
1573 for(i=1; safe_isspace(z[i]); i++){}
1574 *tokenType = TOKEN_SPACE;
1575 return i;
1576 }
1577 case '`':
1578 case '\'':
1579 case '"': {
1580 int delim = z[0];
1581 for(i=1; (c=z[i])!=0; i++){
1582 if( c==delim ){
1583 if( z[i+1]==delim ){
1584 i++;
1585 }else{
1586 break;
1587 }
1588 }
1589 }
1590 *tokenType = TOKEN_STRING;
1591 return i + (c!=0);
1592 }
1593 case '[': {
1594 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
1595 *tokenType = TOKEN_ID;
1596 return i;
1597 }
1598 default: {
1599 if( !IdChar(*z) ){
1600 break;
1601 }
1602 for(i=1; IdChar(z[i]); i++){}
1603 *tokenType = TOKEN_ID;
1604 return i;
1605 }
1606 }
1607 *tokenType = TOKEN_PUNCT;
1608 return 1;
1609}
1610
1611/*
1612** A token extracted from a string is an instance of the following
1613** structure.
1614*/
1615typedef struct Token {
1616 const char *z; /* Pointer to token text. Not '\000' terminated */
1617 short int n; /* Length of the token text in bytes. */
1618} Token;
1619
1620/*
1621** Given a input string (which is really one of the argv[] parameters
1622** passed into xConnect or xCreate) split the string up into tokens.
1623** Return an array of pointers to '\000' terminated strings, one string
1624** for each non-whitespace token.
1625**
1626** The returned array is terminated by a single NULL pointer.
1627**
1628** Space to hold the returned array is obtained from a single
1629** malloc and should be freed by passing the return value to free().
1630** The individual strings within the token list are all a part of
1631** the single memory allocation and will all be freed at once.
1632*/
1633static char **tokenizeString(const char *z, int *pnToken){
1634 int nToken = 0;
1635 Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
1636 int n = 1;
1637 int e, i;
1638 int totalSize = 0;
1639 char **azToken;
1640 char *zCopy;
1641 while( n>0 ){
1642 n = getToken(z, &e);
1643 if( e!=TOKEN_SPACE ){
1644 aToken[nToken].z = z;
1645 aToken[nToken].n = n;
1646 nToken++;
1647 totalSize += n+1;
1648 }
1649 z += n;
1650 }
1651 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
1652 zCopy = (char*)&azToken[nToken];
1653 nToken--;
1654 for(i=0; i<nToken; i++){
1655 azToken[i] = zCopy;
1656 n = aToken[i].n;
1657 memcpy(zCopy, aToken[i].z, n);
1658 zCopy[n] = 0;
1659 zCopy += n+1;
1660 }
1661 azToken[nToken] = 0;
1662 free(aToken);
1663 *pnToken = nToken;
1664 return azToken;
1665}
1666
1667/*
1668** Convert an SQL-style quoted string into a normal string by removing
1669** the quote characters. The conversion is done in-place. If the
1670** input does not begin with a quote character, then this routine
1671** is a no-op.
1672**
1673** Examples:
1674**
1675** "abc" becomes abc
1676** 'xyz' becomes xyz
1677** [pqr] becomes pqr
1678** `mno` becomes mno
1679*/
1680static void dequoteString(char *z){
1681 int quote;
1682 int i, j;
1683 if( z==0 ) return;
1684 quote = z[0];
1685 switch( quote ){
1686 case '\'': break;
1687 case '"': break;
1688 case '`': break; /* For MySQL compatibility */
1689 case '[': quote = ']'; break; /* For MS SqlServer compatibility */
1690 default: return;
1691 }
1692 for(i=1, j=0; z[i]; i++){
1693 if( z[i]==quote ){
1694 if( z[i+1]==quote ){
1695 z[j++] = quote;
1696 i++;
1697 }else{
1698 z[j++] = 0;
1699 break;
1700 }
1701 }else{
1702 z[j++] = z[i];
1703 }
1704 }
1705}
1706
1707/*
1708** The input azIn is a NULL-terminated list of tokens. Remove the first
1709** token and all punctuation tokens. Remove the quotes from
1710** around string literal tokens.
1711**
1712** Example:
1713**
1714** input: tokenize chinese ( 'simplifed' , 'mixed' )
1715** output: chinese simplifed mixed
1716**
1717** Another example:
1718**
1719** input: delimiters ( '[' , ']' , '...' )
1720** output: [ ] ...
1721*/
1722static void tokenListToIdList(char **azIn){
1723 int i, j;
1724 if( azIn ){
1725 for(i=0, j=-1; azIn[i]; i++){
1726 if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
1727 dequoteString(azIn[i]);
1728 if( j>=0 ){
1729 azIn[j] = azIn[i];
1730 }
1731 j++;
1732 }
1733 }
1734 azIn[j] = 0;
1735 }
1736}
1737
1738
1739/*
1740** Find the first alphanumeric token in the string zIn. Null-terminate
1741** this token. Remove any quotation marks. And return a pointer to
1742** the result.
1743*/
1744static char *firstToken(char *zIn, char **pzTail){
1745 int n, ttype;
1746 while(1){
1747 n = getToken(zIn, &ttype);
1748 if( ttype==TOKEN_SPACE ){
1749 zIn += n;
1750 }else if( ttype==TOKEN_EOF ){
1751 *pzTail = zIn;
1752 return 0;
1753 }else{
1754 zIn[n] = 0;
1755 *pzTail = &zIn[1];
1756 dequoteString(zIn);
1757 return zIn;
1758 }
1759 }
1760 /*NOTREACHED*/
1761}
1762
1763/* Return true if...
1764**
1765** * s begins with the string t, ignoring case
1766** * s is longer than t
1767** * The first character of s beyond t is not a alphanumeric
1768**
1769** Ignore leading space in *s.
1770**
1771** To put it another way, return true if the first token of
1772** s[] is t[].
1773*/
1774static int startsWith(const char *s, const char *t){
1775 while( safe_isspace(*s) ){ s++; }
1776 while( *t ){
1777 if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
1778 }
1779 return *s!='_' && !safe_isalnum(*s);
1780}
1781
1782/*
1783** An instance of this structure defines the "spec" of a
1784** full text index. This structure is populated by parseSpec
1785** and use by fulltextConnect and fulltextCreate.
1786*/
1787typedef struct TableSpec {
1788 const char *zDb; /* Logical database name */
1789 const char *zName; /* Name of the full-text index */
1790 int nColumn; /* Number of columns to be indexed */
1791 char **azColumn; /* Original names of columns to be indexed */
1792 char **azContentColumn; /* Column names for %_content */
1793 char **azTokenizer; /* Name of tokenizer and its arguments */
1794} TableSpec;
1795
1796/*
1797** Reclaim all of the memory used by a TableSpec
1798*/
1799static void clearTableSpec(TableSpec *p) {
1800 free(p->azColumn);
1801 free(p->azContentColumn);
1802 free(p->azTokenizer);
1803}
1804
1805/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
1806 *
1807 * CREATE VIRTUAL TABLE email
1808 * USING fts1(subject, body, tokenize mytokenizer(myarg))
1809 *
1810 * We return parsed information in a TableSpec structure.
1811 *
1812 */
1813static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
1814 char**pzErr){
1815 int i, n;
1816 char *z, *zDummy;
1817 char **azArg;
1818 const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
1819
1820 assert( argc>=3 );
1821 /* Current interface:
1822 ** argv[0] - module name
1823 ** argv[1] - database name
1824 ** argv[2] - table name
1825 ** argv[3..] - columns, optionally followed by tokenizer specification
1826 ** and snippet delimiters specification.
1827 */
1828
1829 /* Make a copy of the complete argv[][] array in a single allocation.
1830 ** The argv[][] array is read-only and transient. We can write to the
1831 ** copy in order to modify things and the copy is persistent.
1832 */
1833 memset(pSpec, 0, sizeof(*pSpec));
1834 for(i=n=0; i<argc; i++){
1835 n += strlen(argv[i]) + 1;
1836 }
1837 azArg = malloc( sizeof(char*)*argc + n );
1838 if( azArg==0 ){
1839 return SQLITE_NOMEM;
1840 }
1841 z = (char*)&azArg[argc];
1842 for(i=0; i<argc; i++){
1843 azArg[i] = z;
1844 strcpy(z, argv[i]);
1845 z += strlen(z)+1;
1846 }
1847
1848 /* Identify the column names and the tokenizer and delimiter arguments
1849 ** in the argv[][] array.
1850 */
1851 pSpec->zDb = azArg[1];
1852 pSpec->zName = azArg[2];
1853 pSpec->nColumn = 0;
1854 pSpec->azColumn = azArg;
1855 zTokenizer = "tokenize simple";
1856 for(i=3; i<argc; ++i){
1857 if( startsWith(azArg[i],"tokenize") ){
1858 zTokenizer = azArg[i];
1859 }else{
1860 z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
1861 pSpec->nColumn++;
1862 }
1863 }
1864 if( pSpec->nColumn==0 ){
1865 azArg[0] = "content";
1866 pSpec->nColumn = 1;
1867 }
1868
1869 /*
1870 ** Construct the list of content column names.
1871 **
1872 ** Each content column name will be of the form cNNAAAA
1873 ** where NN is the column number and AAAA is the sanitized
1874 ** column name. "sanitized" means that special characters are
1875 ** converted to "_". The cNN prefix guarantees that all column
1876 ** names are unique.
1877 **
1878 ** The AAAA suffix is not strictly necessary. It is included
1879 ** for the convenience of people who might examine the generated
1880 ** %_content table and wonder what the columns are used for.
1881 */
1882 pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
1883 if( pSpec->azContentColumn==0 ){
1884 clearTableSpec(pSpec);
1885 return SQLITE_NOMEM;
1886 }
1887 for(i=0; i<pSpec->nColumn; i++){
1888 char *p;
1889 pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
1890 for (p = pSpec->azContentColumn[i]; *p ; ++p) {
1891 if( !safe_isalnum(*p) ) *p = '_';
1892 }
1893 }
1894
1895 /*
1896 ** Parse the tokenizer specification string.
1897 */
1898 pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
1899 tokenListToIdList(pSpec->azTokenizer);
1900
1901 return SQLITE_OK;
1902}
1903
1904/*
1905** Generate a CREATE TABLE statement that describes the schema of
1906** the virtual table. Return a pointer to this schema string.
1907**
1908** Space is obtained from sqlite3_mprintf() and should be freed
1909** using sqlite3_free().
1910*/
1911static char *fulltextSchema(
1912 int nColumn, /* Number of columns */
1913 const char *const* azColumn, /* List of columns */
1914 const char *zTableName /* Name of the table */
1915){
1916 int i;
1917 char *zSchema, *zNext;
1918 const char *zSep = "(";
1919 zSchema = sqlite3_mprintf("CREATE TABLE x");
1920 for(i=0; i<nColumn; i++){
1921 zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
1922 sqlite3_free(zSchema);
1923 zSchema = zNext;
1924 zSep = ",";
1925 }
1926 zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
1927 sqlite3_free(zSchema);
1928 return zNext;
1929}
1930
1931/*
1932** Build a new sqlite3_vtab structure that will describe the
1933** fulltext index defined by spec.
1934*/
1935static int constructVtab(
1936 sqlite3 *db, /* The SQLite database connection */
1937 TableSpec *spec, /* Parsed spec information from parseSpec() */
1938 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
1939 char **pzErr /* Write any error message here */
1940){
1941 int rc;
1942 int n;
1943 fulltext_vtab *v = 0;
1944 const sqlite3_tokenizer_module *m = NULL;
1945 char *schema;
1946
1947 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
1948 if( v==0 ) return SQLITE_NOMEM;
1949 memset(v, 0, sizeof(*v));
1950 /* sqlite will initialize v->base */
1951 v->db = db;
1952 v->zDb = spec->zDb; /* Freed when azColumn is freed */
1953 v->zName = spec->zName; /* Freed when azColumn is freed */
1954 v->nColumn = spec->nColumn;
1955 v->azContentColumn = spec->azContentColumn;
1956 spec->azContentColumn = 0;
1957 v->azColumn = spec->azColumn;
1958 spec->azColumn = 0;
1959
1960 if( spec->azTokenizer==0 ){
1961 return SQLITE_NOMEM;
1962 }
1963 /* TODO(shess) For now, add new tokenizers as else if clauses. */
1964 if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
1965 sqlite3Fts1SimpleTokenizerModule(&m);
1966 }else if( startsWith(spec->azTokenizer[0], "porter") ){
1967 sqlite3Fts1PorterTokenizerModule(&m);
1968 }else{
1969 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
1970 rc = SQLITE_ERROR;
1971 goto err;
1972 }
1973 for(n=0; spec->azTokenizer[n]; n++){}
1974 if( n ){
1975 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
1976 &v->pTokenizer);
1977 }else{
1978 rc = m->xCreate(0, 0, &v->pTokenizer);
1979 }
1980 if( rc!=SQLITE_OK ) goto err;
1981 v->pTokenizer->pModule = m;
1982
1983 /* TODO: verify the existence of backing tables foo_content, foo_term */
1984
1985 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
1986 spec->zName);
1987 rc = sqlite3_declare_vtab(db, schema);
1988 sqlite3_free(schema);
1989 if( rc!=SQLITE_OK ) goto err;
1990
1991 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
1992
1993 *ppVTab = &v->base;
1994 TRACE(("FTS1 Connect %p\n", v));
1995
1996 return rc;
1997
1998err:
1999 fulltext_vtab_destroy(v);
2000 return rc;
2001}
2002
2003static int fulltextConnect(
2004 sqlite3 *db,
2005 void *pAux,
2006 int argc, const char *const*argv,
2007 sqlite3_vtab **ppVTab,
2008 char **pzErr
2009){
2010 TableSpec spec;
2011 int rc = parseSpec(&spec, argc, argv, pzErr);
2012 if( rc!=SQLITE_OK ) return rc;
2013
2014 rc = constructVtab(db, &spec, ppVTab, pzErr);
2015 clearTableSpec(&spec);
2016 return rc;
2017}
2018
2019 /* The %_content table holds the text of each document, with
2020 ** the rowid used as the docid.
2021 **
2022 ** The %_term table maps each term to a document list blob
2023 ** containing elements sorted by ascending docid, each element
2024 ** encoded as:
2025 **
2026 ** docid varint-encoded
2027 ** token elements:
2028 ** position+1 varint-encoded as delta from previous position
2029 ** start offset varint-encoded as delta from previous start offset
2030 ** end offset varint-encoded as delta from start offset
2031 **
2032 ** The sentinel position of 0 indicates the end of the token list.
2033 **
2034 ** Additionally, doclist blobs are chunked into multiple segments,
2035 ** using segment to order the segments. New elements are added to
2036 ** the segment at segment 0, until it exceeds CHUNK_MAX. Then
2037 ** segment 0 is deleted, and the doclist is inserted at segment 1.
2038 ** If there is already a doclist at segment 1, the segment 0 doclist
2039 ** is merged with it, the segment 1 doclist is deleted, and the
2040 ** merged doclist is inserted at segment 2, repeating those
2041 ** operations until an insert succeeds.
2042 **
2043 ** Since this structure doesn't allow us to update elements in place
2044 ** in case of deletion or update, these are simply written to
2045 ** segment 0 (with an empty token list in case of deletion), with
2046 ** docListAccumulate() taking care to retain lower-segment
2047 ** information in preference to higher-segment information.
2048 */
2049 /* TODO(shess) Provide a VACUUM type operation which both removes
2050 ** deleted elements which are no longer necessary, and duplicated
2051 ** elements. I suspect this will probably not be necessary in
2052 ** practice, though.
2053 */
2054static int fulltextCreate(sqlite3 *db, void *pAux,
2055 int argc, const char * const *argv,
2056 sqlite3_vtab **ppVTab, char **pzErr){
2057 int rc;
2058 TableSpec spec;
2059 StringBuffer schema;
2060 TRACE(("FTS1 Create\n"));
2061
2062 rc = parseSpec(&spec, argc, argv, pzErr);
2063 if( rc!=SQLITE_OK ) return rc;
2064
2065 initStringBuffer(&schema);
2066 append(&schema, "CREATE TABLE %_content(");
2067 appendList(&schema, spec.nColumn, spec.azContentColumn);
2068 append(&schema, ")");
2069 rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
2070 free(schema.s);
2071 if( rc!=SQLITE_OK ) goto out;
2072
2073 rc = sql_exec(db, spec.zDb, spec.zName,
2074 "create table %_term(term text, segment integer, doclist blob, "
2075 "primary key(term, segment));");
2076 if( rc!=SQLITE_OK ) goto out;
2077
2078 rc = constructVtab(db, &spec, ppVTab, pzErr);
2079
2080out:
2081 clearTableSpec(&spec);
2082 return rc;
2083}
2084
2085/* Decide how to handle an SQL query. */
2086static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2087 int i;
2088 TRACE(("FTS1 BestIndex\n"));
2089
2090 for(i=0; i<pInfo->nConstraint; ++i){
2091 const struct sqlite3_index_constraint *pConstraint;
2092 pConstraint = &pInfo->aConstraint[i];
2093 if( pConstraint->usable ) {
2094 if( pConstraint->iColumn==-1 &&
2095 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2096 pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
2097 TRACE(("FTS1 QUERY_ROWID\n"));
2098 } else if( pConstraint->iColumn>=0 &&
2099 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2100 /* full-text search */
2101 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2102 TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2103 } else continue;
2104
2105 pInfo->aConstraintUsage[i].argvIndex = 1;
2106 pInfo->aConstraintUsage[i].omit = 1;
2107
2108 /* An arbitrary value for now.
2109 * TODO: Perhaps rowid matches should be considered cheaper than
2110 * full-text searches. */
2111 pInfo->estimatedCost = 1.0;
2112
2113 return SQLITE_OK;
2114 }
2115 }
2116 pInfo->idxNum = QUERY_GENERIC;
2117 return SQLITE_OK;
2118}
2119
2120static int fulltextDisconnect(sqlite3_vtab *pVTab){
2121 TRACE(("FTS1 Disconnect %p\n", pVTab));
2122 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2123 return SQLITE_OK;
2124}
2125
2126static int fulltextDestroy(sqlite3_vtab *pVTab){
2127 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2128 int rc;
2129
2130 TRACE(("FTS1 Destroy %p\n", pVTab));
2131 rc = sql_exec(v->db, v->zDb, v->zName,
2132 "drop table if exists %_content;"
2133 "drop table if exists %_term;"
2134 );
2135 if( rc!=SQLITE_OK ) return rc;
2136
2137 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2138 return SQLITE_OK;
2139}
2140
2141static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2142 fulltext_cursor *c;
2143
2144 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
2145 /* sqlite will initialize c->base */
2146 *ppCursor = &c->base;
2147 TRACE(("FTS1 Open %p: %p\n", pVTab, c));
2148
2149 return SQLITE_OK;
2150}
2151
2152
2153/* Free all of the dynamically allocated memory held by *q
2154*/
2155static void queryClear(Query *q){
2156 int i;
2157 for(i = 0; i < q->nTerms; ++i){
2158 free(q->pTerms[i].pTerm);
2159 }
2160 free(q->pTerms);
2161 memset(q, 0, sizeof(*q));
2162}
2163
2164/* Free all of the dynamically allocated memory held by the
2165** Snippet
2166*/
2167static void snippetClear(Snippet *p){
2168 free(p->aMatch);
2169 free(p->zOffset);
2170 free(p->zSnippet);
2171 memset(p, 0, sizeof(*p));
2172}
2173/*
2174** Append a single entry to the p->aMatch[] log.
2175*/
2176static void snippetAppendMatch(
2177 Snippet *p, /* Append the entry to this snippet */
2178 int iCol, int iTerm, /* The column and query term */
2179 int iStart, int nByte /* Offset and size of the match */
2180){
2181 int i;
2182 struct snippetMatch *pMatch;
2183 if( p->nMatch+1>=p->nAlloc ){
2184 p->nAlloc = p->nAlloc*2 + 10;
2185 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2186 if( p->aMatch==0 ){
2187 p->nMatch = 0;
2188 p->nAlloc = 0;
2189 return;
2190 }
2191 }
2192 i = p->nMatch++;
2193 pMatch = &p->aMatch[i];
2194 pMatch->iCol = iCol;
2195 pMatch->iTerm = iTerm;
2196 pMatch->iStart = iStart;
2197 pMatch->nByte = nByte;
2198}
2199
2200/*
2201** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
2202*/
2203#define FTS1_ROTOR_SZ (32)
2204#define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
2205
2206/*
2207** Add entries to pSnippet->aMatch[] for every match that occurs against
2208** document zDoc[0..nDoc-1] which is stored in column iColumn.
2209*/
2210static void snippetOffsetsOfColumn(
2211 Query *pQuery,
2212 Snippet *pSnippet,
2213 int iColumn,
2214 const char *zDoc,
2215 int nDoc
2216){
2217 const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
2218 sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
2219 sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
2220 fulltext_vtab *pVtab; /* The full text index */
2221 int nColumn; /* Number of columns in the index */
2222 const QueryTerm *aTerm; /* Query string terms */
2223 int nTerm; /* Number of query string terms */
2224 int i, j; /* Loop counters */
2225 int rc; /* Return code */
2226 unsigned int match, prevMatch; /* Phrase search bitmasks */
2227 const char *zToken; /* Next token from the tokenizer */
2228 int nToken; /* Size of zToken */
2229 int iBegin, iEnd, iPos; /* Offsets of beginning and end */
2230
2231 /* The following variables keep a circular buffer of the last
2232 ** few tokens */
2233 unsigned int iRotor = 0; /* Index of current token */
2234 int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */
2235 int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */
2236
2237 pVtab = pQuery->pFts;
2238 nColumn = pVtab->nColumn;
2239 pTokenizer = pVtab->pTokenizer;
2240 pTModule = pTokenizer->pModule;
2241 rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
2242 if( rc ) return;
2243 pTCursor->pTokenizer = pTokenizer;
2244 aTerm = pQuery->pTerms;
2245 nTerm = pQuery->nTerms;
2246 if( nTerm>=FTS1_ROTOR_SZ ){
2247 nTerm = FTS1_ROTOR_SZ - 1;
2248 }
2249 prevMatch = 0;
2250 while(1){
2251 rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
2252 if( rc ) break;
2253 iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
2254 iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
2255 match = 0;
2256 for(i=0; i<nTerm; i++){
2257 int iCol;
2258 iCol = aTerm[i].iColumn;
2259 if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
2260 if( aTerm[i].nTerm!=nToken ) continue;
2261 if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
2262 if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
2263 match |= 1<<i;
2264 if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
2265 for(j=aTerm[i].iPhrase-1; j>=0; j--){
2266 int k = (iRotor-j) & FTS1_ROTOR_MASK;
2267 snippetAppendMatch(pSnippet, iColumn, i-j,
2268 iRotorBegin[k], iRotorLen[k]);
2269 }
2270 }
2271 }
2272 prevMatch = match<<1;
2273 iRotor++;
2274 }
2275 pTModule->xClose(pTCursor);
2276}
2277
2278
2279/*
2280** Compute all offsets for the current row of the query.
2281** If the offsets have already been computed, this routine is a no-op.
2282*/
2283static void snippetAllOffsets(fulltext_cursor *p){
2284 int nColumn;
2285 int iColumn, i;
2286 int iFirst, iLast;
2287 fulltext_vtab *pFts;
2288
2289 if( p->snippet.nMatch ) return;
2290 if( p->q.nTerms==0 ) return;
2291 pFts = p->q.pFts;
2292 nColumn = pFts->nColumn;
2293 iColumn = p->iCursorType - QUERY_FULLTEXT;
2294 if( iColumn<0 || iColumn>=nColumn ){
2295 iFirst = 0;
2296 iLast = nColumn-1;
2297 }else{
2298 iFirst = iColumn;
2299 iLast = iColumn;
2300 }
2301 for(i=iFirst; i<=iLast; i++){
2302 const char *zDoc;
2303 int nDoc;
2304 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
2305 nDoc = sqlite3_column_bytes(p->pStmt, i+1);
2306 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
2307 }
2308}
2309
2310/*
2311** Convert the information in the aMatch[] array of the snippet
2312** into the string zOffset[0..nOffset-1].
2313*/
2314static void snippetOffsetText(Snippet *p){
2315 int i;
2316 int cnt = 0;
2317 StringBuffer sb;
2318 char zBuf[200];
2319 if( p->zOffset ) return;
2320 initStringBuffer(&sb);
2321 for(i=0; i<p->nMatch; i++){
2322 struct snippetMatch *pMatch = &p->aMatch[i];
2323 zBuf[0] = ' ';
2324 sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
2325 pMatch->iTerm, pMatch->iStart, pMatch->nByte);
2326 append(&sb, zBuf);
2327 cnt++;
2328 }
2329 p->zOffset = sb.s;
2330 p->nOffset = sb.len;
2331}
2332
2333/*
2334** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
2335** of matching words some of which might be in zDoc. zDoc is column
2336** number iCol.
2337**
2338** iBreak is suggested spot in zDoc where we could begin or end an
2339** excerpt. Return a value similar to iBreak but possibly adjusted
2340** to be a little left or right so that the break point is better.
2341*/
2342static int wordBoundary(
2343 int iBreak, /* The suggested break point */
2344 const char *zDoc, /* Document text */
2345 int nDoc, /* Number of bytes in zDoc[] */
2346 struct snippetMatch *aMatch, /* Matching words */
2347 int nMatch, /* Number of entries in aMatch[] */
2348 int iCol /* The column number for zDoc[] */
2349){
2350 int i;
2351 if( iBreak<=10 ){
2352 return 0;
2353 }
2354 if( iBreak>=nDoc-10 ){
2355 return nDoc;
2356 }
2357 for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
2358 while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
2359 if( i<nMatch ){
2360 if( aMatch[i].iStart<iBreak+10 ){
2361 return aMatch[i].iStart;
2362 }
2363 if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
2364 return aMatch[i-1].iStart;
2365 }
2366 }
2367 for(i=1; i<=10; i++){
2368 if( safe_isspace(zDoc[iBreak-i]) ){
2369 return iBreak - i + 1;
2370 }
2371 if( safe_isspace(zDoc[iBreak+i]) ){
2372 return iBreak + i + 1;
2373 }
2374 }
2375 return iBreak;
2376}
2377
2378/*
2379** If the StringBuffer does not end in white space, add a single
2380** space character to the end.
2381*/
2382static void appendWhiteSpace(StringBuffer *p){
2383 if( p->len==0 ) return;
2384 if( safe_isspace(p->s[p->len-1]) ) return;
2385 append(p, " ");
2386}
2387
2388/*
2389** Remove white space from teh end of the StringBuffer
2390*/
2391static void trimWhiteSpace(StringBuffer *p){
2392 while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
2393 p->len--;
2394 }
2395}
2396
2397
2398
2399/*
2400** Allowed values for Snippet.aMatch[].snStatus
2401*/
2402#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
2403#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
2404
2405/*
2406** Generate the text of a snippet.
2407*/
2408static void snippetText(
2409 fulltext_cursor *pCursor, /* The cursor we need the snippet for */
2410 const char *zStartMark, /* Markup to appear before each match */
2411 const char *zEndMark, /* Markup to appear after each match */
2412 const char *zEllipsis /* Ellipsis mark */
2413){
2414 int i, j;
2415 struct snippetMatch *aMatch;
2416 int nMatch;
2417 int nDesired;
2418 StringBuffer sb;
2419 int tailCol;
2420 int tailOffset;
2421 int iCol;
2422 int nDoc;
2423 const char *zDoc;
2424 int iStart, iEnd;
2425 int tailEllipsis = 0;
2426 int iMatch;
2427
2428
2429 free(pCursor->snippet.zSnippet);
2430 pCursor->snippet.zSnippet = 0;
2431 aMatch = pCursor->snippet.aMatch;
2432 nMatch = pCursor->snippet.nMatch;
2433 initStringBuffer(&sb);
2434
2435 for(i=0; i<nMatch; i++){
2436 aMatch[i].snStatus = SNIPPET_IGNORE;
2437 }
2438 nDesired = 0;
2439 for(i=0; i<pCursor->q.nTerms; i++){
2440 for(j=0; j<nMatch; j++){
2441 if( aMatch[j].iTerm==i ){
2442 aMatch[j].snStatus = SNIPPET_DESIRED;
2443 nDesired++;
2444 break;
2445 }
2446 }
2447 }
2448
2449 iMatch = 0;
2450 tailCol = -1;
2451 tailOffset = 0;
2452 for(i=0; i<nMatch && nDesired>0; i++){
2453 if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
2454 nDesired--;
2455 iCol = aMatch[i].iCol;
2456 zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
2457 nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
2458 iStart = aMatch[i].iStart - 40;
2459 iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
2460 if( iStart<=10 ){
2461 iStart = 0;
2462 }
2463 if( iCol==tailCol && iStart<=tailOffset+20 ){
2464 iStart = tailOffset;
2465 }
2466 if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
2467 trimWhiteSpace(&sb);
2468 appendWhiteSpace(&sb);
2469 append(&sb, zEllipsis);
2470 appendWhiteSpace(&sb);
2471 }
2472 iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
2473 iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
2474 if( iEnd>=nDoc-10 ){
2475 iEnd = nDoc;
2476 tailEllipsis = 0;
2477 }else{
2478 tailEllipsis = 1;
2479 }
2480 while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
2481 while( iStart<iEnd ){
2482 while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
2483 && aMatch[iMatch].iCol<=iCol ){
2484 iMatch++;
2485 }
2486 if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
2487 && aMatch[iMatch].iCol==iCol ){
2488 nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
2489 iStart = aMatch[iMatch].iStart;
2490 append(&sb, zStartMark);
2491 nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
2492 append(&sb, zEndMark);
2493 iStart += aMatch[iMatch].nByte;
2494 for(j=iMatch+1; j<nMatch; j++){
2495 if( aMatch[j].iTerm==aMatch[iMatch].iTerm
2496 && aMatch[j].snStatus==SNIPPET_DESIRED ){
2497 nDesired--;
2498 aMatch[j].snStatus = SNIPPET_IGNORE;
2499 }
2500 }
2501 }else{
2502 nappend(&sb, &zDoc[iStart], iEnd - iStart);
2503 iStart = iEnd;
2504 }
2505 }
2506 tailCol = iCol;
2507 tailOffset = iEnd;
2508 }
2509 trimWhiteSpace(&sb);
2510 if( tailEllipsis ){
2511 appendWhiteSpace(&sb);
2512 append(&sb, zEllipsis);
2513 }
2514 pCursor->snippet.zSnippet = sb.s;
2515 pCursor->snippet.nSnippet = sb.len;
2516}
2517
2518
2519/*
2520** Close the cursor. For additional information see the documentation
2521** on the xClose method of the virtual table interface.
2522*/
2523static int fulltextClose(sqlite3_vtab_cursor *pCursor){
2524 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2525 TRACE(("FTS1 Close %p\n", c));
2526 sqlite3_finalize(c->pStmt);
2527 queryClear(&c->q);
2528 snippetClear(&c->snippet);
2529 if( c->result.pDoclist!=NULL ){
2530 docListDelete(c->result.pDoclist);
2531 }
2532 free(c);
2533 return SQLITE_OK;
2534}
2535
2536static int fulltextNext(sqlite3_vtab_cursor *pCursor){
2537 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2538 sqlite_int64 iDocid;
2539 int rc;
2540
2541 TRACE(("FTS1 Next %p\n", pCursor));
2542 snippetClear(&c->snippet);
2543 if( c->iCursorType < QUERY_FULLTEXT ){
2544 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
2545 rc = sqlite3_step(c->pStmt);
2546 switch( rc ){
2547 case SQLITE_ROW:
2548 c->eof = 0;
2549 return SQLITE_OK;
2550 case SQLITE_DONE:
2551 c->eof = 1;
2552 return SQLITE_OK;
2553 default:
2554 c->eof = 1;
2555 return rc;
2556 }
2557 } else { /* full-text query */
2558 rc = sqlite3_reset(c->pStmt);
2559 if( rc!=SQLITE_OK ) return rc;
2560
2561 iDocid = nextDocid(&c->result);
2562 if( iDocid==0 ){
2563 c->eof = 1;
2564 return SQLITE_OK;
2565 }
2566 rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
2567 if( rc!=SQLITE_OK ) return rc;
2568 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
2569 rc = sqlite3_step(c->pStmt);
2570 if( rc==SQLITE_ROW ){ /* the case we expect */
2571 c->eof = 0;
2572 return SQLITE_OK;
2573 }
2574 /* an error occurred; abort */
2575 return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
2576 }
2577}
2578
2579
2580/* Return a DocList corresponding to the query term *pTerm. If *pTerm
2581** is the first term of a phrase query, go ahead and evaluate the phrase
2582** query and return the doclist for the entire phrase query.
2583**
2584** The result is stored in pTerm->doclist.
2585*/
2586static int docListOfTerm(
2587 fulltext_vtab *v, /* The full text index */
2588 int iColumn, /* column to restrict to. No restrition if >=nColumn */
2589 QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
2590 DocList **ppResult /* Write the result here */
2591){
2592 DocList *pLeft, *pRight, *pNew;
2593 int i, rc;
2594
2595 pLeft = docListNew(DL_POSITIONS);
2596 rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
2597 if( rc ){
2598 docListDelete(pLeft);
2599 return rc;
2600 }
2601 for(i=1; i<=pQTerm->nPhrase; i++){
2602 pRight = docListNew(DL_POSITIONS);
2603 rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
2604 if( rc ){
2605 docListDelete(pLeft);
2606 return rc;
2607 }
2608 pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
2609 docListPhraseMerge(pLeft, pRight, pNew);
2610 docListDelete(pLeft);
2611 docListDelete(pRight);
2612 pLeft = pNew;
2613 }
2614 *ppResult = pLeft;
2615 return SQLITE_OK;
2616}
2617
2618/* Add a new term pTerm[0..nTerm-1] to the query *q.
2619*/
2620static void queryAdd(Query *q, const char *pTerm, int nTerm){
2621 QueryTerm *t;
2622 ++q->nTerms;
2623 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
2624 if( q->pTerms==0 ){
2625 q->nTerms = 0;
2626 return;
2627 }
2628 t = &q->pTerms[q->nTerms - 1];
2629 memset(t, 0, sizeof(*t));
2630 t->pTerm = malloc(nTerm+1);
2631 memcpy(t->pTerm, pTerm, nTerm);
2632 t->pTerm[nTerm] = 0;
2633 t->nTerm = nTerm;
2634 t->isOr = q->nextIsOr;
2635 q->nextIsOr = 0;
2636 t->iColumn = q->nextColumn;
2637 q->nextColumn = q->dfltColumn;
2638}
2639
2640/*
2641** Check to see if the string zToken[0...nToken-1] matches any
2642** column name in the virtual table. If it does,
2643** return the zero-indexed column number. If not, return -1.
2644*/
2645static int checkColumnSpecifier(
2646 fulltext_vtab *pVtab, /* The virtual table */
2647 const char *zToken, /* Text of the token */
2648 int nToken /* Number of characters in the token */
2649){
2650 int i;
2651 for(i=0; i<pVtab->nColumn; i++){
2652 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
2653 && pVtab->azColumn[i][nToken]==0 ){
2654 return i;
2655 }
2656 }
2657 return -1;
2658}
2659
2660/*
2661** Parse the text at pSegment[0..nSegment-1]. Add additional terms
2662** to the query being assemblied in pQuery.
2663**
2664** inPhrase is true if pSegment[0..nSegement-1] is contained within
2665** double-quotes. If inPhrase is true, then the first term
2666** is marked with the number of terms in the phrase less one and
2667** OR and "-" syntax is ignored. If inPhrase is false, then every
2668** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
2669*/
2670static int tokenizeSegment(
2671 sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
2672 const char *pSegment, int nSegment, /* Query expression being parsed */
2673 int inPhrase, /* True if within "..." */
2674 Query *pQuery /* Append results here */
2675){
2676 const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
2677 sqlite3_tokenizer_cursor *pCursor;
2678 int firstIndex = pQuery->nTerms;
2679 int iCol;
2680 int nTerm = 1;
2681
2682 int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
2683 if( rc!=SQLITE_OK ) return rc;
2684 pCursor->pTokenizer = pTokenizer;
2685
2686 while( 1 ){
2687 const char *pToken;
2688 int nToken, iBegin, iEnd, iPos;
2689
2690 rc = pModule->xNext(pCursor,
2691 &pToken, &nToken,
2692 &iBegin, &iEnd, &iPos);
2693 if( rc!=SQLITE_OK ) break;
2694 if( !inPhrase &&
2695 pSegment[iEnd]==':' &&
2696 (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
2697 pQuery->nextColumn = iCol;
2698 continue;
2699 }
2700 if( !inPhrase && pQuery->nTerms>0 && nToken==2
2701 && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
2702 pQuery->nextIsOr = 1;
2703 continue;
2704 }
2705 queryAdd(pQuery, pToken, nToken);
2706 if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
2707 pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
2708 }
2709 pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
2710 if( inPhrase ){
2711 nTerm++;
2712 }
2713 }
2714
2715 if( inPhrase && pQuery->nTerms>firstIndex ){
2716 pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
2717 }
2718
2719 return pModule->xClose(pCursor);
2720}
2721
2722/* Parse a query string, yielding a Query object pQuery.
2723**
2724** The calling function will need to queryClear() to clean up
2725** the dynamically allocated memory held by pQuery.
2726*/
2727static int parseQuery(
2728 fulltext_vtab *v, /* The fulltext index */
2729 const char *zInput, /* Input text of the query string */
2730 int nInput, /* Size of the input text */
2731 int dfltColumn, /* Default column of the index to match against */
2732 Query *pQuery /* Write the parse results here. */
2733){
2734 int iInput, inPhrase = 0;
2735
2736 if( zInput==0 ) nInput = 0;
2737 if( nInput<0 ) nInput = strlen(zInput);
2738 pQuery->nTerms = 0;
2739 pQuery->pTerms = NULL;
2740 pQuery->nextIsOr = 0;
2741 pQuery->nextColumn = dfltColumn;
2742 pQuery->dfltColumn = dfltColumn;
2743 pQuery->pFts = v;
2744
2745 for(iInput=0; iInput<nInput; ++iInput){
2746 int i;
2747 for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
2748 if( i>iInput ){
2749 tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
2750 pQuery);
2751 }
2752 iInput = i;
2753 if( i<nInput ){
2754 assert( zInput[i]=='"' );
2755 inPhrase = !inPhrase;
2756 }
2757 }
2758
2759 if( inPhrase ){
2760 /* unmatched quote */
2761 queryClear(pQuery);
2762 return SQLITE_ERROR;
2763 }
2764 return SQLITE_OK;
2765}
2766
2767/* Perform a full-text query using the search expression in
2768** zInput[0..nInput-1]. Return a list of matching documents
2769** in pResult.
2770**
2771** Queries must match column iColumn. Or if iColumn>=nColumn
2772** they are allowed to match against any column.
2773*/
2774static int fulltextQuery(
2775 fulltext_vtab *v, /* The full text index */
2776 int iColumn, /* Match against this column by default */
2777 const char *zInput, /* The query string */
2778 int nInput, /* Number of bytes in zInput[] */
2779 DocList **pResult, /* Write the result doclist here */
2780 Query *pQuery /* Put parsed query string here */
2781){
2782 int i, iNext, rc;
2783 DocList *pLeft = NULL;
2784 DocList *pRight, *pNew, *pOr;
2785 int nNot = 0;
2786 QueryTerm *aTerm;
2787
2788 rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
2789 if( rc!=SQLITE_OK ) return rc;
2790
2791 /* Merge AND terms. */
2792 aTerm = pQuery->pTerms;
2793 for(i = 0; i<pQuery->nTerms; i=iNext){
2794 if( aTerm[i].isNot ){
2795 /* Handle all NOT terms in a separate pass */
2796 nNot++;
2797 iNext = i + aTerm[i].nPhrase+1;
2798 continue;
2799 }
2800 iNext = i + aTerm[i].nPhrase + 1;
2801 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
2802 if( rc ){
2803 queryClear(pQuery);
2804 return rc;
2805 }
2806 while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
2807 rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
2808 iNext += aTerm[iNext].nPhrase + 1;
2809 if( rc ){
2810 queryClear(pQuery);
2811 return rc;
2812 }
2813 pNew = docListNew(DL_DOCIDS);
2814 docListOrMerge(pRight, pOr, pNew);
2815 docListDelete(pRight);
2816 docListDelete(pOr);
2817 pRight = pNew;
2818 }
2819 if( pLeft==0 ){
2820 pLeft = pRight;
2821 }else{
2822 pNew = docListNew(DL_DOCIDS);
2823 docListAndMerge(pLeft, pRight, pNew);
2824 docListDelete(pRight);
2825 docListDelete(pLeft);
2826 pLeft = pNew;
2827 }
2828 }
2829
2830 if( nNot && pLeft==0 ){
2831 /* We do not yet know how to handle a query of only NOT terms */
2832 return SQLITE_ERROR;
2833 }
2834
2835 /* Do the EXCEPT terms */
2836 for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
2837 if( !aTerm[i].isNot ) continue;
2838 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
2839 if( rc ){
2840 queryClear(pQuery);
2841 docListDelete(pLeft);
2842 return rc;
2843 }
2844 pNew = docListNew(DL_DOCIDS);
2845 docListExceptMerge(pLeft, pRight, pNew);
2846 docListDelete(pRight);
2847 docListDelete(pLeft);
2848 pLeft = pNew;
2849 }
2850
2851 *pResult = pLeft;
2852 return rc;
2853}
2854
2855/*
2856** This is the xFilter interface for the virtual table. See
2857** the virtual table xFilter method documentation for additional
2858** information.
2859**
2860** If idxNum==QUERY_GENERIC then do a full table scan against
2861** the %_content table.
2862**
2863** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
2864** in the %_content table.
2865**
2866** If idxNum>=QUERY_FULLTEXT then use the full text index. The
2867** column on the left-hand side of the MATCH operator is column
2868** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
2869** side of the MATCH operator.
2870*/
2871/* TODO(shess) Upgrade the cursor initialization and destruction to
2872** account for fulltextFilter() being called multiple times on the
2873** same cursor. The current solution is very fragile. Apply fix to
2874** fts2 as appropriate.
2875*/
2876static int fulltextFilter(
2877 sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
2878 int idxNum, const char *idxStr, /* Which indexing scheme to use */
2879 int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
2880){
2881 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2882 fulltext_vtab *v = cursor_vtab(c);
2883 int rc;
2884 char *zSql;
2885
2886 TRACE(("FTS1 Filter %p\n",pCursor));
2887
2888 zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
2889 idxNum==QUERY_GENERIC ? "" : "where rowid=?");
2890 sqlite3_finalize(c->pStmt);
2891 rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
2892 sqlite3_free(zSql);
2893 if( rc!=SQLITE_OK ) return rc;
2894
2895 c->iCursorType = idxNum;
2896 switch( idxNum ){
2897 case QUERY_GENERIC:
2898 break;
2899
2900 case QUERY_ROWID:
2901 rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
2902 if( rc!=SQLITE_OK ) return rc;
2903 break;
2904
2905 default: /* full-text search */
2906 {
2907 const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
2908 DocList *pResult;
2909 assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
2910 assert( argc==1 );
2911 queryClear(&c->q);
2912 rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
2913 if( rc!=SQLITE_OK ) return rc;
2914 if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
2915 readerInit(&c->result, pResult);
2916 break;
2917 }
2918 }
2919
2920 return fulltextNext(pCursor);
2921}
2922
2923/* This is the xEof method of the virtual table. The SQLite core
2924** calls this routine to find out if it has reached the end of
2925** a query's results set.
2926*/
2927static int fulltextEof(sqlite3_vtab_cursor *pCursor){
2928 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2929 return c->eof;
2930}
2931
2932/* This is the xColumn method of the virtual table. The SQLite
2933** core calls this method during a query when it needs the value
2934** of a column from the virtual table. This method needs to use
2935** one of the sqlite3_result_*() routines to store the requested
2936** value back in the pContext.
2937*/
2938static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
2939 sqlite3_context *pContext, int idxCol){
2940 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2941 fulltext_vtab *v = cursor_vtab(c);
2942
2943 if( idxCol<v->nColumn ){
2944 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
2945 sqlite3_result_value(pContext, pVal);
2946 }else if( idxCol==v->nColumn ){
2947 /* The extra column whose name is the same as the table.
2948 ** Return a blob which is a pointer to the cursor
2949 */
2950 sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
2951 }
2952 return SQLITE_OK;
2953}
2954
2955/* This is the xRowid method. The SQLite core calls this routine to
2956** retrive the rowid for the current row of the result set. The
2957** rowid should be written to *pRowid.
2958*/
2959static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
2960 fulltext_cursor *c = (fulltext_cursor *) pCursor;
2961
2962 *pRowid = sqlite3_column_int64(c->pStmt, 0);
2963 return SQLITE_OK;
2964}
2965
2966/* Add all terms in [zText] to the given hash table. If [iColumn] > 0,
2967 * we also store positions and offsets in the hash table using the given
2968 * column number. */
2969static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
2970 const char *zText, int iColumn){
2971 sqlite3_tokenizer *pTokenizer = v->pTokenizer;
2972 sqlite3_tokenizer_cursor *pCursor;
2973 const char *pToken;
2974 int nTokenBytes;
2975 int iStartOffset, iEndOffset, iPosition;
2976 int rc;
2977
2978 rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
2979 if( rc!=SQLITE_OK ) return rc;
2980
2981 pCursor->pTokenizer = pTokenizer;
2982 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
2983 &pToken, &nTokenBytes,
2984 &iStartOffset, &iEndOffset,
2985 &iPosition) ){
2986 DocList *p;
2987
2988 /* Positions can't be negative; we use -1 as a terminator internally. */
2989 if( iPosition<0 ){
2990 pTokenizer->pModule->xClose(pCursor);
2991 return SQLITE_ERROR;
2992 }
2993
2994 p = fts1HashFind(terms, pToken, nTokenBytes);
2995 if( p==NULL ){
2996 p = docListNew(DL_DEFAULT);
2997 docListAddDocid(p, iDocid);
2998 fts1HashInsert(terms, pToken, nTokenBytes, p);
2999 }
3000 if( iColumn>=0 ){
3001 docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
3002 }
3003 }
3004
3005 /* TODO(shess) Check return? Should this be able to cause errors at
3006 ** this point? Actually, same question about sqlite3_finalize(),
3007 ** though one could argue that failure there means that the data is
3008 ** not durable. *ponder*
3009 */
3010 pTokenizer->pModule->xClose(pCursor);
3011 return rc;
3012}
3013
3014/* Update the %_terms table to map the term [pTerm] to the given rowid. */
3015static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
3016 DocList *d){
3017 sqlite_int64 iIndexRow;
3018 DocList doclist;
3019 int iSegment = 0, rc;
3020
3021 rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
3022 if( rc==SQLITE_DONE ){
3023 docListInit(&doclist, DL_DEFAULT, 0, 0);
3024 docListUpdate(&doclist, d);
3025 /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
3026 rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
3027 goto err;
3028 }
3029 if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
3030
3031 docListUpdate(&doclist, d);
3032 if( doclist.nData<=CHUNK_MAX ){
3033 rc = term_update(v, iIndexRow, &doclist);
3034 goto err;
3035 }
3036
3037 /* Doclist doesn't fit, delete what's there, and accumulate
3038 ** forward.
3039 */
3040 rc = term_delete(v, iIndexRow);
3041 if( rc!=SQLITE_OK ) goto err;
3042
3043 /* Try to insert the doclist into a higher segment bucket. On
3044 ** failure, accumulate existing doclist with the doclist from that
3045 ** bucket, and put results in the next bucket.
3046 */
3047 iSegment++;
3048 while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
3049 &doclist))!=SQLITE_OK ){
3050 sqlite_int64 iSegmentRow;
3051 DocList old;
3052 int rc2;
3053
3054 /* Retain old error in case the term_insert() error was really an
3055 ** error rather than a bounced insert.
3056 */
3057 rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
3058 if( rc2!=SQLITE_ROW ) goto err;
3059
3060 rc = term_delete(v, iSegmentRow);
3061 if( rc!=SQLITE_OK ) goto err;
3062
3063 /* Reusing lowest-number deleted row keeps the index smaller. */
3064 if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;
3065
3066 /* doclist contains the newer data, so accumulate it over old.
3067 ** Then steal accumulated data for doclist.
3068 */
3069 docListAccumulate(&old, &doclist);
3070 docListDestroy(&doclist);
3071 doclist = old;
3072
3073 iSegment++;
3074 }
3075
3076 err:
3077 docListDestroy(&doclist);
3078 return rc;
3079}
3080
3081/* Add doclists for all terms in [pValues] to the hash table [terms]. */
3082static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid,
3083 sqlite3_value **pValues){
3084 int i;
3085 for(i = 0; i < v->nColumn ; ++i){
3086 char *zText = (char*)sqlite3_value_text(pValues[i]);
3087 int rc = buildTerms(v, terms, iRowid, zText, i);
3088 if( rc!=SQLITE_OK ) return rc;
3089 }
3090 return SQLITE_OK;
3091}
3092
3093/* Add empty doclists for all terms in the given row's content to the hash
3094 * table [pTerms]. */
3095static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
3096 const char **pValues;
3097 int i;
3098
3099 int rc = content_select(v, iRowid, &pValues);
3100 if( rc!=SQLITE_OK ) return rc;
3101
3102 for(i = 0 ; i < v->nColumn; ++i) {
3103 rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
3104 if( rc!=SQLITE_OK ) break;
3105 }
3106
3107 freeStringArray(v->nColumn, pValues);
3108 return SQLITE_OK;
3109}
3110
3111/* Insert a row into the %_content table; set *piRowid to be the ID of the
3112 * new row. Fill [pTerms] with new doclists for the %_term table. */
3113static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
3114 sqlite3_value **pValues,
3115 sqlite_int64 *piRowid, fts1Hash *pTerms){
3116 int rc;
3117
3118 rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
3119 if( rc!=SQLITE_OK ) return rc;
3120 *piRowid = sqlite3_last_insert_rowid(v->db);
3121 return insertTerms(v, pTerms, *piRowid, pValues);
3122}
3123
3124/* Delete a row from the %_content table; fill [pTerms] with empty doclists
3125 * to be written to the %_term table. */
3126static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
3127 int rc = deleteTerms(v, pTerms, iRow);
3128 if( rc!=SQLITE_OK ) return rc;
3129 return content_delete(v, iRow); /* execute an SQL DELETE */
3130}
3131
3132/* Update a row in the %_content table; fill [pTerms] with new doclists for the
3133 * %_term table. */
3134static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
3135 sqlite3_value **pValues, fts1Hash *pTerms){
3136 /* Generate an empty doclist for each term that previously appeared in this
3137 * row. */
3138 int rc = deleteTerms(v, pTerms, iRow);
3139 if( rc!=SQLITE_OK ) return rc;
3140
3141 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
3142 if( rc!=SQLITE_OK ) return rc;
3143
3144 /* Now add positions for terms which appear in the updated row. */
3145 return insertTerms(v, pTerms, iRow, pValues);
3146}
3147
3148/* This function implements the xUpdate callback; it's the top-level entry
3149 * point for inserting, deleting or updating a row in a full-text table. */
3150static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
3151 sqlite_int64 *pRowid){
3152 fulltext_vtab *v = (fulltext_vtab *) pVtab;
3153 fts1Hash terms; /* maps term string -> PosList */
3154 int rc;
3155 fts1HashElem *e;
3156
3157 TRACE(("FTS1 Update %p\n", pVtab));
3158
3159 fts1HashInit(&terms, FTS1_HASH_STRING, 1);
3160
3161 if( nArg<2 ){
3162 rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
3163 } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
3164 /* An update:
3165 * ppArg[0] = old rowid
3166 * ppArg[1] = new rowid
3167 * ppArg[2..2+v->nColumn-1] = values
3168 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
3169 */
3170 sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
3171 if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
3172 sqlite3_value_int64(ppArg[1]) != rowid ){
3173 rc = SQLITE_ERROR; /* we don't allow changing the rowid */
3174 } else {
3175 assert( nArg==2+v->nColumn+1);
3176 rc = index_update(v, rowid, &ppArg[2], &terms);
3177 }
3178 } else {
3179 /* An insert:
3180 * ppArg[1] = requested rowid
3181 * ppArg[2..2+v->nColumn-1] = values
3182 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
3183 */
3184 assert( nArg==2+v->nColumn+1);
3185 rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
3186 }
3187
3188 if( rc==SQLITE_OK ){
3189 /* Write updated doclists to disk. */
3190 for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
3191 DocList *p = fts1HashData(e);
3192 rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
3193 if( rc!=SQLITE_OK ) break;
3194 }
3195 }
3196
3197 /* clean up */
3198 for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
3199 DocList *p = fts1HashData(e);
3200 docListDelete(p);
3201 }
3202 fts1HashClear(&terms);
3203
3204 return rc;
3205}
3206
3207/*
3208** Implementation of the snippet() function for FTS1
3209*/
3210static void snippetFunc(
3211 sqlite3_context *pContext,
3212 int argc,
3213 sqlite3_value **argv
3214){
3215 fulltext_cursor *pCursor;
3216 if( argc<1 ) return;
3217 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
3218 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
3219 sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
3220 }else{
3221 const char *zStart = "<b>";
3222 const char *zEnd = "</b>";
3223 const char *zEllipsis = "<b>...</b>";
3224 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
3225 if( argc>=2 ){
3226 zStart = (const char*)sqlite3_value_text(argv[1]);
3227 if( argc>=3 ){
3228 zEnd = (const char*)sqlite3_value_text(argv[2]);
3229 if( argc>=4 ){
3230 zEllipsis = (const char*)sqlite3_value_text(argv[3]);
3231 }
3232 }
3233 }
3234 snippetAllOffsets(pCursor);
3235 snippetText(pCursor, zStart, zEnd, zEllipsis);
3236 sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
3237 pCursor->snippet.nSnippet, SQLITE_STATIC);
3238 }
3239}
3240
3241/*
3242** Implementation of the offsets() function for FTS1
3243*/
3244static void snippetOffsetsFunc(
3245 sqlite3_context *pContext,
3246 int argc,
3247 sqlite3_value **argv
3248){
3249 fulltext_cursor *pCursor;
3250 if( argc<1 ) return;
3251 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
3252 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
3253 sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
3254 }else{
3255 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
3256 snippetAllOffsets(pCursor);
3257 snippetOffsetText(&pCursor->snippet);
3258 sqlite3_result_text(pContext,
3259 pCursor->snippet.zOffset, pCursor->snippet.nOffset,
3260 SQLITE_STATIC);
3261 }
3262}
3263
3264/*
3265** This routine implements the xFindFunction method for the FTS1
3266** virtual table.
3267*/
3268static int fulltextFindFunction(
3269 sqlite3_vtab *pVtab,
3270 int nArg,
3271 const char *zName,
3272 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
3273 void **ppArg
3274){
3275 if( strcmp(zName,"snippet")==0 ){
3276 *pxFunc = snippetFunc;
3277 return 1;
3278 }else if( strcmp(zName,"offsets")==0 ){
3279 *pxFunc = snippetOffsetsFunc;
3280 return 1;
3281 }
3282 return 0;
3283}
3284
3285/*
3286** Rename an fts1 table.
3287*/
3288static int fulltextRename(
3289 sqlite3_vtab *pVtab,
3290 const char *zName
3291){
3292 fulltext_vtab *p = (fulltext_vtab *)pVtab;
3293 int rc = SQLITE_NOMEM;
3294 char *zSql = sqlite3_mprintf(
3295 "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
3296 "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
3297 , p->zDb, p->zName, zName
3298 , p->zDb, p->zName, zName
3299 );
3300 if( zSql ){
3301 rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
3302 sqlite3_free(zSql);
3303 }
3304 return rc;
3305}
3306
3307static const sqlite3_module fulltextModule = {
3308 /* iVersion */ 0,
3309 /* xCreate */ fulltextCreate,
3310 /* xConnect */ fulltextConnect,
3311 /* xBestIndex */ fulltextBestIndex,
3312 /* xDisconnect */ fulltextDisconnect,
3313 /* xDestroy */ fulltextDestroy,
3314 /* xOpen */ fulltextOpen,
3315 /* xClose */ fulltextClose,
3316 /* xFilter */ fulltextFilter,
3317 /* xNext */ fulltextNext,
3318 /* xEof */ fulltextEof,
3319 /* xColumn */ fulltextColumn,
3320 /* xRowid */ fulltextRowid,
3321 /* xUpdate */ fulltextUpdate,
3322 /* xBegin */ 0,
3323 /* xSync */ 0,
3324 /* xCommit */ 0,
3325 /* xRollback */ 0,
3326 /* xFindFunction */ fulltextFindFunction,
3327 /* xRename */ fulltextRename,
3328};
3329
3330int sqlite3Fts1Init(sqlite3 *db){
3331 sqlite3_overload_function(db, "snippet", -1);
3332 sqlite3_overload_function(db, "offsets", -1);
3333 return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
3334}
3335
3336#if !SQLITE_CORE
3337int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
3338 const sqlite3_api_routines *pApi){
3339 SQLITE_EXTENSION_INIT2(pApi)
3340 return sqlite3Fts1Init(db);
3341}
3342#endif
3343
3344#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h
new file mode 100644
index 0000000..d55e689
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1.h
@@ -0,0 +1,11 @@
1#include "sqlite3.h"
2
3#ifdef __cplusplus
4extern "C" {
5#endif /* __cplusplus */
6
7int sqlite3Fts1Init(sqlite3 *db);
8
9#ifdef __cplusplus
10} /* extern "C" */
11#endif /* __cplusplus */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c
new file mode 100644
index 0000000..463a52b
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.c
@@ -0,0 +1,369 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the implementation of generic hash-tables used in SQLite.
13** We've modified it slightly to serve as a standalone hash table
14** implementation for the full-text indexing module.
15*/
16#include <assert.h>
17#include <stdlib.h>
18#include <string.h>
19
20/*
21** The code in this file is only compiled if:
22**
23** * The FTS1 module is being built as an extension
24** (in which case SQLITE_CORE is not defined), or
25**
26** * The FTS1 module is being built into the core of
27** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
28*/
29#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
30
31
32#include "fts1_hash.h"
33
34static void *malloc_and_zero(int n){
35 void *p = malloc(n);
36 if( p ){
37 memset(p, 0, n);
38 }
39 return p;
40}
41
42/* Turn bulk memory into a hash table object by initializing the
43** fields of the Hash structure.
44**
45** "pNew" is a pointer to the hash table that is to be initialized.
46** keyClass is one of the constants
47** FTS1_HASH_BINARY or FTS1_HASH_STRING. The value of keyClass
48** determines what kind of key the hash table will use. "copyKey" is
49** true if the hash table should make its own private copy of keys and
50** false if it should just use the supplied pointer.
51*/
52void sqlite3Fts1HashInit(fts1Hash *pNew, int keyClass, int copyKey){
53 assert( pNew!=0 );
54 assert( keyClass>=FTS1_HASH_STRING && keyClass<=FTS1_HASH_BINARY );
55 pNew->keyClass = keyClass;
56 pNew->copyKey = copyKey;
57 pNew->first = 0;
58 pNew->count = 0;
59 pNew->htsize = 0;
60 pNew->ht = 0;
61 pNew->xMalloc = malloc_and_zero;
62 pNew->xFree = free;
63}
64
65/* Remove all entries from a hash table. Reclaim all memory.
66** Call this routine to delete a hash table or to reset a hash table
67** to the empty state.
68*/
69void sqlite3Fts1HashClear(fts1Hash *pH){
70 fts1HashElem *elem; /* For looping over all elements of the table */
71
72 assert( pH!=0 );
73 elem = pH->first;
74 pH->first = 0;
75 if( pH->ht ) pH->xFree(pH->ht);
76 pH->ht = 0;
77 pH->htsize = 0;
78 while( elem ){
79 fts1HashElem *next_elem = elem->next;
80 if( pH->copyKey && elem->pKey ){
81 pH->xFree(elem->pKey);
82 }
83 pH->xFree(elem);
84 elem = next_elem;
85 }
86 pH->count = 0;
87}
88
89/*
90** Hash and comparison functions when the mode is FTS1_HASH_STRING
91*/
92static int strHash(const void *pKey, int nKey){
93 const char *z = (const char *)pKey;
94 int h = 0;
95 if( nKey<=0 ) nKey = (int) strlen(z);
96 while( nKey > 0 ){
97 h = (h<<3) ^ h ^ *z++;
98 nKey--;
99 }
100 return h & 0x7fffffff;
101}
102static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
103 if( n1!=n2 ) return 1;
104 return strncmp((const char*)pKey1,(const char*)pKey2,n1);
105}
106
107/*
108** Hash and comparison functions when the mode is FTS1_HASH_BINARY
109*/
110static int binHash(const void *pKey, int nKey){
111 int h = 0;
112 const char *z = (const char *)pKey;
113 while( nKey-- > 0 ){
114 h = (h<<3) ^ h ^ *(z++);
115 }
116 return h & 0x7fffffff;
117}
118static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
119 if( n1!=n2 ) return 1;
120 return memcmp(pKey1,pKey2,n1);
121}
122
123/*
124** Return a pointer to the appropriate hash function given the key class.
125**
126** The C syntax in this function definition may be unfamilar to some
127** programmers, so we provide the following additional explanation:
128**
129** The name of the function is "hashFunction". The function takes a
130** single parameter "keyClass". The return value of hashFunction()
131** is a pointer to another function. Specifically, the return value
132** of hashFunction() is a pointer to a function that takes two parameters
133** with types "const void*" and "int" and returns an "int".
134*/
135static int (*hashFunction(int keyClass))(const void*,int){
136 if( keyClass==FTS1_HASH_STRING ){
137 return &strHash;
138 }else{
139 assert( keyClass==FTS1_HASH_BINARY );
140 return &binHash;
141 }
142}
143
144/*
145** Return a pointer to the appropriate hash function given the key class.
146**
147** For help in interpreted the obscure C code in the function definition,
148** see the header comment on the previous function.
149*/
150static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
151 if( keyClass==FTS1_HASH_STRING ){
152 return &strCompare;
153 }else{
154 assert( keyClass==FTS1_HASH_BINARY );
155 return &binCompare;
156 }
157}
158
159/* Link an element into the hash table
160*/
161static void insertElement(
162 fts1Hash *pH, /* The complete hash table */
163 struct _fts1ht *pEntry, /* The entry into which pNew is inserted */
164 fts1HashElem *pNew /* The element to be inserted */
165){
166 fts1HashElem *pHead; /* First element already in pEntry */
167 pHead = pEntry->chain;
168 if( pHead ){
169 pNew->next = pHead;
170 pNew->prev = pHead->prev;
171 if( pHead->prev ){ pHead->prev->next = pNew; }
172 else { pH->first = pNew; }
173 pHead->prev = pNew;
174 }else{
175 pNew->next = pH->first;
176 if( pH->first ){ pH->first->prev = pNew; }
177 pNew->prev = 0;
178 pH->first = pNew;
179 }
180 pEntry->count++;
181 pEntry->chain = pNew;
182}
183
184
185/* Resize the hash table so that it cantains "new_size" buckets.
186** "new_size" must be a power of 2. The hash table might fail
187** to resize if sqliteMalloc() fails.
188*/
189static void rehash(fts1Hash *pH, int new_size){
190 struct _fts1ht *new_ht; /* The new hash table */
191 fts1HashElem *elem, *next_elem; /* For looping over existing elements */
192 int (*xHash)(const void*,int); /* The hash function */
193
194 assert( (new_size & (new_size-1))==0 );
195 new_ht = (struct _fts1ht *)pH->xMalloc( new_size*sizeof(struct _fts1ht) );
196 if( new_ht==0 ) return;
197 if( pH->ht ) pH->xFree(pH->ht);
198 pH->ht = new_ht;
199 pH->htsize = new_size;
200 xHash = hashFunction(pH->keyClass);
201 for(elem=pH->first, pH->first=0; elem; elem = next_elem){
202 int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
203 next_elem = elem->next;
204 insertElement(pH, &new_ht[h], elem);
205 }
206}
207
208/* This function (for internal use only) locates an element in an
209** hash table that matches the given key. The hash for this key has
210** already been computed and is passed as the 4th parameter.
211*/
212static fts1HashElem *findElementGivenHash(
213 const fts1Hash *pH, /* The pH to be searched */
214 const void *pKey, /* The key we are searching for */
215 int nKey,
216 int h /* The hash for this key. */
217){
218 fts1HashElem *elem; /* Used to loop thru the element list */
219 int count; /* Number of elements left to test */
220 int (*xCompare)(const void*,int,const void*,int); /* comparison function */
221
222 if( pH->ht ){
223 struct _fts1ht *pEntry = &pH->ht[h];
224 elem = pEntry->chain;
225 count = pEntry->count;
226 xCompare = compareFunction(pH->keyClass);
227 while( count-- && elem ){
228 if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
229 return elem;
230 }
231 elem = elem->next;
232 }
233 }
234 return 0;
235}
236
237/* Remove a single entry from the hash table given a pointer to that
238** element and a hash on the element's key.
239*/
240static void removeElementGivenHash(
241 fts1Hash *pH, /* The pH containing "elem" */
242 fts1HashElem* elem, /* The element to be removed from the pH */
243 int h /* Hash value for the element */
244){
245 struct _fts1ht *pEntry;
246 if( elem->prev ){
247 elem->prev->next = elem->next;
248 }else{
249 pH->first = elem->next;
250 }
251 if( elem->next ){
252 elem->next->prev = elem->prev;
253 }
254 pEntry = &pH->ht[h];
255 if( pEntry->chain==elem ){
256 pEntry->chain = elem->next;
257 }
258 pEntry->count--;
259 if( pEntry->count<=0 ){
260 pEntry->chain = 0;
261 }
262 if( pH->copyKey && elem->pKey ){
263 pH->xFree(elem->pKey);
264 }
265 pH->xFree( elem );
266 pH->count--;
267 if( pH->count<=0 ){
268 assert( pH->first==0 );
269 assert( pH->count==0 );
270 fts1HashClear(pH);
271 }
272}
273
274/* Attempt to locate an element of the hash table pH with a key
275** that matches pKey,nKey. Return the data for this element if it is
276** found, or NULL if there is no match.
277*/
278void *sqlite3Fts1HashFind(const fts1Hash *pH, const void *pKey, int nKey){
279 int h; /* A hash on key */
280 fts1HashElem *elem; /* The element that matches key */
281 int (*xHash)(const void*,int); /* The hash function */
282
283 if( pH==0 || pH->ht==0 ) return 0;
284 xHash = hashFunction(pH->keyClass);
285 assert( xHash!=0 );
286 h = (*xHash)(pKey,nKey);
287 assert( (pH->htsize & (pH->htsize-1))==0 );
288 elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
289 return elem ? elem->data : 0;
290}
291
292/* Insert an element into the hash table pH. The key is pKey,nKey
293** and the data is "data".
294**
295** If no element exists with a matching key, then a new
296** element is created. A copy of the key is made if the copyKey
297** flag is set. NULL is returned.
298**
299** If another element already exists with the same key, then the
300** new data replaces the old data and the old data is returned.
301** The key is not copied in this instance. If a malloc fails, then
302** the new data is returned and the hash table is unchanged.
303**
304** If the "data" parameter to this function is NULL, then the
305** element corresponding to "key" is removed from the hash table.
306*/
307void *sqlite3Fts1HashInsert(
308 fts1Hash *pH, /* The hash table to insert into */
309 const void *pKey, /* The key */
310 int nKey, /* Number of bytes in the key */
311 void *data /* The data */
312){
313 int hraw; /* Raw hash value of the key */
314 int h; /* the hash of the key modulo hash table size */
315 fts1HashElem *elem; /* Used to loop thru the element list */
316 fts1HashElem *new_elem; /* New element added to the pH */
317 int (*xHash)(const void*,int); /* The hash function */
318
319 assert( pH!=0 );
320 xHash = hashFunction(pH->keyClass);
321 assert( xHash!=0 );
322 hraw = (*xHash)(pKey, nKey);
323 assert( (pH->htsize & (pH->htsize-1))==0 );
324 h = hraw & (pH->htsize-1);
325 elem = findElementGivenHash(pH,pKey,nKey,h);
326 if( elem ){
327 void *old_data = elem->data;
328 if( data==0 ){
329 removeElementGivenHash(pH,elem,h);
330 }else{
331 elem->data = data;
332 }
333 return old_data;
334 }
335 if( data==0 ) return 0;
336 new_elem = (fts1HashElem*)pH->xMalloc( sizeof(fts1HashElem) );
337 if( new_elem==0 ) return data;
338 if( pH->copyKey && pKey!=0 ){
339 new_elem->pKey = pH->xMalloc( nKey );
340 if( new_elem->pKey==0 ){
341 pH->xFree(new_elem);
342 return data;
343 }
344 memcpy((void*)new_elem->pKey, pKey, nKey);
345 }else{
346 new_elem->pKey = (void*)pKey;
347 }
348 new_elem->nKey = nKey;
349 pH->count++;
350 if( pH->htsize==0 ){
351 rehash(pH,8);
352 if( pH->htsize==0 ){
353 pH->count = 0;
354 pH->xFree(new_elem);
355 return data;
356 }
357 }
358 if( pH->count > pH->htsize ){
359 rehash(pH,pH->htsize*2);
360 }
361 assert( pH->htsize>0 );
362 assert( (pH->htsize & (pH->htsize-1))==0 );
363 h = hraw & (pH->htsize-1);
364 insertElement(pH, &pH->ht[h], new_elem);
365 new_elem->data = data;
366 return 0;
367}
368
369#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h
new file mode 100644
index 0000000..c31c430
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_hash.h
@@ -0,0 +1,112 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the header file for the generic hash-table implemenation
13** used in SQLite. We've modified it slightly to serve as a standalone
14** hash table implementation for the full-text indexing module.
15**
16*/
17#ifndef _FTS1_HASH_H_
18#define _FTS1_HASH_H_
19
20/* Forward declarations of structures. */
21typedef struct fts1Hash fts1Hash;
22typedef struct fts1HashElem fts1HashElem;
23
24/* A complete hash table is an instance of the following structure.
25** The internals of this structure are intended to be opaque -- client
26** code should not attempt to access or modify the fields of this structure
27** directly. Change this structure only by using the routines below.
28** However, many of the "procedures" and "functions" for modifying and
29** accessing this structure are really macros, so we can't really make
30** this structure opaque.
31*/
32struct fts1Hash {
33 char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
34 char copyKey; /* True if copy of key made on insert */
35 int count; /* Number of entries in this table */
36 fts1HashElem *first; /* The first element of the array */
37 void *(*xMalloc)(int); /* malloc() function to use */
38 void (*xFree)(void *); /* free() function to use */
39 int htsize; /* Number of buckets in the hash table */
40 struct _fts1ht { /* the hash table */
41 int count; /* Number of entries with this hash */
42 fts1HashElem *chain; /* Pointer to first entry with this hash */
43 } *ht;
44};
45
46/* Each element in the hash table is an instance of the following
47** structure. All elements are stored on a single doubly-linked list.
48**
49** Again, this structure is intended to be opaque, but it can't really
50** be opaque because it is used by macros.
51*/
52struct fts1HashElem {
53 fts1HashElem *next, *prev; /* Next and previous elements in the table */
54 void *data; /* Data associated with this element */
55 void *pKey; int nKey; /* Key associated with this element */
56};
57
58/*
59** There are 2 different modes of operation for a hash table:
60**
61** FTS1_HASH_STRING pKey points to a string that is nKey bytes long
62** (including the null-terminator, if any). Case
63** is respected in comparisons.
64**
65** FTS1_HASH_BINARY pKey points to binary data nKey bytes long.
66** memcmp() is used to compare keys.
67**
68** A copy of the key is made if the copyKey parameter to fts1HashInit is 1.
69*/
70#define FTS1_HASH_STRING 1
71#define FTS1_HASH_BINARY 2
72
73/*
74** Access routines. To delete, insert a NULL pointer.
75*/
76void sqlite3Fts1HashInit(fts1Hash*, int keytype, int copyKey);
77void *sqlite3Fts1HashInsert(fts1Hash*, const void *pKey, int nKey, void *pData);
78void *sqlite3Fts1HashFind(const fts1Hash*, const void *pKey, int nKey);
79void sqlite3Fts1HashClear(fts1Hash*);
80
81/*
82** Shorthand for the functions above
83*/
84#define fts1HashInit sqlite3Fts1HashInit
85#define fts1HashInsert sqlite3Fts1HashInsert
86#define fts1HashFind sqlite3Fts1HashFind
87#define fts1HashClear sqlite3Fts1HashClear
88
89/*
90** Macros for looping over all elements of a hash table. The idiom is
91** like this:
92**
93** fts1Hash h;
94** fts1HashElem *p;
95** ...
96** for(p=fts1HashFirst(&h); p; p=fts1HashNext(p)){
97** SomeStructure *pData = fts1HashData(p);
98** // do something with pData
99** }
100*/
101#define fts1HashFirst(H) ((H)->first)
102#define fts1HashNext(E) ((E)->next)
103#define fts1HashData(E) ((E)->data)
104#define fts1HashKey(E) ((E)->pKey)
105#define fts1HashKeysize(E) ((E)->nKey)
106
107/*
108** Number of entries in a hash table
109*/
110#define fts1HashCount(H) ((H)->count)
111
112#endif /* _FTS1_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c
new file mode 100644
index 0000000..1d26236
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_porter.c
@@ -0,0 +1,643 @@
1/*
2** 2006 September 30
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** Implementation of the full-text-search tokenizer that implements
13** a Porter stemmer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS1 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS1 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts1_tokenizer.h"
35
36/*
37** Class derived from sqlite3_tokenizer
38*/
39typedef struct porter_tokenizer {
40 sqlite3_tokenizer base; /* Base class */
41} porter_tokenizer;
42
43/*
44** Class derived from sqlit3_tokenizer_cursor
45*/
46typedef struct porter_tokenizer_cursor {
47 sqlite3_tokenizer_cursor base;
48 const char *zInput; /* input we are tokenizing */
49 int nInput; /* size of the input */
50 int iOffset; /* current position in zInput */
51 int iToken; /* index of next token to be returned */
52 char *zToken; /* storage for current token */
53 int nAllocated; /* space allocated to zToken buffer */
54} porter_tokenizer_cursor;
55
56
57/* Forward declaration */
58static const sqlite3_tokenizer_module porterTokenizerModule;
59
60
61/*
62** Create a new tokenizer instance.
63*/
64static int porterCreate(
65 int argc, const char * const *argv,
66 sqlite3_tokenizer **ppTokenizer
67){
68 porter_tokenizer *t;
69 t = (porter_tokenizer *) calloc(sizeof(*t), 1);
70 if( t==NULL ) return SQLITE_NOMEM;
71
72 *ppTokenizer = &t->base;
73 return SQLITE_OK;
74}
75
76/*
77** Destroy a tokenizer
78*/
79static int porterDestroy(sqlite3_tokenizer *pTokenizer){
80 free(pTokenizer);
81 return SQLITE_OK;
82}
83
84/*
85** Prepare to begin tokenizing a particular string. The input
86** string to be tokenized is zInput[0..nInput-1]. A cursor
87** used to incrementally tokenize this string is returned in
88** *ppCursor.
89*/
90static int porterOpen(
91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
92 const char *zInput, int nInput, /* String to be tokenized */
93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
94){
95 porter_tokenizer_cursor *c;
96
97 c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
98 if( c==NULL ) return SQLITE_NOMEM;
99
100 c->zInput = zInput;
101 if( zInput==0 ){
102 c->nInput = 0;
103 }else if( nInput<0 ){
104 c->nInput = (int)strlen(zInput);
105 }else{
106 c->nInput = nInput;
107 }
108 c->iOffset = 0; /* start tokenizing at the beginning */
109 c->iToken = 0;
110 c->zToken = NULL; /* no space allocated, yet. */
111 c->nAllocated = 0;
112
113 *ppCursor = &c->base;
114 return SQLITE_OK;
115}
116
117/*
118** Close a tokenization cursor previously opened by a call to
119** porterOpen() above.
120*/
121static int porterClose(sqlite3_tokenizer_cursor *pCursor){
122 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
123 free(c->zToken);
124 free(c);
125 return SQLITE_OK;
126}
127/*
128** Vowel or consonant
129*/
130static const char cType[] = {
131 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
132 1, 1, 1, 2, 1
133};
134
135/*
136** isConsonant() and isVowel() determine if their first character in
137** the string they point to is a consonant or a vowel, according
138** to Porter ruls.
139**
140** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
141** 'Y' is a consonant unless it follows another consonant,
142** in which case it is a vowel.
143**
144** In these routine, the letters are in reverse order. So the 'y' rule
145** is that 'y' is a consonant unless it is followed by another
146** consonent.
147*/
148static int isVowel(const char*);
149static int isConsonant(const char *z){
150 int j;
151 char x = *z;
152 if( x==0 ) return 0;
153 assert( x>='a' && x<='z' );
154 j = cType[x-'a'];
155 if( j<2 ) return j;
156 return z[1]==0 || isVowel(z + 1);
157}
158static int isVowel(const char *z){
159 int j;
160 char x = *z;
161 if( x==0 ) return 0;
162 assert( x>='a' && x<='z' );
163 j = cType[x-'a'];
164 if( j<2 ) return 1-j;
165 return isConsonant(z + 1);
166}
167
168/*
169** Let any sequence of one or more vowels be represented by V and let
170** C be sequence of one or more consonants. Then every word can be
171** represented as:
172**
173** [C] (VC){m} [V]
174**
175** In prose: A word is an optional consonant followed by zero or
176** vowel-consonant pairs followed by an optional vowel. "m" is the
177** number of vowel consonant pairs. This routine computes the value
178** of m for the first i bytes of a word.
179**
180** Return true if the m-value for z is 1 or more. In other words,
181** return true if z contains at least one vowel that is followed
182** by a consonant.
183**
184** In this routine z[] is in reverse order. So we are really looking
185** for an instance of of a consonant followed by a vowel.
186*/
187static int m_gt_0(const char *z){
188 while( isVowel(z) ){ z++; }
189 if( *z==0 ) return 0;
190 while( isConsonant(z) ){ z++; }
191 return *z!=0;
192}
193
194/* Like mgt0 above except we are looking for a value of m which is
195** exactly 1
196*/
197static int m_eq_1(const char *z){
198 while( isVowel(z) ){ z++; }
199 if( *z==0 ) return 0;
200 while( isConsonant(z) ){ z++; }
201 if( *z==0 ) return 0;
202 while( isVowel(z) ){ z++; }
203 if( *z==0 ) return 1;
204 while( isConsonant(z) ){ z++; }
205 return *z==0;
206}
207
208/* Like mgt0 above except we are looking for a value of m>1 instead
209** or m>0
210*/
211static int m_gt_1(const char *z){
212 while( isVowel(z) ){ z++; }
213 if( *z==0 ) return 0;
214 while( isConsonant(z) ){ z++; }
215 if( *z==0 ) return 0;
216 while( isVowel(z) ){ z++; }
217 if( *z==0 ) return 0;
218 while( isConsonant(z) ){ z++; }
219 return *z!=0;
220}
221
222/*
223** Return TRUE if there is a vowel anywhere within z[0..n-1]
224*/
225static int hasVowel(const char *z){
226 while( isConsonant(z) ){ z++; }
227 return *z!=0;
228}
229
230/*
231** Return TRUE if the word ends in a double consonant.
232**
233** The text is reversed here. So we are really looking at
234** the first two characters of z[].
235*/
236static int doubleConsonant(const char *z){
237 return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
238}
239
240/*
241** Return TRUE if the word ends with three letters which
242** are consonant-vowel-consonent and where the final consonant
243** is not 'w', 'x', or 'y'.
244**
245** The word is reversed here. So we are really checking the
246** first three letters and the first one cannot be in [wxy].
247*/
248static int star_oh(const char *z){
249 return
250 z[0]!=0 && isConsonant(z) &&
251 z[0]!='w' && z[0]!='x' && z[0]!='y' &&
252 z[1]!=0 && isVowel(z+1) &&
253 z[2]!=0 && isConsonant(z+2);
254}
255
256/*
257** If the word ends with zFrom and xCond() is true for the stem
258** of the word that preceeds the zFrom ending, then change the
259** ending to zTo.
260**
261** The input word *pz and zFrom are both in reverse order. zTo
262** is in normal order.
263**
264** Return TRUE if zFrom matches. Return FALSE if zFrom does not
265** match. Not that TRUE is returned even if xCond() fails and
266** no substitution occurs.
267*/
268static int stem(
269 char **pz, /* The word being stemmed (Reversed) */
270 const char *zFrom, /* If the ending matches this... (Reversed) */
271 const char *zTo, /* ... change the ending to this (not reversed) */
272 int (*xCond)(const char*) /* Condition that must be true */
273){
274 char *z = *pz;
275 while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
276 if( *zFrom!=0 ) return 0;
277 if( xCond && !xCond(z) ) return 1;
278 while( *zTo ){
279 *(--z) = *(zTo++);
280 }
281 *pz = z;
282 return 1;
283}
284
285/*
286** This is the fallback stemmer used when the porter stemmer is
287** inappropriate. The input word is copied into the output with
288** US-ASCII case folding. If the input word is too long (more
289** than 20 bytes if it contains no digits or more than 6 bytes if
290** it contains digits) then word is truncated to 20 or 6 bytes
291** by taking 10 or 3 bytes from the beginning and end.
292*/
293static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
294 int i, mx, j;
295 int hasDigit = 0;
296 for(i=0; i<nIn; i++){
297 int c = zIn[i];
298 if( c>='A' && c<='Z' ){
299 zOut[i] = c - 'A' + 'a';
300 }else{
301 if( c>='0' && c<='9' ) hasDigit = 1;
302 zOut[i] = c;
303 }
304 }
305 mx = hasDigit ? 3 : 10;
306 if( nIn>mx*2 ){
307 for(j=mx, i=nIn-mx; i<nIn; i++, j++){
308 zOut[j] = zOut[i];
309 }
310 i = j;
311 }
312 zOut[i] = 0;
313 *pnOut = i;
314}
315
316
317/*
318** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
319** zOut is at least big enough to hold nIn bytes. Write the actual
320** size of the output word (exclusive of the '\0' terminator) into *pnOut.
321**
322** Any upper-case characters in the US-ASCII character set ([A-Z])
323** are converted to lower case. Upper-case UTF characters are
324** unchanged.
325**
326** Words that are longer than about 20 bytes are stemmed by retaining
327** a few bytes from the beginning and the end of the word. If the
328** word contains digits, 3 bytes are taken from the beginning and
329** 3 bytes from the end. For long words without digits, 10 bytes
330** are taken from each end. US-ASCII case folding still applies.
331**
332** If the input word contains not digits but does characters not
333** in [a-zA-Z] then no stemming is attempted and this routine just
334** copies the input into the input into the output with US-ASCII
335** case folding.
336**
337** Stemming never increases the length of the word. So there is
338** no chance of overflowing the zOut buffer.
339*/
340static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
341 int i, j, c;
342 char zReverse[28];
343 char *z, *z2;
344 if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
345 /* The word is too big or too small for the porter stemmer.
346 ** Fallback to the copy stemmer */
347 copy_stemmer(zIn, nIn, zOut, pnOut);
348 return;
349 }
350 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
351 c = zIn[i];
352 if( c>='A' && c<='Z' ){
353 zReverse[j] = c + 'a' - 'A';
354 }else if( c>='a' && c<='z' ){
355 zReverse[j] = c;
356 }else{
357 /* The use of a character not in [a-zA-Z] means that we fallback
358 ** to the copy stemmer */
359 copy_stemmer(zIn, nIn, zOut, pnOut);
360 return;
361 }
362 }
363 memset(&zReverse[sizeof(zReverse)-5], 0, 5);
364 z = &zReverse[j+1];
365
366
367 /* Step 1a */
368 if( z[0]=='s' ){
369 if(
370 !stem(&z, "sess", "ss", 0) &&
371 !stem(&z, "sei", "i", 0) &&
372 !stem(&z, "ss", "ss", 0)
373 ){
374 z++;
375 }
376 }
377
378 /* Step 1b */
379 z2 = z;
380 if( stem(&z, "dee", "ee", m_gt_0) ){
381 /* Do nothing. The work was all in the test */
382 }else if(
383 (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
384 && z!=z2
385 ){
386 if( stem(&z, "ta", "ate", 0) ||
387 stem(&z, "lb", "ble", 0) ||
388 stem(&z, "zi", "ize", 0) ){
389 /* Do nothing. The work was all in the test */
390 }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
391 z++;
392 }else if( m_eq_1(z) && star_oh(z) ){
393 *(--z) = 'e';
394 }
395 }
396
397 /* Step 1c */
398 if( z[0]=='y' && hasVowel(z+1) ){
399 z[0] = 'i';
400 }
401
402 /* Step 2 */
403 switch( z[1] ){
404 case 'a':
405 stem(&z, "lanoita", "ate", m_gt_0) ||
406 stem(&z, "lanoit", "tion", m_gt_0);
407 break;
408 case 'c':
409 stem(&z, "icne", "ence", m_gt_0) ||
410 stem(&z, "icna", "ance", m_gt_0);
411 break;
412 case 'e':
413 stem(&z, "rezi", "ize", m_gt_0);
414 break;
415 case 'g':
416 stem(&z, "igol", "log", m_gt_0);
417 break;
418 case 'l':
419 stem(&z, "ilb", "ble", m_gt_0) ||
420 stem(&z, "illa", "al", m_gt_0) ||
421 stem(&z, "iltne", "ent", m_gt_0) ||
422 stem(&z, "ile", "e", m_gt_0) ||
423 stem(&z, "ilsuo", "ous", m_gt_0);
424 break;
425 case 'o':
426 stem(&z, "noitazi", "ize", m_gt_0) ||
427 stem(&z, "noita", "ate", m_gt_0) ||
428 stem(&z, "rota", "ate", m_gt_0);
429 break;
430 case 's':
431 stem(&z, "msila", "al", m_gt_0) ||
432 stem(&z, "ssenevi", "ive", m_gt_0) ||
433 stem(&z, "ssenluf", "ful", m_gt_0) ||
434 stem(&z, "ssensuo", "ous", m_gt_0);
435 break;
436 case 't':
437 stem(&z, "itila", "al", m_gt_0) ||
438 stem(&z, "itivi", "ive", m_gt_0) ||
439 stem(&z, "itilib", "ble", m_gt_0);
440 break;
441 }
442
443 /* Step 3 */
444 switch( z[0] ){
445 case 'e':
446 stem(&z, "etaci", "ic", m_gt_0) ||
447 stem(&z, "evita", "", m_gt_0) ||
448 stem(&z, "ezila", "al", m_gt_0);
449 break;
450 case 'i':
451 stem(&z, "itici", "ic", m_gt_0);
452 break;
453 case 'l':
454 stem(&z, "laci", "ic", m_gt_0) ||
455 stem(&z, "luf", "", m_gt_0);
456 break;
457 case 's':
458 stem(&z, "ssen", "", m_gt_0);
459 break;
460 }
461
462 /* Step 4 */
463 switch( z[1] ){
464 case 'a':
465 if( z[0]=='l' && m_gt_1(z+2) ){
466 z += 2;
467 }
468 break;
469 case 'c':
470 if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
471 z += 4;
472 }
473 break;
474 case 'e':
475 if( z[0]=='r' && m_gt_1(z+2) ){
476 z += 2;
477 }
478 break;
479 case 'i':
480 if( z[0]=='c' && m_gt_1(z+2) ){
481 z += 2;
482 }
483 break;
484 case 'l':
485 if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
486 z += 4;
487 }
488 break;
489 case 'n':
490 if( z[0]=='t' ){
491 if( z[2]=='a' ){
492 if( m_gt_1(z+3) ){
493 z += 3;
494 }
495 }else if( z[2]=='e' ){
496 stem(&z, "tneme", "", m_gt_1) ||
497 stem(&z, "tnem", "", m_gt_1) ||
498 stem(&z, "tne", "", m_gt_1);
499 }
500 }
501 break;
502 case 'o':
503 if( z[0]=='u' ){
504 if( m_gt_1(z+2) ){
505 z += 2;
506 }
507 }else if( z[3]=='s' || z[3]=='t' ){
508 stem(&z, "noi", "", m_gt_1);
509 }
510 break;
511 case 's':
512 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
513 z += 3;
514 }
515 break;
516 case 't':
517 stem(&z, "eta", "", m_gt_1) ||
518 stem(&z, "iti", "", m_gt_1);
519 break;
520 case 'u':
521 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
522 z += 3;
523 }
524 break;
525 case 'v':
526 case 'z':
527 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
528 z += 3;
529 }
530 break;
531 }
532
533 /* Step 5a */
534 if( z[0]=='e' ){
535 if( m_gt_1(z+1) ){
536 z++;
537 }else if( m_eq_1(z+1) && !star_oh(z+1) ){
538 z++;
539 }
540 }
541
542 /* Step 5b */
543 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
544 z++;
545 }
546
547 /* z[] is now the stemmed word in reverse order. Flip it back
548 ** around into forward order and return.
549 */
550 *pnOut = i = strlen(z);
551 zOut[i] = 0;
552 while( *z ){
553 zOut[--i] = *(z++);
554 }
555}
556
557/*
558** Characters that can be part of a token. We assume any character
559** whose value is greater than 0x80 (any UTF character) can be
560** part of a token. In other words, delimiters all must have
561** values of 0x7f or lower.
562*/
563static const char isIdChar[] = {
564/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
565 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
566 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
567 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
568 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
569 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
570};
571#define idChar(C) (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
572#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
573
574/*
575** Extract the next token from a tokenization cursor. The cursor must
576** have been opened by a prior call to porterOpen().
577*/
578static int porterNext(
579 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
580 const char **pzToken, /* OUT: *pzToken is the token text */
581 int *pnBytes, /* OUT: Number of bytes in token */
582 int *piStartOffset, /* OUT: Starting offset of token */
583 int *piEndOffset, /* OUT: Ending offset of token */
584 int *piPosition /* OUT: Position integer of token */
585){
586 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
587 const char *z = c->zInput;
588
589 while( c->iOffset<c->nInput ){
590 int iStartOffset, ch;
591
592 /* Scan past delimiter characters */
593 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
594 c->iOffset++;
595 }
596
597 /* Count non-delimiter characters. */
598 iStartOffset = c->iOffset;
599 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
600 c->iOffset++;
601 }
602
603 if( c->iOffset>iStartOffset ){
604 int n = c->iOffset-iStartOffset;
605 if( n>c->nAllocated ){
606 c->nAllocated = n+20;
607 c->zToken = realloc(c->zToken, c->nAllocated);
608 if( c->zToken==NULL ) return SQLITE_NOMEM;
609 }
610 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
611 *pzToken = c->zToken;
612 *piStartOffset = iStartOffset;
613 *piEndOffset = c->iOffset;
614 *piPosition = c->iToken++;
615 return SQLITE_OK;
616 }
617 }
618 return SQLITE_DONE;
619}
620
621/*
622** The set of routines that implement the porter-stemmer tokenizer
623*/
624static const sqlite3_tokenizer_module porterTokenizerModule = {
625 0,
626 porterCreate,
627 porterDestroy,
628 porterOpen,
629 porterClose,
630 porterNext,
631};
632
633/*
634** Allocate a new porter tokenizer. Return a pointer to the new
635** tokenizer in *ppModule
636*/
637void sqlite3Fts1PorterTokenizerModule(
638 sqlite3_tokenizer_module const**ppModule
639){
640 *ppModule = &porterTokenizerModule;
641}
642
643#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h
new file mode 100644
index 0000000..a48cb74
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer.h
@@ -0,0 +1,90 @@
1/*
2** 2006 July 10
3**
4** The author disclaims copyright to this source code.
5**
6*************************************************************************
7** Defines the interface to tokenizers used by fulltext-search. There
8** are three basic components:
9**
10** sqlite3_tokenizer_module is a singleton defining the tokenizer
11** interface functions. This is essentially the class structure for
12** tokenizers.
13**
14** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
15** including customization information defined at creation time.
16**
17** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
18** tokens from a particular input.
19*/
20#ifndef _FTS1_TOKENIZER_H_
21#define _FTS1_TOKENIZER_H_
22
23/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
24** If tokenizers are to be allowed to call sqlite3_*() functions, then
25** we will need a way to register the API consistently.
26*/
27#include "sqlite3.h"
28
29/*
30** Structures used by the tokenizer interface.
31*/
32typedef struct sqlite3_tokenizer sqlite3_tokenizer;
33typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
34typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
35
36struct sqlite3_tokenizer_module {
37 int iVersion; /* currently 0 */
38
39 /*
40 ** Create and destroy a tokenizer. argc/argv are passed down from
41 ** the fulltext virtual table creation to allow customization.
42 */
43 int (*xCreate)(int argc, const char *const*argv,
44 sqlite3_tokenizer **ppTokenizer);
45 int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
46
47 /*
48 ** Tokenize a particular input. Call xOpen() to prepare to
49 ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
50 ** xClose() to free any internal state. The pInput passed to
51 ** xOpen() must exist until the cursor is closed. The ppToken
52 ** result from xNext() is only valid until the next call to xNext()
53 ** or until xClose() is called.
54 */
55 /* TODO(shess) current implementation requires pInput to be
56 ** nul-terminated. This should either be fixed, or pInput/nBytes
57 ** should be converted to zInput.
58 */
59 int (*xOpen)(sqlite3_tokenizer *pTokenizer,
60 const char *pInput, int nBytes,
61 sqlite3_tokenizer_cursor **ppCursor);
62 int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
63 int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
64 const char **ppToken, int *pnBytes,
65 int *piStartOffset, int *piEndOffset, int *piPosition);
66};
67
68struct sqlite3_tokenizer {
69 const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
70 /* Tokenizer implementations will typically add additional fields */
71};
72
73struct sqlite3_tokenizer_cursor {
74 sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
75 /* Tokenizer implementations will typically add additional fields */
76};
77
78/*
79** Get the module for a tokenizer which generates tokens based on a
80** set of non-token characters. The default is to break tokens at any
81** non-alnum character, though the set of delimiters can also be
82** specified by the first argv argument to xCreate().
83*/
84/* TODO(shess) This doesn't belong here. Need some sort of
85** registration process.
86*/
87void sqlite3Fts1SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
88void sqlite3Fts1PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
89
90#endif /* _FTS1_TOKENIZER_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c
new file mode 100644
index 0000000..f58fba8
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts1/fts1_tokenizer1.c
@@ -0,0 +1,221 @@
1/*
2** The author disclaims copyright to this source code.
3**
4*************************************************************************
5** Implementation of the "simple" full-text-search tokenizer.
6*/
7
8/*
9** The code in this file is only compiled if:
10**
11** * The FTS1 module is being built as an extension
12** (in which case SQLITE_CORE is not defined), or
13**
14** * The FTS1 module is being built into the core of
15** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
16*/
17#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
18
19
20#include <assert.h>
21#include <stdlib.h>
22#include <stdio.h>
23#include <string.h>
24#include <ctype.h>
25
26#include "fts1_tokenizer.h"
27
28typedef struct simple_tokenizer {
29 sqlite3_tokenizer base;
30 char delim[128]; /* flag ASCII delimiters */
31} simple_tokenizer;
32
33typedef struct simple_tokenizer_cursor {
34 sqlite3_tokenizer_cursor base;
35 const char *pInput; /* input we are tokenizing */
36 int nBytes; /* size of the input */
37 int iOffset; /* current position in pInput */
38 int iToken; /* index of next token to be returned */
39 char *pToken; /* storage for current token */
40 int nTokenAllocated; /* space allocated to zToken buffer */
41} simple_tokenizer_cursor;
42
43
44/* Forward declaration */
45static const sqlite3_tokenizer_module simpleTokenizerModule;
46
47static int isDelim(simple_tokenizer *t, unsigned char c){
48 return c<0x80 && t->delim[c];
49}
50
51/*
52** Create a new tokenizer instance.
53*/
54static int simpleCreate(
55 int argc, const char * const *argv,
56 sqlite3_tokenizer **ppTokenizer
57){
58 simple_tokenizer *t;
59
60 t = (simple_tokenizer *) calloc(sizeof(*t), 1);
61 if( t==NULL ) return SQLITE_NOMEM;
62
63 /* TODO(shess) Delimiters need to remain the same from run to run,
64 ** else we need to reindex. One solution would be a meta-table to
65 ** track such information in the database, then we'd only want this
66 ** information on the initial create.
67 */
68 if( argc>1 ){
69 int i, n = strlen(argv[1]);
70 for(i=0; i<n; i++){
71 unsigned char ch = argv[1][i];
72 /* We explicitly don't support UTF-8 delimiters for now. */
73 if( ch>=0x80 ){
74 free(t);
75 return SQLITE_ERROR;
76 }
77 t->delim[ch] = 1;
78 }
79 } else {
80 /* Mark non-alphanumeric ASCII characters as delimiters */
81 int i;
82 for(i=1; i<0x80; i++){
83 t->delim[i] = !isalnum(i);
84 }
85 }
86
87 *ppTokenizer = &t->base;
88 return SQLITE_OK;
89}
90
91/*
92** Destroy a tokenizer
93*/
94static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
95 free(pTokenizer);
96 return SQLITE_OK;
97}
98
99/*
100** Prepare to begin tokenizing a particular string. The input
101** string to be tokenized is pInput[0..nBytes-1]. A cursor
102** used to incrementally tokenize this string is returned in
103** *ppCursor.
104*/
105static int simpleOpen(
106 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
107 const char *pInput, int nBytes, /* String to be tokenized */
108 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
109){
110 simple_tokenizer_cursor *c;
111
112 c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
113 if( c==NULL ) return SQLITE_NOMEM;
114
115 c->pInput = pInput;
116 if( pInput==0 ){
117 c->nBytes = 0;
118 }else if( nBytes<0 ){
119 c->nBytes = (int)strlen(pInput);
120 }else{
121 c->nBytes = nBytes;
122 }
123 c->iOffset = 0; /* start tokenizing at the beginning */
124 c->iToken = 0;
125 c->pToken = NULL; /* no space allocated, yet. */
126 c->nTokenAllocated = 0;
127
128 *ppCursor = &c->base;
129 return SQLITE_OK;
130}
131
132/*
133** Close a tokenization cursor previously opened by a call to
134** simpleOpen() above.
135*/
136static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
137 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
138 free(c->pToken);
139 free(c);
140 return SQLITE_OK;
141}
142
143/*
144** Extract the next token from a tokenization cursor. The cursor must
145** have been opened by a prior call to simpleOpen().
146*/
147static int simpleNext(
148 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
149 const char **ppToken, /* OUT: *ppToken is the token text */
150 int *pnBytes, /* OUT: Number of bytes in token */
151 int *piStartOffset, /* OUT: Starting offset of token */
152 int *piEndOffset, /* OUT: Ending offset of token */
153 int *piPosition /* OUT: Position integer of token */
154){
155 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
156 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
157 unsigned char *p = (unsigned char *)c->pInput;
158
159 while( c->iOffset<c->nBytes ){
160 int iStartOffset;
161
162 /* Scan past delimiter characters */
163 while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
164 c->iOffset++;
165 }
166
167 /* Count non-delimiter characters. */
168 iStartOffset = c->iOffset;
169 while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
170 c->iOffset++;
171 }
172
173 if( c->iOffset>iStartOffset ){
174 int i, n = c->iOffset-iStartOffset;
175 if( n>c->nTokenAllocated ){
176 c->nTokenAllocated = n+20;
177 c->pToken = realloc(c->pToken, c->nTokenAllocated);
178 if( c->pToken==NULL ) return SQLITE_NOMEM;
179 }
180 for(i=0; i<n; i++){
181 /* TODO(shess) This needs expansion to handle UTF-8
182 ** case-insensitivity.
183 */
184 unsigned char ch = p[iStartOffset+i];
185 c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
186 }
187 *ppToken = c->pToken;
188 *pnBytes = n;
189 *piStartOffset = iStartOffset;
190 *piEndOffset = c->iOffset;
191 *piPosition = c->iToken++;
192
193 return SQLITE_OK;
194 }
195 }
196 return SQLITE_DONE;
197}
198
199/*
200** The set of routines that implement the simple tokenizer
201*/
202static const sqlite3_tokenizer_module simpleTokenizerModule = {
203 0,
204 simpleCreate,
205 simpleDestroy,
206 simpleOpen,
207 simpleClose,
208 simpleNext,
209};
210
211/*
212** Allocate a new simple tokenizer. Return a pointer to the new
213** tokenizer in *ppModule
214*/
215void sqlite3Fts1SimpleTokenizerModule(
216 sqlite3_tokenizer_module const**ppModule
217){
218 *ppModule = &simpleTokenizerModule;
219}
220
221#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */