Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext/fts3')
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers   134
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt            4
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c             5971
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h               26
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c         373
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h         110
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c          257
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c       642
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c    371
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h    145
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c   229
-rw-r--r--  libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl      116
12 files changed, 8378 insertions, 0 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers
new file mode 100644
index 0000000..f214b24
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers
@@ -0,0 +1,134 @@

1. FTS3 Tokenizers

  When creating a new full-text table, FTS3 allows the user to select
  the text tokenizer implementation to be used when indexing text
  by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
  statement:

    CREATE VIRTUAL TABLE <table-name> USING fts3(
      <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
    );

  The built-in tokenizers (valid values to pass as <tokenizer-name>) are
  "simple" and "porter".

  <tokenizer-args> should consist of zero or more white-space separated
  arguments to pass to the selected tokenizer implementation. The
  interpretation of the arguments, if any, depends on the individual
  tokenizer.
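
  For example, a table using the porter tokenizer could be created as
  follows (the table and column names here are illustrative only):

    CREATE VIRTUAL TABLE papers USING fts3(
      title, body, tokenizer porter
    );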

2. Custom Tokenizers

  FTS3 allows users to provide custom tokenizer implementations. The
  interface used to create a new tokenizer is defined and described in
  the fts3_tokenizer.h source file.

  Registering a new FTS3 tokenizer is similar to registering a new
  virtual table module with SQLite. The user passes a pointer to a
  structure containing pointers to various callback functions that
  make up the implementation of the new tokenizer type. For tokenizers,
  the structure (defined in fts3_tokenizer.h) is called
  "sqlite3_tokenizer_module".

  FTS3 does not expose a C-function that users call to register new
  tokenizer types with a database handle. Instead, the pointer must
  be encoded as an SQL blob value and passed to FTS3 through the SQL
  engine by evaluating a special scalar function, "fts3_tokenizer()".
  The fts3_tokenizer() function may be called with one or two arguments,
  as follows:

    SELECT fts3_tokenizer(<tokenizer-name>);
    SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);

  Where <tokenizer-name> is a string identifying the tokenizer and
  <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
  structure encoded as an SQL blob. If the second argument is present,
  it is registered as tokenizer <tokenizer-name> and a copy of it is
  returned. If only one argument is passed, a pointer to the tokenizer
  implementation currently registered as <tokenizer-name> is returned,
  encoded as a blob. Or, if no such tokenizer exists, an SQL exception
  (error) is raised.

  SECURITY: If the fts3 extension is used in an environment where
  potentially malicious users may execute arbitrary SQL (e.g. Google
  Gears), they should be prevented from invoking the fts3_tokenizer()
  function, possibly using the authorisation callback.
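
  For example, an application might install an authorizer callback that
  denies the function. The following is a minimal sketch only, assuming
  the SQLITE_FUNCTION authorizer code is available in the build (see
  sqlite3_set_authorizer() in sqlite3.h); it is not part of the FTS3
  sources:

    static int denyFts3Tokenizer(
      void *pArg, int op,
      const char *z1, const char *z2, const char *z3, const char *z4
    ){
      /* For op==SQLITE_FUNCTION the function name arrives in z2. */
      if( op==SQLITE_FUNCTION && z2 && 0==strcmp(z2, "fts3_tokenizer") ){
        return SQLITE_DENY;
      }
      return SQLITE_OK;
    }

    /* ... after opening database handle db ... */
    sqlite3_set_authorizer(db, denyFts3Tokenizer, 0);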

  See "Sample code" below for an example of calling the fts3_tokenizer()
  function from C code.

3. ICU Library Tokenizers

  If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
  symbol defined, then there exists a built-in tokenizer named "icu"
  implemented using the ICU library. The first argument passed to the
  xCreate() method (see fts3_tokenizer.h) of this tokenizer may be
  an ICU locale identifier. For example "tr_TR" for Turkish as used
  in Turkey, or "en_AU" for English as used in Australia. For example:

    "CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenizer icu th_TH)"

  The ICU tokenizer implementation is very simple. It splits the input
  text according to the ICU rules for finding word boundaries and discards
  any tokens that consist entirely of white-space. This may be suitable
  for some applications in some locales, but not all. If more complex
  processing is required, for example to implement stemming or
  discard punctuation, this can be done by creating a tokenizer
  implementation that uses the ICU tokenizer as part of its
  implementation.

  When using the ICU tokenizer this way, it is safe to overwrite the
  contents of the strings returned by the xNext() method (see
  fts3_tokenizer.h).

4. Sample code.

  The following two code samples illustrate the way C code should invoke
  the fts3_tokenizer() scalar function:

    int registerTokenizer(
      sqlite3 *db,
      char *zName,
      const sqlite3_tokenizer_module *p
    ){
      int rc;
      sqlite3_stmt *pStmt;
      const char zSql[] = "SELECT fts3_tokenizer(?, ?)";

      rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
      sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
      sqlite3_step(pStmt);

      return sqlite3_finalize(pStmt);
    }

    int queryTokenizer(
      sqlite3 *db,
      char *zName,
      const sqlite3_tokenizer_module **pp
    ){
      int rc;
      sqlite3_stmt *pStmt;
      const char zSql[] = "SELECT fts3_tokenizer(?)";

      *pp = 0;
      rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
      if( SQLITE_ROW==sqlite3_step(pStmt) ){
        if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
          memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
        }
      }

      return sqlite3_finalize(pStmt);
    }
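  A caller might then combine the two helpers along these lines (a
  sketch only; "simple_tokenizer_module" is an illustrative name and is
  not defined in this file):

    const sqlite3_tokenizer_module *p = &simple_tokenizer_module;
    int rc = registerTokenizer(db, "mytokenizer", p);
    if( rc==SQLITE_OK ){
      rc = queryTokenizer(db, "mytokenizer", &p);
    }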
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt
new file mode 100644
index 0000000..517a2a0
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.txt
@@ -0,0 +1,4 @@
This folder contains source code to the second full-text search
extension for SQLite. While the API is the same, this version uses a
substantially different storage schema from fts1, so tables will need
to be rebuilt.
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c
new file mode 100644
index 0000000..b392919
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.c
@@ -0,0 +1,5971 @@
/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This is an SQLite module implementing full-text search.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS3 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS3 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/

/* TODO(shess) Consider exporting this comment to an HTML file or the
** wiki.
*/
/* The full-text index is stored in a series of b+tree (-like)
** structures called segments which map terms to doclists.  The
** structures are like b+trees in layout, but are constructed from the
** bottom up in optimal fashion and are not updatable.  Since trees
** are built from the bottom up, things will be described from the
** bottom up.
**
**
**** Varints ****
** The basic unit of encoding is a variable-length integer called a
** varint.  We encode variable-length integers in little-endian order
** using seven bits per byte as follows:
**
** KEY:
**         A = 0xxxxxxx    7 bits of data and one flag bit
**         B = 1xxxxxxx    7 bits of data and one flag bit
**
**  7 bits - A
** 14 bits - BA
** 21 bits - BBA
** and so on.
**
** This is similar to how SQLite itself encodes varints (see util.c),
** except that the core's format stores the most significant bits first.
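**
** As an illustrative example: the value 1000 (binary 1111101000)
** encodes in two bytes, low-order seven bits first: 0xE8 (continuation
** bit set, data bits 1101000) followed by 0x07 (continuation bit
** clear, data bits 0000111).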
**
**
**** Document lists ****
** A doclist (document list) holds a docid-sorted list of hits for a
** given term.  Doclists hold docids, and can optionally associate
** token positions and offsets with docids.
**
** A DL_POSITIONS_OFFSETS doclist is stored like this:
**
** array {
**   varint docid;
**   array {                (position list for column 0)
**     varint position;     (delta from previous position plus POS_BASE)
**     varint startOffset;  (delta from previous startOffset)
**     varint endOffset;    (delta from startOffset)
**   }
**   array {
**     varint POS_COLUMN;   (marks start of position list for new column)
**     varint column;       (index of new column)
**     array {
**       varint position;   (delta from previous position plus POS_BASE)
**       varint startOffset;(delta from previous startOffset)
**       varint endOffset;  (delta from startOffset)
**     }
**   }
**   varint POS_END;        (marks end of positions for this document)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.  A "position" is an index of a token in the token stream
** generated by the tokenizer, while an "offset" is a byte offset,
** both based at 0.  Note that POS_END and POS_COLUMN occur in the
** same logical place as the position element, and act as sentinels
** ending a position list array.
**
** A DL_POSITIONS doclist omits the startOffset and endOffset
** information.  A DL_DOCIDS doclist omits both the position and
** offset information, becoming an array of varint-encoded docids.
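**
** As an illustrative example: a DL_POSITIONS doclist for docid 5 with
** hits at positions 1 and 4 of column 0 is the four bytes 05 03 05 00,
** i.e. varint(5) for the docid, varint(POS_BASE+1)==3 for position 1,
** varint(POS_BASE+3)==5 for the delta to position 4, and
** varint(POS_END)==0 as the terminator.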
**
** On-disk data is stored as type DL_DEFAULT, so we don't serialize
** the type.  Due to how deletion is implemented in the segmentation
** system, on-disk doclists MUST store at least positions.
**
**
**** Segment leaf nodes ****
** Segment leaf nodes store terms and doclists, ordered by term.  Leaf
** nodes are written using LeafWriter, and read using LeafReader (to
** iterate through a single leaf node's data) and LeavesReader (to
** iterate through a segment's entire leaf layer).  Leaf nodes have
** the format:
**
** varint iHeight;             (height from leaf level, always 0)
** varint nTerm;               (length of first term)
** char pTerm[nTerm];          (content of first term)
** varint nDoclist;            (length of term's associated doclist)
** char pDoclist[nDoclist];    (content of doclist)
** array {
**                             (further terms are delta-encoded)
**   varint nPrefix;           (length of prefix shared with previous term)
**   varint nSuffix;           (length of unshared suffix)
**   char pTermSuffix[nSuffix];(unshared suffix of next term)
**   varint nDoclist;          (length of term's associated doclist)
**   char pDoclist[nDoclist];  (content of doclist)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.
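**
** As an illustrative example: if the first term is "linux" and the
** next term is "linuxes", the second entry stores nPrefix==5,
** nSuffix==2, and the two suffix bytes "es".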
**
** Leaf nodes are broken into blocks which are stored contiguously in
** the %_segments table in sorted order.  This means that when the end
** of a node is reached, the next term is in the node with the next
** greater node id.
**
** New data is spilled to a new leaf node when the current node
** exceeds LEAF_MAX bytes (default 2048).  New data which itself is
** larger than STANDALONE_MIN (default 1024) is placed in a standalone
** node (a leaf node with a single term and doclist).  The goal of
** these settings is to pack together groups of small doclists while
** making it efficient to directly access large doclists.  The
** assumption is that large doclists represent terms which are more
** likely to be query targets.
**
** TODO(shess) It may be useful for blocking decisions to be more
** dynamic.  For instance, it may make more sense to have a 2.5k leaf
** node rather than splitting into 2k and .5k nodes.  My intuition is
** that this might extend through 2x or 4x the pagesize.
**
**
**** Segment interior nodes ****
** Segment interior nodes store blockids for subtree nodes and terms
** to describe what data is stored by each subtree.  Interior
** nodes are written using InteriorWriter, and read using
** InteriorReader.  InteriorWriters are created as needed when
** SegmentWriter creates new leaf nodes, or when an interior node
** itself grows too big and must be split.  The format of interior
** nodes:
**
** varint iHeight;           (height from leaf level, always >0)
** varint iBlockid;          (block id of node's leftmost subtree)
** optional {
**   varint nTerm;           (length of first term)
**   char pTerm[nTerm];      (content of first term)
**   array {
**                                (further terms are delta-encoded)
**     varint nPrefix;            (length of shared prefix with previous term)
**     varint nSuffix;            (length of unshared suffix)
**     char pTermSuffix[nSuffix]; (unshared suffix of next term)
**   }
** }
**
** Here, optional { X } means an optional element, while array { X }
** means zero or more occurrences of X, adjacent in memory.
**
** An interior node encodes n terms separating n+1 subtrees.  The
** subtree blocks are contiguous, so only the first subtree's blockid
** is encoded.  The subtree at iBlockid will contain all terms less
** than the first term encoded (or all terms if no term is encoded).
** Otherwise, for terms greater than or equal to pTerm[i] but less
** than pTerm[i+1], the subtree for that term will be rooted at
** iBlockid+i.  Interior nodes only store enough term data to
** distinguish adjacent children (if the rightmost term of the left
** child is "something", and the leftmost term of the right child is
** "wicked", only "w" is stored).
**
** New data is spilled to a new interior node at the same height when
** the current node exceeds INTERIOR_MAX bytes (default 2048).
** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
** interior nodes and making the tree too skinny.  The interior nodes
** at a given height are naturally tracked by interior nodes at
** height+1, and so on.
**
**
**** Segment directory ****
** The segment directory in table %_segdir stores meta-information for
** merging and deleting segments, and also the root node of the
** segment's tree.
**
** The root node is the top node of the segment's tree after encoding
** the entire segment, restricted to ROOT_MAX bytes (default 1024).
** This could be either a leaf node or an interior node.  If the top
** node requires more than ROOT_MAX bytes, it is flushed to %_segments
** and a new root interior node is generated (which should always fit
** within ROOT_MAX because it only needs space for 2 varints, the
** height and the blockid of the previous root).
**
** The meta-information in the segment directory is:
**   level             - segment level (see below)
**   idx               - index within level
**                     - (level,idx uniquely identify a segment)
**   start_block       - first leaf node
**   leaves_end_block  - last leaf node
**   end_block         - last block (including interior nodes)
**   root              - contents of root node
**
** If the root node is a leaf node, then start_block,
** leaves_end_block, and end_block are all 0.
**
**
**** Segment merging ****
** To amortize update costs, segments are grouped into levels and
** merged in batches.  Each increase in level represents exponentially
** more documents.
**
** New documents (actually, document updates) are tokenized and
** written individually (using LeafWriter) to a level 0 segment, with
** incrementing idx.  When idx reaches MERGE_COUNT (default 16), all
** level 0 segments are merged into a single level 1 segment.  Level 1
** is populated like level 0, and eventually MERGE_COUNT level 1
** segments are merged to a single level 2 segment (representing
** MERGE_COUNT^2 updates), and so on.
**
** A segment merge traverses all segments at a given level in
** parallel, performing a straightforward sorted merge.  Since segment
** leaf nodes are written into the %_segments table in order, this
** merge traverses the underlying sqlite disk structures efficiently.
** After the merge, all segment blocks from the merged level are
** deleted.
**
** MERGE_COUNT controls how often we merge segments.  16 seems to be
** somewhat of a sweet spot for insertion performance.  32 and 64 show
** very similar performance numbers to 16 on insertion, though they're
** a tiny bit slower (perhaps due to more overhead in merge-time
** sorting).  8 is about 20% slower than 16, 4 about 50% slower than
** 16, 2 about 66% slower than 16.
**
** At query time, high MERGE_COUNT increases the number of segments
** which need to be scanned and merged.  For instance, with 100k docs
** inserted:
**
**    MERGE_COUNT   segments
**       16           25
**        8           12
**        4           10
**        2            6
**
** This appears to have only a moderate impact on queries for very
** frequent terms (which are somewhat dominated by segment merge
** costs), and infrequent and non-existent terms still seem to be fast
** even with many segments.
**
** TODO(shess) That said, it would be nice to have a better query-side
** argument for MERGE_COUNT of 16.  Also, it's possible/likely that
** optimizations to things like doclist merging will swing the sweet
** spot around.
**
**
**
**** Handling of deletions and updates ****
** Since we're using a segmented structure, with no docid-oriented
** index into the term index, we clearly cannot simply update the term
** index when a document is deleted or updated.  For deletions, we
** write an empty doclist (varint(docid) varint(POS_END)); for updates,
** we simply write the new doclist.  Segment merges overwrite older
** data for a particular docid with newer data, so deletes or updates
** will eventually overtake the earlier data and knock it out.  The
** query logic likewise merges doclists so that newer data knocks out
** older data.
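**
** As an illustrative example: deleting docid 7 writes the two bytes
** 07 00 for the term's doclist, i.e. varint(7) followed by
** varint(POS_END); the empty position list is what marks the document
** as deleted when doclists are later merged.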
**
** TODO(shess) Provide a VACUUM type operation to clear out all
** deletions and duplications.  This would basically be a forced merge
** into a single segment.
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#if defined(SQLITE_ENABLE_FTS3) && !defined(SQLITE_CORE)
# define SQLITE_CORE 1
#endif

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts3.h"
#include "fts3_hash.h"
#include "fts3_tokenizer.h"
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1


/* TODO(shess) MAN, this thing needs some refactoring.  At minimum, it
** would be nice to order the file better, perhaps something along the
** lines of:
**
** - utility functions
** - table setup functions
** - table update functions
** - table query functions
**
** Put the query functions last because they're likely to reference
** typedefs or functions from the table update section.
*/

#if 0
# define TRACE(A)  printf A; fflush(stdout)
#else
# define TRACE(A)
#endif

/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters.  This is the same solution used in the
** tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
static int safe_isspace(char c){
  return (c&0x80)==0 ? isspace(c) : 0;
}
static int safe_tolower(char c){
  return (c&0x80)==0 ? tolower(c) : c;
}
static int safe_isalnum(char c){
  return (c&0x80)==0 ? isalnum(c) : 0;
}

typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif

enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};

/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file).
*/
#define MERGE_COUNT 16

/* utility functions */

/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
** record to prevent errors of the form:
**
** my_function(SomeType *b){
**   memset(b, '\0', sizeof(b));  // sizeof(b)!=sizeof(*b)
** }
*/
/* TODO(shess) Obvious candidates for a header file. */
#define CLEAR(b) memset(b, '\0', sizeof(*(b)))

#ifndef NDEBUG
# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
#else
# define SCRAMBLE(b)
#endif

/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10

/* Write a 64-bit variable-length integer to memory starting at p[0].
 * The length of data written will be between 1 and VARINT_MAX bytes.
 * The number of bytes written is returned. */
static int putVarint(char *p, sqlite_int64 v){
  unsigned char *q = (unsigned char *) p;
  sqlite_uint64 vu = v;
  do{
    *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    vu >>= 7;
  }while( vu!=0 );
  q[-1] &= 0x7f;  /* turn off high bit in final byte */
  assert( q - (unsigned char *)p <= VARINT_MAX );
  return (int) (q - (unsigned char *)p);
}

/* Read a 64-bit variable-length integer from memory starting at p[0].
 * Return the number of bytes read, or 0 on error.
 * The value is stored in *v. */
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}

static int getVarint32(const char *p, int *pi){
  sqlite_int64 i;
  int ret = getVarint(p, &i);
  *pi = (int) i;
  assert( *pi==i );
  return ret;
}

/*******************************************************************/
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion.  It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;

static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  assert( nCapacity>=0 );
  pBuffer->nData = 0;
  pBuffer->nCapacity = nCapacity;
  pBuffer->pData = nCapacity==0 ? NULL : malloc(nCapacity);
}
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
static void dataBufferDestroy(DataBuffer *pBuffer){
  if( pBuffer->pData!=NULL ) free(pBuffer->pData);
  SCRAMBLE(pBuffer);
}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
static void dataBufferAppend(DataBuffer *pBuffer,
                             const char *pSource, int nSource){
  assert( nSource>0 && pSource!=NULL );
  dataBufferExpand(pBuffer, nSource);
  memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
  pBuffer->nData += nSource;
}
static void dataBufferAppend2(DataBuffer *pBuffer,
                              const char *pSource1, int nSource1,
                              const char *pSource2, int nSource2){
  assert( nSource1>0 && pSource1!=NULL );
  assert( nSource2>0 && pSource2!=NULL );
  dataBufferExpand(pBuffer, nSource1+nSource2);
  memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
  memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
  pBuffer->nData += nSource1+nSource2;
}
static void dataBufferReplace(DataBuffer *pBuffer,
                              const char *pSource, int nSource){
  dataBufferReset(pBuffer);
  dataBufferAppend(pBuffer, pSource, nSource);
}

/* StringBuffer is a null-terminated version of DataBuffer. */
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;

static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}

static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  assert( sb->b.nData>0 );
  if( nFrom>0 ){
    sb->b.nData--;
    dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  }
}
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}

/* Append a list of strings separated by commas. */
static void appendList(StringBuffer *sb, int nString, char **azString){
  int i;
  for(i=0; i<nString; ++i){
    if( i>0 ) append(sb, ", ");
    append(sb, azString[i]);
  }
}

static int endsInWhiteSpace(StringBuffer *p){
  return stringBufferLength(p)>0 &&
    safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
  if( !endsInWhiteSpace(p) ) append(p, " ");
}

/* Remove white space from the end of the StringBuffer */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    p->b.pData[--p->b.nData-1] = '\0';
  }
}

/*******************************************************************/
/* DLReader is used to read document elements from a doclist.  The
** current docid is cached, so dlrDocid() is fast.  DLReader does not
** own the doclist buffer.
**
** dlrAtEnd - true if there's no more data to read.
** dlrDocid - docid of current document.
** dlrDocData - doclist data for current document (including docid).
** dlrDocDataBytes - length of same.
** dlrAllDataBytes - length of all remaining data.
** dlrPosData - position data for current document.
** dlrPosDataLen - length of pos data for current document (incl POS_END).
** dlrStep - step to the next document.
** dlrInit - initialize for a doclist of given type against given data.
** dlrDestroy - clean up.
**
** Expected usage is something like:
**
**   DLReader reader;
**   dlrInit(&reader, DL_DEFAULT, pData, nData);
**   while( !dlrAtEnd(&reader) ){
**     // calls to dlrDocid() and kin.
**     dlrStep(&reader);
**   }
**   dlrDestroy(&reader);
*/
typedef struct DLReader {
  DocListType iType;
  const char *pData;
  int nData;

  sqlite_int64 iDocid;
  int nElement;
} DLReader;

static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
/* TODO(shess) Consider adding a field to track iDocid varint length
** to make these two functions faster.  This might matter (a tiny bit)
** for queries.
*/
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
static void dlrInit(DLReader *pReader, DocListType iType,
                    const char *pData, int nData){
  assert( pData!=NULL && nData!=0 );
  pReader->iType = iType;
  pReader->pData = pData;
  pReader->nData = nData;
  pReader->nElement = 0;
  pReader->iDocid = 0;

  /* Load the first element's data.  There must be a first element. */
  dlrStep(pReader);
}
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}

#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it's convenient in other assertions for
** DLWriter.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif

/*******************************************************************/
/* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
** always appends to the buffer and does not own it.
**
** dlwInit - initialize to write a doclist of given type to a buffer.
** dlwDestroy - clear the writer's memory.  Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwCopy - copy next doclist from reader to writer.
** dlwAdd - construct doclist element and append to buffer.
**    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
  DocListType iType;
  DataBuffer *b;
  sqlite_int64 iPrevDocid;
#ifndef NDEBUG
  int has_iPrevDocid;
#endif
} DLWriter;

static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
  pWriter->b = b;
  pWriter->iType = iType;
  pWriter->iPrevDocid = 0;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 0;
#endif
}
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
/* iFirstDocid is the first docid in the doclist in pData.  It is
** needed because pData may point within a larger doclist, in which
** case the first item would be delta-encoded.
**
** iLastDocid is the final docid in the doclist in pData.  It is
** needed to create the new iPrevDocid for future delta-encoding.  The
** code could decode the passed doclist to recreate iLastDocid, but
** the only current user (docListMerge) already has decoded this
** information.
*/
/* TODO(shess) This has become just a helper for docListMerge.
** Consider a refactor to make this cleaner.
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}

/*******************************************************************/
/* PLReader is used to read data from a document's position list.  As
** the caller steps through the list, data is cached so that varints
** only need to be decoded once.
**
** plrInit, plrDestroy - create/destroy a reader.
** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
** plrAtEnd - at end of stream, only call plrDestroy once true.
** plrStep - step to the next element.
*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;
  int iColumn;         /* the last column read */
  int iPosition;       /* the last position read */
  int iStartOffset;    /* the last start offset read */
  int iEndOffset;      /* the last end offset read */
} PLReader;

static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}

static void plrInit(PLReader *pReader, DLReader *pDLReader){
  pReader->pData = dlrPosData(pDLReader);
  pReader->nData = dlrPosDataLen(pDLReader);
  pReader->iType = pDLReader->iType;
  pReader->iColumn = 0;
  pReader->iPosition = 0;
  pReader->iStartOffset = 0;
  pReader->iEndOffset = 0;
  plrStep(pReader);
}
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}

/*******************************************************************/
/* PLWriter is used in constructing a document's position list.  As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
** plwDestroy - clear a writer.
** plwAdd - append position and offset information.
** plwCopy - copy next position's data from reader to writer.
** plwTerminate - add any necessary doclist terminator.
**
** Calling plwAdd() after plwTerminate() may result in a corrupt
** doclist.
*/
/* TODO(shess) Until we've written the second item, we can cache the
** first item's information.  Then we'd have three states:
**
** - initialized with docid, no positions.
** - docid and one position.
** - docid and multiple positions.
**
** Only the last state needs to actually write to dlw->b, which would
** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
  DLWriter *dlw;

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written */
  int iOffset;    /* the last start offset written */
} PLWriter;

/* TODO(shess) In the case where the parent is reading these values
** from a PLReader, we could optimize to a copy if that PLReader has
** the same type as pWriter.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
/* TODO(shess) Should plwDestroy() also terminate the doclist?  But
** then plwDestroy() would no longer be just a destructor, it would
** also be doing work, which isn't consistent with the overall idiom.
** Another option would be for plwAdd() to always append any necessary
** terminator, so that the output is always correct.  But that would
** add incremental work to the common case with the only benefit being
** API elegance.  Punt for now.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}

/*******************************************************************/
/* DLCollector wraps PLWriter and DLWriter to provide a
** dynamically-allocated doclist area to use during tokenization.
**
** dlcNew - malloc up and initialize a collector.
** dlcDelete - destroy a collector and all contained items.
** dlcAddPos - append position and offset information.
** dlcAddDoclist - add the collected doclist to the given buffer.
** dlcNext - terminate the current document and open another.
*/
typedef struct DLCollector {
  DataBuffer b;
  DLWriter dlw;
  PLWriter plw;
} DLCollector;

/* TODO(shess) This could also be done by calling plwTerminate() and
** dataBufferAppend().  I tried that, expecting nominal performance
** differences, but it seemed to pretty reliably be worth 1% to code
** it this way.  I suspect it's the incremental malloc overhead (some
** percentage of the plwTerminate() calls will cause a realloc), so
** this might be worth revisiting if the DataBuffer implementation
** changes.
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  if( pCollector->dlw.iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
  }else{
    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
  }
}
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}

static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
  DLCollector *pCollector = malloc(sizeof(DLCollector));
  dataBufferInit(&pCollector->b, 0);
  dlwInit(&pCollector->dlw, iType, &pCollector->b);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
  return pCollector;
}
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  free(pCollector);
}


/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go.  Only columns matching iColumn are
** copied, all columns copied if iColumn is -1.  Elements with no
** matching columns are dropped.  The output is an iOutType doclist.
*/
/* NOTE(shess) This code is only valid after all doclists are merged.
** If this is run before merges, then doclist items which represent
** deletion will be trimmed, and will thus not effect a deletion
** during the merge.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}

/* Used by docListMerge() to keep doclists in ascending order by
** docid, then ascending order by age (so the newest comes first).
*/
typedef struct OrderedDLReader {
  DLReader *pReader;

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;
} OrderedDLReader;

/* Order eof to end, then by docid asc, idx desc. */
static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
  if( dlrAtEnd(r1->pReader) ){
    if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
    return 1;                              /* Only r1 atEnd(). */
  }
  if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */

  if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
  if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;

  /* Descending on idx. */
  return r2->idx-r1->idx;
}

/* Bubble p[0] to appropriate place in p[1..n-1].  Assumes that
** p[1..n-1] is already sorted.
*/
/* TODO(shess) Is this frequent enough to warrant a binary search?
** Before implementing that, instrument the code to check.  In most
** current usage, I expect that p[0] will be less than p[1] a very
** high proportion of the time.
*/
static void orderedDLReaderReorder(OrderedDLReader *p, int n){
  while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
    OrderedDLReader tmp = p[0];
    p[0] = p[1];
    p[1] = tmp;
    n--;
    p++;
  }
}

/* Given an array of doclist readers, merge their doclist elements
** into out in sorted order (by docid), dropping elements from older
** readers when there is a duplicate docid.  pReaders is assumed to be
** ordered by age, oldest first.
*/
/* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
** be fixed.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;
  int nStart = 0;
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}

/* Helper function for posListUnion().  Compares the current position
** between left and right, returning, per the standard C idiom, <0 if
** left<right, >0 if left>right, and 0 if left==right.  "End" always
** compares greater.
*/
static int posListCmp(PLReader *pLeft, PLReader *pRight){
  assert( pLeft->iType==pRight->iType );
  if( pLeft->iType==DL_DOCIDS ) return 0;

  if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
  if( plrAtEnd(pRight) ) return -1;

  if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
  if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;

  if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
  if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
  if( pLeft->iType==DL_POSITIONS ) return 0;

  if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
  if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;

  if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
  if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;

  return 0;
}

/* Write the union of position lists in pLeft and pRight to pOut.
** "Union" in this case meaning "All unique position tuples".  Should
** work with any doclist type, though both inputs and the output
** should be the same type.
*/
static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pLeft->iType==pRight->iType );
  assert( pLeft->iType==pOut->iType );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);
  plwInit(&writer, pOut, dlrDocid(pLeft));

  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
    int c = posListCmp(&left, &right);
    if( c<0 ){
      plwCopy(&writer, &left);
      plrStep(&left);
    }else if( c>0 ){
      plwCopy(&writer, &right);
      plrStep(&right);
    }else{
      plwCopy(&writer, &left);
      plrStep(&left);
      plrStep(&right);
    }
  }

  plwTerminate(&writer);
  plwDestroy(&writer);
  plrDestroy(&left);
  plrDestroy(&right);
}

/* Write the union of doclists in pLeft and pRight to pOut.  For
** docids in common between the inputs, the union of the position
** lists is written.  Inputs and outputs are always type DL_DEFAULT.
*/
static void docListUnion(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ){
    dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
  dlrInit(&right, DL_DEFAULT, pRight, nRight);
  dlwInit(&writer, DL_DEFAULT, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrDocid(&left)>dlrDocid(&right) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else{
      posListUnion(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1368/* pLeft and pRight are DLReaders positioned to the same docid.
1369**
1370** If there are no instances in pLeft or pRight where the position
1371** of pLeft is one less than the position of pRight, then this
1372** routine adds nothing to pOut.
1373**
1374** If there are one or more instances where positions from pLeft
1375** are exactly one less than positions from pRight, then add a new
1376** document record to pOut. If pOut wants to hold positions, then
1377** include the positions from pRight that are one more than a
1378** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
1379*/
1380static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
1381 DLWriter *pOut){
1382 PLReader left, right;
1383 PLWriter writer;
1384 int match = 0;
1385
1386 assert( dlrDocid(pLeft)==dlrDocid(pRight) );
1387 assert( pOut->iType!=DL_POSITIONS_OFFSETS );
1388
1389 plrInit(&left, pLeft);
1390 plrInit(&right, pRight);
1391
1392 while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
1393 if( plrColumn(&left)<plrColumn(&right) ){
1394 plrStep(&left);
1395 }else if( plrColumn(&left)>plrColumn(&right) ){
1396 plrStep(&right);
1397 }else if( plrPosition(&left)+1<plrPosition(&right) ){
1398 plrStep(&left);
1399 }else if( plrPosition(&left)+1>plrPosition(&right) ){
1400 plrStep(&right);
1401 }else{
1402 if( !match ){
1403 plwInit(&writer, pOut, dlrDocid(pLeft));
1404 match = 1;
1405 }
1406 plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
1407 plrStep(&left);
1408 plrStep(&right);
1409 }
1410 }
1411
1412 if( match ){
1413 plwTerminate(&writer);
1414 plwDestroy(&writer);
1415 }
1416
1417 plrDestroy(&left);
1418 plrDestroy(&right);
1419}
1420
1421/* We have two doclists with positions: pLeft and pRight.
1422** Write the phrase intersection of these two doclists into pOut.
1423**
1424** A phrase intersection means that a document matches only
1425** if some pair of positions satisfies pLeft.iPos+1==pRight.iPos.
1426**
1427** iType controls the type of data written to pOut. If iType is
1428** DL_POSITIONS, the positions are those from pRight.
1429*/
1430static void docListPhraseMerge(
1431 const char *pLeft, int nLeft,
1432 const char *pRight, int nRight,
1433 DocListType iType,
1434 DataBuffer *pOut /* Write the combined doclist here */
1435){
1436 DLReader left, right;
1437 DLWriter writer;
1438
1439 if( nLeft==0 || nRight==0 ) return;
1440
1441 assert( iType!=DL_POSITIONS_OFFSETS );
1442
1443 dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
1444 dlrInit(&right, DL_POSITIONS, pRight, nRight);
1445 dlwInit(&writer, iType, pOut);
1446
1447 while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
1448 if( dlrDocid(&left)<dlrDocid(&right) ){
1449 dlrStep(&left);
1450 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1451 dlrStep(&right);
1452 }else{
1453 posListPhraseMerge(&left, &right, &writer);
1454 dlrStep(&left);
1455 dlrStep(&right);
1456 }
1457 }
1458
1459 dlrDestroy(&left);
1460 dlrDestroy(&right);
1461 dlwDestroy(&writer);
1462}
1463
1464/* We have two DL_DOCIDS doclists: pLeft and pRight.
1465** Write the intersection of these two doclists into pOut as a
1466** DL_DOCIDS doclist.
1467*/
1468static void docListAndMerge(
1469 const char *pLeft, int nLeft,
1470 const char *pRight, int nRight,
1471 DataBuffer *pOut /* Write the combined doclist here */
1472){
1473 DLReader left, right;
1474 DLWriter writer;
1475
1476 if( nLeft==0 || nRight==0 ) return;
1477
1478 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1479 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1480 dlwInit(&writer, DL_DOCIDS, pOut);
1481
1482 while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
1483 if( dlrDocid(&left)<dlrDocid(&right) ){
1484 dlrStep(&left);
1485 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1486 dlrStep(&right);
1487 }else{
1488 dlwAdd(&writer, dlrDocid(&left));
1489 dlrStep(&left);
1490 dlrStep(&right);
1491 }
1492 }
1493
1494 dlrDestroy(&left);
1495 dlrDestroy(&right);
1496 dlwDestroy(&writer);
1497}
1498
1499/* We have two DL_DOCIDS doclists: pLeft and pRight.
1500** Write the union of these two doclists into pOut as a
1501** DL_DOCIDS doclist.
1502*/
1503static void docListOrMerge(
1504 const char *pLeft, int nLeft,
1505 const char *pRight, int nRight,
1506 DataBuffer *pOut /* Write the combined doclist here */
1507){
1508 DLReader left, right;
1509 DLWriter writer;
1510
1511 if( nLeft==0 ){
1512 dataBufferAppend(pOut, pRight, nRight);
1513 return;
1514 }
1515 if( nRight==0 ){
1516 dataBufferAppend(pOut, pLeft, nLeft);
1517 return;
1518 }
1519
1520 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1521 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1522 dlwInit(&writer, DL_DOCIDS, pOut);
1523
1524 while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
1525 if( dlrAtEnd(&right) ){
1526 dlwAdd(&writer, dlrDocid(&left));
1527 dlrStep(&left);
1528 }else if( dlrAtEnd(&left) ){
1529 dlwAdd(&writer, dlrDocid(&right));
1530 dlrStep(&right);
1531 }else if( dlrDocid(&left)<dlrDocid(&right) ){
1532 dlwAdd(&writer, dlrDocid(&left));
1533 dlrStep(&left);
1534 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1535 dlwAdd(&writer, dlrDocid(&right));
1536 dlrStep(&right);
1537 }else{
1538 dlwAdd(&writer, dlrDocid(&left));
1539 dlrStep(&left);
1540 dlrStep(&right);
1541 }
1542 }
1543
1544 dlrDestroy(&left);
1545 dlrDestroy(&right);
1546 dlwDestroy(&writer);
1547}
1548
1549/* We have two DL_DOCIDS doclists: pLeft and pRight.
1550** Write into pOut as DL_DOCIDS doclist containing all documents that
1551** occur in pLeft but not in pRight.
1552*/
1553static void docListExceptMerge(
1554 const char *pLeft, int nLeft,
1555 const char *pRight, int nRight,
1556 DataBuffer *pOut /* Write the combined doclist here */
1557){
1558 DLReader left, right;
1559 DLWriter writer;
1560
1561 if( nLeft==0 ) return;
1562 if( nRight==0 ){
1563 dataBufferAppend(pOut, pLeft, nLeft);
1564 return;
1565 }
1566
1567 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1568 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1569 dlwInit(&writer, DL_DOCIDS, pOut);
1570
1571 while( !dlrAtEnd(&left) ){
1572 while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
1573 dlrStep(&right);
1574 }
1575 if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
1576 dlwAdd(&writer, dlrDocid(&left));
1577 }
1578 dlrStep(&left);
1579 }
1580
1581 dlrDestroy(&left);
1582 dlrDestroy(&right);
1583 dlwDestroy(&writer);
1584}
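/* Illustrative example (arbitrary docids): if pLeft holds {1, 2, 5}
** and pRight holds {2, 3}, the output is {1, 5} -- every docid present
** in pLeft but absent from pRight.
*/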
1585
1586static char *string_dup_n(const char *s, int n){
1587 char *str = malloc(n + 1);
1588 memcpy(str, s, n);
1589 str[n] = '\0';
1590 return str;
1591}
1592
1593/* Duplicate a string; the caller must free() the returned string.
1594 * (We don't use strdup() since it's not part of the standard C library and
1595 * may not be available everywhere.) */
1596static char *string_dup(const char *s){
1597 return string_dup_n(s, strlen(s));
1598}
1599
1600/* Format a string, replacing each occurrence of the % character with
1601 * zDb.zName. This may be more convenient than sqlite_mprintf()
1602 * when one string is used repeatedly in a format string.
1603 * The caller must free() the returned string. */
1604static char *string_format(const char *zFormat,
1605 const char *zDb, const char *zName){
1606 const char *p;
1607 size_t len = 0;
1608 size_t nDb = strlen(zDb);
1609 size_t nName = strlen(zName);
1610 size_t nFullTableName = nDb+1+nName;
1611 char *result;
1612 char *r;
1613
1614 /* first compute length needed */
1615 for(p = zFormat ; *p ; ++p){
1616 len += (*p=='%' ? nFullTableName : 1);
1617 }
1618 len += 1; /* for null terminator */
1619
1620 r = result = malloc(len);
1621 for(p = zFormat; *p; ++p){
1622 if( *p=='%' ){
1623 memcpy(r, zDb, nDb);
1624 r += nDb;
1625 *r++ = '.';
1626 memcpy(r, zName, nName);
1627 r += nName;
1628 } else {
1629 *r++ = *p;
1630 }
1631 }
1632 *r++ = '\0';
1633 assert( r == result + len );
1634 return result;
1635}
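/* Illustrative example, using the hypothetical values zDb=="main" and
** zName=="t1":
**
**   string_format("delete from %_content", "main", "t1")
**     returns "delete from main.t1_content"
*/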
1636
1637static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
1638 const char *zFormat){
1639 char *zCommand = string_format(zFormat, zDb, zName);
1640 int rc;
1641 TRACE(("FTS3 sql: %s\n", zCommand));
1642 rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
1643 free(zCommand);
1644 return rc;
1645}
1646
1647static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
1648 sqlite3_stmt **ppStmt, const char *zFormat){
1649 char *zCommand = string_format(zFormat, zDb, zName);
1650 int rc;
1651 TRACE(("FTS3 prepare: %s\n", zCommand));
1652 rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
1653 free(zCommand);
1654 return rc;
1655}
1656
1657/* end utility functions */
1658
1659/* Forward reference */
1660typedef struct fulltext_vtab fulltext_vtab;
1661
1662/* A single term in a query is represented by an instance of
1663** the following structure.
1664*/
1665typedef struct QueryTerm {
1666 short int nPhrase; /* How many following terms are part of the same phrase */
1667 short int iPhrase; /* This is the i-th term of a phrase. */
1668 short int iColumn; /* Column of the index that must match this term */
1669 signed char isOr; /* this term is preceded by "OR" */
1670 signed char isNot; /* this term is preceded by "-" */
1671 signed char isPrefix; /* this term is followed by "*" */
1672 char *pTerm; /* text of the term. '\000' terminated. malloced */
1673 int nTerm; /* Number of bytes in pTerm[] */
1674} QueryTerm;
1675
1676
1677/* A query string is parsed into a Query structure.
1678 *
1679 * We could, in theory, allow query strings to be complicated
1680 * nested expressions with precedence determined by parentheses.
1681 * But none of the major search engines do this. (Perhaps the
1682 * feeling is that a parenthesized expression is too complex of
1683 * an idea for the average user to grasp.) Taking our lead from
1684 * the major search engines, we will allow queries to be a list
1685 * of terms (with an implied AND operator) or phrases in double-quotes,
1686 * with a single optional "-" before each non-phrase term to designate
1687 * negation and an optional OR connector.
1688 *
1689 * OR binds more tightly than the implied AND, which is what the
1690 * major search engines seem to do. So, for example:
1691 *
1692 * [one two OR three] ==> one AND (two OR three)
1693 * [one OR two three] ==> (one OR two) AND three
1694 *
1695 * A "-" before a term matches all entries that lack that term.
1696 * The "-" must occur immediately before the term with in intervening
1697 * space. This is how the search engines do it.
1698 *
1699 * A NOT term cannot be the right-hand operand of an OR. If this
1700 * occurs in the query string, the NOT is ignored:
1701 *
1702 * [one OR -two] ==> one OR two
1703 *
1704 */
1705typedef struct Query {
1706 fulltext_vtab *pFts; /* The full text index */
1707 int nTerms; /* Number of terms in the query */
1708 QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
1709 int nextIsOr; /* Set the isOr flag on the next inserted term */
1710 int nextColumn; /* Next word parsed must be in this column */
1711 int dfltColumn; /* The default column */
1712} Query;
1713
1714
1715/*
1716** An instance of the following structure keeps track of generated
1717** matching-word offset information and snippets.
1718*/
1719typedef struct Snippet {
1720 int nMatch; /* Total number of matches */
1721 int nAlloc; /* Space allocated for aMatch[] */
1722 struct snippetMatch { /* One entry for each matching term */
1723 char snStatus; /* Status flag for use while constructing snippets */
1724 short int iCol; /* The column that contains the match */
1725 short int iTerm; /* The index in Query.pTerms[] of the matching term */
1726 short int nByte; /* Number of bytes in the term */
1727 int iStart; /* The offset to the first character of the term */
1728 } *aMatch; /* Points to space obtained from malloc */
1729 char *zOffset; /* Text rendering of aMatch[] */
1730 int nOffset; /* strlen(zOffset) */
1731 char *zSnippet; /* Snippet text */
1732 int nSnippet; /* strlen(zSnippet) */
1733} Snippet;
1734
1735
1736typedef enum QueryType {
1737 QUERY_GENERIC, /* table scan */
1738 QUERY_DOCID, /* lookup by docid */
1739  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i */
1740} QueryType;
1741
1742typedef enum fulltext_statement {
1743 CONTENT_INSERT_STMT,
1744 CONTENT_SELECT_STMT,
1745 CONTENT_UPDATE_STMT,
1746 CONTENT_DELETE_STMT,
1747
1748 BLOCK_INSERT_STMT,
1749 BLOCK_SELECT_STMT,
1750 BLOCK_DELETE_STMT,
1751
1752 SEGDIR_MAX_INDEX_STMT,
1753 SEGDIR_SET_STMT,
1754 SEGDIR_SELECT_STMT,
1755 SEGDIR_SPAN_STMT,
1756 SEGDIR_DELETE_STMT,
1757 SEGDIR_SELECT_ALL_STMT,
1758
1759 MAX_STMT /* Always at end! */
1760} fulltext_statement;
1761
1762/* These must exactly match the enum above. */
1763/* TODO(shess): Is there some risk that a statement will be used in two
1764** cursors at once, e.g. if a query joins a virtual table to itself?
1765** If so perhaps we should move some of these to the cursor object.
1766*/
1767static const char *const fulltext_zStatement[MAX_STMT] = {
1768 /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
1769 /* CONTENT_SELECT */ NULL, /* generated in contentSelectStatement() */
1770 /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
1771 /* CONTENT_DELETE */ "delete from %_content where docid = ?",
1772
1773 /* BLOCK_INSERT */
1774 "insert into %_segments (blockid, block) values (null, ?)",
1775 /* BLOCK_SELECT */ "select block from %_segments where blockid = ?",
1776 /* BLOCK_DELETE */ "delete from %_segments where blockid between ? and ?",
1777
1778 /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
1779 /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
1780 /* SEGDIR_SELECT */
1781 "select start_block, leaves_end_block, root from %_segdir "
1782 " where level = ? order by idx",
1783 /* SEGDIR_SPAN */
1784 "select min(start_block), max(end_block) from %_segdir "
1785 " where level = ? and start_block <> 0",
1786 /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
1787 /* SEGDIR_SELECT_ALL */
1788 "select root, leaves_end_block from %_segdir order by level desc, idx",
1789};
1790
1791/*
1792** A connection to a fulltext index is an instance of the following
1793** structure. The xCreate and xConnect methods create an instance
1794** of this structure and xDestroy and xDisconnect free that instance.
1795** All other methods receive a pointer to the structure as one of their
1796** arguments.
1797*/
1798struct fulltext_vtab {
1799 sqlite3_vtab base; /* Base class used by SQLite core */
1800 sqlite3 *db; /* The database connection */
1801 const char *zDb; /* logical database name */
1802 const char *zName; /* virtual table name */
1803 int nColumn; /* number of columns in virtual table */
1804 char **azColumn; /* column names. malloced */
1805 char **azContentColumn; /* column names in content table; malloced */
1806 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
1807
1808 /* Precompiled statements which we keep as long as the table is
1809 ** open.
1810 */
1811 sqlite3_stmt *pFulltextStatements[MAX_STMT];
1812
1813 /* Precompiled statements used for segment merges. We run a
1814 ** separate select across the leaf level of each tree being merged.
1815 */
1816 sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
1817 /* The statement used to prepare pLeafSelectStmts. */
1818#define LEAF_SELECT \
1819 "select block from %_segments where blockid between ? and ? order by blockid"
1820
1821 /* These buffer pending index updates during transactions.
1822 ** nPendingData estimates the memory size of the pending data. It
1823 ** doesn't include the hash-bucket overhead, nor any malloc
1824 ** overhead. When nPendingData exceeds kPendingThreshold, the
1825 ** buffer is flushed even before the transaction closes.
1826 ** pendingTerms stores the data, and is only valid when nPendingData
1827 ** is >=0 (nPendingData<0 means pendingTerms has not been
1828 ** initialized). iPrevDocid is the last docid written, used to make
1829 ** certain we're inserting in sorted order.
1830 */
1831 int nPendingData;
1832#define kPendingThreshold (1*1024*1024)
1833 sqlite_int64 iPrevDocid;
1834 fts3Hash pendingTerms;
1835};
1836
1837/*
1838** When the core wants to do a query, it creates a cursor using a
1839** call to xOpen. This structure is an instance of a cursor. It
1840** is destroyed by xClose.
1841*/
1842typedef struct fulltext_cursor {
1843 sqlite3_vtab_cursor base; /* Base class used by SQLite core */
1844 QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
1845 sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
1846 int eof; /* True if at End Of Results */
1847 Query q; /* Parsed query string */
1848 Snippet snippet; /* Cached snippet for the current row */
1849 int iColumn; /* Column being searched */
1850 DataBuffer result; /* Doclist results from fulltextQuery */
1851 DLReader reader; /* Result reader if result not empty */
1852} fulltext_cursor;
1853
1854static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
1855 return (fulltext_vtab *) c->base.pVtab;
1856}
1857
1858static const sqlite3_module fts3Module; /* forward declaration */
1859
1860/* Return a dynamically generated statement of the form
1861 * insert into %_content (docid, ...) values (?, ...)
1862 */
1863static const char *contentInsertStatement(fulltext_vtab *v){
1864 StringBuffer sb;
1865 int i;
1866
1867 initStringBuffer(&sb);
1868 append(&sb, "insert into %_content (docid, ");
1869 appendList(&sb, v->nColumn, v->azContentColumn);
1870 append(&sb, ") values (?");
1871 for(i=0; i<v->nColumn; ++i)
1872 append(&sb, ", ?");
1873 append(&sb, ")");
1874 return stringBufferData(&sb);
1875}
1876
1877/* Return a dynamically generated statement of the form
1878 * select <content columns> from %_content where docid = ?
1879 */
1880static const char *contentSelectStatement(fulltext_vtab *v){
1881 StringBuffer sb;
1882 initStringBuffer(&sb);
1883 append(&sb, "SELECT ");
1884 appendList(&sb, v->nColumn, v->azContentColumn);
1885 append(&sb, " FROM %_content WHERE docid = ?");
1886 return stringBufferData(&sb);
1887}
1888
1889/* Return a dynamically generated statement of the form
1890 * update %_content set [col_0] = ?, [col_1] = ?, ...
1891 * where docid = ?
1892 */
1893static const char *contentUpdateStatement(fulltext_vtab *v){
1894 StringBuffer sb;
1895 int i;
1896
1897 initStringBuffer(&sb);
1898 append(&sb, "update %_content set ");
1899 for(i=0; i<v->nColumn; ++i) {
1900 if( i>0 ){
1901 append(&sb, ", ");
1902 }
1903 append(&sb, v->azContentColumn[i]);
1904 append(&sb, " = ?");
1905 }
1906 append(&sb, " where docid = ?");
1907 return stringBufferData(&sb);
1908}
1909
1910/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
1911** If the indicated statement has never been prepared, it is prepared
1912** and cached, otherwise the cached version is reset.
1913*/
1914static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
1915 sqlite3_stmt **ppStmt){
1916 assert( iStmt<MAX_STMT );
1917 if( v->pFulltextStatements[iStmt]==NULL ){
1918 const char *zStmt;
1919 int rc;
1920 switch( iStmt ){
1921 case CONTENT_INSERT_STMT:
1922 zStmt = contentInsertStatement(v); break;
1923 case CONTENT_SELECT_STMT:
1924 zStmt = contentSelectStatement(v); break;
1925 case CONTENT_UPDATE_STMT:
1926 zStmt = contentUpdateStatement(v); break;
1927 default:
1928 zStmt = fulltext_zStatement[iStmt];
1929 }
1930 rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
1931 zStmt);
1932 if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
1933 if( rc!=SQLITE_OK ) return rc;
1934 } else {
1935 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
1936 if( rc!=SQLITE_OK ) return rc;
1937 }
1938
1939 *ppStmt = v->pFulltextStatements[iStmt];
1940 return SQLITE_OK;
1941}
1942
1943/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK; any
1944** other result code, including an unexpected SQLITE_ROW, is returned
1945** unchanged.  Useful for statements like UPDATE, where we expect no results.
1946*/
1947static int sql_single_step(sqlite3_stmt *s){
1948 int rc = sqlite3_step(s);
1949 return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1950}
1951
1952/* Like sql_get_statement(), but for special replicated LEAF_SELECT
1953** statements.
1954*/
1955/* TODO(shess) Write version for generic statements and then share
1956** that between the cached-statement functions.
1957*/
1958static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
1959 sqlite3_stmt **ppStmt){
1960 assert( idx>=0 && idx<MERGE_COUNT );
1961 if( v->pLeafSelectStmts[idx]==NULL ){
1962 int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
1963 LEAF_SELECT);
1964 if( rc!=SQLITE_OK ) return rc;
1965 }else{
1966 int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
1967 if( rc!=SQLITE_OK ) return rc;
1968 }
1969
1970 *ppStmt = v->pLeafSelectStmts[idx];
1971 return SQLITE_OK;
1972}
1973
1974/* insert into %_content (docid, ...) values ([docid], [pValues])
1975** If the docid contains SQL NULL, then a unique docid will be
1976** generated.
1977*/
1978static int content_insert(fulltext_vtab *v, sqlite3_value *docid,
1979 sqlite3_value **pValues){
1980 sqlite3_stmt *s;
1981 int i;
1982 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
1983 if( rc!=SQLITE_OK ) return rc;
1984
1985 rc = sqlite3_bind_value(s, 1, docid);
1986 if( rc!=SQLITE_OK ) return rc;
1987
1988 for(i=0; i<v->nColumn; ++i){
1989 rc = sqlite3_bind_value(s, 2+i, pValues[i]);
1990 if( rc!=SQLITE_OK ) return rc;
1991 }
1992
1993 return sql_single_step(s);
1994}
1995
1996/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
1997 * where docid = [iDocid] */
1998static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
1999 sqlite_int64 iDocid){
2000 sqlite3_stmt *s;
2001 int i;
2002 int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
2003 if( rc!=SQLITE_OK ) return rc;
2004
2005 for(i=0; i<v->nColumn; ++i){
2006 rc = sqlite3_bind_value(s, 1+i, pValues[i]);
2007 if( rc!=SQLITE_OK ) return rc;
2008 }
2009
2010 rc = sqlite3_bind_int64(s, 1+v->nColumn, iDocid);
2011 if( rc!=SQLITE_OK ) return rc;
2012
2013 return sql_single_step(s);
2014}
2015
2016static void freeStringArray(int nString, const char **pString){
2017 int i;
2018
2019 for (i=0 ; i < nString ; ++i) {
2020 if( pString[i]!=NULL ) free((void *) pString[i]);
2021 }
2022 free((void *) pString);
2023}
2024
2025/* select * from %_content where docid = [iDocid]
2026 * The caller must delete the returned array and all strings in it.
2027 * null fields will be NULL in the returned array.
2028 *
2029 * TODO: Perhaps we should return pointer/length strings here for consistency
2030 * with other code which uses pointer/length. */
2031static int content_select(fulltext_vtab *v, sqlite_int64 iDocid,
2032 const char ***pValues){
2033 sqlite3_stmt *s;
2034 const char **values;
2035 int i;
2036 int rc;
2037
2038 *pValues = NULL;
2039
2040 rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
2041 if( rc!=SQLITE_OK ) return rc;
2042
2043 rc = sqlite3_bind_int64(s, 1, iDocid);
2044 if( rc!=SQLITE_OK ) return rc;
2045
2046 rc = sqlite3_step(s);
2047 if( rc!=SQLITE_ROW ) return rc;
2048
2049 values = (const char **) malloc(v->nColumn * sizeof(const char *));
2050 for(i=0; i<v->nColumn; ++i){
2051 if( sqlite3_column_type(s, i)==SQLITE_NULL ){
2052 values[i] = NULL;
2053 }else{
2054 values[i] = string_dup((char*)sqlite3_column_text(s, i));
2055 }
2056 }
2057
2058 /* We expect only one row. We must execute another sqlite3_step()
2059 * to complete the iteration; otherwise the table will remain locked. */
2060 rc = sqlite3_step(s);
2061 if( rc==SQLITE_DONE ){
2062 *pValues = values;
2063 return SQLITE_OK;
2064 }
2065
2066 freeStringArray(v->nColumn, values);
2067 return rc;
2068}
2069
2070/* delete from %_content where docid = [iDocid] */
2071static int content_delete(fulltext_vtab *v, sqlite_int64 iDocid){
2072 sqlite3_stmt *s;
2073 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
2074 if( rc!=SQLITE_OK ) return rc;
2075
2076 rc = sqlite3_bind_int64(s, 1, iDocid);
2077 if( rc!=SQLITE_OK ) return rc;
2078
2079 return sql_single_step(s);
2080}
2081
2082/* insert into %_segments values ([pData])
2083** returns assigned blockid in *piBlockid
2084*/
2085static int block_insert(fulltext_vtab *v, const char *pData, int nData,
2086 sqlite_int64 *piBlockid){
2087 sqlite3_stmt *s;
2088 int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
2089 if( rc!=SQLITE_OK ) return rc;
2090
2091 rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
2092 if( rc!=SQLITE_OK ) return rc;
2093
2094 rc = sqlite3_step(s);
2095 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2096 if( rc!=SQLITE_DONE ) return rc;
2097
2098 /* blockid column is an alias for rowid. */
2099 *piBlockid = sqlite3_last_insert_rowid(v->db);
2100 return SQLITE_OK;
2101}
2102
2103/* delete from %_segments
2104** where blockid between [iStartBlockid] and [iEndBlockid]
2105**
2106** Deletes the given range of blocks, inclusive. This is used to
2107** delete the blocks which form a segment.
2108*/
2109static int block_delete(fulltext_vtab *v,
2110 sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
2111 sqlite3_stmt *s;
2112 int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
2113 if( rc!=SQLITE_OK ) return rc;
2114
2115 rc = sqlite3_bind_int64(s, 1, iStartBlockid);
2116 if( rc!=SQLITE_OK ) return rc;
2117
2118 rc = sqlite3_bind_int64(s, 2, iEndBlockid);
2119 if( rc!=SQLITE_OK ) return rc;
2120
2121 return sql_single_step(s);
2122}
2123
2124/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
2125** at iLevel. Returns SQLITE_DONE if there are no segments at
2126** iLevel. Otherwise returns an error.
2127*/
2128static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
2129 sqlite3_stmt *s;
2130 int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
2131 if( rc!=SQLITE_OK ) return rc;
2132
2133 rc = sqlite3_bind_int(s, 1, iLevel);
2134 if( rc!=SQLITE_OK ) return rc;
2135
2136 rc = sqlite3_step(s);
2137 /* Should always get at least one row due to how max() works. */
2138 if( rc==SQLITE_DONE ) return SQLITE_DONE;
2139 if( rc!=SQLITE_ROW ) return rc;
2140
2141 /* NULL means that there were no inputs to max(). */
2142 if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
2143 rc = sqlite3_step(s);
2144 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2145 return rc;
2146 }
2147
2148 *pidx = sqlite3_column_int(s, 0);
2149
2150 /* We expect only one row. We must execute another sqlite3_step()
2151 * to complete the iteration; otherwise the table will remain locked. */
2152 rc = sqlite3_step(s);
2153 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2154 if( rc!=SQLITE_DONE ) return rc;
2155 return SQLITE_ROW;
2156}
2157
2158/* insert into %_segdir values (
2159** [iLevel], [idx],
2160** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
2161** [pRootData]
2162** )
2163*/
2164static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
2165 sqlite_int64 iStartBlockid,
2166 sqlite_int64 iLeavesEndBlockid,
2167 sqlite_int64 iEndBlockid,
2168 const char *pRootData, int nRootData){
2169 sqlite3_stmt *s;
2170 int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
2171 if( rc!=SQLITE_OK ) return rc;
2172
2173 rc = sqlite3_bind_int(s, 1, iLevel);
2174 if( rc!=SQLITE_OK ) return rc;
2175
2176 rc = sqlite3_bind_int(s, 2, idx);
2177 if( rc!=SQLITE_OK ) return rc;
2178
2179 rc = sqlite3_bind_int64(s, 3, iStartBlockid);
2180 if( rc!=SQLITE_OK ) return rc;
2181
2182 rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
2183 if( rc!=SQLITE_OK ) return rc;
2184
2185 rc = sqlite3_bind_int64(s, 5, iEndBlockid);
2186 if( rc!=SQLITE_OK ) return rc;
2187
2188 rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
2189 if( rc!=SQLITE_OK ) return rc;
2190
2191 return sql_single_step(s);
2192}
2193
2194/* Queries %_segdir for the block span of the segments in level
2195** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel,
2196** SQLITE_ROW if there are blocks, else an error.
2197*/
2198static int segdir_span(fulltext_vtab *v, int iLevel,
2199 sqlite_int64 *piStartBlockid,
2200 sqlite_int64 *piEndBlockid){
2201 sqlite3_stmt *s;
2202 int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
2203 if( rc!=SQLITE_OK ) return rc;
2204
2205 rc = sqlite3_bind_int(s, 1, iLevel);
2206 if( rc!=SQLITE_OK ) return rc;
2207
2208 rc = sqlite3_step(s);
2209 if( rc==SQLITE_DONE ) return SQLITE_DONE; /* Should never happen */
2210 if( rc!=SQLITE_ROW ) return rc;
2211
2212 /* This happens if all segments at this level are entirely inline. */
2213 if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
2214 /* We expect only one row. We must execute another sqlite3_step()
2215 * to complete the iteration; otherwise the table will remain locked. */
2216 int rc2 = sqlite3_step(s);
2217 if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
2218 return rc2;
2219 }
2220
2221 *piStartBlockid = sqlite3_column_int64(s, 0);
2222 *piEndBlockid = sqlite3_column_int64(s, 1);
2223
2224 /* We expect only one row. We must execute another sqlite3_step()
2225 * to complete the iteration; otherwise the table will remain locked. */
2226 rc = sqlite3_step(s);
2227 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2228 if( rc!=SQLITE_DONE ) return rc;
2229 return SQLITE_ROW;
2230}
2231
2232/* Delete the segment blocks and segment directory records for all
2233** segments at iLevel.
2234*/
2235static int segdir_delete(fulltext_vtab *v, int iLevel){
2236 sqlite3_stmt *s;
2237 sqlite_int64 iStartBlockid, iEndBlockid;
2238 int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
2239 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
2240
2241 if( rc==SQLITE_ROW ){
2242 rc = block_delete(v, iStartBlockid, iEndBlockid);
2243 if( rc!=SQLITE_OK ) return rc;
2244 }
2245
2246 /* Delete the segment directory itself. */
2247 rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
2248 if( rc!=SQLITE_OK ) return rc;
2249
2250 rc = sqlite3_bind_int64(s, 1, iLevel);
2251 if( rc!=SQLITE_OK ) return rc;
2252
2253 return sql_single_step(s);
2254}
2255
2256/* TODO(shess) clearPendingTerms() is far down the file because
2257** writeZeroSegment() is far down the file because LeafWriter is far
2258** down the file. Consider refactoring the code to move the non-vtab
2259** code above the vtab code so that we don't need this forward
2260** reference.
2261*/
2262static int clearPendingTerms(fulltext_vtab *v);
2263
2264/*
2265** Free the memory used to contain a fulltext_vtab structure.
2266*/
2267static void fulltext_vtab_destroy(fulltext_vtab *v){
2268 int iStmt, i;
2269
2270 TRACE(("FTS3 Destroy %p\n", v));
2271 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
2272 if( v->pFulltextStatements[iStmt]!=NULL ){
2273 sqlite3_finalize(v->pFulltextStatements[iStmt]);
2274 v->pFulltextStatements[iStmt] = NULL;
2275 }
2276 }
2277
2278 for( i=0; i<MERGE_COUNT; i++ ){
2279 if( v->pLeafSelectStmts[i]!=NULL ){
2280 sqlite3_finalize(v->pLeafSelectStmts[i]);
2281 v->pLeafSelectStmts[i] = NULL;
2282 }
2283 }
2284
2285 if( v->pTokenizer!=NULL ){
2286 v->pTokenizer->pModule->xDestroy(v->pTokenizer);
2287 v->pTokenizer = NULL;
2288 }
2289
2290 clearPendingTerms(v);
2291
2292 free(v->azColumn);
2293 for(i = 0; i < v->nColumn; ++i) {
2294 sqlite3_free(v->azContentColumn[i]);
2295 }
2296 free(v->azContentColumn);
2297 free(v);
2298}
2299
2300/*
2301** Token types for parsing the arguments to xConnect or xCreate.
2302*/
2303#define TOKEN_EOF 0 /* End of file */
2304#define TOKEN_SPACE 1 /* Any kind of whitespace */
2305#define TOKEN_ID 2 /* An identifier */
2306#define TOKEN_STRING 3 /* A string literal */
2307#define TOKEN_PUNCT 4 /* A single punctuation character */
2308
2309/*
2310** If X is a character that can be used in an identifier then
2311** IdChar(X) will be true. Otherwise it is false.
2312**
2313** Any character with the high-order bit set (i.e. any non-ASCII
2314** byte) is allowed in an identifier. For 7-bit characters,
2315** isIdChar[X-0x20] must be 1.
2316**
2317** Ticket #1066: the SQL standard does not allow '$' in the
2318** middle of identifiers. But many SQL implementations do.
2319** SQLite will allow '$' in identifiers for compatibility.
2320** But the feature is undocumented.
2321*/
2322static const char isIdChar[] = {
2323/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
2324 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
2325 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
2326 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
2327 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
2328 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
2329 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
2330};
2331#define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
2332
2333
2334/*
2335** Return the length of the token that begins at z[0].
2336** Store the token type in *tokenType before returning.
2337*/
2338static int getToken(const char *z, int *tokenType){
2339 int i, c;
2340 switch( *z ){
2341 case 0: {
2342 *tokenType = TOKEN_EOF;
2343 return 0;
2344 }
2345 case ' ': case '\t': case '\n': case '\f': case '\r': {
2346 for(i=1; safe_isspace(z[i]); i++){}
2347 *tokenType = TOKEN_SPACE;
2348 return i;
2349 }
2350 case '`':
2351 case '\'':
2352 case '"': {
2353 int delim = z[0];
2354 for(i=1; (c=z[i])!=0; i++){
2355 if( c==delim ){
2356 if( z[i+1]==delim ){
2357 i++;
2358 }else{
2359 break;
2360 }
2361 }
2362 }
2363 *tokenType = TOKEN_STRING;
2364 return i + (c!=0);
2365 }
2366 case '[': {
2367 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
2368 *tokenType = TOKEN_ID;
2369 return i;
2370 }
2371 default: {
2372 if( !IdChar(*z) ){
2373 break;
2374 }
2375 for(i=1; IdChar(z[i]); i++){}
2376 *tokenType = TOKEN_ID;
2377 return i;
2378 }
2379 }
2380 *tokenType = TOKEN_PUNCT;
2381 return 1;
2382}
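/* Illustrative examples, using arbitrary input strings:
**
**   getToken("tokenize ...", &t)  returns 8, t==TOKEN_ID
**   getToken("'abc' ...", &t)     returns 5, t==TOKEN_STRING
**   getToken("(...", &t)          returns 1, t==TOKEN_PUNCT
*/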
2383
2384/*
2385** A token extracted from a string is an instance of the following
2386** structure.
2387*/
2388typedef struct Token {
2389 const char *z; /* Pointer to token text. Not '\000' terminated */
2390 short int n; /* Length of the token text in bytes. */
2391} Token;
2392
2393/*
2394** Given an input string (which is really one of the argv[] parameters
2395** passed into xConnect or xCreate), split the string up into tokens.
2396** Return an array of pointers to '\000' terminated strings, one string
2397** for each non-whitespace token.
2398**
2399** The returned array is terminated by a single NULL pointer.
2400**
2401** Space to hold the returned array is obtained from a single
2402** malloc and should be freed by passing the return value to free().
2403** The individual strings within the token list are all a part of
2404** the single memory allocation and will all be freed at once.
2405*/
2406static char **tokenizeString(const char *z, int *pnToken){
2407 int nToken = 0;
2408 Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
2409 int n = 1;
2410 int e, i;
2411 int totalSize = 0;
2412 char **azToken;
2413 char *zCopy;
2414 while( n>0 ){
2415 n = getToken(z, &e);
2416 if( e!=TOKEN_SPACE ){
2417 aToken[nToken].z = z;
2418 aToken[nToken].n = n;
2419 nToken++;
2420 totalSize += n+1;
2421 }
2422 z += n;
2423 }
2424 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
2425 zCopy = (char*)&azToken[nToken];
2426 nToken--;
2427 for(i=0; i<nToken; i++){
2428 azToken[i] = zCopy;
2429 n = aToken[i].n;
2430 memcpy(zCopy, aToken[i].z, n);
2431 zCopy[n] = 0;
2432 zCopy += n+1;
2433 }
2434 azToken[nToken] = 0;
2435 free(aToken);
2436 *pnToken = nToken;
2437 return azToken;
2438}
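/* Illustrative example: tokenizeString("tokenize porter", &n) returns
** the array {"tokenize", "porter", NULL} and sets n to 2.
*/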
2439
2440/*
2441** Convert an SQL-style quoted string into a normal string by removing
2442** the quote characters. The conversion is done in-place. If the
2443** input does not begin with a quote character, then this routine
2444** is a no-op.
2445**
2446** Examples:
2447**
2448** "abc" becomes abc
2449** 'xyz' becomes xyz
2450** [pqr] becomes pqr
2451** `mno` becomes mno
2452*/
2453static void dequoteString(char *z){
2454 int quote;
2455 int i, j;
2456 if( z==0 ) return;
2457 quote = z[0];
2458 switch( quote ){
2459 case '\'': break;
2460 case '"': break;
2461 case '`': break; /* For MySQL compatibility */
2462 case '[': quote = ']'; break; /* For MS SqlServer compatibility */
2463 default: return;
2464 }
2465 for(i=1, j=0; z[i]; i++){
2466 if( z[i]==quote ){
2467 if( z[i+1]==quote ){
2468 z[j++] = quote;
2469 i++;
2470 }else{
2471 z[j++] = 0;
2472 break;
2473 }
2474 }else{
2475 z[j++] = z[i];
2476 }
2477 }
2478}
2479
2480/*
2481** The input azIn is a NULL-terminated list of tokens. Remove the first
2482** token and all punctuation tokens. Remove the quotes from
2483** around string literal tokens.
2484**
2485** Example:
2486**
2487**     input:   tokenize chinese ( 'simplified' , 'mixed' )
2488**     output:  chinese simplified mixed
2489**
2490** Another example:
2491**
2492** input: delimiters ( '[' , ']' , '...' )
2493** output: [ ] ...
2494*/
2495static void tokenListToIdList(char **azIn){
2496 int i, j;
2497 if( azIn ){
2498 for(i=0, j=-1; azIn[i]; i++){
2499 if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
2500 dequoteString(azIn[i]);
2501 if( j>=0 ){
2502 azIn[j] = azIn[i];
2503 }
2504 j++;
2505 }
2506 }
2507 azIn[j] = 0;
2508 }
2509}
2510
2511
2512/*
2513** Find the first alphanumeric token in the string zIn. Null-terminate
2514** this token, remove any quotation marks, and return a pointer to
2515** the result.
2516*/
2517static char *firstToken(char *zIn, char **pzTail){
2518 int n, ttype;
2519 while(1){
2520 n = getToken(zIn, &ttype);
2521 if( ttype==TOKEN_SPACE ){
2522 zIn += n;
2523 }else if( ttype==TOKEN_EOF ){
2524 *pzTail = zIn;
2525 return 0;
2526 }else{
2527 zIn[n] = 0;
2528 *pzTail = &zIn[1];
2529 dequoteString(zIn);
2530 return zIn;
2531 }
2532 }
2533 /*NOTREACHED*/
2534}
2535
2536/* Return true if...
2537**
2538** * s begins with the string t, ignoring case
2539** * s is longer than t
2540**    * The first character of s beyond t is neither '_' nor alphanumeric
2541**
2542** Ignore leading space in *s.
2543**
2544** To put it another way, return true if the first token of
2545** s[] is t[].
2546*/
2547static int startsWith(const char *s, const char *t){
2548 while( safe_isspace(*s) ){ s++; }
2549 while( *t ){
2550 if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
2551 }
2552 return *s!='_' && !safe_isalnum(*s);
2553}
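/* Illustrative examples: startsWith("  tokenize simple", "tokenize")
** is true; startsWith("tokenizer", "tokenize") is false, because the
** first character beyond "tokenize" is alphanumeric.
*/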
2554
2555/*
2556** An instance of this structure defines the "spec" of a
2557** full text index. This structure is populated by parseSpec
2558** and used by fulltextConnect and fulltextCreate.
2559*/
2560typedef struct TableSpec {
2561 const char *zDb; /* Logical database name */
2562 const char *zName; /* Name of the full-text index */
2563 int nColumn; /* Number of columns to be indexed */
2564 char **azColumn; /* Original names of columns to be indexed */
2565 char **azContentColumn; /* Column names for %_content */
2566 char **azTokenizer; /* Name of tokenizer and its arguments */
2567} TableSpec;
2568
2569/*
2570** Reclaim all of the memory used by a TableSpec
2571*/
2572static void clearTableSpec(TableSpec *p) {
2573 free(p->azColumn);
2574 free(p->azContentColumn);
2575 free(p->azTokenizer);
2576}
2577
2578/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
2579 *
2580 * CREATE VIRTUAL TABLE email
2581 * USING fts3(subject, body, tokenize mytokenizer(myarg))
2582 *
2583 * We return parsed information in a TableSpec structure.
2584 *
2585 */
2586static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
2587 char**pzErr){
2588 int i, n;
2589 char *z, *zDummy;
2590 char **azArg;
2591 const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
2592
2593 assert( argc>=3 );
2594 /* Current interface:
2595 ** argv[0] - module name
2596 ** argv[1] - database name
2597 ** argv[2] - table name
2598 ** argv[3..] - columns, optionally followed by tokenizer specification
2599 ** and snippet delimiters specification.
2600 */
2601
2602 /* Make a copy of the complete argv[][] array in a single allocation.
2603 ** The argv[][] array is read-only and transient. We can write to the
2604 ** copy in order to modify things and the copy is persistent.
2605 */
2606 CLEAR(pSpec);
2607 for(i=n=0; i<argc; i++){
2608 n += strlen(argv[i]) + 1;
2609 }
2610 azArg = malloc( sizeof(char*)*argc + n );
2611 if( azArg==0 ){
2612 return SQLITE_NOMEM;
2613 }
2614 z = (char*)&azArg[argc];
2615 for(i=0; i<argc; i++){
2616 azArg[i] = z;
2617 strcpy(z, argv[i]);
2618 z += strlen(z)+1;
2619 }
2620
2621 /* Identify the column names and the tokenizer and delimiter arguments
2622 ** in the argv[][] array.
2623 */
2624 pSpec->zDb = azArg[1];
2625 pSpec->zName = azArg[2];
2626 pSpec->nColumn = 0;
2627 pSpec->azColumn = azArg;
2628 zTokenizer = "tokenize simple";
2629 for(i=3; i<argc; ++i){
2630 if( startsWith(azArg[i],"tokenize") ){
2631 zTokenizer = azArg[i];
2632 }else{
2633 z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
2634 pSpec->nColumn++;
2635 }
2636 }
2637 if( pSpec->nColumn==0 ){
2638 azArg[0] = "content";
2639 pSpec->nColumn = 1;
2640 }
2641
2642 /*
2643 ** Construct the list of content column names.
2644 **
2645 ** Each content column name will be of the form cNNAAAA
2646 ** where NN is the column number and AAAA is the sanitized
2647 ** column name. "sanitized" means that special characters are
2648 ** converted to "_". The cNN prefix guarantees that all column
2649 ** names are unique.
2650 **
2651 ** The AAAA suffix is not strictly necessary. It is included
2652 ** for the convenience of people who might examine the generated
2653 ** %_content table and wonder what the columns are used for.
2654 */
2655 pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
2656 if( pSpec->azContentColumn==0 ){
2657 clearTableSpec(pSpec);
2658 return SQLITE_NOMEM;
2659 }
2660 for(i=0; i<pSpec->nColumn; i++){
2661 char *p;
2662 pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
2663 for (p = pSpec->azContentColumn[i]; *p ; ++p) {
2664 if( !safe_isalnum(*p) ) *p = '_';
2665 }
2666 }
2667
2668 /*
2669 ** Parse the tokenizer specification string.
2670 */
2671 pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
2672 tokenListToIdList(pSpec->azTokenizer);
2673
2674 return SQLITE_OK;
2675}
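/* Illustrative example of the content-column naming above: columns
** declared as (subject, body) produce content columns "c0subject" and
** "c1body"; a hypothetical column named "e-mail" at index 2 would be
** sanitized to "c2e_mail".
*/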
2676
2677/*
2678** Generate a CREATE TABLE statement that describes the schema of
2679** the virtual table. Return a pointer to this schema string.
2680**
2681** Space is obtained from sqlite3_mprintf() and should be freed
2682** using sqlite3_free().
2683*/
2684static char *fulltextSchema(
2685 int nColumn, /* Number of columns */
2686 const char *const* azColumn, /* List of columns */
2687 const char *zTableName /* Name of the table */
2688){
2689 int i;
2690 char *zSchema, *zNext;
2691 const char *zSep = "(";
2692 zSchema = sqlite3_mprintf("CREATE TABLE x");
2693 for(i=0; i<nColumn; i++){
2694 zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
2695 sqlite3_free(zSchema);
2696 zSchema = zNext;
2697 zSep = ",";
2698 }
2699 zNext = sqlite3_mprintf("%s,%Q HIDDEN", zSchema, zTableName);
2700 sqlite3_free(zSchema);
2701 zSchema = zNext;
2702 zNext = sqlite3_mprintf("%s,docid HIDDEN)", zSchema);
2703 sqlite3_free(zSchema);
2704 return zNext;
2705}
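/* Illustrative example: for the hypothetical call
**
**   fulltextSchema(2, azColumn, "email")  with azColumn = {"subject","body"}
**
** the returned schema string is
**
**   CREATE TABLE x('subject','body','email' HIDDEN,docid HIDDEN)
*/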
2706
2707/*
2708** Build a new sqlite3_vtab structure that will describe the
2709** fulltext index defined by spec.
2710*/
2711static int constructVtab(
2712 sqlite3 *db, /* The SQLite database connection */
2713 fts3Hash *pHash, /* Hash table containing tokenizers */
2714 TableSpec *spec, /* Parsed spec information from parseSpec() */
2715 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
2716 char **pzErr /* Write any error message here */
2717){
2718 int rc;
2719 int n;
2720 fulltext_vtab *v = 0;
2721 const sqlite3_tokenizer_module *m = NULL;
2722 char *schema;
2723
2724 char const *zTok; /* Name of tokenizer to use for this fts table */
2725 int nTok; /* Length of zTok, including nul terminator */
2726
2727 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
2728 if( v==0 ) return SQLITE_NOMEM;
2729 CLEAR(v);
2730 /* sqlite will initialize v->base */
2731 v->db = db;
2732 v->zDb = spec->zDb; /* Freed when azColumn is freed */
2733 v->zName = spec->zName; /* Freed when azColumn is freed */
2734 v->nColumn = spec->nColumn;
2735 v->azContentColumn = spec->azContentColumn;
2736 spec->azContentColumn = 0;
2737 v->azColumn = spec->azColumn;
2738 spec->azColumn = 0;
2739
2740 if( spec->azTokenizer==0 ){
2741 return SQLITE_NOMEM;
2742 }
2743
2744 zTok = spec->azTokenizer[0];
2745 if( !zTok ){
2746 zTok = "simple";
2747 }
2748 nTok = strlen(zTok)+1;
2749
2750 m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zTok, nTok);
2751 if( !m ){
2752 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
2753 rc = SQLITE_ERROR;
2754 goto err;
2755 }
2756
2757 for(n=0; spec->azTokenizer[n]; n++){}
2758 if( n ){
2759 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
2760 &v->pTokenizer);
2761 }else{
2762 rc = m->xCreate(0, 0, &v->pTokenizer);
2763 }
2764 if( rc!=SQLITE_OK ) goto err;
2765 v->pTokenizer->pModule = m;
2766
2767 /* TODO: verify the existence of backing tables foo_content, foo_term */
2768
2769 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
2770 spec->zName);
2771 rc = sqlite3_declare_vtab(db, schema);
2772 sqlite3_free(schema);
2773 if( rc!=SQLITE_OK ) goto err;
2774
2775 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
2776
2777 /* Indicate that the buffer is not live. */
2778 v->nPendingData = -1;
2779
2780 *ppVTab = &v->base;
2781 TRACE(("FTS3 Connect %p\n", v));
2782
2783 return rc;
2784
2785err:
2786 fulltext_vtab_destroy(v);
2787 return rc;
2788}
2789
2790static int fulltextConnect(
2791 sqlite3 *db,
2792 void *pAux,
2793 int argc, const char *const*argv,
2794 sqlite3_vtab **ppVTab,
2795 char **pzErr
2796){
2797 TableSpec spec;
2798 int rc = parseSpec(&spec, argc, argv, pzErr);
2799 if( rc!=SQLITE_OK ) return rc;
2800
2801 rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);
2802 clearTableSpec(&spec);
2803 return rc;
2804}
2805
2806/* The %_content table holds the text of each document, with
2807** the docid column exposed as the SQLite rowid for the table.
2808*/
2809/* TODO(shess) This comment needs elaboration to match the updated
2810** code. Work it into the top-of-file comment at that time.
2811*/
2812static int fulltextCreate(sqlite3 *db, void *pAux,
2813 int argc, const char * const *argv,
2814 sqlite3_vtab **ppVTab, char **pzErr){
2815 int rc;
2816 TableSpec spec;
2817 StringBuffer schema;
2818 TRACE(("FTS3 Create\n"));
2819
2820 rc = parseSpec(&spec, argc, argv, pzErr);
2821 if( rc!=SQLITE_OK ) return rc;
2822
2823 initStringBuffer(&schema);
2824 append(&schema, "CREATE TABLE %_content(");
2825 append(&schema, " docid INTEGER PRIMARY KEY,");
2826 appendList(&schema, spec.nColumn, spec.azContentColumn);
2827 append(&schema, ")");
2828 rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
2829 stringBufferDestroy(&schema);
2830 if( rc!=SQLITE_OK ) goto out;
2831
2832 rc = sql_exec(db, spec.zDb, spec.zName,
2833 "create table %_segments("
2834 " blockid INTEGER PRIMARY KEY,"
2835 " block blob"
2836 ");"
2837 );
2838 if( rc!=SQLITE_OK ) goto out;
2839
2840 rc = sql_exec(db, spec.zDb, spec.zName,
2841 "create table %_segdir("
2842 " level integer,"
2843 " idx integer,"
2844 " start_block integer,"
2845 " leaves_end_block integer,"
2846 " end_block integer,"
2847 " root blob,"
2848 " primary key(level, idx)"
2849 ");");
2850 if( rc!=SQLITE_OK ) goto out;
2851
2852 rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);
2853
2854out:
2855 clearTableSpec(&spec);
2856 return rc;
2857}
2858
2859/* Decide how to handle an SQL query. */
2860static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2861 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2862 int i;
2863 TRACE(("FTS3 BestIndex\n"));
2864
2865 for(i=0; i<pInfo->nConstraint; ++i){
2866 const struct sqlite3_index_constraint *pConstraint;
2867 pConstraint = &pInfo->aConstraint[i];
2868 if( pConstraint->usable ) {
2869 if( (pConstraint->iColumn==-1 || pConstraint->iColumn==v->nColumn+1) &&
2870 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2871 pInfo->idxNum = QUERY_DOCID; /* lookup by docid */
2872 TRACE(("FTS3 QUERY_DOCID\n"));
2873 } else if( pConstraint->iColumn>=0 && pConstraint->iColumn<=v->nColumn &&
2874 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2875 /* full-text search */
2876 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2877 TRACE(("FTS3 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2878 } else continue;
2879
2880 pInfo->aConstraintUsage[i].argvIndex = 1;
2881 pInfo->aConstraintUsage[i].omit = 1;
2882
2883 /* An arbitrary value for now.
2884 * TODO: Perhaps docid matches should be considered cheaper than
2885 * full-text searches. */
2886 pInfo->estimatedCost = 1.0;
2887
2888 return SQLITE_OK;
2889 }
2890 }
2891 pInfo->idxNum = QUERY_GENERIC;
2892 return SQLITE_OK;
2893}
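/* Illustrative example: a usable MATCH constraint on column 1 yields
** idxNum==QUERY_FULLTEXT+1, while an equality constraint on docid (or
** on the rowid) yields idxNum==QUERY_DOCID.
*/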
2894
2895static int fulltextDisconnect(sqlite3_vtab *pVTab){
2896 TRACE(("FTS3 Disconnect %p\n", pVTab));
2897 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2898 return SQLITE_OK;
2899}
2900
2901static int fulltextDestroy(sqlite3_vtab *pVTab){
2902 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2903 int rc;
2904
2905 TRACE(("FTS3 Destroy %p\n", pVTab));
2906 rc = sql_exec(v->db, v->zDb, v->zName,
2907 "drop table if exists %_content;"
2908 "drop table if exists %_segments;"
2909 "drop table if exists %_segdir;"
2910 );
2911 if( rc!=SQLITE_OK ) return rc;
2912
2913 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2914 return SQLITE_OK;
2915}
2916
2917static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2918 fulltext_cursor *c;
2919
2920 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
2921 /* sqlite will initialize c->base */
2922 *ppCursor = &c->base;
2923 TRACE(("FTS3 Open %p: %p\n", pVTab, c));
2924
2925 return SQLITE_OK;
2926}
2927
2928
2929/* Free all of the dynamically allocated memory held by *q
2930*/
2931static void queryClear(Query *q){
2932 int i;
2933 for(i = 0; i < q->nTerms; ++i){
2934 free(q->pTerms[i].pTerm);
2935 }
2936 free(q->pTerms);
2937 CLEAR(q);
2938}
2939
2940/* Free all of the dynamically allocated memory held by the
2941** Snippet
2942*/
2943static void snippetClear(Snippet *p){
2944 free(p->aMatch);
2945 free(p->zOffset);
2946 free(p->zSnippet);
2947 CLEAR(p);
2948}
2949/*
2950** Append a single entry to the p->aMatch[] log.
2951*/
2952static void snippetAppendMatch(
2953 Snippet *p, /* Append the entry to this snippet */
2954 int iCol, int iTerm, /* The column and query term */
2955 int iStart, int nByte /* Offset and size of the match */
2956){
2957 int i;
2958 struct snippetMatch *pMatch;
2959 if( p->nMatch+1>=p->nAlloc ){
2960 p->nAlloc = p->nAlloc*2 + 10;
2961 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2962 if( p->aMatch==0 ){
2963 p->nMatch = 0;
2964 p->nAlloc = 0;
2965 return;
2966 }
2967 }
2968 i = p->nMatch++;
2969 pMatch = &p->aMatch[i];
2970 pMatch->iCol = iCol;
2971 pMatch->iTerm = iTerm;
2972 pMatch->iStart = iStart;
2973 pMatch->nByte = nByte;
2974}
2975
2976/*
2977** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
2978*/
2979#define FTS3_ROTOR_SZ (32)
2980#define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1)
2981
2982/*
2983** Add entries to pSnippet->aMatch[] for every match that occurs against
2984** document zDoc[0..nDoc-1] which is stored in column iColumn.
2985*/
2986static void snippetOffsetsOfColumn(
2987 Query *pQuery,
2988 Snippet *pSnippet,
2989 int iColumn,
2990 const char *zDoc,
2991 int nDoc
2992){
2993 const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
2994 sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
2995 sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
2996 fulltext_vtab *pVtab; /* The full text index */
2997 int nColumn; /* Number of columns in the index */
2998 const QueryTerm *aTerm; /* Query string terms */
2999 int nTerm; /* Number of query string terms */
3000 int i, j; /* Loop counters */
3001 int rc; /* Return code */
3002 unsigned int match, prevMatch; /* Phrase search bitmasks */
3003 const char *zToken; /* Next token from the tokenizer */
3004 int nToken; /* Size of zToken */
3005 int iBegin, iEnd, iPos; /* Offsets of beginning and end */
3006
3007 /* The following variables keep a circular buffer of the last
3008 ** few tokens */
3009 unsigned int iRotor = 0; /* Index of current token */
3010 int iRotorBegin[FTS3_ROTOR_SZ]; /* Beginning offset of token */
3011 int iRotorLen[FTS3_ROTOR_SZ]; /* Length of token */
3012
3013 pVtab = pQuery->pFts;
3014 nColumn = pVtab->nColumn;
3015 pTokenizer = pVtab->pTokenizer;
3016 pTModule = pTokenizer->pModule;
3017 rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
3018 if( rc ) return;
3019 pTCursor->pTokenizer = pTokenizer;
3020 aTerm = pQuery->pTerms;
3021 nTerm = pQuery->nTerms;
3022 if( nTerm>=FTS3_ROTOR_SZ ){
3023 nTerm = FTS3_ROTOR_SZ - 1;
3024 }
3025 prevMatch = 0;
3026 while(1){
3027 rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
3028 if( rc ) break;
3029 iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin;
3030 iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin;
3031 match = 0;
3032 for(i=0; i<nTerm; i++){
3033 int iCol;
3034 iCol = aTerm[i].iColumn;
3035 if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
3036 if( aTerm[i].nTerm>nToken ) continue;
3037 if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
3038 assert( aTerm[i].nTerm<=nToken );
3039 if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
3040 if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
3041 match |= 1<<i;
3042 if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
3043 for(j=aTerm[i].iPhrase-1; j>=0; j--){
3044 int k = (iRotor-j) & FTS3_ROTOR_MASK;
3045 snippetAppendMatch(pSnippet, iColumn, i-j,
3046 iRotorBegin[k], iRotorLen[k]);
3047 }
3048 }
3049 }
3050 prevMatch = match<<1;
3051 iRotor++;
3052 }
3053 pTModule->xClose(pTCursor);
3054}
3055
3056
3057/*
3058** Compute all offsets for the current row of the query.
3059** If the offsets have already been computed, this routine is a no-op.
3060*/
3061static void snippetAllOffsets(fulltext_cursor *p){
3062 int nColumn;
3063 int iColumn, i;
3064 int iFirst, iLast;
3065 fulltext_vtab *pFts;
3066
3067 if( p->snippet.nMatch ) return;
3068 if( p->q.nTerms==0 ) return;
3069 pFts = p->q.pFts;
3070 nColumn = pFts->nColumn;
3071 iColumn = (p->iCursorType - QUERY_FULLTEXT);
3072 if( iColumn<0 || iColumn>=nColumn ){
3073 iFirst = 0;
3074 iLast = nColumn-1;
3075 }else{
3076 iFirst = iColumn;
3077 iLast = iColumn;
3078 }
3079 for(i=iFirst; i<=iLast; i++){
3080 const char *zDoc;
3081 int nDoc;
3082 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
3083 nDoc = sqlite3_column_bytes(p->pStmt, i+1);
3084 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
3085 }
3086}
3087
3088/*
3089** Convert the information in the aMatch[] array of the snippet
3090** into the string zOffset[0..nOffset-1].
3091*/
3092static void snippetOffsetText(Snippet *p){
3093 int i;
3094 int cnt = 0;
3095 StringBuffer sb;
3096 char zBuf[200];
3097 if( p->zOffset ) return;
3098 initStringBuffer(&sb);
3099 for(i=0; i<p->nMatch; i++){
3100 struct snippetMatch *pMatch = &p->aMatch[i];
3101 zBuf[0] = ' ';
3102 sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
3103 pMatch->iTerm, pMatch->iStart, pMatch->nByte);
3104 append(&sb, zBuf);
3105 cnt++;
3106 }
3107 p->zOffset = stringBufferData(&sb);
3108 p->nOffset = stringBufferLength(&sb);
3109}
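/* A sketch of the resulting format, with invented values: two matches
** might render as "0 0 6 5 1 2 13 8", i.e. one "iCol iTerm iStart
** nByte" quadruple per match, separated by single spaces.
*/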
3110
3111/*
3112** zDoc[0..nDoc-1] is a phrase of text. aMatch[0..nMatch-1] is a set
3113** of matching words, some of which might be in zDoc. zDoc is column
3114** number iCol.
3115**
3116** iBreak is a suggested spot in zDoc where we could begin or end an
3117** excerpt. Return a value similar to iBreak but possibly adjusted
3118** to be a little left or right so that the break point is better.
3119*/
3120static int wordBoundary(
3121 int iBreak, /* The suggested break point */
3122 const char *zDoc, /* Document text */
3123 int nDoc, /* Number of bytes in zDoc[] */
3124 struct snippetMatch *aMatch, /* Matching words */
3125 int nMatch, /* Number of entries in aMatch[] */
3126 int iCol /* The column number for zDoc[] */
3127){
3128 int i;
3129 if( iBreak<=10 ){
3130 return 0;
3131 }
3132 if( iBreak>=nDoc-10 ){
3133 return nDoc;
3134 }
3135 for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
3136 while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
3137 if( i<nMatch ){
3138 if( aMatch[i].iStart<iBreak+10 ){
3139 return aMatch[i].iStart;
3140 }
3141 if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
3142 return aMatch[i-1].iStart;
3143 }
3144 }
3145 for(i=1; i<=10; i++){
3146 if( safe_isspace(zDoc[iBreak-i]) ){
3147 return iBreak - i + 1;
3148 }
3149 if( safe_isspace(zDoc[iBreak+i]) ){
3150 return iBreak + i + 1;
3151 }
3152 }
3153 return iBreak;
3154}
3155
3156
3157
3158/*
3159** Allowed values for Snippet.aMatch[].snStatus
3160*/
3161#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
3162#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
3163
3164/*
3165** Generate the text of a snippet.
3166*/
3167static void snippetText(
3168 fulltext_cursor *pCursor, /* The cursor we need the snippet for */
3169 const char *zStartMark, /* Markup to appear before each match */
3170 const char *zEndMark, /* Markup to appear after each match */
3171 const char *zEllipsis /* Ellipsis mark */
3172){
3173 int i, j;
3174 struct snippetMatch *aMatch;
3175 int nMatch;
3176 int nDesired;
3177 StringBuffer sb;
3178 int tailCol;
3179 int tailOffset;
3180 int iCol;
3181 int nDoc;
3182 const char *zDoc;
3183 int iStart, iEnd;
3184 int tailEllipsis = 0;
3185 int iMatch;
3186
3187
3188 free(pCursor->snippet.zSnippet);
3189 pCursor->snippet.zSnippet = 0;
3190 aMatch = pCursor->snippet.aMatch;
3191 nMatch = pCursor->snippet.nMatch;
3192 initStringBuffer(&sb);
3193
3194 for(i=0; i<nMatch; i++){
3195 aMatch[i].snStatus = SNIPPET_IGNORE;
3196 }
3197 nDesired = 0;
3198 for(i=0; i<pCursor->q.nTerms; i++){
3199 for(j=0; j<nMatch; j++){
3200 if( aMatch[j].iTerm==i ){
3201 aMatch[j].snStatus = SNIPPET_DESIRED;
3202 nDesired++;
3203 break;
3204 }
3205 }
3206 }
3207
3208 iMatch = 0;
3209 tailCol = -1;
3210 tailOffset = 0;
3211 for(i=0; i<nMatch && nDesired>0; i++){
3212 if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
3213 nDesired--;
3214 iCol = aMatch[i].iCol;
3215 zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
3216 nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
3217 iStart = aMatch[i].iStart - 40;
3218 iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
3219 if( iStart<=10 ){
3220 iStart = 0;
3221 }
3222 if( iCol==tailCol && iStart<=tailOffset+20 ){
3223 iStart = tailOffset;
3224 }
3225 if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
3226 trimWhiteSpace(&sb);
3227 appendWhiteSpace(&sb);
3228 append(&sb, zEllipsis);
3229 appendWhiteSpace(&sb);
3230 }
3231 iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
3232 iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
3233 if( iEnd>=nDoc-10 ){
3234 iEnd = nDoc;
3235 tailEllipsis = 0;
3236 }else{
3237 tailEllipsis = 1;
3238 }
3239 while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
3240 while( iStart<iEnd ){
3241 while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
3242 && aMatch[iMatch].iCol<=iCol ){
3243 iMatch++;
3244 }
3245 if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
3246 && aMatch[iMatch].iCol==iCol ){
3247 nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
3248 iStart = aMatch[iMatch].iStart;
3249 append(&sb, zStartMark);
3250 nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
3251 append(&sb, zEndMark);
3252 iStart += aMatch[iMatch].nByte;
3253 for(j=iMatch+1; j<nMatch; j++){
3254 if( aMatch[j].iTerm==aMatch[iMatch].iTerm
3255 && aMatch[j].snStatus==SNIPPET_DESIRED ){
3256 nDesired--;
3257 aMatch[j].snStatus = SNIPPET_IGNORE;
3258 }
3259 }
3260 }else{
3261 nappend(&sb, &zDoc[iStart], iEnd - iStart);
3262 iStart = iEnd;
3263 }
3264 }
3265 tailCol = iCol;
3266 tailOffset = iEnd;
3267 }
3268 trimWhiteSpace(&sb);
3269 if( tailEllipsis ){
3270 appendWhiteSpace(&sb);
3271 append(&sb, zEllipsis);
3272 }
3273 pCursor->snippet.zSnippet = stringBufferData(&sb);
3274 pCursor->snippet.nSnippet = stringBufferLength(&sb);
3275}
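
/* For illustration (hypothetical values): with zStartMark="<b>",
** zEndMark="</b>" and zEllipsis="...", a document containing
** "... the quick brown fox jumped over ..." queried for "fox" would
** produce a snippet along the lines of:
**
**   ...quick brown <b>fox</b> jumped over...
**
** Roughly 40 bytes of context are kept on each side of a desired
** match, trimmed to word boundaries by wordBoundary(), and runs from
** different parts of the document are joined with zEllipsis.
*/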
3276
3277
3278/*
3279** Close the cursor. For additional information see the documentation
3280** on the xClose method of the virtual table interface.
3281*/
3282static int fulltextClose(sqlite3_vtab_cursor *pCursor){
3283 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3284 TRACE(("FTS3 Close %p\n", c));
3285 sqlite3_finalize(c->pStmt);
3286 queryClear(&c->q);
3287 snippetClear(&c->snippet);
3288 if( c->result.nData!=0 ) dlrDestroy(&c->reader);
3289 dataBufferDestroy(&c->result);
3290 free(c);
3291 return SQLITE_OK;
3292}
3293
3294static int fulltextNext(sqlite3_vtab_cursor *pCursor){
3295 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3296 int rc;
3297
3298 TRACE(("FTS3 Next %p\n", pCursor));
3299 snippetClear(&c->snippet);
3300 if( c->iCursorType < QUERY_FULLTEXT ){
3301 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
3302 rc = sqlite3_step(c->pStmt);
3303 switch( rc ){
3304 case SQLITE_ROW:
3305 c->eof = 0;
3306 return SQLITE_OK;
3307 case SQLITE_DONE:
3308 c->eof = 1;
3309 return SQLITE_OK;
3310 default:
3311 c->eof = 1;
3312 return rc;
3313 }
3314 } else { /* full-text query */
3315 rc = sqlite3_reset(c->pStmt);
3316 if( rc!=SQLITE_OK ) return rc;
3317
3318 if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
3319 c->eof = 1;
3320 return SQLITE_OK;
3321 }
3322 rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
3323 dlrStep(&c->reader);
3324 if( rc!=SQLITE_OK ) return rc;
3325 /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
3326 rc = sqlite3_step(c->pStmt);
3327 if( rc==SQLITE_ROW ){ /* the case we expect */
3328 c->eof = 0;
3329 return SQLITE_OK;
3330 }
3331 /* an error occurred; abort */
3332 return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
3333 }
3334}
3335
3336
3337/* TODO(shess) If we pushed LeafReader to the top of the file, or to
3338** another file, term_select() could be pushed above
3339** docListOfTerm().
3340*/
3341static int termSelect(fulltext_vtab *v, int iColumn,
3342 const char *pTerm, int nTerm, int isPrefix,
3343 DocListType iType, DataBuffer *out);
3344
3345/* Return a DocList corresponding to the query term *pTerm. If *pTerm
3346** is the first term of a phrase query, go ahead and evaluate the phrase
3347** query and return the doclist for the entire phrase query.
3348**
3349** The resulting DL_DOCIDS doclist is stored in pResult, which is
3350** overwritten.
3351*/
3352static int docListOfTerm(
3353 fulltext_vtab *v, /* The full text index */
3354 int iColumn, /* column to restrict to. No restriction if >=nColumn */
3355 QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
3356 DataBuffer *pResult /* Write the result here */
3357){
3358 DataBuffer left, right, new;
3359 int i, rc;
3360
3361 /* No phrase search if no position info. */
3362 assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );
3363
3364 /* This code should never be called with buffered updates. */
3365 assert( v->nPendingData<0 );
3366
3367 dataBufferInit(&left, 0);
3368 rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
3369 0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
3370 if( rc ) return rc;
3371 for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
3372 dataBufferInit(&right, 0);
3373 rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
3374 pQTerm[i].isPrefix, DL_POSITIONS, &right);
3375 if( rc ){
3376 dataBufferDestroy(&left);
3377 return rc;
3378 }
3379 dataBufferInit(&new, 0);
3380 docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
3381 i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
3382 dataBufferDestroy(&left);
3383 dataBufferDestroy(&right);
3384 left = new;
3385 }
3386 *pResult = left;
3387 return SQLITE_OK;
3388}
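
/* Sketch of the phrase case (hypothetical query): for the phrase
** "full text", pQTerm[0] is "full" with nPhrase==1.  termSelect()
** fetches a DL_POSITIONS doclist for "full"; the loop then fetches
** positions for "text", and docListPhraseMerge() keeps only documents
** in which "text" appears at the position immediately after "full".
** The final merge emits DL_DOCIDS, since callers only need docids.
*/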
3389
3390/* Add a new term pTerm[0..nTerm-1] to the query *q.
3391*/
3392static void queryAdd(Query *q, const char *pTerm, int nTerm){
3393 QueryTerm *t;
3394 ++q->nTerms;
3395 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
3396 if( q->pTerms==0 ){
3397 q->nTerms = 0;
3398 return;
3399 }
3400 t = &q->pTerms[q->nTerms - 1];
3401 CLEAR(t);
3402 t->pTerm = malloc(nTerm+1);
3403 memcpy(t->pTerm, pTerm, nTerm);
3404 t->pTerm[nTerm] = 0;
3405 t->nTerm = nTerm;
3406 t->isOr = q->nextIsOr;
3407 t->isPrefix = 0;
3408 q->nextIsOr = 0;
3409 t->iColumn = q->nextColumn;
3410 q->nextColumn = q->dfltColumn;
3411}
3412
3413/*
3414** Check to see if the string zToken[0...nToken-1] matches any
3415** column name in the virtual table. If it does,
3416** return the zero-indexed column number. If not, return -1.
3417*/
3418static int checkColumnSpecifier(
3419 fulltext_vtab *pVtab, /* The virtual table */
3420 const char *zToken, /* Text of the token */
3421 int nToken /* Number of characters in the token */
3422){
3423 int i;
3424 for(i=0; i<pVtab->nColumn; i++){
3425 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
3426 && pVtab->azColumn[i][nToken]==0 ){
3427 return i;
3428 }
3429 }
3430 return -1;
3431}
3432
3433/*
3434** Parse the text at pSegment[0..nSegment-1]. Add additional terms
3435** to the query being assembled in pQuery.
3436**
3437** inPhrase is true if pSegment[0..nSegment-1] is contained within
3438** double-quotes. If inPhrase is true, then the first term is marked
3439** with the number of terms in the phrase, less one, and the OR and
3440** "-" syntax is ignored. If inPhrase is false, then every term found
3441** is marked with nPhrase=0 and the OR and "-" syntax is significant.
3442*/
3443static int tokenizeSegment(
3444 sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
3445 const char *pSegment, int nSegment, /* Query expression being parsed */
3446 int inPhrase, /* True if within "..." */
3447 Query *pQuery /* Append results here */
3448){
3449 const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
3450 sqlite3_tokenizer_cursor *pCursor;
3451 int firstIndex = pQuery->nTerms;
3452 int iCol;
3453 int nTerm = 1;
3454
3455 int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
3456 if( rc!=SQLITE_OK ) return rc;
3457 pCursor->pTokenizer = pTokenizer;
3458
3459 while( 1 ){
3460 const char *pToken;
3461 int nToken, iBegin, iEnd, iPos;
3462
3463 rc = pModule->xNext(pCursor,
3464 &pToken, &nToken,
3465 &iBegin, &iEnd, &iPos);
3466 if( rc!=SQLITE_OK ) break;
3467 if( !inPhrase &&
3468 pSegment[iEnd]==':' &&
3469 (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
3470 pQuery->nextColumn = iCol;
3471 continue;
3472 }
3473 if( !inPhrase && pQuery->nTerms>0 && nToken==2
3474 && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
3475 pQuery->nextIsOr = 1;
3476 continue;
3477 }
3478 queryAdd(pQuery, pToken, nToken);
3479 if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
3480 pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
3481 }
3482 if( iEnd<nSegment && pSegment[iEnd]=='*' ){
3483 pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
3484 }
3485 pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
3486 if( inPhrase ){
3487 nTerm++;
3488 }
3489 }
3490
3491 if( inPhrase && pQuery->nTerms>firstIndex ){
3492 pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
3493 }
3494
3495 return pModule->xClose(pCursor);
3496}
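
/* Examples of the syntax recognized above (illustrative only):
**
**   title:sqlite   ->   term "sqlite" restricted to column "title"
**   alpha OR beta  ->   "beta" marked isOr via nextIsOr
**   -alpha         ->   term marked isNot (ignored inside phrases)
**   alph*          ->   term marked isPrefix
**   "full text"    ->   split out by parseQuery(); terms arrive with
**                       inPhrase set and are linked via iPhrase/nPhrase
*/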
3497
3498/* Parse a query string, yielding a Query object pQuery.
3499**
3500** The calling function will need to queryClear() to clean up
3501** the dynamically allocated memory held by pQuery.
3502*/
3503static int parseQuery(
3504 fulltext_vtab *v, /* The fulltext index */
3505 const char *zInput, /* Input text of the query string */
3506 int nInput, /* Size of the input text */
3507 int dfltColumn, /* Default column of the index to match against */
3508 Query *pQuery /* Write the parse results here. */
3509){
3510 int iInput, inPhrase = 0;
3511
3512 if( zInput==0 ) nInput = 0;
3513 if( nInput<0 ) nInput = strlen(zInput);
3514 pQuery->nTerms = 0;
3515 pQuery->pTerms = NULL;
3516 pQuery->nextIsOr = 0;
3517 pQuery->nextColumn = dfltColumn;
3518 pQuery->dfltColumn = dfltColumn;
3519 pQuery->pFts = v;
3520
3521 for(iInput=0; iInput<nInput; ++iInput){
3522 int i;
3523 for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
3524 if( i>iInput ){
3525 tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
3526 pQuery);
3527 }
3528 iInput = i;
3529 if( i<nInput ){
3530 assert( zInput[i]=='"' );
3531 inPhrase = !inPhrase;
3532 }
3533 }
3534
3535 if( inPhrase ){
3536 /* unmatched quote */
3537 queryClear(pQuery);
3538 return SQLITE_ERROR;
3539 }
3540 return SQLITE_OK;
3541}
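
/* Illustrative walk-through (hypothetical input): for
** zInput = fast "full text" index, the loop tokenizes three segments:
** [fast ] with inPhrase==0, [full text] with inPhrase==1 (so its two
** terms are linked into a single phrase), and [ index] with
** inPhrase==0 again.  An odd number of '"' characters leaves inPhrase
** set at the end, and the whole parse fails with SQLITE_ERROR.
*/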
3542
3543/* TODO(shess) Refactor the code to remove this forward decl. */
3544static int flushPendingTerms(fulltext_vtab *v);
3545
3546/* Perform a full-text query using the search expression in
3547** zInput[0..nInput-1]. Return a list of matching documents
3548** in pResult.
3549**
3550** Query terms must match column iColumn or, if iColumn>=nColumn,
3551** they are allowed to match against any column.
3552*/
3553static int fulltextQuery(
3554 fulltext_vtab *v, /* The full text index */
3555 int iColumn, /* Match against this column by default */
3556 const char *zInput, /* The query string */
3557 int nInput, /* Number of bytes in zInput[] */
3558 DataBuffer *pResult, /* Write the result doclist here */
3559 Query *pQuery /* Put parsed query string here */
3560){
3561 int i, iNext, rc;
3562 DataBuffer left, right, or, new;
3563 int nNot = 0;
3564 QueryTerm *aTerm;
3565
3566 /* TODO(shess) Instead of flushing pendingTerms, we could query for
3567 ** the relevant term and merge the doclist into what we receive from
3568 ** the database. Wait and see if this is a common issue, first.
3569 **
3570 ** A good reason not to flush is to not generate update-related
3571 ** error codes from here.
3572 */
3573
3574 /* Flush any buffered updates before executing the query. */
3575 rc = flushPendingTerms(v);
3576 if( rc!=SQLITE_OK ) return rc;
3577
3578 /* TODO(shess) I think that the queryClear() calls below are not
3579 ** necessary, because fulltextClose() already clears the query.
3580 */
3581 rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
3582 if( rc!=SQLITE_OK ) return rc;
3583
3584 /* Empty or NULL queries return no results. */
3585 if( pQuery->nTerms==0 ){
3586 dataBufferInit(pResult, 0);
3587 return SQLITE_OK;
3588 }
3589
3590 /* Merge AND terms. */
3591 /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
3592 aTerm = pQuery->pTerms;
3593 for(i = 0; i<pQuery->nTerms; i=iNext){
3594 if( aTerm[i].isNot ){
3595 /* Handle all NOT terms in a separate pass */
3596 nNot++;
3597 iNext = i + aTerm[i].nPhrase+1;
3598 continue;
3599 }
3600 iNext = i + aTerm[i].nPhrase + 1;
3601 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
3602 if( rc ){
3603 if( i!=nNot ) dataBufferDestroy(&left);
3604 queryClear(pQuery);
3605 return rc;
3606 }
3607 while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
3608 rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
3609 iNext += aTerm[iNext].nPhrase + 1;
3610 if( rc ){
3611 if( i!=nNot ) dataBufferDestroy(&left);
3612 dataBufferDestroy(&right);
3613 queryClear(pQuery);
3614 return rc;
3615 }
3616 dataBufferInit(&new, 0);
3617 docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
3618 dataBufferDestroy(&right);
3619 dataBufferDestroy(&or);
3620 right = new;
3621 }
3622 if( i==nNot ){ /* first term processed. */
3623 left = right;
3624 }else{
3625 dataBufferInit(&new, 0);
3626 docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
3627 dataBufferDestroy(&right);
3628 dataBufferDestroy(&left);
3629 left = new;
3630 }
3631 }
3632
3633 if( nNot==pQuery->nTerms ){
3634 /* We do not yet know how to handle a query of only NOT terms */
3635 return SQLITE_ERROR;
3636 }
3637
3638 /* Do the EXCEPT terms */
3639 for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
3640 if( !aTerm[i].isNot ) continue;
3641 rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
3642 if( rc ){
3643 queryClear(pQuery);
3644 dataBufferDestroy(&left);
3645 return rc;
3646 }
3647 dataBufferInit(&new, 0);
3648 docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
3649 dataBufferDestroy(&right);
3650 dataBufferDestroy(&left);
3651 left = new;
3652 }
3653
3654 *pResult = left;
3655 return rc;
3656}
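
/* Evaluation-order sketch (hypothetical query): the expression
**
**   alpha beta OR gamma -delta
**
** is evaluated as (alpha AND (beta OR gamma)) EXCEPT delta.  OR binds
** a term to the doclist of the term immediately before it, the non-NOT
** results are AND-merged left to right, and NOT terms are applied last
** via docListExceptMerge().  A query consisting solely of NOT terms is
** rejected with SQLITE_ERROR above.
*/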
3657
3658/*
3659** This is the xFilter interface for the virtual table. See
3660** the virtual table xFilter method documentation for additional
3661** information.
3662**
3663** If idxNum==QUERY_GENERIC then do a full table scan against
3664** the %_content table.
3665**
3666** If idxNum==QUERY_DOCID then do a docid lookup for a single entry
3667** in the %_content table.
3668**
3669** If idxNum>=QUERY_FULLTEXT then use the full text index. The
3670** column on the left-hand side of the MATCH operator is column
3671** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
3672** side of the MATCH operator.
3673*/
3674/* TODO(shess) Upgrade the cursor initialization and destruction to
3675** account for fulltextFilter() being called multiple times on the
3676** same cursor. The current solution is very fragile. Apply fix to
3677** fts3 as appropriate.
3678*/
3679static int fulltextFilter(
3680 sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
3681 int idxNum, const char *idxStr, /* Which indexing scheme to use */
3682 int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
3683){
3684 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3685 fulltext_vtab *v = cursor_vtab(c);
3686 int rc;
3687 StringBuffer sb;
3688
3689 TRACE(("FTS3 Filter %p\n",pCursor));
3690
3691 initStringBuffer(&sb);
3692 append(&sb, "SELECT docid, ");
3693 appendList(&sb, v->nColumn, v->azContentColumn);
3694 append(&sb, " FROM %_content");
3695 if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?");
3696 sqlite3_finalize(c->pStmt);
3697 rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, stringBufferData(&sb));
3698 stringBufferDestroy(&sb);
3699 if( rc!=SQLITE_OK ) return rc;
3700
3701 c->iCursorType = idxNum;
3702 switch( idxNum ){
3703 case QUERY_GENERIC:
3704 break;
3705
3706 case QUERY_DOCID:
3707 rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
3708 if( rc!=SQLITE_OK ) return rc;
3709 break;
3710
3711 default: /* full-text search */
3712 {
3713 const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
3714 assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
3715 assert( argc==1 );
3716 queryClear(&c->q);
3717 if( c->result.nData!=0 ){
3718 /* This case happens if the same cursor is used repeatedly. */
3719 dlrDestroy(&c->reader);
3720 dataBufferReset(&c->result);
3721 }else{
3722 dataBufferInit(&c->result, 0);
3723 }
3724 rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
3725 if( rc!=SQLITE_OK ) return rc;
3726 if( c->result.nData!=0 ){
3727 dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
3728 }
3729 break;
3730 }
3731 }
3732
3733 return fulltextNext(pCursor);
3734}
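
/* Statements that would typically drive the three cases above
** (illustrative; idxNum is assigned by the xBestIndex method):
**
**   SELECT * FROM t;                    -- QUERY_GENERIC, full scan
**   SELECT * FROM t WHERE docid = 5;    -- QUERY_DOCID lookup
**   SELECT * FROM t WHERE t MATCH 'x';  -- QUERY_FULLTEXT + column
*/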
3735
3736/* This is the xEof method of the virtual table. The SQLite core
3737** calls this routine to find out if it has reached the end of
3738** a query's results set.
3739*/
3740static int fulltextEof(sqlite3_vtab_cursor *pCursor){
3741 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3742 return c->eof;
3743}
3744
3745/* This is the xColumn method of the virtual table. The SQLite
3746** core calls this method during a query when it needs the value
3747** of a column from the virtual table. This method needs to use
3748** one of the sqlite3_result_*() routines to store the requested
3749** value back in the pContext.
3750*/
3751static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
3752 sqlite3_context *pContext, int idxCol){
3753 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3754 fulltext_vtab *v = cursor_vtab(c);
3755
3756 if( idxCol<v->nColumn ){
3757 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
3758 sqlite3_result_value(pContext, pVal);
3759 }else if( idxCol==v->nColumn ){
3760 /* The extra column whose name is the same as the table.
3761 ** Return a blob which is a pointer to the cursor.
3762 */
3763 sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
3764 }else if( idxCol==v->nColumn+1 ){
3765 /* The docid column, which is an alias for rowid. */
3766 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, 0);
3767 sqlite3_result_value(pContext, pVal);
3768 }
3769 return SQLITE_OK;
3770}
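
/* Note: the cursor-pointer blob returned for the table-name column
** above is how the snippet() and offsets() scalar functions locate
** the cursor for the current row.
*/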
3771
3772/* This is the xRowid method. The SQLite core calls this routine to
3773** retrieve the rowid for the current row of the result set. fts3
3774** exposes %_content.docid as the rowid for the virtual table. The
3775** rowid should be written to *pRowid.
3776*/
3777static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
3778 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3779
3780 *pRowid = sqlite3_column_int64(c->pStmt, 0);
3781 return SQLITE_OK;
3782}
3783
3784/* Add all terms in [zText] to the pendingTerms table. If [iColumn] >= 0,
3785** we also store positions and offsets in the hash table using that
3786** column number.
3787*/
3788static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
3789 const char *zText, int iColumn){
3790 sqlite3_tokenizer *pTokenizer = v->pTokenizer;
3791 sqlite3_tokenizer_cursor *pCursor;
3792 const char *pToken;
3793 int nTokenBytes;
3794 int iStartOffset, iEndOffset, iPosition;
3795 int rc;
3796
3797 rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
3798 if( rc!=SQLITE_OK ) return rc;
3799
3800 pCursor->pTokenizer = pTokenizer;
3801 while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
3802 &pToken, &nTokenBytes,
3803 &iStartOffset, &iEndOffset,
3804 &iPosition) ){
3805 DLCollector *p;
3806 int nData; /* Size of doclist before our update. */
3807
3808 /* Positions can't be negative; we use -1 as a terminator internally. */
3809 if( iPosition<0 ){
3810 pTokenizer->pModule->xClose(pCursor);
3811 return SQLITE_ERROR;
3812 }
3813
3814 p = fts3HashFind(&v->pendingTerms, pToken, nTokenBytes);
3815 if( p==NULL ){
3816 nData = 0;
3817 p = dlcNew(iDocid, DL_DEFAULT);
3818 fts3HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);
3819
3820 /* Overhead for our hash table entry, the key, and the value. */
3821 v->nPendingData += sizeof(struct fts3HashElem)+sizeof(*p)+nTokenBytes;
3822 }else{
3823 nData = p->b.nData;
3824 if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
3825 }
3826 if( iColumn>=0 ){
3827 dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
3828 }
3829
3830 /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
3831 v->nPendingData += p->b.nData-nData;
3832 }
3833
3834 /* TODO(shess) Check return? Should this be able to cause errors at
3835 ** this point? Actually, same question about sqlite3_finalize(),
3836 ** though one could argue that failure there means that the data is
3837 ** not durable. *ponder*
3838 */
3839 pTokenizer->pModule->xClose(pCursor);
3840 return rc;
3841}
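
/* Worked example (hypothetical row): indexing zText = "fast fox fast"
** as docid 7, column 0 leaves two entries in pendingTerms:
**
**   "fast" -> doclist: docid 7, column 0, positions 0 and 2
**   "fox"  -> doclist: docid 7, column 0, position 1
**
** nPendingData tracks the bytes added for each hash entry and doclist
** so that callers know when the buffer should be flushed to disk.
*/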
3842
3843/* Add doclists for all terms in [pValues] to pendingTerms table. */
3844static int insertTerms(fulltext_vtab *v, sqlite_int64 iDocid,
3845 sqlite3_value **pValues){
3846 int i;
3847 for(i = 0; i < v->nColumn ; ++i){
3848 char *zText = (char*)sqlite3_value_text(pValues[i]);
3849 int rc = buildTerms(v, iDocid, zText, i);
3850 if( rc!=SQLITE_OK ) return rc;
3851 }
3852 return SQLITE_OK;
3853}
3854
3855/* Add empty doclists for all terms in the given row's content to
3856** pendingTerms.
3857*/
3858static int deleteTerms(fulltext_vtab *v, sqlite_int64 iDocid){
3859 const char **pValues;
3860 int i, rc;
3861
3862 /* TODO(shess) Should we allow such tables at all? */
3863 if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
3864
3865 rc = content_select(v, iDocid, &pValues);
3866 if( rc!=SQLITE_OK ) return rc;
3867
3868 for(i = 0 ; i < v->nColumn; ++i) {
3869 rc = buildTerms(v, iDocid, pValues[i], -1);
3870 if( rc!=SQLITE_OK ) break;
3871 }
3872
3873 freeStringArray(v->nColumn, pValues);
3874 return rc; /* Propagate any error from buildTerms(). */
3875}
3876
3877/* TODO(shess) Refactor the code to remove this forward decl. */
3878static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
3879
3880/* Insert a row into the %_content table; set *piDocid to be the ID of the
3881** new row. Add doclists for terms to pendingTerms.
3882*/
3883static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestDocid,
3884 sqlite3_value **pValues, sqlite_int64 *piDocid){
3885 int rc;
3886
3887 rc = content_insert(v, pRequestDocid, pValues); /* execute an SQL INSERT */
3888 if( rc!=SQLITE_OK ) return rc;
3889
3890 /* docid column is an alias for rowid. */
3891 *piDocid = sqlite3_last_insert_rowid(v->db);
3892 rc = initPendingTerms(v, *piDocid);
3893 if( rc!=SQLITE_OK ) return rc;
3894
3895 return insertTerms(v, *piDocid, pValues);
3896}
3897
3898/* Delete a row from the %_content table; add empty doclists for terms
3899** to pendingTerms.
3900*/
3901static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
3902 int rc = initPendingTerms(v, iRow);
3903 if( rc!=SQLITE_OK ) return rc;
3904
3905 rc = deleteTerms(v, iRow);
3906 if( rc!=SQLITE_OK ) return rc;
3907
3908 return content_delete(v, iRow); /* execute an SQL DELETE */
3909}
3910
3911/* Update a row in the %_content table; add delete doclists to
3912** pendingTerms for old terms not in the new data, add insert doclists
3913** to pendingTerms for terms in the new data.
3914*/
3915static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
3916 sqlite3_value **pValues){
3917 int rc = initPendingTerms(v, iRow);
3918 if( rc!=SQLITE_OK ) return rc;
3919
3920 /* Generate an empty doclist for each term that previously appeared in this
3921 * row. */
3922 rc = deleteTerms(v, iRow);
3923 if( rc!=SQLITE_OK ) return rc;
3924
3925 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
3926 if( rc!=SQLITE_OK ) return rc;
3927
3928 /* Now add positions for terms which appear in the updated row. */
3929 return insertTerms(v, iRow, pValues);
3930}
3931
3932/*******************************************************************/
3933/* InteriorWriter is used to collect terms and block references into
3934** interior nodes in %_segments. See commentary at top of file for
3935** format.
3936*/
3937
3938/* How large interior nodes can grow. */
3939#define INTERIOR_MAX 2048
3940
3941/* Minimum number of terms per interior node (except the root). This
3942** prevents large terms from making the tree too skinny - must be >0
3943** so that the tree always makes progress. Note that the min tree
3944** fanout will be INTERIOR_MIN_TERMS+1.
3945*/
3946#define INTERIOR_MIN_TERMS 7
3947#if INTERIOR_MIN_TERMS<1
3948# error INTERIOR_MIN_TERMS must be greater than 0.
3949#endif
3950
3951/* ROOT_MAX controls how much data is stored inline in the segment
3952** directory.
3953*/
3954/* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's
3955** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
3956** can both see it, but if the caller passed it in, we wouldn't even
3957** need a define.
3958*/
3959#define ROOT_MAX 1024
3960#if ROOT_MAX<VARINT_MAX*2
3961# error ROOT_MAX must have enough space for a header.
3962#endif
3963
3964/* InteriorBlock stores a linked-list of interior blocks while a lower
3965** layer is being constructed.
3966*/
3967typedef struct InteriorBlock {
3968 DataBuffer term; /* Leftmost term in block's subtree. */
3969 DataBuffer data; /* Accumulated data for the block. */
3970 struct InteriorBlock *next;
3971} InteriorBlock;
3972
3973static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
3974 const char *pTerm, int nTerm){
3975 InteriorBlock *block = calloc(1, sizeof(InteriorBlock));
3976 char c[VARINT_MAX+VARINT_MAX];
3977 int n;
3978
3979 dataBufferInit(&block->term, 0);
3980 dataBufferReplace(&block->term, pTerm, nTerm);
3981
3982 n = putVarint(c, iHeight);
3983 n += putVarint(c+n, iChildBlock);
3984 dataBufferInit(&block->data, INTERIOR_MAX);
3985 dataBufferReplace(&block->data, c, n);
3986
3987 return block;
3988}
3989
3990#ifndef NDEBUG
3991/* Verify that the data is readable as an interior node. */
3992static void interiorBlockValidate(InteriorBlock *pBlock){
3993 const char *pData = pBlock->data.pData;
3994 int nData = pBlock->data.nData;
3995 int n, iDummy;
3996 sqlite_int64 iBlockid;
3997
3998 assert( nData>0 );
3999 assert( pData!=0 );
4000 assert( pData+nData>pData );
4001
4002 /* Must lead with height of node as a varint(n), n>0 */
4003 n = getVarint32(pData, &iDummy);
4004 assert( n>0 );
4005 assert( iDummy>0 );
4006 assert( n<nData );
4007 pData += n;
4008 nData -= n;
4009
4010 /* Must contain iBlockid. */
4011 n = getVarint(pData, &iBlockid);
4012 assert( n>0 );
4013 assert( n<=nData );
4014 pData += n;
4015 nData -= n;
4016
4017 /* Zero or more terms of positive length */
4018 if( nData!=0 ){
4019 /* First term is not delta-encoded. */
4020 n = getVarint32(pData, &iDummy);
4021 assert( n>0 );
4022 assert( iDummy>0 );
4023 assert( n+iDummy>0);
4024 assert( n+iDummy<=nData );
4025 pData += n+iDummy;
4026 nData -= n+iDummy;
4027
4028 /* Following terms delta-encoded. */
4029 while( nData!=0 ){
4030 /* Length of shared prefix. */
4031 n = getVarint32(pData, &iDummy);
4032 assert( n>0 );
4033 assert( iDummy>=0 );
4034 assert( n<nData );
4035 pData += n;
4036 nData -= n;
4037
4038 /* Length and data of distinct suffix. */
4039 n = getVarint32(pData, &iDummy);
4040 assert( n>0 );
4041 assert( iDummy>0 );
4042 assert( n+iDummy>0);
4043 assert( n+iDummy<=nData );
4044 pData += n+iDummy;
4045 nData -= n+iDummy;
4046 }
4047 }
4048}
4049#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
4050#else
4051#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
4052#endif
4053
4054typedef struct InteriorWriter {
4055 int iHeight; /* from 0 at leaves. */
4056 InteriorBlock *first, *last;
4057 struct InteriorWriter *parentWriter;
4058
4059 DataBuffer term; /* Last term written to block "last". */
4060 sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
4061#ifndef NDEBUG
4062 sqlite_int64 iLastChildBlock; /* for consistency checks. */
4063#endif
4064} InteriorWriter;
4065
4066/* Initialize an interior node where pTerm[nTerm] marks the leftmost
4067** term in the tree. iChildBlock is the leftmost child block at the
4068** next level down the tree.
4069*/
4070static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
4071 sqlite_int64 iChildBlock,
4072 InteriorWriter *pWriter){
4073 InteriorBlock *block;
4074 assert( iHeight>0 );
4075 CLEAR(pWriter);
4076
4077 pWriter->iHeight = iHeight;
4078 pWriter->iOpeningChildBlock = iChildBlock;
4079#ifndef NDEBUG
4080 pWriter->iLastChildBlock = iChildBlock;
4081#endif
4082 block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
4083 pWriter->last = pWriter->first = block;
4084 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4085 dataBufferInit(&pWriter->term, 0);
4086}
4087
4088/* Append the child node rooted at iChildBlock to the interior node,
4089** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
4090*/
4091static void interiorWriterAppend(InteriorWriter *pWriter,
4092 const char *pTerm, int nTerm,
4093 sqlite_int64 iChildBlock){
4094 char c[VARINT_MAX+VARINT_MAX];
4095 int n, nPrefix = 0;
4096
4097 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4098
4099 /* The first term written into an interior node is actually
4100 ** associated with the second child added (the first child was added
4101 ** in interiorWriterInit, or in the if clause at the bottom of this
4102 ** function). That term gets encoded straight up, with nPrefix left
4103 ** at 0.
4104 */
4105 if( pWriter->term.nData==0 ){
4106 n = putVarint(c, nTerm);
4107 }else{
4108 while( nPrefix<pWriter->term.nData &&
4109 pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
4110 nPrefix++;
4111 }
4112
4113 n = putVarint(c, nPrefix);
4114 n += putVarint(c+n, nTerm-nPrefix);
4115 }
4116
4117#ifndef NDEBUG
4118 pWriter->iLastChildBlock++;
4119#endif
4120 assert( pWriter->iLastChildBlock==iChildBlock );
4121
4122 /* Overflow to a new block if the new term makes the current block
4123 ** too big, and the current block already has enough terms.
4124 */
4125 if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
4126 iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
4127 pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
4128 pTerm, nTerm);
4129 pWriter->last = pWriter->last->next;
4130 pWriter->iOpeningChildBlock = iChildBlock;
4131 dataBufferReset(&pWriter->term);
4132 }else{
4133 dataBufferAppend2(&pWriter->last->data, c, n,
4134 pTerm+nPrefix, nTerm-nPrefix);
4135 dataBufferReplace(&pWriter->term, pTerm, nTerm);
4136 }
4137 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4138}
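
/* Encoding sketch for the interior format (hypothetical terms): a new
** node begins with varint(iHeight) varint(iChildBlock).  Appending the
** first term "apple" writes varint(5) "apple"; appending "apply" next
** shares nPrefix==4 bytes and writes varint(4) varint(1) "y".  A block
** is only closed out once it exceeds INTERIOR_MAX bytes and holds more
** than INTERIOR_MIN_TERMS children.
*/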
4139
4140/* Free the space used by pWriter, including the linked-list of
4141** InteriorBlocks, and parentWriter, if present.
4142*/
4143static int interiorWriterDestroy(InteriorWriter *pWriter){
4144 InteriorBlock *block = pWriter->first;
4145
4146 while( block!=NULL ){
4147 InteriorBlock *b = block;
4148 block = block->next;
4149 dataBufferDestroy(&b->term);
4150 dataBufferDestroy(&b->data);
4151 free(b);
4152 }
4153 if( pWriter->parentWriter!=NULL ){
4154 interiorWriterDestroy(pWriter->parentWriter);
4155 free(pWriter->parentWriter);
4156 }
4157 dataBufferDestroy(&pWriter->term);
4158 SCRAMBLE(pWriter);
4159 return SQLITE_OK;
4160}
4161
4162/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
4163** directly, leaving *piEndBlockid unchanged. Otherwise, flush
4164** pWriter to %_segments, building a new layer of interior nodes, and
4165** recursively ask for their root info.
4166*/
4167static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
4168 char **ppRootInfo, int *pnRootInfo,
4169 sqlite_int64 *piEndBlockid){
4170 InteriorBlock *block = pWriter->first;
4171 sqlite_int64 iBlockid = 0;
4172 int rc;
4173
4174 /* If we can fit the segment inline */
4175 if( block==pWriter->last && block->data.nData<ROOT_MAX ){
4176 *ppRootInfo = block->data.pData;
4177 *pnRootInfo = block->data.nData;
4178 return SQLITE_OK;
4179 }
4180
4181 /* Flush the first block to %_segments, and create a new level of
4182 ** interior node.
4183 */
4184 ASSERT_VALID_INTERIOR_BLOCK(block);
4185 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4186 if( rc!=SQLITE_OK ) return rc;
4187 *piEndBlockid = iBlockid;
4188
4189 pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
4190 interiorWriterInit(pWriter->iHeight+1,
4191 block->term.pData, block->term.nData,
4192 iBlockid, pWriter->parentWriter);
4193
4194 /* Flush additional blocks and append to the higher interior
4195 ** node.
4196 */
4197 for(block=block->next; block!=NULL; block=block->next){
4198 ASSERT_VALID_INTERIOR_BLOCK(block);
4199 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4200 if( rc!=SQLITE_OK ) return rc;
4201 *piEndBlockid = iBlockid;
4202
4203 interiorWriterAppend(pWriter->parentWriter,
4204 block->term.pData, block->term.nData, iBlockid);
4205 }
4206
4207 /* Parent node gets the chance to be the root. */
4208 return interiorWriterRootInfo(v, pWriter->parentWriter,
4209 ppRootInfo, pnRootInfo, piEndBlockid);
4210}
4211
4212/****************************************************************/
4213/* InteriorReader is used to read off the data from an interior node
4214** (see comment at top of file for the format).
4215*/
4216typedef struct InteriorReader {
4217 const char *pData;
4218 int nData;
4219
4220 DataBuffer term; /* previous term, for decoding term delta. */
4221
4222 sqlite_int64 iBlockid;
4223} InteriorReader;
4224
4225static void interiorReaderDestroy(InteriorReader *pReader){
4226 dataBufferDestroy(&pReader->term);
4227 SCRAMBLE(pReader);
4228}
4229
4230/* TODO(shess) The assertions are great, but what if we're in NDEBUG
4231** and the blob is empty or otherwise contains suspect data?
4232*/
4233static void interiorReaderInit(const char *pData, int nData,
4234 InteriorReader *pReader){
4235 int n, nTerm;
4236
4237 /* Require at least the leading flag byte */
4238 assert( nData>0 );
4239 assert( pData[0]!='\0' );
4240
4241 CLEAR(pReader);
4242
4243 /* Decode the base blockid, and set the cursor to the first term. */
4244 n = getVarint(pData+1, &pReader->iBlockid);
4245 assert( 1+n<=nData );
4246 pReader->pData = pData+1+n;
4247 pReader->nData = nData-(1+n);
4248
4249 /* A single-child interior node (such as when a leaf node was too
4250 ** large for the segment directory) won't have any terms.
4251 ** Otherwise, decode the first term.
4252 */
4253 if( pReader->nData==0 ){
4254 dataBufferInit(&pReader->term, 0);
4255 }else{
4256 n = getVarint32(pReader->pData, &nTerm);
4257 dataBufferInit(&pReader->term, nTerm);
4258 dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
4259 assert( n+nTerm<=pReader->nData );
4260 pReader->pData += n+nTerm;
4261 pReader->nData -= n+nTerm;
4262 }
4263}
4264
4265static int interiorReaderAtEnd(InteriorReader *pReader){
4266 return pReader->term.nData==0;
4267}
4268
4269static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
4270 return pReader->iBlockid;
4271}
4272
4273static int interiorReaderTermBytes(InteriorReader *pReader){
4274 assert( !interiorReaderAtEnd(pReader) );
4275 return pReader->term.nData;
4276}
4277static const char *interiorReaderTerm(InteriorReader *pReader){
4278 assert( !interiorReaderAtEnd(pReader) );
4279 return pReader->term.pData;
4280}
4281
4282/* Step forward to the next term in the node. */
4283static void interiorReaderStep(InteriorReader *pReader){
4284 assert( !interiorReaderAtEnd(pReader) );
4285
4286 /* If the last term has been read, signal eof, else construct the
4287 ** next term.
4288 */
4289 if( pReader->nData==0 ){
4290 dataBufferReset(&pReader->term);
4291 }else{
4292 int n, nPrefix, nSuffix;
4293
4294 n = getVarint32(pReader->pData, &nPrefix);
4295 n += getVarint32(pReader->pData+n, &nSuffix);
4296
4297 /* Truncate the current term and append suffix data. */
4298 pReader->term.nData = nPrefix;
4299 dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
4300
4301 assert( n+nSuffix<=pReader->nData );
4302 pReader->pData += n+nSuffix;
4303 pReader->nData -= n+nSuffix;
4304 }
4305 pReader->iBlockid++;
4306}
4307
4308/* Compare the current term to pTerm[nTerm], returning strcmp-style
4309** results. If isPrefix, equality means equal through nTerm bytes.
4310*/
4311static int interiorReaderTermCmp(InteriorReader *pReader,
4312 const char *pTerm, int nTerm, int isPrefix){
4313 const char *pReaderTerm = interiorReaderTerm(pReader);
4314 int nReaderTerm = interiorReaderTermBytes(pReader);
4315 int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
4316
4317 if( n==0 ){
4318 if( nReaderTerm>0 ) return -1;
4319 if( nTerm>0 ) return 1;
4320 return 0;
4321 }
4322
4323 c = memcmp(pReaderTerm, pTerm, n);
4324 if( c!=0 ) return c;
4325 if( isPrefix && n==nTerm ) return 0;
4326 return nReaderTerm - nTerm;
4327}
4328
4329/****************************************************************/
4330/* LeafWriter is used to collect terms and associated doclist data
4331** into leaf blocks in %_segments (see top of file for format info).
4332** Expected usage is:
4333**
4334** LeafWriter writer;
4335** leafWriterInit(0, 0, &writer);
4336** while( sorted_terms_left_to_process ){
4337** // data is doclist data for that term.
4338** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
4339** if( rc!=SQLITE_OK ) goto err;
4340** }
4341** rc = leafWriterFinalize(v, &writer);
4342**err:
4343** leafWriterDestroy(&writer);
4344** return rc;
4345**
4346** leafWriterStep() may write a collected leaf out to %_segments.
4347** leafWriterFinalize() finishes writing any buffered data and stores
4348** a root node in %_segdir. leafWriterDestroy() frees all buffers and
4349** InteriorWriters allocated as part of writing this segment.
4350**
4351** TODO(shess) Document leafWriterStepMerge().
4352*/
4353
4354/* Put terms with data this big in their own block. */
4355#define STANDALONE_MIN 1024
4356
4357/* Keep leaf blocks below this size. */
4358#define LEAF_MAX 2048
4359
4360typedef struct LeafWriter {
4361 int iLevel;
4362 int idx;
4363 sqlite_int64 iStartBlockid; /* needed to create the root info */
4364 sqlite_int64 iEndBlockid; /* when we're done writing. */
4365
4366 DataBuffer term; /* previous encoded term */
4367 DataBuffer data; /* encoding buffer */
4368
4369 /* Number of bytes of the first term in the current node which
4370 ** distinguish it from the last term of the previous node.
4371 */
4372 int nTermDistinct;
4373
4374 InteriorWriter parentWriter; /* if we overflow */
4375 int has_parent;
4376} LeafWriter;
4377
4378static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
4379 CLEAR(pWriter);
4380 pWriter->iLevel = iLevel;
4381 pWriter->idx = idx;
4382
4383 dataBufferInit(&pWriter->term, 32);
4384
4385 /* Start out with a reasonably sized block, though it can grow. */
4386 dataBufferInit(&pWriter->data, LEAF_MAX);
4387}
4388
4389#ifndef NDEBUG
4390/* Verify that the data is readable as a leaf node. */
4391static void leafNodeValidate(const char *pData, int nData){
4392 int n, iDummy;
4393
4394 if( nData==0 ) return;
4395 assert( nData>0 );
4396 assert( pData!=0 );
4397 assert( pData+nData>pData );
4398
4399 /* Must lead with a varint(0) */
4400 n = getVarint32(pData, &iDummy);
4401 assert( iDummy==0 );
4402 assert( n>0 );
4403 assert( n<nData );
4404 pData += n;
4405 nData -= n;
4406
4407 /* Leading term length and data must fit in buffer. */
4408 n = getVarint32(pData, &iDummy);
4409 assert( n>0 );
4410 assert( iDummy>0 );
4411 assert( n+iDummy>0 );
4412 assert( n+iDummy<nData );
4413 pData += n+iDummy;
4414 nData -= n+iDummy;
4415
4416 /* Leading term's doclist length and data must fit. */
4417 n = getVarint32(pData, &iDummy);
4418 assert( n>0 );
4419 assert( iDummy>0 );
4420 assert( n+iDummy>0 );
4421 assert( n+iDummy<=nData );
4422 ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
4423 pData += n+iDummy;
4424 nData -= n+iDummy;
4425
4426 /* Verify that trailing terms and doclists also are readable. */
4427 while( nData!=0 ){
4428 n = getVarint32(pData, &iDummy);
4429 assert( n>0 );
4430 assert( iDummy>=0 );
4431 assert( n<nData );
4432 pData += n;
4433 nData -= n;
4434 n = getVarint32(pData, &iDummy);
4435 assert( n>0 );
4436 assert( iDummy>0 );
4437 assert( n+iDummy>0 );
4438 assert( n+iDummy<nData );
4439 pData += n+iDummy;
4440 nData -= n+iDummy;
4441
4442 n = getVarint32(pData, &iDummy);
4443 assert( n>0 );
4444 assert( iDummy>0 );
4445 assert( n+iDummy>0 );
4446 assert( n+iDummy<=nData );
4447 ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
4448 pData += n+iDummy;
4449 nData -= n+iDummy;
4450 }
4451}
4452#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
4453#else
4454#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
4455#endif
4456
4457/* Flush the current leaf node to %_segments, and add the resulting
4458** blockid and the starting term to the interior node which will
4459** contain it.
4460*/
4461static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
4462 int iData, int nData){
4463 sqlite_int64 iBlockid = 0;
4464 const char *pStartingTerm;
4465 int nStartingTerm, rc, n;
4466
4467 /* Must have the leading varint(0) flag, plus at least some
4468 ** valid-looking data.
4469 */
4470 assert( nData>2 );
4471 assert( iData>=0 );
4472 assert( iData+nData<=pWriter->data.nData );
4473 ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);
4474
4475 rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
4476 if( rc!=SQLITE_OK ) return rc;
4477 assert( iBlockid!=0 );
4478
4479 /* Reconstruct the first term in the leaf for purposes of building
4480 ** the interior node.
4481 */
4482 n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
4483 pStartingTerm = pWriter->data.pData+iData+1+n;
4484 assert( pWriter->data.nData>iData+1+n+nStartingTerm );
4485 assert( pWriter->nTermDistinct>0 );
4486 assert( pWriter->nTermDistinct<=nStartingTerm );
4487 nStartingTerm = pWriter->nTermDistinct;
4488
4489 if( pWriter->has_parent ){
4490 interiorWriterAppend(&pWriter->parentWriter,
4491 pStartingTerm, nStartingTerm, iBlockid);
4492 }else{
4493 interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
4494 &pWriter->parentWriter);
4495 pWriter->has_parent = 1;
4496 }
4497
4498 /* Track the span of this segment's leaf nodes. */
4499 if( pWriter->iEndBlockid==0 ){
4500 pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
4501 }else{
4502 pWriter->iEndBlockid++;
4503 assert( iBlockid==pWriter->iEndBlockid );
4504 }
4505
4506 return SQLITE_OK;
4507}
4508static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
4509 int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
4510 if( rc!=SQLITE_OK ) return rc;
4511
4512 /* Re-initialize the output buffer. */
4513 dataBufferReset(&pWriter->data);
4514
4515 return SQLITE_OK;
4516}
4517
4518/* Fetch the root info for the segment. If the entire leaf fits
4519** within ROOT_MAX, then it will be returned directly, otherwise it
4520** will be flushed and the root info will be returned from the
4521** interior node. *piEndBlockid is set to the blockid of the last
4522** interior or leaf node written to disk (0 if none are written at
4523** all).
4524*/
4525static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
4526 char **ppRootInfo, int *pnRootInfo,
4527 sqlite_int64 *piEndBlockid){
4528 /* we can fit the segment entirely inline */
4529 if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
4530 *ppRootInfo = pWriter->data.pData;
4531 *pnRootInfo = pWriter->data.nData;
4532 *piEndBlockid = 0;
4533 return SQLITE_OK;
4534 }
4535
4536 /* Flush remaining leaf data. */
4537 if( pWriter->data.nData>0 ){
4538 int rc = leafWriterFlush(v, pWriter);
4539 if( rc!=SQLITE_OK ) return rc;
4540 }
4541
4542 /* We must have flushed a leaf at some point. */
4543 assert( pWriter->has_parent );
4544
4545 /* Tentatively set the end leaf blockid as the end blockid. If the
4546 ** interior node can be returned inline, this will be the final
4547 ** blockid, otherwise it will be overwritten by
4548 ** interiorWriterRootInfo().
4549 */
4550 *piEndBlockid = pWriter->iEndBlockid;
4551
4552 return interiorWriterRootInfo(v, &pWriter->parentWriter,
4553 ppRootInfo, pnRootInfo, piEndBlockid);
4554}
4555
4556/* Collect the rootInfo data and store it into the segment directory.
4557** This has the effect of flushing the segment's leaf data to
4558** %_segments, and also flushing any interior nodes to %_segments.
4559*/
4560static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
4561 sqlite_int64 iEndBlockid;
4562 char *pRootInfo;
4563 int rc, nRootInfo;
4564
4565 rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
4566 if( rc!=SQLITE_OK ) return rc;
4567
4568 /* Don't bother storing an entirely empty segment. */
4569 if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
4570
4571 return segdir_set(v, pWriter->iLevel, pWriter->idx,
4572 pWriter->iStartBlockid, pWriter->iEndBlockid,
4573 iEndBlockid, pRootInfo, nRootInfo);
4574}
4575
4576static void leafWriterDestroy(LeafWriter *pWriter){
4577 if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
4578 dataBufferDestroy(&pWriter->term);
4579 dataBufferDestroy(&pWriter->data);
4580}
4581
4582/* Encode a term into the leafWriter, delta-encoding as appropriate.
4583** Returns the length of the new term which distinguishes it from the
4584** previous term, which can be used to set nTermDistinct when a node
4585** boundary is crossed.
4586*/
4587static int leafWriterEncodeTerm(LeafWriter *pWriter,
4588 const char *pTerm, int nTerm){
4589 char c[VARINT_MAX+VARINT_MAX];
4590 int n, nPrefix = 0;
4591
4592 assert( nTerm>0 );
4593 while( nPrefix<pWriter->term.nData &&
4594 pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
4595 nPrefix++;
4596 /* Failing this implies that the terms weren't in order. */
4597 assert( nPrefix<nTerm );
4598 }
4599
4600 if( pWriter->data.nData==0 ){
4601 /* Encode the node header and leading term as:
4602 ** varint(0)
4603 ** varint(nTerm)
4604 ** char pTerm[nTerm]
4605 */
4606 n = putVarint(c, '\0');
4607 n += putVarint(c+n, nTerm);
4608 dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
4609 }else{
4610 /* Delta-encode the term as:
4611 ** varint(nPrefix)
4612 ** varint(nSuffix)
4613 ** char pTermSuffix[nSuffix]
4614 */
4615 n = putVarint(c, nPrefix);
4616 n += putVarint(c+n, nTerm-nPrefix);
4617 dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
4618 }
4619 dataBufferReplace(&pWriter->term, pTerm, nTerm);
4620
4621 return nPrefix+1;
4622}
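
/* Encoding sketch (hypothetical terms): a new node starting with
** "search" is encoded as varint(0) varint(6) "search"; a following
** term "seattle" shares nPrefix==3 bytes ("sea") and is encoded as
** varint(3) varint(4) "ttle".  The returned distinguishing length is
** nPrefix+1, i.e. 4 bytes ("seat") for "seattle".
*/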
4623
4624/* Used to avoid a memmove when a large amount of doclist data is in
4625** the buffer. This constructs a node and term header before
4626** iDoclistData and flushes the resulting complete node using
4627** leafWriterInternalFlush().
4628*/
4629static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
4630 const char *pTerm, int nTerm,
4631 int iDoclistData){
4632 char c[VARINT_MAX+VARINT_MAX];
4633 int iData, n = putVarint(c, 0);
4634 n += putVarint(c+n, nTerm);
4635
4636 /* There should always be room for the header. Even if pTerm shared
4637 ** a substantial prefix with the previous term, the entire prefix
4638 ** could be constructed from earlier data in the doclist, so there
4639 ** should be room.
4640 */
4641 assert( iDoclistData>=n+nTerm );
4642
4643 iData = iDoclistData-(n+nTerm);
4644 memcpy(pWriter->data.pData+iData, c, n);
4645 memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);
4646
4647 return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
4648}
4649
4650/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
4651** %_segments.
4652*/
4653static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
4654 const char *pTerm, int nTerm,
4655 DLReader *pReaders, int nReaders){
4656 char c[VARINT_MAX+VARINT_MAX];
4657 int iTermData = pWriter->data.nData, iDoclistData;
4658 int i, nData, n, nActualData, nActual, rc, nTermDistinct;
4659
4660 ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
4661 nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);
4662
4663 /* Remember nTermDistinct if opening a new node. */
4664 if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;
4665
4666 iDoclistData = pWriter->data.nData;
4667
4668 /* Estimate the length of the merged doclist so we can leave space
4669 ** to encode it.
4670 */
4671 for(i=0, nData=0; i<nReaders; i++){
4672 nData += dlrAllDataBytes(&pReaders[i]);
4673 }
4674 n = putVarint(c, nData);
4675 dataBufferAppend(&pWriter->data, c, n);
4676
4677 docListMerge(&pWriter->data, pReaders, nReaders);
4678 ASSERT_VALID_DOCLIST(DL_DEFAULT,
4679 pWriter->data.pData+iDoclistData+n,
4680 pWriter->data.nData-iDoclistData-n, NULL);
4681
4682 /* The actual amount of doclist data at this point could be smaller
4683 ** than the length we encoded. Additionally, the space required to
4684 ** encode this length could be smaller. For small doclists, this is
4685 ** not a big deal, we can just use memmove() to adjust things.
4686 */
4687 nActualData = pWriter->data.nData-(iDoclistData+n);
4688 nActual = putVarint(c, nActualData);
4689 assert( nActualData<=nData );
4690 assert( nActual<=n );
4691
4692 /* If the new doclist is big enough to force a standalone leaf
4693 ** node, we can immediately flush it inline without doing the
4694 ** memmove().
4695 */
4696 /* TODO(shess) This test matches leafWriterStep(), which does this
4697 ** test before it knows the cost to varint-encode the term and
4698 ** doclist lengths. At some point, change to
4699 ** pWriter->data.nData-iTermData>STANDALONE_MIN.
4700 */
4701 if( nTerm+nActualData>STANDALONE_MIN ){
4702 /* Push leaf node from before this term. */
4703 if( iTermData>0 ){
4704 rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
4705 if( rc!=SQLITE_OK ) return rc;
4706
4707 pWriter->nTermDistinct = nTermDistinct;
4708 }
4709
4710 /* Fix the encoded doclist length. */
4711 iDoclistData += n - nActual;
4712 memcpy(pWriter->data.pData+iDoclistData, c, nActual);
4713
4714 /* Push the standalone leaf node. */
4715 rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
4716 if( rc!=SQLITE_OK ) return rc;
4717
4718 /* Leave the node empty. */
4719 dataBufferReset(&pWriter->data);
4720
4721 return rc;
4722 }
4723
4724 /* At this point, we know that the doclist was small, so do the
4725 ** memmove if indicated.
4726 */
4727 if( nActual<n ){
4728 memmove(pWriter->data.pData+iDoclistData+nActual,
4729 pWriter->data.pData+iDoclistData+n,
4730 pWriter->data.nData-(iDoclistData+n));
4731 pWriter->data.nData -= n-nActual;
4732 }
4733
4734 /* Replace written length with actual length. */
4735 memcpy(pWriter->data.pData+iDoclistData, c, nActual);
4736
4737 /* If the node is too large, break things up. */
4738 /* TODO(shess) This test matches leafWriterStep(), which does this
4739 ** test before it knows the cost to varint-encode the term and
4740 ** doclist lengths. At some point, change to
4741 ** pWriter->data.nData>LEAF_MAX.
4742 */
4743 if( iTermData+nTerm+nActualData>LEAF_MAX ){
4744 /* Flush out the leading data as a node */
4745 rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
4746 if( rc!=SQLITE_OK ) return rc;
4747
4748 pWriter->nTermDistinct = nTermDistinct;
4749
4750 /* Rebuild header using the current term */
4751 n = putVarint(pWriter->data.pData, 0);
4752 n += putVarint(pWriter->data.pData+n, nTerm);
4753 memcpy(pWriter->data.pData+n, pTerm, nTerm);
4754 n += nTerm;
4755
4756 /* There should always be room, because the previous encoding
4757 ** included all data necessary to construct the term.
4758 */
4759 assert( n<iDoclistData );
4760 /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
4761 ** following memcpy() is safe (as opposed to needing a memmove).
4762 */
4763 assert( 2*STANDALONE_MIN<=LEAF_MAX );
4764 assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
4765 memcpy(pWriter->data.pData+n,
4766 pWriter->data.pData+iDoclistData,
4767 pWriter->data.nData-iDoclistData);
4768 pWriter->data.nData -= iDoclistData-n;
4769 }
4770 ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
4771
4772 return SQLITE_OK;
4773}
4774
4775/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
4776** %_segments.
4777*/
4778/* TODO(shess) Revise writeZeroSegment() so that doclists are
4779** constructed directly in pWriter->data.
4780*/
4781static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
4782 const char *pTerm, int nTerm,
4783 const char *pData, int nData){
4784 int rc;
4785 DLReader reader;
4786
4787 dlrInit(&reader, DL_DEFAULT, pData, nData);
4788 rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
4789 dlrDestroy(&reader);
4790
4791 return rc;
4792}
4793
4794
4795/****************************************************************/
4796/* LeafReader is used to iterate over an individual leaf node. */
4797typedef struct LeafReader {
4798 DataBuffer term; /* copy of current term. */
4799
4800 const char *pData; /* data for current term. */
4801 int nData;
4802} LeafReader;
4803
4804static void leafReaderDestroy(LeafReader *pReader){
4805 dataBufferDestroy(&pReader->term);
4806 SCRAMBLE(pReader);
4807}
4808
4809static int leafReaderAtEnd(LeafReader *pReader){
4810 return pReader->nData<=0;
4811}
4812
4813/* Access the current term. */
4814static int leafReaderTermBytes(LeafReader *pReader){
4815 return pReader->term.nData;
4816}
4817static const char *leafReaderTerm(LeafReader *pReader){
4818 assert( pReader->term.nData>0 );
4819 return pReader->term.pData;
4820}
4821
4822/* Access the doclist data for the current term. */
4823static int leafReaderDataBytes(LeafReader *pReader){
4824 int nData;
4825 assert( pReader->term.nData>0 );
4826 getVarint32(pReader->pData, &nData);
4827 return nData;
4828}
4829static const char *leafReaderData(LeafReader *pReader){
4830 int n, nData;
4831 assert( pReader->term.nData>0 );
4832 n = getVarint32(pReader->pData, &nData);
4833 return pReader->pData+n;
4834}
4835
4836static void leafReaderInit(const char *pData, int nData,
4837 LeafReader *pReader){
4838 int nTerm, n;
4839
4840 assert( nData>0 );
4841 assert( pData[0]=='\0' );
4842
4843 CLEAR(pReader);
4844
4845 /* Read the first term, skipping the header byte. */
4846 n = getVarint32(pData+1, &nTerm);
4847 dataBufferInit(&pReader->term, nTerm);
4848 dataBufferReplace(&pReader->term, pData+1+n, nTerm);
4849
4850 /* Position after the first term. */
4851 assert( 1+n+nTerm<nData );
4852 pReader->pData = pData+1+n+nTerm;
4853 pReader->nData = nData-1-n-nTerm;
4854}
4855
4856/* Step the reader forward to the next term. */
4857static void leafReaderStep(LeafReader *pReader){
4858 int n, nData, nPrefix, nSuffix;
4859 assert( !leafReaderAtEnd(pReader) );
4860
4861 /* Skip previous entry's data block. */
4862 n = getVarint32(pReader->pData, &nData);
4863 assert( n+nData<=pReader->nData );
4864 pReader->pData += n+nData;
4865 pReader->nData -= n+nData;
4866
4867 if( !leafReaderAtEnd(pReader) ){
4868 /* Construct the new term using a prefix from the old term plus a
4869 ** suffix from the leaf data.
4870 */
4871 n = getVarint32(pReader->pData, &nPrefix);
4872 n += getVarint32(pReader->pData+n, &nSuffix);
4873 assert( n+nSuffix<pReader->nData );
4874 pReader->term.nData = nPrefix;
4875 dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
4876
4877 pReader->pData += n+nSuffix;
4878 pReader->nData -= n+nSuffix;
4879 }
4880}
4881
4882/* strcmp-style comparison of pReader's current term against pTerm.
4883** If isPrefix, equality means equal through nTerm bytes.
4884*/
4885static int leafReaderTermCmp(LeafReader *pReader,
4886 const char *pTerm, int nTerm, int isPrefix){
4887 int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
4888 if( n==0 ){
4889 if( pReader->term.nData>0 ) return -1;
4890 if( nTerm>0 ) return 1;
4891 return 0;
4892 }
4893
4894 c = memcmp(pReader->term.pData, pTerm, n);
4895 if( c!=0 ) return c;
4896 if( isPrefix && n==nTerm ) return 0;
4897 return pReader->term.nData - nTerm;
4898}
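
/* Comparison semantics sketch (hypothetical terms): with the reader
** positioned on "search", leafReaderTermCmp(pReader, "sea", 3, 1)
** returns 0 (prefix match), while the same call with isPrefix==0
** returns a positive value because "search" is longer than "sea".
*/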
4899
4900
4901/****************************************************************/
4902/* LeavesReader wraps LeafReader to allow iterating over the entire
4903** leaf layer of the tree.
4904*/
4905typedef struct LeavesReader {
4906 int idx; /* Index within the segment. */
4907
4908 sqlite3_stmt *pStmt; /* Statement we're streaming leaves from. */
4909 int eof; /* we've seen SQLITE_DONE from pStmt. */
4910
4911 LeafReader leafReader; /* reader for the current leaf. */
4912 DataBuffer rootData; /* root data for inline. */
4913} LeavesReader;
4914
4915/* Access the current term. */
4916static int leavesReaderTermBytes(LeavesReader *pReader){
4917 assert( !pReader->eof );
4918 return leafReaderTermBytes(&pReader->leafReader);
4919}
4920static const char *leavesReaderTerm(LeavesReader *pReader){
4921 assert( !pReader->eof );
4922 return leafReaderTerm(&pReader->leafReader);
4923}
4924
4925/* Access the doclist data for the current term. */
4926static int leavesReaderDataBytes(LeavesReader *pReader){
4927 assert( !pReader->eof );
4928 return leafReaderDataBytes(&pReader->leafReader);
4929}
4930static const char *leavesReaderData(LeavesReader *pReader){
4931 assert( !pReader->eof );
4932 return leafReaderData(&pReader->leafReader);
4933}
4934
4935static int leavesReaderAtEnd(LeavesReader *pReader){
4936 return pReader->eof;
4937}
4938
4939/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
4940** leaving the statement handle open, which locks the table.
4941*/
4942/* TODO(shess) This "solution" is not satisfactory. Really, there
4943** should be a check-in function for all statement handles which
4944** arranges to call sqlite3_reset(). This most likely will require
4945** modification to control flow all over the place, though, so for now
4946** just punt.
4947**
4948** Note that the current system assumes that segment merges will run to
4949** completion, which is why this particular problem hasn't arisen in
4950** this case. Probably a brittle assumption.
4951*/
4952static int leavesReaderReset(LeavesReader *pReader){
4953 return sqlite3_reset(pReader->pStmt);
4954}
4955
4956static void leavesReaderDestroy(LeavesReader *pReader){
4957 leafReaderDestroy(&pReader->leafReader);
4958 dataBufferDestroy(&pReader->rootData);
4959 SCRAMBLE(pReader);
4960}
4961
4962/* Initialize pReader with the given root data (if iStartBlockid==0
4963** the leaf data was entirely contained in the root), or from the
4964** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
4965*/
4966static int leavesReaderInit(fulltext_vtab *v,
4967 int idx,
4968 sqlite_int64 iStartBlockid,
4969 sqlite_int64 iEndBlockid,
4970 const char *pRootData, int nRootData,
4971 LeavesReader *pReader){
4972 CLEAR(pReader);
4973 pReader->idx = idx;
4974
4975 dataBufferInit(&pReader->rootData, 0);
4976 if( iStartBlockid==0 ){
4977 /* Entire leaf level fit in root data. */
4978 dataBufferReplace(&pReader->rootData, pRootData, nRootData);
4979 leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
4980 &pReader->leafReader);
4981 }else{
4982 sqlite3_stmt *s;
4983 int rc = sql_get_leaf_statement(v, idx, &s);
4984 if( rc!=SQLITE_OK ) return rc;
4985
4986 rc = sqlite3_bind_int64(s, 1, iStartBlockid);
4987 if( rc!=SQLITE_OK ) return rc;
4988
4989 rc = sqlite3_bind_int64(s, 2, iEndBlockid);
4990 if( rc!=SQLITE_OK ) return rc;
4991
4992 rc = sqlite3_step(s);
4993 if( rc==SQLITE_DONE ){
4994 pReader->eof = 1;
4995 return SQLITE_OK;
4996 }
4997 if( rc!=SQLITE_ROW ) return rc;
4998
4999 pReader->pStmt = s;
5000 leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
5001 sqlite3_column_bytes(pReader->pStmt, 0),
5002 &pReader->leafReader);
5003 }
5004 return SQLITE_OK;
5005}
5006
5007/* Step the current leaf forward to the next term. If we reach the
5008** end of the current leaf, step forward to the next leaf block.
5009*/
5010static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
5011 assert( !leavesReaderAtEnd(pReader) );
5012 leafReaderStep(&pReader->leafReader);
5013
5014 if( leafReaderAtEnd(&pReader->leafReader) ){
5015 int rc;
5016 if( pReader->rootData.pData ){
5017 pReader->eof = 1;
5018 return SQLITE_OK;
5019 }
5020 rc = sqlite3_step(pReader->pStmt);
5021 if( rc!=SQLITE_ROW ){
5022 pReader->eof = 1;
5023 return rc==SQLITE_DONE ? SQLITE_OK : rc;
5024 }
5025 leafReaderDestroy(&pReader->leafReader);
5026 leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
5027 sqlite3_column_bytes(pReader->pStmt, 0),
5028 &pReader->leafReader);
5029 }
5030 return SQLITE_OK;
5031}
5032
5033/* Order LeavesReaders by their term, ignoring idx. Readers at eof
5034** always sort to the end.
5035*/
5036static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
5037 if( leavesReaderAtEnd(lr1) ){
5038 if( leavesReaderAtEnd(lr2) ) return 0;
5039 return 1;
5040 }
5041 if( leavesReaderAtEnd(lr2) ) return -1;
5042
5043 return leafReaderTermCmp(&lr1->leafReader,
5044 leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
5045 0);
5046}
5047
5048/* Similar to leavesReaderTermCmp(), with additional ordering by idx
5049** so that older segments sort before newer segments.
5050*/
5051static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
5052 int c = leavesReaderTermCmp(lr1, lr2);
5053 if( c!=0 ) return c;
5054 return lr1->idx-lr2->idx;
5055}
5056
5057/* Assume that pLr[1]..pLr[nLr-1] are sorted.  Bubble pLr[0] into its
5058** sorted position.
5059*/
5060static void leavesReaderReorder(LeavesReader *pLr, int nLr){
5061 while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
5062 LeavesReader tmp = pLr[0];
5063 pLr[0] = pLr[1];
5064 pLr[1] = tmp;
5065 nLr--;
5066 pLr++;
5067 }
5068}
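
This is one step of an insertion sort: the tail is already ordered, so a
single forward bubble restores the invariant. A self-contained sketch with
integers (illustrative, not part of the source):

      #include <stdio.h>

      int main(void){
        int a[] = {7, 2, 5, 9};               /* a[1..3] sorted; a[0] is not */
        int n = 4, i = 0;
        while( i+1<n && a[i]>a[i+1] ){        /* bubble a[0] rightward */
          int tmp = a[i]; a[i] = a[i+1]; a[i+1] = tmp;
          i++;
        }
        for(i=0; i<n; i++) printf("%d ", a[i]);  /* prints "2 5 7 9" */
        return 0;
      }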
5069
5070/* Initializes pReaders with the segments from level iLevel, returning
5071** the number of segments in *piReaders. Leaves pReaders in sorted
5072** order.
5073*/
5074static int leavesReadersInit(fulltext_vtab *v, int iLevel,
5075 LeavesReader *pReaders, int *piReaders){
5076 sqlite3_stmt *s;
5077 int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
5078 if( rc!=SQLITE_OK ) return rc;
5079
5080 rc = sqlite3_bind_int(s, 1, iLevel);
5081 if( rc!=SQLITE_OK ) return rc;
5082
5083 i = 0;
5084 while( (rc = sqlite3_step(s))==SQLITE_ROW ){
5085 sqlite_int64 iStart = sqlite3_column_int64(s, 0);
5086 sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
5087 const char *pRootData = sqlite3_column_blob(s, 2);
5088 int nRootData = sqlite3_column_bytes(s, 2);
5089
5090 assert( i<MERGE_COUNT );
5091 rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
5092 &pReaders[i]);
5093 if( rc!=SQLITE_OK ) break;
5094
5095 i++;
5096 }
5097 if( rc!=SQLITE_DONE ){
5098 while( i-->0 ){
5099 leavesReaderDestroy(&pReaders[i]);
5100 }
5101 return rc;
5102 }
5103
5104 *piReaders = i;
5105
5106 /* Leave our results sorted by term, then age. */
5107 while( i-- ){
5108 leavesReaderReorder(pReaders+i, *piReaders-i);
5109 }
5110 return SQLITE_OK;
5111}
5112
5113/* Merge doclists from pReaders[nReaders] into a single doclist, which
5114** is written to pWriter. Assumes pReaders is ordered oldest to
5115** newest.
5116*/
5117/* TODO(shess) Consider putting this inline in segmentMerge(). */
5118static int leavesReadersMerge(fulltext_vtab *v,
5119 LeavesReader *pReaders, int nReaders,
5120 LeafWriter *pWriter){
5121 DLReader dlReaders[MERGE_COUNT];
5122 const char *pTerm = leavesReaderTerm(pReaders);
5123 int i, nTerm = leavesReaderTermBytes(pReaders);
5124
5125 assert( nReaders<=MERGE_COUNT );
5126
5127 for(i=0; i<nReaders; i++){
5128 dlrInit(&dlReaders[i], DL_DEFAULT,
5129 leavesReaderData(pReaders+i),
5130 leavesReaderDataBytes(pReaders+i));
5131 }
5132
5133 return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
5134}
5135
5136/* Forward ref due to mutual recursion with segdirNextIndex(). */
5137static int segmentMerge(fulltext_vtab *v, int iLevel);
5138
5139/* Put the next available index at iLevel into *pidx. If iLevel
5140** already has MERGE_COUNT segments, they are merged to a higher
5141** level to make room.
5142*/
5143static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
5144 int rc = segdir_max_index(v, iLevel, pidx);
5145 if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
5146 *pidx = 0;
5147 }else if( rc==SQLITE_ROW ){
5148 if( *pidx==(MERGE_COUNT-1) ){
5149 rc = segmentMerge(v, iLevel);
5150 if( rc!=SQLITE_OK ) return rc;
5151 *pidx = 0;
5152 }else{
5153 (*pidx)++;
5154 }
5155 }else{
5156 return rc;
5157 }
5158 return SQLITE_OK;
5159}
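
A worked example (assuming MERGE_COUNT==16, its value earlier in this file):
the first 16 requests at level 0 hand out indexes 0 through 15; on the 17th,
segdir_max_index() reports 15==MERGE_COUNT-1, so the 16 existing segments
are merged into one level-1 segment and index 0 is handed out again. The
rule applies recursively, so a level-L merge happens roughly every 16^(L+1)
level-0 flushes.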
5160
5161/* Merge MERGE_COUNT segments at iLevel into a new segment at
5162** iLevel+1. If iLevel+1 is already full of segments, those will be
5163** merged to make room.
5164*/
5165static int segmentMerge(fulltext_vtab *v, int iLevel){
5166 LeafWriter writer;
5167 LeavesReader lrs[MERGE_COUNT];
5168 int i, rc, idx = 0;
5169
5170 /* Determine the next available segment index at the next level,
5171 ** merging as necessary.
5172 */
5173 rc = segdirNextIndex(v, iLevel+1, &idx);
5174 if( rc!=SQLITE_OK ) return rc;
5175
5176 /* TODO(shess) This assumes that we'll always see exactly
5177 ** MERGE_COUNT segments to merge at a given level. That will be
5178 ** broken if we allow the developer to request preemptive or
5179 ** deferred merging.
5180 */
5181 memset(&lrs, '\0', sizeof(lrs));
5182 rc = leavesReadersInit(v, iLevel, lrs, &i);
5183 if( rc!=SQLITE_OK ) return rc;
5184 assert( i==MERGE_COUNT );
5185
5186 leafWriterInit(iLevel+1, idx, &writer);
5187
5188 /* Since leavesReaderReorder() pushes readers at eof to the end,
5189 ** when the first reader is empty, all will be empty.
5190 */
5191 while( !leavesReaderAtEnd(lrs) ){
5192 /* Figure out how many readers share their next term. */
5193 for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
5194 if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
5195 }
5196
5197 rc = leavesReadersMerge(v, lrs, i, &writer);
5198 if( rc!=SQLITE_OK ) goto err;
5199
5200 /* Step forward those that were merged. */
5201 while( i-->0 ){
5202 rc = leavesReaderStep(v, lrs+i);
5203 if( rc!=SQLITE_OK ) goto err;
5204
5205 /* Reorder by term, then by age. */
5206 leavesReaderReorder(lrs+i, MERGE_COUNT-i);
5207 }
5208 }
5209
5210 for(i=0; i<MERGE_COUNT; i++){
5211 leavesReaderDestroy(&lrs[i]);
5212 }
5213
5214 rc = leafWriterFinalize(v, &writer);
5215 leafWriterDestroy(&writer);
5216 if( rc!=SQLITE_OK ) return rc;
5217
5218 /* Delete the merged segment data. */
5219 return segdir_delete(v, iLevel);
5220
5221 err:
5222 for(i=0; i<MERGE_COUNT; i++){
5223 leavesReaderDestroy(&lrs[i]);
5224 }
5225 leafWriterDestroy(&writer);
5226 return rc;
5227}
5228
5229/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
5230** *out (any doclists with duplicate docids overwrite those in *out).
5231** Internal function for loadSegmentLeaf().
5232*/
5233static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
5234 const char *pTerm, int nTerm, int isPrefix,
5235 DataBuffer *out){
5236 assert( nTerm>0 );
5237
5238 /* Process while the prefix matches. */
5239 while( !leavesReaderAtEnd(pReader) ){
5240 /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
5241 ** already taken to compare the terms of two LeavesReaders. Think
5242 ** on a better name. [Meanwhile, break encapsulation rather than
5243 ** use a confusing name.]
5244 */
5245 int rc;
5246 int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
5247 if( c==0 ){
5248 const char *pData = leavesReaderData(pReader);
5249 int nData = leavesReaderDataBytes(pReader);
5250 if( out->nData==0 ){
5251 dataBufferReplace(out, pData, nData);
5252 }else{
5253 DataBuffer result;
5254 dataBufferInit(&result, out->nData+nData);
5255 docListUnion(out->pData, out->nData, pData, nData, &result);
5256 dataBufferDestroy(out);
5257 *out = result;
5258 /* TODO(shess) Rather than destroy out, we could retain it for
5259 ** later reuse.
5260 */
5261 }
5262 }
5263 if( c>0 ) break; /* Past any possible matches. */
5264
5265 rc = leavesReaderStep(v, pReader);
5266 if( rc!=SQLITE_OK ) return rc;
5267 }
5268 return SQLITE_OK;
5269}
5270
5271/* Call loadSegmentLeavesInt() with pData/nData as input. */
5272static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
5273 const char *pTerm, int nTerm, int isPrefix,
5274 DataBuffer *out){
5275 LeavesReader reader;
5276 int rc;
5277
5278 assert( nData>1 );
5279 assert( *pData=='\0' );
5280 rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
5281 if( rc!=SQLITE_OK ) return rc;
5282
5283 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5284 leavesReaderReset(&reader);
5285 leavesReaderDestroy(&reader);
5286 return rc;
5287}
5288
5289/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
5290** iEndLeaf (inclusive) as input, and merge the resulting doclist into
5291** out.
5292*/
5293static int loadSegmentLeaves(fulltext_vtab *v,
5294 sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
5295 const char *pTerm, int nTerm, int isPrefix,
5296 DataBuffer *out){
5297 int rc;
5298 LeavesReader reader;
5299
5300 assert( iStartLeaf<=iEndLeaf );
5301 rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
5302 if( rc!=SQLITE_OK ) return rc;
5303
5304 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5305 leavesReaderReset(&reader);
5306 leavesReaderDestroy(&reader);
5307 return rc;
5308}
5309
5310/* Taking pData/nData as an interior node, find the sequence of child
5311** nodes which could include pTerm/nTerm/isPrefix. Note that the
5312** interior node terms logically come between the blocks, so there is
5313** one more blockid than there are terms (that block contains terms >=
5314** the last interior-node term).
5315*/
5316/* TODO(shess) The calling code may already know that the end child is
5317** not worth calculating, because the end may be in a later sibling
5318** node. Consider whether breaking symmetry is worthwhile. I suspect
5319** it's not worthwhile.
5320*/
5321static void getChildrenContaining(const char *pData, int nData,
5322 const char *pTerm, int nTerm, int isPrefix,
5323 sqlite_int64 *piStartChild,
5324 sqlite_int64 *piEndChild){
5325 InteriorReader reader;
5326
5327 assert( nData>1 );
5328 assert( *pData!='\0' );
5329 interiorReaderInit(pData, nData, &reader);
5330
5331 /* Scan for the first child which could contain pTerm/nTerm. */
5332 while( !interiorReaderAtEnd(&reader) ){
5333 if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
5334 interiorReaderStep(&reader);
5335 }
5336 *piStartChild = interiorReaderCurrentBlockid(&reader);
5337
5338 /* Keep scanning to find a term greater than our term, using prefix
5339 ** comparison if indicated. If isPrefix is false, this will be the
5340 ** same blockid as the starting block.
5341 */
5342 while( !interiorReaderAtEnd(&reader) ){
5343 if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
5344 interiorReaderStep(&reader);
5345 }
5346 *piEndChild = interiorReaderCurrentBlockid(&reader);
5347
5348 interiorReaderDestroy(&reader);
5349
5350 /* Children must ascend, and if !prefix, both must be the same. */
5351 assert( *piEndChild>=*piStartChild );
5352 assert( isPrefix || *piStartChild==*piEndChild );
5353}
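
A worked example (illustrative): suppose an interior node holds the terms
("cat", "dog") separating child blocks b0, b1, b2, so terms below "cat"
live in b0, terms in ["cat","dog") in b1, and terms from "dog" upward in
b2. For pTerm=="d" with isPrefix set, the first scan stops at "dog" (the
first term greater than "d"), so *piStartChild==b1; the second scan uses
prefix comparison, steps past "dog" (it begins with "d"), and yields
*piEndChild==b2. Terms starting with "d" may therefore span b1 and b2.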
5354
5355/* Read block at iBlockid and pass it with other params to
5356** getChildrenContaining().
5357*/
5358static int loadAndGetChildrenContaining(
5359 fulltext_vtab *v,
5360 sqlite_int64 iBlockid,
5361 const char *pTerm, int nTerm, int isPrefix,
5362 sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
5363){
5364 sqlite3_stmt *s = NULL;
5365 int rc;
5366
5367 assert( iBlockid!=0 );
5368 assert( pTerm!=NULL );
5369 assert( nTerm!=0 ); /* TODO(shess) Why not allow this? */
5370 assert( piStartChild!=NULL );
5371 assert( piEndChild!=NULL );
5372
5373 rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
5374 if( rc!=SQLITE_OK ) return rc;
5375
5376 rc = sqlite3_bind_int64(s, 1, iBlockid);
5377 if( rc!=SQLITE_OK ) return rc;
5378
5379 rc = sqlite3_step(s);
5380 if( rc==SQLITE_DONE ) return SQLITE_ERROR;
5381 if( rc!=SQLITE_ROW ) return rc;
5382
5383 getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
5384 pTerm, nTerm, isPrefix, piStartChild, piEndChild);
5385
5386 /* We expect only one row. We must execute another sqlite3_step()
5387 * to complete the iteration; otherwise the table will remain
5388 * locked. */
5389 rc = sqlite3_step(s);
5390 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
5391 if( rc!=SQLITE_DONE ) return rc;
5392
5393 return SQLITE_OK;
5394}
5395
5396/* Traverse the tree represented by pData[nData] looking for
5397** pTerm[nTerm], placing its doclist into *out. This is internal to
5398** loadSegment() to make error-handling cleaner.
5399*/
5400static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
5401 sqlite_int64 iLeavesEnd,
5402 const char *pTerm, int nTerm, int isPrefix,
5403 DataBuffer *out){
5404 /* Special case where root is a leaf. */
5405 if( *pData=='\0' ){
5406 return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
5407 }else{
5408 int rc;
5409 sqlite_int64 iStartChild, iEndChild;
5410
5411 /* Process pData as an interior node, then loop down the tree
5412 ** until we find the set of leaf nodes to scan for the term.
5413 */
5414 getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
5415 &iStartChild, &iEndChild);
5416 while( iStartChild>iLeavesEnd ){
5417 sqlite_int64 iNextStart, iNextEnd;
5418 rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
5419 &iNextStart, &iNextEnd);
5420 if( rc!=SQLITE_OK ) return rc;
5421
5422 /* If we've branched, follow the end branch, too. */
5423 if( iStartChild!=iEndChild ){
5424 sqlite_int64 iDummy;
5425 rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
5426 &iDummy, &iNextEnd);
5427 if( rc!=SQLITE_OK ) return rc;
5428 }
5429
5430 assert( iNextStart<=iNextEnd );
5431 iStartChild = iNextStart;
5432 iEndChild = iNextEnd;
5433 }
5434 assert( iStartChild<=iLeavesEnd );
5435 assert( iEndChild<=iLeavesEnd );
5436
5437 /* Scan through the leaf segments for doclists. */
5438 return loadSegmentLeaves(v, iStartChild, iEndChild,
5439 pTerm, nTerm, isPrefix, out);
5440 }
5441}
5442
5443/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
5444** merge its doclist over *out (any duplicate doclists read from the
5445** segment rooted at pData will overwrite those in *out).
5446*/
5447/* TODO(shess) Consider changing this to determine the depth of the
5448** leaves using either the first characters of interior nodes (when
5449** ==1, we're one level above the leaves), or the first character of
5450** the root (which will describe the height of the tree directly).
5451** Either feels somewhat tricky to me.
5452*/
5453/* TODO(shess) The current merge is likely to be slow for large
5454** doclists (though it should process from newest/smallest to
5455** oldest/largest, so it may not be that bad). It might be useful to
5456** modify things to allow for N-way merging. This could either be
5457** within a segment, with pairwise merges across segments, or across
5458** all segments at once.
5459*/
5460static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
5461 sqlite_int64 iLeavesEnd,
5462 const char *pTerm, int nTerm, int isPrefix,
5463 DataBuffer *out){
5464 DataBuffer result;
5465 int rc;
5466
5467 assert( nData>1 );
5468
5469 /* This code should never be called with buffered updates. */
5470 assert( v->nPendingData<0 );
5471
5472 dataBufferInit(&result, 0);
5473 rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
5474 pTerm, nTerm, isPrefix, &result);
5475 if( rc==SQLITE_OK && result.nData>0 ){
5476 if( out->nData==0 ){
5477 DataBuffer tmp = *out;
5478 *out = result;
5479 result = tmp;
5480 }else{
5481 DataBuffer merged;
5482 DLReader readers[2];
5483
5484 dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
5485 dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
5486 dataBufferInit(&merged, out->nData+result.nData);
5487 docListMerge(&merged, readers, 2);
5488 dataBufferDestroy(out);
5489 *out = merged;
5490 dlrDestroy(&readers[0]);
5491 dlrDestroy(&readers[1]);
5492 }
5493 }
5494 dataBufferDestroy(&result);
5495 return rc;
5496}
5497
5498/* Scan the database and merge together the posting lists for the term
5499** into *out.
5500*/
5501static int termSelect(fulltext_vtab *v, int iColumn,
5502 const char *pTerm, int nTerm, int isPrefix,
5503 DocListType iType, DataBuffer *out){
5504 DataBuffer doclist;
5505 sqlite3_stmt *s;
5506 int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
5507 if( rc!=SQLITE_OK ) return rc;
5508
5509 /* This code should never be called with buffered updates. */
5510 assert( v->nPendingData<0 );
5511
5512 dataBufferInit(&doclist, 0);
5513
5514 /* Traverse the segments from oldest to newest so that newer doclist
5515 ** elements for given docids overwrite older elements.
5516 */
5517 while( (rc = sqlite3_step(s))==SQLITE_ROW ){
5518 const char *pData = sqlite3_column_blob(s, 0);
5519 const int nData = sqlite3_column_bytes(s, 0);
5520 const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
5521 rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
5522 &doclist);
5523 if( rc!=SQLITE_OK ) goto err;
5524 }
5525 if( rc==SQLITE_DONE ){
5526 if( doclist.nData!=0 ){
5527 /* TODO(shess) The old term_select_all() code applied the column
5528 ** restrict as we merged segments, leading to smaller buffers.
5529 ** This is probably worthwhile to bring back, once the new storage
5530 ** system is checked in.
5531 */
5532 if( iColumn==v->nColumn) iColumn = -1;
5533 docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
5534 iColumn, iType, out);
5535 }
5536 rc = SQLITE_OK;
5537 }
5538
5539 err:
5540 dataBufferDestroy(&doclist);
5541 return rc;
5542}
5543
5544/****************************************************************/
5545/* Used to hold hashtable data for sorting. */
5546typedef struct TermData {
5547 const char *pTerm;
5548 int nTerm;
5549 DLCollector *pCollector;
5550} TermData;
5551
5552/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
5553** for equal, >0 for greater-than).
5554*/
5555static int termDataCmp(const void *av, const void *bv){
5556 const TermData *a = (const TermData *)av;
5557 const TermData *b = (const TermData *)bv;
5558 int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
5559 int c = memcmp(a->pTerm, b->pTerm, n);
5560 if( c!=0 ) return c;
5561 return a->nTerm-b->nTerm;
5562}
5563
5564/* Order pTerms data by term, then write a new level 0 segment using
5565** LeafWriter.
5566*/
5567static int writeZeroSegment(fulltext_vtab *v, fts3Hash *pTerms){
5568 fts3HashElem *e;
5569 int idx, rc, i, n;
5570 TermData *pData;
5571 LeafWriter writer;
5572 DataBuffer dl;
5573
5574 /* Determine the next index at level 0, merging as necessary. */
5575 rc = segdirNextIndex(v, 0, &idx);
5576 if( rc!=SQLITE_OK ) return rc;
5577
5578 n = fts3HashCount(pTerms);
5579 pData = malloc(n*sizeof(TermData));
5580 if( pData==NULL ) return SQLITE_NOMEM; /* malloc() may fail */
5581 for(i = 0, e = fts3HashFirst(pTerms); e; i++, e = fts3HashNext(e)){
5582 assert( i<n );
5583 pData[i].pTerm = fts3HashKey(e);
5584 pData[i].nTerm = fts3HashKeysize(e);
5585 pData[i].pCollector = fts3HashData(e);
5586 }
5587 assert( i==n );
5588
5589 /* TODO(shess) Should we allow user-defined collation sequences,
5590 ** here? I think we only need that once we support prefix searches.
5591 */
5592 if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
5593
5594 /* TODO(shess) Refactor so that we can write directly to the segment
5595 ** DataBuffer, as happens for segment merges.
5596 */
5597 leafWriterInit(0, idx, &writer);
5598 dataBufferInit(&dl, 0);
5599 for(i=0; i<n; i++){
5600 dataBufferReset(&dl);
5601 dlcAddDoclist(pData[i].pCollector, &dl);
5602 rc = leafWriterStep(v, &writer,
5603 pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
5604 if( rc!=SQLITE_OK ) goto err;
5605 }
5606 rc = leafWriterFinalize(v, &writer);
5607
5608 err:
5609 dataBufferDestroy(&dl);
5610 free(pData);
5611 leafWriterDestroy(&writer);
5612 return rc;
5613}
5614
5615/* If pendingTerms has data, free it. */
5616static int clearPendingTerms(fulltext_vtab *v){
5617 if( v->nPendingData>=0 ){
5618 fts3HashElem *e;
5619 for(e=fts3HashFirst(&v->pendingTerms); e; e=fts3HashNext(e)){
5620 dlcDelete(fts3HashData(e));
5621 }
5622 fts3HashClear(&v->pendingTerms);
5623 v->nPendingData = -1;
5624 }
5625 return SQLITE_OK;
5626}
5627
5628/* If pendingTerms has data, flush it to a level-zero segment, and
5629** free it.
5630*/
5631static int flushPendingTerms(fulltext_vtab *v){
5632 if( v->nPendingData>=0 ){
5633 int rc = writeZeroSegment(v, &v->pendingTerms);
5634 if( rc==SQLITE_OK ) clearPendingTerms(v);
5635 return rc;
5636 }
5637 return SQLITE_OK;
5638}
5639
5640/* If pendingTerms is "too big", or docid is out of order, flush it.
5641** Regardless, be certain that pendingTerms is initialized for use.
5642*/
5643static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
5644 /* TODO(shess) Explore whether partially flushing the buffer on
5645 ** forced-flush would provide better performance. I suspect that if
5646 ** we ordered the doclists by size and flushed the largest until the
5647 ** buffer was half empty, that would let the less frequent terms
5648 ** generate longer doclists.
5649 */
5650 if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
5651 int rc = flushPendingTerms(v);
5652 if( rc!=SQLITE_OK ) return rc;
5653 }
5654 if( v->nPendingData<0 ){
5655 fts3HashInit(&v->pendingTerms, FTS3_HASH_STRING, 1);
5656 v->nPendingData = 0;
5657 }
5658 v->iPrevDocid = iDocid;
5659 return SQLITE_OK;
5660}
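
For example (illustrative): inserting docids 3, 7, 5 in that order forces a
flush when 5 arrives, because 5<=7 and appending it would put the buffered
doclists out of docid order.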
5661
5662/* This function implements the xUpdate callback; it's the top-level entry
5663 * point for inserting, deleting or updating a row in a full-text table. */
5664static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
5665 sqlite_int64 *pRowid){
5666 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5667 int rc;
5668
5669 TRACE(("FTS3 Update %p\n", pVtab));
5670
5671 if( nArg<2 ){
5672 rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
5673 } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
5674 /* An update:
5675 * ppArg[0] = old rowid
5676 * ppArg[1] = new rowid
5677 * ppArg[2..2+v->nColumn-1] = values
5678 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
5679 * ppArg[2+v->nColumn+1] = value for docid
5680 */
5681 sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
5682 if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
5683 sqlite3_value_int64(ppArg[1]) != rowid ){
5684 rc = SQLITE_ERROR; /* we don't allow changing the rowid */
5685 }else if( sqlite3_value_type(ppArg[2+v->nColumn+1]) != SQLITE_INTEGER ||
5686 sqlite3_value_int64(ppArg[2+v->nColumn+1]) != rowid ){
5687 rc = SQLITE_ERROR; /* we don't allow changing the docid */
5688 }else{
5689 assert( nArg==2+v->nColumn+2);
5690 rc = index_update(v, rowid, &ppArg[2]);
5691 }
5692 } else {
5693 /* An insert:
5694 * ppArg[1] = requested rowid
5695 * ppArg[2..2+v->nColumn-1] = values
5696 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
5697 * ppArg[2+v->nColumn+1] = value for docid
5698 */
5699 sqlite3_value *pRequestDocid = ppArg[2+v->nColumn+1];
5700 assert( nArg==2+v->nColumn+2);
5701 if( SQLITE_NULL != sqlite3_value_type(pRequestDocid) &&
5702 SQLITE_NULL != sqlite3_value_type(ppArg[1]) ){
5703 /* TODO(shess) Consider allowing this to work if the values are
5704 ** identical. I'm inclined to discourage that usage, though,
5705 ** given that both rowid and docid are special columns. Better
5706 ** would be to define one or the other as the default winner,
5707 ** but should it be fts3-centric (docid) or SQLite-centric
5708 ** (rowid)?
5709 */
5710 rc = SQLITE_ERROR;
5711 }else{
5712 if( SQLITE_NULL == sqlite3_value_type(pRequestDocid) ){
5713 pRequestDocid = ppArg[1];
5714 }
5715 rc = index_insert(v, pRequestDocid, &ppArg[2], pRowid);
5716 }
5717 }
5718
5719 return rc;
5720}
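
To make the argument layout concrete, ordinary SQL maps onto the three
branches above as follows (table name illustrative):

      INSERT INTO t(content) VALUES('hello');      -- ppArg[0] IS NULL
      UPDATE t SET content = 'bye' WHERE rowid=1;  -- ppArg[0] = old rowid
      DELETE FROM t WHERE rowid = 1;               -- nArg==1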
5721
5722static int fulltextSync(sqlite3_vtab *pVtab){
5723 TRACE(("FTS3 xSync()\n"));
5724 return flushPendingTerms((fulltext_vtab *)pVtab);
5725}
5726
5727static int fulltextBegin(sqlite3_vtab *pVtab){
5728 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5729 TRACE(("FTS3 xBegin()\n"));
5730
5731 /* Any buffered updates should have been cleared by the previous
5732 ** transaction.
5733 */
5734 assert( v->nPendingData<0 );
5735 return clearPendingTerms(v);
5736}
5737
5738static int fulltextCommit(sqlite3_vtab *pVtab){
5739 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5740 TRACE(("FTS3 xCommit()\n"));
5741
5742 /* Buffered updates should have been cleared by fulltextSync(). */
5743 assert( v->nPendingData<0 );
5744 return clearPendingTerms(v);
5745}
5746
5747static int fulltextRollback(sqlite3_vtab *pVtab){
5748 TRACE(("FTS3 xRollback()\n"));
5749 return clearPendingTerms((fulltext_vtab *)pVtab);
5750}
5751
5752/*
5753** Implementation of the snippet() function for FTS3
5754*/
5755static void snippetFunc(
5756 sqlite3_context *pContext,
5757 int argc,
5758 sqlite3_value **argv
5759){
5760 fulltext_cursor *pCursor;
5761 if( argc<1 ) return;
5762 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5763 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5764    sqlite3_result_error(pContext, "illegal first argument to snippet",-1);
5765 }else{
5766 const char *zStart = "<b>";
5767 const char *zEnd = "</b>";
5768 const char *zEllipsis = "<b>...</b>";
5769 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5770 if( argc>=2 ){
5771 zStart = (const char*)sqlite3_value_text(argv[1]);
5772 if( argc>=3 ){
5773 zEnd = (const char*)sqlite3_value_text(argv[2]);
5774 if( argc>=4 ){
5775 zEllipsis = (const char*)sqlite3_value_text(argv[3]);
5776 }
5777 }
5778 }
5779 snippetAllOffsets(pCursor);
5780 snippetText(pCursor, zStart, zEnd, zEllipsis);
5781 sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
5782 pCursor->snippet.nSnippet, SQLITE_STATIC);
5783 }
5784}
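
Usage sketch (table and query illustrative); the optional arguments replace
the default "<b>", "</b>" and "<b>...</b>" markers:

      SELECT snippet(t) FROM t WHERE t MATCH 'sqlite';
      SELECT snippet(t, '[', ']', '...') FROM t WHERE t MATCH 'sqlite';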
5785
5786/*
5787** Implementation of the offsets() function for FTS3
5788*/
5789static void snippetOffsetsFunc(
5790 sqlite3_context *pContext,
5791 int argc,
5792 sqlite3_value **argv
5793){
5794 fulltext_cursor *pCursor;
5795 if( argc<1 ) return;
5796 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5797 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5798 sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
5799 }else{
5800 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5801 snippetAllOffsets(pCursor);
5802 snippetOffsetText(&pCursor->snippet);
5803 sqlite3_result_text(pContext,
5804 pCursor->snippet.zOffset, pCursor->snippet.nOffset,
5805 SQLITE_STATIC);
5806 }
5807}
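
Companion usage (illustrative): offsets() returns, for each match, four
space-separated integers (column number, query-term number, byte offset,
byte length) rather than marked-up text:

      SELECT offsets(t) FROM t WHERE t MATCH 'sqlite';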
5808
5809/*
5810** This routine implements the xFindFunction method for the FTS3
5811** virtual table.
5812*/
5813static int fulltextFindFunction(
5814 sqlite3_vtab *pVtab,
5815 int nArg,
5816 const char *zName,
5817 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
5818 void **ppArg
5819){
5820 if( strcmp(zName,"snippet")==0 ){
5821 *pxFunc = snippetFunc;
5822 return 1;
5823 }else if( strcmp(zName,"offsets")==0 ){
5824 *pxFunc = snippetOffsetsFunc;
5825 return 1;
5826 }
5827 return 0;
5828}
5829
5830/*
5831** Rename an fts3 table.
5832*/
5833static int fulltextRename(
5834 sqlite3_vtab *pVtab,
5835 const char *zName
5836){
5837 fulltext_vtab *p = (fulltext_vtab *)pVtab;
5838 int rc = SQLITE_NOMEM;
5839 char *zSql = sqlite3_mprintf(
5840 "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
5841 "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
5842 "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';"
5843 , p->zDb, p->zName, zName
5844 , p->zDb, p->zName, zName
5845 , p->zDb, p->zName, zName
5846 );
5847 if( zSql ){
5848 rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
5849 sqlite3_free(zSql);
5850 }
5851 return rc;
5852}
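
This is reached through ALTER TABLE (illustrative), which must carry the
three shadow tables along with the virtual table itself:

      ALTER TABLE t RENAME TO t2;
      -- also renames t_content, t_segments and t_segdir to t2_*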
5853
5854static const sqlite3_module fts3Module = {
5855 /* iVersion */ 0,
5856 /* xCreate */ fulltextCreate,
5857 /* xConnect */ fulltextConnect,
5858 /* xBestIndex */ fulltextBestIndex,
5859 /* xDisconnect */ fulltextDisconnect,
5860 /* xDestroy */ fulltextDestroy,
5861 /* xOpen */ fulltextOpen,
5862 /* xClose */ fulltextClose,
5863 /* xFilter */ fulltextFilter,
5864 /* xNext */ fulltextNext,
5865 /* xEof */ fulltextEof,
5866 /* xColumn */ fulltextColumn,
5867 /* xRowid */ fulltextRowid,
5868 /* xUpdate */ fulltextUpdate,
5869 /* xBegin */ fulltextBegin,
5870 /* xSync */ fulltextSync,
5871 /* xCommit */ fulltextCommit,
5872 /* xRollback */ fulltextRollback,
5873 /* xFindFunction */ fulltextFindFunction,
5874 /* xRename */ fulltextRename,
5875};
5876
5877static void hashDestroy(void *p){
5878 fts3Hash *pHash = (fts3Hash *)p;
5879 sqlite3Fts3HashClear(pHash);
5880 sqlite3_free(pHash);
5881}
5882
5883/*
5884** The fts3 built-in tokenizers - "simple" and "porter" - are implemented
5885** in files fts3_tokenizer1.c and fts3_porter.c respectively (the ICU
5886** tokenizer is implemented in fts3_icu.c). The forward declarations
5887** below are for functions used to retrieve the respective implementations.
5888**
5889** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed
5890** to by the argument to point at the "simple" tokenizer implementation.
5891** Function ...PorterTokenizerModule() sets *pModule to point to the
5892** porter tokenizer/stemmer implementation.
5893*/
5894void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5895void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5896void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5897
5898int sqlite3Fts3InitHashTable(sqlite3 *, fts3Hash *, const char *);
5899
5900/*
5901** Initialise the fts3 extension. If this extension is built as part
5902** of the sqlite library, then this function is called directly by
5903** SQLite. If fts3 is built as a dynamically loadable extension, this
5904** function is called by the sqlite3_extension_init() entry point.
5905*/
5906int sqlite3Fts3Init(sqlite3 *db){
5907 int rc = SQLITE_OK;
5908 fts3Hash *pHash = 0;
5909 const sqlite3_tokenizer_module *pSimple = 0;
5910 const sqlite3_tokenizer_module *pPorter = 0;
5911 const sqlite3_tokenizer_module *pIcu = 0;
5912
5913 sqlite3Fts3SimpleTokenizerModule(&pSimple);
5914 sqlite3Fts3PorterTokenizerModule(&pPorter);
5915#ifdef SQLITE_ENABLE_ICU
5916 sqlite3Fts3IcuTokenizerModule(&pIcu);
5917#endif
5918
5919 /* Allocate and initialise the hash-table used to store tokenizers. */
5920 pHash = sqlite3_malloc(sizeof(fts3Hash));
5921 if( !pHash ){
5922 rc = SQLITE_NOMEM;
5923 }else{
5924 sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
5925 }
5926
5927 /* Load the built-in tokenizers into the hash table */
5928 if( rc==SQLITE_OK ){
5929 if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
5930 || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
5931 || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
5932 ){
5933 rc = SQLITE_NOMEM;
5934 }
5935 }
5936
5937 /* Create the virtual table wrapper around the hash-table and overload
5938 ** the two scalar functions. If this is successful, register the
5939 ** module with sqlite.
5940 */
5941 if( SQLITE_OK==rc
5942 && SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer"))
5943 && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
5944 && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
5945 ){
5946 return sqlite3_create_module_v2(
5947 db, "fts3", &fts3Module, (void *)pHash, hashDestroy
5948 );
5949 }
5950
5951  /* An error has occurred. Delete the hash table and return the error code. */
5952 assert( rc!=SQLITE_OK );
5953 if( pHash ){
5954 sqlite3Fts3HashClear(pHash);
5955 sqlite3_free(pHash);
5956 }
5957 return rc;
5958}
5959
5960#if !SQLITE_CORE
5961int sqlite3_extension_init(
5962 sqlite3 *db,
5963 char **pzErrMsg,
5964 const sqlite3_api_routines *pApi
5965){
5966 SQLITE_EXTENSION_INIT2(pApi)
5967 return sqlite3Fts3Init(db);
5968}
5969#endif
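
When built as a loadable extension, the library can then be attached at
runtime; a sketch, assuming load_extension() has been enabled and the
shared library is named libfts3.so (both assumptions):

      SELECT load_extension('./libfts3.so');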
5970
5971#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h
new file mode 100644
index 0000000..c1aa8ca
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3.h
@@ -0,0 +1,26 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This header file is used by programs that want to link against the
14** FTS3 library. All it does is declare the sqlite3Fts3Init() interface.
15*/
16#include "sqlite3.h"
17
18#ifdef __cplusplus
19extern "C" {
20#endif /* __cplusplus */
21
22int sqlite3Fts3Init(sqlite3 *db);
23
24#ifdef __cplusplus
25} /* extern "C" */
26#endif /* __cplusplus */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c
new file mode 100644
index 0000000..b14511a
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.c
@@ -0,0 +1,373 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the implementation of generic hash-tables used in SQLite.
13** We've modified it slightly to serve as a standalone hash table
14** implementation for the full-text indexing module.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS3 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS3 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
27
28#include <assert.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include "fts3_hash.h"
33
34/*
35** Malloc and Free functions
36*/
37static void *fts3HashMalloc(int n){
38 void *p = sqlite3_malloc(n);
39 if( p ){
40 memset(p, 0, n);
41 }
42 return p;
43}
44static void fts3HashFree(void *p){
45 sqlite3_free(p);
46}
47
48/* Turn bulk memory into a hash table object by initializing the
49** fields of the Hash structure.
50**
51** "pNew" is a pointer to the hash table that is to be initialized.
52** keyClass is one of the constants
53** FTS3_HASH_BINARY or FTS3_HASH_STRING. The value of keyClass
54** determines what kind of key the hash table will use. "copyKey" is
55** true if the hash table should make its own private copy of keys and
56** false if it should just use the supplied pointer.
57*/
58void sqlite3Fts3HashInit(fts3Hash *pNew, int keyClass, int copyKey){
59 assert( pNew!=0 );
60 assert( keyClass>=FTS3_HASH_STRING && keyClass<=FTS3_HASH_BINARY );
61 pNew->keyClass = keyClass;
62 pNew->copyKey = copyKey;
63 pNew->first = 0;
64 pNew->count = 0;
65 pNew->htsize = 0;
66 pNew->ht = 0;
67}
68
69/* Remove all entries from a hash table. Reclaim all memory.
70** Call this routine to delete a hash table or to reset a hash table
71** to the empty state.
72*/
73void sqlite3Fts3HashClear(fts3Hash *pH){
74 fts3HashElem *elem; /* For looping over all elements of the table */
75
76 assert( pH!=0 );
77 elem = pH->first;
78 pH->first = 0;
79 fts3HashFree(pH->ht);
80 pH->ht = 0;
81 pH->htsize = 0;
82 while( elem ){
83 fts3HashElem *next_elem = elem->next;
84 if( pH->copyKey && elem->pKey ){
85 fts3HashFree(elem->pKey);
86 }
87 fts3HashFree(elem);
88 elem = next_elem;
89 }
90 pH->count = 0;
91}
92
93/*
94** Hash and comparison functions when the mode is FTS3_HASH_STRING
95*/
96static int fts3StrHash(const void *pKey, int nKey){
97 const char *z = (const char *)pKey;
98 int h = 0;
99 if( nKey<=0 ) nKey = (int) strlen(z);
100 while( nKey > 0 ){
101 h = (h<<3) ^ h ^ *z++;
102 nKey--;
103 }
104 return h & 0x7fffffff;
105}
106static int fts3StrCompare(const void *pKey1, int n1, const void *pKey2, int n2){
107 if( n1!=n2 ) return 1;
108 return strncmp((const char*)pKey1,(const char*)pKey2,n1);
109}
110
111/*
112** Hash and comparison functions when the mode is FTS3_HASH_BINARY
113*/
114static int fts3BinHash(const void *pKey, int nKey){
115 int h = 0;
116 const char *z = (const char *)pKey;
117 while( nKey-- > 0 ){
118 h = (h<<3) ^ h ^ *(z++);
119 }
120 return h & 0x7fffffff;
121}
122static int fts3BinCompare(const void *pKey1, int n1, const void *pKey2, int n2){
123 if( n1!=n2 ) return 1;
124 return memcmp(pKey1,pKey2,n1);
125}
126
127/*
128** Return a pointer to the appropriate hash function given the key class.
129**
130** The C syntax in this function definition may be unfamiliar to some
131** programmers, so we provide the following additional explanation:
132**
133** The name of the function is "hashFunction". The function takes a
134** single parameter "keyClass". The return value of hashFunction()
135** is a pointer to another function. Specifically, the return value
136** of hashFunction() is a pointer to a function that takes two parameters
137** with types "const void*" and "int" and returns an "int".
138*/
139static int (*hashFunction(int keyClass))(const void*,int){
140 if( keyClass==FTS3_HASH_STRING ){
141 return &fts3StrHash;
142 }else{
143 assert( keyClass==FTS3_HASH_BINARY );
144 return &fts3BinHash;
145 }
146}
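
The same declaration reads more easily with a typedef; this equivalent
form is illustrative only and does not appear in the source:

      typedef int (*fts3HashFuncPtr)(const void*, int);
      static fts3HashFuncPtr hashFunction(int keyClass);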
147
148/*
149** Return a pointer to the appropriate hash function given the key class.
150**
151** For help in interpreting the obscure C code in the function definition,
152** see the header comment on the previous function.
153*/
154static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
155 if( keyClass==FTS3_HASH_STRING ){
156 return &fts3StrCompare;
157 }else{
158 assert( keyClass==FTS3_HASH_BINARY );
159 return &fts3BinCompare;
160 }
161}
162
163/* Link an element into the hash table
164*/
165static void fts3HashInsertElement(
166 fts3Hash *pH, /* The complete hash table */
167 struct _fts3ht *pEntry, /* The entry into which pNew is inserted */
168 fts3HashElem *pNew /* The element to be inserted */
169){
170 fts3HashElem *pHead; /* First element already in pEntry */
171 pHead = pEntry->chain;
172 if( pHead ){
173 pNew->next = pHead;
174 pNew->prev = pHead->prev;
175 if( pHead->prev ){ pHead->prev->next = pNew; }
176 else { pH->first = pNew; }
177 pHead->prev = pNew;
178 }else{
179 pNew->next = pH->first;
180 if( pH->first ){ pH->first->prev = pNew; }
181 pNew->prev = 0;
182 pH->first = pNew;
183 }
184 pEntry->count++;
185 pEntry->chain = pNew;
186}
187
188
189/* Resize the hash table so that it contains "new_size" buckets.
190** "new_size" must be a power of 2.  The hash table might fail
191** to resize if the memory allocation fails.
192*/
193static void fts3Rehash(fts3Hash *pH, int new_size){
194 struct _fts3ht *new_ht; /* The new hash table */
195 fts3HashElem *elem, *next_elem; /* For looping over existing elements */
196 int (*xHash)(const void*,int); /* The hash function */
197
198 assert( (new_size & (new_size-1))==0 );
199 new_ht = (struct _fts3ht *)fts3HashMalloc( new_size*sizeof(struct _fts3ht) );
200 if( new_ht==0 ) return;
201 fts3HashFree(pH->ht);
202 pH->ht = new_ht;
203 pH->htsize = new_size;
204 xHash = hashFunction(pH->keyClass);
205 for(elem=pH->first, pH->first=0; elem; elem = next_elem){
206 int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
207 next_elem = elem->next;
208 fts3HashInsertElement(pH, &new_ht[h], elem);
209 }
210}
211
212/* This function (for internal use only) locates an element in a
213** hash table that matches the given key. The hash for this key has
214** already been computed and is passed as the 4th parameter.
215*/
216static fts3HashElem *fts3FindElementByHash(
217 const fts3Hash *pH, /* The pH to be searched */
218 const void *pKey, /* The key we are searching for */
219 int nKey,
220 int h /* The hash for this key. */
221){
222 fts3HashElem *elem; /* Used to loop thru the element list */
223 int count; /* Number of elements left to test */
224 int (*xCompare)(const void*,int,const void*,int); /* comparison function */
225
226 if( pH->ht ){
227 struct _fts3ht *pEntry = &pH->ht[h];
228 elem = pEntry->chain;
229 count = pEntry->count;
230 xCompare = compareFunction(pH->keyClass);
231 while( count-- && elem ){
232 if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
233 return elem;
234 }
235 elem = elem->next;
236 }
237 }
238 return 0;
239}
240
241/* Remove a single entry from the hash table given a pointer to that
242** element and a hash on the element's key.
243*/
244static void fts3RemoveElementByHash(
245 fts3Hash *pH, /* The pH containing "elem" */
246 fts3HashElem* elem, /* The element to be removed from the pH */
247 int h /* Hash value for the element */
248){
249 struct _fts3ht *pEntry;
250 if( elem->prev ){
251 elem->prev->next = elem->next;
252 }else{
253 pH->first = elem->next;
254 }
255 if( elem->next ){
256 elem->next->prev = elem->prev;
257 }
258 pEntry = &pH->ht[h];
259 if( pEntry->chain==elem ){
260 pEntry->chain = elem->next;
261 }
262 pEntry->count--;
263 if( pEntry->count<=0 ){
264 pEntry->chain = 0;
265 }
266 if( pH->copyKey && elem->pKey ){
267 fts3HashFree(elem->pKey);
268 }
269 fts3HashFree( elem );
270 pH->count--;
271 if( pH->count<=0 ){
272 assert( pH->first==0 );
273 assert( pH->count==0 );
274 fts3HashClear(pH);
275 }
276}
277
278/* Attempt to locate an element of the hash table pH with a key
279** that matches pKey,nKey. Return the data for this element if it is
280** found, or NULL if there is no match.
281*/
282void *sqlite3Fts3HashFind(const fts3Hash *pH, const void *pKey, int nKey){
283 int h; /* A hash on key */
284 fts3HashElem *elem; /* The element that matches key */
285 int (*xHash)(const void*,int); /* The hash function */
286
287 if( pH==0 || pH->ht==0 ) return 0;
288 xHash = hashFunction(pH->keyClass);
289 assert( xHash!=0 );
290 h = (*xHash)(pKey,nKey);
291 assert( (pH->htsize & (pH->htsize-1))==0 );
292 elem = fts3FindElementByHash(pH,pKey,nKey, h & (pH->htsize-1));
293 return elem ? elem->data : 0;
294}
295
296/* Insert an element into the hash table pH. The key is pKey,nKey
297** and the data is "data".
298**
299** If no element exists with a matching key, then a new
300** element is created. A copy of the key is made if the copyKey
301** flag is set. NULL is returned.
302**
303** If another element already exists with the same key, then the
304** new data replaces the old data and the old data is returned.
305** The key is not copied in this instance. If a malloc fails, then
306** the new data is returned and the hash table is unchanged.
307**
308** If the "data" parameter to this function is NULL, then the
309** element corresponding to "key" is removed from the hash table.
310*/
311void *sqlite3Fts3HashInsert(
312 fts3Hash *pH, /* The hash table to insert into */
313 const void *pKey, /* The key */
314 int nKey, /* Number of bytes in the key */
315 void *data /* The data */
316){
317 int hraw; /* Raw hash value of the key */
318 int h; /* the hash of the key modulo hash table size */
319 fts3HashElem *elem; /* Used to loop thru the element list */
320 fts3HashElem *new_elem; /* New element added to the pH */
321 int (*xHash)(const void*,int); /* The hash function */
322
323 assert( pH!=0 );
324 xHash = hashFunction(pH->keyClass);
325 assert( xHash!=0 );
326 hraw = (*xHash)(pKey, nKey);
327 assert( (pH->htsize & (pH->htsize-1))==0 );
328 h = hraw & (pH->htsize-1);
329 elem = fts3FindElementByHash(pH,pKey,nKey,h);
330 if( elem ){
331 void *old_data = elem->data;
332 if( data==0 ){
333 fts3RemoveElementByHash(pH,elem,h);
334 }else{
335 elem->data = data;
336 }
337 return old_data;
338 }
339 if( data==0 ) return 0;
340 new_elem = (fts3HashElem*)fts3HashMalloc( sizeof(fts3HashElem) );
341 if( new_elem==0 ) return data;
342 if( pH->copyKey && pKey!=0 ){
343 new_elem->pKey = fts3HashMalloc( nKey );
344 if( new_elem->pKey==0 ){
345 fts3HashFree(new_elem);
346 return data;
347 }
348 memcpy((void*)new_elem->pKey, pKey, nKey);
349 }else{
350 new_elem->pKey = (void*)pKey;
351 }
352 new_elem->nKey = nKey;
353 pH->count++;
354 if( pH->htsize==0 ){
355 fts3Rehash(pH,8);
356 if( pH->htsize==0 ){
357 pH->count = 0;
358 fts3HashFree(new_elem);
359 return data;
360 }
361 }
362 if( pH->count > pH->htsize ){
363 fts3Rehash(pH,pH->htsize*2);
364 }
365 assert( pH->htsize>0 );
366 assert( (pH->htsize & (pH->htsize-1))==0 );
367 h = hraw & (pH->htsize-1);
368 fts3HashInsertElement(pH, &pH->ht[h], new_elem);
369 new_elem->data = data;
370 return 0;
371}
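
A usage sketch of the insert/replace/delete semantics described above
(illustrative, not part of the source):

      #include "fts3_hash.h"

      static int one = 1, two = 2;

      void hashUsageExample(void){
        fts3Hash h;
        fts3HashInit(&h, FTS3_HASH_STRING, 1);  /* copyKey: keys are copied */
        fts3HashInsert(&h, "alpha", 6, &one);   /* new key: returns NULL */
        fts3HashInsert(&h, "alpha", 6, &two);   /* replaces: returns &one */
        /* fts3HashFind(&h, "alpha", 6) now returns &two */
        fts3HashInsert(&h, "alpha", 6, 0);      /* NULL data deletes entry */
        fts3HashClear(&h);                      /* reclaim all memory */
      }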
372
373#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h
new file mode 100644
index 0000000..e01954e
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_hash.h
@@ -0,0 +1,110 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the header file for the generic hash-table implementation
13** used in SQLite. We've modified it slightly to serve as a standalone
14** hash table implementation for the full-text indexing module.
15**
16*/
17#ifndef _FTS3_HASH_H_
18#define _FTS3_HASH_H_
19
20/* Forward declarations of structures. */
21typedef struct fts3Hash fts3Hash;
22typedef struct fts3HashElem fts3HashElem;
23
24/* A complete hash table is an instance of the following structure.
25** The internals of this structure are intended to be opaque -- client
26** code should not attempt to access or modify the fields of this structure
27** directly. Change this structure only by using the routines below.
28** However, many of the "procedures" and "functions" for modifying and
29** accessing this structure are really macros, so we can't really make
30** this structure opaque.
31*/
32struct fts3Hash {
33 char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
34 char copyKey; /* True if copy of key made on insert */
35 int count; /* Number of entries in this table */
36 fts3HashElem *first; /* The first element of the array */
37 int htsize; /* Number of buckets in the hash table */
38 struct _fts3ht { /* the hash table */
39 int count; /* Number of entries with this hash */
40 fts3HashElem *chain; /* Pointer to first entry with this hash */
41 } *ht;
42};
43
44/* Each element in the hash table is an instance of the following
45** structure. All elements are stored on a single doubly-linked list.
46**
47** Again, this structure is intended to be opaque, but it can't really
48** be opaque because it is used by macros.
49*/
50struct fts3HashElem {
51 fts3HashElem *next, *prev; /* Next and previous elements in the table */
52 void *data; /* Data associated with this element */
53 void *pKey; int nKey; /* Key associated with this element */
54};
55
56/*
57** There are 2 different modes of operation for a hash table:
58**
59** FTS3_HASH_STRING pKey points to a string that is nKey bytes long
60** (including the null-terminator, if any). Case
61** is respected in comparisons.
62**
63** FTS3_HASH_BINARY pKey points to binary data nKey bytes long.
64** memcmp() is used to compare keys.
65**
66** A copy of the key is made if the copyKey parameter to fts3HashInit is 1.
67*/
68#define FTS3_HASH_STRING 1
69#define FTS3_HASH_BINARY 2
70
71/*
72** Access routines. To delete, insert a NULL pointer.
73*/
74void sqlite3Fts3HashInit(fts3Hash*, int keytype, int copyKey);
75void *sqlite3Fts3HashInsert(fts3Hash*, const void *pKey, int nKey, void *pData);
76void *sqlite3Fts3HashFind(const fts3Hash*, const void *pKey, int nKey);
77void sqlite3Fts3HashClear(fts3Hash*);
78
79/*
80** Shorthand for the functions above
81*/
82#define fts3HashInit sqlite3Fts3HashInit
83#define fts3HashInsert sqlite3Fts3HashInsert
84#define fts3HashFind sqlite3Fts3HashFind
85#define fts3HashClear sqlite3Fts3HashClear
86
87/*
88** Macros for looping over all elements of a hash table. The idiom is
89** like this:
90**
91** fts3Hash h;
92** fts3HashElem *p;
93** ...
94** for(p=fts3HashFirst(&h); p; p=fts3HashNext(p)){
95** SomeStructure *pData = fts3HashData(p);
96** // do something with pData
97** }
98*/
99#define fts3HashFirst(H) ((H)->first)
100#define fts3HashNext(E) ((E)->next)
101#define fts3HashData(E) ((E)->data)
102#define fts3HashKey(E) ((E)->pKey)
103#define fts3HashKeysize(E) ((E)->nKey)
104
105/*
106** Number of entries in a hash table
107*/
108#define fts3HashCount(H) ((H)->count)
109
110#endif /* _FTS3_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c
new file mode 100644
index 0000000..86a9a50
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_icu.c
@@ -0,0 +1,257 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file implements a tokenizer for fts3 based on the ICU library.
13**
14** $Id: fts3_icu.c,v 1.1 2007/08/20 17:37:04 shess Exp $
15*/
16
17#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
18#ifdef SQLITE_ENABLE_ICU
19
20#include <assert.h>
21#include <string.h>
22#include "fts3_tokenizer.h"
23
24#include <unicode/ubrk.h>
25#include <unicode/ucol.h>
26#include <unicode/ustring.h>
27#include <unicode/utf16.h>
28
29typedef struct IcuTokenizer IcuTokenizer;
30typedef struct IcuCursor IcuCursor;
31
32struct IcuTokenizer {
33 sqlite3_tokenizer base;
34 char *zLocale;
35};
36
37struct IcuCursor {
38 sqlite3_tokenizer_cursor base;
39
40 UBreakIterator *pIter; /* ICU break-iterator object */
41 int nChar; /* Number of UChar elements in pInput */
42 UChar *aChar; /* Copy of input using utf-16 encoding */
43 int *aOffset; /* Offsets of each character in utf-8 input */
44
45 int nBuffer;
46 char *zBuffer;
47
48 int iToken;
49};
50
51/*
52** Create a new tokenizer instance.
53*/
54static int icuCreate(
55 int argc, /* Number of entries in argv[] */
56 const char * const *argv, /* Tokenizer creation arguments */
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
58){
59 IcuTokenizer *p;
60 int n = 0;
61
62 if( argc>0 ){
63 n = strlen(argv[0])+1;
64 }
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66 if( !p ){
67 return SQLITE_NOMEM;
68 }
69 memset(p, 0, sizeof(IcuTokenizer));
70
71 if( n ){
72 p->zLocale = (char *)&p[1];
73 memcpy(p->zLocale, argv[0], n);
74 }
75
76 *ppTokenizer = (sqlite3_tokenizer *)p;
77
78 return SQLITE_OK;
79}
80
81/*
82** Destroy a tokenizer
83*/
84static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
86 sqlite3_free(p);
87 return SQLITE_OK;
88}
89
90/*
91** Prepare to begin tokenizing a particular string. The input
92** string to be tokenized is pInput[0..nBytes-1]. A cursor
93** used to incrementally tokenize this string is returned in
94** *ppCursor.
95*/
96static int icuOpen(
97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
98 const char *zInput, /* Input string */
99 int nInput, /* Length of zInput in bytes */
100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
101){
102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
103 IcuCursor *pCsr;
104
105 const int32_t opt = U_FOLD_CASE_DEFAULT;
106 UErrorCode status = U_ZERO_ERROR;
107 int nChar;
108
109 UChar32 c;
110 int iInput = 0;
111 int iOut = 0;
112
113 *ppCursor = 0;
114
115 nChar = nInput+1;
116 pCsr = (IcuCursor *)sqlite3_malloc(
117 sizeof(IcuCursor) + /* IcuCursor */
118 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
119 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
120 );
121 if( !pCsr ){
122 return SQLITE_NOMEM;
123 }
124 memset(pCsr, 0, sizeof(IcuCursor));
125 pCsr->aChar = (UChar *)&pCsr[1];
126 pCsr->aOffset = (int *)&pCsr->aChar[nChar];
127
128 pCsr->aOffset[iOut] = iInput;
129 U8_NEXT(zInput, iInput, nInput, c);
130 while( c>0 ){
131 int isError = 0;
132 c = u_foldCase(c, opt);
133 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
134 if( isError ){
135 sqlite3_free(pCsr);
136 return SQLITE_ERROR;
137 }
138 pCsr->aOffset[iOut] = iInput;
139
140 if( iInput<nInput ){
141 U8_NEXT(zInput, iInput, nInput, c);
142 }else{
143 c = 0;
144 }
145 }
146
147 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
148 if( !U_SUCCESS(status) ){
149 sqlite3_free(pCsr);
150 return SQLITE_ERROR;
151 }
152 pCsr->nChar = iOut;
153
154 ubrk_first(pCsr->pIter);
155 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
156 return SQLITE_OK;
157}
158
159/*
160** Close a tokenization cursor previously opened by a call to icuOpen().
161*/
162static int icuClose(sqlite3_tokenizer_cursor *pCursor){
163 IcuCursor *pCsr = (IcuCursor *)pCursor;
164 ubrk_close(pCsr->pIter);
165 sqlite3_free(pCsr->zBuffer);
166 sqlite3_free(pCsr);
167 return SQLITE_OK;
168}
169
170/*
171** Extract the next token from a tokenization cursor.
172*/
173static int icuNext(
174  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by icuOpen */
175 const char **ppToken, /* OUT: *ppToken is the token text */
176 int *pnBytes, /* OUT: Number of bytes in token */
177 int *piStartOffset, /* OUT: Starting offset of token */
178 int *piEndOffset, /* OUT: Ending offset of token */
179 int *piPosition /* OUT: Position integer of token */
180){
181 IcuCursor *pCsr = (IcuCursor *)pCursor;
182
183 int iStart = 0;
184 int iEnd = 0;
185 int nByte = 0;
186
187 while( iStart==iEnd ){
188 UChar32 c;
189
190 iStart = ubrk_current(pCsr->pIter);
191 iEnd = ubrk_next(pCsr->pIter);
192 if( iEnd==UBRK_DONE ){
193 return SQLITE_DONE;
194 }
195
196 while( iStart<iEnd ){
197 int iWhite = iStart;
198      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); /* aChar[] is utf-16 */
199 if( u_isspace(c) ){
200 iStart = iWhite;
201 }else{
202 break;
203 }
204 }
205 assert(iStart<=iEnd);
206 }
207
208 do {
209 UErrorCode status = U_ZERO_ERROR;
210 if( nByte ){
211 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
212 if( !zNew ){
213 return SQLITE_NOMEM;
214 }
215 pCsr->zBuffer = zNew;
216 pCsr->nBuffer = nByte;
217 }
218
219 u_strToUTF8(
220 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
221 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
222 &status /* Output success/failure */
223 );
224 } while( nByte>pCsr->nBuffer );
225
226 *ppToken = pCsr->zBuffer;
227 *pnBytes = nByte;
228 *piStartOffset = pCsr->aOffset[iStart];
229 *piEndOffset = pCsr->aOffset[iEnd];
230 *piPosition = pCsr->iToken++;
231
232 return SQLITE_OK;
233}
234
235/*
236** The set of routines that implement the ICU tokenizer
237*/
238static const sqlite3_tokenizer_module icuTokenizerModule = {
239 0, /* iVersion */
240 icuCreate, /* xCreate */
241  icuDestroy,                  /* xDestroy */
242 icuOpen, /* xOpen */
243 icuClose, /* xClose */
244 icuNext, /* xNext */
245};
246
247/*
248** Set *ppModule to point at the implementation of the ICU tokenizer.
249*/
250void sqlite3Fts3IcuTokenizerModule(
251 sqlite3_tokenizer_module const**ppModule
252){
253 *ppModule = &icuTokenizerModule;
254}
255
256#endif /* defined(SQLITE_ENABLE_ICU) */
257#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
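
As with any tokenizer, the module pointer returned by
sqlite3Fts3IcuTokenizerModule() reaches FTS3 through the fts3_tokenizer()
scalar function described in README.tokenizers. A minimal registration
sketch, mirroring the registerTokenizer() example in fts3_tokenizer.c (the
name "icu" and the omitted error handling are the caller's choice, and the
build must define SQLITE_ENABLE_ICU):

    #include "sqlite3.h"
    #include "fts3_tokenizer.h"

    extern void sqlite3Fts3IcuTokenizerModule(
        sqlite3_tokenizer_module const**);

    /* Register the ICU tokenizer under the name "icu" by binding the
    ** module pointer as a blob parameter of fts3_tokenizer(). */
    static int registerIcuTokenizer(sqlite3 *db){
      const sqlite3_tokenizer_module *p = 0;
      sqlite3_stmt *pStmt;
      int rc;

      sqlite3Fts3IcuTokenizerModule(&p);
      rc = sqlite3_prepare_v2(db, "SELECT fts3_tokenizer('icu', ?)", -1,
                              &pStmt, 0);
      if( rc!=SQLITE_OK ) return rc;
      sqlite3_bind_blob(pStmt, 1, &p, sizeof(p), SQLITE_STATIC);
      sqlite3_step(pStmt);
      return sqlite3_finalize(pStmt);
    }

A table can then select the tokenizer with an optional locale argument,
which icuCreate() stores as zLocale, e.g.:

    CREATE VIRTUAL TABLE t USING fts3(content, tokenizer icu en_US);
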
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c
new file mode 100644
index 0000000..14e129f
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_porter.c
@@ -0,0 +1,642 @@
1/*
2** 2006 September 30
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** Implementation of the full-text-search tokenizer that implements
13** a Porter stemmer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS3 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS3 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts3_tokenizer.h"
35
36/*
37** Class derived from sqlite3_tokenizer
38*/
39typedef struct porter_tokenizer {
40 sqlite3_tokenizer base; /* Base class */
41} porter_tokenizer;
42
43/*
44** Class derived from sqlite3_tokenizer_cursor
45*/
46typedef struct porter_tokenizer_cursor {
47 sqlite3_tokenizer_cursor base;
48 const char *zInput; /* input we are tokenizing */
49 int nInput; /* size of the input */
50 int iOffset; /* current position in zInput */
51 int iToken; /* index of next token to be returned */
52 char *zToken; /* storage for current token */
53 int nAllocated; /* space allocated to zToken buffer */
54} porter_tokenizer_cursor;
55
56
57/* Forward declaration */
58static const sqlite3_tokenizer_module porterTokenizerModule;
59
60
61/*
62** Create a new tokenizer instance.
63*/
64static int porterCreate(
65 int argc, const char * const *argv,
66 sqlite3_tokenizer **ppTokenizer
67){
68 porter_tokenizer *t;
69 t = (porter_tokenizer *) calloc(sizeof(*t), 1);
70 if( t==NULL ) return SQLITE_NOMEM;
71
72 *ppTokenizer = &t->base;
73 return SQLITE_OK;
74}
75
76/*
77** Destroy a tokenizer
78*/
79static int porterDestroy(sqlite3_tokenizer *pTokenizer){
80 free(pTokenizer);
81 return SQLITE_OK;
82}
83
84/*
85** Prepare to begin tokenizing a particular string. The input
86** string to be tokenized is zInput[0..nInput-1]. A cursor
87** used to incrementally tokenize this string is returned in
88** *ppCursor.
89*/
90static int porterOpen(
91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
92 const char *zInput, int nInput, /* String to be tokenized */
93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
94){
95 porter_tokenizer_cursor *c;
96
97 c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
98 if( c==NULL ) return SQLITE_NOMEM;
99
100 c->zInput = zInput;
101 if( zInput==0 ){
102 c->nInput = 0;
103 }else if( nInput<0 ){
104 c->nInput = (int)strlen(zInput);
105 }else{
106 c->nInput = nInput;
107 }
108 c->iOffset = 0; /* start tokenizing at the beginning */
109 c->iToken = 0;
110 c->zToken = NULL; /* no space allocated, yet. */
111 c->nAllocated = 0;
112
113 *ppCursor = &c->base;
114 return SQLITE_OK;
115}
116
117/*
118** Close a tokenization cursor previously opened by a call to
119** porterOpen() above.
120*/
121static int porterClose(sqlite3_tokenizer_cursor *pCursor){
122 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
123 free(c->zToken);
124 free(c);
125 return SQLITE_OK;
126}
127/*
128** Vowel or consonant
129*/
130static const char cType[] = {
131 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
132 1, 1, 1, 2, 1
133};
134
135/*
136** isConsonant() and isVowel() determine whether the first character
137** of the string they point to is a consonant or a vowel, according
138** to Porter's rules.
139**
140** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
141** 'Y' is a consonant unless it follows another consonant,
142** in which case it is a vowel.
143**
144** In these routines, the letters are in reverse order. So the 'y' rule
145** is that 'y' is a consonant unless it is followed by another
146** consonant.
147*/
148static int isVowel(const char*);
149static int isConsonant(const char *z){
150 int j;
151 char x = *z;
152 if( x==0 ) return 0;
153 assert( x>='a' && x<='z' );
154 j = cType[x-'a'];
155 if( j<2 ) return j;
156 return z[1]==0 || isVowel(z + 1);
157}
158static int isVowel(const char *z){
159 int j;
160 char x = *z;
161 if( x==0 ) return 0;
162 assert( x>='a' && x<='z' );
163 j = cType[x-'a'];
164 if( j<2 ) return 1-j;
165 return isConsonant(z + 1);
166}
167
168/*
169** Let any sequence of one or more vowels be represented by V and let
170** C be a sequence of one or more consonants. Then every word can be
171** represented as:
172**
173**     [C] (VC){m} [V]
174**
175** In prose: a word is an optional consonant sequence followed by zero
176** or more vowel-consonant pairs followed by an optional vowel. "m" is
177** the number of vowel-consonant pairs. This routine tests the value of
178** m for the word z[].
179**
180** Return true if the m-value for z is 1 or more. In other words,
181** return true if z contains at least one vowel that is followed
182** by a consonant.
183**
184** In this routine z[] is in reverse order. So we are really looking
185** for an instance of a consonant followed by a vowel.
186*/
187static int m_gt_0(const char *z){
188 while( isVowel(z) ){ z++; }
189 if( *z==0 ) return 0;
190 while( isConsonant(z) ){ z++; }
191 return *z!=0;
192}
193
194/* Like m_gt_0 above except we are looking for a value of m which is
195** exactly 1
196*/
197static int m_eq_1(const char *z){
198 while( isVowel(z) ){ z++; }
199 if( *z==0 ) return 0;
200 while( isConsonant(z) ){ z++; }
201 if( *z==0 ) return 0;
202 while( isVowel(z) ){ z++; }
203 if( *z==0 ) return 1;
204 while( isConsonant(z) ){ z++; }
205 return *z==0;
206}
207
208/* Like m_gt_0 above except we are looking for a value of m>1 instead
209** of m>0
210*/
211static int m_gt_1(const char *z){
212 while( isVowel(z) ){ z++; }
213 if( *z==0 ) return 0;
214 while( isConsonant(z) ){ z++; }
215 if( *z==0 ) return 0;
216 while( isVowel(z) ){ z++; }
217 if( *z==0 ) return 0;
218 while( isConsonant(z) ){ z++; }
219 return *z!=0;
220}
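
The three measure tests above walk the same alternating vowel/consonant runs
and differ only in how far they go. To make the m-value concrete, here is a
standalone sketch (an illustration, not part of the extension) that computes
m on a forward-order, lowercase word using the same classification rules as
cType[] above, with 'y' a vowel only when it follows a consonant. With
Porter's published examples it prints m=0 for "tree", m=1 for "trouble" and
m=2 for "troubles":

    #include <stdio.h>
    #include <string.h>

    /* Forward-order vowel test: a, e, i, o, u are always vowels, and
    ** 'y' is a vowel only when the preceding letter is a consonant. */
    static int isVowelFwd(const char *z, int i){
      char c = z[i];
      if( c=='a' || c=='e' || c=='i' || c=='o' || c=='u' ) return 1;
      if( c=='y' ) return i>0 && !isVowelFwd(z, i-1);
      return 0;
    }

    /* Compute Porter's m: the number of VC pairs in the
    ** [C](VC){m}[V] decomposition of the word. */
    static int measure(const char *z){
      int i = 0, n = (int)strlen(z), m = 0;
      while( i<n && !isVowelFwd(z,i) ) i++;    /* optional leading C */
      while( i<n ){
        while( i<n && isVowelFwd(z,i) ) i++;   /* a run of vowels */
        if( i>=n ) break;                      /* trailing V: no VC pair */
        while( i<n && !isVowelFwd(z,i) ) i++;  /* a run of consonants */
        m++;                                   /* one complete VC pair */
      }
      return m;
    }

    int main(void){
      const char *aWord[] = { "tree", "trouble", "troubles", "oaten" };
      int i;
      for(i=0; i<4; i++){
        printf("m(%s) = %d\n", aWord[i], measure(aWord[i]));
      }
      return 0;
    }
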
221
222/*
223** Return TRUE if there is a vowel anywhere within z[]
224*/
225static int hasVowel(const char *z){
226 while( isConsonant(z) ){ z++; }
227 return *z!=0;
228}
229
230/*
231** Return TRUE if the word ends in a double consonant.
232**
233** The text is reversed here. So we are really looking at
234** the first two characters of z[].
235*/
236static int doubleConsonant(const char *z){
237 return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
238}
239
240/*
241** Return TRUE if the word ends with three letters which
242** are consonant-vowel-consonant and where the final consonant
243** is not 'w', 'x', or 'y'.
244**
245** The word is reversed here. So we are really checking the
246** first three letters and the first one cannot be in [wxy].
247*/
248static int star_oh(const char *z){
249 return
250 z[0]!=0 && isConsonant(z) &&
251 z[0]!='w' && z[0]!='x' && z[0]!='y' &&
252 z[1]!=0 && isVowel(z+1) &&
253 z[2]!=0 && isConsonant(z+2);
254}
255
256/*
257** If the word ends with zFrom and xCond() is true for the stem
258** of the word that precedes the zFrom ending, then change the
259** ending to zTo.
260**
261** The input word *pz and zFrom are both in reverse order. zTo
262** is in normal order.
263**
264** Return TRUE if zFrom matches. Return FALSE if zFrom does not
265** match. Note that TRUE is returned even if xCond() fails and
266** no substitution occurs.
267*/
268static int stem(
269 char **pz, /* The word being stemmed (Reversed) */
270 const char *zFrom, /* If the ending matches this... (Reversed) */
271 const char *zTo, /* ... change the ending to this (not reversed) */
272 int (*xCond)(const char*) /* Condition that must be true */
273){
274 char *z = *pz;
275 while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
276 if( *zFrom!=0 ) return 0;
277 if( xCond && !xCond(z) ) return 1;
278 while( *zTo ){
279 *(--z) = *(zTo++);
280 }
281 *pz = z;
282 return 1;
283}
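
A worked example of the reversed-buffer convention, written as a hypothetical
self-check that would have to live inside this file (stem() and m_gt_0() are
static; assert.h and string.h are already included above). "relational"
reversed is "lanoitaler"; the suffix "lanoita" ("ational" reversed) matches,
m_gt_0() is true for the remaining stem "rel", so the ending is rewritten in
place and z is left pointing at "etaler", i.e. "relate" reversed:

    #ifdef SQLITE_TEST
    static void porterStemExample(void){
      char zBuf[] = "lanoitaler";      /* "relational", reversed */
      char *z = zBuf;
      int matched = stem(&z, "lanoita", "ate", m_gt_0);
      assert( matched );
      assert( 0==strcmp(z, "etaler") ); /* "relate", reversed */
    }
    #endif
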
284
285/*
286** This is the fallback stemmer used when the porter stemmer is
287** inappropriate. The input word is copied into the output with
288** US-ASCII case folding. If the input word is too long (more
289** than 20 bytes if it contains no digits or more than 6 bytes if
290** it contains digits) then the word is truncated to 20 or 6 bytes
291** by taking 10 or 3 bytes from the beginning and end.
292*/
293static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
294 int i, mx, j;
295 int hasDigit = 0;
296 for(i=0; i<nIn; i++){
297 int c = zIn[i];
298 if( c>='A' && c<='Z' ){
299 zOut[i] = c - 'A' + 'a';
300 }else{
301 if( c>='0' && c<='9' ) hasDigit = 1;
302 zOut[i] = c;
303 }
304 }
305 mx = hasDigit ? 3 : 10;
306 if( nIn>mx*2 ){
307 for(j=mx, i=nIn-mx; i<nIn; i++, j++){
308 zOut[j] = zOut[i];
309 }
310 i = j;
311 }
312 zOut[i] = 0;
313 *pnOut = i;
314}
315
316
317/*
318** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
319** zOut is at least big enough to hold nIn bytes. Write the actual
320** size of the output word (exclusive of the '\0' terminator) into *pnOut.
321**
322** Any upper-case characters in the US-ASCII character set ([A-Z])
323** are converted to lower case. Upper-case UTF characters are
324** unchanged.
325**
326** Words that are longer than about 20 bytes are stemmed by retaining
327** a few bytes from the beginning and the end of the word. If the
328** word contains digits, 3 bytes are taken from the beginning and
329** 3 bytes from the end. For long words without digits, 10 bytes
330** are taken from each end. US-ASCII case folding still applies.
331**
332** If the input word contains no digits but does contain characters
333** not in [a-zA-Z] then no stemming is attempted and this routine just
334** copies the input into the output with US-ASCII
335** case folding.
336**
337** Stemming never increases the length of the word. So there is
338** no chance of overflowing the zOut buffer.
339*/
340static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
341 int i, j, c;
342 char zReverse[28];
343 char *z, *z2;
344 if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
345 /* The word is too big or too small for the porter stemmer.
346 ** Fallback to the copy stemmer */
347 copy_stemmer(zIn, nIn, zOut, pnOut);
348 return;
349 }
350 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
351 c = zIn[i];
352 if( c>='A' && c<='Z' ){
353 zReverse[j] = c + 'a' - 'A';
354 }else if( c>='a' && c<='z' ){
355 zReverse[j] = c;
356 }else{
357 /* The use of a character not in [a-zA-Z] means that we fallback
358 ** to the copy stemmer */
359 copy_stemmer(zIn, nIn, zOut, pnOut);
360 return;
361 }
362 }
363 memset(&zReverse[sizeof(zReverse)-5], 0, 5);
364 z = &zReverse[j+1];
365
366
367 /* Step 1a */
368 if( z[0]=='s' ){
369 if(
370 !stem(&z, "sess", "ss", 0) &&
371 !stem(&z, "sei", "i", 0) &&
372 !stem(&z, "ss", "ss", 0)
373 ){
374 z++;
375 }
376 }
377
378 /* Step 1b */
379 z2 = z;
380 if( stem(&z, "dee", "ee", m_gt_0) ){
381 /* Do nothing. The work was all in the test */
382 }else if(
383 (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
384 && z!=z2
385 ){
386 if( stem(&z, "ta", "ate", 0) ||
387 stem(&z, "lb", "ble", 0) ||
388 stem(&z, "zi", "ize", 0) ){
389 /* Do nothing. The work was all in the test */
390 }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
391 z++;
392 }else if( m_eq_1(z) && star_oh(z) ){
393 *(--z) = 'e';
394 }
395 }
396
397 /* Step 1c */
398 if( z[0]=='y' && hasVowel(z+1) ){
399 z[0] = 'i';
400 }
401
402 /* Step 2 */
403 switch( z[1] ){
404 case 'a':
405 stem(&z, "lanoita", "ate", m_gt_0) ||
406 stem(&z, "lanoit", "tion", m_gt_0);
407 break;
408 case 'c':
409 stem(&z, "icne", "ence", m_gt_0) ||
410 stem(&z, "icna", "ance", m_gt_0);
411 break;
412 case 'e':
413 stem(&z, "rezi", "ize", m_gt_0);
414 break;
415 case 'g':
416 stem(&z, "igol", "log", m_gt_0);
417 break;
418 case 'l':
419 stem(&z, "ilb", "ble", m_gt_0) ||
420 stem(&z, "illa", "al", m_gt_0) ||
421 stem(&z, "iltne", "ent", m_gt_0) ||
422 stem(&z, "ile", "e", m_gt_0) ||
423 stem(&z, "ilsuo", "ous", m_gt_0);
424 break;
425 case 'o':
426 stem(&z, "noitazi", "ize", m_gt_0) ||
427 stem(&z, "noita", "ate", m_gt_0) ||
428 stem(&z, "rota", "ate", m_gt_0);
429 break;
430 case 's':
431 stem(&z, "msila", "al", m_gt_0) ||
432 stem(&z, "ssenevi", "ive", m_gt_0) ||
433 stem(&z, "ssenluf", "ful", m_gt_0) ||
434 stem(&z, "ssensuo", "ous", m_gt_0);
435 break;
436 case 't':
437 stem(&z, "itila", "al", m_gt_0) ||
438 stem(&z, "itivi", "ive", m_gt_0) ||
439 stem(&z, "itilib", "ble", m_gt_0);
440 break;
441 }
442
443 /* Step 3 */
444 switch( z[0] ){
445 case 'e':
446 stem(&z, "etaci", "ic", m_gt_0) ||
447 stem(&z, "evita", "", m_gt_0) ||
448 stem(&z, "ezila", "al", m_gt_0);
449 break;
450 case 'i':
451 stem(&z, "itici", "ic", m_gt_0);
452 break;
453 case 'l':
454 stem(&z, "laci", "ic", m_gt_0) ||
455 stem(&z, "luf", "", m_gt_0);
456 break;
457 case 's':
458 stem(&z, "ssen", "", m_gt_0);
459 break;
460 }
461
462 /* Step 4 */
463 switch( z[1] ){
464 case 'a':
465 if( z[0]=='l' && m_gt_1(z+2) ){
466 z += 2;
467 }
468 break;
469 case 'c':
470 if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
471 z += 4;
472 }
473 break;
474 case 'e':
475 if( z[0]=='r' && m_gt_1(z+2) ){
476 z += 2;
477 }
478 break;
479 case 'i':
480 if( z[0]=='c' && m_gt_1(z+2) ){
481 z += 2;
482 }
483 break;
484 case 'l':
485 if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
486 z += 4;
487 }
488 break;
489 case 'n':
490 if( z[0]=='t' ){
491 if( z[2]=='a' ){
492 if( m_gt_1(z+3) ){
493 z += 3;
494 }
495 }else if( z[2]=='e' ){
496 stem(&z, "tneme", "", m_gt_1) ||
497 stem(&z, "tnem", "", m_gt_1) ||
498 stem(&z, "tne", "", m_gt_1);
499 }
500 }
501 break;
502 case 'o':
503 if( z[0]=='u' ){
504 if( m_gt_1(z+2) ){
505 z += 2;
506 }
507 }else if( z[3]=='s' || z[3]=='t' ){
508 stem(&z, "noi", "", m_gt_1);
509 }
510 break;
511 case 's':
512 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
513 z += 3;
514 }
515 break;
516 case 't':
517 stem(&z, "eta", "", m_gt_1) ||
518 stem(&z, "iti", "", m_gt_1);
519 break;
520 case 'u':
521 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
522 z += 3;
523 }
524 break;
525 case 'v':
526 case 'z':
527 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
528 z += 3;
529 }
530 break;
531 }
532
533 /* Step 5a */
534 if( z[0]=='e' ){
535 if( m_gt_1(z+1) ){
536 z++;
537 }else if( m_eq_1(z+1) && !star_oh(z+1) ){
538 z++;
539 }
540 }
541
542 /* Step 5b */
543 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
544 z++;
545 }
546
547 /* z[] is now the stemmed word in reverse order. Flip it back
548 ** around into forward order and return.
549 */
550 *pnOut = i = strlen(z);
551 zOut[i] = 0;
552 while( *z ){
553 zOut[--i] = *(z++);
554 }
555}
556
557/*
558** Characters that can be part of a token. We assume any character
559** whose value is greater than 0x80 (any UTF character) can be
560** part of a token. In other words, delimiters all must have
561** values of 0x7f or lower.
562*/
563static const char porterIdChar[] = {
564/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
565 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
566 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
567 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
568 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
569 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
570};
571#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
572
573/*
574** Extract the next token from a tokenization cursor. The cursor must
575** have been opened by a prior call to porterOpen().
576*/
577static int porterNext(
578 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
579 const char **pzToken, /* OUT: *pzToken is the token text */
580 int *pnBytes, /* OUT: Number of bytes in token */
581 int *piStartOffset, /* OUT: Starting offset of token */
582 int *piEndOffset, /* OUT: Ending offset of token */
583 int *piPosition /* OUT: Position integer of token */
584){
585 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
586 const char *z = c->zInput;
587
588 while( c->iOffset<c->nInput ){
589 int iStartOffset, ch;
590
591 /* Scan past delimiter characters */
592 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
593 c->iOffset++;
594 }
595
596 /* Count non-delimiter characters. */
597 iStartOffset = c->iOffset;
598 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
599 c->iOffset++;
600 }
601
602 if( c->iOffset>iStartOffset ){
603 int n = c->iOffset-iStartOffset;
604      if( n>c->nAllocated ){
605        char *zNew = realloc(c->zToken, n+20); /* keep old buffer on failure */
606        if( zNew==NULL ) return SQLITE_NOMEM;
607        c->zToken = zNew; c->nAllocated = n+20;
608      }
609 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
610 *pzToken = c->zToken;
611 *piStartOffset = iStartOffset;
612 *piEndOffset = c->iOffset;
613 *piPosition = c->iToken++;
614 return SQLITE_OK;
615 }
616 }
617 return SQLITE_DONE;
618}
619
620/*
621** The set of routines that implement the porter-stemmer tokenizer
622*/
623static const sqlite3_tokenizer_module porterTokenizerModule = {
624 0,
625 porterCreate,
626 porterDestroy,
627 porterOpen,
628 porterClose,
629 porterNext,
630};
631
632/*
633** Allocate a new porter tokenizer. Return a pointer to the new
634** tokenizer in *ppModule
635*/
636void sqlite3Fts3PorterTokenizerModule(
637 sqlite3_tokenizer_module const**ppModule
638){
639 *ppModule = &porterTokenizerModule;
640}
641
642#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
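
Because sqlite3Fts3PorterTokenizerModule() is exported, the stemmer can also
be exercised directly through the module interface, without going through
SQL. A minimal driver sketch (assuming it is compiled and linked together
with this file; note that, per the TODO in fts3_tokenizer.h, the input must
be nul-terminated):

    #include <stdio.h>
    #include <string.h>
    #include "fts3_tokenizer.h"

    extern void sqlite3Fts3PorterTokenizerModule(
        sqlite3_tokenizer_module const**);

    int main(void){
      const sqlite3_tokenizer_module *pModule;
      sqlite3_tokenizer *pTokenizer;
      sqlite3_tokenizer_cursor *pCsr;
      const char *zText = "relational engineering conditionally";
      const char *zToken;
      int nToken, iStart, iEnd, iPos;

      sqlite3Fts3PorterTokenizerModule(&pModule);
      if( pModule->xCreate(0, 0, &pTokenizer)!=SQLITE_OK ) return 1;
      pTokenizer->pModule = pModule;       /* the caller fills this in */
      if( pModule->xOpen(pTokenizer, zText, (int)strlen(zText), &pCsr)
          !=SQLITE_OK ){
        return 1;
      }
      pCsr->pTokenizer = pTokenizer;

      /* Print each stemmed token with its position */
      while( pModule->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos)
             ==SQLITE_OK ){
        printf("%d: %.*s\n", iPos, nToken, zToken);
      }
      pModule->xClose(pCsr);
      pModule->xDestroy(pTokenizer);
      return 0;
    }
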
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c
new file mode 100644
index 0000000..7398227
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.c
@@ -0,0 +1,371 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This is part of an SQLite module implementing full-text search.
14** This particular file implements the generic tokenizer interface.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS3 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS3 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
27
28
29#include "sqlite3.h"
30#include "sqlite3ext.h"
31SQLITE_EXTENSION_INIT1
32
33#include "fts3_hash.h"
34#include "fts3_tokenizer.h"
35#include <assert.h>
36
37/*
38** Implementation of the SQL scalar function for accessing the underlying
39** hash table. This function may be called as follows:
40**
41** SELECT <function-name>(<key-name>);
42** SELECT <function-name>(<key-name>, <pointer>);
43**
44** where <function-name> is the name passed as the second argument
45** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer').
46**
47** If the <pointer> argument is specified, it must be a blob value
48** containing a pointer to be stored as the hash data corresponding
49** to the string <key-name>. If <pointer> is not specified, then
50** the string <key-name> must already exist in the hash table. Otherwise,
51** an error is returned.
52**
53** Whether or not the <pointer> argument is specified, the value returned
54** is a blob containing the pointer stored as the hash data corresponding
55** to string <key-name> (after the hash-table is updated, if applicable).
56*/
57static void scalarFunc(
58 sqlite3_context *context,
59 int argc,
60 sqlite3_value **argv
61){
62 fts3Hash *pHash;
63 void *pPtr = 0;
64 const unsigned char *zName;
65 int nName;
66
67 assert( argc==1 || argc==2 );
68
69 pHash = (fts3Hash *)sqlite3_user_data(context);
70
71 zName = sqlite3_value_text(argv[0]);
72 nName = sqlite3_value_bytes(argv[0])+1;
73
74 if( argc==2 ){
75 void *pOld;
76 int n = sqlite3_value_bytes(argv[1]);
77 if( n!=sizeof(pPtr) ){
78 sqlite3_result_error(context, "argument type mismatch", -1);
79 return;
80 }
81 pPtr = *(void **)sqlite3_value_blob(argv[1]);
82 pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
83 if( pOld==pPtr ){
84 sqlite3_result_error(context, "out of memory", -1);
85 return;
86 }
87 }else{
88 pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
89 if( !pPtr ){
90 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
91 sqlite3_result_error(context, zErr, -1);
92 sqlite3_free(zErr);
93 return;
94 }
95 }
96
97 sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
98}
99
100#ifdef SQLITE_TEST
101
102#include <tcl.h>
103#include <string.h>
104
105/*
106** Implementation of a special SQL scalar function for testing tokenizers
107** designed to be used in concert with the Tcl testing framework. This
108** function must be called with two or three arguments:
109**
110**   SELECT <function-name>(<key-name>, <input-string>);
111**   SELECT <function-name>(<key-name>, <tokenizer-arg>, <input-string>);
112**
113** where <function-name> is the name passed as the second argument
114** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
115** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test').
116**
117** The return value is a string that may be interpreted as a Tcl
118** list. For each token in the <input-string>, three elements are
119** added to the returned list. The first is the token position, the
120** second is the token text (folded, stemmed, etc.) and the third is the
121** substring of <input-string> associated with the token. For example,
122** using the built-in "simple" tokenizer:
123**
124**   SELECT fts3_tokenizer_test('simple', 'I don''t see how');
125**
126** will return the string:
127**
128** "{0 i I 1 dont don't 2 see see 3 how how}"
129**
130*/
131static void testFunc(
132 sqlite3_context *context,
133 int argc,
134 sqlite3_value **argv
135){
136 fts3Hash *pHash;
137 sqlite3_tokenizer_module *p;
138 sqlite3_tokenizer *pTokenizer = 0;
139 sqlite3_tokenizer_cursor *pCsr = 0;
140
141 const char *zErr = 0;
142
143 const char *zName;
144 int nName;
145 const char *zInput;
146 int nInput;
147
148 const char *zArg = 0;
149
150 const char *zToken;
151 int nToken;
152 int iStart;
153 int iEnd;
154 int iPos;
155
156 Tcl_Obj *pRet;
157
158 assert( argc==2 || argc==3 );
159
160 nName = sqlite3_value_bytes(argv[0]);
161 zName = (const char *)sqlite3_value_text(argv[0]);
162 nInput = sqlite3_value_bytes(argv[argc-1]);
163 zInput = (const char *)sqlite3_value_text(argv[argc-1]);
164
165 if( argc==3 ){
166 zArg = (const char *)sqlite3_value_text(argv[1]);
167 }
168
169 pHash = (fts3Hash *)sqlite3_user_data(context);
170 p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);
171
172 if( !p ){
173 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
174 sqlite3_result_error(context, zErr, -1);
175 sqlite3_free(zErr);
176 return;
177 }
178
179 pRet = Tcl_NewObj();
180 Tcl_IncrRefCount(pRet);
181
182 if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
183 zErr = "error in xCreate()";
184 goto finish;
185 }
186 pTokenizer->pModule = p;
187 if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
188 zErr = "error in xOpen()";
189 goto finish;
190 }
191 pCsr->pTokenizer = pTokenizer;
192
193 while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
194 Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
195 Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
196 zToken = &zInput[iStart];
197 nToken = iEnd-iStart;
198 Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
199 }
200
201 if( SQLITE_OK!=p->xClose(pCsr) ){
202 zErr = "error in xClose()";
203 goto finish;
204 }
205 if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
206 zErr = "error in xDestroy()";
207 goto finish;
208 }
209
210finish:
211 if( zErr ){
212 sqlite3_result_error(context, zErr, -1);
213 }else{
214 sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
215 }
216 Tcl_DecrRefCount(pRet);
217}
218
219static
220int registerTokenizer(
221 sqlite3 *db,
222 char *zName,
223 const sqlite3_tokenizer_module *p
224){
225 int rc;
226 sqlite3_stmt *pStmt;
227 const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
228
229 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
230 if( rc!=SQLITE_OK ){
231 return rc;
232 }
233
234 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
235 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
236 sqlite3_step(pStmt);
237
238 return sqlite3_finalize(pStmt);
239}
240
241static
242int queryTokenizer(
243 sqlite3 *db,
244 char *zName,
245 const sqlite3_tokenizer_module **pp
246){
247 int rc;
248 sqlite3_stmt *pStmt;
249 const char zSql[] = "SELECT fts3_tokenizer(?)";
250
251 *pp = 0;
252 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
253 if( rc!=SQLITE_OK ){
254 return rc;
255 }
256
257 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
258 if( SQLITE_ROW==sqlite3_step(pStmt) ){
259 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
260 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
261 }
262 }
263
264 return sqlite3_finalize(pStmt);
265}
266
267void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
268
269/*
270** Implementation of the scalar function fts3_tokenizer_internal_test().
271** This function is used for testing only, it is not included in the
272** build unless SQLITE_TEST is defined.
273**
274** The purpose of this is to test that the fts3_tokenizer() function
275** can be used as designed by the C-code in the queryTokenizer() and
276** registerTokenizer() functions above. These two functions are repeated
277** in the README.tokenizers file as an example, so it is important to
278** test them.
279**
280** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
281** function with no arguments. An assert() will fail if a problem is
282** detected. i.e.:
283**
284** SELECT fts3_tokenizer_internal_test();
285**
286*/
287static void intTestFunc(
288 sqlite3_context *context,
289 int argc,
290 sqlite3_value **argv
291){
292 int rc;
293 const sqlite3_tokenizer_module *p1;
294 const sqlite3_tokenizer_module *p2;
295 sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
296
297 /* Test the query function */
298 sqlite3Fts3SimpleTokenizerModule(&p1);
299 rc = queryTokenizer(db, "simple", &p2);
300 assert( rc==SQLITE_OK );
301 assert( p1==p2 );
302 rc = queryTokenizer(db, "nosuchtokenizer", &p2);
303 assert( rc==SQLITE_ERROR );
304 assert( p2==0 );
305 assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
306
307 /* Test the storage function */
308 rc = registerTokenizer(db, "nosuchtokenizer", p1);
309 assert( rc==SQLITE_OK );
310 rc = queryTokenizer(db, "nosuchtokenizer", &p2);
311 assert( rc==SQLITE_OK );
312 assert( p2==p1 );
313
314 sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
315}
316
317#endif
318
319/*
320** Set up SQL objects in database db used to access the contents of
321** the hash table pointed to by argument pHash. The hash table must
322** have been initialised to use string keys, and to take a private copy
323** of the key when a value is inserted. i.e. by a call similar to:
324**
325** sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
326**
327** This function adds a scalar function (see header comment above
328** scalarFunc() in this file for details) and, if ENABLE_TABLE is
329** defined at compilation time, a temporary virtual table (see header
330** comment above struct HashTableVtab) to the database schema. Both
331** provide read/write access to the contents of *pHash.
332**
333** The third argument to this function, zName, is used as the name
334** of both the scalar and, if created, the virtual table.
335*/
336int sqlite3Fts3InitHashTable(
337 sqlite3 *db,
338 fts3Hash *pHash,
339 const char *zName
340){
341 int rc = SQLITE_OK;
342 void *p = (void *)pHash;
343 const int any = SQLITE_ANY;
344 char *zTest = 0;
345 char *zTest2 = 0;
346
347#ifdef SQLITE_TEST
348 void *pdb = (void *)db;
349 zTest = sqlite3_mprintf("%s_test", zName);
350 zTest2 = sqlite3_mprintf("%s_internal_test", zName);
351 if( !zTest || !zTest2 ){
352 rc = SQLITE_NOMEM;
353 }
354#endif
355
356 if( rc!=SQLITE_OK
357 || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
358 || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
359#ifdef SQLITE_TEST
360 || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
361 || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
362 || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
363#endif
364 );
365
366 sqlite3_free(zTest);
367 sqlite3_free(zTest2);
368 return rc;
369}
370
371#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h
new file mode 100644
index 0000000..4faef56
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer.h
@@ -0,0 +1,145 @@
1/*
2** 2006 July 10
3**
4** The author disclaims copyright to this source code.
5**
6*************************************************************************
7** Defines the interface to tokenizers used by fulltext-search. There
8** are three basic components:
9**
10** sqlite3_tokenizer_module is a singleton defining the tokenizer
11** interface functions. This is essentially the class structure for
12** tokenizers.
13**
14** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
15** including customization information defined at creation time.
16**
17** sqlite3_tokenizer_cursor is generated by a tokenizer to iterate
18** over the tokens of a particular input.
19*/
20#ifndef _FTS3_TOKENIZER_H_
21#define _FTS3_TOKENIZER_H_
22
23/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
24** If tokenizers are to be allowed to call sqlite3_*() functions, then
25** we will need a way to register the API consistently.
26*/
27#include "sqlite3.h"
28
29/*
30** Structures used by the tokenizer interface. When a new tokenizer
31** implementation is registered, the caller provides a pointer to
32** an sqlite3_tokenizer_module containing pointers to the callback
33** functions that make up an implementation.
34**
35** When an fts3 table is created, it passes any arguments passed to
36** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
37** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
38** implementation. The xCreate() function in turn returns an
39** sqlite3_tokenizer structure representing the specific tokenizer to
40** be used for the fts3 table (customized by the tokenizer clause arguments).
41**
42** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
43** method is called. It returns an sqlite3_tokenizer_cursor object
44** that may be used to tokenize a specific input buffer based on
45** the tokenization rules supplied by a specific sqlite3_tokenizer
46** object.
47*/
48typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
49typedef struct sqlite3_tokenizer sqlite3_tokenizer;
50typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
51
52struct sqlite3_tokenizer_module {
53
54 /*
55 ** Structure version. Should always be set to 0.
56 */
57 int iVersion;
58
59 /*
60 ** Create a new tokenizer. The values in the argv[] array are the
61 ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
62 ** TABLE statement that created the fts3 table. For example, if
63 ** the following SQL is executed:
64 **
65 ** CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
66 **
67 ** then argc is set to 2, and the argv[] array contains pointers
68 ** to the strings "arg1" and "arg2".
69 **
70 ** This method should return either SQLITE_OK (0), or an SQLite error
71 ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
72 ** to point at the newly created tokenizer structure. The generic
73 ** sqlite3_tokenizer.pModule variable should not be initialised by
74 ** this callback. The caller will do so.
75 */
76 int (*xCreate)(
77 int argc, /* Size of argv array */
78 const char *const*argv, /* Tokenizer argument strings */
79 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
80 );
81
82 /*
83 ** Destroy an existing tokenizer. The fts3 module calls this method
84 ** exactly once for each successful call to xCreate().
85 */
86 int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
87
88 /*
89 ** Create a tokenizer cursor to tokenize an input buffer. The caller
90 ** is responsible for ensuring that the input buffer remains valid
91 ** until the cursor is closed (using the xClose() method).
92 */
93 int (*xOpen)(
94 sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
95 const char *pInput, int nBytes, /* Input buffer */
96 sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
97 );
98
99 /*
100 ** Destroy an existing tokenizer cursor. The fts3 module calls this
101 ** method exactly once for each successful call to xOpen().
102 */
103 int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
104
105 /*
106 ** Retrieve the next token from the tokenizer cursor pCursor. This
107 ** method should either return SQLITE_OK and set the values of the
108 ** "OUT" variables identified below, or SQLITE_DONE to indicate that
109 ** the end of the buffer has been reached, or an SQLite error code.
110 **
111 ** *ppToken should be set to point at a buffer containing the
112 ** normalized version of the token (i.e. after any case-folding and/or
113 ** stemming has been performed). *pnBytes should be set to the length
114 ** of this buffer in bytes. The input text that generated the token is
115 ** identified by the byte offsets returned in *piStartOffset and
116 ** *piEndOffset.
117 **
118  ** The buffer that *ppToken is set to point at is managed by the tokenizer
119 ** implementation. It is only required to be valid until the next call
120 ** to xNext() or xClose().
121 */
122 /* TODO(shess) current implementation requires pInput to be
123 ** nul-terminated. This should either be fixed, or pInput/nBytes
124 ** should be converted to zInput.
125 */
126 int (*xNext)(
127 sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */
128 const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
129 int *piStartOffset, /* OUT: Byte offset of token in input buffer */
130 int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */
131 int *piPosition /* OUT: Number of tokens returned before this one */
132 );
133};
134
135struct sqlite3_tokenizer {
136 const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
137 /* Tokenizer implementations will typically add additional fields */
138};
139
140struct sqlite3_tokenizer_cursor {
141 sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
142 /* Tokenizer implementations will typically add additional fields */
143};
144
145#endif /* _FTS3_TOKENIZER_H_ */
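
To make the interface concrete, here is a skeleton of a hypothetical custom
tokenizer that splits on ASCII space characters only and folds ASCII case.
It follows the same structure as the built-in tokenizers and is a starting
point rather than production code (in particular, it is not UTF-8 aware):

    #include <stdlib.h>
    #include <string.h>
    #include <ctype.h>
    #include "fts3_tokenizer.h"

    typedef struct space_tokenizer {
      sqlite3_tokenizer base;
    } space_tokenizer;

    typedef struct space_cursor {
      sqlite3_tokenizer_cursor base;
      const char *zInput;            /* buffer being tokenized */
      int nInput;                    /* size of zInput in bytes */
      int iOffset;                   /* current read position */
      int iToken;                    /* position of the next token */
      char *zToken;                  /* holds the current token */
      int nAlloc;                    /* bytes allocated at zToken */
    } space_cursor;

    static int spaceCreate(int argc, const char *const*argv,
                           sqlite3_tokenizer **ppTokenizer){
      space_tokenizer *p = (space_tokenizer *)calloc(1, sizeof(*p));
      (void)argc; (void)argv;        /* this tokenizer takes no arguments */
      if( !p ) return SQLITE_NOMEM;
      *ppTokenizer = &p->base;
      return SQLITE_OK;
    }

    static int spaceDestroy(sqlite3_tokenizer *pTokenizer){
      free(pTokenizer);
      return SQLITE_OK;
    }

    static int spaceOpen(sqlite3_tokenizer *pTokenizer, const char *zInput,
                         int nInput, sqlite3_tokenizer_cursor **ppCursor){
      space_cursor *c = (space_cursor *)calloc(1, sizeof(*c));
      if( !c ) return SQLITE_NOMEM;
      c->zInput = zInput;
      if( zInput==0 ){
        c->nInput = 0;
      }else if( nInput<0 ){
        c->nInput = (int)strlen(zInput);
      }else{
        c->nInput = nInput;
      }
      *ppCursor = &c->base;
      return SQLITE_OK;
    }

    static int spaceClose(sqlite3_tokenizer_cursor *pCursor){
      space_cursor *c = (space_cursor *)pCursor;
      free(c->zToken);
      free(c);
      return SQLITE_OK;
    }

    static int spaceNext(sqlite3_tokenizer_cursor *pCursor,
                         const char **ppToken, int *pnBytes,
                         int *piStart, int *piEnd, int *piPos){
      space_cursor *c = (space_cursor *)pCursor;
      int iStart, i, nToken;

      /* Skip delimiters, then measure the next run of token characters */
      while( c->iOffset<c->nInput && c->zInput[c->iOffset]==' ' ){
        c->iOffset++;
      }
      if( c->iOffset>=c->nInput ) return SQLITE_DONE;
      iStart = c->iOffset;
      while( c->iOffset<c->nInput && c->zInput[c->iOffset]!=' ' ){
        c->iOffset++;
      }
      nToken = c->iOffset - iStart;

      if( nToken>c->nAlloc ){
        char *zNew = (char *)realloc(c->zToken, nToken);
        if( !zNew ) return SQLITE_NOMEM;
        c->zToken = zNew;
        c->nAlloc = nToken;
      }
      for(i=0; i<nToken; i++){
        unsigned char ch = (unsigned char)c->zInput[iStart+i];
        c->zToken[i] = ch<0x80 ? (char)tolower(ch) : (char)ch;
      }
      *ppToken = c->zToken;
      *pnBytes = nToken;
      *piStart = iStart;
      *piEnd = c->iOffset;
      *piPos = c->iToken++;
      return SQLITE_OK;
    }

    static const sqlite3_tokenizer_module spaceTokenizerModule = {
      0, spaceCreate, spaceDestroy, spaceOpen, spaceClose, spaceNext
    };

spaceTokenizerModule would then be registered through the fts3_tokenizer()
scalar function in the usual way.
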
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c
new file mode 100644
index 0000000..f53cc1d
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/fts3_tokenizer1.c
@@ -0,0 +1,229 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** Implementation of the "simple" full-text-search tokenizer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS3 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS3 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts3_tokenizer.h"
35
36typedef struct simple_tokenizer {
37 sqlite3_tokenizer base;
38 char delim[128]; /* flag ASCII delimiters */
39} simple_tokenizer;
40
41typedef struct simple_tokenizer_cursor {
42 sqlite3_tokenizer_cursor base;
43 const char *pInput; /* input we are tokenizing */
44 int nBytes; /* size of the input */
45 int iOffset; /* current position in pInput */
46 int iToken; /* index of next token to be returned */
47 char *pToken; /* storage for current token */
48  int nTokenAllocated;       /* space allocated to pToken buffer */
49} simple_tokenizer_cursor;
50
51
52/* Forward declaration */
53static const sqlite3_tokenizer_module simpleTokenizerModule;
54
55static int simpleDelim(simple_tokenizer *t, unsigned char c){
56 return c<0x80 && t->delim[c];
57}
58
59/*
60** Create a new tokenizer instance.
61*/
62static int simpleCreate(
63 int argc, const char * const *argv,
64 sqlite3_tokenizer **ppTokenizer
65){
66 simple_tokenizer *t;
67
68 t = (simple_tokenizer *) calloc(sizeof(*t), 1);
69 if( t==NULL ) return SQLITE_NOMEM;
70
71 /* TODO(shess) Delimiters need to remain the same from run to run,
72 ** else we need to reindex. One solution would be a meta-table to
73 ** track such information in the database, then we'd only want this
74 ** information on the initial create.
75 */
76 if( argc>1 ){
77 int i, n = strlen(argv[1]);
78 for(i=0; i<n; i++){
79 unsigned char ch = argv[1][i];
80 /* We explicitly don't support UTF-8 delimiters for now. */
81 if( ch>=0x80 ){
82 free(t);
83 return SQLITE_ERROR;
84 }
85 t->delim[ch] = 1;
86 }
87 } else {
88 /* Mark non-alphanumeric ASCII characters as delimiters */
89 int i;
90 for(i=1; i<0x80; i++){
91 t->delim[i] = !isalnum(i);
92 }
93 }
94
95 *ppTokenizer = &t->base;
96 return SQLITE_OK;
97}
98
99/*
100** Destroy a tokenizer
101*/
102static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
103 free(pTokenizer);
104 return SQLITE_OK;
105}
106
107/*
108** Prepare to begin tokenizing a particular string. The input
109** string to be tokenized is pInput[0..nBytes-1]. A cursor
110** used to incrementally tokenize this string is returned in
111** *ppCursor.
112*/
113static int simpleOpen(
114 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
115 const char *pInput, int nBytes, /* String to be tokenized */
116 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
117){
118 simple_tokenizer_cursor *c;
119
120 c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
121 if( c==NULL ) return SQLITE_NOMEM;
122
123 c->pInput = pInput;
124 if( pInput==0 ){
125 c->nBytes = 0;
126 }else if( nBytes<0 ){
127 c->nBytes = (int)strlen(pInput);
128 }else{
129 c->nBytes = nBytes;
130 }
131 c->iOffset = 0; /* start tokenizing at the beginning */
132 c->iToken = 0;
133 c->pToken = NULL; /* no space allocated, yet. */
134 c->nTokenAllocated = 0;
135
136 *ppCursor = &c->base;
137 return SQLITE_OK;
138}
139
140/*
141** Close a tokenization cursor previously opened by a call to
142** simpleOpen() above.
143*/
144static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
145 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
146 free(c->pToken);
147 free(c);
148 return SQLITE_OK;
149}
150
151/*
152** Extract the next token from a tokenization cursor. The cursor must
153** have been opened by a prior call to simpleOpen().
154*/
155static int simpleNext(
156 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
157 const char **ppToken, /* OUT: *ppToken is the token text */
158 int *pnBytes, /* OUT: Number of bytes in token */
159 int *piStartOffset, /* OUT: Starting offset of token */
160 int *piEndOffset, /* OUT: Ending offset of token */
161 int *piPosition /* OUT: Position integer of token */
162){
163 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
164 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
165 unsigned char *p = (unsigned char *)c->pInput;
166
167 while( c->iOffset<c->nBytes ){
168 int iStartOffset;
169
170 /* Scan past delimiter characters */
171 while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
172 c->iOffset++;
173 }
174
175 /* Count non-delimiter characters. */
176 iStartOffset = c->iOffset;
177 while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
178 c->iOffset++;
179 }
180
181 if( c->iOffset>iStartOffset ){
182 int i, n = c->iOffset-iStartOffset;
183      if( n>c->nTokenAllocated ){
184        char *pNew = realloc(c->pToken, n+20); /* keep old buffer on failure */
185        if( pNew==NULL ) return SQLITE_NOMEM;
186        c->pToken = pNew; c->nTokenAllocated = n+20;
187      }
188 for(i=0; i<n; i++){
189 /* TODO(shess) This needs expansion to handle UTF-8
190 ** case-insensitivity.
191 */
192 unsigned char ch = p[iStartOffset+i];
193 c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
194 }
195 *ppToken = c->pToken;
196 *pnBytes = n;
197 *piStartOffset = iStartOffset;
198 *piEndOffset = c->iOffset;
199 *piPosition = c->iToken++;
200
201 return SQLITE_OK;
202 }
203 }
204 return SQLITE_DONE;
205}
206
207/*
208** The set of routines that implement the simple tokenizer
209*/
210static const sqlite3_tokenizer_module simpleTokenizerModule = {
211 0,
212 simpleCreate,
213 simpleDestroy,
214 simpleOpen,
215 simpleClose,
216 simpleNext,
217};
218
219/*
220** Allocate a new simple tokenizer. Return a pointer to the new
221** tokenizer in *ppModule
222*/
223void sqlite3Fts3SimpleTokenizerModule(
224 sqlite3_tokenizer_module const**ppModule
225){
226 *ppModule = &simpleTokenizerModule;
227}
228
229#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
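
End to end, the built-in tokenizers need no C-level registration; plain SQL
selects them by name. A minimal sketch that creates a table using the simple
tokenizer, inserts a row, and runs a full-text query (the table and column
names are illustrative, and the build must include FTS3):

    #include <stdio.h>
    #include "sqlite3.h"

    static int printRow(void *pArg, int nCol, char **azVal, char **azCol){
      printf("matched rowid: %s\n", azVal[0]);
      return 0;
    }

    int main(void){
      sqlite3 *db;
      char *zErr = 0;

      if( sqlite3_open(":memory:", &db)!=SQLITE_OK ) return 1;
      sqlite3_exec(db,
          "CREATE VIRTUAL TABLE doc USING fts3(content, tokenizer simple);"
          "INSERT INTO doc(content) VALUES('The quick brown fox');",
          0, 0, &zErr);
      /* 'QUICK' matches because simpleNext() folds ASCII case on both
      ** the indexed text and the query term. */
      sqlite3_exec(db, "SELECT rowid FROM doc WHERE content MATCH 'QUICK'",
          printRow, 0, &zErr);
      if( zErr ){ fprintf(stderr, "error: %s\n", zErr); sqlite3_free(zErr); }
      sqlite3_close(db);
      return 0;
    }
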
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl
new file mode 100644
index 0000000..cfea5d2
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/mkfts3amal.tcl
@@ -0,0 +1,116 @@
1#!/usr/bin/tclsh
2#
3# This script builds a single C code file holding all of FTS3 code.
4# The name of the output file is fts3amal.c. To build this file,
5# first do:
6#
7# make target_source
8#
9# The make target above moves all of the source code files into
10# a subdirectory named "tsrc". (This script expects to find the files
11# there and will not work if they are not found.)
12#
13# After the "tsrc" directory has been created and populated, run
14# this script:
15#
16# tclsh mkfts3amal.tcl
17#
18# The amalgamated FTS3 code will be written into fts3amal.c
19#
20
21# Open the output file and write a header comment at the beginning
22# of the file.
23#
24set out [open fts3amal.c w]
25set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1]
26puts $out [subst \
27{/******************************************************************************
28** This file is an amalgamation of separate C source files from the SQLite
29** Full Text Search extension 2 (fts3). By combining all the individual C
30** code files into this single large file, the entire code can be compiled
31** as one translation unit. This allows many compilers to do optimizations
32** that would not be possible if the files were compiled separately. It also
33** makes the code easier to import into other projects.
34**
35** This amalgamation was generated on $today.
36*/}]
37
38# These are the header files used by FTS3. The first time any of these
39# files are seen in a #include statement in the C code, include the complete
40# text of the file in-line. The file only needs to be included once.
41#
42foreach hdr {
43 fts3.h
44 fts3_hash.h
45 fts3_tokenizer.h
46 sqlite3.h
47 sqlite3ext.h
48} {
49 set available_hdr($hdr) 1
50}
51
52# 78 stars used for comment formatting.
53set s78 \
54{*****************************************************************************}
55
56# Insert a comment into the code
57#
58proc section_comment {text} {
59 global out s78
60 set n [string length $text]
61 set nstar [expr {60 - $n}]
62 set stars [string range $s78 0 $nstar]
63 puts $out "/************** $text $stars/"
64}
65
66# Read the source file named $filename and write it into the
67# fts3amal.c output file. If any #include statements are seen,
68# process them appropriately.
69#
70proc copy_file {filename} {
71 global seen_hdr available_hdr out
72 set tail [file tail $filename]
73 section_comment "Begin file $tail"
74 set in [open $filename r]
75 while {![eof $in]} {
76 set line [gets $in]
77 if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
78 if {[info exists available_hdr($hdr)]} {
79 if {$available_hdr($hdr)} {
80 section_comment "Include $hdr in the middle of $tail"
81 copy_file tsrc/$hdr
82 section_comment "Continuing where we left off in $tail"
83 }
84 } elseif {![info exists seen_hdr($hdr)]} {
85 set seen_hdr($hdr) 1
86 puts $out $line
87 }
88 } elseif {[regexp {^#ifdef __cplusplus} $line]} {
89 puts $out "#if 0"
90 } elseif {[regexp {^#line} $line]} {
91 # Skip #line directives.
92 } else {
93 puts $out $line
94 }
95 }
96 close $in
97 section_comment "End of $tail"
98}
99
100
101# Process the source files. Process files containing commonly
102# used subroutines first in order to help the compiler find
103# inlining opportunities.
104#
105foreach file {
106 fts3.c
107 fts3_hash.c
108 fts3_porter.c
109 fts3_tokenizer.c
110 fts3_tokenizer1.c
111 fts3_icu.c
112} {
113 copy_file tsrc/$file
114}
115
116close $out