aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2
diff options
context:
space:
mode:
Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext/fts2')
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers134
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt4
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c5936
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h26
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c369
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h112
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c257
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c642
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c371
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h145
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c229
-rw-r--r--libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl116
12 files changed, 8341 insertions, 0 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers
new file mode 100644
index 0000000..6625b31
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.tokenizers
@@ -0,0 +1,134 @@
1
21. FTS2 Tokenizers
3
4 When creating a new full-text table, FTS2 allows the user to select
5 the text tokenizer implementation to be used when indexing text
6 by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
7 statement:
8
9 CREATE VIRTUAL TABLE <table-name> USING fts2(
10 <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
11 );
12
13 The built-in tokenizers (valid values to pass as <tokenizer name>) are
14 "simple" and "porter".
15
16 <tokenizer-args> should consist of zero or more white-space separated
17 arguments to pass to the selected tokenizer implementation. The
18 interpretation of the arguments, if any, depends on the individual
19 tokenizer.
20
212. Custom Tokenizers
22
23 FTS2 allows users to provide custom tokenizer implementations. The
24 interface used to create a new tokenizer is defined and described in
25 the fts2_tokenizer.h source file.
26
27 Registering a new FTS2 tokenizer is similar to registering a new
28 virtual table module with SQLite. The user passes a pointer to a
29 structure containing pointers to various callback functions that
30 make up the implementation of the new tokenizer type. For tokenizers,
31 the structure (defined in fts2_tokenizer.h) is called
32 "sqlite3_tokenizer_module".
33
34 FTS2 does not expose a C-function that users call to register new
35 tokenizer types with a database handle. Instead, the pointer must
36 be encoded as an SQL blob value and passed to FTS2 through the SQL
37 engine by evaluating a special scalar function, "fts2_tokenizer()".
38 The fts2_tokenizer() function may be called with one or two arguments,
39 as follows:
40
41 SELECT fts2_tokenizer(<tokenizer-name>);
42 SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
43
44 Where <tokenizer-name> is a string identifying the tokenizer and
45 <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
46 structure encoded as an SQL blob. If the second argument is present,
47 it is registered as tokenizer <tokenizer-name> and a copy of it
48 returned. If only one argument is passed, a pointer to the tokenizer
49 implementation currently registered as <tokenizer-name> is returned,
50 encoded as a blob. Or, if no such tokenizer exists, an SQL exception
51 (error) is raised.
52
53 SECURITY: If the fts2 extension is used in an environment where potentially
54 malicious users may execute arbitrary SQL (i.e. gears), they should be
55 prevented from invoking the fts2_tokenizer() function, possibly using the
56 authorisation callback.
57
58 See "Sample code" below for an example of calling the fts2_tokenizer()
59 function from C code.
60
613. ICU Library Tokenizers
62
63 If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
64 symbol defined, then there exists a built-in tokenizer named "icu"
65 implemented using the ICU library. The first argument passed to the
66 xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
67 an ICU locale identifier. For example "tr_TR" for Turkish as used
68 in Turkey, or "en_AU" for English as used in Australia. For example:
69
70 "CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
71
72 The ICU tokenizer implementation is very simple. It splits the input
73 text according to the ICU rules for finding word boundaries and discards
74 any tokens that consist entirely of white-space. This may be suitable
75 for some applications in some locales, but not all. If more complex
76 processing is required, for example to implement stemming or
77 discard punctuation, this can be done by creating a tokenizer
78 implementation that uses the ICU tokenizer as part of its implementation.
79
80 When using the ICU tokenizer this way, it is safe to overwrite the
81 contents of the strings returned by the xNext() method (see
82 fts2_tokenizer.h).
83
844. Sample code.
85
86 The following two code samples illustrate the way C code should invoke
87 the fts2_tokenizer() scalar function:
88
89 int registerTokenizer(
90 sqlite3 *db,
91 char *zName,
92 const sqlite3_tokenizer_module *p
93 ){
94 int rc;
95 sqlite3_stmt *pStmt;
96 const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
97
98 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
99 if( rc!=SQLITE_OK ){
100 return rc;
101 }
102
103 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
104 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
105 sqlite3_step(pStmt);
106
107 return sqlite3_finalize(pStmt);
108 }
109
110 int queryTokenizer(
111 sqlite3 *db,
112 char *zName,
113 const sqlite3_tokenizer_module **pp
114 ){
115 int rc;
116 sqlite3_stmt *pStmt;
117 const char zSql[] = "SELECT fts2_tokenizer(?)";
118
119 *pp = 0;
120 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
121 if( rc!=SQLITE_OK ){
122 return rc;
123 }
124
125 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
126 if( SQLITE_ROW==sqlite3_step(pStmt) ){
127 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
128 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
129 }
130 }
131
132 return sqlite3_finalize(pStmt);
133 }
134
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt
new file mode 100644
index 0000000..517a2a0
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/README.txt
@@ -0,0 +1,4 @@
1This folder contains source code to the second full-text search
2extension for SQLite. While the API is the same, this version uses a
3substantially different storage schema from fts1, so tables will need
4to be rebuilt.
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c
new file mode 100644
index 0000000..65ad173
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.c
@@ -0,0 +1,5936 @@
1/* fts2 has a design flaw which can lead to database corruption (see
2** below). It is recommended not to use it any longer, instead use
3** fts3 (or higher). If you believe that your use of fts2 is safe,
4** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
5*/
6#ifndef SQLITE_ENABLE_BROKEN_FTS2
7#error fts2 has a design flaw and has been deprecated.
8#endif
9/* The flaw is that fts2 uses the content table's unaliased rowid as
10** the unique docid. fts2 embeds the rowid in the index it builds,
11** and expects the rowid to not change. The SQLite VACUUM operation
12** will renumber such rowids, thereby breaking fts2. If you are using
13** fts2 in a system which has disabled VACUUM, then you can continue
14** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
15** VACUUM, though systems using auto_vacuum are unlikely to invoke
16** VACUUM.
17**
18** Unlike fts1, which is safe across VACUUM if you never delete
19** documents, fts2 has a second exposure to this flaw, in the segments
20** table. So fts2 should be considered unsafe across VACUUM in all
21** cases.
22*/
23
24/*
25** 2006 Oct 10
26**
27** The author disclaims copyright to this source code. In place of
28** a legal notice, here is a blessing:
29**
30** May you do good and not evil.
31** May you find forgiveness for yourself and forgive others.
32** May you share freely, never taking more than you give.
33**
34******************************************************************************
35**
36** This is an SQLite module implementing full-text search.
37*/
38
39/*
40** The code in this file is only compiled if:
41**
42** * The FTS2 module is being built as an extension
43** (in which case SQLITE_CORE is not defined), or
44**
45** * The FTS2 module is being built into the core of
46** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
47*/
48
49/* TODO(shess) Consider exporting this comment to an HTML file or the
50** wiki.
51*/
52/* The full-text index is stored in a series of b+tree (-like)
53** structures called segments which map terms to doclists. The
54** structures are like b+trees in layout, but are constructed from the
55** bottom up in optimal fashion and are not updatable. Since trees
56** are built from the bottom up, things will be described from the
57** bottom up.
58**
59**
60**** Varints ****
61** The basic unit of encoding is a variable-length integer called a
62** varint. We encode variable-length integers in little-endian order
63** using seven bits * per byte as follows:
64**
65** KEY:
66** A = 0xxxxxxx 7 bits of data and one flag bit
67** B = 1xxxxxxx 7 bits of data and one flag bit
68**
69** 7 bits - A
70** 14 bits - BA
71** 21 bits - BBA
72** and so on.
73**
74** This is identical to how sqlite encodes varints (see util.c).
75**
76**
77**** Document lists ****
78** A doclist (document list) holds a docid-sorted list of hits for a
79** given term. Doclists hold docids, and can optionally associate
80** token positions and offsets with docids.
81**
82** A DL_POSITIONS_OFFSETS doclist is stored like this:
83**
84** array {
85** varint docid;
86** array { (position list for column 0)
87** varint position; (delta from previous position plus POS_BASE)
88** varint startOffset; (delta from previous startOffset)
89** varint endOffset; (delta from startOffset)
90** }
91** array {
92** varint POS_COLUMN; (marks start of position list for new column)
93** varint column; (index of new column)
94** array {
95** varint position; (delta from previous position plus POS_BASE)
96** varint startOffset;(delta from previous startOffset)
97** varint endOffset; (delta from startOffset)
98** }
99** }
100** varint POS_END; (marks end of positions for this document.
101** }
102**
103** Here, array { X } means zero or more occurrences of X, adjacent in
104** memory. A "position" is an index of a token in the token stream
105** generated by the tokenizer, while an "offset" is a byte offset,
106** both based at 0. Note that POS_END and POS_COLUMN occur in the
107** same logical place as the position element, and act as sentinels
108** ending a position list array.
109**
110** A DL_POSITIONS doclist omits the startOffset and endOffset
111** information. A DL_DOCIDS doclist omits both the position and
112** offset information, becoming an array of varint-encoded docids.
113**
114** On-disk data is stored as type DL_DEFAULT, so we don't serialize
115** the type. Due to how deletion is implemented in the segmentation
116** system, on-disk doclists MUST store at least positions.
117**
118**
119**** Segment leaf nodes ****
120** Segment leaf nodes store terms and doclists, ordered by term. Leaf
121** nodes are written using LeafWriter, and read using LeafReader (to
122** iterate through a single leaf node's data) and LeavesReader (to
123** iterate through a segment's entire leaf layer). Leaf nodes have
124** the format:
125**
126** varint iHeight; (height from leaf level, always 0)
127** varint nTerm; (length of first term)
128** char pTerm[nTerm]; (content of first term)
129** varint nDoclist; (length of term's associated doclist)
130** char pDoclist[nDoclist]; (content of doclist)
131** array {
132** (further terms are delta-encoded)
133** varint nPrefix; (length of prefix shared with previous term)
134** varint nSuffix; (length of unshared suffix)
135** char pTermSuffix[nSuffix];(unshared suffix of next term)
136** varint nDoclist; (length of term's associated doclist)
137** char pDoclist[nDoclist]; (content of doclist)
138** }
139**
140** Here, array { X } means zero or more occurrences of X, adjacent in
141** memory.
142**
143** Leaf nodes are broken into blocks which are stored contiguously in
144** the %_segments table in sorted order. This means that when the end
145** of a node is reached, the next term is in the node with the next
146** greater node id.
147**
148** New data is spilled to a new leaf node when the current node
149** exceeds LEAF_MAX bytes (default 2048). New data which itself is
150** larger than STANDALONE_MIN (default 1024) is placed in a standalone
151** node (a leaf node with a single term and doclist). The goal of
152** these settings is to pack together groups of small doclists while
153** making it efficient to directly access large doclists. The
154** assumption is that large doclists represent terms which are more
155** likely to be query targets.
156**
157** TODO(shess) It may be useful for blocking decisions to be more
158** dynamic. For instance, it may make more sense to have a 2.5k leaf
159** node rather than splitting into 2k and .5k nodes. My intuition is
160** that this might extend through 2x or 4x the pagesize.
161**
162**
163**** Segment interior nodes ****
164** Segment interior nodes store blockids for subtree nodes and terms
165** to describe what data is stored by each subtree.  Interior
166** nodes are written using InteriorWriter, and read using
167** InteriorReader. InteriorWriters are created as needed when
168** SegmentWriter creates new leaf nodes, or when an interior node
169** itself grows too big and must be split. The format of interior
170** nodes:
171**
172** varint iHeight; (height from leaf level, always >0)
173** varint iBlockid; (block id of node's leftmost subtree)
174** optional {
175** varint nTerm; (length of first term)
176** char pTerm[nTerm]; (content of first term)
177** array {
178** (further terms are delta-encoded)
179** varint nPrefix; (length of shared prefix with previous term)
180** varint nSuffix; (length of unshared suffix)
181** char pTermSuffix[nSuffix]; (unshared suffix of next term)
182** }
183** }
184**
185** Here, optional { X } means an optional element, while array { X }
186** means zero or more occurrences of X, adjacent in memory.
187**
188** An interior node encodes n terms separating n+1 subtrees. The
189** subtree blocks are contiguous, so only the first subtree's blockid
190** is encoded. The subtree at iBlockid will contain all terms less
191** than the first term encoded (or all terms if no term is encoded).
192** Otherwise, for terms greater than or equal to pTerm[i] but less
193** than pTerm[i+1], the subtree for that term will be rooted at
194** iBlockid+i. Interior nodes only store enough term data to
195** distinguish adjacent children (if the rightmost term of the left
196** child is "something", and the leftmost term of the right child is
197** "wicked", only "w" is stored).
198**
199** New data is spilled to a new interior node at the same height when
200** the current node exceeds INTERIOR_MAX bytes (default 2048).
201** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
202** interior nodes and making the tree too skinny. The interior nodes
203** at a given height are naturally tracked by interior nodes at
204** height+1, and so on.
205**
206**
207**** Segment directory ****
208** The segment directory in table %_segdir stores meta-information for
209** merging and deleting segments, and also the root node of the
210** segment's tree.
211**
212** The root node is the top node of the segment's tree after encoding
213** the entire segment, restricted to ROOT_MAX bytes (default 1024).
214** This could be either a leaf node or an interior node. If the top
215** node requires more than ROOT_MAX bytes, it is flushed to %_segments
216** and a new root interior node is generated (which should always fit
217** within ROOT_MAX because it only needs space for 2 varints, the
218** height and the blockid of the previous root).
219**
220** The meta-information in the segment directory is:
221** level - segment level (see below)
222** idx - index within level
223** - (level,idx uniquely identify a segment)
224** start_block - first leaf node
225** leaves_end_block - last leaf node
226** end_block - last block (including interior nodes)
227** root - contents of root node
228**
229** If the root node is a leaf node, then start_block,
230** leaves_end_block, and end_block are all 0.
231**
232**
233**** Segment merging ****
234** To amortize update costs, segments are grouped into levels and
235** merged in batches.  Each increase in level represents exponentially
236** more documents.
237**
238** New documents (actually, document updates) are tokenized and
239** written individually (using LeafWriter) to a level 0 segment, with
240** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
241** level 0 segments are merged into a single level 1 segment. Level 1
242** is populated like level 0, and eventually MERGE_COUNT level 1
243** segments are merged to a single level 2 segment (representing
244** MERGE_COUNT^2 updates), and so on.
245**
246** A segment merge traverses all segments at a given level in
247** parallel, performing a straightforward sorted merge. Since segment
248** leaf nodes are written into the %_segments table in order, this
249** merge traverses the underlying sqlite disk structures efficiently.
250** After the merge, all segment blocks from the merged level are
251** deleted.
252**
253** MERGE_COUNT controls how often we merge segments. 16 seems to be
254** somewhat of a sweet spot for insertion performance. 32 and 64 show
255** very similar performance numbers to 16 on insertion, though they're
256** a tiny bit slower (perhaps due to more overhead in merge-time
257** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
258** 16, 2 about 66% slower than 16.
259**
260** At query time, high MERGE_COUNT increases the number of segments
261** which need to be scanned and merged. For instance, with 100k docs
262** inserted:
263**
264** MERGE_COUNT segments
265** 16 25
266** 8 12
267** 4 10
268** 2 6
269**
270** This appears to have only a moderate impact on queries for very
271** frequent terms (which are somewhat dominated by segment merge
272** costs), and infrequent and non-existent terms still seem to be fast
273** even with many segments.
274**
275** TODO(shess) That said, it would be nice to have a better query-side
276** argument for MERGE_COUNT of 16. Also, it's possible/likely that
277** optimizations to things like doclist merging will swing the sweet
278** spot around.
279**
280**
281**
282**** Handling of deletions and updates ****
283** Since we're using a segmented structure, with no docid-oriented
284** index into the term index, we clearly cannot simply update the term
285** index when a document is deleted or updated. For deletions, we
286** write an empty doclist (varint(docid) varint(POS_END)), for updates
287** we simply write the new doclist. Segment merges overwrite older
288** data for a particular docid with newer data, so deletes or updates
289** will eventually overtake the earlier data and knock it out. The
290** query logic likewise merges doclists so that newer data knocks out
291** older data.
292**
293** TODO(shess) Provide a VACUUM type operation to clear out all
294** deletions and duplications. This would basically be a forced merge
295** into a single segment.
296*/
297
298#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
299
300#if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
301# define SQLITE_CORE 1
302#endif
303
304#include <assert.h>
305#include <stdlib.h>
306#include <stdio.h>
307#include <string.h>
308#include <ctype.h>
309
310#include "fts2.h"
311#include "fts2_hash.h"
312#include "fts2_tokenizer.h"
313#include "sqlite3.h"
314#include "sqlite3ext.h"
315SQLITE_EXTENSION_INIT1
316
317
318/* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
319** would be nice to order the file better, perhaps something along the
320** lines of:
321**
322** - utility functions
323** - table setup functions
324** - table update functions
325** - table query functions
326**
327** Put the query functions last because they're likely to reference
328** typedefs or functions from the table update section.
329*/
330
331#if 0
332# define TRACE(A) printf A; fflush(stdout)
333#else
334# define TRACE(A)
335#endif
336
337/* It is not safe to call isspace(), tolower(), or isalnum() on
338** hi-bit-set characters. This is the same solution used in the
339** tokenizer.
340*/
341/* TODO(shess) The snippet-generation code should be using the
342** tokenizer-generated tokens rather than doing its own local
343** tokenization.
344*/
345/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
/* Whitespace test that is safe on any char value: bytes with the high
** bit set are never passed to isspace() (which would be undefined
** behavior for negative values) and are reported as non-space.
*/
static int safe_isspace(char c){
  if( c&0x80 ) return 0;
  return isspace(c);
}
/* Lower-casing that is safe on any char value: bytes with the high bit
** set are returned unchanged rather than being passed to tolower().
*/
static int safe_tolower(char c){
  if( c&0x80 ) return c;
  return tolower(c);
}
/* Alphanumeric test that is safe on any char value: high-bit-set bytes
** are never passed to isalnum() and are reported as non-alphanumeric.
*/
static int safe_isalnum(char c){
  if( c&0x80 ) return 0;
  return isalnum(c);
}
355
/* The three levels of detail a doclist can carry per docid (see the
** "Document lists" section of the file header comment).
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif

/* Sentinel values embedded in position lists.  POS_END and POS_COLUMN
** occupy the same logical slot as a position varint, so real positions
** are biased by POS_BASE when encoded.
*/
enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};

/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file).
*/
#define MERGE_COUNT 16
385
/* utility functions */

/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
** record to prevent errors of the form:
**
**     my_function(SomeType *b){
**       memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
**     }
*/
/* TODO(shess) Obvious candidates for a header file. */
#define CLEAR(b) memset(b, '\0', sizeof(*(b)))

/* SCRAMBLE() poisons a record with 0x55 bytes in debug builds so that
** use-after-destroy is likely to be noticed; it is a no-op otherwise.
*/
#ifndef NDEBUG
# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
#else
# define SCRAMBLE(b)
#endif

/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10
406
407/* Write a 64-bit variable-length integer to memory starting at p[0].
408 * The length of data written will be between 1 and VARINT_MAX bytes.
409 * The number of bytes written is returned. */
410static int putVarint(char *p, sqlite_int64 v){
411 unsigned char *q = (unsigned char *) p;
412 sqlite_uint64 vu = v;
413 do{
414 *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
415 vu >>= 7;
416 }while( vu!=0 );
417 q[-1] &= 0x7f; /* turn off high bit in final byte */
418 assert( q - (unsigned char *)p <= VARINT_MAX );
419 return (int) (q - (unsigned char *)p);
420}
421
422/* Read a 64-bit variable-length integer from memory starting at p[0].
423 * Return the number of bytes read, or 0 on error.
424 * The value is stored in *v. */
425static int getVarint(const char *p, sqlite_int64 *v){
426 const unsigned char *q = (const unsigned char *) p;
427 sqlite_uint64 x = 0, y = 1;
428 while( (*q & 0x80) == 0x80 ){
429 x += y * (*q++ & 0x7f);
430 y <<= 7;
431 if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
432 assert( 0 );
433 return 0;
434 }
435 }
436 x += y * (*q++);
437 *v = (sqlite_int64) x;
438 return (int) (q - (unsigned char *)p);
439}
440
441static int getVarint32(const char *p, int *pi){
442 sqlite_int64 i;
443 int ret = getVarint(p, &i);
444 *pi = (int) i;
445 assert( *pi==i );
446 return ret;
447}
448
/*******************************************************************/
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion.  It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;
467
468static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
469 assert( nCapacity>=0 );
470 pBuffer->nData = 0;
471 pBuffer->nCapacity = nCapacity;
472 pBuffer->pData = nCapacity==0 ? NULL : malloc(nCapacity);
473}
474static void dataBufferReset(DataBuffer *pBuffer){
475 pBuffer->nData = 0;
476}
477static void dataBufferDestroy(DataBuffer *pBuffer){
478 if( pBuffer->pData!=NULL ) free(pBuffer->pData);
479 SCRAMBLE(pBuffer);
480}
/* Ensure the buffer has room for nAddCapacity more bytes beyond the
** data currently stored, growing pData if necessary.  Grows to the
** exact size required, not geometrically.
*/
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    /* NOTE(review): overwriting pData with realloc()'s result leaks the
    ** old block and leaves pData NULL if allocation fails; there is no
    ** error path here, so callers assume allocation succeeds -- confirm
    ** the module's OOM policy before changing this. */
    pBuffer->pData = realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
492static void dataBufferAppend(DataBuffer *pBuffer,
493 const char *pSource, int nSource){
494 assert( nSource>0 && pSource!=NULL );
495 dataBufferExpand(pBuffer, nSource);
496 memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
497 pBuffer->nData += nSource;
498}
499static void dataBufferAppend2(DataBuffer *pBuffer,
500 const char *pSource1, int nSource1,
501 const char *pSource2, int nSource2){
502 assert( nSource1>0 && pSource1!=NULL );
503 assert( nSource2>0 && pSource2!=NULL );
504 dataBufferExpand(pBuffer, nSource1+nSource2);
505 memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
506 memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
507 pBuffer->nData += nSource1+nSource2;
508}
509static void dataBufferReplace(DataBuffer *pBuffer,
510 const char *pSource, int nSource){
511 dataBufferReset(pBuffer);
512 dataBufferAppend(pBuffer, pSource, nSource);
513}
514
/* StringBuffer is a null-terminated version of DataBuffer: the stored
** data always ends with a '\0' byte, which is counted in b.nData.
*/
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;
519
520static void initStringBuffer(StringBuffer *sb){
521 dataBufferInit(&sb->b, 100);
522 dataBufferReplace(&sb->b, "", 1);
523}
524static int stringBufferLength(StringBuffer *sb){
525 return sb->b.nData-1;
526}
527static char *stringBufferData(StringBuffer *sb){
528 return sb->b.pData;
529}
530static void stringBufferDestroy(StringBuffer *sb){
531 dataBufferDestroy(&sb->b);
532}
533
534static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
535 assert( sb->b.nData>0 );
536 if( nFrom>0 ){
537 sb->b.nData--;
538 dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
539 }
540}
541static void append(StringBuffer *sb, const char *zFrom){
542 nappend(sb, zFrom, strlen(zFrom));
543}
544
545/* Append a list of strings separated by commas. */
546static void appendList(StringBuffer *sb, int nString, char **azString){
547 int i;
548 for(i=0; i<nString; ++i){
549 if( i>0 ) append(sb, ", ");
550 append(sb, azString[i]);
551 }
552}
553
554static int endsInWhiteSpace(StringBuffer *p){
555 return stringBufferLength(p)>0 &&
556 safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
557}
558
559/* If the StringBuffer ends in something other than white space, add a
560** single space character to the end.
561*/
562static void appendWhiteSpace(StringBuffer *p){
563 if( stringBufferLength(p)==0 ) return;
564 if( !endsInWhiteSpace(p) ) append(p, " ");
565}
566
/* Remove white space from the end of the StringBuffer. */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    /* Shrink by one byte and overwrite the former last character
    ** (which sits just before the old terminator) with a new '\0'. */
    p->b.pData[--p->b.nData-1] = '\0';
  }
}
573
/*******************************************************************/
/* DLReader is used to read document elements from a doclist.  The
** current docid is cached, so dlrDocid() is fast.  DLReader does not
** own the doclist buffer.
**
** dlrAtEnd - true if there's no more data to read.
** dlrDocid - docid of current document.
** dlrDocData - doclist data for current document (including docid).
** dlrDocDataBytes - length of same.
** dlrAllDataBytes - length of all remaining data.
** dlrPosData - position data for current document.
** dlrPosDataLen - length of pos data for current document (incl POS_END).
** dlrStep - step to current document.
** dlrInit - initialize for doclist of given type against given data.
** dlrDestroy - clean up.
**
** Expected usage is something like:
**
**   DLReader reader;
**   dlrInit(&reader, pData, nData);
**   while( !dlrAtEnd(&reader) ){
**     // calls to dlrDocid() and kin.
**     dlrStep(&reader);
**   }
**   dlrDestroy(&reader);
*/
typedef struct DLReader {
  DocListType iType;     /* detail level of the doclist being read */
  const char *pData;     /* remaining doclist data (not owned) */
  int nData;             /* bytes remaining at pData */

  sqlite_int64 iDocid;   /* docid of the current element (cached) */
  int nElement;          /* byte length of the current element */
} DLReader;
608
609static int dlrAtEnd(DLReader *pReader){
610 assert( pReader->nData>=0 );
611 return pReader->nData==0;
612}
613static sqlite_int64 dlrDocid(DLReader *pReader){
614 assert( !dlrAtEnd(pReader) );
615 return pReader->iDocid;
616}
617static const char *dlrDocData(DLReader *pReader){
618 assert( !dlrAtEnd(pReader) );
619 return pReader->pData;
620}
621static int dlrDocDataBytes(DLReader *pReader){
622 assert( !dlrAtEnd(pReader) );
623 return pReader->nElement;
624}
625static int dlrAllDataBytes(DLReader *pReader){
626 assert( !dlrAtEnd(pReader) );
627 return pReader->nData;
628}
629/* TODO(shess) Consider adding a field to track iDocid varint length
630** to make these two functions faster. This might matter (a tiny bit)
631** for queries.
632*/
633static const char *dlrPosData(DLReader *pReader){
634 sqlite_int64 iDummy;
635 int n = getVarint(pReader->pData, &iDummy);
636 assert( !dlrAtEnd(pReader) );
637 return pReader->pData+n;
638}
639static int dlrPosDataLen(DLReader *pReader){
640 sqlite_int64 iDummy;
641 int n = getVarint(pReader->pData, &iDummy);
642 assert( !dlrAtEnd(pReader) );
643 return pReader->nElement-n;
644}
/* Advance past the current element and decode the next one, updating
** the cached docid and element length.  When iType includes positions,
** the element extends through its POS_END sentinel (plus column and
** offset varints as dictated by iType).
*/
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    /* Docids are delta-encoded relative to the previous element. */
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          /* POS_COLUMN is followed by the new column index. */
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          /* Each position also carries start/end offset varints. */
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
678static void dlrInit(DLReader *pReader, DocListType iType,
679 const char *pData, int nData){
680 assert( pData!=NULL && nData!=0 );
681 pReader->iType = iType;
682 pReader->pData = pData;
683 pReader->nData = nData;
684 pReader->nElement = 0;
685 pReader->iDocid = 0;
686
687 /* Load the first element's data. There must be a first element. */
688 dlrStep(pReader);
689}
/* Release a DLReader.  No heap memory is owned by the reader;
** SCRAMBLE() is presumably a debug-build poisoning macro -- confirm
** against its definition earlier in this file.
*/
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}
693
#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it's convenient in other assertions for
** DLWriter.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );          /* guard against pointer wraparound */
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;          /* docids are delta-encoded */
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);   /* column number */
        }else if( iType>DL_POSITIONS ){
          n += getVarint32(pData+n, &iDummy);   /* start-offset delta */
          n += getVarint32(pData+n, &iDummy);   /* end-offset delta */
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif
733
734/*******************************************************************/
735/* DLWriter is used to write doclist data to a DataBuffer. DLWriter
736** always appends to the buffer and does not own it.
737**
** dlwInit - initialize to write a given type doclist to a buffer.
739** dlwDestroy - clear the writer's memory. Does not free buffer.
740** dlwAppend - append raw doclist data to buffer.
741** dlwCopy - copy next doclist from reader to writer.
742** dlwAdd - construct doclist element and append to buffer.
743** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
744*/
typedef struct DLWriter {
  DocListType iType;          /* type of doclist being written */
  DataBuffer *b;              /* output buffer; never owned by the writer */
  sqlite_int64 iPrevDocid;    /* last docid written, for delta-encoding */
#ifndef NDEBUG
  int has_iPrevDocid;         /* set once any docid has been written */
#endif
} DLWriter;
753
754static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
755 pWriter->b = b;
756 pWriter->iType = iType;
757 pWriter->iPrevDocid = 0;
758#ifndef NDEBUG
759 pWriter->has_iPrevDocid = 0;
760#endif
761}
/* Invalidate a DLWriter.  The buffer it appended to is untouched and
** remains owned by the caller.
*/
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
765/* iFirstDocid is the first docid in the doclist in pData. It is
766** needed because pData may point within a larger doclist, in which
767** case the first item would be delta-encoded.
768**
769** iLastDocid is the final docid in the doclist in pData. It is
770** needed to create the new iPrevDocid for future delta-encoding. The
771** code could decode the passed doclist to recreate iLastDocid, but
772** the only current user (docListMerge) already has decoded this
773** information.
774*/
775/* TODO(shess) This has become just a helper for docListMerge.
776** Consider a refactor to make this cleaner.
777*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  /* A single-varint element is only possible for a docids-only list. */
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  /* Future elements delta-encode from the end of what was appended. */
  pWriter->iPrevDocid = iLastDocid;
}
811static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
812 dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
813 dlrDocid(pReader), dlrDocid(pReader));
814}
/* Append a bare docid (delta-encoded against the previous docid) to a
** DL_DOCIDS doclist.  Position-bearing doclists must use PLWriter.
*/
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}
829
830/*******************************************************************/
831/* PLReader is used to read data from a document's position list. As
832** the caller steps through the list, data is cached so that varints
833** only need to be decoded once.
834**
835** plrInit, plrDestroy - create/destroy a reader.
836** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
837** plrAtEnd - at end of stream, only call plrDestroy once true.
838** plrStep - step to the next element.
839*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;  /* doclist type; offsets valid only for DL_POSITIONS_OFFSETS */
  int iColumn;        /* the last column read */
  int iPosition;      /* the last position read */
  int iStartOffset;   /* the last start offset read */
  int iEndOffset;     /* the last end offset read */
} PLReader;
854
855static int plrAtEnd(PLReader *pReader){
856 return pReader->pData==NULL;
857}
858static int plrColumn(PLReader *pReader){
859 assert( !plrAtEnd(pReader) );
860 return pReader->iColumn;
861}
862static int plrPosition(PLReader *pReader){
863 assert( !plrAtEnd(pReader) );
864 return pReader->iPosition;
865}
866static int plrStartOffset(PLReader *pReader){
867 assert( !plrAtEnd(pReader) );
868 return pReader->iStartOffset;
869}
870static int plrEndOffset(PLReader *pReader){
871 assert( !plrAtEnd(pReader) );
872 return pReader->iEndOffset;
873}
/* Decode the next position entry.  Handles an optional POS_COLUMN
** marker (column change), the POS_END terminator, and the
** delta-encoded position and offsets.  Signals end-of-list by
** setting pData to NULL.
*/
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  if( pReader->nData==0 ){
    /* The last position was already decoded; now flag EOF. */
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    /* Column change: read the new column and reset per-column deltas. */
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  pReader->iPosition += i-POS_BASE;   /* positions are delta-encoded */
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;       /* start offset is a delta, too */
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}
911
912static void plrInit(PLReader *pReader, DLReader *pDLReader){
913 pReader->pData = dlrPosData(pDLReader);
914 pReader->nData = dlrPosDataLen(pDLReader);
915 pReader->iType = pDLReader->iType;
916 pReader->iColumn = 0;
917 pReader->iPosition = 0;
918 pReader->iStartOffset = 0;
919 pReader->iEndOffset = 0;
920 plrStep(pReader);
921}
/* Release a PLReader.  The reader owns no heap memory; only call
** this once plrAtEnd() is true.
*/
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}
925
926/*******************************************************************/
927/* PLWriter is used in constructing a document's position list. As a
928** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
929** PLWriter writes to the associated DLWriter's buffer.
930**
931** plwInit - init for writing a document's poslist.
932** plwDestroy - clear a writer.
933** plwAdd - append position and offset information.
934** plwCopy - copy next position's data from reader to writer.
935** plwTerminate - add any necessary doclist terminator.
936**
937** Calling plwAdd() after plwTerminate() may result in a corrupt
938** doclist.
939*/
940/* TODO(shess) Until we've written the second item, we can cache the
941** first item's information. Then we'd have three states:
942**
943** - initialized with docid, no positions.
944** - docid and one position.
945** - docid and multiple positions.
946**
947** Only the last state needs to actually write to dlw->b, which would
948** be an improvement in the DLCollector case.
949*/
typedef struct PLWriter {
  DLWriter *dlw;     /* parent writer; output goes to dlw->b */

  /* Delta-encoding state for the current document. */
  int iColumn;    /* the last column written */
  int iPos;       /* the last position written (-1 once terminated) */
  int iOffset;    /* the last start offset written */
} PLWriter;
957
958/* TODO(shess) In the case where the parent is reading these values
959** from a PLReader, we could optimize to a copy if that PLReader has
960** the same type as pWriter.
961*/
/* Append one position (and, for DL_POSITIONS_OFFSETS, its offsets) to
** the current document's position list.  Values are delta-encoded
** against the previous entry; a column change emits a POS_COLUMN
** marker and resets the position/offset deltas.  No-op for DL_DOCIDS.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  /* POS_BASE keeps the encoded value clear of the marker values. */
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
994static void plwCopy(PLWriter *pWriter, PLReader *pReader){
995 plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
996 plrStartOffset(pReader), plrEndOffset(pReader));
997}
/* Begin a new document's position list: write iDocid (delta-encoded
** from the DLWriter's previous docid) and reset the per-document
** column/position/offset delta state.
*/
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
1017/* TODO(shess) Should plwDestroy() also terminate the doclist? But
1018** then plwDestroy() would no longer be just a destructor, it would
1019** also be doing work, which isn't consistent with the overall idiom.
1020** Another option would be for plwAdd() to always append any necessary
1021** terminator, so that the output is always correct. But that would
1022** add incremental work to the common case with the only benefit being
1023** API elegance. Punt for now.
1024*/
/* Close out the current document's position list.  Position-bearing
** doclists end each document with POS_END; plain DL_DOCIDS lists have
** no terminator.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
/* Release a PLWriter.  The parent DLWriter and its buffer are
** untouched.
*/
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}
1039
1040/*******************************************************************/
1041/* DLCollector wraps PLWriter and DLWriter to provide a
1042** dynamically-allocated doclist area to use during tokenization.
1043**
1044** dlcNew - malloc up and initialize a collector.
1045** dlcDelete - destroy a collector and all contained items.
1046** dlcAddPos - append position and offset information.
1047** dlcAddDoclist - add the collected doclist to the given buffer.
1048** dlcNext - terminate the current document and open another.
1049*/
typedef struct DLCollector {
  DataBuffer b;      /* accumulates the doclist bytes */
  DLWriter dlw;      /* doclist writer targeting b */
  PLWriter plw;      /* position-list writer for the current document */
} DLCollector;
1055
1056/* TODO(shess) This could also be done by calling plwTerminate() and
1057** dataBufferAppend(). I tried that, expecting nominal performance
1058** differences, but it seemed to pretty reliably be worth 1% to code
1059** it this way. I suspect it's the incremental malloc overhead (some
1060** percentage of the plwTerminate() calls will cause a realloc), so
1061** this might be worth revisiting if the DataBuffer implementation
1062** changes.
1063*/
1064static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
1065 if( pCollector->dlw.iType>DL_DOCIDS ){
1066 char c[VARINT_MAX];
1067 int n = putVarint(c, POS_END);
1068 dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
1069 }else{
1070 dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
1071 }
1072}
1073static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
1074 plwTerminate(&pCollector->plw);
1075 plwDestroy(&pCollector->plw);
1076 plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
1077}
1078static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
1079 int iStartOffset, int iEndOffset){
1080 plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
1081}
1082
1083static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
1084 DLCollector *pCollector = malloc(sizeof(DLCollector));
1085 dataBufferInit(&pCollector->b, 0);
1086 dlwInit(&pCollector->dlw, iType, &pCollector->b);
1087 plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
1088 return pCollector;
1089}
/* Tear down a collector created by dlcNew(): destroy the embedded
** writers, free the buffer's storage, then release the collector.
*/
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  free(pCollector);
}
1097
1098
1099/* Copy the doclist data of iType in pData/nData into *out, trimming
1100** unnecessary data as we go. Only columns matching iColumn are
1101** copied, all columns copied if iColumn is -1. Elements with no
1102** matching columns are dropped. The output is an iOutType doclist.
1103*/
1104/* NOTE(shess) This code is only valid after all doclists are merged.
1105** If this is run before merges, then doclist items which represent
1106** deletion will be trimmed, and will thus not effect a deletion
1107** during the merge.
1108*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  /* Can only drop information, never add it. */
  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;    /* true once this document is opened in the output */

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        if( !match ){
          /* Lazily open the output document on the first matching
          ** position so non-matching documents are dropped entirely.
          */
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}
1148
1149/* Used by docListMerge() to keep doclists in the ascending order by
1150** docid, then ascending order by age (so the newest comes first).
1151*/
typedef struct OrderedDLReader {
  DLReader *pReader;   /* underlying doclist reader */

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;             /* index in the original array; higher means newer */
} OrderedDLReader;
1161
1162/* Order eof to end, then by docid asc, idx desc. */
1163static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
1164 if( dlrAtEnd(r1->pReader) ){
1165 if( dlrAtEnd(r2->pReader) ) return 0; /* Both atEnd(). */
1166 return 1; /* Only r1 atEnd(). */
1167 }
1168 if( dlrAtEnd(r2->pReader) ) return -1; /* Only r2 atEnd(). */
1169
1170 if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
1171 if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
1172
1173 /* Descending on idx. */
1174 return r2->idx-r1->idx;
1175}
1176
1177/* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that
1178** p[1..n-1] is already sorted.
1179*/
1180/* TODO(shess) Is this frequent enough to warrant a binary search?
1181** Before implementing that, instrument the code to check. In most
1182** current usage, I expect that p[0] will be less than p[1] a very
1183** high proportion of the time.
1184*/
1185static void orderedDLReaderReorder(OrderedDLReader *p, int n){
1186 while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
1187 OrderedDLReader tmp = p[0];
1188 p[0] = p[1];
1189 p[1] = tmp;
1190 n--;
1191 p++;
1192 }
1193}
1194
1195/* Given an array of doclist readers, merge their doclist elements
1196** into out in sorted order (by docid), dropping elements from older
1197** readers when there is a duplicate docid. pReaders is assumed to be
1198** ordered by age, oldest first.
1199*/
1200/* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably
1201** be fixed.
1202*/
/* Merge nReaders doclists (ordered oldest-first) into out, keeping
** only the newest reader's element for each docid.  Contiguous runs
** taken from the same reader are batched into single dlwAppend()
** calls.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;   /* start of the pending run to copy */
  int nStart = 0;           /* length of the pending run */
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    /* Single input: a straight copy, no merging required. */
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has a
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      /* Flush the previous run before starting a new one. */
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
1274
1275/* Helper function for posListUnion(). Compares the current position
1276** between left and right, returning as standard C idiom of <0 if
1277** left<right, >0 if left>right, and 0 if left==right. "End" always
1278** compares greater.
1279*/
1280static int posListCmp(PLReader *pLeft, PLReader *pRight){
1281 assert( pLeft->iType==pRight->iType );
1282 if( pLeft->iType==DL_DOCIDS ) return 0;
1283
1284 if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
1285 if( plrAtEnd(pRight) ) return -1;
1286
1287 if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
1288 if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
1289
1290 if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
1291 if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
1292 if( pLeft->iType==DL_POSITIONS ) return 0;
1293
1294 if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
1295 if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
1296
1297 if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
1298 if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
1299
1300 return 0;
1301}
1302
1303/* Write the union of position lists in pLeft and pRight to pOut.
1304** "Union" in this case meaning "All unique position tuples". Should
1305** work with any doclist type, though both inputs and the output
1306** should be the same type.
1307*/
1308static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
1309 PLReader left, right;
1310 PLWriter writer;
1311
1312 assert( dlrDocid(pLeft)==dlrDocid(pRight) );
1313 assert( pLeft->iType==pRight->iType );
1314 assert( pLeft->iType==pOut->iType );
1315
1316 plrInit(&left, pLeft);
1317 plrInit(&right, pRight);
1318 plwInit(&writer, pOut, dlrDocid(pLeft));
1319
1320 while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
1321 int c = posListCmp(&left, &right);
1322 if( c<0 ){
1323 plwCopy(&writer, &left);
1324 plrStep(&left);
1325 }else if( c>0 ){
1326 plwCopy(&writer, &right);
1327 plrStep(&right);
1328 }else{
1329 plwCopy(&writer, &left);
1330 plrStep(&left);
1331 plrStep(&right);
1332 }
1333 }
1334
1335 plwTerminate(&writer);
1336 plwDestroy(&writer);
1337 plrDestroy(&left);
1338 plrDestroy(&right);
1339}
1340
1341/* Write the union of doclists in pLeft and pRight to pOut. For
1342** docids in common between the inputs, the union of the position
1343** lists is written. Inputs and outputs are always type DL_DEFAULT.
1344*/
static void docListUnion(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* An empty side means the union is just the other input, verbatim. */
  if( nLeft==0 ){
    dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
  dlrInit(&right, DL_DEFAULT, pRight, nRight);
  dlwInit(&writer, DL_DEFAULT, pOut);

  /* Standard two-way merge on docid; when both sides hold the same
  ** docid, their position lists are unioned into a single element.
  */
  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrDocid(&left)>dlrDocid(&right) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else{
      posListUnion(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1390
1391/* pLeft and pRight are DLReaders positioned to the same docid.
1392**
1393** If there are no instances in pLeft or pRight where the position
1394** of pLeft is one less than the position of pRight, then this
1395** routine adds nothing to pOut.
1396**
1397** If there are one or more instances where positions from pLeft
1398** are exactly one less than positions from pRight, then add a new
1399** document record to pOut. If pOut wants to hold positions, then
1400** include the positions from pRight that are one more than a
1401** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
1402*/
static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;
  int match = 0;    /* true once the output document has been opened */

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pOut->iType!=DL_POSITIONS_OFFSETS );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);

  /* Walk both position lists in lockstep, advancing whichever side is
  ** behind, looking for pairs where left's position is exactly one
  ** less than right's in the same column.
  */
  while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
    if( plrColumn(&left)<plrColumn(&right) ){
      plrStep(&left);
    }else if( plrColumn(&left)>plrColumn(&right) ){
      plrStep(&right);
    }else if( plrPosition(&left)+1<plrPosition(&right) ){
      plrStep(&left);
    }else if( plrPosition(&left)+1>plrPosition(&right) ){
      plrStep(&right);
    }else{
      if( !match ){
        /* Lazily open the output document on the first match so
        ** matchless documents contribute nothing to pOut.
        */
        plwInit(&writer, pOut, dlrDocid(pLeft));
        match = 1;
      }
      /* Record the phrase-ending position (from the right-hand term). */
      plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
      plrStep(&left);
      plrStep(&right);
    }
  }

  if( match ){
    plwTerminate(&writer);
    plwDestroy(&writer);
  }

  plrDestroy(&left);
  plrDestroy(&right);
}
1443
1444/* We have two doclists with positions: pLeft and pRight.
1445** Write the phrase intersection of these two doclists into pOut.
1446**
1447** A phrase intersection means that two documents only match
1448** if pLeft.iPos+1==pRight.iPos.
1449**
1450** iType controls the type of data written to pOut. If iType is
1451** DL_POSITIONS, the positions are those from pRight.
1452*/
static void docListPhraseMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DocListType iType,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  /* Either side empty means no document can contain the phrase. */
  if( nLeft==0 || nRight==0 ) return;

  assert( iType!=DL_POSITIONS_OFFSETS );

  /* Inputs must carry positions for adjacency testing. */
  dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
  dlrInit(&right, DL_POSITIONS, pRight, nRight);
  dlwInit(&writer, iType, pOut);

  /* Intersect on docid; matching documents get their position lists
  ** phrase-merged (left position + 1 == right position).
  */
  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      posListPhraseMerge(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
1486
1487/* We have two DL_DOCIDS doclists: pLeft and pRight.
1488** Write the intersection of these two doclists into pOut as a
1489** DL_DOCIDS doclist.
1490*/
1491static void docListAndMerge(
1492 const char *pLeft, int nLeft,
1493 const char *pRight, int nRight,
1494 DataBuffer *pOut /* Write the combined doclist here */
1495){
1496 DLReader left, right;
1497 DLWriter writer;
1498
1499 if( nLeft==0 || nRight==0 ) return;
1500
1501 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1502 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1503 dlwInit(&writer, DL_DOCIDS, pOut);
1504
1505 while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
1506 if( dlrDocid(&left)<dlrDocid(&right) ){
1507 dlrStep(&left);
1508 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1509 dlrStep(&right);
1510 }else{
1511 dlwAdd(&writer, dlrDocid(&left));
1512 dlrStep(&left);
1513 dlrStep(&right);
1514 }
1515 }
1516
1517 dlrDestroy(&left);
1518 dlrDestroy(&right);
1519 dlwDestroy(&writer);
1520}
1521
1522/* We have two DL_DOCIDS doclists: pLeft and pRight.
1523** Write the union of these two doclists into pOut as a
1524** DL_DOCIDS doclist.
1525*/
1526static void docListOrMerge(
1527 const char *pLeft, int nLeft,
1528 const char *pRight, int nRight,
1529 DataBuffer *pOut /* Write the combined doclist here */
1530){
1531 DLReader left, right;
1532 DLWriter writer;
1533
1534 if( nLeft==0 ){
1535 dataBufferAppend(pOut, pRight, nRight);
1536 return;
1537 }
1538 if( nRight==0 ){
1539 dataBufferAppend(pOut, pLeft, nLeft);
1540 return;
1541 }
1542
1543 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1544 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1545 dlwInit(&writer, DL_DOCIDS, pOut);
1546
1547 while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
1548 if( dlrAtEnd(&right) ){
1549 dlwAdd(&writer, dlrDocid(&left));
1550 dlrStep(&left);
1551 }else if( dlrAtEnd(&left) ){
1552 dlwAdd(&writer, dlrDocid(&right));
1553 dlrStep(&right);
1554 }else if( dlrDocid(&left)<dlrDocid(&right) ){
1555 dlwAdd(&writer, dlrDocid(&left));
1556 dlrStep(&left);
1557 }else if( dlrDocid(&right)<dlrDocid(&left) ){
1558 dlwAdd(&writer, dlrDocid(&right));
1559 dlrStep(&right);
1560 }else{
1561 dlwAdd(&writer, dlrDocid(&left));
1562 dlrStep(&left);
1563 dlrStep(&right);
1564 }
1565 }
1566
1567 dlrDestroy(&left);
1568 dlrDestroy(&right);
1569 dlwDestroy(&writer);
1570}
1571
1572/* We have two DL_DOCIDS doclists: pLeft and pRight.
1573** Write into pOut as DL_DOCIDS doclist containing all documents that
1574** occur in pLeft but not in pRight.
1575*/
1576static void docListExceptMerge(
1577 const char *pLeft, int nLeft,
1578 const char *pRight, int nRight,
1579 DataBuffer *pOut /* Write the combined doclist here */
1580){
1581 DLReader left, right;
1582 DLWriter writer;
1583
1584 if( nLeft==0 ) return;
1585 if( nRight==0 ){
1586 dataBufferAppend(pOut, pLeft, nLeft);
1587 return;
1588 }
1589
1590 dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
1591 dlrInit(&right, DL_DOCIDS, pRight, nRight);
1592 dlwInit(&writer, DL_DOCIDS, pOut);
1593
1594 while( !dlrAtEnd(&left) ){
1595 while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
1596 dlrStep(&right);
1597 }
1598 if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
1599 dlwAdd(&writer, dlrDocid(&left));
1600 }
1601 dlrStep(&left);
1602 }
1603
1604 dlrDestroy(&left);
1605 dlrDestroy(&right);
1606 dlwDestroy(&writer);
1607}
1608
/* Return a malloc'd, NUL-terminated copy of the first n bytes of s.
** Returns NULL on allocation failure.  The caller must free() the
** result.
*/
static char *string_dup_n(const char *s, int n){
  char *str = malloc(n + 1);
  /* Previously the pointer was used unchecked; memcpy through a
  ** failed malloc() is undefined behavior.
  */
  if( str==NULL ) return NULL;
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
1615
1616/* Duplicate a string; the caller must free() the returned string.
1617 * (We don't use strdup() since it's not part of the standard C library and
1618 * may not be available everywhere.) */
/* Duplicate an entire NUL-terminated string; the caller owns and must
** free() the returned copy.  (strdup() is avoided because it is not
** part of the standard C library.)
*/
static char *string_dup(const char *s){
  size_t n = strlen(s);
  return string_dup_n(s, n);
}
1622
1623/* Format a string, replacing each occurrence of the % character with
1624 * zDb.zName. This may be more convenient than sqlite_mprintf()
1625 * when one string is used repeatedly in a format string.
1626 * The caller must free() the returned string. */
/* Format a string, replacing each occurrence of the % character with
** zDb.zName.  This may be more convenient than sqlite_mprintf() when
** one string is used repeatedly in a format string.  Returns NULL on
** allocation failure; otherwise the caller must free() the result.
*/
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;
  char *result;
  char *r;

  /* First pass: compute the output length. */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;                 /* for null terminator */

  r = result = malloc(len);
  /* Previously the pointer was written through unchecked; that is
  ** undefined behavior when malloc() fails.
  */
  if( result==NULL ) return NULL;
  /* Second pass: emit, expanding each '%' to "zDb.zName". */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
1659
/* Execute zFormat (with each % expanded to "zDb.zName") against db,
** discarding any result rows.  Returns the sqlite3_exec() result
** code.
*/
static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
                    const char *zFormat){
  char *zCommand = string_format(zFormat, zDb, zName);
  int rc;
  TRACE(("FTS2 sql: %s\n", zCommand));
  rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
  free(zCommand);
  return rc;
}
1669
1670static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
1671 sqlite3_stmt **ppStmt, const char *zFormat){
1672 char *zCommand = string_format(zFormat, zDb, zName);
1673 int rc;
1674 TRACE(("FTS2 prepare: %s\n", zCommand));
1675 rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
1676 free(zCommand);
1677 return rc;
1678}
1679
1680/* end utility functions */
1681
1682/* Forward reference */
1683typedef struct fulltext_vtab fulltext_vtab;
1684
/* A single term in a query is represented by an instance of
** the following structure.  Terms belonging to the same quoted phrase
** are linked through nPhrase/iPhrase.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;
1698
1699
/* A query string is parsed into a Query structure.
 *
 * We could, in theory, allow query strings to be complicated
 * nested expressions with precedence determined by parentheses.
 * But none of the major search engines do this.  (Perhaps the
 * feeling is that a parenthesized expression is too complex of
 * an idea for the average user to grasp.)  Taking our lead from
 * the major search engines, we will allow queries to be a list
 * of terms (with an implied AND operator) or phrases in double-quotes,
 * with a single optional "-" before each non-phrase term to designate
 * negation and an optional OR connector.
 *
 * OR binds more tightly than the implied AND, which is what the
 * major search engines seem to do.  So, for example:
 *
 *    [one two OR three]     ==>    one AND (two OR three)
 *    [one OR two three]     ==>    (one OR two) AND three
 *
 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
 * space.  This is how the search engines do it.
 *
 * A NOT term cannot be the right-hand operand of an OR.  If this
 * occurs in the query string, the NOT is ignored:
 *
 *    [one OR -two]          ==>    one OR two
 *
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;
1736
1737
1738/*
1739** An instance of the following structure keeps track of generated
1740** matching-word offset information and snippets.
1741*/
1742typedef struct Snippet {
1743 int nMatch; /* Total number of matches */
1744 int nAlloc; /* Space allocated for aMatch[] */
1745 struct snippetMatch { /* One entry for each matching term */
1746 char snStatus; /* Status flag for use while constructing snippets */
1747 short int iCol; /* The column that contains the match */
1748 short int iTerm; /* The index in Query.pTerms[] of the matching term */
1749 short int nByte; /* Number of bytes in the term */
1750 int iStart; /* The offset to the first character of the term */
1751 } *aMatch; /* Points to space obtained from malloc */
1752 char *zOffset; /* Text rendering of aMatch[] */
1753 int nOffset; /* strlen(zOffset) */
1754 char *zSnippet; /* Snippet text */
1755 int nSnippet; /* strlen(zSnippet) */
1756} Snippet;
1757
1758
/* The kind of lookup a cursor performs; copied from
** sqlite3_index_info.idxNum as chosen by xBestIndex. */
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* an idxNum of QUERY_FULLTEXT + i is a full-text search for column i */
} QueryType;
1764
/* Indexes into fulltext_vtab.pFulltextStatements[].  The order here
** must exactly match the fulltext_zStatement[] array below. */
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,

  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,

  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_ALL_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;
1784
/* SQL text for each fulltext_statement.  These must exactly match the
** enum above.  NULL entries are generated dynamically by
** sql_get_statement() because their column lists depend on the table. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g.  if a query joins a virtual table to itself?
** If so perhaps we should move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where rowid = ?",

  /* BLOCK_INSERT */ "insert into %_segments values (?)",
  /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
  /* SEGDIR_SELECT_ALL */
  "select root, leaves_end_block from %_segdir order by level desc, idx",
};
1812
1813/*
1814** A connection to a fulltext index is an instance of the following
1815** structure. The xCreate and xConnect methods create an instance
1816** of this structure and xDestroy and xDisconnect free that instance.
1817** All other methods receive a pointer to the structure as one of their
1818** arguments.
1819*/
1820struct fulltext_vtab {
1821 sqlite3_vtab base; /* Base class used by SQLite core */
1822 sqlite3 *db; /* The database connection */
1823 const char *zDb; /* logical database name */
1824 const char *zName; /* virtual table name */
1825 int nColumn; /* number of columns in virtual table */
1826 char **azColumn; /* column names. malloced */
1827 char **azContentColumn; /* column names in content table; malloced */
1828 sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
1829
1830 /* Precompiled statements which we keep as long as the table is
1831 ** open.
1832 */
1833 sqlite3_stmt *pFulltextStatements[MAX_STMT];
1834
1835 /* Precompiled statements used for segment merges. We run a
1836 ** separate select across the leaf level of each tree being merged.
1837 */
1838 sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
1839 /* The statement used to prepare pLeafSelectStmts. */
1840#define LEAF_SELECT \
1841 "select block from %_segments where rowid between ? and ? order by rowid"
1842
1843 /* These buffer pending index updates during transactions.
1844 ** nPendingData estimates the memory size of the pending data. It
1845 ** doesn't include the hash-bucket overhead, nor any malloc
1846 ** overhead. When nPendingData exceeds kPendingThreshold, the
1847 ** buffer is flushed even before the transaction closes.
1848 ** pendingTerms stores the data, and is only valid when nPendingData
1849 ** is >=0 (nPendingData<0 means pendingTerms has not been
1850 ** initialized). iPrevDocid is the last docid written, used to make
1851 ** certain we're inserting in sorted order.
1852 */
1853 int nPendingData;
1854#define kPendingThreshold (1*1024*1024)
1855 sqlite_int64 iPrevDocid;
1856 fts2Hash pendingTerms;
1857};
1858
1859/*
1860** When the core wants to do a query, it create a cursor using a
1861** call to xOpen. This structure is an instance of a cursor. It
1862** is destroyed by xClose.
1863*/
1864typedef struct fulltext_cursor {
1865 sqlite3_vtab_cursor base; /* Base class used by SQLite core */
1866 QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
1867 sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
1868 int eof; /* True if at End Of Results */
1869 Query q; /* Parsed query string */
1870 Snippet snippet; /* Cached snippet for the current row */
1871 int iColumn; /* Column being searched */
1872 DataBuffer result; /* Doclist results from fulltextQuery */
1873 DLReader reader; /* Result reader if result not empty */
1874} fulltext_cursor;
1875
1876static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
1877 return (fulltext_vtab *) c->base.pVtab;
1878}
1879
1880static const sqlite3_module fts2Module; /* forward declaration */
1881
1882/* Return a dynamically generated statement of the form
1883 * insert into %_content (rowid, ...) values (?, ...)
1884 */
1885static const char *contentInsertStatement(fulltext_vtab *v){
1886 StringBuffer sb;
1887 int i;
1888
1889 initStringBuffer(&sb);
1890 append(&sb, "insert into %_content (rowid, ");
1891 appendList(&sb, v->nColumn, v->azContentColumn);
1892 append(&sb, ") values (?");
1893 for(i=0; i<v->nColumn; ++i)
1894 append(&sb, ", ?");
1895 append(&sb, ")");
1896 return stringBufferData(&sb);
1897}
1898
1899/* Return a dynamically generated statement of the form
1900 * update %_content set [col_0] = ?, [col_1] = ?, ...
1901 * where rowid = ?
1902 */
1903static const char *contentUpdateStatement(fulltext_vtab *v){
1904 StringBuffer sb;
1905 int i;
1906
1907 initStringBuffer(&sb);
1908 append(&sb, "update %_content set ");
1909 for(i=0; i<v->nColumn; ++i) {
1910 if( i>0 ){
1911 append(&sb, ", ");
1912 }
1913 append(&sb, v->azContentColumn[i]);
1914 append(&sb, " = ?");
1915 }
1916 append(&sb, " where rowid = ?");
1917 return stringBufferData(&sb);
1918}
1919
1920/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
1921** If the indicated statement has never been prepared, it is prepared
1922** and cached, otherwise the cached version is reset.
1923*/
1924static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
1925 sqlite3_stmt **ppStmt){
1926 assert( iStmt<MAX_STMT );
1927 if( v->pFulltextStatements[iStmt]==NULL ){
1928 const char *zStmt;
1929 int rc;
1930 switch( iStmt ){
1931 case CONTENT_INSERT_STMT:
1932 zStmt = contentInsertStatement(v); break;
1933 case CONTENT_UPDATE_STMT:
1934 zStmt = contentUpdateStatement(v); break;
1935 default:
1936 zStmt = fulltext_zStatement[iStmt];
1937 }
1938 rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
1939 zStmt);
1940 if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
1941 if( rc!=SQLITE_OK ) return rc;
1942 } else {
1943 int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
1944 if( rc!=SQLITE_OK ) return rc;
1945 }
1946
1947 *ppStmt = v->pFulltextStatements[iStmt];
1948 return SQLITE_OK;
1949}
1950
1951/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
1952** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE,
1953** where we expect no results.
1954*/
1955static int sql_single_step(sqlite3_stmt *s){
1956 int rc = sqlite3_step(s);
1957 return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1958}
1959
1960/* Like sql_get_statement(), but for special replicated LEAF_SELECT
1961** statements.
1962*/
1963/* TODO(shess) Write version for generic statements and then share
1964** that between the cached-statement functions.
1965*/
1966static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
1967 sqlite3_stmt **ppStmt){
1968 assert( idx>=0 && idx<MERGE_COUNT );
1969 if( v->pLeafSelectStmts[idx]==NULL ){
1970 int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
1971 LEAF_SELECT);
1972 if( rc!=SQLITE_OK ) return rc;
1973 }else{
1974 int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
1975 if( rc!=SQLITE_OK ) return rc;
1976 }
1977
1978 *ppStmt = v->pLeafSelectStmts[idx];
1979 return SQLITE_OK;
1980}
1981
1982/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
1983static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
1984 sqlite3_value **pValues){
1985 sqlite3_stmt *s;
1986 int i;
1987 int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
1988 if( rc!=SQLITE_OK ) return rc;
1989
1990 rc = sqlite3_bind_value(s, 1, rowid);
1991 if( rc!=SQLITE_OK ) return rc;
1992
1993 for(i=0; i<v->nColumn; ++i){
1994 rc = sqlite3_bind_value(s, 2+i, pValues[i]);
1995 if( rc!=SQLITE_OK ) return rc;
1996 }
1997
1998 return sql_single_step(s);
1999}
2000
2001/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
2002 * where rowid = [iRowid] */
2003static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
2004 sqlite_int64 iRowid){
2005 sqlite3_stmt *s;
2006 int i;
2007 int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
2008 if( rc!=SQLITE_OK ) return rc;
2009
2010 for(i=0; i<v->nColumn; ++i){
2011 rc = sqlite3_bind_value(s, 1+i, pValues[i]);
2012 if( rc!=SQLITE_OK ) return rc;
2013 }
2014
2015 rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
2016 if( rc!=SQLITE_OK ) return rc;
2017
2018 return sql_single_step(s);
2019}
2020
/* Free an array of nString malloced strings, then the array itself.
** NULL entries are permitted: free(NULL) is defined as a no-op by the
** C standard, so no per-entry guard is needed. */
static void freeStringArray(int nString, const char **pString){
  int i;

  for(i=0; i<nString; ++i){
    free((void *) pString[i]);
  }
  free((void *) pString);
}
2029
2030/* select * from %_content where rowid = [iRow]
2031 * The caller must delete the returned array and all strings in it.
2032 * null fields will be NULL in the returned array.
2033 *
2034 * TODO: Perhaps we should return pointer/length strings here for consistency
2035 * with other code which uses pointer/length. */
2036static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
2037 const char ***pValues){
2038 sqlite3_stmt *s;
2039 const char **values;
2040 int i;
2041 int rc;
2042
2043 *pValues = NULL;
2044
2045 rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
2046 if( rc!=SQLITE_OK ) return rc;
2047
2048 rc = sqlite3_bind_int64(s, 1, iRow);
2049 if( rc!=SQLITE_OK ) return rc;
2050
2051 rc = sqlite3_step(s);
2052 if( rc!=SQLITE_ROW ) return rc;
2053
2054 values = (const char **) malloc(v->nColumn * sizeof(const char *));
2055 for(i=0; i<v->nColumn; ++i){
2056 if( sqlite3_column_type(s, i)==SQLITE_NULL ){
2057 values[i] = NULL;
2058 }else{
2059 values[i] = string_dup((char*)sqlite3_column_text(s, i));
2060 }
2061 }
2062
2063 /* We expect only one row. We must execute another sqlite3_step()
2064 * to complete the iteration; otherwise the table will remain locked. */
2065 rc = sqlite3_step(s);
2066 if( rc==SQLITE_DONE ){
2067 *pValues = values;
2068 return SQLITE_OK;
2069 }
2070
2071 freeStringArray(v->nColumn, values);
2072 return rc;
2073}
2074
2075/* delete from %_content where rowid = [iRow ] */
2076static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
2077 sqlite3_stmt *s;
2078 int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
2079 if( rc!=SQLITE_OK ) return rc;
2080
2081 rc = sqlite3_bind_int64(s, 1, iRow);
2082 if( rc!=SQLITE_OK ) return rc;
2083
2084 return sql_single_step(s);
2085}
2086
2087/* insert into %_segments values ([pData])
2088** returns assigned rowid in *piBlockid
2089*/
2090static int block_insert(fulltext_vtab *v, const char *pData, int nData,
2091 sqlite_int64 *piBlockid){
2092 sqlite3_stmt *s;
2093 int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
2094 if( rc!=SQLITE_OK ) return rc;
2095
2096 rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
2097 if( rc!=SQLITE_OK ) return rc;
2098
2099 rc = sqlite3_step(s);
2100 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2101 if( rc!=SQLITE_DONE ) return rc;
2102
2103 *piBlockid = sqlite3_last_insert_rowid(v->db);
2104 return SQLITE_OK;
2105}
2106
2107/* delete from %_segments
2108** where rowid between [iStartBlockid] and [iEndBlockid]
2109**
2110** Deletes the range of blocks, inclusive, used to delete the blocks
2111** which form a segment.
2112*/
2113static int block_delete(fulltext_vtab *v,
2114 sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
2115 sqlite3_stmt *s;
2116 int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
2117 if( rc!=SQLITE_OK ) return rc;
2118
2119 rc = sqlite3_bind_int64(s, 1, iStartBlockid);
2120 if( rc!=SQLITE_OK ) return rc;
2121
2122 rc = sqlite3_bind_int64(s, 2, iEndBlockid);
2123 if( rc!=SQLITE_OK ) return rc;
2124
2125 return sql_single_step(s);
2126}
2127
2128/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
2129** at iLevel. Returns SQLITE_DONE if there are no segments at
2130** iLevel. Otherwise returns an error.
2131*/
2132static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
2133 sqlite3_stmt *s;
2134 int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
2135 if( rc!=SQLITE_OK ) return rc;
2136
2137 rc = sqlite3_bind_int(s, 1, iLevel);
2138 if( rc!=SQLITE_OK ) return rc;
2139
2140 rc = sqlite3_step(s);
2141 /* Should always get at least one row due to how max() works. */
2142 if( rc==SQLITE_DONE ) return SQLITE_DONE;
2143 if( rc!=SQLITE_ROW ) return rc;
2144
2145 /* NULL means that there were no inputs to max(). */
2146 if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
2147 rc = sqlite3_step(s);
2148 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2149 return rc;
2150 }
2151
2152 *pidx = sqlite3_column_int(s, 0);
2153
2154 /* We expect only one row. We must execute another sqlite3_step()
2155 * to complete the iteration; otherwise the table will remain locked. */
2156 rc = sqlite3_step(s);
2157 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2158 if( rc!=SQLITE_DONE ) return rc;
2159 return SQLITE_ROW;
2160}
2161
2162/* insert into %_segdir values (
2163** [iLevel], [idx],
2164** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
2165** [pRootData]
2166** )
2167*/
2168static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
2169 sqlite_int64 iStartBlockid,
2170 sqlite_int64 iLeavesEndBlockid,
2171 sqlite_int64 iEndBlockid,
2172 const char *pRootData, int nRootData){
2173 sqlite3_stmt *s;
2174 int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
2175 if( rc!=SQLITE_OK ) return rc;
2176
2177 rc = sqlite3_bind_int(s, 1, iLevel);
2178 if( rc!=SQLITE_OK ) return rc;
2179
2180 rc = sqlite3_bind_int(s, 2, idx);
2181 if( rc!=SQLITE_OK ) return rc;
2182
2183 rc = sqlite3_bind_int64(s, 3, iStartBlockid);
2184 if( rc!=SQLITE_OK ) return rc;
2185
2186 rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
2187 if( rc!=SQLITE_OK ) return rc;
2188
2189 rc = sqlite3_bind_int64(s, 5, iEndBlockid);
2190 if( rc!=SQLITE_OK ) return rc;
2191
2192 rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
2193 if( rc!=SQLITE_OK ) return rc;
2194
2195 return sql_single_step(s);
2196}
2197
2198/* Queries %_segdir for the block span of the segments in level
2199** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel,
2200** SQLITE_ROW if there are blocks, else an error.
2201*/
2202static int segdir_span(fulltext_vtab *v, int iLevel,
2203 sqlite_int64 *piStartBlockid,
2204 sqlite_int64 *piEndBlockid){
2205 sqlite3_stmt *s;
2206 int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
2207 if( rc!=SQLITE_OK ) return rc;
2208
2209 rc = sqlite3_bind_int(s, 1, iLevel);
2210 if( rc!=SQLITE_OK ) return rc;
2211
2212 rc = sqlite3_step(s);
2213 if( rc==SQLITE_DONE ) return SQLITE_DONE; /* Should never happen */
2214 if( rc!=SQLITE_ROW ) return rc;
2215
2216 /* This happens if all segments at this level are entirely inline. */
2217 if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
2218 /* We expect only one row. We must execute another sqlite3_step()
2219 * to complete the iteration; otherwise the table will remain locked. */
2220 int rc2 = sqlite3_step(s);
2221 if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
2222 return rc2;
2223 }
2224
2225 *piStartBlockid = sqlite3_column_int64(s, 0);
2226 *piEndBlockid = sqlite3_column_int64(s, 1);
2227
2228 /* We expect only one row. We must execute another sqlite3_step()
2229 * to complete the iteration; otherwise the table will remain locked. */
2230 rc = sqlite3_step(s);
2231 if( rc==SQLITE_ROW ) return SQLITE_ERROR;
2232 if( rc!=SQLITE_DONE ) return rc;
2233 return SQLITE_ROW;
2234}
2235
2236/* Delete the segment blocks and segment directory records for all
2237** segments at iLevel.
2238*/
2239static int segdir_delete(fulltext_vtab *v, int iLevel){
2240 sqlite3_stmt *s;
2241 sqlite_int64 iStartBlockid, iEndBlockid;
2242 int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
2243 if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
2244
2245 if( rc==SQLITE_ROW ){
2246 rc = block_delete(v, iStartBlockid, iEndBlockid);
2247 if( rc!=SQLITE_OK ) return rc;
2248 }
2249
2250 /* Delete the segment directory itself. */
2251 rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
2252 if( rc!=SQLITE_OK ) return rc;
2253
2254 rc = sqlite3_bind_int64(s, 1, iLevel);
2255 if( rc!=SQLITE_OK ) return rc;
2256
2257 return sql_single_step(s);
2258}
2259
2260/* TODO(shess) clearPendingTerms() is far down the file because
2261** writeZeroSegment() is far down the file because LeafWriter is far
2262** down the file. Consider refactoring the code to move the non-vtab
2263** code above the vtab code so that we don't need this forward
2264** reference.
2265*/
2266static int clearPendingTerms(fulltext_vtab *v);
2267
2268/*
2269** Free the memory used to contain a fulltext_vtab structure.
2270*/
2271static void fulltext_vtab_destroy(fulltext_vtab *v){
2272 int iStmt, i;
2273
2274 TRACE(("FTS2 Destroy %p\n", v));
2275 for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
2276 if( v->pFulltextStatements[iStmt]!=NULL ){
2277 sqlite3_finalize(v->pFulltextStatements[iStmt]);
2278 v->pFulltextStatements[iStmt] = NULL;
2279 }
2280 }
2281
2282 for( i=0; i<MERGE_COUNT; i++ ){
2283 if( v->pLeafSelectStmts[i]!=NULL ){
2284 sqlite3_finalize(v->pLeafSelectStmts[i]);
2285 v->pLeafSelectStmts[i] = NULL;
2286 }
2287 }
2288
2289 if( v->pTokenizer!=NULL ){
2290 v->pTokenizer->pModule->xDestroy(v->pTokenizer);
2291 v->pTokenizer = NULL;
2292 }
2293
2294 clearPendingTerms(v);
2295
2296 free(v->azColumn);
2297 for(i = 0; i < v->nColumn; ++i) {
2298 sqlite3_free(v->azContentColumn[i]);
2299 }
2300 free(v->azContentColumn);
2301 free(v);
2302}
2303
2304/*
2305** Token types for parsing the arguments to xConnect or xCreate.
2306*/
2307#define TOKEN_EOF 0 /* End of file */
2308#define TOKEN_SPACE 1 /* Any kind of whitespace */
2309#define TOKEN_ID 2 /* An identifier */
2310#define TOKEN_STRING 3 /* A string literal */
2311#define TOKEN_PUNCT 4 /* A single punctuation character */
2312
2313/*
2314** If X is a character that can be used in an identifier then
2315** IdChar(X) will be true. Otherwise it is false.
2316**
2317** For ASCII, any character with the high-order bit set is
2318** allowed in an identifier. For 7-bit characters,
2319** sqlite3IsIdChar[X] must be 1.
2320**
2321** Ticket #1066. the SQL standard does not allow '$' in the
2322** middle of identfiers. But many SQL implementations do.
2323** SQLite will allow '$' in identifiers for compatibility.
2324** But the feature is undocumented.
2325*/
2326static const char isIdChar[] = {
2327/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
2328 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
2329 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
2330 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
2331 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
2332 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
2333 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
2334};
2335#define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
2336
2337
2338/*
2339** Return the length of the token that begins at z[0].
2340** Store the token type in *tokenType before returning.
2341*/
2342static int getToken(const char *z, int *tokenType){
2343 int i, c;
2344 switch( *z ){
2345 case 0: {
2346 *tokenType = TOKEN_EOF;
2347 return 0;
2348 }
2349 case ' ': case '\t': case '\n': case '\f': case '\r': {
2350 for(i=1; safe_isspace(z[i]); i++){}
2351 *tokenType = TOKEN_SPACE;
2352 return i;
2353 }
2354 case '`':
2355 case '\'':
2356 case '"': {
2357 int delim = z[0];
2358 for(i=1; (c=z[i])!=0; i++){
2359 if( c==delim ){
2360 if( z[i+1]==delim ){
2361 i++;
2362 }else{
2363 break;
2364 }
2365 }
2366 }
2367 *tokenType = TOKEN_STRING;
2368 return i + (c!=0);
2369 }
2370 case '[': {
2371 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
2372 *tokenType = TOKEN_ID;
2373 return i;
2374 }
2375 default: {
2376 if( !IdChar(*z) ){
2377 break;
2378 }
2379 for(i=1; IdChar(z[i]); i++){}
2380 *tokenType = TOKEN_ID;
2381 return i;
2382 }
2383 }
2384 *tokenType = TOKEN_PUNCT;
2385 return 1;
2386}
2387
2388/*
2389** A token extracted from a string is an instance of the following
2390** structure.
2391*/
2392typedef struct Token {
2393 const char *z; /* Pointer to token text. Not '\000' terminated */
2394 short int n; /* Length of the token text in bytes. */
2395} Token;
2396
2397/*
2398** Given a input string (which is really one of the argv[] parameters
2399** passed into xConnect or xCreate) split the string up into tokens.
2400** Return an array of pointers to '\000' terminated strings, one string
2401** for each non-whitespace token.
2402**
2403** The returned array is terminated by a single NULL pointer.
2404**
2405** Space to hold the returned array is obtained from a single
2406** malloc and should be freed by passing the return value to free().
2407** The individual strings within the token list are all a part of
2408** the single memory allocation and will all be freed at once.
2409*/
2410static char **tokenizeString(const char *z, int *pnToken){
2411 int nToken = 0;
2412 Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
2413 int n = 1;
2414 int e, i;
2415 int totalSize = 0;
2416 char **azToken;
2417 char *zCopy;
2418 while( n>0 ){
2419 n = getToken(z, &e);
2420 if( e!=TOKEN_SPACE ){
2421 aToken[nToken].z = z;
2422 aToken[nToken].n = n;
2423 nToken++;
2424 totalSize += n+1;
2425 }
2426 z += n;
2427 }
2428 azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
2429 zCopy = (char*)&azToken[nToken];
2430 nToken--;
2431 for(i=0; i<nToken; i++){
2432 azToken[i] = zCopy;
2433 n = aToken[i].n;
2434 memcpy(zCopy, aToken[i].z, n);
2435 zCopy[n] = 0;
2436 zCopy += n+1;
2437 }
2438 azToken[nToken] = 0;
2439 free(aToken);
2440 *pnToken = nToken;
2441 return azToken;
2442}
2443
2444/*
2445** Convert an SQL-style quoted string into a normal string by removing
2446** the quote characters. The conversion is done in-place. If the
2447** input does not begin with a quote character, then this routine
2448** is a no-op.
2449**
2450** Examples:
2451**
2452** "abc" becomes abc
2453** 'xyz' becomes xyz
2454** [pqr] becomes pqr
2455** `mno` becomes mno
2456*/
2457static void dequoteString(char *z){
2458 int quote;
2459 int i, j;
2460 if( z==0 ) return;
2461 quote = z[0];
2462 switch( quote ){
2463 case '\'': break;
2464 case '"': break;
2465 case '`': break; /* For MySQL compatibility */
2466 case '[': quote = ']'; break; /* For MS SqlServer compatibility */
2467 default: return;
2468 }
2469 for(i=1, j=0; z[i]; i++){
2470 if( z[i]==quote ){
2471 if( z[i+1]==quote ){
2472 z[j++] = quote;
2473 i++;
2474 }else{
2475 z[j++] = 0;
2476 break;
2477 }
2478 }else{
2479 z[j++] = z[i];
2480 }
2481 }
2482}
2483
2484/*
2485** The input azIn is a NULL-terminated list of tokens. Remove the first
2486** token and all punctuation tokens. Remove the quotes from
2487** around string literal tokens.
2488**
2489** Example:
2490**
2491** input: tokenize chinese ( 'simplifed' , 'mixed' )
2492** output: chinese simplifed mixed
2493**
2494** Another example:
2495**
2496** input: delimiters ( '[' , ']' , '...' )
2497** output: [ ] ...
2498*/
2499static void tokenListToIdList(char **azIn){
2500 int i, j;
2501 if( azIn ){
2502 for(i=0, j=-1; azIn[i]; i++){
2503 if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
2504 dequoteString(azIn[i]);
2505 if( j>=0 ){
2506 azIn[j] = azIn[i];
2507 }
2508 j++;
2509 }
2510 }
2511 azIn[j] = 0;
2512 }
2513}
2514
2515
2516/*
2517** Find the first alphanumeric token in the string zIn. Null-terminate
2518** this token. Remove any quotation marks. And return a pointer to
2519** the result.
2520*/
2521static char *firstToken(char *zIn, char **pzTail){
2522 int n, ttype;
2523 while(1){
2524 n = getToken(zIn, &ttype);
2525 if( ttype==TOKEN_SPACE ){
2526 zIn += n;
2527 }else if( ttype==TOKEN_EOF ){
2528 *pzTail = zIn;
2529 return 0;
2530 }else{
2531 zIn[n] = 0;
2532 *pzTail = &zIn[1];
2533 dequoteString(zIn);
2534 return zIn;
2535 }
2536 }
2537 /*NOTREACHED*/
2538}
2539
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not a alphanumeric
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( safe_isspace(*s) ) s++;
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ) return 0;
  }
  return *s!='_' && !safe_isalnum(*s);
}
2558
2559/*
2560** An instance of this structure defines the "spec" of a
2561** full text index. This structure is populated by parseSpec
2562** and use by fulltextConnect and fulltextCreate.
2563*/
2564typedef struct TableSpec {
2565 const char *zDb; /* Logical database name */
2566 const char *zName; /* Name of the full-text index */
2567 int nColumn; /* Number of columns to be indexed */
2568 char **azColumn; /* Original names of columns to be indexed */
2569 char **azContentColumn; /* Column names for %_content */
2570 char **azTokenizer; /* Name of tokenizer and its arguments */
2571} TableSpec;
2572
2573/*
2574** Reclaim all of the memory used by a TableSpec
2575*/
2576static void clearTableSpec(TableSpec *p) {
2577 free(p->azColumn);
2578 free(p->azContentColumn);
2579 free(p->azTokenizer);
2580}
2581
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 *
 * CREATE VIRTUAL TABLE email
 *        USING fts2(subject, body, tokenize mytokenizer(myarg))
 *
 * We return parsed information in a TableSpec structure.
 *
 */
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                     char**pzErr){
  int i, n;
  char *z, *zDummy;
  char **azArg;
  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */

  assert( argc>=3 );
  /* Current interface:
  ** argv[0] - module name
  ** argv[1] - database name
  ** argv[2] - table name
  ** argv[3..] - columns, optionally followed by tokenizer specification
  **             and snippet delimiters specification.
  */

  /* Make a copy of the complete argv[][] array in a single allocation.
  ** The argv[][] array is read-only and transient.  We can write to the
  ** copy in order to modify things and the copy is persistent.
  */
  CLEAR(pSpec);
  for(i=n=0; i<argc; i++){
    n += strlen(argv[i]) + 1;
  }
  azArg = malloc( sizeof(char*)*argc + n );
  if( azArg==0 ){
    return SQLITE_NOMEM;
  }
  z = (char*)&azArg[argc];
  for(i=0; i<argc; i++){
    azArg[i] = z;
    /* strcpy is bounded here: n (computed above) reserves room for every
    ** argv string plus its terminator. */
    strcpy(z, argv[i]);
    z += strlen(z)+1;
  }

  /* Identify the column names and the tokenizer and delimiter arguments
  ** in the argv[][] array.
  */
  pSpec->zDb = azArg[1];
  pSpec->zName = azArg[2];
  pSpec->nColumn = 0;
  pSpec->azColumn = azArg;
  zTokenizer = "tokenize simple";
  for(i=3; i<argc; ++i){
    if( startsWith(azArg[i],"tokenize") ){
      zTokenizer = azArg[i];
    }else{
      /* firstToken() NUL-terminates the column name in place; the tail
      ** (zDummy) is discarded.  NOTE(review): z is assigned here but not
      ** used afterwards. */
      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
      pSpec->nColumn++;
    }
  }
  if( pSpec->nColumn==0 ){
    /* No columns given: default to a single column named "content". */
    azArg[0] = "content";
    pSpec->nColumn = 1;
  }

  /*
  ** Construct the list of content column names.
  **
  ** Each content column name will be of the form cNNAAAA
  ** where NN is the column number and AAAA is the sanitized
  ** column name.  "sanitized" means that special characters are
  ** converted to "_".  The cNN prefix guarantees that all column
  ** names are unique.
  **
  ** The AAAA suffix is not strictly necessary.  It is included
  ** for the convenience of people who might examine the generated
  ** %_content table and wonder what the columns are used for.
  */
  pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
  if( pSpec->azContentColumn==0 ){
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    /* NOTE(review): sqlite3_mprintf() returns NULL on OOM, which the
    ** sanitize loop below would then dereference -- confirm and harden. */
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  tokenListToIdList(pSpec->azTokenizer);

  return SQLITE_OK;
}
2680
/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table.  Return a pointer to this schema string.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int iCol;
  char *zSchema;
  char *zResult;
  const char *zSeparator = "(";   /* "(" before the first column, "," after */

  zSchema = sqlite3_mprintf("CREATE TABLE x");
  for(iCol=0; iCol<nColumn; iCol++){
    char *zGrown = sqlite3_mprintf("%s%s%Q", zSchema, zSeparator,
                                   azColumn[iCol]);
    sqlite3_free(zSchema);
    zSchema = zGrown;
    zSeparator = ",";
  }
  /* Append a hidden column named after the table itself, used as the
  ** left operand of MATCH. */
  zResult = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
  sqlite3_free(zSchema);
  return zResult;
}
2707
2708/*
2709** Build a new sqlite3_vtab structure that will describe the
2710** fulltext index defined by spec.
2711*/
2712static int constructVtab(
2713 sqlite3 *db, /* The SQLite database connection */
2714 fts2Hash *pHash, /* Hash table containing tokenizers */
2715 TableSpec *spec, /* Parsed spec information from parseSpec() */
2716 sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
2717 char **pzErr /* Write any error message here */
2718){
2719 int rc;
2720 int n;
2721 fulltext_vtab *v = 0;
2722 const sqlite3_tokenizer_module *m = NULL;
2723 char *schema;
2724
2725 char const *zTok; /* Name of tokenizer to use for this fts table */
2726 int nTok; /* Length of zTok, including nul terminator */
2727
2728 v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
2729 if( v==0 ) return SQLITE_NOMEM;
2730 CLEAR(v);
2731 /* sqlite will initialize v->base */
2732 v->db = db;
2733 v->zDb = spec->zDb; /* Freed when azColumn is freed */
2734 v->zName = spec->zName; /* Freed when azColumn is freed */
2735 v->nColumn = spec->nColumn;
2736 v->azContentColumn = spec->azContentColumn;
2737 spec->azContentColumn = 0;
2738 v->azColumn = spec->azColumn;
2739 spec->azColumn = 0;
2740
2741 if( spec->azTokenizer==0 ){
2742 return SQLITE_NOMEM;
2743 }
2744
2745 zTok = spec->azTokenizer[0];
2746 if( !zTok ){
2747 zTok = "simple";
2748 }
2749 nTok = strlen(zTok)+1;
2750
2751 m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok);
2752 if( !m ){
2753 *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
2754 rc = SQLITE_ERROR;
2755 goto err;
2756 }
2757
2758 for(n=0; spec->azTokenizer[n]; n++){}
2759 if( n ){
2760 rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
2761 &v->pTokenizer);
2762 }else{
2763 rc = m->xCreate(0, 0, &v->pTokenizer);
2764 }
2765 if( rc!=SQLITE_OK ) goto err;
2766 v->pTokenizer->pModule = m;
2767
2768 /* TODO: verify the existence of backing tables foo_content, foo_term */
2769
2770 schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
2771 spec->zName);
2772 rc = sqlite3_declare_vtab(db, schema);
2773 sqlite3_free(schema);
2774 if( rc!=SQLITE_OK ) goto err;
2775
2776 memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
2777
2778 /* Indicate that the buffer is not live. */
2779 v->nPendingData = -1;
2780
2781 *ppVTab = &v->base;
2782 TRACE(("FTS2 Connect %p\n", v));
2783
2784 return rc;
2785
2786err:
2787 fulltext_vtab_destroy(v);
2788 return rc;
2789}
2790
2791static int fulltextConnect(
2792 sqlite3 *db,
2793 void *pAux,
2794 int argc, const char *const*argv,
2795 sqlite3_vtab **ppVTab,
2796 char **pzErr
2797){
2798 TableSpec spec;
2799 int rc = parseSpec(&spec, argc, argv, pzErr);
2800 if( rc!=SQLITE_OK ) return rc;
2801
2802 rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
2803 clearTableSpec(&spec);
2804 return rc;
2805}
2806
2807/* The %_content table holds the text of each document, with
2808** the rowid used as the docid.
2809*/
2810/* TODO(shess) This comment needs elaboration to match the updated
2811** code. Work it into the top-of-file comment at that time.
2812*/
2813static int fulltextCreate(sqlite3 *db, void *pAux,
2814 int argc, const char * const *argv,
2815 sqlite3_vtab **ppVTab, char **pzErr){
2816 int rc;
2817 TableSpec spec;
2818 StringBuffer schema;
2819 TRACE(("FTS2 Create\n"));
2820
2821 rc = parseSpec(&spec, argc, argv, pzErr);
2822 if( rc!=SQLITE_OK ) return rc;
2823
2824 initStringBuffer(&schema);
2825 append(&schema, "CREATE TABLE %_content(");
2826 appendList(&schema, spec.nColumn, spec.azContentColumn);
2827 append(&schema, ")");
2828 rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
2829 stringBufferDestroy(&schema);
2830 if( rc!=SQLITE_OK ) goto out;
2831
2832 rc = sql_exec(db, spec.zDb, spec.zName,
2833 "create table %_segments(block blob);");
2834 if( rc!=SQLITE_OK ) goto out;
2835
2836 rc = sql_exec(db, spec.zDb, spec.zName,
2837 "create table %_segdir("
2838 " level integer,"
2839 " idx integer,"
2840 " start_block integer,"
2841 " leaves_end_block integer,"
2842 " end_block integer,"
2843 " root blob,"
2844 " primary key(level, idx)"
2845 ");");
2846 if( rc!=SQLITE_OK ) goto out;
2847
2848 rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
2849
2850out:
2851 clearTableSpec(&spec);
2852 return rc;
2853}
2854
2855/* Decide how to handle an SQL query. */
2856static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2857 int i;
2858 TRACE(("FTS2 BestIndex\n"));
2859
2860 for(i=0; i<pInfo->nConstraint; ++i){
2861 const struct sqlite3_index_constraint *pConstraint;
2862 pConstraint = &pInfo->aConstraint[i];
2863 if( pConstraint->usable ) {
2864 if( pConstraint->iColumn==-1 &&
2865 pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2866 pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
2867 TRACE(("FTS2 QUERY_ROWID\n"));
2868 } else if( pConstraint->iColumn>=0 &&
2869 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2870 /* full-text search */
2871 pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2872 TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2873 } else continue;
2874
2875 pInfo->aConstraintUsage[i].argvIndex = 1;
2876 pInfo->aConstraintUsage[i].omit = 1;
2877
2878 /* An arbitrary value for now.
2879 * TODO: Perhaps rowid matches should be considered cheaper than
2880 * full-text searches. */
2881 pInfo->estimatedCost = 1.0;
2882
2883 return SQLITE_OK;
2884 }
2885 }
2886 pInfo->idxNum = QUERY_GENERIC;
2887 return SQLITE_OK;
2888}
2889
2890static int fulltextDisconnect(sqlite3_vtab *pVTab){
2891 TRACE(("FTS2 Disconnect %p\n", pVTab));
2892 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2893 return SQLITE_OK;
2894}
2895
2896static int fulltextDestroy(sqlite3_vtab *pVTab){
2897 fulltext_vtab *v = (fulltext_vtab *)pVTab;
2898 int rc;
2899
2900 TRACE(("FTS2 Destroy %p\n", pVTab));
2901 rc = sql_exec(v->db, v->zDb, v->zName,
2902 "drop table if exists %_content;"
2903 "drop table if exists %_segments;"
2904 "drop table if exists %_segdir;"
2905 );
2906 if( rc!=SQLITE_OK ) return rc;
2907
2908 fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2909 return SQLITE_OK;
2910}
2911
2912static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2913 fulltext_cursor *c;
2914
2915 c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
2916 /* sqlite will initialize c->base */
2917 *ppCursor = &c->base;
2918 TRACE(("FTS2 Open %p: %p\n", pVTab, c));
2919
2920 return SQLITE_OK;
2921}
2922
2923
2924/* Free all of the dynamically allocated memory held by *q
2925*/
2926static void queryClear(Query *q){
2927 int i;
2928 for(i = 0; i < q->nTerms; ++i){
2929 free(q->pTerms[i].pTerm);
2930 }
2931 free(q->pTerms);
2932 CLEAR(q);
2933}
2934
2935/* Free all of the dynamically allocated memory held by the
2936** Snippet
2937*/
2938static void snippetClear(Snippet *p){
2939 free(p->aMatch);
2940 free(p->zOffset);
2941 free(p->zSnippet);
2942 CLEAR(p);
2943}
2944/*
2945** Append a single entry to the p->aMatch[] log.
2946*/
2947static void snippetAppendMatch(
2948 Snippet *p, /* Append the entry to this snippet */
2949 int iCol, int iTerm, /* The column and query term */
2950 int iStart, int nByte /* Offset and size of the match */
2951){
2952 int i;
2953 struct snippetMatch *pMatch;
2954 if( p->nMatch+1>=p->nAlloc ){
2955 p->nAlloc = p->nAlloc*2 + 10;
2956 p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2957 if( p->aMatch==0 ){
2958 p->nMatch = 0;
2959 p->nAlloc = 0;
2960 return;
2961 }
2962 }
2963 i = p->nMatch++;
2964 pMatch = &p->aMatch[i];
2965 pMatch->iCol = iCol;
2966 pMatch->iTerm = iTerm;
2967 pMatch->iStart = iStart;
2968 pMatch->nByte = nByte;
2969}
2970
2971/*
2972** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
2973*/
2974#define FTS2_ROTOR_SZ (32)
2975#define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
2976
/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
**
** The document is re-tokenized with the table's own tokenizer and each
** token is compared against every query term.  A circular buffer of the
** last FTS2_ROTOR_SZ token offsets is kept so that, when the final word
** of a multi-word phrase matches, the offsets of the earlier words of
** the phrase can still be recovered and logged.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  const QueryTerm *aTerm;              /* Query string terms */
  int nTerm;                           /* Number of query string terms */
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS2_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS2_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;  /* tokenizer failure: record no offsets for this column */
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  /* The match bitmasks have one bit per term, so only the first
  ** FTS2_ROTOR_SZ-1 terms can be tracked. */
  if( nTerm>=FTS2_ROTOR_SZ ){
    nTerm = FTS2_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;
    /* Record this token's offset and length in the rotor. */
    iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      /* Skip terms constrained to a different column. */
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      /* A non-prefix term must match the whole token, not a prefix. */
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      /* The 2nd and later words of a phrase only match if the previous
      ** word matched at the previous token position (prevMatch). */
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      /* If this word completes a phrase (or stands alone), log a match
      ** entry for every word of the phrase using the rotor offsets. */
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS2_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    /* Shift matches left one position so each bit lines up with the
    ** next word of its phrase on the following token. */
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
3050
3051
3052/*
3053** Compute all offsets for the current row of the query.
3054** If the offsets have already been computed, this routine is a no-op.
3055*/
3056static void snippetAllOffsets(fulltext_cursor *p){
3057 int nColumn;
3058 int iColumn, i;
3059 int iFirst, iLast;
3060 fulltext_vtab *pFts;
3061
3062 if( p->snippet.nMatch ) return;
3063 if( p->q.nTerms==0 ) return;
3064 pFts = p->q.pFts;
3065 nColumn = pFts->nColumn;
3066 iColumn = (p->iCursorType - QUERY_FULLTEXT);
3067 if( iColumn<0 || iColumn>=nColumn ){
3068 iFirst = 0;
3069 iLast = nColumn-1;
3070 }else{
3071 iFirst = iColumn;
3072 iLast = iColumn;
3073 }
3074 for(i=iFirst; i<=iLast; i++){
3075 const char *zDoc;
3076 int nDoc;
3077 zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
3078 nDoc = sqlite3_column_bytes(p->pStmt, i+1);
3079 snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
3080 }
3081}
3082
3083/*
3084** Convert the information in the aMatch[] array of the snippet
3085** into the string zOffset[0..nOffset-1].
3086*/
3087static void snippetOffsetText(Snippet *p){
3088 int i;
3089 int cnt = 0;
3090 StringBuffer sb;
3091 char zBuf[200];
3092 if( p->zOffset ) return;
3093 initStringBuffer(&sb);
3094 for(i=0; i<p->nMatch; i++){
3095 struct snippetMatch *pMatch = &p->aMatch[i];
3096 zBuf[0] = ' ';
3097 sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol,
3098 pMatch->iTerm, pMatch->iStart, pMatch->nByte);
3099 append(&sb, zBuf);
3100 cnt++;
3101 }
3102 p->zOffset = stringBufferData(&sb);
3103 p->nOffset = stringBufferLength(&sb);
3104}
3105
/*
** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  /* Within 10 bytes of either end of the document: snap to that end. */
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  /* Skip to the matches for this column, then to the first match that
  ** ends at or after iBreak. */
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    if( aMatch[i].iStart<iBreak+10 ){
      /* A match begins near iBreak: break exactly at its start so the
      ** matched word is not split. */
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      /* iBreak falls inside the previous match: back up to its start. */
      return aMatch[i-1].iStart;
    }
  }
  /* Otherwise scan up to 10 bytes on either side for whitespace and
  ** break just after it. */
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  /* No better spot found; accept the suggestion unchanged. */
  return iBreak;
}
3150
3151
3152
3153/*
3154** Allowed values for Snippet.aMatch[].snStatus
3155*/
3156#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
3157#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
3158
/*
** Generate the text of a snippet.
**
** The snippet is assembled from short excerpts of the document around
** each "desired" match (the first match of each query term, where one
** exists).  Matched words are wrapped in zStartMark/zEndMark and gaps
** between excerpts are marked with zEllipsis.  The result is stored in
** pCursor->snippet.zSnippet/nSnippet.
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;
  StringBuffer sb;
  int tailCol;
  int tailOffset;
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;
  int tailEllipsis = 0;
  int iMatch;


  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  /* Mark the first match of each query term as DESIRED; all other
  ** matches start out IGNOREd. */
  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;     /* column of the previous excerpt, or -1 if none */
  tailOffset = 0;   /* offset where the previous excerpt ended */
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    /* Start about 40 bytes before the match, snapped to a word
    ** boundary, and snapped to 0 if close to the start. */
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    /* Merge with the previous excerpt if they nearly touch. */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    /* Insert an ellipsis when this excerpt is discontinuous with the
    ** previous one. */
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    /* End about 40 bytes after the match, snapped to a word boundary,
    ** and snapped to the document end if close to it. */
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    /* Copy zDoc[iStart..iEnd-1] to the output, wrapping every match
    ** that falls in the excerpt in zStartMark/zEndMark. */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
             && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* Any later DESIRED match for the same term is now redundant;
        ** downgrade it so it does not generate its own excerpt. */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  /* Trailing ellipsis if the last excerpt stopped before document end. */
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  pCursor->snippet.zSnippet = stringBufferData(&sb);
  pCursor->snippet.nSnippet = stringBufferLength(&sb);
}
3271
3272
/*
** Close the cursor.  For additional information see the documentation
** on the xClose method of the virtual table interface.
**
** Releases the prepared statement, the parsed query, any snippet
** state, the doclist reader (only initialized when a full-text query
** produced results), the result buffer, and finally the cursor itself.
*/
static int fulltextClose(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  TRACE(("FTS2 Close %p\n", c));
  sqlite3_finalize(c->pStmt);
  queryClear(&c->q);
  snippetClear(&c->snippet);
  /* c->reader is live only while c->result holds data. */
  if( c->result.nData!=0 ) dlrDestroy(&c->reader);
  dataBufferDestroy(&c->result);
  free(c);
  return SQLITE_OK;
}
3288
/*
** Advance the cursor to the next row, setting c->eof when the result
** set is exhausted (the xNext method).
**
** For table scans and rowid lookups (iCursorType < QUERY_FULLTEXT) the
** prepared statement is simply stepped.  For full-text queries the
** statement is reset and re-run once per docid pulled from the doclist
** reader that fulltextFilter() set up.
*/
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  int rc;

  TRACE(("FTS2 Next %p\n", pCursor));
  /* Any snippet computed for the previous row is now stale. */
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    /* EOF when the doclist is empty or fully consumed. */
    if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
      c->eof = 1;
      return SQLITE_OK;
    }
    /* Bind the next matching docid and fetch that row from %_content. */
    rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
    dlrStep(&c->reader);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){  /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* an error occurred; abort.  SQLITE_DONE here means the docid from
    ** the index has no %_content row, which is itself an error. */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}
3330
3331
3332/* TODO(shess) If we pushed LeafReader to the top of the file, or to
3333** another file, term_select() could be pushed above
3334** docListOfTerm().
3335*/
3336static int termSelect(fulltext_vtab *v, int iColumn,
3337 const char *pTerm, int nTerm, int isPrefix,
3338 DocListType iType, DataBuffer *out);
3339
/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** The resulting DL_DOCIDS doclist is stored in pResult, which is
** overwritten.  On error, pResult is untouched and any intermediate
** buffers are destroyed before returning.
*/
static int docListOfTerm(
  fulltext_vtab *v,   /* The full text index */
  int iColumn,        /* column to restrict to.  No restriction if >=nColumn */
  QueryTerm *pQTerm,  /* Term we are looking for, or 1st term of a phrase */
  DataBuffer *pResult /* Write the result here */
){
  DataBuffer left, right, new;
  int i, rc;

  /* No phrase search if no position info. */
  assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  /* Fetch the doclist for the first (or only) word.  Position data is
  ** only needed if there are further phrase words to merge against. */
  dataBufferInit(&left, 0);
  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                  0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
  if( rc ) return rc;
  /* Merge in each subsequent phrase word, requiring adjacent positions.
  ** Stop early once the intermediate result is empty. */
  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
    dataBufferInit(&right, 0);
    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                    pQTerm[i].isPrefix, DL_POSITIONS, &right);
    if( rc ){
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    /* The final merge can drop position data (DL_DOCIDS); earlier
    ** merges must keep it for the next phrase word. */
    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                       i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
    dataBufferDestroy(&left);
    dataBufferDestroy(&right);
    left = new;
  }
  *pResult = left;
  return SQLITE_OK;
}
3384
3385/* Add a new term pTerm[0..nTerm-1] to the query *q.
3386*/
3387static void queryAdd(Query *q, const char *pTerm, int nTerm){
3388 QueryTerm *t;
3389 ++q->nTerms;
3390 q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
3391 if( q->pTerms==0 ){
3392 q->nTerms = 0;
3393 return;
3394 }
3395 t = &q->pTerms[q->nTerms - 1];
3396 CLEAR(t);
3397 t->pTerm = malloc(nTerm+1);
3398 memcpy(t->pTerm, pTerm, nTerm);
3399 t->pTerm[nTerm] = 0;
3400 t->nTerm = nTerm;
3401 t->isOr = q->nextIsOr;
3402 t->isPrefix = 0;
3403 q->nextIsOr = 0;
3404 t->iColumn = q->nextColumn;
3405 q->nextColumn = q->dfltColumn;
3406}
3407
3408/*
3409** Check to see if the string zToken[0...nToken-1] matches any
3410** column name in the virtual table. If it does,
3411** return the zero-indexed column number. If not, return -1.
3412*/
3413static int checkColumnSpecifier(
3414 fulltext_vtab *pVtab, /* The virtual table */
3415 const char *zToken, /* Text of the token */
3416 int nToken /* Number of characters in the token */
3417){
3418 int i;
3419 for(i=0; i<pVtab->nColumn; i++){
3420 if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
3421 && pVtab->azColumn[i][nToken]==0 ){
3422 return i;
3423 }
3424 }
3425 return -1;
3426}
3427
/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assemblied in pQuery.
**
** inPhrase is true if pSegment[0..nSegement-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;  /* index of the phrase's first term */
  int iCol;
  int nTerm = 1;                    /* 1-based position within a phrase */

  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    /* "colname:" selects the column for the immediately following term.
    ** NOTE(review): when iEnd==nSegment this reads one byte past the
    ** segment, which is within the caller's larger query string or its
    ** NUL terminator — confirm against parseQuery()'s slicing. */
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    /* A bare "OR" between terms flags the next term as an OR-term
    ** (only outside phrases, and never as the very first token). */
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    /* A '-' immediately before the term negates it (outside phrases). */
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    /* A '*' immediately after the term makes it a prefix match. */
    if( iEnd<nSegment && pSegment[iEnd]=='*' ){
      pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  /* Record on the phrase's first term how many additional terms follow. */
  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
3492
/* Parse a query string, yielding a Query object pQuery.
**
** The input is split on double-quote characters; segments alternate
** between normal text and quoted phrases, and each non-empty segment
** is handed to tokenizeSegment().  An unmatched quote is an error.
**
** The calling function will need to queryClear() to clean up
** the dynamically allocated memory held by pQuery.
*/
static int parseQuery(
  fulltext_vtab *v,        /* The fulltext index */
  const char *zInput,      /* Input text of the query string */
  int nInput,              /* Size of the input text */
  int dfltColumn,          /* Default column of the index to match against */
  Query *pQuery            /* Write the parse results here. */
){
  int iInput, inPhrase = 0;

  if( zInput==0 ) nInput = 0;
  if( nInput<0 ) nInput = strlen(zInput);  /* negative length = nul-terminated */
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->nextColumn = dfltColumn;
  pQuery->dfltColumn = dfltColumn;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    /* Find the next double-quote (or the end of the input). */
    for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
    if( i>iInput ){
      tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                       pQuery);
    }
    iInput = i;
    if( i<nInput ){
      assert( zInput[i]=='"' );
      inPhrase = !inPhrase;   /* toggle phrase mode at each quote */
    }
  }

  if( inPhrase ){
    /* unmatched quote */
    queryClear(pQuery);
    return SQLITE_ERROR;
  }
  return SQLITE_OK;
}
3537
3538/* TODO(shess) Refactor the code to remove this forward decl. */
3539static int flushPendingTerms(fulltext_vtab *v);
3540
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
**
** Processing happens in two passes: first all non-NOT terms are
** combined (OR-runs merged first, then AND-merged into the running
** result "left"), then all NOT terms are subtracted with EXCEPT
** merges.  Throughout, i==nNot identifies the first positive term, at
** which point "left" is not yet initialized.
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;
  int nNot = 0;          /* Number of NOT terms seen so far */
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */

  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  aTerm = pQuery->pTerms;
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    /* nPhrase+1 skips over the additional words of a phrase. */
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      /* i==nNot means no positive term has been processed yet, so
      ** "left" is uninitialized and must not be destroyed. */
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    /* Fold any immediately following OR terms into "right". */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){          /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  /* Ownership of "left" passes to the caller. */
  *pResult = left;
  return rc;
}
3652
3653/*
3654** This is the xFilter interface for the virtual table. See
3655** the virtual table xFilter method documentation for additional
3656** information.
3657**
3658** If idxNum==QUERY_GENERIC then do a full table scan against
3659** the %_content table.
3660**
3661** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
3662** in the %_content table.
3663**
3664** If idxNum>=QUERY_FULLTEXT then use the full text index. The
3665** column on the left-hand side of the MATCH operator is column
3666** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
3667** side of the MATCH operator.
3668*/
3669/* TODO(shess) Upgrade the cursor initialization and destruction to
3670** account for fulltextFilter() being called multiple times on the
3671** same cursor. The current solution is very fragile. Apply fix to
3672** fts2 as appropriate.
3673*/
3674static int fulltextFilter(
3675 sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
3676 int idxNum, const char *idxStr, /* Which indexing scheme to use */
3677 int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
3678){
3679 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3680 fulltext_vtab *v = cursor_vtab(c);
3681 int rc;
3682 char *zSql;
3683
3684 TRACE(("FTS2 Filter %p\n",pCursor));
3685
3686 zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
3687 idxNum==QUERY_GENERIC ? "" : "where rowid=?");
3688 sqlite3_finalize(c->pStmt);
3689 rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
3690 sqlite3_free(zSql);
3691 if( rc!=SQLITE_OK ) return rc;
3692
3693 c->iCursorType = idxNum;
3694 switch( idxNum ){
3695 case QUERY_GENERIC:
3696 break;
3697
3698 case QUERY_ROWID:
3699 rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
3700 if( rc!=SQLITE_OK ) return rc;
3701 break;
3702
3703 default: /* full-text search */
3704 {
3705 const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
3706 assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
3707 assert( argc==1 );
3708 queryClear(&c->q);
3709 if( c->result.nData!=0 ){
3710 /* This case happens if the same cursor is used repeatedly. */
3711 dlrDestroy(&c->reader);
3712 dataBufferReset(&c->result);
3713 }else{
3714 dataBufferInit(&c->result, 0);
3715 }
3716 rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
3717 if( rc!=SQLITE_OK ) return rc;
3718 if( c->result.nData!=0 ){
3719 dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
3720 }
3721 break;
3722 }
3723 }
3724
3725 return fulltextNext(pCursor);
3726}
3727
3728/* This is the xEof method of the virtual table. The SQLite core
3729** calls this routine to find out if it has reached the end of
3730** a query's results set.
3731*/
3732static int fulltextEof(sqlite3_vtab_cursor *pCursor){
3733 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3734 return c->eof;
3735}
3736
3737/* This is the xColumn method of the virtual table. The SQLite
3738** core calls this method during a query when it needs the value
3739** of a column from the virtual table. This method needs to use
3740** one of the sqlite3_result_*() routines to store the requested
3741** value back in the pContext.
3742*/
3743static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
3744 sqlite3_context *pContext, int idxCol){
3745 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3746 fulltext_vtab *v = cursor_vtab(c);
3747
3748 if( idxCol<v->nColumn ){
3749 sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
3750 sqlite3_result_value(pContext, pVal);
3751 }else if( idxCol==v->nColumn ){
3752 /* The extra column whose name is the same as the table.
3753 ** Return a blob which is a pointer to the cursor
3754 */
3755 sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
3756 }
3757 return SQLITE_OK;
3758}
3759
3760/* This is the xRowid method. The SQLite core calls this routine to
3761** retrive the rowid for the current row of the result set. The
3762** rowid should be written to *pRowid.
3763*/
3764static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
3765 fulltext_cursor *c = (fulltext_cursor *) pCursor;
3766
3767 *pRowid = sqlite3_column_int64(c->pStmt, 0);
3768 return SQLITE_OK;
3769}
3770
/* Tokenize [zText] and add a doclist entry under docid [iDocid] to the
** pendingTerms hash table for each term.  If [iColumn] is
** non-negative, positions and offsets are also recorded against that
** column; callers pass iColumn==-1 to queue empty doclists (used when
** deleting a row).
*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  /* NOTE(review): the loop ends on any non-OK xNext() return, so a
  ** genuine tokenizer error is indistinguishable from end-of-text
  ** here; rc remains SQLITE_OK from xOpen() on that path.
  */
  while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                                               &pToken, &nTokenBytes,
                                               &iStartOffset, &iEndOffset,
                                               &iPosition) ){
    DLCollector *p;
    int nData;                   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator internally. */
    if( iPosition<0 ){
      pTokenizer->pModule->xClose(pCursor);
      return SQLITE_ERROR;
    }

    p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      /* First time this term is seen: start a new doclist collector. */
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      /* Open a new docid entry only once per (term, docid) pair. */
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  return rc;
}
3829
3830/* Add doclists for all terms in [pValues] to pendingTerms table. */
3831static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid,
3832 sqlite3_value **pValues){
3833 int i;
3834 for(i = 0; i < v->nColumn ; ++i){
3835 char *zText = (char*)sqlite3_value_text(pValues[i]);
3836 int rc = buildTerms(v, iRowid, zText, i);
3837 if( rc!=SQLITE_OK ) return rc;
3838 }
3839 return SQLITE_OK;
3840}
3841
3842/* Add empty doclists for all terms in the given row's content to
3843** pendingTerms.
3844*/
3845static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){
3846 const char **pValues;
3847 int i, rc;
3848
3849 /* TODO(shess) Should we allow such tables at all? */
3850 if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
3851
3852 rc = content_select(v, iRowid, &pValues);
3853 if( rc!=SQLITE_OK ) return rc;
3854
3855 for(i = 0 ; i < v->nColumn; ++i) {
3856 rc = buildTerms(v, iRowid, pValues[i], -1);
3857 if( rc!=SQLITE_OK ) break;
3858 }
3859
3860 freeStringArray(v->nColumn, pValues);
3861 return SQLITE_OK;
3862}
3863
3864/* TODO(shess) Refactor the code to remove this forward decl. */
3865static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
3866
3867/* Insert a row into the %_content table; set *piRowid to be the ID of the
3868** new row. Add doclists for terms to pendingTerms.
3869*/
3870static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
3871 sqlite3_value **pValues, sqlite_int64 *piRowid){
3872 int rc;
3873
3874 rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
3875 if( rc!=SQLITE_OK ) return rc;
3876
3877 *piRowid = sqlite3_last_insert_rowid(v->db);
3878 rc = initPendingTerms(v, *piRowid);
3879 if( rc!=SQLITE_OK ) return rc;
3880
3881 return insertTerms(v, *piRowid, pValues);
3882}
3883
3884/* Delete a row from the %_content table; add empty doclists for terms
3885** to pendingTerms.
3886*/
3887static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
3888 int rc = initPendingTerms(v, iRow);
3889 if( rc!=SQLITE_OK ) return rc;
3890
3891 rc = deleteTerms(v, iRow);
3892 if( rc!=SQLITE_OK ) return rc;
3893
3894 return content_delete(v, iRow); /* execute an SQL DELETE */
3895}
3896
3897/* Update a row in the %_content table; add delete doclists to
3898** pendingTerms for old terms not in the new data, add insert doclists
3899** to pendingTerms for terms in the new data.
3900*/
3901static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
3902 sqlite3_value **pValues){
3903 int rc = initPendingTerms(v, iRow);
3904 if( rc!=SQLITE_OK ) return rc;
3905
3906 /* Generate an empty doclist for each term that previously appeared in this
3907 * row. */
3908 rc = deleteTerms(v, iRow);
3909 if( rc!=SQLITE_OK ) return rc;
3910
3911 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
3912 if( rc!=SQLITE_OK ) return rc;
3913
3914 /* Now add positions for terms which appear in the updated row. */
3915 return insertTerms(v, iRow, pValues);
3916}
3917
3918/*******************************************************************/
3919/* InteriorWriter is used to collect terms and block references into
3920** interior nodes in %_segments. See commentary at top of file for
3921** format.
3922*/
3923
3924/* How large interior nodes can grow. */
3925#define INTERIOR_MAX 2048
3926
3927/* Minimum number of terms per interior node (except the root). This
3928** prevents large terms from making the tree too skinny - must be >0
3929** so that the tree always makes progress. Note that the min tree
3930** fanout will be INTERIOR_MIN_TERMS+1.
3931*/
3932#define INTERIOR_MIN_TERMS 7
3933#if INTERIOR_MIN_TERMS<1
3934# error INTERIOR_MIN_TERMS must be greater than 0.
3935#endif
3936
3937/* ROOT_MAX controls how much data is stored inline in the segment
3938** directory.
3939*/
3940/* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's
3941** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
3942** can both see it, but if the caller passed it in, we wouldn't even
3943** need a define.
3944*/
3945#define ROOT_MAX 1024
3946#if ROOT_MAX<VARINT_MAX*2
3947# error ROOT_MAX must have enough space for a header.
3948#endif
3949
/* InteriorBlock stores a linked-list of interior blocks while a lower
** layer is being constructed.
*/
typedef struct InteriorBlock {
  DataBuffer term;             /* Leftmost term in block's subtree. */
  DataBuffer data;             /* Accumulated data for the block. */
  struct InteriorBlock *next;  /* Next block at this level, or NULL. */
} InteriorBlock;
3958
/* Allocate a new InteriorBlock at height iHeight whose leftmost child
** is iChildBlock and whose leftmost term is pTerm[0..nTerm-1].  The
** data buffer is seeded with the node header: varint(iHeight)
** followed by varint(iChildBlock).
**
** NOTE(review): the calloc() result is unchecked; on OOM the
** dataBufferInit() call below dereferences NULL.  Callers do not
** check for NULL either — confirm intended OOM policy.
*/
static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
                                       const char *pTerm, int nTerm){
  InteriorBlock *block = calloc(1, sizeof(InteriorBlock));
  char c[VARINT_MAX+VARINT_MAX];
  int n;

  dataBufferInit(&block->term, 0);
  dataBufferReplace(&block->term, pTerm, nTerm);

  n = putVarint(c, iHeight);
  n += putVarint(c+n, iChildBlock);
  dataBufferInit(&block->data, INTERIOR_MAX);
  dataBufferReplace(&block->data, c, n);

  return block;
}
3975
#ifndef NDEBUG
/* Verify that the data is readable as an interior node: a height
** varint, a child-blockid varint, then zero or more terms (first term
** encoded whole, the rest prefix/suffix delta-encoded).  Debug builds
** only; compiled out under NDEBUG.
*/
static void interiorBlockValidate(InteriorBlock *pBlock){
  const char *pData = pBlock->data.pData;
  int nData = pBlock->data.nData;
  int n, iDummy;
  sqlite_int64 iBlockid;

  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with height of node as a varint(n), n>0 */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Must contain iBlockid. */
  n = getVarint(pData, &iBlockid);
  assert( n>0 );
  assert( n<=nData );
  pData += n;
  nData -= n;

  /* Zero or more terms of positive length */
  if( nData!=0 ){
    /* First term is not delta-encoded. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0);
    assert( n+iDummy<=nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Following terms delta-encoded. */
    while( nData!=0 ){
      /* Length of shared prefix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>=0 );
      assert( n<nData );
      pData += n;
      nData -= n;

      /* Length and data of distinct suffix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>0 );
      assert( n+iDummy>0);
      assert( n+iDummy<=nData );
      pData += n+iDummy;
      nData -= n+iDummy;
    }
  }
}
#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
#else
#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
#endif
4039
/* Accumulates one level of interior nodes; parentWriter is created
** lazily when this level itself overflows (see
** interiorWriterRootInfo()).
*/
typedef struct InteriorWriter {
  int iHeight;                   /* from 0 at leaves. */
  InteriorBlock *first, *last;   /* linked list of blocks at this level */
  struct InteriorWriter *parentWriter;  /* next level up, or NULL */

  DataBuffer term;               /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;  /* for consistency checks. */
#endif
} InteriorWriter;
4051
4052/* Initialize an interior node where pTerm[nTerm] marks the leftmost
4053** term in the tree. iChildBlock is the leftmost child block at the
4054** next level down the tree.
4055*/
4056static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
4057 sqlite_int64 iChildBlock,
4058 InteriorWriter *pWriter){
4059 InteriorBlock *block;
4060 assert( iHeight>0 );
4061 CLEAR(pWriter);
4062
4063 pWriter->iHeight = iHeight;
4064 pWriter->iOpeningChildBlock = iChildBlock;
4065#ifndef NDEBUG
4066 pWriter->iLastChildBlock = iChildBlock;
4067#endif
4068 block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
4069 pWriter->last = pWriter->first = block;
4070 ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
4071 dataBufferInit(&pWriter->term, 0);
4072}
4073
/* Append the child node rooted at iChildBlock to the interior node,
** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
** Terms are prefix-delta-encoded against the previous term; when the
** current block would exceed INTERIOR_MAX (and holds at least
** INTERIOR_MIN_TERMS children), a new block is started instead.
*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function).  That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    n = putVarint(c, nTerm);
  }else{
    /* Delta-encode: varint(nPrefix) varint(nSuffix) suffix-bytes. */
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  /* Children must be appended with sequential blockids. */
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    /* Reset so the next term is encoded whole against the new block. */
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
4125
4126/* Free the space used by pWriter, including the linked-list of
4127** InteriorBlocks, and parentWriter, if present.
4128*/
4129static int interiorWriterDestroy(InteriorWriter *pWriter){
4130 InteriorBlock *block = pWriter->first;
4131
4132 while( block!=NULL ){
4133 InteriorBlock *b = block;
4134 block = block->next;
4135 dataBufferDestroy(&b->term);
4136 dataBufferDestroy(&b->data);
4137 free(b);
4138 }
4139 if( pWriter->parentWriter!=NULL ){
4140 interiorWriterDestroy(pWriter->parentWriter);
4141 free(pWriter->parentWriter);
4142 }
4143 dataBufferDestroy(&pWriter->term);
4144 SCRAMBLE(pWriter);
4145 return SQLITE_OK;
4146}
4147
4148/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
4149** directly, leaving *piEndBlockid unchanged. Otherwise, flush
4150** pWriter to %_segments, building a new layer of interior nodes, and
4151** recursively ask for their root into.
4152*/
4153static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
4154 char **ppRootInfo, int *pnRootInfo,
4155 sqlite_int64 *piEndBlockid){
4156 InteriorBlock *block = pWriter->first;
4157 sqlite_int64 iBlockid = 0;
4158 int rc;
4159
4160 /* If we can fit the segment inline */
4161 if( block==pWriter->last && block->data.nData<ROOT_MAX ){
4162 *ppRootInfo = block->data.pData;
4163 *pnRootInfo = block->data.nData;
4164 return SQLITE_OK;
4165 }
4166
4167 /* Flush the first block to %_segments, and create a new level of
4168 ** interior node.
4169 */
4170 ASSERT_VALID_INTERIOR_BLOCK(block);
4171 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4172 if( rc!=SQLITE_OK ) return rc;
4173 *piEndBlockid = iBlockid;
4174
4175 pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
4176 interiorWriterInit(pWriter->iHeight+1,
4177 block->term.pData, block->term.nData,
4178 iBlockid, pWriter->parentWriter);
4179
4180 /* Flush additional blocks and append to the higher interior
4181 ** node.
4182 */
4183 for(block=block->next; block!=NULL; block=block->next){
4184 ASSERT_VALID_INTERIOR_BLOCK(block);
4185 rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
4186 if( rc!=SQLITE_OK ) return rc;
4187 *piEndBlockid = iBlockid;
4188
4189 interiorWriterAppend(pWriter->parentWriter,
4190 block->term.pData, block->term.nData, iBlockid);
4191 }
4192
4193 /* Parent node gets the chance to be the root. */
4194 return interiorWriterRootInfo(v, pWriter->parentWriter,
4195 ppRootInfo, pnRootInfo, piEndBlockid);
4196}
4197
4198/****************************************************************/
4199/* InteriorReader is used to read off the data from an interior node
4200** (see comment at top of file for the format).
4201*/
4202typedef struct InteriorReader {
4203 const char *pData;
4204 int nData;
4205
4206 DataBuffer term; /* previous term, for decoding term delta. */
4207
4208 sqlite_int64 iBlockid;
4209} InteriorReader;
4210
/* Release the reader's term buffer and poison its memory. */
static void interiorReaderDestroy(InteriorReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}
4215
/* TODO(shess) The assertions are great, but what if we're in NDEBUG
** and the blob is empty or otherwise contains suspect data?
*/
/* Set up pReader to iterate pData[nData], an interior node.  The
** first byte is the node-height varint (non-zero for interior
** nodes), followed by the base child blockid and the first term.
*/
static void interiorReaderInit(const char *pData, int nData,
                               InteriorReader *pReader){
  int n, nTerm;

  /* Require at least the leading flag byte */
  assert( nData>0 );
  assert( pData[0]!='\0' );

  CLEAR(pReader);

  /* Decode the base blockid, and set the cursor to the first term. */
  n = getVarint(pData+1, &pReader->iBlockid);
  assert( 1+n<=nData );
  pReader->pData = pData+1+n;
  pReader->nData = nData-(1+n);

  /* A single-child interior node (such as when a leaf node was too
  ** large for the segment directory) won't have any terms.
  ** Otherwise, decode the first term.
  */
  if( pReader->nData==0 ){
    dataBufferInit(&pReader->term, 0);
  }else{
    n = getVarint32(pReader->pData, &nTerm);
    dataBufferInit(&pReader->term, nTerm);
    dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
    assert( n+nTerm<=pReader->nData );
    pReader->pData += n+nTerm;
    pReader->nData -= n+nTerm;
  }
}
4250
/* True once the reader has delivered its last term; the term buffer
** is reset to zero length by interiorReaderStep() at EOF.
*/
static int interiorReaderAtEnd(InteriorReader *pReader){
  return pReader->term.nData==0;
}

/* Blockid of the child node associated with the current term. */
static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
  return pReader->iBlockid;
}

/* Length in bytes of the current term; only valid before EOF. */
static int interiorReaderTermBytes(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.nData;
}
/* Pointer to the current term's bytes; only valid before EOF. */
static const char *interiorReaderTerm(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.pData;
}
4267
/* Step forward to the next term in the node, advancing iBlockid to
** the child block associated with the new term.  When the node's
** data is exhausted the term buffer is reset, which
** interiorReaderAtEnd() reports as EOF.
*/
static void interiorReaderStep(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );

  /* If the last term has been read, signal eof, else construct the
  ** next term.
  */
  if( pReader->nData==0 ){
    dataBufferReset(&pReader->term);
  }else{
    int n, nPrefix, nSuffix;

    /* Terms after the first are delta-encoded: shared-prefix length,
    ** then suffix length and bytes.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);

    /* Truncate the current term and append suffix data. */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    assert( n+nSuffix<=pReader->nData );
    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
  pReader->iBlockid++;
}
4293
4294/* Compare the current term to pTerm[nTerm], returning strcmp-style
4295** results. If isPrefix, equality means equal through nTerm bytes.
4296*/
4297static int interiorReaderTermCmp(InteriorReader *pReader,
4298 const char *pTerm, int nTerm, int isPrefix){
4299 const char *pReaderTerm = interiorReaderTerm(pReader);
4300 int nReaderTerm = interiorReaderTermBytes(pReader);
4301 int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
4302
4303 if( n==0 ){
4304 if( nReaderTerm>0 ) return -1;
4305 if( nTerm>0 ) return 1;
4306 return 0;
4307 }
4308
4309 c = memcmp(pReaderTerm, pTerm, n);
4310 if( c!=0 ) return c;
4311 if( isPrefix && n==nTerm ) return 0;
4312 return nReaderTerm - nTerm;
4313}
4314
4315/****************************************************************/
4316/* LeafWriter is used to collect terms and associated doclist data
4317** into leaf blocks in %_segments (see top of file for format info).
4318** Expected usage is:
4319**
4320** LeafWriter writer;
4321** leafWriterInit(0, 0, &writer);
4322** while( sorted_terms_left_to_process ){
4323** // data is doclist data for that term.
4324** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
4325** if( rc!=SQLITE_OK ) goto err;
4326** }
4327** rc = leafWriterFinalize(v, &writer);
4328**err:
4329** leafWriterDestroy(&writer);
4330** return rc;
4331**
4332** leafWriterStep() may write a collected leaf out to %_segments.
4333** leafWriterFinalize() finishes writing any buffered data and stores
4334** a root node in %_segdir. leafWriterDestroy() frees all buffers and
4335** InteriorWriters allocated as part of writing this segment.
4336**
4337** TODO(shess) Document leafWriterStepMerge().
4338*/
4339
4340/* Put terms with data this big in their own block. */
4341#define STANDALONE_MIN 1024
4342
4343/* Keep leaf blocks below this size. */
4344#define LEAF_MAX 2048
4345
/* LeafWriter state: collects doclist data for sorted terms into leaf
** nodes, flushing completed nodes to %_segments and delegating
** overflow to an InteriorWriter.
*/
typedef struct LeafWriter {
  int iLevel;                 /* passed through to segdir_set() */
  int idx;                    /* passed through to segdir_set() */
  sqlite_int64 iStartBlockid;  /* needed to create the root info */
  sqlite_int64 iEndBlockid;    /* when we're done writing. */

  DataBuffer term;            /* previous encoded term */
  DataBuffer data;            /* encoding buffer */

  /* bytes of first term in the current node which distinguishes that
  ** term from the last term of the previous node.
  */
  int nTermDistinct;

  InteriorWriter parentWriter; /* if we overflow */
  int has_parent;              /* true once parentWriter is initialized */
} LeafWriter;
4363
4364static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
4365 CLEAR(pWriter);
4366 pWriter->iLevel = iLevel;
4367 pWriter->idx = idx;
4368
4369 dataBufferInit(&pWriter->term, 32);
4370
4371 /* Start out with a reasonably sized block, though it can grow. */
4372 dataBufferInit(&pWriter->data, LEAF_MAX);
4373}
4374
#ifndef NDEBUG
/* Verify that the data is readable as a leaf node: a varint(0) flag,
** then one or more (term, doclist) pairs, with terms after the first
** prefix/suffix delta-encoded.  Debug builds only; compiled out under
** NDEBUG.
*/
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = getVarint32(pData, &iDummy);
  assert( iDummy==0 );
  assert( n>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Leading term length and data must fit in buffer. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    /* Shared-prefix length of the delta-encoded term. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;
    /* Suffix length and bytes. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Doclist length and data. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif
4442
/* Flush the leaf node at pWriter->data.pData[iData..iData+nData) to
** %_segments, adding the resulting blockid and the starting term to
** the interior node which will contain it.
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.  The +1 skips the node's varint(0) flag byte.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  /* Only the distinguishing prefix is needed by the interior node. */
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    /* First flushed leaf: create the parent interior writer. */
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes. */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    pWriter->iEndBlockid++;
    /* Leaf blockids for a segment are expected to be sequential. */
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
4494static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
4495 int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
4496 if( rc!=SQLITE_OK ) return rc;
4497
4498 /* Re-initialize the output buffer. */
4499 dataBufferReset(&pWriter->data);
4500
4501 return SQLITE_OK;
4502}
4503
/* Fetch the root info for the segment.  If the entire leaf fits
** within ROOT_MAX, then it will be returned directly, otherwise it
** will be flushed and the root info will be returned from the
** interior node.  *piEndBlockid is set to the blockid of the last
** interior or leaf node written to disk (0 if none are written at
** all).
*/
static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                              char **ppRootInfo, int *pnRootInfo,
                              sqlite_int64 *piEndBlockid){
  /* we can fit the segment entirely inline */
  if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
    *ppRootInfo = pWriter->data.pData;
    *pnRootInfo = pWriter->data.nData;
    *piEndBlockid = 0;
    return SQLITE_OK;
  }

  /* Flush remaining leaf data. */
  if( pWriter->data.nData>0 ){
    int rc = leafWriterFlush(v, pWriter);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* We must have flushed a leaf at some point. */
  assert( pWriter->has_parent );

  /* Tentatively set the end leaf blockid as the end blockid.  If the
  ** interior node can be returned inline, this will be the final
  ** blockid, otherwise it will be overwritten by
  ** interiorWriterRootInfo().
  */
  *piEndBlockid = pWriter->iEndBlockid;

  return interiorWriterRootInfo(v, &pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
4541
4542/* Collect the rootInfo data and store it into the segment directory.
4543** This has the effect of flushing the segment's leaf data to
4544** %_segments, and also flushing any interior nodes to %_segments.
4545*/
4546static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
4547 sqlite_int64 iEndBlockid;
4548 char *pRootInfo;
4549 int rc, nRootInfo;
4550
4551 rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
4552 if( rc!=SQLITE_OK ) return rc;
4553
4554 /* Don't bother storing an entirely empty segment. */
4555 if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
4556
4557 return segdir_set(v, pWriter->iLevel, pWriter->idx,
4558 pWriter->iStartBlockid, pWriter->iEndBlockid,
4559 iEndBlockid, pRootInfo, nRootInfo);
4560}
4561
4562static void leafWriterDestroy(LeafWriter *pWriter){
4563 if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
4564 dataBufferDestroy(&pWriter->term);
4565 dataBufferDestroy(&pWriter->data);
4566}
4567
4568/* Encode a term into the leafWriter, delta-encoding as appropriate.
4569** Returns the length of the new term which distinguishes it from the
4570** previous term, which can be used to set nTermDistinct when a node
4571** boundary is crossed.
4572*/
4573static int leafWriterEncodeTerm(LeafWriter *pWriter,
4574 const char *pTerm, int nTerm){
4575 char c[VARINT_MAX+VARINT_MAX];
4576 int n, nPrefix = 0;
4577
4578 assert( nTerm>0 );
4579 while( nPrefix<pWriter->term.nData &&
4580 pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
4581 nPrefix++;
4582 /* Failing this implies that the terms weren't in order. */
4583 assert( nPrefix<nTerm );
4584 }
4585
4586 if( pWriter->data.nData==0 ){
4587 /* Encode the node header and leading term as:
4588 ** varint(0)
4589 ** varint(nTerm)
4590 ** char pTerm[nTerm]
4591 */
4592 n = putVarint(c, '\0');
4593 n += putVarint(c+n, nTerm);
4594 dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
4595 }else{
4596 /* Delta-encode the term as:
4597 ** varint(nPrefix)
4598 ** varint(nSuffix)
4599 ** char pTermSuffix[nSuffix]
4600 */
4601 n = putVarint(c, nPrefix);
4602 n += putVarint(c+n, nTerm-nPrefix);
4603 dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
4604 }
4605 dataBufferReplace(&pWriter->term, pTerm, nTerm);
4606
4607 return nPrefix+1;
4608}
4609
/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer.  This constructs a node and term header before
** iDoclistData and flushes the resulting complete node using
** leafWriterInternalFlush().
*/
static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 int iDoclistData){
  char c[VARINT_MAX+VARINT_MAX];
  /* Header is varint(0) flag followed by varint(nTerm). */
  int iData, n = putVarint(c, 0);
  n += putVarint(c+n, nTerm);

  /* There should always be room for the header.  Even if pTerm shared
  ** a substantial prefix with the previous term, the entire prefix
  ** could be constructed from earlier data in the doclist, so there
  ** should be room.
  */
  assert( iDoclistData>=n+nTerm );

  /* Overwrite the bytes just before the doclist with the header and
  ** the full (un-delta-encoded) term, forming a complete node.
  */
  iData = iDoclistData-(n+nTerm);
  memcpy(pWriter->data.pData+iData, c, n);
  memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);

  return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
}
4635
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.
**
** The doclists from pReaders[0..nReaders-1] are merged into a single
** doclist for the term.  Since the merged length is only known after
** the merge, a worst-case varint length is reserved first and then
** fixed up afterwards (either by memmove for small doclists, or by
** flushing inline for standalone-sized ones).
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = putVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded.  Additionally, the space required to
  ** encode this length could be smaller.  For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = putVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough to force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
4760
4761/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
4762** %_segments.
4763*/
4764/* TODO(shess) Revise writeZeroSegment() so that doclists are
4765** constructed directly in pWriter->data.
4766*/
4767static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
4768 const char *pTerm, int nTerm,
4769 const char *pData, int nData){
4770 int rc;
4771 DLReader reader;
4772
4773 dlrInit(&reader, DL_DEFAULT, pData, nData);
4774 rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
4775 dlrDestroy(&reader);
4776
4777 return rc;
4778}
4779
4780
/****************************************************************/
/* LeafReader is used to iterate over an individual leaf node. */
typedef struct LeafReader {
  DataBuffer term;          /* copy of current term. */

  const char *pData;        /* current term's doclist (varint length first). */
  int nData;                /* bytes of node data remaining at pData. */
} LeafReader;
4789
/* Release pReader's term buffer and scrub the structure.  SCRAMBLE
** presumably poisons the memory to surface use-after-destroy bugs --
** see its definition elsewhere in this file.
*/
static void leafReaderDestroy(LeafReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}
4794
4795static int leafReaderAtEnd(LeafReader *pReader){
4796 return pReader->nData<=0;
4797}
4798
/* Access the current term. */
static int leafReaderTermBytes(LeafReader *pReader){
  return pReader->term.nData;
}
/* Pointer to the current term's bytes; only meaningful while a term
** is loaded (term.nData>0).
*/
static const char *leafReaderTerm(LeafReader *pReader){
  assert( pReader->term.nData>0 );
  return pReader->term.pData;
}
4807
/* Access the doclist data for the current term. */
/* Length of the current term's doclist, excluding its varint length
** prefix.
*/
static int leafReaderDataBytes(LeafReader *pReader){
  int nData;
  assert( pReader->term.nData>0 );
  getVarint32(pReader->pData, &nData);
  return nData;
}
/* Pointer to the doclist bytes, positioned just past the varint
** length prefix.
*/
static const char *leafReaderData(LeafReader *pReader){
  int n, nData;
  assert( pReader->term.nData>0 );
  n = getVarint32(pReader->pData, &nData);
  return pReader->pData+n;
}
4821
/* Initialize pReader to iterate over the leaf node in pData[nData].
** The node must begin with the 0x00 leaf-node header byte.  The first
** term is copied into pReader->term; pData/nData are left pointing at
** that term's doclist.
*/
static void leafReaderInit(const char *pData, int nData,
                           LeafReader *pReader){
  int nTerm, n;

  assert( nData>0 );
  assert( pData[0]=='\0' );

  CLEAR(pReader);

  /* Read the first term, skipping the header byte. */
  n = getVarint32(pData+1, &nTerm);
  dataBufferInit(&pReader->term, nTerm);
  dataBufferReplace(&pReader->term, pData+1+n, nTerm);

  /* Position after the first term. */
  assert( 1+n+nTerm<nData );
  pReader->pData = pData+1+n+nTerm;
  pReader->nData = nData-1-n-nTerm;
}
4841
/* Step the reader forward to the next term. */
static void leafReaderStep(LeafReader *pReader){
  int n, nData, nPrefix, nSuffix;
  assert( !leafReaderAtEnd(pReader) );

  /* Skip previous entry's data block. */
  n = getVarint32(pReader->pData, &nData);
  assert( n+nData<=pReader->nData );
  pReader->pData += n+nData;
  pReader->nData -= n+nData;

  if( !leafReaderAtEnd(pReader) ){
    /* Construct the new term using a prefix from the old term plus a
    ** suffix from the leaf data.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);
    assert( n+nSuffix<pReader->nData );
    /* Truncate the stored term to the shared prefix, then append the
    ** new suffix (the writer bounds nPrefix by the previous term's
    ** length -- see leafWriterEncodeTerm()).
    */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
}
4867
/* strcmp-style comparison of pReader's current term against pTerm.
** If isPrefix, equality means equal through nTerm bytes.
**
** NOTE(review): when exactly one term is empty the sign is the
** opposite of strcmp() -- a non-empty reader term sorts BEFORE an
** empty pTerm.  This is the shipped behavior; confirm callers rely
** on it before changing.
*/
static int leafReaderTermCmp(LeafReader *pReader,
                             const char *pTerm, int nTerm, int isPrefix){
  int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
  if( n==0 ){
    if( pReader->term.nData>0 ) return -1;
    if(nTerm>0 ) return 1;
    return 0;
  }

  c = memcmp(pReader->term.pData, pTerm, n);
  if( c!=0 ) return c;
  /* For prefix matches, equality through nTerm bytes is enough. */
  if( isPrefix && n==nTerm ) return 0;
  return pReader->term.nData - nTerm;
}
4885
4886
/****************************************************************/
/* LeavesReader wraps LeafReader to allow iterating over the entire
** leaf layer of the tree.
*/
typedef struct LeavesReader {
  int idx;                  /* Index within the segment. */

  sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from. */
  int eof;                  /* we've seen SQLITE_DONE from pStmt. */

  LeafReader leafReader;    /* reader for the current leaf. */
  DataBuffer rootData;      /* root data for inline (non-NULL pData
                            ** means the leaf level fit in the root). */
} LeavesReader;
4900
/* Access the current term.  These delegate to the embedded
** LeafReader; callers must check leavesReaderAtEnd() first.
*/
static int leavesReaderTermBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTermBytes(&pReader->leafReader);
}
static const char *leavesReaderTerm(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTerm(&pReader->leafReader);
}

/* Access the doclist data for the current term. */
static int leavesReaderDataBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderDataBytes(&pReader->leafReader);
}
static const char *leavesReaderData(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderData(&pReader->leafReader);
}

/* True once the last leaf in the segment has been consumed. */
static int leavesReaderAtEnd(LeavesReader *pReader){
  return pReader->eof;
}
4924
/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
** leaving the statement handle open, which locks the table.
*/
/* TODO(shess) This "solution" is not satisfactory.  Really, there
** should be check-in function for all statement handles which
** arranges to call sqlite3_reset().  This most likely will require
** modification to control flow all over the place, though, so for now
** just punt.
**
** Note that the current system assumes that segment merges will run
** to completion, which is why this particular problem hasn't arisen
** in this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  /* pStmt is NULL when the reader was initialized from in-line root
  ** data; sqlite3_reset() treats a NULL handle as a harmless no-op.
  */
  return sqlite3_reset(pReader->pStmt);
}
4941
4942static void leavesReaderDestroy(LeavesReader *pReader){
4943 leafReaderDestroy(&pReader->leafReader);
4944 dataBufferDestroy(&pReader->rootData);
4945 SCRAMBLE(pReader);
4946}
4947
/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
**
** idx is the segment's position within its level, retained for
** ordering (see leavesReaderCmp()).  An empty block range leaves the
** reader at eof with SQLITE_OK.
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    /* No rows in range: the statement has already run to completion
    ** (holds no lock), so just mark the reader at eof.
    */
    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    /* Keep the statement open; leavesReaderStep() continues stepping
    ** it one leaf blob at a time.
    */
    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
4992
/* Step the current leaf forward to the next term.  If we reach the
** end of the current leaf, step forward to the next leaf block.
*/
static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
  assert( !leavesReaderAtEnd(pReader) );
  leafReaderStep(&pReader->leafReader);

  if( leafReaderAtEnd(&pReader->leafReader) ){
    int rc;
    /* Non-NULL rootData.pData means the whole leaf level was stored
    ** in-line, so there is no next block to read.
    */
    if( pReader->rootData.pData ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_step(pReader->pStmt);
    if( rc!=SQLITE_ROW ){
      pReader->eof = 1;
      return rc==SQLITE_DONE ? SQLITE_OK : rc;
    }
    /* Re-point the leaf reader at the next block's blob. */
    leafReaderDestroy(&pReader->leafReader);
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
5018
5019/* Order LeavesReaders by their term, ignoring idx. Readers at eof
5020** always sort to the end.
5021*/
5022static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
5023 if( leavesReaderAtEnd(lr1) ){
5024 if( leavesReaderAtEnd(lr2) ) return 0;
5025 return 1;
5026 }
5027 if( leavesReaderAtEnd(lr2) ) return -1;
5028
5029 return leafReaderTermCmp(&lr1->leafReader,
5030 leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
5031 0);
5032}
5033
5034/* Similar to leavesReaderTermCmp(), with additional ordering by idx
5035** so that older segments sort before newer segments.
5036*/
5037static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
5038 int c = leavesReaderTermCmp(lr1, lr2);
5039 if( c!=0 ) return c;
5040 return lr1->idx-lr2->idx;
5041}
5042
5043/* Assume that pLr[1]..pLr[nLr] are sorted. Bubble pLr[0] into its
5044** sorted position.
5045*/
5046static void leavesReaderReorder(LeavesReader *pLr, int nLr){
5047 while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
5048 LeavesReader tmp = pLr[0];
5049 pLr[0] = pLr[1];
5050 pLr[1] = tmp;
5051 nLr--;
5052 pLr++;
5053 }
5054}
5055
/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order.  On error, any readers already initialized are destroyed.
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  /* One reader per %_segdir row at this level. */
  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  /* Loop exits with SQLITE_DONE only on full success; otherwise
  ** unwind the readers built so far.
  */
  if( rc!=SQLITE_DONE ){
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age.  Each pass bubbles
  ** one element into place against the already-sorted tail.
  */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}
5098
5099/* Merge doclists from pReaders[nReaders] into a single doclist, which
5100** is written to pWriter. Assumes pReaders is ordered oldest to
5101** newest.
5102*/
5103/* TODO(shess) Consider putting this inline in segmentMerge(). */
5104static int leavesReadersMerge(fulltext_vtab *v,
5105 LeavesReader *pReaders, int nReaders,
5106 LeafWriter *pWriter){
5107 DLReader dlReaders[MERGE_COUNT];
5108 const char *pTerm = leavesReaderTerm(pReaders);
5109 int i, nTerm = leavesReaderTermBytes(pReaders);
5110
5111 assert( nReaders<=MERGE_COUNT );
5112
5113 for(i=0; i<nReaders; i++){
5114 dlrInit(&dlReaders[i], DL_DEFAULT,
5115 leavesReaderData(pReaders+i),
5116 leavesReaderDataBytes(pReaders+i));
5117 }
5118
5119 return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
5120}
5121
5122/* Forward ref due to mutual recursion with segdirNextIndex(). */
5123static int segmentMerge(fulltext_vtab *v, int iLevel);
5124
5125/* Put the next available index at iLevel into *pidx. If iLevel
5126** already has MERGE_COUNT segments, they are merged to a higher
5127** level to make room.
5128*/
5129static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
5130 int rc = segdir_max_index(v, iLevel, pidx);
5131 if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
5132 *pidx = 0;
5133 }else if( rc==SQLITE_ROW ){
5134 if( *pidx==(MERGE_COUNT-1) ){
5135 rc = segmentMerge(v, iLevel);
5136 if( rc!=SQLITE_OK ) return rc;
5137 *pidx = 0;
5138 }else{
5139 (*pidx)++;
5140 }
5141 }else{
5142 return rc;
5143 }
5144 return SQLITE_OK;
5145}
5146
/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged to make room (mutual recursion via segdirNextIndex()).
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term. */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged. */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}
5214
/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  assert( nTerm>0 );

  /* Process while the prefix matches. */
  while( !leavesReaderAtEnd(pReader) ){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int rc;
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int nData = leavesReaderDataBytes(pReader);
      if( out->nData==0 ){
        /* First match: just take a copy of the doclist. */
        dataBufferReplace(out, pData, nData);
      }else{
        /* Subsequent match: union into a fresh buffer and swap it in. */
        DataBuffer result;
        dataBufferInit(&result, out->nData+nData);
        docListUnion(out->pData, out->nData, pData, nData, &result);
        dataBufferDestroy(out);
        *out = result;
        /* TODO(shess) Rather than destroy out, we could retain it for
        ** later reuse.
        */
      }
    }
    if( c>0 ) break;      /* Past any possible matches. */

    rc = leavesReaderStep(v, pReader);
    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}
5256
5257/* Call loadSegmentLeavesInt() with pData/nData as input. */
5258static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
5259 const char *pTerm, int nTerm, int isPrefix,
5260 DataBuffer *out){
5261 LeavesReader reader;
5262 int rc;
5263
5264 assert( nData>1 );
5265 assert( *pData=='\0' );
5266 rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
5267 if( rc!=SQLITE_OK ) return rc;
5268
5269 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5270 leavesReaderReset(&reader);
5271 leavesReaderDestroy(&reader);
5272 return rc;
5273}
5274
5275/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
5276** iEndLeaf (inclusive) as input, and merge the resulting doclist into
5277** out.
5278*/
5279static int loadSegmentLeaves(fulltext_vtab *v,
5280 sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
5281 const char *pTerm, int nTerm, int isPrefix,
5282 DataBuffer *out){
5283 int rc;
5284 LeavesReader reader;
5285
5286 assert( iStartLeaf<=iEndLeaf );
5287 rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
5288 if( rc!=SQLITE_OK ) return rc;
5289
5290 rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
5291 leavesReaderReset(&reader);
5292 leavesReaderDestroy(&reader);
5293 return rc;
5294}
5295
/* Taking pData/nData as an interior node, find the sequence of child
** nodes which could include pTerm/nTerm/isPrefix.  Note that the
** interior node terms logically come between the blocks, so there is
** one more blockid than there are terms (that block contains terms >=
** the last interior-node term).
*/
/* TODO(shess) The calling code may already know that the end child is
** not worth calculating, because the end may be in a later sibling
** node.  Consider whether breaking symmetry is worthwhile.  I suspect
** it's not worthwhile.
*/
static void getChildrenContaining(const char *pData, int nData,
                                  const char *pTerm, int nTerm, int isPrefix,
                                  sqlite_int64 *piStartChild,
                                  sqlite_int64 *piEndChild){
  InteriorReader reader;

  /* Interior nodes never start with the 0x00 leaf marker. */
  assert( nData>1 );
  assert( *pData!='\0' );
  interiorReaderInit(pData, nData, &reader);

  /* Scan for the first child which could contain pTerm/nTerm. */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piStartChild = interiorReaderCurrentBlockid(&reader);

  /* Keep scanning to find a term greater than our term, using prefix
  ** comparison if indicated.  If isPrefix is false, this will be the
  ** same blockid as the starting block.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piEndChild = interiorReaderCurrentBlockid(&reader);

  interiorReaderDestroy(&reader);

  /* Children must ascend, and if !prefix, both must be the same. */
  assert( *piEndChild>=*piStartChild );
  assert( isPrefix || *piStartChild==*piEndChild );
}
5340
/* Read block at iBlockid and pass it with other params to
** getChildrenContaining().  Returns SQLITE_ERROR if the block is
** missing or if multiple rows unexpectedly match.
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  /* The requested block must exist. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
  if( rc!=SQLITE_ROW ) return rc;

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain
   * locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}
5381
/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf (leading 0x00 header byte). */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    ** Blockids above iLeavesEnd belong to interior nodes.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too. */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}
5428
/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge its doclist over *out (any duplicate doclists read from the
** segment rooted at pData will overwrite those in *out).
*/
/* TODO(shess) Consider changing this to determine the depth of the
** leaves using either the first characters of interior nodes (when
** ==1, we're one level above the leaves), or the first character of
** the root (which will describe the height of the tree directly).
** Either feels somewhat tricky to me.
*/
/* TODO(shess) The current merge is likely to be slow for large
** doclists (though it should process from newest/smallest to
** oldest/largest, so it may not be that bad).  It might be useful to
** modify things to allow for N-way merging.  This could either be
** within a segment, with pairwise merges across segments, or across
** all segments at once.
*/
static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
                       DataBuffer *out){
  DataBuffer result;
  int rc;

  assert( nData>1 );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&result, 0);
  rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                      pTerm, nTerm, isPrefix, &result);
  if( rc==SQLITE_OK && result.nData>0 ){
    if( out->nData==0 ){
      /* out is empty: just swap buffers instead of copying. */
      DataBuffer tmp = *out;
      *out = result;
      result = tmp;
    }else{
      /* Merge newer (result) over older (out); docListMerge lets the
      ** later reader's entries win for duplicate docids.
      */
      DataBuffer merged;
      DLReader readers[2];

      dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
      dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
      dataBufferInit(&merged, out->nData+result.nData);
      docListMerge(&merged, readers, 2);
      dataBufferDestroy(out);
      *out = merged;
      dlrDestroy(&readers[0]);
      dlrDestroy(&readers[1]);
    }
  }
  dataBufferDestroy(&result);
  return rc;
}
5483
/* Scan the database and merge together the posting lists for the term
** into *out.  iColumn==v->nColumn means "all columns"; the result is
** trimmed to iColumn/iType by docListTrim().
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out){
  DataBuffer doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&doclist, 0);

  /* Traverse the segments from oldest to newest so that newer doclist
  ** elements for given docids overwrite older elements.
  */
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    const char *pData = sqlite3_column_blob(s, 0);
    const int nData = sqlite3_column_bytes(s, 0);
    const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
    rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                     &doclist);
    if( rc!=SQLITE_OK ) goto err;
  }
  if( rc==SQLITE_DONE ){
    if( doclist.nData!=0 ){
      /* TODO(shess) The old term_select_all() code applied the column
      ** restrict as we merged segments, leading to smaller buffers.
      ** This is probably worthwhile to bring back, once the new storage
      ** system is checked in.
      */
      if( iColumn==v->nColumn) iColumn = -1;
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  iColumn, iType, out);
    }
    rc = SQLITE_OK;
  }

  /* Fall through on success too; only the doclist buffer needs
  ** releasing either way.
  */
 err:
  dataBufferDestroy(&doclist);
  return rc;
}
5529
/****************************************************************/
/* Used to hold hashtable data for sorting. */
typedef struct TermData {
  const char *pTerm;        /* term bytes (key from the hash; not owned). */
  int nTerm;                /* length of pTerm in bytes. */
  DLCollector *pCollector;  /* doclist collector from the hash entry. */
} TermData;
5537
5538/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
5539** for equal, >0 for greater-than).
5540*/
5541static int termDataCmp(const void *av, const void *bv){
5542 const TermData *a = (const TermData *)av;
5543 const TermData *b = (const TermData *)bv;
5544 int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
5545 int c = memcmp(a->pTerm, b->pTerm, n);
5546 if( c!=0 ) return c;
5547 return a->nTerm-b->nTerm;
5548}
5549
5550/* Order pTerms data by term, then write a new level 0 segment using
5551** LeafWriter.
5552*/
5553static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
5554 fts2HashElem *e;
5555 int idx, rc, i, n;
5556 TermData *pData;
5557 LeafWriter writer;
5558 DataBuffer dl;
5559
5560 /* Determine the next index at level 0, merging as necessary. */
5561 rc = segdirNextIndex(v, 0, &idx);
5562 if( rc!=SQLITE_OK ) return rc;
5563
5564 n = fts2HashCount(pTerms);
5565 pData = malloc(n*sizeof(TermData));
5566
5567 for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
5568 assert( i<n );
5569 pData[i].pTerm = fts2HashKey(e);
5570 pData[i].nTerm = fts2HashKeysize(e);
5571 pData[i].pCollector = fts2HashData(e);
5572 }
5573 assert( i==n );
5574
5575 /* TODO(shess) Should we allow user-defined collation sequences,
5576 ** here? I think we only need that once we support prefix searches.
5577 */
5578 if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
5579
5580 /* TODO(shess) Refactor so that we can write directly to the segment
5581 ** DataBuffer, as happens for segment merges.
5582 */
5583 leafWriterInit(0, idx, &writer);
5584 dataBufferInit(&dl, 0);
5585 for(i=0; i<n; i++){
5586 dataBufferReset(&dl);
5587 dlcAddDoclist(pData[i].pCollector, &dl);
5588 rc = leafWriterStep(v, &writer,
5589 pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
5590 if( rc!=SQLITE_OK ) goto err;
5591 }
5592 rc = leafWriterFinalize(v, &writer);
5593
5594 err:
5595 dataBufferDestroy(&dl);
5596 free(pData);
5597 leafWriterDestroy(&writer);
5598 return rc;
5599}
5600
5601/* If pendingTerms has data, free it. */
5602static int clearPendingTerms(fulltext_vtab *v){
5603 if( v->nPendingData>=0 ){
5604 fts2HashElem *e;
5605 for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){
5606 dlcDelete(fts2HashData(e));
5607 }
5608 fts2HashClear(&v->pendingTerms);
5609 v->nPendingData = -1;
5610 }
5611 return SQLITE_OK;
5612}
5613
5614/* If pendingTerms has data, flush it to a level-zero segment, and
5615** free it.
5616*/
5617static int flushPendingTerms(fulltext_vtab *v){
5618 if( v->nPendingData>=0 ){
5619 int rc = writeZeroSegment(v, &v->pendingTerms);
5620 if( rc==SQLITE_OK ) clearPendingTerms(v);
5621 return rc;
5622 }
5623 return SQLITE_OK;
5624}
5625
5626/* If pendingTerms is "too big", or docid is out of order, flush it.
5627** Regardless, be certain that pendingTerms is initialized for use.
5628*/
5629static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
5630 /* TODO(shess) Explore whether partially flushing the buffer on
5631 ** forced-flush would provide better performance. I suspect that if
5632 ** we ordered the doclists by size and flushed the largest until the
5633 ** buffer was half empty, that would let the less frequent terms
5634 ** generate longer doclists.
5635 */
5636 if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
5637 int rc = flushPendingTerms(v);
5638 if( rc!=SQLITE_OK ) return rc;
5639 }
5640 if( v->nPendingData<0 ){
5641 fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1);
5642 v->nPendingData = 0;
5643 }
5644 v->iPrevDocid = iDocid;
5645 return SQLITE_OK;
5646}
5647
5648/* This function implements the xUpdate callback; it's the top-level entry
5649 * point for inserting, deleting or updating a row in a full-text table. */
5650static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
5651 sqlite_int64 *pRowid){
5652 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5653 int rc;
5654
5655 TRACE(("FTS2 Update %p\n", pVtab));
5656
5657 if( nArg<2 ){
5658 rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
5659 } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
5660 /* An update:
5661 * ppArg[0] = old rowid
5662 * ppArg[1] = new rowid
5663 * ppArg[2..2+v->nColumn-1] = values
5664 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
5665 */
5666 sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
5667 if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
5668 sqlite3_value_int64(ppArg[1]) != rowid ){
5669 rc = SQLITE_ERROR; /* we don't allow changing the rowid */
5670 } else {
5671 assert( nArg==2+v->nColumn+1);
5672 rc = index_update(v, rowid, &ppArg[2]);
5673 }
5674 } else {
5675 /* An insert:
5676 * ppArg[1] = requested rowid
5677 * ppArg[2..2+v->nColumn-1] = values
5678 * ppArg[2+v->nColumn] = value for magic column (we ignore this)
5679 */
5680 assert( nArg==2+v->nColumn+1);
5681 rc = index_insert(v, ppArg[1], &ppArg[2], pRowid);
5682 }
5683
5684 return rc;
5685}
5686
5687static int fulltextSync(sqlite3_vtab *pVtab){
5688 TRACE(("FTS2 xSync()\n"));
5689 return flushPendingTerms((fulltext_vtab *)pVtab);
5690}
5691
5692static int fulltextBegin(sqlite3_vtab *pVtab){
5693 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5694 TRACE(("FTS2 xBegin()\n"));
5695
5696 /* Any buffered updates should have been cleared by the previous
5697 ** transaction.
5698 */
5699 assert( v->nPendingData<0 );
5700 return clearPendingTerms(v);
5701}
5702
5703static int fulltextCommit(sqlite3_vtab *pVtab){
5704 fulltext_vtab *v = (fulltext_vtab *) pVtab;
5705 TRACE(("FTS2 xCommit()\n"));
5706
5707 /* Buffered updates should have been cleared by fulltextSync(). */
5708 assert( v->nPendingData<0 );
5709 return clearPendingTerms(v);
5710}
5711
5712static int fulltextRollback(sqlite3_vtab *pVtab){
5713 TRACE(("FTS2 xRollback()\n"));
5714 return clearPendingTerms((fulltext_vtab *)pVtab);
5715}
5716
5717/*
5718** Implementation of the snippet() function for FTS2
5719*/
5720static void snippetFunc(
5721 sqlite3_context *pContext,
5722 int argc,
5723 sqlite3_value **argv
5724){
5725 fulltext_cursor *pCursor;
5726 if( argc<1 ) return;
5727 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5728 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5729 sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
5730 }else{
5731 const char *zStart = "<b>";
5732 const char *zEnd = "</b>";
5733 const char *zEllipsis = "<b>...</b>";
5734 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5735 if( argc>=2 ){
5736 zStart = (const char*)sqlite3_value_text(argv[1]);
5737 if( argc>=3 ){
5738 zEnd = (const char*)sqlite3_value_text(argv[2]);
5739 if( argc>=4 ){
5740 zEllipsis = (const char*)sqlite3_value_text(argv[3]);
5741 }
5742 }
5743 }
5744 snippetAllOffsets(pCursor);
5745 snippetText(pCursor, zStart, zEnd, zEllipsis);
5746 sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
5747 pCursor->snippet.nSnippet, SQLITE_STATIC);
5748 }
5749}
5750
5751/*
5752** Implementation of the offsets() function for FTS2
5753*/
5754static void snippetOffsetsFunc(
5755 sqlite3_context *pContext,
5756 int argc,
5757 sqlite3_value **argv
5758){
5759 fulltext_cursor *pCursor;
5760 if( argc<1 ) return;
5761 if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
5762 sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
5763 sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
5764 }else{
5765 memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
5766 snippetAllOffsets(pCursor);
5767 snippetOffsetText(&pCursor->snippet);
5768 sqlite3_result_text(pContext,
5769 pCursor->snippet.zOffset, pCursor->snippet.nOffset,
5770 SQLITE_STATIC);
5771 }
5772}
5773
5774/*
5775** This routine implements the xFindFunction method for the FTS2
5776** virtual table.
5777*/
5778static int fulltextFindFunction(
5779 sqlite3_vtab *pVtab,
5780 int nArg,
5781 const char *zName,
5782 void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
5783 void **ppArg
5784){
5785 if( strcmp(zName,"snippet")==0 ){
5786 *pxFunc = snippetFunc;
5787 return 1;
5788 }else if( strcmp(zName,"offsets")==0 ){
5789 *pxFunc = snippetOffsetsFunc;
5790 return 1;
5791 }
5792 return 0;
5793}
5794
/*
** Rename an fts2 table.  Implements the xRename method by renaming the
** three shadow tables (<name>_content, <name>_segments, <name>_segdir)
** to match the new virtual-table name zName.
**
** Returns SQLITE_NOMEM if the SQL text cannot be allocated, otherwise
** the result of executing the ALTER TABLE statements.
*/
static int fulltextRename(
  sqlite3_vtab *pVtab,
  const char *zName
){
  fulltext_vtab *p = (fulltext_vtab *)pVtab;
  /* rc keeps its SQLITE_NOMEM default if sqlite3_mprintf() fails. */
  int rc = SQLITE_NOMEM;
  char *zSql = sqlite3_mprintf(
    "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
    "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
    "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';"
    , p->zDb, p->zName, zName
    , p->zDb, p->zName, zName
    , p->zDb, p->zName, zName
  );
  if( zSql ){
    rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
    sqlite3_free(zSql);
  }
  return rc;
}
5818
/*
** The virtual-table method table registered with SQLite for the
** "fts2" module.  Each slot is labeled with the xMethod it fills.
*/
static const sqlite3_module fts2Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename */ fulltextRename,
};
5841
5842static void hashDestroy(void *p){
5843 fts2Hash *pHash = (fts2Hash *)p;
5844 sqlite3Fts2HashClear(pHash);
5845 sqlite3_free(pHash);
5846}
5847
5848/*
5849** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
5850** in files fts2_tokenizer1.c and fts2_porter.c respectively. The following
5851** two forward declarations are for functions declared in these files
5852** used to retrieve the respective implementations.
5853**
5854** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
5856** Function ...PorterTokenizerModule() sets *pModule to point to the
5857** porter tokenizer/stemmer implementation.
5858*/
5859void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5860void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5861void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
5862
5863int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
5864
/*
** Initialise the fts2 extension. If this extension is built as part
** of the sqlite library, then this function is called directly by
** SQLite. If fts2 is built as a dynamically loadable extension, this
** function is called by the sqlite3_extension_init() entry point.
**
** On success the "fts2" module is registered with db along with the
** overloaded snippet() and offsets() scalar functions.  On failure the
** partially-built tokenizer hash table is released and the error code
** is returned.
*/
int sqlite3Fts2Init(sqlite3 *db){
  int rc = SQLITE_OK;
  fts2Hash *pHash = 0;
  const sqlite3_tokenizer_module *pSimple = 0;
  const sqlite3_tokenizer_module *pPorter = 0;
  const sqlite3_tokenizer_module *pIcu = 0;

  sqlite3Fts2SimpleTokenizerModule(&pSimple);
  sqlite3Fts2PorterTokenizerModule(&pPorter);
#ifdef SQLITE_ENABLE_ICU
  /* Without ICU support, pIcu stays NULL and is skipped below. */
  sqlite3Fts2IcuTokenizerModule(&pIcu);
#endif

  /* Allocate and initialise the hash-table used to store tokenizers. */
  pHash = sqlite3_malloc(sizeof(fts2Hash));
  if( !pHash ){
    rc = SQLITE_NOMEM;
  }else{
    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
  }

  /* Load the built-in tokenizers into the hash table.  Key lengths
  ** include the NUL terminator: "simple\0" and "porter\0" are 7 bytes,
  ** "icu\0" is 4. */
  if( rc==SQLITE_OK ){
    if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
     || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter)
     || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
    ){
      rc = SQLITE_NOMEM;
    }
  }

  /* Create the virtual table wrapper around the hash-table and overload
  ** the two scalar functions. If this is successful, register the
  ** module with sqlite.
  */
  if( SQLITE_OK==rc
   && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
  ){
    /* Ownership of pHash passes to SQLite; hashDestroy() frees it. */
    return sqlite3_create_module_v2(
        db, "fts2", &fts2Module, (void *)pHash, hashDestroy
    );
  }

  /* An error has occurred. Delete the hash table and return the error code. */
  assert( rc!=SQLITE_OK );
  if( pHash ){
    sqlite3Fts2HashClear(pHash);
    sqlite3_free(pHash);
  }
  return rc;
}
5924
#if !SQLITE_CORE
/*
** Entry point invoked by SQLite when fts2 is loaded as a run-time
** loadable extension: record the API routine pointers, then delegate
** to sqlite3Fts2Init().
*/
int sqlite3_extension_init(
  sqlite3 *db,
  char **pzErrMsg,                   /* Unused: no error message is set */
  const sqlite3_api_routines *pApi
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts2Init(db);
}
#endif
5935
5936#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h
new file mode 100644
index 0000000..4da4c38
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2.h
@@ -0,0 +1,26 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This header file is used by programs that want to link against the
14** FTS2 library. All it does is declare the sqlite3Fts2Init() interface.
15*/
16#include "sqlite3.h"
17
18#ifdef __cplusplus
19extern "C" {
20#endif /* __cplusplus */
21
22int sqlite3Fts2Init(sqlite3 *db);
23
24#ifdef __cplusplus
25} /* extern "C" */
26#endif /* __cplusplus */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c
new file mode 100644
index 0000000..fcd5cc2
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.c
@@ -0,0 +1,369 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This is the implementation of generic hash-tables used in SQLite.
13** We've modified it slightly to serve as a standalone hash table
14** implementation for the full-text indexing module.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS2 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS2 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
27
28#include <assert.h>
29#include <stdlib.h>
30#include <string.h>
31
32#include "fts2_hash.h"
33
/* Allocate n bytes of zero-filled memory.  Returns NULL if n is
** negative or the allocation fails.  Installed as the default xMalloc
** for fts2 hash tables.
*/
static void *malloc_and_zero(int n){
  if( n<0 ) return 0;
  /* calloc() both allocates and zero-fills in one call, replacing the
  ** hand-rolled malloc+memset. */
  return calloc(1, (size_t)n);
}
41
/* Turn bulk memory into a hash table object by initializing the
** fields of the Hash structure.
**
** "pNew" is a pointer to the hash table that is to be initialized.
** keyClass is one of the constants
** FTS2_HASH_BINARY or FTS2_HASH_STRING.  The value of keyClass
** determines what kind of key the hash table will use.  "copyKey" is
** true if the hash table should make its own private copy of keys and
** false if it should just use the supplied pointer.
**
** The table starts empty with no bucket array; the first insert
** allocates buckets via rehash().
*/
void sqlite3Fts2HashInit(fts2Hash *pNew, int keyClass, int copyKey){
  assert( pNew!=0 );
  assert( keyClass>=FTS2_HASH_STRING && keyClass<=FTS2_HASH_BINARY );
  pNew->keyClass = keyClass;
  pNew->copyKey = copyKey;
  pNew->first = 0;
  pNew->count = 0;
  pNew->htsize = 0;
  pNew->ht = 0;
  /* Default allocators: zeroing malloc wrapper and plain free(). */
  pNew->xMalloc = malloc_and_zero;
  pNew->xFree = free;
}
64
65/* Remove all entries from a hash table. Reclaim all memory.
66** Call this routine to delete a hash table or to reset a hash table
67** to the empty state.
68*/
69void sqlite3Fts2HashClear(fts2Hash *pH){
70 fts2HashElem *elem; /* For looping over all elements of the table */
71
72 assert( pH!=0 );
73 elem = pH->first;
74 pH->first = 0;
75 if( pH->ht ) pH->xFree(pH->ht);
76 pH->ht = 0;
77 pH->htsize = 0;
78 while( elem ){
79 fts2HashElem *next_elem = elem->next;
80 if( pH->copyKey && elem->pKey ){
81 pH->xFree(elem->pKey);
82 }
83 pH->xFree(elem);
84 elem = next_elem;
85 }
86 pH->count = 0;
87}
88
/*
** Hash function for FTS2_HASH_STRING mode.  A non-positive nKey means
** "use strlen()".  The result is masked to a non-negative int.
*/
static int strHash(const void *pKey, int nKey){
  const char *zKey = (const char *)pKey;
  int i;
  int h = 0;
  if( nKey<=0 ) nKey = (int) strlen(zKey);
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ zKey[i];
  }
  return h & 0x7fffffff;
}
/* String-mode key comparison: unequal lengths never match (returns 1);
** equal-length keys are compared with strncmp(). */
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1!=n2) ? 1 : strncmp((const char*)pKey1,(const char*)pKey2,n1);
}
106
/*
** Hash function for FTS2_HASH_BINARY mode.  Unlike strHash(), nKey is
** taken literally (no strlen fallback).
*/
static int binHash(const void *pKey, int nKey){
  const char *zKey = (const char *)pKey;
  int i;
  int h = 0;
  for(i=0; i<nKey; i++){
    h = (h<<3) ^ h ^ zKey[i];
  }
  return h & 0x7fffffff;
}
/* Binary-mode key comparison: unequal lengths never match (returns 1);
** equal-length keys are compared with memcmp(). */
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
  return (n1!=n2) ? 1 : memcmp(pKey1,pKey2,n1);
}
122
123/*
124** Return a pointer to the appropriate hash function given the key class.
125**
126** The C syntax in this function definition may be unfamilar to some
127** programmers, so we provide the following additional explanation:
128**
129** The name of the function is "hashFunction". The function takes a
130** single parameter "keyClass". The return value of hashFunction()
131** is a pointer to another function. Specifically, the return value
132** of hashFunction() is a pointer to a function that takes two parameters
133** with types "const void*" and "int" and returns an "int".
134*/
135static int (*hashFunction(int keyClass))(const void*,int){
136 if( keyClass==FTS2_HASH_STRING ){
137 return &strHash;
138 }else{
139 assert( keyClass==FTS2_HASH_BINARY );
140 return &binHash;
141 }
142}
143
144/*
145** Return a pointer to the appropriate hash function given the key class.
146**
147** For help in interpreted the obscure C code in the function definition,
148** see the header comment on the previous function.
149*/
150static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
151 if( keyClass==FTS2_HASH_STRING ){
152 return &strCompare;
153 }else{
154 assert( keyClass==FTS2_HASH_BINARY );
155 return &binCompare;
156 }
157}
158
/* Link an element into the hash table
**
** pNew is linked into both the bucket chain (pEntry->chain) and the
** table-wide doubly-linked list rooted at pH->first.  When the bucket
** is non-empty, pNew is spliced in immediately before the current
** chain head so that a bucket's elements stay adjacent in the global
** list; otherwise pNew becomes the new head of the global list.
*/
static void insertElement(
  fts2Hash *pH,            /* The complete hash table */
  struct _fts2ht *pEntry,  /* The entry into which pNew is inserted */
  fts2HashElem *pNew       /* The element to be inserted */
){
  fts2HashElem *pHead;     /* First element already in pEntry */
  pHead = pEntry->chain;
  if( pHead ){
    /* Splice pNew into the global list just before pHead. */
    pNew->next = pHead;
    pNew->prev = pHead->prev;
    if( pHead->prev ){ pHead->prev->next = pNew; }
    else             { pH->first = pNew; }
    pHead->prev = pNew;
  }else{
    /* Bucket was empty: push pNew onto the front of the global list. */
    pNew->next = pH->first;
    if( pH->first ){ pH->first->prev = pNew; }
    pNew->prev = 0;
    pH->first = pNew;
  }
  pEntry->count++;
  pEntry->chain = pNew;
}
183
184
/* Resize the hash table so that it contains "new_size" buckets.
** "new_size" must be a power of 2.  The hash table might fail
** to resize if xMalloc() fails; in that case the table is left
** unchanged.
*/
static void rehash(fts2Hash *pH, int new_size){
  struct _fts2ht *new_ht;          /* The new hash table */
  fts2HashElem *elem, *next_elem;  /* For looping over existing elements */
  int (*xHash)(const void*,int);   /* The hash function */

  assert( (new_size & (new_size-1))==0 );
  new_ht = (struct _fts2ht *)pH->xMalloc( new_size*sizeof(struct _fts2ht) );
  if( new_ht==0 ) return;
  if( pH->ht ) pH->xFree(pH->ht);
  pH->ht = new_ht;
  pH->htsize = new_size;
  xHash = hashFunction(pH->keyClass);
  /* Walk the old global list and re-bucket every element.  pH->first is
  ** reset first because insertElement() rebuilds the global list. */
  for(elem=pH->first, pH->first=0; elem; elem = next_elem){
    int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
    next_elem = elem->next;
    insertElement(pH, &new_ht[h], elem);
  }
}
207
208/* This function (for internal use only) locates an element in an
209** hash table that matches the given key. The hash for this key has
210** already been computed and is passed as the 4th parameter.
211*/
212static fts2HashElem *findElementGivenHash(
213 const fts2Hash *pH, /* The pH to be searched */
214 const void *pKey, /* The key we are searching for */
215 int nKey,
216 int h /* The hash for this key. */
217){
218 fts2HashElem *elem; /* Used to loop thru the element list */
219 int count; /* Number of elements left to test */
220 int (*xCompare)(const void*,int,const void*,int); /* comparison function */
221
222 if( pH->ht ){
223 struct _fts2ht *pEntry = &pH->ht[h];
224 elem = pEntry->chain;
225 count = pEntry->count;
226 xCompare = compareFunction(pH->keyClass);
227 while( count-- && elem ){
228 if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
229 return elem;
230 }
231 elem = elem->next;
232 }
233 }
234 return 0;
235}
236
/* Remove a single entry from the hash table given a pointer to that
** element and a hash on the element's key.
**
** The element is unlinked from the global list and its bucket, its key
** storage is freed when the table owns key copies, and the element
** itself is freed.  Removing the last element resets the whole table
** via fts2HashClear().
*/
static void removeElementGivenHash(
  fts2Hash *pH,         /* The pH containing "elem" */
  fts2HashElem* elem,   /* The element to be removed from the pH */
  int h                 /* Hash value for the element */
){
  struct _fts2ht *pEntry;
  /* Unlink from the table-wide doubly-linked list. */
  if( elem->prev ){
    elem->prev->next = elem->next;
  }else{
    pH->first = elem->next;
  }
  if( elem->next ){
    elem->next->prev = elem->prev;
  }
  /* The bucket's chain head may be the element being removed. */
  pEntry = &pH->ht[h];
  if( pEntry->chain==elem ){
    pEntry->chain = elem->next;
  }
  pEntry->count--;
  if( pEntry->count<=0 ){
    pEntry->chain = 0;
  }
  if( pH->copyKey && elem->pKey ){
    pH->xFree(elem->pKey);
  }
  pH->xFree( elem );
  pH->count--;
  if( pH->count<=0 ){
    /* Last element removed: release the bucket array too. */
    assert( pH->first==0 );
    assert( pH->count==0 );
    fts2HashClear(pH);
  }
}
273
274/* Attempt to locate an element of the hash table pH with a key
275** that matches pKey,nKey. Return the data for this element if it is
276** found, or NULL if there is no match.
277*/
278void *sqlite3Fts2HashFind(const fts2Hash *pH, const void *pKey, int nKey){
279 int h; /* A hash on key */
280 fts2HashElem *elem; /* The element that matches key */
281 int (*xHash)(const void*,int); /* The hash function */
282
283 if( pH==0 || pH->ht==0 ) return 0;
284 xHash = hashFunction(pH->keyClass);
285 assert( xHash!=0 );
286 h = (*xHash)(pKey,nKey);
287 assert( (pH->htsize & (pH->htsize-1))==0 );
288 elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
289 return elem ? elem->data : 0;
290}
291
/* Insert an element into the hash table pH.  The key is pKey,nKey
** and the data is "data".
**
** If no element exists with a matching key, then a new
** element is created.  A copy of the key is made if the copyKey
** flag is set.  NULL is returned.
**
** If another element already exists with the same key, then the
** new data replaces the old data and the old data is returned.
** The key is not copied in this instance.  If a malloc fails, then
** the new data is returned and the hash table is unchanged.
**
** If the "data" parameter to this function is NULL, then the
** element corresponding to "key" is removed from the hash table.
*/
void *sqlite3Fts2HashInsert(
  fts2Hash *pH,        /* The hash table to insert into */
  const void *pKey,    /* The key */
  int nKey,            /* Number of bytes in the key */
  void *data           /* The data */
){
  int hraw;                 /* Raw hash value of the key */
  int h;                    /* the hash of the key modulo hash table size */
  fts2HashElem *elem;       /* Used to loop thru the element list */
  fts2HashElem *new_elem;   /* New element added to the pH */
  int (*xHash)(const void*,int);  /* The hash function */

  assert( pH!=0 );
  xHash = hashFunction(pH->keyClass);
  assert( xHash!=0 );
  hraw = (*xHash)(pKey, nKey);
  assert( (pH->htsize & (pH->htsize-1))==0 );
  h = hraw & (pH->htsize-1);
  elem = findElementGivenHash(pH,pKey,nKey,h);
  if( elem ){
    /* Key already present: replace the data, or remove when data==0. */
    void *old_data = elem->data;
    if( data==0 ){
      removeElementGivenHash(pH,elem,h);
    }else{
      elem->data = data;
    }
    return old_data;
  }
  if( data==0 ) return 0;  /* deleting a missing key is a no-op */
  new_elem = (fts2HashElem*)pH->xMalloc( sizeof(fts2HashElem) );
  if( new_elem==0 ) return data;
  if( pH->copyKey && pKey!=0 ){
    new_elem->pKey = pH->xMalloc( nKey );
    if( new_elem->pKey==0 ){
      pH->xFree(new_elem);
      return data;
    }
    memcpy((void*)new_elem->pKey, pKey, nKey);
  }else{
    new_elem->pKey = (void*)pKey;
  }
  new_elem->nKey = nKey;
  pH->count++;
  if( pH->htsize==0 ){
    /* First insertion: allocate the initial 8-bucket array.  If that
    ** fails, undo the insert and signal failure by returning data. */
    rehash(pH,8);
    if( pH->htsize==0 ){
      pH->count = 0;
      pH->xFree(new_elem);
      return data;
    }
  }
  if( pH->count > pH->htsize ){
    /* Keep the load factor at or below 1 (best effort; may fail). */
    rehash(pH,pH->htsize*2);
  }
  assert( pH->htsize>0 );
  assert( (pH->htsize & (pH->htsize-1))==0 );
  /* Re-derive the bucket index: htsize may have changed above. */
  h = hraw & (pH->htsize-1);
  insertElement(pH, &pH->ht[h], new_elem);
  new_elem->data = data;
  return 0;
}
368
369#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h
new file mode 100644
index 0000000..97f3529
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_hash.h
@@ -0,0 +1,112 @@
1/*
2** 2001 September 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
** This is the header file for the generic hash-table implementation
13** used in SQLite. We've modified it slightly to serve as a standalone
14** hash table implementation for the full-text indexing module.
15**
16*/
17#ifndef _FTS2_HASH_H_
18#define _FTS2_HASH_H_
19
20/* Forward declarations of structures. */
21typedef struct fts2Hash fts2Hash;
22typedef struct fts2HashElem fts2HashElem;
23
/* A complete hash table is an instance of the following structure.
** The internals of this structure are intended to be opaque -- client
** code should not attempt to access or modify the fields of this structure
** directly.  Change this structure only by using the routines below.
** However, many of the "procedures" and "functions" for modifying and
** accessing this structure are really macros, so we can't really make
** this structure opaque.
*/
struct fts2Hash {
  char keyClass;          /* FTS2_HASH_STRING or FTS2_HASH_BINARY */
  char copyKey;           /* True if copy of key made on insert */
  int count;              /* Number of entries in this table */
  fts2HashElem *first;    /* The first element of the array */
  void *(*xMalloc)(int);  /* malloc() function to use */
  void (*xFree)(void *);  /* free() function to use */
  int htsize;             /* Number of buckets in the hash table */
  struct _fts2ht {        /* the hash table */
    int count;               /* Number of entries with this hash */
    fts2HashElem *chain;     /* Pointer to first entry with this hash */
  } *ht;
};
45
/* Each element in the hash table is an instance of the following
** structure.  All elements are stored on a single doubly-linked list.
**
** Again, this structure is intended to be opaque, but it can't really
** be opaque because it is used by macros.
*/
struct fts2HashElem {
  fts2HashElem *next, *prev; /* Next and previous elements in the table */
  void *data;                /* Data associated with this element */
  void *pKey; int nKey;      /* Key associated with this element */
};
57
58/*
59** There are 2 different modes of operation for a hash table:
60**
61** FTS2_HASH_STRING pKey points to a string that is nKey bytes long
62** (including the null-terminator, if any). Case
63** is respected in comparisons.
64**
65** FTS2_HASH_BINARY pKey points to binary data nKey bytes long.
66** memcmp() is used to compare keys.
67**
68** A copy of the key is made if the copyKey parameter to fts2HashInit is 1.
69*/
70#define FTS2_HASH_STRING 1
71#define FTS2_HASH_BINARY 2
72
73/*
74** Access routines. To delete, insert a NULL pointer.
75*/
76void sqlite3Fts2HashInit(fts2Hash*, int keytype, int copyKey);
77void *sqlite3Fts2HashInsert(fts2Hash*, const void *pKey, int nKey, void *pData);
78void *sqlite3Fts2HashFind(const fts2Hash*, const void *pKey, int nKey);
79void sqlite3Fts2HashClear(fts2Hash*);
80
81/*
82** Shorthand for the functions above
83*/
84#define fts2HashInit sqlite3Fts2HashInit
85#define fts2HashInsert sqlite3Fts2HashInsert
86#define fts2HashFind sqlite3Fts2HashFind
87#define fts2HashClear sqlite3Fts2HashClear
88
89/*
90** Macros for looping over all elements of a hash table. The idiom is
91** like this:
92**
93** fts2Hash h;
94** fts2HashElem *p;
95** ...
96** for(p=fts2HashFirst(&h); p; p=fts2HashNext(p)){
97** SomeStructure *pData = fts2HashData(p);
98** // do something with pData
99** }
100*/
101#define fts2HashFirst(H) ((H)->first)
102#define fts2HashNext(E) ((E)->next)
103#define fts2HashData(E) ((E)->data)
104#define fts2HashKey(E) ((E)->pKey)
105#define fts2HashKeysize(E) ((E)->nKey)
106
107/*
108** Number of entries in a hash table
109*/
110#define fts2HashCount(H) ((H)->count)
111
112#endif /* _FTS2_HASH_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c
new file mode 100644
index 0000000..ed15f33
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_icu.c
@@ -0,0 +1,257 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** This file implements a tokenizer for fts2 based on the ICU library.
13**
14** $Id: fts2_icu.c,v 1.1 2007/06/22 15:21:16 danielk1977 Exp $
15*/
16
17#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
18#ifdef SQLITE_ENABLE_ICU
19
20#include <assert.h>
21#include <string.h>
22#include "fts2_tokenizer.h"
23
24#include <unicode/ubrk.h>
25#include <unicode/ucol.h>
26#include <unicode/ustring.h>
27#include <unicode/utf16.h>
28
typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;

/*
** Tokenizer object, derived from sqlite3_tokenizer.  Holds the locale
** string passed to icuCreate(), used when opening break iterators.
*/
struct IcuTokenizer {
  sqlite3_tokenizer base;
  char *zLocale;               /* Locale for ubrk_open(); NULL => ICU default */
};

/*
** Cursor object for one tokenization pass (one icuOpen() call).
*/
struct IcuCursor {
  sqlite3_tokenizer_cursor base;

  UBreakIterator *pIter;      /* ICU break-iterator object */
  int nChar;                  /* Number of UChar elements in pInput */
  UChar *aChar;               /* Copy of input using utf-16 encoding */
  int *aOffset;               /* Offsets of each character in utf-8 input */

  int nBuffer;                /* Allocated size of zBuffer, in bytes */
  char *zBuffer;              /* UTF-8 token text returned by icuNext() */

  int iToken;                 /* Position of next token (0-based counter) */
};
50
51/*
52** Create a new tokenizer instance.
53*/
54static int icuCreate(
55 int argc, /* Number of entries in argv[] */
56 const char * const *argv, /* Tokenizer creation arguments */
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
58){
59 IcuTokenizer *p;
60 int n = 0;
61
62 if( argc>0 ){
63 n = strlen(argv[0])+1;
64 }
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66 if( !p ){
67 return SQLITE_NOMEM;
68 }
69 memset(p, 0, sizeof(IcuTokenizer));
70
71 if( n ){
72 p->zLocale = (char *)&p[1];
73 memcpy(p->zLocale, argv[0], n);
74 }
75
76 *ppTokenizer = (sqlite3_tokenizer *)p;
77
78 return SQLITE_OK;
79}
80
81/*
82** Destroy a tokenizer
83*/
84static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
86 sqlite3_free(p);
87 return SQLITE_OK;
88}
89
90/*
91** Prepare to begin tokenizing a particular string. The input
92** string to be tokenized is pInput[0..nBytes-1]. A cursor
93** used to incrementally tokenize this string is returned in
94** *ppCursor.
95*/
96static int icuOpen(
97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
98 const char *zInput, /* Input string */
99 int nInput, /* Length of zInput in bytes */
100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
101){
102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
103 IcuCursor *pCsr;
104
105 const int32_t opt = U_FOLD_CASE_DEFAULT;
106 UErrorCode status = U_ZERO_ERROR;
107 int nChar;
108
109 UChar32 c;
110 int iInput = 0;
111 int iOut = 0;
112
113 *ppCursor = 0;
114
115 nChar = nInput+1;
116 pCsr = (IcuCursor *)sqlite3_malloc(
117 sizeof(IcuCursor) + /* IcuCursor */
118 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
119 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
120 );
121 if( !pCsr ){
122 return SQLITE_NOMEM;
123 }
124 memset(pCsr, 0, sizeof(IcuCursor));
125 pCsr->aChar = (UChar *)&pCsr[1];
126 pCsr->aOffset = (int *)&pCsr->aChar[nChar];
127
128 pCsr->aOffset[iOut] = iInput;
129 U8_NEXT(zInput, iInput, nInput, c);
130 while( c>0 ){
131 int isError = 0;
132 c = u_foldCase(c, opt);
133 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
134 if( isError ){
135 sqlite3_free(pCsr);
136 return SQLITE_ERROR;
137 }
138 pCsr->aOffset[iOut] = iInput;
139
140 if( iInput<nInput ){
141 U8_NEXT(zInput, iInput, nInput, c);
142 }else{
143 c = 0;
144 }
145 }
146
147 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
148 if( !U_SUCCESS(status) ){
149 sqlite3_free(pCsr);
150 return SQLITE_ERROR;
151 }
152 pCsr->nChar = iOut;
153
154 ubrk_first(pCsr->pIter);
155 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
156 return SQLITE_OK;
157}
158
159/*
160** Close a tokenization cursor previously opened by a call to icuOpen().
161*/
162static int icuClose(sqlite3_tokenizer_cursor *pCursor){
163 IcuCursor *pCsr = (IcuCursor *)pCursor;
164 ubrk_close(pCsr->pIter);
165 sqlite3_free(pCsr->zBuffer);
166 sqlite3_free(pCsr);
167 return SQLITE_OK;
168}
169
170/*
171** Extract the next token from a tokenization cursor.
172*/
173static int icuNext(
174 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
175 const char **ppToken, /* OUT: *ppToken is the token text */
176 int *pnBytes, /* OUT: Number of bytes in token */
177 int *piStartOffset, /* OUT: Starting offset of token */
178 int *piEndOffset, /* OUT: Ending offset of token */
179 int *piPosition /* OUT: Position integer of token */
180){
181 IcuCursor *pCsr = (IcuCursor *)pCursor;
182
183 int iStart = 0;
184 int iEnd = 0;
185 int nByte = 0;
186
187 while( iStart==iEnd ){
188 UChar32 c;
189
190 iStart = ubrk_current(pCsr->pIter);
191 iEnd = ubrk_next(pCsr->pIter);
192 if( iEnd==UBRK_DONE ){
193 return SQLITE_DONE;
194 }
195
196 while( iStart<iEnd ){
197 int iWhite = iStart;
198 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
199 if( u_isspace(c) ){
200 iStart = iWhite;
201 }else{
202 break;
203 }
204 }
205 assert(iStart<=iEnd);
206 }
207
208 do {
209 UErrorCode status = U_ZERO_ERROR;
210 if( nByte ){
211 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
212 if( !zNew ){
213 return SQLITE_NOMEM;
214 }
215 pCsr->zBuffer = zNew;
216 pCsr->nBuffer = nByte;
217 }
218
219 u_strToUTF8(
220 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
221 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
222 &status /* Output success/failure */
223 );
224 } while( nByte>pCsr->nBuffer );
225
226 *ppToken = pCsr->zBuffer;
227 *pnBytes = nByte;
228 *piStartOffset = pCsr->aOffset[iStart];
229 *piEndOffset = pCsr->aOffset[iEnd];
230 *piPosition = pCsr->iToken++;
231
232 return SQLITE_OK;
233}
234
235/*
236** The set of routines that implement the simple tokenizer
237*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion  */
  icuCreate,                   /* xCreate   */
  icuDestroy,                  /* xDestroy  */
  icuOpen,                     /* xOpen     */
  icuClose,                    /* xClose    */
  icuNext,                     /* xNext     */
};
246
247/*
248** Set *ppModule to point at the implementation of the ICU tokenizer.
249*/
250void sqlite3Fts2IcuTokenizerModule(
251 sqlite3_tokenizer_module const**ppModule
252){
253 *ppModule = &icuTokenizerModule;
254}
255
256#endif /* defined(SQLITE_ENABLE_ICU) */
257#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c
new file mode 100644
index 0000000..dab1849
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_porter.c
@@ -0,0 +1,642 @@
1/*
2** 2006 September 30
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11*************************************************************************
12** Implementation of the full-text-search tokenizer that implements
13** a Porter stemmer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS2 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS2 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts2_tokenizer.h"
35
/*
** Tokenizer class, derived from sqlite3_tokenizer.  The porter
** tokenizer keeps no configuration state of its own.
*/
typedef struct porter_tokenizer {
  sqlite3_tokenizer base;      /* Base class */
} porter_tokenizer;

/*
** Cursor class, derived from sqlite3_tokenizer_cursor.  One cursor
** tracks a single tokenization pass over one input string.
*/
typedef struct porter_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *zInput;          /* input we are tokenizing */
  int nInput;                  /* size of the input */
  int iOffset;                 /* current position in zInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  int nAllocated;              /* space allocated to zToken buffer */
} porter_tokenizer_cursor;
55
56
57/* Forward declaration */
58static const sqlite3_tokenizer_module porterTokenizerModule;
59
60
61/*
62** Create a new tokenizer instance.
63*/
64static int porterCreate(
65 int argc, const char * const *argv,
66 sqlite3_tokenizer **ppTokenizer
67){
68 porter_tokenizer *t;
69 t = (porter_tokenizer *) calloc(sizeof(*t), 1);
70 if( t==NULL ) return SQLITE_NOMEM;
71
72 *ppTokenizer = &t->base;
73 return SQLITE_OK;
74}
75
76/*
77** Destroy a tokenizer
78*/
79static int porterDestroy(sqlite3_tokenizer *pTokenizer){
80 free(pTokenizer);
81 return SQLITE_OK;
82}
83
84/*
85** Prepare to begin tokenizing a particular string. The input
86** string to be tokenized is zInput[0..nInput-1]. A cursor
87** used to incrementally tokenize this string is returned in
88** *ppCursor.
89*/
90static int porterOpen(
91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
92 const char *zInput, int nInput, /* String to be tokenized */
93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
94){
95 porter_tokenizer_cursor *c;
96
97 c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
98 if( c==NULL ) return SQLITE_NOMEM;
99
100 c->zInput = zInput;
101 if( zInput==0 ){
102 c->nInput = 0;
103 }else if( nInput<0 ){
104 c->nInput = (int)strlen(zInput);
105 }else{
106 c->nInput = nInput;
107 }
108 c->iOffset = 0; /* start tokenizing at the beginning */
109 c->iToken = 0;
110 c->zToken = NULL; /* no space allocated, yet. */
111 c->nAllocated = 0;
112
113 *ppCursor = &c->base;
114 return SQLITE_OK;
115}
116
117/*
118** Close a tokenization cursor previously opened by a call to
119** porterOpen() above.
120*/
121static int porterClose(sqlite3_tokenizer_cursor *pCursor){
122 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
123 free(c->zToken);
124 free(c);
125 return SQLITE_OK;
126}
127/*
128** Vowel or consonant
129*/
/*
** Letter classes for the Porter rules, indexed by (letter - 'a'):
** 0 = vowel, 1 = consonant, 2 = 'y' (either, depending on context).
*/
static const char cType[] = {
   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
   1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() classify the first character of the
** string they are given, according to the Porter rules.
**
** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'Y' is a consonant unless it follows another consonant, in which
** case it is a vowel.
**
** The strings handled here are in REVERSE order, so the 'y' rule
** becomes: 'y' is a consonant unless the next character of z[] is a
** consonant.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  char c = z[0];
  int cls;
  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cls = cType[c-'a'];
  if( cls<2 ) return cls;
  return z[1]==0 || isVowel(&z[1]);
}
static int isVowel(const char *z){
  char c = z[0];
  int cls;
  if( c==0 ) return 0;
  assert( c>='a' && c<='z' );
  cls = cType[c-'a'];
  if( cls<2 ) return 1-cls;
  return isConsonant(&z[1]);
}
167
168/*
169** Let any sequence of one or more vowels be represented by V and let
170** C be sequence of one or more consonants. Then every word can be
171** represented as:
172**
173** [C] (VC){m} [V]
174**
175** In prose: A word is an optional consonant followed by zero or
176** vowel-consonant pairs followed by an optional vowel. "m" is the
177** number of vowel consonant pairs. This routine computes the value
178** of m for the first i bytes of a word.
179**
180** Return true if the m-value for z is 1 or more. In other words,
181** return true if z contains at least one vowel that is followed
182** by a consonant.
183**
184** In this routine z[] is in reverse order. So we are really looking
185** for an instance of of a consonant followed by a vowel.
186*/
static int m_gt_0(const char *zRev){
  /* zRev is reversed: skip the trailing vowel run, then the consonant
  ** run; m >= 1 iff at least one letter remains after that. */
  while( isVowel(zRev) ){ zRev++; }
  if( zRev[0]==0 ) return 0;
  while( isConsonant(zRev) ){ zRev++; }
  return zRev[0]!=0;
}
193
194/* Like mgt0 above except we are looking for a value of m which is
195** exactly 1
196*/
static int m_eq_1(const char *zRev){
  /* Like m_gt_0(), but require the word to be exhausted after exactly
  ** one consonant-then-vowel pair (scanning the reversed word). */
  while( isVowel(zRev) ){ zRev++; }
  if( zRev[0]==0 ) return 0;
  while( isConsonant(zRev) ){ zRev++; }
  if( zRev[0]==0 ) return 0;
  while( isVowel(zRev) ){ zRev++; }
  if( zRev[0]==0 ) return 1;
  while( isConsonant(zRev) ){ zRev++; }
  return zRev[0]==0;
}
207
208/* Like mgt0 above except we are looking for a value of m>1 instead
209** or m>0
210*/
static int m_gt_1(const char *zRev){
  /* Like m_gt_0(), but require letters to remain after TWO
  ** vowel-run/consonant-run pairs of the reversed word. */
  while( isVowel(zRev) ){ zRev++; }
  if( zRev[0]==0 ) return 0;
  while( isConsonant(zRev) ){ zRev++; }
  if( zRev[0]==0 ) return 0;
  while( isVowel(zRev) ){ zRev++; }
  if( zRev[0]==0 ) return 0;
  while( isConsonant(zRev) ){ zRev++; }
  return zRev[0]!=0;
}
221
/*
** Return TRUE if z contains at least one vowel.  (The stale reference
** to "z[0..n-1]" in the old comment was wrong: there is no length
** argument; z is nul-terminated.)
*/
static int hasVowel(const char *zRev){
  while( isConsonant(zRev) ){ zRev++; }
  return zRev[0]!=0;
}
229
/*
** Return TRUE if the word ends in a double consonant.
**
** The text is reversed here, so this tests whether the first two
** characters of z[] are the same consonant.
*/
static int doubleConsonant(const char *zRev){
  if( !isConsonant(zRev) ) return 0;
  if( zRev[0]!=zRev[1] ) return 0;
  return isConsonant(&zRev[1]);
}
239
/*
** Return TRUE if the word ends consonant-vowel-consonant where the
** final consonant is not 'w', 'x', or 'y' (Porter's "*o" condition).
**
** The word is reversed here, so the three letters tested are the
** first three of z[], and the first must not be in [wxy].
*/
static int star_oh(const char *zRev){
  if( zRev[0]==0 || !isConsonant(zRev) ) return 0;
  if( zRev[0]=='w' || zRev[0]=='x' || zRev[0]=='y' ) return 0;
  if( zRev[1]==0 || !isVowel(&zRev[1]) ) return 0;
  if( zRev[2]==0 || !isConsonant(&zRev[2]) ) return 0;
  return 1;
}
255
/*
** If the word ends with zFrom and xCond() is true for the stem that
** precedes the zFrom ending, then change the ending to zTo.
**
** The input word *pz and zFrom are both in reverse order.  zTo is in
** normal order.
**
** Return TRUE if zFrom matched; FALSE otherwise.  Note that TRUE is
** returned even if xCond() fails and no substitution occurs.
*/
static int stem(
  char **pz,                  /* The word being stemmed (Reversed) */
  const char *zFrom,          /* If the ending matches this... (Reversed) */
  const char *zTo,            /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*)   /* Condition that must be true */
){
  char *zWord = *pz;

  /* Does the reversed word start with zFrom (i.e. the word ends with
  ** the suffix)? */
  while( *zFrom && *zFrom==*zWord ){
    zWord++;
    zFrom++;
  }
  if( *zFrom!=0 ) return 0;           /* no match */

  /* Suffix matched.  Perform the replacement only if the condition
  ** holds on the remaining stem; either way report a match. */
  if( xCond==0 || xCond(zWord) ){
    const char *zRep = zTo;
    while( *zRep ){
      *(--zWord) = *(zRep++);         /* write zTo reversed, in place */
    }
    *pz = zWord;
  }
  return 1;
}
284
/*
** The fallback stemmer, used when the porter stemmer is not
** appropriate.  The input word is copied into the output with
** US-ASCII case folding.  Overlong words are shortened: if the word
** contains a digit it is truncated to 6 bytes (3 from each end),
** otherwise to 20 bytes (10 from each end).
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i;
  int hasDigit = 0;

  /* Copy with case folding, noting whether any digit appears. */
  for(i=0; i<nIn; i++){
    char c = zIn[i];
    if( c>='A' && c<='Z' ){
      c += 'a' - 'A';
    }else if( c>='0' && c<='9' ){
      hasDigit = 1;
    }
    zOut[i] = c;
  }

  /* If the output is overlong, keep mx bytes from each end. */
  {
    int mx = hasDigit ? 3 : 10;
    if( nIn>mx*2 ){
      int j = mx;
      for(i=nIn-mx; i<nIn; i++){
        zOut[j++] = zOut[i];
      }
      i = j;
    }
  }
  zOut[i] = 0;
  *pnOut = i;
}
315
316
317/*
318** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
319** zOut is at least big enough to hold nIn bytes. Write the actual
320** size of the output word (exclusive of the '\0' terminator) into *pnOut.
321**
322** Any upper-case characters in the US-ASCII character set ([A-Z])
323** are converted to lower case. Upper-case UTF characters are
324** unchanged.
325**
326** Words that are longer than about 20 bytes are stemmed by retaining
327** a few bytes from the beginning and the end of the word. If the
328** word contains digits, 3 bytes are taken from the beginning and
329** 3 bytes from the end. For long words without digits, 10 bytes
330** are taken from each end. US-ASCII case folding still applies.
331**
332** If the input word contains not digits but does characters not
333** in [a-zA-Z] then no stemming is attempted and this routine just
334** copies the input into the input into the output with US-ASCII
335** case folding.
336**
337** Stemming never increases the length of the word. So there is
338** no chance of overflowing the zOut buffer.
339*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];   /* word stored reversed, with padding both ends */
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word into zReverse[] backwards, case-folded.  Leading
  ** slots of zReverse[] remain as slack so that endings can grow by a
  ** few bytes (stem() writes replacements before z); the tail is
  ** zeroed below to nul-terminate. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  z = &zReverse[j+1];   /* z points at the (reversed) word */


  /* Step 1a */
  if( z[0]=='s' ){
    /* The "||" chains below rely on short-circuit evaluation: stem()
    ** returns non-zero as soon as a suffix matches, so at most one
    ** rewrite happens per chain. */
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0)  ||
         stem(&z, "lb", "ble", 0)  ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2: suffix patterns are written reversed (e.g. "lanoita" is
  ** "ational"), dispatched on the second-to-last letter of the word. */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4: remove residual suffixes when m > 1.  Where the suffix is
  ** short and fixed-length, the test and removal are done inline
  ** (z += k) rather than through stem(). */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
556
557/*
558** Characters that can be part of a token. We assume any character
559** whose value is greater than 0x80 (any UTF character) can be
560** part of a token. In other words, delimiters all must have
561** values of 0x7f or lower.
562*/
/* Table entry is 1 if the character (offset by 0x30) may appear in a
** token, 0 if it is a delimiter. */
static const char porterIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* NOTE: isDelim() assigns its argument to a variable named "ch", which
** must be declared as an int in the calling scope (see porterNext()). */
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
572
573/*
574** Extract the next token from a tokenization cursor. The cursor must
575** have been opened by a prior call to porterOpen().
576*/
577static int porterNext(
578 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
579 const char **pzToken, /* OUT: *pzToken is the token text */
580 int *pnBytes, /* OUT: Number of bytes in token */
581 int *piStartOffset, /* OUT: Starting offset of token */
582 int *piEndOffset, /* OUT: Ending offset of token */
583 int *piPosition /* OUT: Position integer of token */
584){
585 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
586 const char *z = c->zInput;
587
588 while( c->iOffset<c->nInput ){
589 int iStartOffset, ch;
590
591 /* Scan past delimiter characters */
592 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
593 c->iOffset++;
594 }
595
596 /* Count non-delimiter characters. */
597 iStartOffset = c->iOffset;
598 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
599 c->iOffset++;
600 }
601
602 if( c->iOffset>iStartOffset ){
603 int n = c->iOffset-iStartOffset;
604 if( n>c->nAllocated ){
605 c->nAllocated = n+20;
606 c->zToken = realloc(c->zToken, c->nAllocated);
607 if( c->zToken==NULL ) return SQLITE_NOMEM;
608 }
609 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
610 *pzToken = c->zToken;
611 *piStartOffset = iStartOffset;
612 *piEndOffset = c->iOffset;
613 *piPosition = c->iToken++;
614 return SQLITE_OK;
615 }
616 }
617 return SQLITE_DONE;
618}
619
620/*
621** The set of routines that implement the porter-stemmer tokenizer
622*/
static const sqlite3_tokenizer_module porterTokenizerModule = {
  0,                    /* iVersion */
  porterCreate,         /* xCreate  */
  porterDestroy,        /* xDestroy */
  porterOpen,           /* xOpen    */
  porterClose,          /* xClose   */
  porterNext,           /* xNext    */
};
631
632/*
633** Allocate a new porter tokenizer. Return a pointer to the new
634** tokenizer in *ppModule
635*/
636void sqlite3Fts2PorterTokenizerModule(
637 sqlite3_tokenizer_module const**ppModule
638){
639 *ppModule = &porterTokenizerModule;
640}
641
642#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c
new file mode 100644
index 0000000..cbf771b
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.c
@@ -0,0 +1,371 @@
1/*
2** 2007 June 22
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** This is part of an SQLite module implementing full-text search.
14** This particular file implements the generic tokenizer interface.
15*/
16
17/*
18** The code in this file is only compiled if:
19**
20** * The FTS2 module is being built as an extension
21** (in which case SQLITE_CORE is not defined), or
22**
23** * The FTS2 module is being built into the core of
24** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
25*/
26#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
27
28
29#include "sqlite3.h"
30#include "sqlite3ext.h"
31SQLITE_EXTENSION_INIT1
32
33#include "fts2_hash.h"
34#include "fts2_tokenizer.h"
35#include <assert.h>
36
37/*
38** Implementation of the SQL scalar function for accessing the underlying
39** hash table. This function may be called as follows:
40**
41** SELECT <function-name>(<key-name>);
42** SELECT <function-name>(<key-name>, <pointer>);
43**
44** where <function-name> is the name passed as the second argument
45** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
46**
47** If the <pointer> argument is specified, it must be a blob value
48** containing a pointer to be stored as the hash data corresponding
49** to the string <key-name>. If <pointer> is not specified, then
50** the string <key-name> must already exist in the has table. Otherwise,
51** an error is returned.
52**
53** Whether or not the <pointer> argument is specified, the value returned
54** is a blob containing the pointer stored as the hash data corresponding
55** to string <key-name> (after the hash-table is updated, if applicable).
56*/
static void scalarFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts2Hash *pHash;
  void *pPtr = 0;
  const unsigned char *zName;
  int nName;

  assert( argc==1 || argc==2 );

  pHash = (fts2Hash *)sqlite3_user_data(context);

  /* Hash keys include the nul-terminator, hence the +1. */
  zName = sqlite3_value_text(argv[0]);
  nName = sqlite3_value_bytes(argv[0])+1;

  if( argc==2 ){
    /* Two arguments: store the supplied pointer under the key. */
    void *pOld;
    int n = sqlite3_value_bytes(argv[1]);
    if( n!=sizeof(pPtr) ){
      sqlite3_result_error(context, "argument type mismatch", -1);
      return;
    }
    pPtr = *(void **)sqlite3_value_blob(argv[1]);
    pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
    /* NOTE(review): pOld==pPtr is treated as an out-of-memory signal —
    ** presumably fts2HashInsert() returns its pData argument when an
    ** internal malloc fails; confirm against fts2_hash.c. */
    if( pOld==pPtr ){
      sqlite3_result_error(context, "out of memory", -1);
      return;
    }
  }else{
    /* One argument: look up the key. */
    pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
    if( !pPtr ){
      char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
      sqlite3_result_error(context, zErr, -1);
      sqlite3_free(zErr);
      return;
    }
  }

  /* In both cases, return the stored pointer value as a blob. */
  sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
}
99
100#ifdef SQLITE_TEST
101
102#include <tcl.h>
103#include <string.h>
104
105/*
106** Implementation of a special SQL scalar function for testing tokenizers
107** designed to be used in concert with the Tcl testing framework. This
108** function must be called with two arguments:
109**
110** SELECT <function-name>(<key-name>, <input-string>);
111** SELECT <function-name>(<key-name>, <pointer>);
112**
113** where <function-name> is the name passed as the second argument
114** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
115** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
116**
117** The return value is a string that may be interpreted as a Tcl
118** list. For each token in the <input-string>, three elements are
119** added to the returned list. The first is the token position, the
120** second is the token text (folded, stemmed, etc.) and the third is the
121** substring of <input-string> associated with the token. For example,
122** using the built-in "simple" tokenizer:
123**
124** SELECT fts_tokenizer_test('simple', 'I don't see how');
125**
126** will return the string:
127**
128** "{0 i I 1 dont don't 2 see see 3 how how}"
129**
130*/
static void testFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts2Hash *pHash;
  sqlite3_tokenizer_module *p;
  sqlite3_tokenizer *pTokenizer = 0;
  sqlite3_tokenizer_cursor *pCsr = 0;

  const char *zErr = 0;            /* Set on failure; checked at "finish" */

  const char *zName;
  int nName;
  const char *zInput;
  int nInput;

  const char *zArg = 0;            /* Optional tokenizer argument */

  const char *zToken;
  int nToken;
  int iStart;
  int iEnd;
  int iPos;

  Tcl_Obj *pRet;                   /* Tcl list accumulating the result */

  assert( argc==2 || argc==3 );

  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[argc-1]);
  zInput = (const char *)sqlite3_value_text(argv[argc-1]);

  if( argc==3 ){
    zArg = (const char *)sqlite3_value_text(argv[1]);
  }

  /* Look up the tokenizer module (hash keys include the
  ** nul-terminator, hence nName+1). */
  pHash = (fts2Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);

  if( !p ){
    /* NOTE: this local zErr deliberately shadows the outer zErr; it is
    ** freed before the early return below. */
    char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    sqlite3_result_error(context, zErr, -1);
    sqlite3_free(zErr);
    return;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
  }
  pCsr->pTokenizer = pTokenizer;

  /* For each token, append three list elements: the token position,
  ** the processed token text, and the substring of the input the
  ** token was derived from. */
  while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    zToken = &zInput[iStart];
    nToken = iEnd-iStart;
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  }

  if( SQLITE_OK!=p->xClose(pCsr) ){
    zErr = "error in xClose()";
    goto finish;
  }
  if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
    zErr = "error in xDestroy()";
    goto finish;
  }

finish:
  if( zErr ){
    sqlite3_result_error(context, zErr, -1);
  }else{
    sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
  }
  Tcl_DecrRefCount(pRet);
}
218
219static
220int registerTokenizer(
221 sqlite3 *db,
222 char *zName,
223 const sqlite3_tokenizer_module *p
224){
225 int rc;
226 sqlite3_stmt *pStmt;
227 const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
228
229 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
230 if( rc!=SQLITE_OK ){
231 return rc;
232 }
233
234 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
235 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
236 sqlite3_step(pStmt);
237
238 return sqlite3_finalize(pStmt);
239}
240
241static
242int queryTokenizer(
243 sqlite3 *db,
244 char *zName,
245 const sqlite3_tokenizer_module **pp
246){
247 int rc;
248 sqlite3_stmt *pStmt;
249 const char zSql[] = "SELECT fts2_tokenizer(?)";
250
251 *pp = 0;
252 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
253 if( rc!=SQLITE_OK ){
254 return rc;
255 }
256
257 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
258 if( SQLITE_ROW==sqlite3_step(pStmt) ){
259 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
260 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
261 }
262 }
263
264 return sqlite3_finalize(pStmt);
265}
266
267void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
268
269/*
270** Implementation of the scalar function fts2_tokenizer_internal_test().
271** This function is used for testing only, it is not included in the
272** build unless SQLITE_TEST is defined.
273**
274** The purpose of this is to test that the fts2_tokenizer() function
** can be used as designed by the C-code in the queryTokenizer() and
** registerTokenizer() functions above. These two functions are repeated
** in the README.tokenizers file as an example, so it is important to
278** test them.
279**
280** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
281** function with no arguments. An assert() will fail if a problem is
282** detected. i.e.:
283**
284** SELECT fts2_tokenizer_internal_test();
285**
286*/
static void intTestFunc(
  sqlite3_context *context,
  int argc,                   /* Unused - function is registered with 0 args */
  sqlite3_value **argv        /* Unused */
){
  int rc;
  const sqlite3_tokenizer_module *p1;
  const sqlite3_tokenizer_module *p2;
  /* The database handle is passed as the user-data pointer when this
  ** function is registered (see sqlite3Fts2InitHashTable() below). */
  sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);

  /* Test the query function */
  sqlite3Fts2SimpleTokenizerModule(&p1);
  /* Looking up "simple" must return the same module pointer that
  ** sqlite3Fts2SimpleTokenizerModule() hands out. */
  rc = queryTokenizer(db, "simple", &p2);
  assert( rc==SQLITE_OK );
  assert( p1==p2 );
  /* Looking up an unregistered name must fail and leave *pp zeroed.
  ** The errmsg assertion depends on this query being the most recent
  ** failed operation on db - do not reorder these statements. */
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_ERROR );
  assert( p2==0 );
  assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );

  /* Test the storage function */
  /* After registering p1 under the previously unknown name, the same
  ** lookup must now succeed and return p1. */
  rc = registerTokenizer(db, "nosuchtokenizer", p1);
  assert( rc==SQLITE_OK );
  rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_OK );
  assert( p2==p1 );

  sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
}
316
317#endif
318
319/*
320** Set up SQL objects in database db used to access the contents of
** the hash table pointed to by argument pHash. The hash table must
** have been initialised to use string keys, and to take a private copy
323** of the key when a value is inserted. i.e. by a call similar to:
324**
325** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
326**
327** This function adds a scalar function (see header comment above
328** scalarFunc() in this file for details) and, if ENABLE_TABLE is
329** defined at compilation time, a temporary virtual table (see header
330** comment above struct HashTableVtab) to the database schema. Both
331** provide read/write access to the contents of *pHash.
332**
333** The third argument to this function, zName, is used as the name
334** of both the scalar and, if created, the virtual table.
335*/
336int sqlite3Fts2InitHashTable(
337 sqlite3 *db,
338 fts2Hash *pHash,
339 const char *zName
340){
341 int rc = SQLITE_OK;
342 void *p = (void *)pHash;
343 const int any = SQLITE_ANY;
344 char *zTest = 0;
345 char *zTest2 = 0;
346
347#ifdef SQLITE_TEST
348 void *pdb = (void *)db;
349 zTest = sqlite3_mprintf("%s_test", zName);
350 zTest2 = sqlite3_mprintf("%s_internal_test", zName);
351 if( !zTest || !zTest2 ){
352 rc = SQLITE_NOMEM;
353 }
354#endif
355
356 if( rc!=SQLITE_OK
357 || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
358 || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
359#ifdef SQLITE_TEST
360 || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
361 || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
362 || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
363#endif
364 );
365
366 sqlite3_free(zTest);
367 sqlite3_free(zTest2);
368 return rc;
369}
370
371#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h
new file mode 100644
index 0000000..8c256b2
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer.h
@@ -0,0 +1,145 @@
1/*
2** 2006 July 10
3**
4** The author disclaims copyright to this source code.
5**
6*************************************************************************
7** Defines the interface to tokenizers used by fulltext-search. There
8** are three basic components:
9**
10** sqlite3_tokenizer_module is a singleton defining the tokenizer
11** interface functions. This is essentially the class structure for
12** tokenizers.
13**
14** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
15** including customization information defined at creation time.
16**
17** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
18** tokens from a particular input.
19*/
20#ifndef _FTS2_TOKENIZER_H_
21#define _FTS2_TOKENIZER_H_
22
23/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
24** If tokenizers are to be allowed to call sqlite3_*() functions, then
25** we will need a way to register the API consistently.
26*/
27#include "sqlite3.h"
28
29/*
30** Structures used by the tokenizer interface. When a new tokenizer
31** implementation is registered, the caller provides a pointer to
32** an sqlite3_tokenizer_module containing pointers to the callback
33** functions that make up an implementation.
34**
35** When an fts2 table is created, it passes any arguments passed to
36** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
37** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
38** implementation. The xCreate() function in turn returns an
39** sqlite3_tokenizer structure representing the specific tokenizer to
40** be used for the fts2 table (customized by the tokenizer clause arguments).
41**
42** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
43** method is called. It returns an sqlite3_tokenizer_cursor object
44** that may be used to tokenize a specific input buffer based on
45** the tokenization rules supplied by a specific sqlite3_tokenizer
46** object.
47*/
48typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
49typedef struct sqlite3_tokenizer sqlite3_tokenizer;
50typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
51
struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts2 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts2( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,             /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts2 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts2 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset.
  **
  ** *piPosition should be set to the zero-based index of the token in
  ** the token stream, i.e. the number of tokens returned before this one.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};
134
struct sqlite3_tokenizer {
  /* pModule is initialised by the code that calls xCreate(), not by the
  ** tokenizer implementation itself (see the xCreate() comment above). */
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};
139
struct sqlite3_tokenizer_cursor {
  /* pTokenizer is initialised by the code that calls xOpen(), not by the
  ** tokenizer implementation itself. */
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};
144
145#endif /* _FTS2_TOKENIZER_H_ */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c
new file mode 100644
index 0000000..540ba27
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/fts2_tokenizer1.c
@@ -0,0 +1,229 @@
1/*
2** 2006 Oct 10
3**
4** The author disclaims copyright to this source code. In place of
5** a legal notice, here is a blessing:
6**
7** May you do good and not evil.
8** May you find forgiveness for yourself and forgive others.
9** May you share freely, never taking more than you give.
10**
11******************************************************************************
12**
13** Implementation of the "simple" full-text-search tokenizer.
14*/
15
16/*
17** The code in this file is only compiled if:
18**
19** * The FTS2 module is being built as an extension
20** (in which case SQLITE_CORE is not defined), or
21**
22** * The FTS2 module is being built into the core of
23** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
24*/
25#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
26
27
28#include <assert.h>
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
32#include <ctype.h>
33
34#include "fts2_tokenizer.h"
35
typedef struct simple_tokenizer {
  sqlite3_tokenizer base;  /* Base class - must be first (code below casts
                           ** between sqlite3_tokenizer* and this type) */
  char delim[128];         /* flag ASCII delimiters: delim[ch]!=0 means
                           ** byte value ch is a token separator */
} simple_tokenizer;
40
typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;  /* Base class - must be first */
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to pToken buffer */
} simple_tokenizer_cursor;
50
51
52/* Forward declaration */
53static const sqlite3_tokenizer_module simpleTokenizerModule;
54
55static int simpleDelim(simple_tokenizer *t, unsigned char c){
56 return c<0x80 && t->delim[c];
57}
58
59/*
60** Create a new tokenizer instance.
61*/
62static int simpleCreate(
63 int argc, const char * const *argv,
64 sqlite3_tokenizer **ppTokenizer
65){
66 simple_tokenizer *t;
67
68 t = (simple_tokenizer *) calloc(sizeof(*t), 1);
69 if( t==NULL ) return SQLITE_NOMEM;
70
71 /* TODO(shess) Delimiters need to remain the same from run to run,
72 ** else we need to reindex. One solution would be a meta-table to
73 ** track such information in the database, then we'd only want this
74 ** information on the initial create.
75 */
76 if( argc>1 ){
77 int i, n = strlen(argv[1]);
78 for(i=0; i<n; i++){
79 unsigned char ch = argv[1][i];
80 /* We explicitly don't support UTF-8 delimiters for now. */
81 if( ch>=0x80 ){
82 free(t);
83 return SQLITE_ERROR;
84 }
85 t->delim[ch] = 1;
86 }
87 } else {
88 /* Mark non-alphanumeric ASCII characters as delimiters */
89 int i;
90 for(i=1; i<0x80; i++){
91 t->delim[i] = !isalnum(i);
92 }
93 }
94
95 *ppTokenizer = &t->base;
96 return SQLITE_OK;
97}
98
99/*
100** Destroy a tokenizer
101*/
102static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
103 free(pTokenizer);
104 return SQLITE_OK;
105}
106
107/*
108** Prepare to begin tokenizing a particular string. The input
109** string to be tokenized is pInput[0..nBytes-1]. A cursor
110** used to incrementally tokenize this string is returned in
111** *ppCursor.
112*/
113static int simpleOpen(
114 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
115 const char *pInput, int nBytes, /* String to be tokenized */
116 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
117){
118 simple_tokenizer_cursor *c;
119
120 c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
121 if( c==NULL ) return SQLITE_NOMEM;
122
123 c->pInput = pInput;
124 if( pInput==0 ){
125 c->nBytes = 0;
126 }else if( nBytes<0 ){
127 c->nBytes = (int)strlen(pInput);
128 }else{
129 c->nBytes = nBytes;
130 }
131 c->iOffset = 0; /* start tokenizing at the beginning */
132 c->iToken = 0;
133 c->pToken = NULL; /* no space allocated, yet. */
134 c->nTokenAllocated = 0;
135
136 *ppCursor = &c->base;
137 return SQLITE_OK;
138}
139
140/*
141** Close a tokenization cursor previously opened by a call to
142** simpleOpen() above.
143*/
144static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
145 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
146 free(c->pToken);
147 free(c);
148 return SQLITE_OK;
149}
150
151/*
152** Extract the next token from a tokenization cursor. The cursor must
153** have been opened by a prior call to simpleOpen().
154*/
155static int simpleNext(
156 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
157 const char **ppToken, /* OUT: *ppToken is the token text */
158 int *pnBytes, /* OUT: Number of bytes in token */
159 int *piStartOffset, /* OUT: Starting offset of token */
160 int *piEndOffset, /* OUT: Ending offset of token */
161 int *piPosition /* OUT: Position integer of token */
162){
163 simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
164 simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
165 unsigned char *p = (unsigned char *)c->pInput;
166
167 while( c->iOffset<c->nBytes ){
168 int iStartOffset;
169
170 /* Scan past delimiter characters */
171 while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
172 c->iOffset++;
173 }
174
175 /* Count non-delimiter characters. */
176 iStartOffset = c->iOffset;
177 while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
178 c->iOffset++;
179 }
180
181 if( c->iOffset>iStartOffset ){
182 int i, n = c->iOffset-iStartOffset;
183 if( n>c->nTokenAllocated ){
184 c->nTokenAllocated = n+20;
185 c->pToken = realloc(c->pToken, c->nTokenAllocated);
186 if( c->pToken==NULL ) return SQLITE_NOMEM;
187 }
188 for(i=0; i<n; i++){
189 /* TODO(shess) This needs expansion to handle UTF-8
190 ** case-insensitivity.
191 */
192 unsigned char ch = p[iStartOffset+i];
193 c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
194 }
195 *ppToken = c->pToken;
196 *pnBytes = n;
197 *piStartOffset = iStartOffset;
198 *piEndOffset = c->iOffset;
199 *piPosition = c->iToken++;
200
201 return SQLITE_OK;
202 }
203 }
204 return SQLITE_DONE;
205}
206
/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,                /* iVersion */
  simpleCreate,     /* xCreate */
  simpleDestroy,    /* xDestroy */
  simpleOpen,       /* xOpen */
  simpleClose,      /* xClose */
  simpleNext,       /* xNext */
};
218
/*
** Return a pointer to the simple tokenizer implementation in *ppModule.
** No allocation takes place: simpleTokenizerModule is a single static
** structure shared by all users of the module.
*/
void sqlite3Fts2SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &simpleTokenizerModule;
}
228
229#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl
new file mode 100644
index 0000000..5c8d1e9
--- /dev/null
+++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts2/mkfts2amal.tcl
@@ -0,0 +1,116 @@
1#!/usr/bin/tclsh
2#
3# This script builds a single C code file holding all of FTS2 code.
4# The name of the output file is fts2amal.c. To build this file,
5# first do:
6#
7# make target_source
8#
9# The make target above moves all of the source code files into
10# a subdirectory named "tsrc". (This script expects to find the files
11# there and will not work if they are not found.)
12#
13# After the "tsrc" directory has been created and populated, run
14# this script:
15#
16# tclsh mkfts2amal.tcl
17#
18# The amalgamated FTS2 code will be written into fts2amal.c
19#
20
# Open the output file and write a header comment at the beginning
# of the file.  The generation timestamp is recorded in UTC.
#
set out [open fts2amal.c w]
set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1]
puts $out [subst \
{/******************************************************************************
** This file is an amalgamation of separate C source files from the SQLite
** Full Text Search extension 2 (fts2). By combining all the individual C
** code files into this single large file, the entire code can be compiled
** as a one translation unit. This allows many compilers to do optimizations
** that would not be possible if the files were compiled separately. It also
** makes the code easier to import into other projects.
**
** This amalgamation was generated on $today.
*/}]

# These are the header files used by FTS2.  The first time any of these
# files are seen in a #include statement in the C code, include the complete
# text of the file in-line.  The file only needs to be included once.
#
foreach hdr {
   fts2.h
   fts2_hash.h
   fts2_tokenizer.h
   sqlite3.h
   sqlite3ext.h
} {
  set available_hdr($hdr) 1
}

# 78 stars used for comment formatting.
set s78 \
{*****************************************************************************}
55
# Emit a decorated section comment into the generated output file,
# padded with stars so that comments line up at a fixed width.
#
proc section_comment {text} {
  global out s78
  set nstar [expr {60 - [string length $text]}]
  puts $out "/************** $text [string range $s78 0 $nstar]/"
}
65
# Read the source file named $filename and copy it into the amalgamation
# output file.  If any #include statements are seen, process them
# appropriately: headers listed in available_hdr() are inlined (recursively)
# the first time they are seen; any other header line is emitted once and
# skipped on subsequent occurrences.
#
# NOTE: the proc argument list was corrupted to "(unknown)" in this copy
# of the file; the body references $filename, so the parameter must be
# named filename for the script to run at all.
#
proc copy_file {filename} {
  global seen_hdr available_hdr out
  set tail [file tail $filename]
  section_comment "Begin file $tail"
  set in [open $filename r]
  while {![eof $in]} {
    set line [gets $in]
    if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
      if {[info exists available_hdr($hdr)]} {
        if {$available_hdr($hdr)} {
          section_comment "Include $hdr in the middle of $tail"
          copy_file tsrc/$hdr
          section_comment "Continuing where we left off in $tail"
        }
      } elseif {![info exists seen_hdr($hdr)]} {
        set seen_hdr($hdr) 1
        puts $out $line
      }
    } elseif {[regexp {^#ifdef __cplusplus} $line]} {
      puts $out "#if 0"
    } elseif {[regexp {^#line} $line]} {
      # Skip #line directives.
    } else {
      puts $out $line
    }
  }
  close $in
  section_comment "End of $tail"
}
99
100
# Process the source files.  Files that contain commonly used
# subroutines are copied first in order to help the compiler find
# inlining opportunities.
#
foreach srcfile {
  fts2.c
  fts2_hash.c
  fts2_porter.c
  fts2_tokenizer.c
  fts2_tokenizer1.c
  fts2_icu.c
} {
  copy_file tsrc/$srcfile
}

close $out