diff options
author | dan miller | 2007-10-20 02:49:29 +0000 |
---|---|---|
committer | dan miller | 2007-10-20 02:49:29 +0000 |
commit | e36d23a85ebff914d74bb541558c2b6082b78edb (patch) | |
tree | 54b58fdf162e78af64055282a6035c8d2443389d /libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers | |
parent | * Fixed an issue whereby avatar chat distances were being calculated against ... (diff) | |
download | opensim-SC-e36d23a85ebff914d74bb541558c2b6082b78edb.zip opensim-SC-e36d23a85ebff914d74bb541558c2b6082b78edb.tar.gz opensim-SC-e36d23a85ebff914d74bb541558c2b6082b78edb.tar.bz2 opensim-SC-e36d23a85ebff914d74bb541558c2b6082b78edb.tar.xz |
sqlite source (unix build) added to libraries
Diffstat (limited to 'libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers')
-rw-r--r-- | libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers | 134 |
1 files changed, 134 insertions, 0 deletions
diff --git a/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers new file mode 100644 index 0000000..f214b24 --- /dev/null +++ b/libraries/sqlite/unix/sqlite-3.5.1/ext/fts3/README.tokenizers | |||
@@ -0,0 +1,134 @@ | |||
1 | |||
2 | 1. FTS3 Tokenizers | ||
3 | |||
4 | When creating a new full-text table, FTS3 allows the user to select | ||
5 | the text tokenizer implementation to be used when indexing text | ||
6 | by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE | ||
7 | statement: | ||
8 | |||
9 | CREATE VIRTUAL TABLE <table-name> USING fts3( | ||
10 | <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]] | ||
11 | ); | ||
12 | |||
13 | The built-in tokenizers (valid values to pass as <tokenizer-name>) are | ||
14 | "simple" and "porter". | ||
15 | |||
16 | <tokenizer-args> should consist of zero or more white-space separated | ||
17 | arguments to pass to the selected tokenizer implementation. The | ||
18 | interpretation of the arguments, if any, depends on the individual | ||
19 | tokenizer. | ||
20 | |||
21 | 2. Custom Tokenizers | ||
22 | |||
23 | FTS3 allows users to provide custom tokenizer implementations. The | ||
24 | interface used to create a new tokenizer is defined and described in | ||
25 | the fts3_tokenizer.h source file. | ||
26 | |||
27 | Registering a new FTS3 tokenizer is similar to registering a new | ||
28 | virtual table module with SQLite. The user passes a pointer to a | ||
29 | structure containing pointers to various callback functions that | ||
30 | make up the implementation of the new tokenizer type. For tokenizers, | ||
31 | the structure (defined in fts3_tokenizer.h) is called | ||
32 | "sqlite3_tokenizer_module". | ||
33 | |||
34 | FTS3 does not expose a C-function that users call to register new | ||
35 | tokenizer types with a database handle. Instead, the pointer must | ||
36 | be encoded as an SQL blob value and passed to FTS3 through the SQL | ||
37 | engine by evaluating a special scalar function, "fts3_tokenizer()". | ||
38 | The fts3_tokenizer() function may be called with one or two arguments, | ||
39 | as follows: | ||
40 | |||
41 | SELECT fts3_tokenizer(<tokenizer-name>); | ||
42 | SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>); | ||
43 | |||
44 | Where <tokenizer-name> is a string identifying the tokenizer and | ||
45 | <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module | ||
46 | structure encoded as an SQL blob. If the second argument is present, | ||
47 | it is registered as tokenizer <tokenizer-name> and a copy of it is | ||
48 | returned. If only one argument is passed, a pointer to the tokenizer | ||
49 | implementation currently registered as <tokenizer-name> is returned, | ||
50 | encoded as a blob. Or, if no such tokenizer exists, an SQL exception | ||
51 | (error) is raised. | ||
52 | |||
53 | SECURITY: If the fts3 extension is used in an environment where potentially | ||
54 | malicious users may execute arbitrary SQL (e.g. Gears), they should be | ||
55 | prevented from invoking the fts3_tokenizer() function, possibly using the | ||
56 | authorisation callback. | ||
57 | |||
58 | See "Sample code" below for an example of calling the fts3_tokenizer() | ||
59 | function from C code. | ||
60 | |||
61 | 3. ICU Library Tokenizers | ||
62 | |||
63 | If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor | ||
64 | symbol defined, then there exists a built-in tokenizer named "icu" | ||
65 | implemented using the ICU library. The first argument passed to the | ||
66 | xCreate() method (see fts3_tokenizer.h) of this tokenizer may be | ||
67 | an ICU locale identifier, such as "tr_TR" for Turkish as used | ||
68 | in Turkey, or "en_AU" for English as used in Australia. For example: | ||
69 | |||
70 | "CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenizer icu th_TH)" | ||
71 | |||
72 | The ICU tokenizer implementation is very simple. It splits the input | ||
73 | text according to the ICU rules for finding word boundaries and discards | ||
74 | any tokens that consist entirely of white-space. This may be suitable | ||
75 | for some applications in some locales, but not all. If more complex | ||
76 | processing is required, for example to implement stemming or | ||
77 | discard punctuation, this can be done by creating a tokenizer | ||
78 | implementation that uses the ICU tokenizer as part of its implementation. | ||
79 | |||
80 | When using the ICU tokenizer this way, it is safe to overwrite the | ||
81 | contents of the strings returned by the xNext() method (see | ||
82 | fts3_tokenizer.h). | ||
83 | |||
84 | 4. Sample code | ||
85 | |||
86 | The following two code samples illustrate the way C code should invoke | ||
87 | the fts3_tokenizer() scalar function: | ||
88 | |||
89 | int registerTokenizer( | ||
90 | sqlite3 *db, | ||
91 | char *zName, | ||
92 | const sqlite3_tokenizer_module *p | ||
93 | ){ | ||
94 | int rc; | ||
95 | sqlite3_stmt *pStmt; | ||
96 | const char zSql[] = "SELECT fts3_tokenizer(?, ?)"; | ||
97 | |||
98 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
99 | if( rc!=SQLITE_OK ){ | ||
100 | return rc; | ||
101 | } | ||
102 | |||
103 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
104 | sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); | ||
105 | sqlite3_step(pStmt); | ||
106 | |||
107 | return sqlite3_finalize(pStmt); | ||
108 | } | ||
109 | |||
110 | int queryTokenizer( | ||
111 | sqlite3 *db, | ||
112 | char *zName, | ||
113 | const sqlite3_tokenizer_module **pp | ||
114 | ){ | ||
115 | int rc; | ||
116 | sqlite3_stmt *pStmt; | ||
117 | const char zSql[] = "SELECT fts3_tokenizer(?)"; | ||
118 | |||
119 | *pp = 0; | ||
120 | rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); | ||
121 | if( rc!=SQLITE_OK ){ | ||
122 | return rc; | ||
123 | } | ||
124 | |||
125 | sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); | ||
126 | if( SQLITE_ROW==sqlite3_step(pStmt) ){ | ||
127 | if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ | ||
128 | memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); | ||
129 | } | ||
130 | } | ||
131 | |||
132 | return sqlite3_finalize(pStmt); | ||
133 | } | ||
134 | |||