#ifdef HAVE_STDINT_H #include <stdint.h> #endif #ifdef HAVE_INTTYPES_H #include <inttypes.h> #endif #line 1 "fts5.h" /* ** 2014 May 31 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** ** ** Interfaces to extend FTS5. Using the interfaces defined in this file, ** FTS5 may be extended with: ** ** * custom tokenizers, and ** * custom auxiliary functions.
*/
#ifndef _FTS5_H #define _FTS5_H
#include"sqlite3.h"
#ifdef __cplusplus extern"C" { #endif
/************************************************************************* ** CUSTOM AUXILIARY FUNCTIONS ** ** Virtual table implementations may overload SQL functions by implementing ** the sqlite3_module.xFindFunction() method.
*/
typedefvoid (*fts5_extension_function)( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
Fts5Context *pFts, /* First arg to pass to pApi functions */
sqlite3_context *pCtx, /* Context for returning result/error */ int nVal, /* Number of values in apVal[] array */
sqlite3_value **apVal /* Array of trailing arguments */
);
/* ** EXTENSION API FUNCTIONS ** ** xUserData(pFts): ** Return a copy of the pUserData pointer passed to the xCreateFunction() ** API when the extension function was registered. ** ** xColumnTotalSize(pFts, iCol, pnToken): ** If parameter iCol is less than zero, set output variable *pnToken ** to the total number of tokens in the FTS5 table. Or, if iCol is ** non-negative but less than the number of columns in the table, return ** the total number of tokens in column iCol, considering all rows in ** the FTS5 table. ** ** If parameter iCol is greater than or equal to the number of columns ** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. ** an OOM condition or IO error), an appropriate SQLite error code is ** returned. ** ** xColumnCount(pFts): ** Return the number of columns in the table. ** ** xColumnSize(pFts, iCol, pnToken): ** If parameter iCol is less than zero, set output variable *pnToken ** to the total number of tokens in the current row. Or, if iCol is ** non-negative but less than the number of columns in the table, set ** *pnToken to the number of tokens in column iCol of the current row. ** ** If parameter iCol is greater than or equal to the number of columns ** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. ** an OOM condition or IO error), an appropriate SQLite error code is ** returned. ** ** This function may be quite inefficient if used with an FTS5 table ** created with the "columnsize=0" option. ** ** xColumnText: ** If parameter iCol is less than zero, or greater than or equal to the ** number of columns in the table, SQLITE_RANGE is returned. ** ** Otherwise, this function attempts to retrieve the text of column iCol of ** the current document. If successful, (*pz) is set to point to a buffer ** containing the text in utf-8 encoding, (*pn) is set to the size in bytes ** (not characters) of the buffer and SQLITE_OK is returned. 
Otherwise, ** if an error occurs, an SQLite error code is returned and the final values ** of (*pz) and (*pn) are undefined. ** ** xPhraseCount: ** Returns the number of phrases in the current query expression. ** ** xPhraseSize: ** If parameter iPhrase is less than zero, or greater than or equal to the ** number of phrases in the current query, as returned by xPhraseCount, ** 0 is returned. Otherwise, this function returns the number of tokens in ** phrase iPhrase of the query. Phrases are numbered starting from zero. ** ** xInstCount: ** Set *pnInst to the total number of occurrences of all phrases within ** the query within the current row. Return SQLITE_OK if successful, or ** an error code (e.g. SQLITE_NOMEM) if an error occurs. ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. If the FTS5 table is created ** with either "detail=none" or "detail=column" and "content=" option ** (i.e. if it is a contentless table), then this API always returns 0. ** ** xInst: ** Query for the details of phrase match iIdx within the current row. ** Phrase matches are numbered starting from zero, so the iIdx argument ** should be greater than or equal to zero and smaller than the value ** output by xInstCount(). If iIdx is less than zero or greater than ** or equal to the value returned by xInstCount(), SQLITE_RANGE is returned. ** ** Otherwise, output parameter *piPhrase is set to the phrase number, *piCol ** to the column in which it occurs and *piOff the token offset of the ** first token of the phrase. SQLITE_OK is returned if successful, or an ** error code (e.g. SQLITE_NOMEM) if an error occurs. ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. ** ** xRowid: ** Returns the rowid of the current row. ** ** xTokenize: ** Tokenize text using the tokenizer belonging to the FTS5 table. 
** ** xQueryPhrase(pFts5, iPhrase, pUserData, xCallback): ** This API function is used to query the FTS table for phrase iPhrase ** of the current query. Specifically, a query equivalent to: ** ** ... FROM ftstable WHERE ftstable MATCH $p ORDER BY rowid ** ** with $p set to a phrase equivalent to the phrase iPhrase of the ** current query is executed. Any column filter that applies to ** phrase iPhrase of the current query is included in $p. For each ** row visited, the callback function passed as the fourth argument ** is invoked. The context and API objects passed to the callback ** function may be used to access the properties of each matched row. ** Invoking Api.xUserData() returns a copy of the pointer passed as ** the third argument to pUserData. ** ** If parameter iPhrase is less than zero, or greater than or equal to ** the number of phrases in the query, as returned by xPhraseCount(), ** this function returns SQLITE_RANGE. ** ** If the callback function returns any value other than SQLITE_OK, the ** query is abandoned and the xQueryPhrase function returns immediately. ** If the returned value is SQLITE_DONE, xQueryPhrase returns SQLITE_OK. ** Otherwise, the error code is propagated upwards. ** ** If the query runs to completion without incident, SQLITE_OK is returned. ** Or, if some error occurs before the query completes or is aborted by ** the callback, an SQLite error code is returned. ** ** ** xSetAuxdata(pFts5, pAux, xDelete) ** ** Save the pointer passed as the second argument as the extension function's ** "auxiliary data". The pointer may then be retrieved by the current or any ** future invocation of the same fts5 extension function made as part of ** the same MATCH query using the xGetAuxdata() API. ** ** Each extension function is allocated a single auxiliary data slot for ** each FTS query (MATCH expression). 
If the extension function is invoked ** more than once for a single FTS query, then all invocations share a ** single auxiliary data context. ** ** If there is already an auxiliary data pointer when this function is ** invoked, then it is replaced by the new pointer. If an xDelete callback ** was specified along with the original pointer, it is invoked at this ** point. ** ** The xDelete callback, if one is specified, is also invoked on the ** auxiliary data pointer after the FTS5 query has finished. ** ** If an error (e.g. an OOM condition) occurs within this function, ** the auxiliary data is set to NULL and an error code returned. If the ** xDelete parameter was not NULL, it is invoked on the auxiliary data ** pointer before returning. ** ** ** xGetAuxdata(pFts5, bClear) ** ** Returns the current auxiliary data pointer for the fts5 extension ** function. See the xSetAuxdata() method for details. ** ** If the bClear argument is non-zero, then the auxiliary data is cleared ** (set to NULL) before this function returns. In this case the xDelete, ** if any, is not invoked. ** ** ** xRowCount(pFts5, pnRow) ** ** This function is used to retrieve the total number of rows in the table. ** In other words, the same value that would be returned by: ** ** SELECT count(*) FROM ftstable; ** ** xPhraseFirst() ** This function is used, along with type Fts5PhraseIter and the xPhraseNext ** method, to iterate through all instances of a single query phrase within ** the current row. This is the same information as is accessible via the ** xInstCount/xInst APIs. While the xInstCount/xInst APIs are more convenient ** to use, this API may be faster under some circumstances. 
To iterate ** through instances of phrase iPhrase, use the following code: ** ** Fts5PhraseIter iter; ** int iCol, iOff; ** for(pApi->xPhraseFirst(pFts, iPhrase, &iter, &iCol, &iOff); ** iCol>=0; ** pApi->xPhraseNext(pFts, &iter, &iCol, &iOff) ** ){ ** // An instance of phrase iPhrase at offset iOff of column iCol ** } ** ** The Fts5PhraseIter structure is defined above. Applications should not ** modify this structure directly - it should only be used as shown above ** with the xPhraseFirst() and xPhraseNext() API methods (and by ** xPhraseFirstColumn() and xPhraseNextColumn() as illustrated below). ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. If the FTS5 table is created ** with either "detail=none" or "detail=column" and "content=" option ** (i.e. if it is a contentless table), then this API always iterates ** through an empty set (all calls to xPhraseFirst() set iCol to -1). ** ** In all cases, matches are visited in (column ASC, offset ASC) order. ** i.e. all those in column 0, sorted by offset, followed by those in ** column 1, etc. ** ** xPhraseNext() ** See xPhraseFirst above. ** ** xPhraseFirstColumn() ** This function and xPhraseNextColumn() are similar to the xPhraseFirst() ** and xPhraseNext() APIs described above. The difference is that instead ** of iterating through all instances of a phrase in the current row, these ** APIs are used to iterate through the set of columns in the current row ** that contain one or more instances of a specified phrase. For example: ** ** Fts5PhraseIter iter; ** int iCol; ** for(pApi->xPhraseFirstColumn(pFts, iPhrase, &iter, &iCol); ** iCol>=0; ** pApi->xPhraseNextColumn(pFts, &iter, &iCol) ** ){ ** // Column iCol contains at least one instance of phrase iPhrase ** } ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" option. 
If the FTS5 table is created with both the ** "detail=none" and "content=" options (i.e. if it is a contentless table), ** then this API always iterates through an empty set (all calls to ** xPhraseFirstColumn() set iCol to -1). ** ** The information accessed using this API and its companion ** xPhraseNextColumn() may also be obtained using xPhraseFirst/xPhraseNext ** (or xInst/xInstCount). The chief advantage of this API is that it is ** significantly more efficient than those alternatives when used with ** "detail=column" tables. ** ** xPhraseNextColumn() ** See xPhraseFirstColumn above. ** ** xQueryToken(pFts5, iPhrase, iToken, ppToken, pnToken) ** This is used to access token iToken of phrase iPhrase of the current ** query. Before returning, output parameter *ppToken is set to point ** to a buffer containing the requested token, and *pnToken to the ** size of this buffer in bytes. ** ** If iPhrase or iToken are less than zero, or if iPhrase is greater than ** or equal to the number of phrases in the query as reported by ** xPhraseCount(), or if iToken is equal to or greater than the number of ** tokens in the phrase, SQLITE_RANGE is returned and *ppToken and *pnToken are both zeroed. ** ** The output text is not a copy of the query text that specified the ** token. It is the output of the tokenizer module. For tokendata=1 ** tables, this includes any embedded 0x00 and trailing data. ** ** xInstToken(pFts5, iIdx, iToken, ppToken, pnToken) ** This is used to access token iToken of phrase hit iIdx within the ** current row. If iIdx is less than zero or greater than or equal to the ** value returned by xInstCount(), SQLITE_RANGE is returned. Otherwise, ** output variable (*ppToken) is set to point to a buffer containing the ** matching document token, and (*pnToken) to the size of that buffer in ** bytes. This API is not available if the specified token matches a ** prefix query term. In that case both output variables are always set ** to 0. 
** ** The output text is not a copy of the document text that was tokenized. ** It is the output of the tokenizer module. For tokendata=1 tables, this ** includes any embedded 0x00 and trailing data. ** ** This API can be quite slow if used with an FTS5 table created with the ** "detail=none" or "detail=column" option. ** ** xColumnLocale(pFts5, iIdx, pzLocale, pnLocale) ** If parameter iCol is less than zero, or greater than or equal to the ** number of columns in the table, SQLITE_RANGE is returned. ** ** Otherwise, this function attempts to retrieve the locale associated ** with column iCol of the current row. Usually, there is no associated ** locale, and output parameters (*pzLocale) and (*pnLocale) are set ** to NULL and 0, respectively. However, if the fts5_locale() function ** was used to associate a locale with the value when it was inserted ** into the fts5 table, then (*pzLocale) is set to point to a nul-terminated ** buffer containing the name of the locale in utf-8 encoding. (*pnLocale) ** is set to the size in bytes of the buffer, not including the ** nul-terminator. ** ** If successful, SQLITE_OK is returned. Or, if an error occurs, an ** SQLite error code is returned. The final value of the output parameters ** is undefined in this case. ** ** xTokenize_v2: ** Tokenize text using the tokenizer belonging to the FTS5 table. This ** API is the same as the xTokenize() API, except that it allows a tokenizer ** locale to be specified.
*/
/*
** Method table handed to every auxiliary function as its pApi argument.
** The contract of each method is given in the EXTENSION API FUNCTIONS
** comment that precedes this structure. Members that follow an
** "iVersion>=N only" marker may only be used when iVersion is at least N.
*/
struct Fts5ExtensionApi {
  int iVersion;                   /* Currently always set to 4 */

  void *(*xUserData)(Fts5Context*);

  int (*xColumnCount)(Fts5Context*);
  int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow);
  int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);

  int (*xTokenize)(Fts5Context*,
    const char *pText, int nText, /* Text to tokenize */
    void *pCtx,                   /* Context passed to xToken() */
    int (*xToken)(void*, int, const char*, int, int, int)       /* Callback */
  );

  int (*xPhraseCount)(Fts5Context*);
  int (*xPhraseSize)(Fts5Context*, int iPhrase);

  int (*xInstCount)(Fts5Context*, int *pnInst);
  int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff);

  sqlite3_int64 (*xRowid)(Fts5Context*);
  int (*xColumnText)(Fts5Context*, int iCol, const char **pz, int *pn);
  int (*xColumnSize)(Fts5Context*, int iCol, int *pnToken);

  int (*xQueryPhrase)(Fts5Context*, int iPhrase, void *pUserData,
    int(*)(const Fts5ExtensionApi*,Fts5Context*,void*)
  );
  int (*xSetAuxdata)(Fts5Context*, void *pAux, void(*xDelete)(void*));
  void *(*xGetAuxdata)(Fts5Context*, int bClear);

  int (*xPhraseFirst)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*);
  void (*xPhraseNext)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff);

  int (*xPhraseFirstColumn)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*);
  void (*xPhraseNextColumn)(Fts5Context*, Fts5PhraseIter*, int *piCol);

  /* Below this point are iVersion>=3 only */
  int (*xQueryToken)(Fts5Context*,
      int iPhrase, int iToken,
      const char **ppToken, int *pnToken
  );
  int (*xInstToken)(Fts5Context*, int iIdx, int iToken, const char**, int*);

  /* Below this point are iVersion>=4 only */
  int (*xColumnLocale)(Fts5Context*, int iCol, const char **pz, int *pn);
  int (*xTokenize_v2)(Fts5Context*,
    const char *pText, int nText,      /* Text to tokenize */
    const char *pLocale, int nLocale,  /* Locale to pass to tokenizer */
    void *pCtx,                        /* Context passed to xToken() */
    int (*xToken)(void*, int, const char*, int, int, int)       /* Callback */
  );
};
/************************************************************************* ** CUSTOM TOKENIZERS ** ** Applications may also register custom tokenizer types. A tokenizer ** is registered by providing fts5 with a populated instance of the ** following structure. All structure methods must be defined, setting ** any member of the fts5_tokenizer struct to NULL leads to undefined ** behaviour. The structure methods are expected to function as follows: ** ** xCreate: ** This function is used to allocate and initialize a tokenizer instance. ** A tokenizer instance is required to actually tokenize text. ** ** The first argument passed to this function is a copy of the (void*) ** pointer provided by the application when the fts5_tokenizer_v2 object ** was registered with FTS5 (the third argument to xCreateTokenizer()). ** The second and third arguments are an array of nul-terminated strings ** containing the tokenizer arguments, if any, specified following the ** tokenizer name as part of the CREATE VIRTUAL TABLE statement used ** to create the FTS5 table. ** ** The final argument is an output variable. If successful, (*ppOut) ** should be set to point to the new tokenizer handle and SQLITE_OK ** returned. If an error occurs, some value other than SQLITE_OK should ** be returned. In this case, fts5 assumes that the final value of *ppOut ** is undefined. ** ** xDelete: ** This function is invoked to delete a tokenizer handle previously ** allocated using xCreate(). Fts5 guarantees that this function will ** be invoked exactly once for each successful call to xCreate(). ** ** xTokenize: ** This function is expected to tokenize the nText byte string indicated ** by argument pText. pText may or may not be nul-terminated. The first ** argument passed to this function is a pointer to an Fts5Tokenizer object ** returned by an earlier call to xCreate(). ** ** The third argument indicates the reason that FTS5 is requesting ** tokenization of the supplied text. 
This is always one of the following ** four values: ** ** <ul><li> <b>FTS5_TOKENIZE_DOCUMENT</b> - A document is being inserted into ** or removed from the FTS table. The tokenizer is being invoked to ** determine the set of tokens to add to (or delete from) the ** FTS index. ** ** <li> <b>FTS5_TOKENIZE_QUERY</b> - A MATCH query is being executed ** against the FTS index. The tokenizer is being called to tokenize ** a bareword or quoted string specified as part of the query. ** ** <li> <b>(FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX)</b> - Same as ** FTS5_TOKENIZE_QUERY, except that the bareword or quoted string is ** followed by a "*" character, indicating that the last token ** returned by the tokenizer will be treated as a token prefix. ** ** <li> <b>FTS5_TOKENIZE_AUX</b> - The tokenizer is being invoked to ** satisfy an Fts5ExtensionApi.xTokenize() request made by an auxiliary ** function. Or an Fts5ExtensionApi.xColumnSize() request made by the same ** on a columnsize=0 database. ** </ul> ** ** The sixth and seventh arguments passed to xTokenize() - pLocale and ** nLocale - are a pointer to a buffer containing the locale to use for ** tokenization (e.g. "en_US") and its size in bytes, respectively. The ** pLocale buffer is not nul-terminated. pLocale may be passed NULL (in ** which case nLocale is always 0) to indicate that the tokenizer should ** use its default locale. ** ** For each token in the input string, the supplied callback xToken() must ** be invoked. The first argument to it should be a copy of the pointer ** passed as the second argument to xTokenize(). The third and fourth ** arguments are a pointer to a buffer containing the token text, and the ** size of the token in bytes. The fifth and sixth arguments are the byte offsets ** of the first byte of and first byte immediately following the text from ** which the token is derived within the input. ** ** The second argument passed to the xToken() callback ("tflags") should ** normally be set to 0. 
The exception is if the tokenizer supports ** synonyms. In this case see the discussion below for details. ** ** FTS5 assumes the xToken() callback is invoked for each token in the ** order that they occur within the input text. ** ** If an xToken() callback returns any value other than SQLITE_OK, then ** the tokenization should be abandoned and the xTokenize() method should ** immediately return a copy of the xToken() return value. Or, if the ** input buffer is exhausted, xTokenize() should return SQLITE_OK. Finally, ** if an error occurs with the xTokenize() implementation itself, it ** may abandon the tokenization and return any error code other than ** SQLITE_OK or SQLITE_DONE. ** ** If the tokenizer is registered using an fts5_tokenizer_v2 object, ** then the xTokenize() method has two additional arguments - pLocale ** and nLocale. These specify the locale that the tokenizer should use ** for the current request. If pLocale and nLocale are both 0, then the ** tokenizer should use its default locale. Otherwise, pLocale points to ** an nLocale byte buffer containing the name of the locale to use as utf-8 ** text. pLocale is not nul-terminated. ** ** FTS5_TOKENIZER ** ** There is also an fts5_tokenizer object. This is an older, deprecated, ** version of fts5_tokenizer_v2. It is similar except that: ** ** <ul> ** <li> There is no "iVersion" field, and ** <li> The xTokenize() method does not take a locale argument. ** </ul> ** ** Legacy fts5_tokenizer tokenizers must be registered using the ** legacy xCreateTokenizer() function, instead of xCreateTokenizer_v2(). ** ** Tokenizer implementations registered using either API may be retrieved ** using both xFindTokenizer() and xFindTokenizer_v2(). ** ** SYNONYM SUPPORT ** ** Custom tokenizers may also support synonyms. Consider a case in which a ** user wishes to query for a phrase such as "first place". 
Using the ** built-in tokenizers, the FTS5 query 'first + place' will match instances ** of "first place" within the document set, but not alternative forms ** such as "1st place". In some applications, it would be better to match ** all instances of "first place" or "1st place" regardless of which form ** the user specified in the MATCH query text. ** ** There are several ways to approach this in FTS5: ** ** <ol><li> By mapping all synonyms to a single token. In this case, using ** the above example, this means that the tokenizer returns the ** same token for inputs "first" and "1st". Say that token is in ** fact "first", so that when the user inserts the document "I won ** 1st place" entries are added to the index for tokens "i", "won", ** "first" and "place". If the user then queries for '1st + place', ** the tokenizer substitutes "first" for "1st" and the query works ** as expected. ** ** <li> By querying the index for all synonyms of each query term ** separately. In this case, when tokenizing query text, the ** tokenizer may provide multiple synonyms for a single term ** within the document. FTS5 then queries the index for each ** synonym individually. For example, faced with the query: ** ** <codeblock> ** ... MATCH 'first place'</codeblock> ** ** the tokenizer offers both "1st" and "first" as synonyms for the ** first token in the MATCH query and FTS5 effectively runs a query ** similar to: ** ** <codeblock> ** ... MATCH '(first OR 1st) place'</codeblock> ** ** except that, for the purposes of auxiliary functions, the query ** still appears to contain just two phrases - "(first OR 1st)" ** being treated as a single phrase. ** ** <li> By adding multiple synonyms for a single term to the FTS index. ** Using this method, when tokenizing document text, the tokenizer ** provides multiple synonyms for each token. 
So that when a ** document such as "I won first place" is tokenized, entries are ** added to the FTS index for "i", "won", "first", "1st" and ** "place". ** ** This way, even if the tokenizer does not provide synonyms ** when tokenizing query text (it should not - to do so would be ** inefficient), it doesn't matter if the user queries for ** 'first + place' or '1st + place', as there are entries in the ** FTS index corresponding to both forms of the first token. ** </ol> ** ** Whether it is parsing document or query text, any call to xToken that ** specifies a <i>tflags</i> argument with the FTS5_TOKEN_COLOCATED bit ** is considered to supply a synonym for the previous token. For example, ** when parsing the document "I won first place", a tokenizer that supports ** synonyms would call xToken() 5 times, as follows: ** ** <codeblock> ** xToken(pCtx, 0, "i", 1, 0, 1); ** xToken(pCtx, 0, "won", 3, 2, 5); ** xToken(pCtx, 0, "first", 5, 6, 11); ** xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3, 6, 11); ** xToken(pCtx, 0, "place", 5, 12, 17); **</codeblock> ** ** It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time ** xToken() is called. Multiple synonyms may be specified for a single token ** by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. ** There is no limit to the number of synonyms that may be provided for a ** single token. ** ** In many cases, method (1) above is the best approach. It does not add ** extra data to the FTS index or require FTS5 to query for multiple terms, ** so it is efficient in terms of disk space and query speed. However, it ** does not support prefix queries very well. If, as suggested above, the ** token "first" is substituted for "1st" by the tokenizer, then the query: ** ** <codeblock> ** ... MATCH '1s*'</codeblock> ** ** will not match documents that contain the token "1st" (as the tokenizer ** will probably not map "1s" to any prefix of "first"). 
** ** For full prefix support, method (3) may be preferred. In this case, ** because the index contains entries for both "first" and "1st", prefix ** queries such as 'fi*' or '1s*' will match correctly. However, because ** extra entries are added to the FTS index, this method uses more space ** within the database. ** ** Method (2) offers a midpoint between (1) and (3). Using this method, ** a query such as '1s*' will match documents that contain the literal ** token "1st", but not "first" (assuming the tokenizer is not able to ** provide synonyms for prefixes). However, a non-prefix query like '1st' ** will match against "1st" and "first". This method does not require ** extra disk space, as no extra entries are added to the FTS index. ** On the other hand, it may require more CPU cycles to run MATCH queries, ** as separate queries of the FTS index are required for each synonym. ** ** When using methods (2) or (3), it is important that the tokenizer only ** provide synonyms when tokenizing document text (method (3)) or query ** text (method (2)), not both. Doing so will not cause any errors, but is ** inefficient.
*/ typedefstruct Fts5Tokenizer Fts5Tokenizer; typedefstruct fts5_tokenizer_v2 fts5_tokenizer_v2; struct fts5_tokenizer_v2 { int iVersion; /* Currently always 2 */
int (*xCreate)(void*, constchar **azArg, int nArg, Fts5Tokenizer **ppOut); void (*xDelete)(Fts5Tokenizer*); int (*xTokenize)(Fts5Tokenizer*, void *pCtx, int flags, /* Mask of FTS5_TOKENIZE_* flags */ constchar *pText, int nText, constchar *pLocale, int nLocale, int (*xToken)( void *pCtx, /* Copy of 2nd argument to xTokenize() */ int tflags, /* Mask of FTS5_TOKEN_* flags */ constchar *pToken, /* Pointer to buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Byte offset of token within input text */ int iEnd /* Byte offset of end of token within input text */
)
);
};
/* ** New code should use the fts5_tokenizer_v2 type to define tokenizer ** implementations. The following type is included for legacy applications ** that still use it.
*/ typedefstruct fts5_tokenizer fts5_tokenizer; struct fts5_tokenizer { int (*xCreate)(void*, constchar **azArg, int nArg, Fts5Tokenizer **ppOut); void (*xDelete)(Fts5Tokenizer*); int (*xTokenize)(Fts5Tokenizer*, void *pCtx, int flags, /* Mask of FTS5_TOKENIZE_* flags */ constchar *pText, int nText, int (*xToken)( void *pCtx, /* Copy of 2nd argument to xTokenize() */ int tflags, /* Mask of FTS5_TOKEN_* flags */ constchar *pToken, /* Pointer to buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Byte offset of token within input text */ int iEnd /* Byte offset of end of token within input text */
)
);
};
/* Flags that may be passed as the third argument to xTokenize() */
#define FTS5_TOKENIZE_QUERY     0x0001
#define FTS5_TOKENIZE_PREFIX    0x0002
#define FTS5_TOKENIZE_DOCUMENT  0x0004
#define FTS5_TOKENIZE_AUX       0x0008

/* Flags that may be passed by the tokenizer implementation back to FTS5
** as the second argument ("tflags") to the supplied xToken callback. */
#define FTS5_TOKEN_COLOCATED    0x0001      /* Same position as prev. token */
/* ** END OF CUSTOM TOKENIZERS
*************************************************************************/
/*************************************************************************
** FTS5 EXTENSION REGISTRATION API
**
** Method table through which applications register and look up custom
** tokenizers and register auxiliary functions. The *_v2 members are only
** available when iVersion>=3.
*/
typedef struct fts5_api fts5_api;
struct fts5_api {
  int iVersion;                   /* Currently always set to 3 */

  /* Create a new tokenizer */
  int (*xCreateTokenizer)(
    fts5_api *pApi,
    const char *zName,
    void *pUserData,
    fts5_tokenizer *pTokenizer,
    void (*xDestroy)(void*)
  );

  /* Find an existing tokenizer */
  int (*xFindTokenizer)(
    fts5_api *pApi,
    const char *zName,
    void **ppUserData,
    fts5_tokenizer *pTokenizer
  );

  /* Create a new auxiliary function */
  int (*xCreateFunction)(
    fts5_api *pApi,
    const char *zName,
    void *pUserData,
    fts5_extension_function xFunction,
    void (*xDestroy)(void*)
  );

  /* APIs below this point are only available if iVersion>=3 */

  /* Create a new tokenizer */
  int (*xCreateTokenizer_v2)(
    fts5_api *pApi,
    const char *zName,
    void *pUserData,
    fts5_tokenizer_v2 *pTokenizer,
    void (*xDestroy)(void*)
  );

  /* Find an existing tokenizer */
  int (*xFindTokenizer_v2)(
    fts5_api *pApi,
    const char *zName,
    void **ppUserData,
    fts5_tokenizer_v2 **ppTokenizer
  );
};
/* ** END OF REGISTRATION API
*************************************************************************/
#ifdef __cplusplus
} /* end of the 'extern "C"' block */ #endif
#endif/* _FTS5_H */
#line 1 "fts5Int.h" /* ** 2014 May 31 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** **
*/ #ifndef _FTS5INT_H #define _FTS5INT_H
/*
** Constants for the largest and smallest possible 64-bit signed integers.
** Both expand to expressions of type i64; SMALLEST_INT64 is derived from
** LARGEST_INT64 so that no out-of-range literal is ever written.
*/
#define LARGEST_INT64  (0xffffffff|(((i64)0x7fffffff)<<32))
#define SMALLEST_INT64 (((i64)-1) - LARGEST_INT64)
/* The uptr type is an unsigned integer large enough to hold a pointer
*/ #ifdefined(HAVE_STDINT_H) typedef uintptr_t uptr; #elif SQLITE_PTRSIZE==4 typedef u32 uptr; #else typedef u64 uptr; #endif
/*
** Tokens longer than this many bytes are truncated. The hard limit is
** (65536-1-1-4-9)==65521 bytes; the limiting factor is the 16-bit offset
** field that occurs at the start of each leaf page (see fts5_index.c).
*/
#define FTS5_MAX_TOKEN_SIZE 32768
/*
** Upper bound on the number of prefix indexes a single FTS5 table may
** have. The value must be less than 32; if it is set to anything larger
** than that, an #error directive in fts5_index.c will cause the build
** to fail.
*/
#define FTS5_MAX_PREFIX_INDEXES 31
/*
** Upper bound on the number of segments permitted in a single index.
*/
#define FTS5_MAX_SEGMENT 2000
/*
** The assert_nc() macro is similar to the assert() macro, except that it
** is used for assert() conditions that are true only if it can be
** guaranteed that the database is not corrupt.
**
** In SQLITE_DEBUG builds the condition is waived whenever the global
** sqlite3_fts5_may_be_corrupt is non-zero. In all other builds assert_nc()
** is exactly assert().
*/
#ifdef SQLITE_DEBUG
extern int sqlite3_fts5_may_be_corrupt;
# define assert_nc(x) assert(sqlite3_fts5_may_be_corrupt || (x))
#else
# define assert_nc(x) assert(x)
#endif
/*
** Null-tolerant memcmp(). When the byte count n is zero or negative the
** two pointers are never passed to memcmp() and the result is 0, so a
** NULL pointer paired with a zero length does not trigger asan errors.
** Note that n is evaluated twice when it is positive.
*/
#define fts5Memcmp(s1, s2, n) ((n)>0 ? memcmp((s1), (s2), (n)) : 0)
/* Mark a function parameter as unused, to suppress nuisance compiler
** warnings. The definition is skipped if UNUSED_PARAM already exists. */
#ifndef UNUSED_PARAM
# define UNUSED_PARAM(X)  (void)(X)
#endif
/*
** Column set. When a NEAR() clump or a phrase is restricted to matching a
** specific set of columns, that set is recorded in an object of this type.
** Every element of aiCol[] names one column that may be matched.
**
** Shared between fts5_expr.c and fts5_index.c.
*/
struct Fts5Colset {
  int nCol;      /* NOTE(review): presumably the entry count of aiCol[] - confirm */
  int aiCol[1];  /* Columns that may be matched; declared with one element */
};
/**************************************************************************
** Interface to code in fts5_config.c. fts5_config.c contains code
** to parse the arguments passed to the CREATE VIRTUAL TABLE statement.
*/
/*
** Tokenizer-related configuration. An instance of this structure is
** embedded in Fts5Config as member "t".
*/
struct Fts5TokenizerConfig {
  Fts5Tokenizer *pTok;            /* NOTE(review): presumably the tokenizer
                                  ** instance -- confirm */
  fts5_tokenizer_v2 *pApi2;       /* fts5_tokenizer_v2 interface pointer */
  fts5_tokenizer *pApi1;          /* fts5_tokenizer interface pointer */
  const char **azArg;             /* NOTE(review): presumably tokenizer
                                  ** arguments -- confirm */
  int nArg;                       /* NOTE(review): presumably the number of
                                  ** entries in azArg[] -- confirm */
  int ePattern;                   /* FTS_PATTERN_XXX constant */
  const char *pLocale;            /* Current locale to use */
  int nLocale;                    /* Size of pLocale in bytes */
};
/*
** An instance of the following structure encodes all information that can
** be gleaned from the CREATE VIRTUAL TABLE statement.
**
** And all information loaded from the %_config table.
**
** nAutomerge:
**   The minimum number of segments that an auto-merge operation should
**   attempt to merge together. A value of 1 sets the object to use the
**   compile time default. Zero disables auto-merge altogether.
**
** bContentlessDelete:
**   True if the contentless_delete option was present in the CREATE
**   VIRTUAL TABLE statement.
**
** zContent:
**
** zContentRowid:
**   The value of the content_rowid= option, if one was specified. Or
**   the string "rowid" otherwise. This text is not quoted - if it is
**   used as part of an SQL statement it needs to be quoted appropriately.
**
** zContentExprlist:
**
** pzErrmsg:
**   This exists in order to allow the fts5_index.c module to return a
**   decent error message if it encounters a file-format version it does
**   not understand.
**
** bColumnsize:
**   True if the %_docsize table is created.
**
** bPrefixIndex:
**   This is only used for debugging. If set to false, any prefix indexes
**   are ignored. This value is configured using:
**
**       INSERT INTO tbl(tbl, rank) VALUES('prefix-index', $bPrefixIndex);
**
** bLocale:
**   Set to true if locale=1 was specified when the table was created.
*/
struct Fts5Config {
  sqlite3 *db;                    /* Database handle */
  Fts5Global *pGlobal;            /* Global fts5 object for handle db */
  char *zDb;                      /* Database holding FTS index (e.g. "main") */
  char *zName;                    /* Name of FTS index */
  int nCol;                       /* Number of columns */
  char **azCol;                   /* Column names */
  u8 *abUnindexed;                /* True for unindexed columns */
  int nPrefix;                    /* Number of prefix indexes */
  int *aPrefix;                   /* Sizes in bytes of nPrefix prefix indexes */
  int eContent;                   /* An FTS5_CONTENT value */
  int bContentlessDelete;         /* "contentless_delete=" option (dflt==0) */
  int bContentlessUnindexed;      /* "contentless_unindexed=" option (dflt=0) */
  char *zContent;                 /* content table */
  char *zContentRowid;            /* "content_rowid=" option value */
  int bColumnsize;                /* "columnsize=" option value (dflt==1) */
  int bTokendata;                 /* "tokendata=" option value (dflt==0) */
  int bLocale;                    /* "locale=" option value (dflt==0) */
  int eDetail;                    /* FTS5_DETAIL_XXX value */
  char *zContentExprlist;
  Fts5TokenizerConfig t;
  int bLock;                      /* True when table is preparing statement */

  /* Values loaded from the %_config table */
  int iVersion;                   /* fts5 file format 'version' */
  int iCookie;                    /* Incremented when %_config is modified */
  int pgsz;                       /* Approximate page size used in %_data */
  int nAutomerge;                 /* 'automerge' setting */
  int nCrisisMerge;               /* Maximum allowed segments per level */
  int nUsermerge;                 /* 'usermerge' setting */
  int nHashSize;                  /* Bytes of memory for in-memory hash */
  char *zRank;                    /* Name of rank function */
  char *zRankArgs;                /* Arguments to rank function */
  int bSecureDelete;              /* 'secure-delete' */
  int nDeleteMerge;               /* 'deletemerge' */

  /* If non-NULL, points to sqlite3_vtab.base.zErrmsg. Often NULL. */
  char **pzErrmsg;

#ifdef SQLITE_DEBUG
  int bPrefixIndex;               /* True to use prefix-indexes */
#endif
};
/*
** Current expected value of %_config table 'version' field. And
** the expected version if the 'secure-delete' option has ever been
** set on the table.
*/
#define FTS5_CURRENT_VERSION               4
#define FTS5_CURRENT_VERSION_SECUREDELETE  5
/* ** End of interface to code in fts5_config.c.
**************************************************************************/
/************************************************************************** ** Interface to code in fts5_buffer.c.
*/
/*
** Buffer object for the incremental building of string data.
*/
typedef struct Fts5Buffer Fts5Buffer;
struct Fts5Buffer {
  u8 *p;                          /* NOTE(review): presumably the buffer
                                  ** contents -- confirm */
  int n;                          /* NOTE(review): presumably bytes in use
                                  ** -- confirm */
  int nSpace;                     /* NOTE(review): presumably bytes allocated
                                  ** -- confirm */
};
/*
** Iterator state for walking a position list, together with the
** functions that initialize and advance it.
*/
typedef struct Fts5PoslistReader Fts5PoslistReader;
struct Fts5PoslistReader {
  /* Variables used only by sqlite3Fts5PoslistIterXXX() functions. */
  const u8 *a;                    /* Position list to iterate through */
  int n;                          /* Size of buffer at a[] in bytes */
  int i;                          /* Current offset in a[] */

  u8 bFlag;                       /* For client use (any custom purpose) */

  /* Output variables */
  u8 bEof;                        /* Set to true at EOF */
  i64 iPos;                       /* (iCol<<32) + iPos */
};
static int sqlite3Fts5PoslistReaderInit(
  const u8 *a, int n,             /* Poslist buffer to iterate through */
  Fts5PoslistReader *pIter        /* Iterator object to initialize */
);
static int sqlite3Fts5PoslistReaderNext(Fts5PoslistReader*);
/* Character set tests (like isspace(), isalpha() etc.) */
static int sqlite3Fts5IsBareword(char t);
/* Bucket of terms object used by the integrity-check in offsets=0 mode. */
typedef struct Fts5Termset Fts5Termset;
static int sqlite3Fts5TermsetNew(Fts5Termset**);
static int sqlite3Fts5TermsetAdd(Fts5Termset*, int, const char*, int, int *pbPresent);
static void sqlite3Fts5TermsetFree(Fts5Termset*);
/* ** End of interface to code in fts5_buffer.c.
**************************************************************************/
/**************************************************************************
** Interface to code in fts5_index.c. fts5_index.c contains code
** to access the data stored in the %_data table.
*/
/*
** Values used as part of the flags argument passed to IndexQuery().
*/
#define FTS5INDEX_QUERY_PREFIX      0x0001  /* Prefix query */
#define FTS5INDEX_QUERY_DESC        0x0002  /* Docs in descending rowid order */
#define FTS5INDEX_QUERY_TEST_NOIDX  0x0004  /* Do not use prefix index */
#define FTS5INDEX_QUERY_SCAN        0x0008  /* Scan query (fts5vocab) */
/*
** The following are used internally by the fts5_index.c module. They are
** defined here only to make it easier to avoid clashes with the
** FTS5INDEX_QUERY_PREFIX/DESC/TEST_NOIDX/SCAN flag values.
*/
#define FTS5INDEX_QUERY_SKIPEMPTY    0x0010
#define FTS5INDEX_QUERY_NOOUTPUT     0x0020
#define FTS5INDEX_QUERY_SKIPHASH     0x0040
#define FTS5INDEX_QUERY_NOTOKENDATA  0x0080
#define FTS5INDEX_QUERY_SCANONETERM  0x0100
/*
** Create/destroy an Fts5Index object.
*/
static int sqlite3Fts5IndexOpen(Fts5Config *pConfig, int bCreate, Fts5Index**, char**);
static int sqlite3Fts5IndexClose(Fts5Index *p);
/*
** Return a simple checksum value based on the arguments.
*/
static u64 sqlite3Fts5IndexEntryCksum(
  i64 iRowid,
  int iCol,
  int iPos,
  int iIdx,
  const char *pTerm,
  int nTerm
);
/*
** Argument p points to a buffer containing utf-8 text that is nByte bytes
** in size. Return the number of bytes in the nChar character prefix of the
** buffer, or 0 if there are fewer than nChar characters in total.
*/
static int sqlite3Fts5IndexCharlenToBytelen(
  const char *p,
  int nByte,
  int nChar
);
/*
** Open a new iterator to iterate through all rowids that match the
** specified token or token prefix.
*/
static int sqlite3Fts5IndexQuery(
  Fts5Index *p,                   /* FTS index to query */
  const char *pToken, int nToken, /* Token (or prefix) to query for */
  int flags,                      /* Mask of FTS5INDEX_QUERY_X flags */
  Fts5Colset *pColset,            /* Match these columns only */
  Fts5IndexIter **ppIter          /* OUT: New iterator object */
);
/*
** The various operations on open token or token prefix iterators opened
** using sqlite3Fts5IndexQuery().
*/
static int sqlite3Fts5IterNext(Fts5IndexIter*);
static int sqlite3Fts5IterNextFrom(Fts5IndexIter*, i64 iMatch);
/*
** Close an iterator opened by sqlite3Fts5IndexQuery().
*/
static void sqlite3Fts5IterClose(Fts5IndexIter*);
/*
** Close the reader blob handle, if it is open.
*/
static void sqlite3Fts5IndexCloseReader(Fts5Index*);
/*
** This interface is used by the fts5vocab module.
*/
static const char *sqlite3Fts5IterTerm(Fts5IndexIter*, int*);
static int sqlite3Fts5IterNextScan(Fts5IndexIter*);
static void *sqlite3Fts5StructureRef(Fts5Index*);
static void sqlite3Fts5StructureRelease(void*);
static int sqlite3Fts5StructureTest(Fts5Index*, void*);
/*
** Used by xInstToken():
*/
static int sqlite3Fts5IterToken(Fts5IndexIter*, i64, int, int, const char**, int*);
/*
** Insert or remove data to or from the index. Each time a document is
** added to or removed from the index, this function is called one or more
** times.
**
** For an insert, it must be called once for each token in the new document.
** If the operation is a delete, it must be called (at least) once for each
** unique token in the document with an iCol value less than zero. The iPos
** argument is ignored for a delete.
*/
static int sqlite3Fts5IndexWrite(
  Fts5Index *p,                   /* Index to write to */
  int iCol,                       /* Column token appears in (-ve -> delete) */
  int iPos,                       /* Position of token within column */
  const char *pToken, int nToken  /* Token to add or remove to or from index */
);
/*
** Indicate that subsequent calls to sqlite3Fts5IndexWrite() pertain to
** document iDocid.
*/
static int sqlite3Fts5IndexBeginWrite(
  Fts5Index *p,                   /* Index to write to */
  int bDelete,                    /* True if current operation is a delete */
  i64 iDocid                      /* Docid to add or remove data from */
);
/*
** Flush any data stored in the in-memory hash tables to the database.
** Also close any open blob handles.
*/
static int sqlite3Fts5IndexSync(Fts5Index *p);
/*
** Discard any data stored in the in-memory hash tables. Do not write it
** to the database. Additionally, assume that the contents of the %_data
** table may have changed on disk. So any in-memory caches of %_data
** records must be invalidated.
*/
static int sqlite3Fts5IndexRollback(Fts5Index *p);
/*
** Get or set the "averages" values.
*/
static int sqlite3Fts5IndexGetAverages(Fts5Index *p, i64 *pnRow, i64 *anSize);
static int sqlite3Fts5IndexSetAverages(Fts5Index *p, const u8*, int);
/*
** Functions called by the storage module as part of integrity-check.
*/
static int sqlite3Fts5IndexIntegrityCheck(Fts5Index*, u64 cksum, int bUseCksum);
/*
** Called during virtual module initialization to register UDF
** fts5_decode() with SQLite.
*/
static int sqlite3Fts5IndexInit(sqlite3*);
/*
** Return the total number of entries read from the %_data table by
** this connection since it was created.
*/
static int sqlite3Fts5IndexReads(Fts5Index *p);
/* Used to populate hash tables for xInstToken in detail=none/column mode. */
static int sqlite3Fts5IndexIterWriteTokendata(
  Fts5IndexIter*,
  const char*, int,
  i64 iRowid,
  int iCol,
  int iOff
);
/* ** End of interface to code in fts5_index.c.
**************************************************************************/
/*
** Locale helpers operating on an Fts5Config and on sqlite3_value objects.
** NOTE(review): only the prototypes are visible here; see the
** implementations for the exact contract of each function.
*/
static void sqlite3Fts5ClearLocale(Fts5Config *pConfig);
static void sqlite3Fts5SetLocale(Fts5Config *pConfig, const char *pLoc, int nLoc);

static int sqlite3Fts5IsLocaleValue(Fts5Config *pConfig, sqlite3_value *pVal);
static int sqlite3Fts5DecodeLocaleValue(
  sqlite3_value *pVal,
  const char **ppText, int *pnText,
  const char **ppLoc, int *pnLoc
);
/* ** End of interface to code in fts5.c.
**************************************************************************/
/**************************************************************************
** Interface to code in fts5_hash.c.
*/
typedef struct Fts5Hash Fts5Hash;
/*
** Create a hash table, free a hash table.
*/
static int sqlite3Fts5HashNew(Fts5Config*, Fts5Hash**, int *pnSize);
static void sqlite3Fts5HashFree(Fts5Hash*);
/*
** Add or remove a token entry to or from the hash table. A negative iCol
** indicates a delete.
*/
static int sqlite3Fts5HashWrite(
  Fts5Hash*,
  i64 iRowid,                     /* Rowid for this entry */
  int iCol,                       /* Column token appears in (-ve -> delete) */
  int iPos,                       /* Position of token within column */
  char bByte,
  const char *pToken, int nToken  /* Token to add or remove to or from index */
);
/*
** Empty (but do not delete) a hash table.
*/
static void sqlite3Fts5HashClear(Fts5Hash*);
/*
** Return true if the hash is empty, false otherwise.
*/
static int sqlite3Fts5HashIsEmpty(Fts5Hash*);
/*
** Query the hash table for term pTerm/nTerm. The doclist pointer and its
** size in bytes are returned through ppObj and pnDoclist.
*/
static int sqlite3Fts5HashQuery(
  Fts5Hash*,                      /* Hash table to query */
  int nPre,
  const char *pTerm, int nTerm,   /* Query term */
  void **ppObj,                   /* OUT: Pointer to doclist for pTerm */
  int *pnDoclist                  /* OUT: Size of doclist in bytes */
);
/*
** Scan interface over the hash table: start a scan for a query prefix,
** advance it, test for end-of-scan and read the current entry.
*/
static int sqlite3Fts5HashScanInit(
  Fts5Hash*,                      /* Hash table to query */
  const char *pTerm, int nTerm    /* Query prefix */
);
static void sqlite3Fts5HashScanNext(Fts5Hash*);
static int sqlite3Fts5HashScanEof(Fts5Hash*);
static void sqlite3Fts5HashScanEntry(Fts5Hash *,
  const char **pzTerm,            /* OUT: term (nul-terminated) */
  int *pnTerm,                    /* OUT: Size of term in bytes */
  const u8 **ppDoclist,           /* OUT: pointer to doclist */
  int *pnDoclist                  /* OUT: size of doclist in bytes */
);
/* ** End of interface to code in fts5_hash.c.
**************************************************************************/
/**************************************************************************
** Interface to code in fts5_storage.c. fts5_storage.c contains
** code to access the data stored in the %_content and %_docsize tables.
*/
/* Identifiers for the statements listed alongside each value. */
#define FTS5_STMT_SCAN_ASC  0     /* SELECT rowid, * FROM ... ORDER BY 1 ASC */
#define FTS5_STMT_SCAN_DESC 1     /* SELECT rowid, * FROM ... ORDER BY 1 DESC */
#define FTS5_STMT_LOOKUP    2     /* SELECT rowid, * FROM ... WHERE rowid=? */
/*
** A token: a pointer to its text plus an explicit length, since the
** text is not nul-terminated.
*/
struct Fts5Token {
  const char *p;                  /* Token text (not NULL terminated) */
  int n;                          /* Size of buffer p in bytes */
};
/* Parse a MATCH expression. */
static int sqlite3Fts5ExprNew(
  Fts5Config *pConfig,
  int bPhraseToAnd,
  int iCol,                       /* Column on LHS of MATCH operator */
  const char *zExpr,
  Fts5Expr **ppNew,
  char **pzErr
);
static int sqlite3Fts5ExprPattern(
  Fts5Config *pConfig,
  int bGlob,
  int iCol,
  const char *zText,
  Fts5Expr **pp
);
/******************************************* ** The fts5_expr.c API above this point is used by the other hand-written ** C code in this module. The interfaces below this point are called by
** the parser code in fts5parse.y. */
/* ** End of interface to code in fts5_expr.c.
**************************************************************************/
/************************************************************************** ** Interface to code in fts5_aux.c.
*/
static int sqlite3Fts5AuxInit(fts5_api*);
/*
** End of interface to code in fts5_aux.c.
**************************************************************************/
/************************************************************************** ** Interface to code in fts5_tokenizer.c.
*/
static int sqlite3Fts5TokenizerInit(fts5_api*);
static int sqlite3Fts5TokenizerPattern(
  int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
  Fts5Tokenizer *pTok
);
static int sqlite3Fts5TokenizerPreload(Fts5TokenizerConfig*);
/*
** End of interface to code in fts5_tokenizer.c.
**************************************************************************/
/************************************************************************** ** Interface to code in fts5_vocab.c.
*/
/* ** End of interface to code in fts5_vocab.c.
**************************************************************************/
/**************************************************************************
** Interface to automatically generated code in fts5_unicode2.c.
*/
static int sqlite3Fts5UnicodeIsdiacritic(int c);
static int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic);

static int sqlite3Fts5UnicodeCatParse(const char*, u8*);
static int sqlite3Fts5UnicodeCategory(u32 iCode);
static void sqlite3Fts5UnicodeAscii(u8*, u8*);
/*
** End of interface to code in fts5_unicode2.c.
**************************************************************************/
/*
** Die Informationen auf dieser Webseite wurden
** nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder
** Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten
** Informationen zugesichert.
**
** Bemerkung:
** Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
*/