Groonga 3.0.9 Source Code Document
tokenizer.h
/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2012 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef GRN_PLUGIN_TOKENIZER_H
#define GRN_PLUGIN_TOKENIZER_H

#include <stddef.h>

#include <groonga/plugin.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

#define GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8     "\xEF\xBF\xBE"
#define GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN 3

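/*
  Illustrative note: GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 is U+FFFE encoded
  in UTF-8, hence the 3-byte length above. A value that already contains the
  delimiter can be handled as pre-tokenized text. For example,

    const char *pre_tokenized =
      "Hello" GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 "World";

  represents the two tokens "Hello" and "World"; see the
  grn_tokenizer_*_tokenized_delimiter functions declared below.
 */
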
/*
  grn_tokenizer_charlen() returns the length (#bytes) of the first character
  in the string specified by `str_ptr' and `str_length'. If the starting bytes
  are invalid as a character, grn_tokenizer_charlen() returns 0. See
  grn_encoding in "groonga.h" for more details of `encoding'.

  Deprecated. Use grn_plugin_charlen() instead.
 */
int grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
                          unsigned int str_length, grn_encoding encoding);

/*
  grn_tokenizer_isspace() returns the length (#bytes) of the first character
  in the string specified by `str_ptr' and `str_length' if it is a space
  character. Otherwise, grn_tokenizer_isspace() returns 0.

  Deprecated. Use grn_plugin_isspace() instead.
 */
int grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
                          unsigned int str_length, grn_encoding encoding);

/*
  grn_tokenizer_is_tokenized_delimiter() returns whether or not the first
  character in the string specified by `str_ptr' and `str_length' is the
  special tokenized delimiter character.
 */
GRN_PLUGIN_EXPORT grn_bool grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx,
                                                                const char *str_ptr,
                                                                unsigned int str_length,
                                                                grn_encoding encoding);

/*
  grn_tokenizer_have_tokenized_delimiter() returns whether or not the string
  specified by `str_ptr' and `str_length' contains the special tokenized
  delimiter character.
 */
GRN_PLUGIN_EXPORT grn_bool grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                                                  const char *str_ptr,
                                                                  unsigned int str_length,
                                                                  grn_encoding encoding);

/*
  grn_tokenizer_query is a structure for storing a query. See the following
  functions.
 */
typedef struct _grn_tokenizer_query grn_tokenizer_query;

struct _grn_tokenizer_query {
  grn_obj *normalized_query;
  char *query_buf;
  const char *ptr;
  unsigned int length;
  grn_encoding encoding;
  unsigned int flags;
  grn_bool have_tokenized_delimiter;
};

/*
  grn_tokenizer_query_open() parses `args' and returns a new object of
  grn_tokenizer_query. The new object stores information of the query.
  grn_tokenizer_query_open() normalizes the query if the target table
  requires normalization. grn_tokenizer_query_open() returns NULL if
  something goes wrong. Note that grn_tokenizer_query_open() must be called
  just once in the function that initializes a tokenizer.

  See `GRN_STRING_*' flags for `normalize_flags'.
 */
GRN_PLUGIN_EXPORT grn_tokenizer_query *grn_tokenizer_query_open(grn_ctx *ctx,
                                                                int num_args, grn_obj **args,
                                                                unsigned int normalize_flags);

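/*
  A minimal sketch of a tokenizer `init' function built around
  grn_tokenizer_query_open(). The names `my_tokenizer' and `my_init' are
  hypothetical and the error handling is reduced to the bare minimum;
  `normalize_flags' is left at 0 here, but any `GRN_STRING_*' flags may be
  passed instead.

    typedef struct {
      grn_tokenizer_query *query;
    } my_tokenizer;

    static grn_obj *
    my_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
    {
      my_tokenizer *tokenizer;
      grn_tokenizer_query *query;

      query = grn_tokenizer_query_open(ctx, nargs, args, 0);
      if (!query) {
        return NULL;
      }
      tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(my_tokenizer));
      if (!tokenizer) {
        grn_tokenizer_query_close(ctx, query);
        return NULL;
      }
      tokenizer->query = query;
      user_data->ptr = tokenizer;
      return NULL;
    }
 */
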
/*
  grn_tokenizer_query_create() is deprecated. Use grn_tokenizer_query_open()
  instead.
*/

grn_tokenizer_query *grn_tokenizer_query_create(grn_ctx *ctx,
                                                int num_args, grn_obj **args);

/*
  grn_tokenizer_query_close() finalizes an object of grn_tokenizer_query
  and then frees memory allocated for that object.
 */
GRN_PLUGIN_EXPORT void grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query);

/*
  grn_tokenizer_query_destroy() is deprecated. Use grn_tokenizer_query_close()
  instead.
 */
void grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query);

/*
  grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
  stores a token to be returned and it must be maintained until a request for
  the next token or finalization comes.
 */
typedef struct _grn_tokenizer_token grn_tokenizer_token;

struct _grn_tokenizer_token {
  grn_obj str;
  grn_obj status;
};

/*
  grn_tokenizer_token_init() initializes `token'. Note that an initialized
  object must be finalized by grn_tokenizer_token_fin().
 */
GRN_PLUGIN_EXPORT void grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token);

/*
  grn_tokenizer_token_fin() finalizes `token' that has been initialized by
  grn_tokenizer_token_init().
 */
GRN_PLUGIN_EXPORT void grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token);

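/*
  Sketch of the grn_tokenizer_token lifecycle, extending the hypothetical
  `my_tokenizer' struct from the sketch above with a token member. The token
  is initialized exactly once in the `init' function and finalized exactly
  once in the `fin' function.

    typedef struct {
      grn_tokenizer_query *query;
      grn_tokenizer_token token;
    } my_tokenizer;

    In the `init' function, after allocating `tokenizer':
      grn_tokenizer_token_init(ctx, &(tokenizer->token));

    In the `fin' function:
      my_tokenizer *tokenizer = user_data->ptr;
      if (tokenizer) {
        grn_tokenizer_token_fin(ctx, &(tokenizer->token));
        grn_tokenizer_query_close(ctx, tokenizer->query);
        GRN_PLUGIN_FREE(ctx, tokenizer);
      }
      return NULL;
 */
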
/*
 * grn_tokenizer_status is a flag set for tokenizer status codes.
 * If a document or query contains no tokens, push an empty string with
 * GRN_TOKENIZER_TOKEN_LAST as a token.
 */
typedef unsigned int grn_tokenizer_status;

/* GRN_TOKENIZER_TOKEN_CONTINUE means that the next token is not the last one. */
#define GRN_TOKENIZER_TOKEN_CONTINUE  (0)
/* GRN_TOKENIZER_TOKEN_LAST means that the next token is the last one. */
#define GRN_TOKENIZER_TOKEN_LAST      (0x01L<<0)
/* GRN_TOKENIZER_TOKEN_OVERLAP means that ... */
#define GRN_TOKENIZER_TOKEN_OVERLAP   (0x01L<<1)
/* GRN_TOKENIZER_TOKEN_UNMATURED means that ... */
#define GRN_TOKENIZER_TOKEN_UNMATURED (0x01L<<2)
/* GRN_TOKENIZER_TOKEN_REACH_END means that ... */
#define GRN_TOKENIZER_TOKEN_REACH_END (0x01L<<3)

/*
 * GRN_TOKENIZER_CONTINUE and GRN_TOKENIZER_LAST are deprecated. They
 * are just for backward compatibility. Use
 * GRN_TOKENIZER_TOKEN_CONTINUE and GRN_TOKENIZER_TOKEN_LAST
 * instead.
 */
#define GRN_TOKENIZER_CONTINUE GRN_TOKENIZER_TOKEN_CONTINUE
#define GRN_TOKENIZER_LAST     GRN_TOKENIZER_TOKEN_LAST

/*
  grn_tokenizer_token_push() pushes the next token into `token'. Note that
  grn_tokenizer_token_push() does not make a copy of the given string. This
  means that you have to keep the memory allocated for the string valid.
  Also note that the grn_tokenizer_token object must be maintained until the
  request for the next token or finalization comes. See grn_tokenizer_status in
  this header for more details of `status'.
 */
GRN_PLUGIN_EXPORT void grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                                                const char *str_ptr, unsigned int str_length,
                                                grn_tokenizer_status status);

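/*
  Sketch of a `next' function that pushes one token per call. The
  hypothetical `my_tokenizer' struct from the sketches above is assumed to
  carry `rest' and `rest_length' members that track the unconsumed part of
  the query; the actual scanning logic is elided.

    static grn_obj *
    my_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
    {
      my_tokenizer *tokenizer = user_data->ptr;
      const char *token_start;
      unsigned int token_length;
      grn_tokenizer_status status;

      ... extract one token from tokenizer->rest, setting token_start and
          token_length, and advance rest/rest_length past it ...

      status = (tokenizer->rest_length > 0) ? GRN_TOKENIZER_TOKEN_CONTINUE
                                            : GRN_TOKENIZER_TOKEN_LAST;
      grn_tokenizer_token_push(ctx, &(tokenizer->token),
                               token_start, token_length, status);
      return NULL;
    }
 */
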
/*
  grn_tokenizer_tokenized_delimiter_next() extracts the next token
  from the string specified by `str_ptr' and `str_length' and pushes
  that token into `token'. It returns the string after the extracted
  token. The returned string may be `NULL' when all tokens are
  extracted.
 */
GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                                                     grn_tokenizer_token *token,
                                                                     const char *str_ptr,
                                                                     unsigned int str_length,
                                                                     grn_encoding encoding);

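/*
  Sketch of handling pre-tokenized input in a `next' function: when the
  query already contains the tokenized delimiter, each call can delegate to
  grn_tokenizer_tokenized_delimiter_next() and remember the returned rest of
  the string for the following call. `rest' and `end' are hypothetical
  members of the `my_tokenizer' struct sketched above.

    if (tokenizer->query->have_tokenized_delimiter) {
      unsigned int rest_length = tokenizer->end - tokenizer->rest;
      tokenizer->rest =
        grn_tokenizer_tokenized_delimiter_next(ctx, &(tokenizer->token),
                                               tokenizer->rest,
                                               rest_length,
                                               tokenizer->query->encoding);
      return NULL;
    }
 */
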
/*
  grn_tokenizer_register() registers a plugin to the database which is
  associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
  plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and
  underscores ('_') are the allowed characters. `init', `next' and `fin' specify
  the plugin functions. `init' is called for initializing a tokenizer for a
  document or query. `next' is called for extracting tokens one by one. `fin'
  is called for finalizing a tokenizer. grn_tokenizer_register() returns
  GRN_SUCCESS on success or an error code on failure. See "groonga.h" for more
  details of grn_proc_func and grn_user_data, which is used as an argument of
  grn_proc_func.
 */
GRN_PLUGIN_EXPORT grn_rc grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                                                unsigned int plugin_name_length,
                                                grn_proc_func *init, grn_proc_func *next,
                                                grn_proc_func *fin);

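/*
  A minimal registration sketch. GRN_PLUGIN_INIT, GRN_PLUGIN_REGISTER and
  GRN_PLUGIN_FIN are provided by <groonga/plugin.h>; `my_init', `my_next'
  and `my_fin' are the hypothetical callbacks sketched above.

    grn_rc
    GRN_PLUGIN_INIT(grn_ctx *ctx)
    {
      return GRN_SUCCESS;
    }

    grn_rc
    GRN_PLUGIN_REGISTER(grn_ctx *ctx)
    {
      return grn_tokenizer_register(ctx, "TokenMy", strlen("TokenMy"),
                                    my_init, my_next, my_fin);
    }

    grn_rc
    GRN_PLUGIN_FIN(grn_ctx *ctx)
    {
      return GRN_SUCCESS;
    }
 */
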
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* GRN_PLUGIN_TOKENIZER_H */