Groonga 3.0.9 Source Code Document
Main Page
Related Pages
Namespaces
Data Structures
Files
File List
Globals
All
Data Structures
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Macros
Pages
groonga
include
groonga
tokenizer.h
Go to the documentation of this file.
1
/* -*- c-basic-offset: 2 -*- */
2
/*
3
Copyright(C) 2012 Brazil
4
5
This library is free software; you can redistribute it and/or
6
modify it under the terms of the GNU Lesser General Public
7
License version 2.1 as published by the Free Software Foundation.
8
9
This library is distributed in the hope that it will be useful,
10
but WITHOUT ANY WARRANTY; without even the implied warranty of
11
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
Lesser General Public License for more details.
13
14
You should have received a copy of the GNU Lesser General Public
15
License along with this library; if not, write to the Free Software
16
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
*/
18
#ifndef GRN_PLUGIN_TOKENIZER_H
19
#define GRN_PLUGIN_TOKENIZER_H
20
21
#include <stddef.h>
22
23
#include <
groonga/plugin.h
>
24
25
#ifdef __cplusplus
26
extern
"C"
{
27
#endif
/* __cplusplus */
28
29
/* UTF-8 byte sequence of the tokenized delimiter character (these three bytes encode U+FFFE). */
#define GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 "\xEF\xBF\xBE"
30
/* Length in bytes of GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8, not counting the terminating NUL. */
#define GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN 3
31
32
/*
33
grn_tokenizer_charlen() returns the length (#bytes) of the first character
34
in the string specified by `str_ptr' and `str_length'. If the starting bytes
35
are invalid as a character, grn_tokenizer_charlen() returns 0. See
36
grn_encoding in "groonga.h" for more details of `encoding'
37
38
Deprecated. Use grn_plugin_charlen() instead.
39
*/
40
int
grn_tokenizer_charlen
(
grn_ctx
*ctx,
const
char
*str_ptr,
41
unsigned
int
str_length,
grn_encoding
encoding);
42
43
/*
44
grn_tokenizer_isspace() returns the length (#bytes) of the first character
45
in the string specified by `str_ptr' and `str_length' if it is a space
46
character. Otherwise, grn_tokenizer_isspace() returns 0.
47
48
Deprecated. Use grn_plugin_isspace() instead.
49
*/
50
int
grn_tokenizer_isspace
(
grn_ctx
*ctx,
const
char
*str_ptr,
51
unsigned
int
str_length,
grn_encoding
encoding);
52
53
/*
54
grn_tokenizer_is_tokenized_delimiter() returns whether is the first
55
character in the string specified by `str_ptr' and `str_length' the
56
special tokenized delimiter character or not.
57
*/
58
grn_bool
grn_tokenizer_is_tokenized_delimiter
(
grn_ctx
*ctx,
59
const
char
*str_ptr,
60
unsigned
int
str_length,
61
grn_encoding
encoding);
62
63
/*
64
grn_tokenizer_have_tokenized_delimiter() returns whether is there
65
the special delimiter character in the string specified by `str_ptr'
66
and `str_length' the special tokenized delimiter character or not.
67
*/
68
GRN_PLUGIN_EXPORT
grn_bool
grn_tokenizer_have_tokenized_delimiter
(
grn_ctx
*ctx,
69
const
char
*str_ptr,
70
unsigned
int
str_length,
71
grn_encoding
encoding);
72
73
/*
74
grn_tokenizer_query is a structure for storing a query. See the following
75
functions.
76
*/
77
typedef
struct
_grn_tokenizer_query
grn_tokenizer_query
;
78
79
struct
_grn_tokenizer_query
{
80
grn_obj
*
normalized_query
;
81
char
*
query_buf
;
82
const
char
*
ptr
;
83
unsigned
int
length
;
84
grn_encoding
encoding
;
85
unsigned
int
flags
;
86
grn_bool
have_tokenized_delimiter
;
87
};
88
89
/*
90
grn_tokenizer_query_open() parses `args' and returns a new object of
91
grn_tokenizer_query. The new object stores information of the query.
92
grn_tokenizer_query_open() normalizes the query if the target table
93
requires normalization. grn_tokenizer_query_open() returns NULL if
94
something goes wrong. Note that grn_tokenizer_query_open() must be called
95
just once in the function that initializes a tokenizer.
96
97
See `GRN_STRING_*' flags for `normalize_flags'.
98
*/
99
GRN_PLUGIN_EXPORT
grn_tokenizer_query
*
grn_tokenizer_query_open
(
grn_ctx
*ctx,
100
int
num_args,
grn_obj
**args,
101
unsigned
int
normalize_flags);
102
103
/*
104
grn_tokenizer_query_create() is deprecated. Use grn_tokenizer_query_open()
105
instead.
106
*/
107
108
grn_tokenizer_query
*
grn_tokenizer_query_create
(
grn_ctx
*ctx,
109
int
num_args,
grn_obj
**args);
110
111
/*
112
grn_tokenizer_query_close() finalizes an object of grn_tokenizer_query
113
and then frees memory allocated for that object.
114
*/
115
GRN_PLUGIN_EXPORT
void
grn_tokenizer_query_close
(
grn_ctx
*ctx,
grn_tokenizer_query
*query);
116
117
/*
118
grn_tokenizer_query_destroy() is deprecated. Use grn_tokenizer_query_close()
119
instead.
120
*/
121
void
grn_tokenizer_query_destroy
(
grn_ctx
*ctx,
grn_tokenizer_query
*query);
122
123
/*
124
grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
125
stores a token to be returned and it must be maintained until a request for
126
next token or finalization comes.
127
*/
128
typedef
struct
_grn_tokenizer_token
grn_tokenizer_token
;
129
130
struct
_grn_tokenizer_token
{
131
grn_obj
str
;
132
grn_obj
status
;
133
};
134
135
/*
136
grn_tokenizer_token_init() initializes `token'. Note that an initialized
137
object must be finalized by grn_tokenizer_token_fin().
138
*/
139
GRN_PLUGIN_EXPORT
void
grn_tokenizer_token_init
(
grn_ctx
*ctx,
grn_tokenizer_token
*token);
140
141
/*
142
grn_tokenizer_token_fin() finalizes `token' that has been initialized by
143
grn_tokenizer_token_init().
144
*/
145
GRN_PLUGIN_EXPORT
void
grn_tokenizer_token_fin
(
grn_ctx
*ctx,
grn_tokenizer_token
*token);
146
147
/*
148
* grn_tokenizer_status is a flag set for tokenizer status codes.
149
* If a document or query contains no tokens, push an empty string with
150
* GRN_TOKENIZER_TOKEN_LAST as a token.
151
*/
152
typedef unsigned int grn_tokenizer_status;
153
154
/* GRN_TOKENIZER_TOKEN_CONTINUE means that the next token is not the last one. */
155
#define GRN_TOKENIZER_TOKEN_CONTINUE (0x00)
156
/* GRN_TOKENIZER_TOKEN_LAST means that the next token is the last one. */
157
#define GRN_TOKENIZER_TOKEN_LAST (1L << 0)
158
/* GRN_TOKENIZER_TOKEN_OVERLAP means that ... */
159
#define GRN_TOKENIZER_TOKEN_OVERLAP (1L << 1)
160
/* GRN_TOKENIZER_TOKEN_UNMATURED means that ... */
161
#define GRN_TOKENIZER_TOKEN_UNMATURED (1L << 2)
162
/* GRN_TOKENIZER_TOKEN_REACH_END means that ... */
163
#define GRN_TOKENIZER_TOKEN_REACH_END (1L << 3)
164
165
/*
166
* GRN_TOKENIZER_CONTINUE and GRN_TOKENIZER_LAST are deprecated. They
167
* are just for backward compatibility. Use
168
* GRN_TOKENIZER_TOKEN_CONTINUE and GRN_TOKENIZER_TOKEN_LAST
169
* instead.
170
*/
171
/* Deprecated alias of GRN_TOKENIZER_TOKEN_CONTINUE, kept for backward compatibility. */
#define GRN_TOKENIZER_CONTINUE GRN_TOKENIZER_TOKEN_CONTINUE
172
/* Deprecated alias of GRN_TOKENIZER_TOKEN_LAST, kept for backward compatibility. */
#define GRN_TOKENIZER_LAST GRN_TOKENIZER_TOKEN_LAST
173
174
/*
175
grn_tokenizer_token_push() pushes the next token into `token'. Note that
176
grn_tokenizer_token_push() does not make a copy of the given string. This
177
means that you have to maintain a memory space allocated to the string.
178
Also note that the grn_tokenizer_token object must be maintained until the
179
request for the next token or finalization comes. See grn_tokenizer_status in
180
this header for more details of `status'.
181
*/
182
GRN_PLUGIN_EXPORT
void
grn_tokenizer_token_push
(
grn_ctx
*ctx,
grn_tokenizer_token
*token,
183
const
char
*str_ptr,
unsigned
int
str_length,
184
grn_tokenizer_status status);
185
186
/*
187
grn_tokenizer_tokenized_delimiter_next() extracts the next token
188
from the string specified by `str_ptr' and `str_length' and pushes
189
the next token into `token'. It returns the string after the next
190
token. The returned string may be `NULL' when all tokens are
191
extracted.
192
*/
193
GRN_PLUGIN_EXPORT
const
char
*
grn_tokenizer_tokenized_delimiter_next
(
grn_ctx
*ctx,
194
grn_tokenizer_token
*token,
195
const
char
*str_ptr,
196
unsigned
int
str_length,
197
grn_encoding
encoding);
198
199
/*
200
grn_tokenizer_register() registers a plugin to the database which is
201
associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
202
plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and
203
an underscore ('_') are capable characters. `init', `next' and `fin' specify
204
the plugin functions. `init' is called for initializing a tokenizer for a
205
document or query. `next' is called for extracting tokens one by one. `fin'
206
is called for finalizing a tokenizer. grn_tokenizer_register() returns
207
GRN_SUCCESS on success, an error code on failure. See "groonga.h" for more
208
details of grn_proc_func and grn_user_data, that is used as an argument of
209
grn_proc_func.
210
*/
211
GRN_PLUGIN_EXPORT
grn_rc
grn_tokenizer_register
(
grn_ctx
*ctx,
const
char
*plugin_name_ptr,
212
unsigned
int
plugin_name_length,
213
grn_proc_func
*init,
grn_proc_func
*next,
214
grn_proc_func
*
fin
);
215
216
#ifdef __cplusplus
217
}
/* extern "C" */
218
#endif
/* __cplusplus */
219
220
#endif
/* GRN_PLUGIN_TOKENIZER_H */
Generated on Sun Nov 10 2013 09:49:01 for Groonga 3.0.9 Source Code Document by
1.8.1.2