Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
string.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2 -*- */
2 /*
3  Copyright(C) 2009-2012 Brazil
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License version 2.1 as published by the Free Software Foundation.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18 
19 #include "groonga_in.h"
20 #include <string.h>
21 #include "string_in.h"
22 #include "normalizer_in.h"
23 #include "str.h"
24 #include "util.h"
25 
26 #include <groonga/tokenizer.h>
27 
/*
 * Fills in the "normalized" text (and, on request, the checks array) of an
 * already-initialized grn_string without running a real normalizer.
 *
 * The normalized buffer is a NUL-terminated copy of string->original. In the
 * UTF-8 branch below the copy is made character by character and characters
 * for which the per-character test is false are copied; the others are
 * dropped.
 *
 * Returns the same grn_string on success. When an allocation fails the
 * string is closed with grn_string_close() and NULL is returned.
 *
 * NOTE(review): listing lines 40, 46, 54 and 77 are absent here. They look
 * like the start of the error-report calls, the first half of the branch
 * condition, and the name of the per-character test (presumably a
 * tokenized-delimiter check) -- confirm against the upstream file.
 * NOTE(review): the log tag below is spelled "[strinig]"; probably meant
 * "[string]".
 * NOTE(review): n_characters is not assigned in the visible lines.
 */
28 static grn_string *
29 grn_fake_string_open(grn_ctx *ctx, grn_string *string)
30 {
31  /* TODO: support GRN_STRING_REMOVE_BLANK flag and ctypes */
32  grn_string *nstr = string;
33  const char *str;
34  unsigned int str_len;
35 
36  str = nstr->original;
37  str_len = nstr->original_length_in_bytes;
38 
   /* One extra byte for the terminating NUL written below. */
39  if (!(nstr->normalized = GRN_MALLOC(str_len + 1))) {
41  "[strinig][fake] failed to allocate normalized text space");
42  grn_string_close(ctx, (grn_obj *)nstr);
43  return NULL;
44  }
45 
47  ctx->encoding == GRN_ENC_UTF8) {
48  int char_length;
49  const char *source_current = str;
50  const char *source_end = str + str_len;
51  char *destination = nstr->normalized;
52  unsigned int destination_length = 0;
   /* Walk the input one character at a time; stop when grn_charlen()
      reports a non-positive length. */
53  while ((char_length = grn_charlen(ctx, source_current, source_end)) > 0) {
55  source_current, char_length,
56  ctx->encoding)) {
57  memcpy(destination, source_current, char_length);
58  destination += char_length;
59  destination_length += char_length;
60  }
61  source_current += char_length;
62  }
63  nstr->normalized[destination_length] = '\0';
64  nstr->normalized_length_in_bytes = destination_length;
65  } else {
   /* Plain byte-for-byte copy of the original text. */
66  memcpy(nstr->normalized, str, str_len);
67  nstr->normalized[str_len] = '\0';
68  nstr->normalized_length_in_bytes = str_len;
69  }
70 
71  if (nstr->flags & GRN_STRING_WITH_CHECKS) {
   /* f counts the bytes still belonging to the current character. At the
      first byte of a character checks[i] receives the character's byte
      length; the following bytes of that character receive 0. */
72  int16_t f = 0;
73  unsigned char c;
74  size_t i;
   /* The array has one entry per byte of the ORIGINAL text (str_len),
      independent of normalized_length_in_bytes. */
75  if (!(nstr->checks = (int16_t *) GRN_MALLOC(sizeof(int16_t) * str_len))) {
76  grn_string_close(ctx, (grn_obj *)nstr);
78  "[strinig][fake] failed to allocate checks space");
79  return NULL;
80  }
81  switch (nstr->encoding) {
82  case GRN_ENC_EUC_JP:
83  for (i = 0; i < str_len; i++) {
84  if (!f) {
85  c = (unsigned char) str[i];
   /* Lead byte 0xa1..0xfe or 0x8e: 2 bytes; 0x8f: 3 bytes; else 1. */
86  f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
87  );
88  nstr->checks[i] = f;
89  } else {
90  nstr->checks[i] = 0;
91  }
92  f--;
93  }
94  break;
95  case GRN_ENC_SJIS:
96  for (i = 0; i < str_len; i++) {
97  if (!f) {
98  c = (unsigned char) str[i];
   /* Lead byte 0x81..0x9f or 0xe0..0xfc: 2 bytes; else 1. */
99  f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
100  nstr->checks[i] = f;
101  } else {
102  nstr->checks[i] = 0;
103  }
104  f--;
105  }
106  break;
107  case GRN_ENC_UTF8:
108  for (i = 0; i < str_len; i++) {
109  if (!f) {
110  c = (unsigned char) str[i];
   /* Length from the lead byte's high bits: bit 0x80 clear -> 1;
      otherwise bit 0x20 clear -> 2; bit 0x10 clear -> 3; else 4. */
111  f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
112  : 2)
113  : 1);
114  nstr->checks[i] = f;
115  } else {
116  nstr->checks[i] = 0;
117  }
118  f--;
119  }
120  break;
121  default:
   /* Any other encoding: every byte is treated as its own character. */
122  for (i = 0; i < str_len; i++) {
123  nstr->checks[i] = 1;
124  }
125  break;
126  }
127  }
128  return nstr;
129 }
130 
/*
 * Creates a grn_string object for str/str_len using an explicit encoding.
 *
 * - Returns NULL when str is NULL or str_len is 0.
 * - The caller's pointer is stored as-is in string->original; the text is
 *   not copied here.
 * - With no normalizer, the string is completed by grn_fake_string_open().
 * - When normalizer is GRN_NORMALIZER_AUTO, the normalizer object is looked
 *   up with grn_ctx_get(GRN_NORMALIZER_AUTO_NAME) and unlinked again before
 *   returning.
 * - After normalization, a non-zero ctx->rc causes the object to be closed
 *   and NULL to be returned.
 *
 * NOTE(review): listing line 151 (between the cast to grn_obj* and the field
 * assignments) is absent; presumably the object-header initialization --
 * confirm against the upstream file.
 * NOTE(review): the grn_ctx_get() result is passed to
 * grn_normalizer_normalize() and grn_obj_unlink() without a NULL check.
 * NOTE(review): failure is detected through ctx->rc, not through the return
 * value of grn_normalizer_normalize() (see the TODO below).
 */
131 grn_obj *
132 grn_string_open_(grn_ctx *ctx, const char *str, unsigned int str_len,
133  grn_obj *normalizer, int flags, grn_encoding encoding)
134 {
135  grn_string *string;
136  grn_obj *obj;
137  grn_bool is_normalizer_auto;
138 
139  if (!str || !str_len) {
140  return NULL;
141  }
142 
143  string = GRN_MALLOCN(grn_string, 1);
144  if (!string) {
145  GRN_LOG(ctx, GRN_LOG_ALERT,
146  "[string][open] failed to allocate memory");
147  return NULL;
148  }
149 
150  obj = (grn_obj *)string;
152  string->original = str;
153  string->original_length_in_bytes = str_len;
   /* Derived fields start empty; they are filled by the fake opener or by
      the normalizer below. */
154  string->normalized = NULL;
155  string->normalized_length_in_bytes = 0;
156  string->n_characters = 0;
157  string->checks = NULL;
158  string->ctypes = NULL;
159  string->encoding = encoding;
160  string->flags = flags;
161 
162  if (!normalizer) {
163  return (grn_obj *)grn_fake_string_open(ctx, string);
164  }
165 
   /* Remember whether the sentinel was passed so that the looked-up object
      can be unlinked after use. */
166  is_normalizer_auto = (normalizer == GRN_NORMALIZER_AUTO);
167  if (is_normalizer_auto) {
168  normalizer = grn_ctx_get(ctx, GRN_NORMALIZER_AUTO_NAME, -1);
169  }
170 
171  /* TODO: check rc */
172  grn_normalizer_normalize(ctx, normalizer, (grn_obj *)string);
173  if (ctx->rc) {
174  grn_obj_close(ctx, obj);
175  obj = NULL;
176  }
177 
178  if (is_normalizer_auto) {
179  grn_obj_unlink(ctx, normalizer);
180  }
181 
182  return obj;
183 }
184 
185 grn_obj *
186 grn_string_open(grn_ctx *ctx, const char *str, unsigned int str_len,
187  grn_obj *normalizer, int flags)
188 {
189  return grn_string_open_(ctx, str, str_len, normalizer, flags, ctx->encoding);
190 }
191 
/*
 * Reports the original (un-normalized) text of a string object.
 *
 * Each output pointer is optional: *original receives string_->original and
 * *length_in_bytes receives string_->original_length_in_bytes only when the
 * corresponding pointer is non-NULL. rc is GRN_SUCCESS when the string
 * object is non-NULL.
 *
 * NOTE(review): listing lines 193 (function name and leading parameters),
 * 199 and 207 are absent. As shown, the else branch assigns nothing to rc,
 * so the value returned for a NULL string cannot be determined from this
 * listing -- confirm against the upstream file.
 */
192 grn_rc
194  const char **original,
195  unsigned int *length_in_bytes)
196 {
197  grn_rc rc;
198  grn_string *string_ = (grn_string *)string;
200  if (string_) {
201  if (original) { *original = string_->original; }
202  if (length_in_bytes) {
203  *length_in_bytes = string_->original_length_in_bytes;
204  }
205  rc = GRN_SUCCESS;
206  } else {
208  }
209  GRN_API_RETURN(rc);
210 }
211 
/*
 * Returns the flags stored in the string object, or 0 when the object
 * pointer is NULL.
 *
 * NOTE(review): listing lines 213 (function name and parameters) and 217 are
 * absent from this listing.
 */
212 int
214 {
215  int flags = 0;
216  grn_string *string_ = (grn_string *)string;
218  if (string_) {
219  flags = string_->flags;
220  }
221  GRN_API_RETURN(flags);
222 }
223 
/*
 * Reports the normalized text of a string object.
 *
 * Each output pointer is optional and is written only when non-NULL:
 *   *normalized      <- string_->normalized
 *   *length_in_bytes <- string_->normalized_length_in_bytes
 *   *n_characters    <- string_->n_characters
 * rc is GRN_SUCCESS when the string object is non-NULL.
 *
 * NOTE(review): listing lines 225 (function name and leading parameters),
 * 232 and 241 are absent. As shown, the else branch leaves rc unassigned --
 * confirm against the upstream file.
 */
224 grn_rc
226  const char **normalized,
227  unsigned int *length_in_bytes,
228  unsigned int *n_characters)
229 {
230  grn_rc rc;
231  grn_string *string_ = (grn_string *)string;
233  if (string_) {
234  if (normalized) { *normalized = string_->normalized; }
235  if (length_in_bytes) {
236  *length_in_bytes = string_->normalized_length_in_bytes;
237  }
238  if (n_characters) { *n_characters = string_->n_characters; }
239  rc = GRN_SUCCESS;
240  } else {
242  }
243  GRN_API_RETURN(rc);
244 }
245 
/*
 * Replaces the normalized text of a string object.
 *
 * Any buffer currently held in string_->normalized is released with
 * GRN_FREE before the caller's pointer, byte length and character count are
 * stored. The pointer is stored as-is; nothing is copied.
 *
 * NOTE(review): since the stored buffer is later passed to GRN_FREE, the
 * caller presumably has to hand over a buffer obtained from the matching
 * allocator -- confirm.
 * NOTE(review): listing lines 247 (function name and leading parameters),
 * 253 and 261 are absent. As shown, the else branch leaves rc unassigned.
 */
246 grn_rc
248  char *normalized, unsigned int length_in_bytes,
249  unsigned int n_characters)
250 {
251  grn_rc rc;
252  grn_string *string_ = (grn_string *)string;
254  if (string_) {
255  if (string_->normalized) { GRN_FREE(string_->normalized); }
256  string_->normalized = normalized;
257  string_->normalized_length_in_bytes = length_in_bytes;
258  string_->n_characters = n_characters;
259  rc = GRN_SUCCESS;
260  } else {
262  }
263  GRN_API_RETURN(rc);
264 }
265 
/*
 * Returns the checks array stored in the string object, or NULL when the
 * object pointer is NULL. The stored pointer itself is returned; no copy is
 * made.
 *
 * NOTE(review): listing lines 267 (function name and parameters) and 271 are
 * absent from this listing.
 */
266 const short *
268 {
269  int16_t *checks = NULL;
270  grn_string *string_ = (grn_string *)string;
272  if (string_) {
273  checks = string_->checks;
274  } else {
   /* Redundant with the initializer above; kept as in the original. */
275  checks = NULL;
276  }
277  GRN_API_RETURN(checks);
278 }
279 
/*
 * Replaces the checks array of a string object.
 *
 * Any array currently held in string_->checks is released with GRN_FREE
 * before the caller's pointer is stored as-is.
 *
 * NOTE(review): since the stored array is later passed to GRN_FREE, the
 * caller presumably has to hand over memory from the matching allocator --
 * confirm.
 * NOTE(review): listing lines 285 and 291 are absent. As shown, the else
 * branch leaves rc unassigned, so the value returned for a NULL string
 * cannot be determined from this listing.
 */
280 grn_rc
281 grn_string_set_checks(grn_ctx *ctx, grn_obj *string, short *checks)
282 {
283  grn_rc rc;
284  grn_string *string_ = (grn_string *)string;
286  if (string_) {
287  if (string_->checks) { GRN_FREE(string_->checks); }
288  string_->checks = checks;
289  rc = GRN_SUCCESS;
290  } else {
292  }
293  GRN_API_RETURN(rc);
294 }
295 
/*
 * Returns the character-type array (string_->ctypes) stored in the string
 * object, or NULL when the object pointer is NULL. The stored pointer itself
 * is returned; no copy is made.
 *
 * NOTE(review): listing lines 297 (function name and parameters) and 301 are
 * absent from this listing.
 */
296 const unsigned char *
298 {
299  unsigned char *types = NULL;
300  grn_string *string_ = (grn_string *)string;
302  if (string_) {
303  types = string_->ctypes;
304  } else {
   /* Redundant with the initializer above; kept as in the original. */
305  types = NULL;
306  }
307  GRN_API_RETURN(types);
308 }
309 
/*
 * Replaces the character-type array of a string object.
 *
 * Any array currently held in string_->ctypes is released with GRN_FREE
 * before the caller's pointer is stored as-is.
 *
 * NOTE(review): since the stored array is later passed to GRN_FREE, the
 * caller presumably has to hand over memory from the matching allocator --
 * confirm.
 * NOTE(review): listing lines 315 and 321 are absent. As shown, the else
 * branch leaves rc unassigned, so the value returned for a NULL string
 * cannot be determined from this listing.
 */
310 grn_rc
311 grn_string_set_types(grn_ctx *ctx, grn_obj *string, unsigned char *types)
312 {
313  grn_rc rc;
314  grn_string *string_ = (grn_string *)string;
316  if (string_) {
317  if (string_->ctypes) { GRN_FREE(string_->ctypes); }
318  string_->ctypes = types;
319  rc = GRN_SUCCESS;
320  } else {
322  }
323  GRN_API_RETURN(rc);
324 }
325 
328 {
   /*
    * Returns the encoding stored in the string object, or GRN_ENC_NONE when
    * the object pointer is NULL.
    *
    * NOTE(review): listing lines 326-327 (return type, function name and
    * parameters) and 331 are absent from this listing.
    */
329  grn_encoding encoding = GRN_ENC_NONE;
330  grn_string *string_ = (grn_string *)string;
332  if (string_) {
333  encoding = string_->encoding;
334  }
335  GRN_API_RETURN(encoding);
336 }
337 
/*
 * Appends a human-readable description of a string object to `buffer`.
 *
 * Output shape, in the order written below:
 *   #<string: original:<TEXT>(BYTES) normalized:<TEXT>(BYTES)
 *    n_characters:N encoding:ENC flags:FLAG|FLAG...>
 * Each set flag is written with a trailing '|'; a final '|' is removed
 * before the closing '>'.
 *
 * Always returns GRN_SUCCESS.
 *
 * NOTE(review): string_ is dereferenced without a NULL check, unlike the
 * accessor functions in this file.
 * NOTE(review): listing lines 339 (function name and parameters) and 379
 * (presumably the `if` guarding the REMOVE_TOKENIZED_DELIMITER text, given
 * the unmatched closing brace after it) are absent -- confirm against the
 * upstream file.
 */
338 grn_rc
340 {
341  grn_string *string_ = (grn_string *)string;
342 
343  GRN_TEXT_PUTS(ctx, buffer, "#<string:");
344 
345  GRN_TEXT_PUTS(ctx, buffer, " original:<");
346  GRN_TEXT_PUT(ctx, buffer,
347  string_->original,
348  string_->original_length_in_bytes);
349  GRN_TEXT_PUTS(ctx, buffer, ">");
350  GRN_TEXT_PUTS(ctx, buffer, "(");
351  grn_text_itoa(ctx, buffer, string_->original_length_in_bytes);
352  GRN_TEXT_PUTS(ctx, buffer, ")");
353 
354  GRN_TEXT_PUTS(ctx, buffer, " normalized:<");
355  GRN_TEXT_PUT(ctx, buffer,
356  string_->normalized,
357  string_->normalized_length_in_bytes);
358  GRN_TEXT_PUTS(ctx, buffer, ">");
359  GRN_TEXT_PUTS(ctx, buffer, "(");
360  grn_text_itoa(ctx, buffer, string_->normalized_length_in_bytes);
361  GRN_TEXT_PUTS(ctx, buffer, ")");
362 
363  GRN_TEXT_PUTS(ctx, buffer, " n_characters:");
364  grn_text_itoa(ctx, buffer, string_->n_characters);
365 
366  GRN_TEXT_PUTS(ctx, buffer, " encoding:");
367  grn_inspect_encoding(ctx, buffer, string_->encoding);
368 
369  GRN_TEXT_PUTS(ctx, buffer, " flags:");
370  if (string_->flags & GRN_STRING_REMOVE_BLANK) {
371  GRN_TEXT_PUTS(ctx, buffer, "REMOVE_BLANK|");
372  }
373  if (string_->flags & GRN_STRING_WITH_TYPES) {
374  GRN_TEXT_PUTS(ctx, buffer, "WITH_TYPES|");
375  }
376  if (string_->flags & GRN_STRING_WITH_CHECKS) {
377  GRN_TEXT_PUTS(ctx, buffer, "WITH_CHECKS|");
378  }
380  GRN_TEXT_PUTS(ctx, buffer, "REMOVE_TOKENIZED_DELIMITER|");
381  }
   /* Drop the separator left behind by the last flag written, if any. */
382  if (GRN_TEXT_VALUE(buffer)[GRN_TEXT_LEN(buffer) - 1] == '|') {
383  grn_bulk_truncate(ctx, buffer, GRN_TEXT_LEN(buffer) - 1);
384  }
385 
386  GRN_TEXT_PUTS(ctx, buffer, ">");
387 
388  return GRN_SUCCESS;
389 }
390 
/*
 * Releases a string object.
 *
 * For a non-NULL object, the normalized buffer, the ctypes array and the
 * checks array are each passed to GRN_FREE when set, then the object itself
 * is freed and rc is GRN_SUCCESS. The `original` pointer is not freed here.
 *
 * NOTE(review): listing lines 392 (function name and parameters) and 403 are
 * absent. As shown, the else branch leaves rc unassigned, so the value
 * returned for a NULL string cannot be determined from this listing --
 * confirm against the upstream file.
 */
391 grn_rc
393 {
394  grn_rc rc;
395  grn_string *string_ = (grn_string *)string;
396  if (string_) {
397  if (string_->normalized) { GRN_FREE(string_->normalized); }
398  if (string_->ctypes) { GRN_FREE(string_->ctypes); }
399  if (string_->checks) { GRN_FREE(string_->checks); }
400  GRN_FREE(string);
401  rc = GRN_SUCCESS;
402  } else {
404  }
405  return rc;
406 }
406 }