Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
test-tokenizer.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2; coding: utf-8 -*- */
2 /*
3  Copyright (C) 2012 Kouhei Sutou <kou@clear-code.com>
4 
5  This library is free software; you can redistribute it and/or
6  modify it under the terms of the GNU Lesser General Public
7  License version 2.1 as published by the Free Software Foundation.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18 
19 #include <groonga.h>
20 #include <groonga/tokenizer.h>
21 
22 #include <gcutter.h>
23 
24 #include "../lib/grn-assertions.h"
25 
27 void test_is_tokenized_delimiter(gconstpointer data);
29 void test_have_tokenized_delimiter(gconstpointer data);
30 
31 static grn_ctx context;
32 static grn_obj *db;
33 
34 void
35 setup (void)
36 {
37  grn_ctx_init(&context, GRN_CTX_USE_QL);
38  db = grn_db_create(&context, NULL, NULL);
39 }
40 
41 void
42 teardown (void)
43 {
44  grn_obj_unlink(&context, db);
45  grn_ctx_fin(&context);
46 }
47 
48 void
50 {
51 #define ADD_DATUM(label, expected, input, encoding) \
52  gcut_add_datum(label, \
53  "expected", G_TYPE_BOOLEAN, expected, \
54  "input", G_TYPE_STRING, input, \
55  "encoding", G_TYPE_INT, encoding, \
56  NULL)
57 
58  ADD_DATUM("U+FFFE (UTF-8)", GRN_TRUE, "\xEF\xBF\xBE", GRN_ENC_UTF8);
59  ADD_DATUM("U+FFFE (EUC-JP)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_EUC_JP);
60  ADD_DATUM("U+FFFE (Shift_JIS)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_SJIS);
61  ADD_DATUM("U+FFFE (NONE)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_NONE);
62  ADD_DATUM("U+FFFE (LATIN1)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_LATIN1);
63  ADD_DATUM("U+FFFE (KOI8R)", GRN_FALSE, "\xEF\xBF\xBE", GRN_ENC_KOI8R);
64 
65  ADD_DATUM("U+FFFF", GRN_FALSE, "\xEF\xBF\xBF", GRN_ENC_UTF8);
66 
67 #undef ADD_DATUM
68 }
69 
70 void
71 test_is_tokenized_delimiter(gconstpointer data)
72 {
73  const gchar *input;
74  grn_encoding encoding;
75 
76  encoding = gcut_data_get_int(data, "encoding");
77  GRN_CTX_SET_ENCODING(&context, encoding);
78  input = gcut_data_get_string(data, "input");
79  if (gcut_data_get_boolean(data, "expected")) {
80  cut_assert_true(grn_tokenizer_is_tokenized_delimiter(&context,
81  input, strlen(input),
82  encoding));
83  } else {
84  cut_assert_false(grn_tokenizer_is_tokenized_delimiter(&context,
85  input, strlen(input),
86  encoding));
87  }
88 }
89 
90 void
92 {
93 #define ADD_DATUM(label, expected, input) \
94  gcut_add_datum(label, \
95  "expected", G_TYPE_BOOLEAN, expected, \
96  "input", G_TYPE_STRING, input, \
97  NULL)
98 
99 #define UFFFE_IN_UTF8 "\xef\xbf\xbe"
100 
101  ADD_DATUM("have", GRN_TRUE, "a" UFFFE_IN_UTF8 "b");
102  ADD_DATUM("not have", GRN_FALSE, "ab");
103 
104 #undef UFFFE_IN_UTF8
105 
106 #undef ADD_DATUM
107 }
108 
109 void
110 test_have_tokenized_delimiter(gconstpointer data)
111 {
112  const gchar *input;
113  grn_encoding encoding = GRN_ENC_UTF8;
114 
115  GRN_CTX_SET_ENCODING(&context, encoding);
116  input = gcut_data_get_string(data, "input");
117  if (gcut_data_get_boolean(data, "expected")) {
118  cut_assert_true(grn_tokenizer_have_tokenized_delimiter(&context,
119  input, strlen(input),
120  encoding));
121  } else {
122  cut_assert_false(grn_tokenizer_have_tokenized_delimiter(&context,
123  input, strlen(input),
124  encoding));
125  }
126 }