Groonga 3.0.9 Source Code Document
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
tsv.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2012 Brazil
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License version 2.1 as published by the Free Software Foundation.
7 
8  This library is distributed in the hope that it will be useful,
9  but WITHOUT ANY WARRANTY; without even the implied warranty of
10  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  Lesser General Public License for more details.
12 
13  You should have received a copy of the GNU Lesser General Public
14  License along with this library; if not, write to the Free Software
15  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17 
18 #include <groonga/plugin.h>
19 
20 /* groonga's internal headers */
21 /* for grn_text_fgets(): We don't want to require stdio.h for groonga.h.
22  What should we do? Should we split header file such as groonga/stdio.h? */
23 #include <str.h>
24 
25 #include <stdio.h>
26 #include <string.h>
27 
28 #ifndef HAVE_STRNCASECMP
29 # ifdef HAVE__STRNICMP
30 # define strncasecmp(s1,s2,n) _strnicmp(s1,s2,n)
31 # endif /* HAVE__STRNICMP */
32 #endif /* HAVE_STRNCASECMP */
33 
34 #define MAX_SYNONYM_BYTES 4096
35 
36 static grn_hash *synonyms = NULL;
37 
38 #ifdef WIN32
39 static char *win32_synonyms_file = NULL;
40 const char *
42 {
43  if (!win32_synonyms_file) {
44  const char *base_dir;
45  const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE;
46  char *synonyms_file;
47  char *path;
48  size_t base_dir_length;
49 
50  base_dir = grn_plugin_win32_base_dir();
51  base_dir_length = strlen(base_dir);
52  synonyms_file =
53  malloc(base_dir_length + strlen("/") + strlen(relative_path) + 1);
54  strcpy(synonyms_file, base_dir);
55  strcat(synonyms_file, "/");
56  strcat(synonyms_file, relative_path);
57  win32_synonyms_file = synonyms_file;
58  }
59  return win32_synonyms_file;
60 }
61 
62 #else /* WIN32 */
63 const char *
65 {
67 }
68 #endif /* WIN32 */
69 
70 static inline grn_bool
71 is_comment_mark(char character)
72 {
73  return character == '#';
74 }
75 
76 static inline grn_encoding
77 detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length)
78 {
79  grn_encoding encoding = GRN_ENC_NONE;
80  grn_obj null_terminated_line_buffer;
81  const char *c_line;
82  const char *coding_part_keyword = "coding: ";
83  const char *coding_part;
84  const char *encoding_name;
85 
86  GRN_TEXT_INIT(&null_terminated_line_buffer, 0);
87  GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length);
88  GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0');
89 
90  c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer);
91  coding_part = strstr(c_line, coding_part_keyword);
92  if (coding_part) {
93  encoding_name = coding_part + strlen(coding_part_keyword);
94  if (strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 ||
95  strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) {
96  encoding = GRN_ENC_UTF8;
97  } else if (strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 ||
98  strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) {
99  encoding = GRN_ENC_SJIS;
100  } else if (strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 ||
101  strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) {
102  encoding = GRN_ENC_EUC_JP;
103  } else if (strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) {
104  encoding = GRN_ENC_LATIN1;
105  } else if (strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 ||
106  strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) {
107  encoding = GRN_ENC_KOI8R;
108  }
109  } else {
110  encoding = ctx->encoding;
111  }
112  GRN_OBJ_FIN(ctx, &null_terminated_line_buffer);
113 
114  return encoding;
115 }
116 
117 static inline grn_encoding
118 guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length)
119 {
120  const char bom[] = {0xef, 0xbb, 0xbf};
121  size_t bom_length = sizeof(bom);
122 
123  if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) {
124  *line += bom_length;
125  *line_length -= bom_length;
126  return GRN_ENC_UTF8;
127  }
128 
129  if (!is_comment_mark((*line)[0])) {
130  return ctx->encoding;
131  }
132 
133  return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1);
134 }
135 
136 static void
137 parse_synonyms_file_line(grn_ctx *ctx, const char *line, int line_length,
138  grn_obj *key, grn_obj *value)
139 {
140  size_t i = 0;
141 
142  if (is_comment_mark(line[i])) {
143  return;
144  }
145 
146  while (i < line_length) {
147  char character = line[i];
148  i++;
149  if (character == '\t') {
150  break;
151  }
152  GRN_TEXT_PUTC(ctx, key, character);
153  }
154 
155  if (i == line_length) {
156  return;
157  }
158 
159  GRN_TEXT_PUTS(ctx, value, "((");
160  while (i < line_length) {
161  char character = line[i];
162  i++;
163  if (character == '\t') {
164  GRN_TEXT_PUTS(ctx, value, ") OR (");
165  } else {
166  GRN_TEXT_PUTC(ctx, value, character);
167  }
168  }
169  GRN_TEXT_PUTS(ctx, value, "))");
170 
171  {
172  grn_id id;
173  void *value_location = NULL;
174 
175  id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key),
176  &value_location, NULL);
177  if (id == GRN_ID_NIL) {
179  "[plugin][query-expander][tsv] "
180  "failed to register key: <%.*s>",
181  (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key));
182  return;
183  }
184 
185  grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1);
186  GRN_TEXT_PUTC(ctx, value, '\0');
187  memcpy(value_location, GRN_TEXT_VALUE(value), MAX_SYNONYM_BYTES);
188  }
189 }
190 
191 static void
192 load_synonyms(grn_ctx *ctx)
193 {
194  const char *path;
195  FILE *file;
196  int number_of_lines;
197  grn_encoding encoding;
198  grn_obj line, key, value;
199 
200  path = getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE");
201  if (!path) {
202  path = get_system_synonyms_file();
203  }
204  file = fopen(path, "r");
205  if (!file) {
207  "[plugin][query-expander][tsv] "
208  "synonyms file doesn't exist: <%s>",
209  path);
210  return;
211  }
212 
213  GRN_TEXT_INIT(&line, 0);
214  GRN_TEXT_INIT(&key, 0);
215  GRN_TEXT_INIT(&value, 0);
216  grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES);
217  number_of_lines = 0;
218  while (grn_text_fgets(ctx, &line, file) == GRN_SUCCESS) {
219  const char *line_value = GRN_TEXT_VALUE(&line);
220  size_t line_length = GRN_TEXT_LEN(&line);
221 
222  number_of_lines++;
223  if (number_of_lines == 1) {
224  encoding = guess_encoding(ctx, &line_value, &line_length);
225  }
226  GRN_BULK_REWIND(&key);
227  GRN_BULK_REWIND(&value);
228  parse_synonyms_file_line(ctx, line_value, line_length, &key, &value);
229  GRN_BULK_REWIND(&line);
230  }
231  GRN_OBJ_FIN(ctx, &line);
232  GRN_OBJ_FIN(ctx, &key);
233  GRN_OBJ_FIN(ctx, &value);
234 
235  fclose(file);
236 }
237 
238 static grn_obj *
239 func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args,
240  grn_user_data *user_data)
241 {
242  grn_rc rc = GRN_END_OF_DATA;
243  grn_id id;
244  grn_obj *term, *expanded_term;
245  void *value;
246  grn_obj *rc_object;
247 
248  term = args[0];
249  expanded_term = args[1];
250  id = grn_hash_get(ctx, synonyms,
251  GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term),
252  &value);
253  if (id != GRN_ID_NIL) {
254  const char *query = value;
255  GRN_TEXT_PUTS(ctx, expanded_term, query);
256  rc = GRN_SUCCESS;
257  }
258 
259  rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0);
260  if (rc_object) {
261  GRN_INT32_SET(ctx, rc_object, rc);
262  }
263 
264  return rc_object;
265 }
266 
267 grn_rc
269 {
270  if (!synonyms) {
271  synonyms = grn_hash_create(ctx, NULL,
275  if (!synonyms) {
276  return ctx->rc;
277  }
278  load_synonyms(ctx);
279  }
280  return ctx->rc;
281 }
282 
283 grn_rc
285 {
286  grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"),
288  func_query_expander_tsv, NULL, NULL,
289  0, NULL);
290  return GRN_SUCCESS;
291 }
292 
293 grn_rc
295 {
296  if (synonyms) {
297  grn_hash_close(ctx, synonyms);
298  synonyms = NULL;
299  }
300  return GRN_SUCCESS;
301 }