39 if (name_length < 0) {
40 name_length = strlen(name_ptr);
45 name_ptr, name_length,
48 sizeof(*vars) /
sizeof(vars),
52 "[normalizer] failed to register normalizer: <%.*s>",
53 name_length, name_ptr);
72 static unsigned char symbol[] = {
73 ',',
'.', 0,
':',
';',
'?',
'!', 0, 0, 0,
'`', 0,
'^',
'~',
'_', 0, 0, 0,
74 0, 0, 0, 0, 0, 0, 0,
'-',
'-',
'/',
'\\', 0, 0,
'|', 0, 0, 0,
'\'', 0,
75 '"',
'(',
')', 0, 0,
'[',
']',
'{',
'}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76 '+',
'-', 0, 0, 0,
'=', 0,
'<',
'>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
77 '$', 0, 0,
'%',
'#',
'&',
'*',
'@', 0, 0, 0, 0, 0, 0, 0, 0
83 static uint16_t hankana[] = {
84 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
85 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
86 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
87 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
88 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
89 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
90 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
93 static unsigned char dakuten[] = {
94 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
95 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
96 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
99 static unsigned char handaku[] = {
100 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
103 const unsigned char *s, *s_, *e;
104 unsigned char *d, *d0, *d_,
b;
110 "[string][eucjp] failed to allocate normalized text space");
119 "[string][eucjp] failed to allocate checks space");
131 "[string][eucjp] failed to allocate character types space");
135 cp = ctypes = nstr->
ctypes;
136 e = (
unsigned char *)nstr->
original + size;
137 for (s = s_ = (
unsigned char *) nstr->
original, d = d_ = d0; s < e; s++) {
139 if (((s + 1) < e) && (*(s + 1) & 0x80)) {
140 unsigned char c1 = *s++, c2 = *s, c3 = 0;
143 if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
144 uint16_t c = hankana[c2 - 0xa0];
147 if (d > d0 + 1 && d[-2] == 0xa5
148 && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
150 if (ch) { ch[-1] += 2; s_ += 2; }
153 *d++ = c >> 8; *d = c & 0xff;
157 if (d > d0 + 1 && d[-2] == 0xa5
158 && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
160 if (ch) { ch[-1] += 2; s_ += 2; }
163 *d++ = c >> 8; *d = c & 0xff;
167 *d++ = c >> 8; *d = c & 0xff;
202 if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
218 if (
'a' <= c3 && c3 <=
'z') {
221 }
else if (
'A' <= c3 && c3 <=
'Z') {
224 }
else if (
'0' <= c3 && c3 <=
'9') {
262 unsigned char c = *s;
288 *d = (
'A' <= c) ? c + 0x20 : c;
292 *d = (c <=
'Z') ? c + 0x20 : c;
311 if (cp) { *cp++ = ctype; }
313 *ch++ = (int16_t)(s + 1 - s_);
315 while (++d_ < d) { *ch++ = 0; }
328 static uint16_t hankana[] = {
329 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
330 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
331 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
332 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
333 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
334 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
335 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
338 static unsigned char dakuten[] = {
339 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
340 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
341 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
344 static unsigned char handaku[] = {
345 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
348 const unsigned char *s, *s_;
349 unsigned char *d, *d0, *d_,
b, *e;
355 "[string][sjis] failed to allocate normalized text space");
364 "[string][sjis] failed to allocate checks space");
376 "[string][sjis] failed to allocate character types space");
380 cp = ctypes = nstr->
ctypes;
381 e = (
unsigned char *)nstr->
original + size;
382 for (s = s_ = (
unsigned char *) nstr->
original, d = d_ = d0; s < e; s++) {
384 if (0xa0 <= *s && *s <= 0xdf) {
385 uint16_t c = hankana[*s - 0xa0];
388 if (d > d0 + 1 && d[-2] == 0x83
389 && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
391 if (ch) { ch[-1]++; s_++; }
394 *d++ = c >> 8; *d = c & 0xff;
398 if (d > d0 + 1 && d[-2] == 0x83
399 && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
401 if (ch) { ch[-1]++; s_++; }
404 *d++ = c >> 8; *d = c & 0xff;
408 *d++ = c >> 8; *d = c & 0xff;
413 if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
414 unsigned char c1 = *s++, c2 = *s, c3 = 0;
415 if (0x81 <= c1 && c1 <= 0x87) {
437 if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
440 }
else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
452 if (0x4f <= c2 && c2 <= 0x58) {
455 }
else if (0x60 <= c2 && c2 <= 0x79) {
458 }
else if (0x81 <= c2 && c2 <= 0x9a) {
461 }
else if (0x9f <= c2 && c2 <= 0xf1) {
470 if (0x40 <= c2 && c2 <= 0x96) {
498 unsigned char c = *s;
524 *d = (
'A' <= c) ? c + 0x20 : c;
528 *d = (c <=
'Z') ? c + 0x20 : c;
547 if (cp) { *cp++ = ctype; }
549 *ch++ = (int16_t)(s + 1 - s_);
551 while (++d_ < d) { *ch++ = 0; }
/*
 * Declaration only: no body for grn_nfkc_map1 is visible in this file view
 * (presumably provided by a separate NFKC table source -- TODO confirm).
 * The caller in this file checks the result against NULL and, when it is
 * non-NULL, applies strlen() to it, i.e. treats it as a NUL-terminated
 * byte string.
 */
562 const char *grn_nfkc_map1(
const unsigned char *str);
/*
 * Declaration only: no body for grn_nfkc_map2 is visible in this file view.
 * The caller in this file passes two byte pointers and checks the result
 * against NULL before using it.
 * NOTE(review): the exact meaning of prefix/suffix is not shown here --
 * confirm against the definition.
 */
563 const char *grn_nfkc_map2(
const unsigned char *prefix,
const unsigned char *suffix);
566 grn_str_charlen_utf8(
grn_ctx *ctx,
const unsigned char *str,
const unsigned char *end)
570 const unsigned char *p = str;
571 if (end <= p || !*p) {
return 0; }
576 for (b = 0x40, w = 0; b && (*p &
b); b >>= 1, w++);
579 "invalid utf8 string: the first bit is 0x80: <%.*s>: <%.*s>",
581 (
int)(end - str), str);
585 for (i = 1; i < size; i++) {
588 "invalid utf8 string: too short: "
589 "%d byte is required but %d byte is given: <%.*s>",
591 (
int)(end - str), str);
596 "invalid utf8 string: NULL character is found: <%.*s>",
597 (
int)(end - str), str);
600 if ((*p & 0xc0) != 0x80) {
602 "invalid utf8 string: 0x80 is not allowed: <%.*s>: <%.*s>",
604 (
int)(end - str), str);
619 const unsigned char *s, *s_, *s__ = NULL, *p, *p2, *pe, *e;
620 unsigned char *d, *d_, *de;
624 grn_bool remove_tokenized_delimiter_p =
628 "[string][utf8] failed to allocate normalized text space");
636 "[string][utf8] failed to allocate checks space");
646 "[string][utf8] failed to allocate character types space");
654 e = (
unsigned char *)nstr->
original + size;
655 for (s = s_ = (
unsigned char *)nstr->
original; ; s += ls) {
656 if (!(ls = grn_str_charlen_utf8(ctx, s, e))) {
659 if (remove_tokenized_delimiter_p &&
664 if ((p = (
unsigned char *)grn_nfkc_map1(s))) {
665 pe = p + strlen((
char *)p);
670 if (d_ && (p2 = (
unsigned char *)grn_nfkc_map2(d_, p))) {
672 pe = p + strlen((
char *)p);
684 if (!(lp = grn_str_charlen_utf8(ctx, p, pe))) {
687 if ((*p ==
' ' && removeblankp) || *p < 0x20 ) {
691 unsigned char *normalized;
692 ds += (ds >> 1) + lp;
698 "[string][utf8] failed to expand normalized text space");
701 de = normalized + ds;
702 d = normalized + (d - (
unsigned char *)nstr->
normalized);
711 "[string][utf8] failed to expand checks space");
714 ch = checks + (ch - nstr->
checks);
724 "[string][utf8] failed to expand character types space");
727 cp = ctypes + (cp - nstr->
ctypes);
741 *ch++ = (int16_t)(s + ls - s_);
745 for (i = lp; i > 1; i--) { *ch++ = 0; }
762 const unsigned char *s, *s_, *e;
763 unsigned char *d, *d0, *d_;
769 "[string][ascii] failed to allocate normalized text space");
778 "[string][ascii] failed to allocate checks space");
790 "[string][ascii] failed to allocate character types space");
794 cp = ctypes = nstr->
ctypes;
795 e = (
unsigned char *)nstr->
original + size;
796 for (s = s_ = (
unsigned char *) nstr->
original, d = d_ = d0; s < e; s++) {
797 unsigned char c = *s;
823 *d = (
'A' <= c) ? c + 0x20 : c;
827 *d = (c <=
'Z') ? c + 0x20 : c;
845 if (cp) { *cp++ = ctype; }
847 *ch++ = (int16_t)(s + 1 - s_);
849 while (++d_ < d) { *ch++ = 0; }
864 const unsigned char *s, *s_, *e;
865 unsigned char *d, *d0, *d_;
871 "[string][latin1] failed to allocate normalized text space");
880 "[string][latin1] failed to allocate checks space");
892 "[normalizer][latin1] failed to allocate character types space");
896 cp = ctypes = nstr->
ctypes;
897 e = (
unsigned char *)nstr->
original + size;
898 for (s = s_ = (
unsigned char *) nstr->
original, d = d_ = d0; s < e; s++) {
899 unsigned char c = *s;
925 *d = (
'A' <= c) ? c + 0x20 : c;
929 *d = (c <=
'Z') ? c + 0x20 : c;
941 if (c == 0x8a || c == 0x8c || c == 0x8e) {
950 if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
951 *d = (c == 0x9f) ? c + 0x60 : c;
963 *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
981 if (cp) { *cp++ = ctype; }
983 *ch++ = (int16_t)(s + 1 - s_);
985 while (++d_ < d) { *ch++ = 0; }
999 const unsigned char *s, *s_, *e;
1000 unsigned char *d, *d0, *d_;
1006 "[string][koi8r] failed to allocate normalized text space");
1015 "[string][koi8r] failed to allocate checks space");
1027 "[string][koi8r] failed to allocate character types space");
1031 cp = ctypes = nstr->
ctypes;
1032 e = (
unsigned char *)nstr->
original + size;
1033 for (s = s_ = (
unsigned char *) nstr->
original, d = d_ = d0; s < e; s++) {
1034 unsigned char c = *s;
1060 *d = (
'A' <= c) ? c + 0x20 : c;
1064 *d = (c <=
'Z') ? c + 0x20 : c;
1105 if (cp) { *cp++ = ctype; }
1107 *ch++ = (int16_t)(s + 1 - s_);
1109 while (++d_ < d) { *ch++ = 0; }
1125 eucjp_normalize(ctx,
string);
1128 #ifdef GRN_WITH_NFKC
1129 utf8_normalize(ctx,
string);
1131 ascii_normalize(ctx,
string);
1135 sjis_normalize(ctx,
string);
1138 latin1_normalize(ctx,
string);
1141 koi8r_normalize(ctx,
string);
1144 ascii_normalize(ctx,
string);
1150 #ifdef GRN_WITH_NFKC
1155 utf8_normalize(ctx,
string);
1177 const char *normalizer_nfkc51_name =
"NormalizerNFKC51";
1180 NULL, auto_next, NULL);
1182 #ifdef GRN_WITH_NFKC
1184 NULL, nfkc51_next, NULL);