/*
 * Decode one character of an 8-bit charset string into Unicode.
 *
 * conv - conversion state; conv->flags selects which escapes are recognised,
 *        conv->icodes receives the number of input bytes consumed and
 *        conv->ocodes the number of code points written to wc.
 * cs   - charset descriptor; cs->tab_to_uni maps a byte to its code point.
 * wc   - output buffer for the decoded code point(s).
 * str  - input position; the escape scanning below reads ahead of str
 *        without consulting `end`, so the input is expected to be
 *        NUL-terminated.
 * end  - end of input (not consulted by this function).
 *
 * Recognised escapes:
 *   "&..." when DPS_RECODE_HTML_FROM or DPS_RECODE_URL_FROM is set,
 *   "!..." when DPS_RECODE_URL_FROM is set,
 *   "\..." when DPS_RECODE_JSON_FROM is set (delegated to DpsJSONToUni).
 *
 * Returns the number of input bytes consumed, or DPS_CHARSET_ILSEQ when a
 * non-zero byte maps to code point 0 in the charset table.
 */
__C_LINK int __DPSCALL dps_mb_wc_8bit(DPS_CONV *conv, DPS_CHARSET *cs, dpsunicode_t *wc, const unsigned char *str, const unsigned char *end) {
  const unsigned char *p;
  unsigned char *e, z;
  unsigned int sw = 0;
  int n;

  conv->ocodes = 1;

  if ((*str == '&' && ((conv->flags & DPS_RECODE_HTML_FROM) || (conv->flags & DPS_RECODE_URL_FROM))) ||
      (*str == '!' && (conv->flags & DPS_RECODE_URL_FROM))) {

    if (str[1] == '#') {
      /* Numeric character reference: "#123" or "#x7B". */
      int matched;

      p = str + 2;
      if (str[2] == 'x' || str[2] == 'X') {
        matched = sscanf((const char *)(str + 3), "%x", &sw);
      } else {
        matched = sscanf((const char *)(str + 2), "%u", &sw);
      }
      /* No digits after "#": treat as "no entity" instead of using an
         indeterminate value; the byte is then decoded as a plain character. */
      if (matched != 1) sw = 0;

      *wc = (dpsunicode_t)sw;
      if (sw < 256 && sw > 0x20 && DpsUniCType(*wc) >= DPS_UNI_OTHER_C) {
        /* try to resolve bogus ENTITY escaping: the number may be a byte
           of the source charset rather than a Unicode code point */
        dpsunicode_t sv = cs->tab_to_uni[sw];
        if (DpsUniCType(sv) < DPS_UNI_OTHER_C) *wc = sv;
      }
    } else {
      /* Named entity: ASCII letters terminated by ';'. */
      p = str + 1;
      *wc = 0;
      if (!(conv->flags & DPS_RECODE_TEXT_FROM)) {
        for (e = (unsigned char *)str + 1;
             (e - str < DPS_MAX_SGML_LEN) &&
             (((*e <= 'z') && (*e >= 'a')) || ((*e <= 'Z') && (*e >= 'A')));
             e++);
        if (*e == ';') {
          /* NOTE(review): the input buffer is temporarily modified in place
             (';' replaced by NUL and restored) despite `str` being const. */
          z = *e;
          *e = '\0';
          n = DpsSgmlToUni((const char *)str + 1, wc);
          if (n == 0) *wc = 0;
          else conv->ocodes = (size_t)n;
          *e = z;
        }
      }
    }

    if (*wc) {
      /* Skip the entity body and an optional trailing ';'. */
      for (; isalpha(*p) || isdigit(*p); p++);
      if (*p == ';') p++;
      return conv->icodes = (size_t)(p - str);
    }
  }

  if (*str == '\\' && (conv->flags & DPS_RECODE_JSON_FROM)) {
    n = DpsJSONToUni((const char *)str + 1, wc, &conv->icodes);
    if (n) {
      conv->ocodes = n;
      return ++conv->icodes; /* account for the leading backslash */
    }
  }

  /* Plain byte: direct table lookup. */
  conv->icodes = 1;
  *wc = cs->tab_to_uni[*str];
  return (!wc[0] && str[0]) ? DPS_CHARSET_ILSEQ : 1;
}
/*
 * Return a copy of `str` in the form produced by DpsUniNormalizeNFD with
 * every code point whose DpsUniCType() is DPS_UNI_MARK_N removed.
 *
 * The returned buffer is the one obtained from DpsUniNormalizeNFD(NULL, str),
 * compacted in place; presumably the caller is responsible for releasing it
 * (TODO confirm against DpsUniNormalizeNFD's contract).
 * Returns NULL if the normalization call returns NULL.
 */
dpsunicode_t *DpsUniAccentStrip(const dpsunicode_t *str) {
  dpsunicode_t *nfd, *src, *dst;

  nfd = DpsUniNormalizeNFD(NULL, str);
  if (nfd == NULL) return NULL; /* nothing to strip; avoid NULL dereference */

  for (src = dst = nfd; *src != 0; src++) {
    if (DpsUniCType(*src) == DPS_UNI_MARK_N) continue; /* drop the mark */
    if (src != dst) *dst = *src;
    dst++;
  }
  *dst = *src; /* copy the terminator */

  return nfd;
}
/*
 * Split `line` into tokens with DpsUniGetSepToken and build a new string in
 * which the pieces are joined by single spaces.  Tokens whose first code
 * point has a DpsUniCType() above DPS_UNI_BUKVA, or equal to 1 or 2, are
 * copied as they are; every other token is first passed through
 * DpsSegmentProcess(List, token) and the segmented result is appended.
 *
 * `line` itself is not modified: tokenising is done on a private copy.
 *
 * Returns a newly allocated string (obtained from DpsMalloc/DpsRealloc), or
 * NULL on allocation failure or when DpsSegmentProcess fails.
 */
dpsunicode_t *DpsSegmentByFreq(DPS_CHINALIST *List, dpsunicode_t *line) {
  dpsunicode_t *out, *mid, *last, *sentence, *segmented, *piece, part;
  size_t i, len, l, used, need;
  int ctype, have_bukva_forte, fb_type;
  dpsunicode_t space[] = { 32, 0 };

  len = DpsUniLen(line);
  l = 2 * (len + 1);
  if (l < 2) return NULL; /* size computation wrapped around */

  out = (dpsunicode_t*)DpsMalloc(l * sizeof(dpsunicode_t));
  if (out == NULL) return NULL;
  *out = '\0';

  mid = (dpsunicode_t*)DpsXmalloc(l * sizeof(dpsunicode_t));
  if (mid == NULL) { DPS_FREE(out); return NULL; }

  /* Working copy of the input; terminate it explicitly rather than relying
     on the allocator having zero-filled the buffer. */
  for (i = 0; i < len; i++) mid[i] = line[i];
  mid[len] = 0;

  for (sentence = DpsUniGetSepToken(mid, &last, &ctype, &have_bukva_forte, 0);
       sentence != NULL;
       sentence = DpsUniGetSepToken(NULL, &last, &ctype, &have_bukva_forte, 0)) {

    /* Temporarily terminate the current token at `last`. */
    part = *last;
    *last = 0;

    segmented = NULL;
    fb_type = DpsUniCType(*sentence);
    if (fb_type > DPS_UNI_BUKVA || fb_type == 2 || fb_type == 1) {
      piece = sentence;
    } else {
      segmented = DpsSegmentProcess(List, sentence);
      if (segmented == NULL) {
        /* Release both buffers; the partial result is discarded. */
        DPS_FREE(out);
        DPS_FREE(mid);
        return NULL;
      }
      piece = segmented;
    }

    /* Make sure `out` can take a separator, the piece and the terminator. */
    need = 2 * (DpsUniLen(piece) + 1);
    used = DpsUniLen(out);
    if (used + need >= l) {
      l = used + need + 1;
      /* NOTE(review): assumes DpsRealloc releases the old block when it
         fails; if it behaves like plain realloc the old `out` leaks here. */
      out = (dpsunicode_t*)DpsRealloc(out, l * sizeof(dpsunicode_t));
      if (out == NULL) {
        if (segmented != NULL) { DPS_FREE(segmented); }
        DPS_FREE(mid);
        return NULL;
      }
    }

    if (*out) DpsUniStrCat(out, space);
    DpsUniStrCat(out, piece);

    if (segmented != NULL) { DPS_FREE(segmented); }

    *last = part; /* restore the code point overwritten above */
  }

  DPS_FREE(mid);
  return out;
}