Esempio n. 1
0
/*
 * Decode one character of an 8-bit charset into a Unicode code point.
 *
 * conv  conversion state; conv->flags selects which escape syntaxes are
 *       recognised, conv->icodes receives the number of input bytes
 *       consumed and conv->ocodes the number of code points produced.
 * cs    charset descriptor; cs->tab_to_uni maps a byte to its code point.
 * wc    output code point(s).
 * str   input bytes.
 * end   not consulted by this function: the input is scanned as a
 *       NUL-terminated string.
 *
 * Escapes handled before the plain table lookup:
 *   - "&#NNN;" / "&#xHHH;" numeric references and "&name;" named entities,
 *     when str starts with '&' and DPS_RECODE_HTML_FROM or
 *     DPS_RECODE_URL_FROM is set, or starts with '!' and
 *     DPS_RECODE_URL_FROM is set;
 *   - backslash escapes when DPS_RECODE_JSON_FROM is set.
 *
 * Returns the number of input bytes consumed, or DPS_CHARSET_ILSEQ when a
 * non-zero input byte has no mapping in cs->tab_to_uni.
 *
 * NOTE: the named-entity branch temporarily overwrites the terminating ';'
 * in the input through a cast that drops const, so the buffer behind str
 * must be writable.
 */
__C_LINK int __DPSCALL dps_mb_wc_8bit(DPS_CONV *conv, DPS_CHARSET *cs, dpsunicode_t *wc,
				      const unsigned char *str, const unsigned char *end) {

  const unsigned char *p;
  unsigned char *e, z;
  unsigned int sw;
  int n;

  conv->ocodes = 1;

  if ( (*str == '&' && ((conv->flags & DPS_RECODE_HTML_FROM)||(conv->flags & DPS_RECODE_URL_FROM)) ) ||
       (*str == '!' && (conv->flags & DPS_RECODE_URL_FROM)) ) {
    if (str[1] == '#') {
      int dec = 0;

      p = str + 2;
      /* Start from 0 so that a reference with no digits ("&#;") is treated
         as "not an entity" instead of reading an indeterminate value. */
      sw = 0;
      if (str[2] == 'x' || str[2] == 'X') {
        if (sscanf((const char*)(str + 3), "%x", &sw) != 1) sw = 0;
      } else if (sscanf((const char*)(str + 2), "%d", &dec) == 1) {
        /* %d requires an int object; convert afterwards. */
        sw = (unsigned int)dec;
      }
      *wc = (dpsunicode_t)sw;
      if (sw < 256 && sw > 0x20 && DpsUniCType(*wc) >= DPS_UNI_OTHER_C) {
        /* Try to resolve bogus entity escaping: reinterpret the number as a
           byte of this charset and keep that mapping if it classifies
           below DPS_UNI_OTHER_C. */
        dpsunicode_t sv = cs->tab_to_uni[sw];
        if (DpsUniCType(sv) < DPS_UNI_OTHER_C) *wc = sv;
      }
    } else {
      p = str + 1;
      if (!(conv->flags & DPS_RECODE_TEXT_FROM)) {
        /* Scan the run of ASCII letters that forms the entity name. */
        for (e = (unsigned char*)str + 1;
             (e - str < DPS_MAX_SGML_LEN) && (((*e <= 'z') && (*e >= 'a')) || ((*e <= 'Z') && (*e >= 'A')));
             e++);
        if (*e == ';') {
          /* Terminate the name in place for the lookup, then restore. */
          z = *e;
          *e = '\0';
          n = DpsSgmlToUni((const char*)str + 1, wc);
          if (n == 0) *wc = 0;
          else conv->ocodes = (size_t)n;
          *e = z;
        } else *wc = 0;
      } else *wc = 0;
    }
    if (*wc) {
      /* Skip the entity body and an optional trailing ';'. */
      for (; isalpha(*p) || isdigit(*p); p++);
      if (*p == ';') p++;
      return conv->icodes = (size_t)(p - str);
    }
  }
  if ( *str == '\\' && (conv->flags & DPS_RECODE_JSON_FROM)) {
    n = DpsJSONToUni((const char*)str + 1, wc, &conv->icodes);
    if (n) {
      conv->ocodes = n;
      /* Account for the leading backslash as well. */
      return ++conv->icodes;
    }
  }

  /* Plain single-byte lookup. */
  conv->icodes = 1;
  *wc = cs->tab_to_uni[*str];
  return (!wc[0] && str[0]) ? DPS_CHARSET_ILSEQ : 1;
}
Esempio n. 2
0
/*
 * Return a copy of str with accents stripped.
 *
 * The string is first passed through DpsUniNormalizeNFD(NULL, str); the
 * resulting buffer is then compacted in place, dropping every code point
 * whose DpsUniCType() is DPS_UNI_MARK_N. The terminating 0 is kept.
 *
 * Returns the buffer obtained from DpsUniNormalizeNFD, or NULL if that
 * call returned NULL.
 * NOTE(review): the buffer is presumably heap-allocated and owned by the
 * caller -- confirm against DpsUniNormalizeNFD.
 */
dpsunicode_t *DpsUniAccentStrip(const dpsunicode_t *str) {
  dpsunicode_t *nfd, *src, *dst;

  nfd = DpsUniNormalizeNFD(NULL, str);
  /* Guard against a failed normalization rather than dereferencing NULL. */
  if (nfd == NULL) return NULL;

  for (src = dst = nfd; *src != 0; src++) {
    if (DpsUniCType(*src) == DPS_UNI_MARK_N) continue;
    /* Only copy once a gap has opened between reader and writer. */
    if (src != dst) *dst = *src;
    dst++;
  }
  *dst = *src; /* copy the terminator */
  return nfd;
}
Esempio n. 3
0
/*
 * Build a space-separated, segmented copy of line.
 *
 * line is copied into a scratch buffer and split into tokens with
 * DpsUniGetSepToken(). A token whose first code point has a DpsUniCType()
 * greater than DPS_UNI_BUKVA, or equal to 1 or 2, is appended verbatim;
 * any other token is first passed through DpsSegmentProcess(List, token)
 * and the result is appended. Appended pieces are separated by a single
 * space (32).
 *
 * Returns a buffer allocated with DpsMalloc/DpsRealloc, or NULL on
 * allocation failure or when DpsSegmentProcess() returns NULL.
 * NOTE(review): the caller presumably owns and frees the result -- confirm.
 */
dpsunicode_t *DpsSegmentByFreq(DPS_CHINALIST *List, dpsunicode_t *line) {
  dpsunicode_t *out, *mid, *last, *sentence, *segmented_sentence, part;
  size_t i, j, l, a, line_len;
  int ctype, have_bukva_forte, fb_type;
  dpsunicode_t space[] = { 32, 0 };

  /* Measure once; the original re-measured on every loop iteration. */
  line_len = DpsUniLen(line);
  l = 2 * (line_len + 1);
  if (l < 2) return NULL;
  out = (dpsunicode_t*)DpsMalloc(l * sizeof(dpsunicode_t));
  if (out == NULL) return NULL;
  *out = '\0';
  mid = (dpsunicode_t*)DpsXmalloc(l * sizeof(dpsunicode_t));
  if (mid == NULL) { DPS_FREE(out); return NULL; }
  *mid = '\0';

  /* Work on a private copy: the tokenizer loop below writes into it. */
  for (i = j = 0; i < line_len; i++) {
    mid[j++] = line[i];
  }
  /* The copy loop writes no terminator; add it explicitly instead of
     depending on the allocator's initial contents (j == line_len < l). */
  mid[j] = 0;

  for (sentence = DpsUniGetSepToken(mid, &last, &ctype, &have_bukva_forte, 0);
       sentence;
       sentence = DpsUniGetSepToken(NULL, &last, &ctype, &have_bukva_forte, 0)) {
    /* Temporarily terminate the token in place; restored at loop end. */
    part = *last;
    *last = 0;
    fb_type = DpsUniCType(*sentence);

    if (fb_type > DPS_UNI_BUKVA || fb_type == 2 || fb_type == 1) {
      /* Append the token unchanged. */
      a = 2 * (DpsUniLen(sentence) + 1);
      j = DpsUniLen(out);
      if (j + a >= l) {
        l = j + a + 1;
        out = (dpsunicode_t*)DpsRealloc(out, l * sizeof(dpsunicode_t));
        if (out == NULL) {
          DPS_FREE(mid); return NULL;
        }
      }
      if (*out) DpsUniStrCat(out, space);
      DpsUniStrCat(out, sentence);
    } else {
      segmented_sentence = DpsSegmentProcess(List, sentence);
      if (segmented_sentence == NULL) {
        /* Release both buffers; the original freed only mid and leaked out. */
        DPS_FREE(out);
        DPS_FREE(mid);
        return NULL;
      }
      a = 2 * (DpsUniLen(segmented_sentence) + 1);
      j = DpsUniLen(out);
      if (j + a >= l) {
        l = j + a + 1;
        out = (dpsunicode_t*)DpsRealloc(out, l * sizeof(dpsunicode_t));
        if (out == NULL) {
          DPS_FREE(mid); return NULL;
        }
      }
      if (*out) DpsUniStrCat(out, space);
      DpsUniStrCat(out, segmented_sentence);
      DPS_FREE(segmented_sentence);
    }
    *last = part;
  }

  DPS_FREE(mid);

  return out;
}