Пример #1
0
librevenge::RVNGString libvisio::VSDMetaData::readCodePageString(librevenge::RVNGInputStream *input)
{
  uint32_t size = readU32(input);
  if (size > getRemainingLength(input))
    size = getRemainingLength(input);

  if (size == 0)
    return librevenge::RVNGString();

  std::vector<unsigned char> characters;
  for (uint32_t i = 0; i < size; ++i)
    characters.push_back(readU8(input));

  uint32_t codepage = getCodePage();
  librevenge::RVNGString string;

  if (codepage == 65001)
  {
    // http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130%28v=vs.85%29.aspx
    // says this is UTF-8.
    characters.push_back(0);
    string.append(reinterpret_cast<const char *>(characters.data()));
  }
  else
  {
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = nullptr;

    switch (codepage)
    {
    case 1252:
      // http://msdn.microsoft.com/en-us/goglobal/bb964654
      conv = ucnv_open("windows-1252", &status);
      break;
    }

    if (U_SUCCESS(status) && conv)
    {
      assert(!characters.empty());
      const auto *src = (const char *)&characters[0];
      const char *srcLimit = (const char *)src + characters.size();
      while (src < srcLimit)
      {
        UChar32 ucs4Character = ucnv_getNextUChar(conv, &src, srcLimit, &status);
        if (U_SUCCESS(status) && U_IS_UNICODE_CHAR(ucs4Character))
          appendUCS4(string, ucs4Character);
      }
    }

    if (conv)
      ucnv_close(conv);
  }

  return string;
}
Пример #2
0
SEXP R_stri_length(SEXP vec) {
  int vec_len = LENGTH(vec);
  SEXP ret = PROTECT(allocVector(INTSXP, vec_len));
  int* retint = INTEGER(ret);

  for (int i = 0; i < vec_len; i++) {
    SEXP str = STRING_ELT(vec, i);
    if (str == NA_STRING) {
      retint[i] = NA_INTEGER;
      continue;
    }

    int str_len = LENGTH(str);
    if (getCharCE(str) == CE_LATIN1 || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_LATIN1)) {
      retint[i] = str_len;
    } else if (getCharCE(str) == CE_BYTES) {
      UNPROTECT(1);
      error("Invalid encoding: bytes.");
    } else if (getCharCE(str) == CE_UTF8 || (getCharCE(str) == CE_NATIVE && getNativeCE() == CE_UTF8)) {
      UChar32 out = 0;
      const char* source = CHAR(str);
      R_len_t j = 0;
      int count;
      for (count = 0; out >= 0 && j < str_len; count++) {
        U8_NEXT(source, j, str_len, out); // faster that U8_FWD_1 & gives bad UChar32s
      }
      if (out < 0) {
        warning("Invalid UTF8 string: %s", source);
        retint[i] = NA_INTEGER;
      } else {
        retint[i] = count;
      }
    } else if (native_is_singlebyte()) { // native-8bit
      retint[i] = str_len;
    } else {
      // native encoding, not 8 bit
      UErrorCode status = U_ZERO_ERROR;
      UConverter* conv = ucnv_open(NULL, &status);
      const char* source = CHAR(str);
      const char* sourceLimit = source + str_len;
      int j;
      for (j = 0; source != sourceLimit; j++) {
        ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      }
      retint[i] = j; // all right, we got it!
    }
  }

  UNPROTECT(1);
  return ret;
}
Пример #3
0
UErrorCode convsample_13()
{
  printf("\n\n==============================================\n"
         "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n");


  const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e };
  //  const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e };
  const char *source, *sourceLimit;
  UChar32 target;
  UErrorCode status = U_ZERO_ERROR;
  UConverter *conv = NULL;
  int32_t srcCount=0;
  int32_t dstCount=0;
  
  srcCount = sizeof(sourceChars);

  conv = ucnv_open("Big5", &status);
  U_ASSERT(status);

  source = sourceChars;
  sourceLimit = sourceChars + sizeof(sourceChars);

  // **************************** START SAMPLE *******************


  printBytes("src",source,sourceLimit-source);

  while(source < sourceLimit)
  {
    puts("");
    target = ucnv_getNextUChar (conv,
                                &source,
                                sourceLimit,
                                &status);
    
    //    printBytes("src",source,sourceLimit-source);
    U_ASSERT(status);
    printUChar(target);
    dstCount++;
  }
  
  
  // ************************** END SAMPLE *************************
  
  printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount);
  ucnv_close(conv);

  return U_ZERO_ERROR;
}
Пример #4
0
void
str_ucnv_each_uchar32_starting_from(rb_str_t *self,
	long start_offset_in_bytes,
	each_uchar32_callback_t callback)
{
    USE_CONVERTER(cnv, self->encoding);

    rb_ensure_b(^{
	const char *pos = self->bytes + start_offset_in_bytes;
	const char *end = pos + self->length_in_bytes;
	bool stop = false;
	for (;;) {
	    const char *char_start_pos = pos;
	    // iterate through the string one Unicode code point at a time
	    UErrorCode err = U_ZERO_ERROR;
	    UChar32 c = ucnv_getNextUChar(cnv, &pos, end, &err);
	    if (err == U_INDEX_OUTOFBOUNDS_ERROR) {
		// end of the string
		break;
	    }
	    else if (U_FAILURE(err)) {
		long min_char_size = self->encoding->min_char_size;
		while (char_start_pos < pos) {
		    long char_len = pos - char_start_pos;
		    if (char_len > min_char_size) {
			char_len = min_char_size;
		    }
		    callback(U_SENTINEL, char_start_pos-self->bytes, char_len, &stop);
		    if (stop) {
			return;
		    }
		    char_start_pos += char_len;
		}
	    }
	    else {
		long char_len = pos - char_start_pos;
		callback(c, char_start_pos-self->bytes, char_len, &stop);
		if (stop) {
		    return;
		}
	    }
	}
    }, ^{
Пример #5
0
long
str_ucnv_length(rb_str_t *self, bool ucs2_mode)
{
    USE_CONVERTER(cnv, self->encoding);

    const char *pos = self->bytes;
    const char *end = pos + self->length_in_bytes;
    long len = 0;
    bool valid_encoding = true;
    for (;;) {
	const char *character_start_pos = pos;
	// iterate through the string one Unicode code point at a time
	UErrorCode err = U_ZERO_ERROR;
	UChar32 c = ucnv_getNextUChar(cnv, &pos, end, &err);
	if (err == U_INDEX_OUTOFBOUNDS_ERROR) {
	    // end of the string
	    break;
	}
	else if (U_FAILURE(err)) {
	    valid_encoding = false;
	    long min_char_size = self->encoding->min_char_size;
	    long converted_width = pos - character_start_pos;
	    len += div_round_up(converted_width, min_char_size);
	}
	else {
	    if (ucs2_mode && !U_IS_BMP(c)) {
		len += 2;
	    }
	    else {
		++len;
	    }
	}
    }

    ucnv_close(cnv);

    str_set_valid_encoding(self, valid_encoding);

    return len;
}
Пример #6
0
void
str_ucnv_update_flags(rb_str_t *self)
{
    USE_CONVERTER(cnv, self->encoding);

    bool ascii_only = true;
    bool valid_encoding = true;

    const char *pos = self->bytes;
    const char *end = pos + self->length_in_bytes;
    for (;;) {
	// iterate through the string one Unicode code point at a time
	UErrorCode err = U_ZERO_ERROR;
	UChar32 c = ucnv_getNextUChar(cnv, &pos, end, &err);
	if (U_FAILURE(err)) {
	    if (err == U_INDEX_OUTOFBOUNDS_ERROR) {
		// end of the string
		break;
	    }
	    else {
		// conversion error
		valid_encoding = false;
		ascii_only = false;
		break;
	    }
	}
	else {
	    if (c > 127) {
		ascii_only = false;
	    }
	}
    }

    ucnv_close(cnv);

    str_set_valid_encoding(self, valid_encoding);
    str_set_ascii_only(self, ascii_only);
}
Пример #7
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)

    R_len_t str_n = LENGTH(str);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
    int* retint = INTEGER(ret);

    StriUcnv ucnvNative(NULL);

    for (R_len_t k = 0; k < str_n; k++) {
        SEXP curs = STRING_ELT(str, k);
        if (curs == NA_STRING) {
            retint[k] = NA_INTEGER;
            continue;
        }

        R_len_t curs_n = LENGTH(curs);  // O(1) - stored by R
        if (IS_ASCII(curs) || IS_LATIN1(curs)) {
            retint[k] = curs_n;
        }
        else if (IS_BYTES(curs)) {
            throw StriException(MSG__BYTESENC);
        }
        else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8
            UChar32 c = 0;
            const char* curs_s = CHAR(curs);
            R_len_t j = 0;
            R_len_t i = 0;
            while (c >= 0 && j < curs_n) {
                U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s
                i++;
            }

            if (c < 0) { // invalid utf-8 sequence
                Rf_warning(MSG__INVALID_UTF8);
                retint[k] = NA_INTEGER;
            }
            else
                retint[k] = i;
        }
        else if (ucnvNative.is8bit()) { // native-8bit
            retint[k] = curs_n;
        }
        else { // native encoding, not 8 bit

            UConverter* uconv = ucnvNative.getConverter();

            // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
            // this is weird, but we'll face it
            UErrorCode status = U_ZERO_ERROR;
            const char* source = CHAR(curs);
            const char* sourceLimit = source + curs_n;
            R_len_t j;
            for (j = 0; source != sourceLimit; j++) {
                /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            }
            retint[k] = j; // all right, we got it!
        }
    }

    STRI__UNPROTECT_ALL
    return ret;

    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
Пример #8
0
UErrorCode convsample_06()
{
  printf("\n\n==============================================\n"
         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  CharFreqInfo   *info;
  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
  UChar32   p;

  uint32_t ie = 0;
  uint32_t gh = 0;
  UChar32 l = 0;

  f = fopen("data06.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
  if(!info)
  {
    fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount);
  }

  /* reset frequencies */
  for(p=0;p<charCount;p++)
  {
    info[p].codepoint = p;
    info[p].frequency = 0;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    while(source < sourceLimit)
    {
      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      if(U_FAILURE(status))
      {
        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
        status = U_ZERO_ERROR;
        continue;
      }
      U_ASSERT(status);
      total++;

      if(u_isalpha(p))
        letters++;

      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
        ie++;

      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
        gh++;

      if(p>charCount)
      {
        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
        return U_UNSUPPORTED_ERROR;
      }
      info[p].frequency++;
      l = p;
    }
  }

  fclose(f);
  ucnv_close(conv);

  printf("%d letters out of %d total UChars.\n", letters, total);
  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);

  // now, we could sort it..

  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);

  for(p=0;p<charCount;p++)
  {
    if(info[p].frequency)
    {
      printf("% 5d U+%06X ", info[p].frequency, p);
      if(p <= 0xFFFF)
      {
        prettyPrintUChar((UChar)p);
      }
      printf("\n");
    }
  }
  free(info);
  // ***************************** END SAMPLE ********************

  printf("\n");

  return U_ZERO_ERROR;
}
Пример #9
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) Multiple input encoding support
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_length(SEXP str)
{
   str = stri_prepare_arg_string(str, "str");
   R_len_t ns = LENGTH(str);
   SEXP ret;

   UConverter* uconv = NULL;
   bool uconv_8bit = false;
   bool uconv_utf8 = false;

   STRI__ERROR_HANDLER_BEGIN

/* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */
#define STRI_LENGTH_CALCULATE_UTF8  \
   const char* qc = CHAR(q);        \
   R_len_t j = 0;                   \
   for (R_len_t i = 0; i < nq; j++) \
      U8_FWD_1(qc, i, nq);          \
   retint[k] = j;

   PROTECT(ret = Rf_allocVector(INTSXP, ns));
   int* retint = INTEGER(ret);
   for (R_len_t k = 0; k < ns; k++) {
      SEXP q = STRING_ELT(str, k);
      if (q == NA_STRING)
         retint[k] = NA_INTEGER;
      else {
         R_len_t nq = LENGTH(q);  // O(1) - stored by R

         // We trust (is that a wise assumption?)
         // R encoding marks; However, it there is no mark,
         // the string may have any encoding (ascii, latin1, utf8, native)
         if (IS_ASCII(q) || IS_LATIN1(q))
            retint[k] = nq;
         else if (IS_BYTES(q))
            throw StriException(MSG__BYTESENC);
         else if (IS_UTF8(q)) {
            STRI_LENGTH_CALCULATE_UTF8
         }
         else { // Any encoding - detection needed
//            UTF-8 strings can be fairly reliably recognized as such by a
//            simple algorithm, i.e., the probability that a string of
//            characters in any other encoding appears as valid UTF-8 is low,
//            diminishing with increasing string length.
//            We have two possibilities here:
//            1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native
//                This won't work correctly in some cases.
//                e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8
//                and ("A umlaut", "Ellipsis") in WINDOWS-1250
//            2. Assume it's Native; this assumes the user working in an 8-bit environment
//                would convert strings to UTF-8 manually if needed - I think is's
//                a more reasonable approach (Native --> input via keyboard)

            if (!uconv) { // open ucnv on demand
               uconv = stri__ucnv_open((const char*)NULL); // native decoder
               if (!uconv) {
                  retint[k] = NA_INTEGER;
                  continue;
               }
               uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1);
               if (!uconv_8bit) {
                  UErrorCode err = U_ZERO_ERROR;
                  const char* name = ucnv_getName(uconv, &err);
                  if (U_FAILURE(err))
                     throw StriException("could not query default converter");
                  uconv_utf8 = !strncmp("UTF-8", name, 5);
               }
            }

            if (uconv_8bit) {
               retint[k] = nq; // it's an 8-bit encoding :-)
            }
            else if (uconv_utf8) { // it's UTF-8
               STRI_LENGTH_CALCULATE_UTF8
            }
            else {  // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
               UErrorCode err = U_ZERO_ERROR;
               const char* source = CHAR(q);
               const char* sourceLimit = source + nq;
               R_len_t j;
               for (j = 0; source != sourceLimit; j++) {
                  if (U_FAILURE(err)) break; // error from previous iteration
                  // iterate through each native-encoded character:
                  ucnv_getNextUChar(uconv, &source, sourceLimit, &err);
               }
               if (U_FAILURE(err)) {  // error from last iteration
                  Rf_warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string.");
                  retint[k] = NA_INTEGER;
               }
               else
                  retint[k] = j; // all right, we got it!
            }
         }
      }