示例#1
0
文件: gtm_conv.c 项目: mihawk/fis-gtm
/* Convert the string described by "src" from the charset of converter "from" to the charset of
 * converter "to".
 *
 * If "dstbuff" is NULL the output is produced at stringpool.free (after ensuring enough stringpool
 * space); otherwise it is written into "dstbuff", whose capacity is read from "*bufflen".
 *
 * Returns the number of bytes produced, 0 for an empty source, or -1 when a caller-supplied buffer
 * is too small (in which case "*bufflen" is updated with a sufficient size estimate).
 */
int gtm_conv(UConverter* from, UConverter* to, mstr *src, char* dstbuff, int* bufflen)
{
	char		*out_start, *out_cur, *in_cur;
	const char	*from_name;
	int		out_cap, uchar_cnt, in_len;
	UErrorCode	conv_status, name_status;

	if (0 == src->len)
		return 0;
	if (NULL != dstbuff)
	{	/* Caller owns the output buffer and tells us its capacity */
		out_start = dstbuff;
		out_cap = *bufflen;
	} else
	{	/* Size the stringpool request: the source can hold at most (len / min bytes per char) + 1
		 * UChars, and each of them can need up to the target's maximum bytes per char.
		 */
		uchar_cnt = (src->len / ucnv_getMinCharSize(from)) + 1;
		out_cap = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_cnt, ucnv_getMaxCharSize(to));
		if (out_cap > MAX_STRLEN)
			out_cap = MAX_STRLEN;
		ENSURE_STP_FREE_SPACE(out_cap);
		out_start = (char *)stringpool.free;
	}
	in_cur = src->addr;
	in_len = (int)src->len;
	out_cur = out_start;
	conv_status = U_ZERO_ERROR; /* ICU requires the status to start out as "success" */
	ucnv_convertEx(to, from, &out_cur, out_cur + out_cap, (const char**)&in_cur, in_cur + in_len,
		NULL, NULL, NULL, NULL, TRUE, TRUE, &conv_status);
	if (U_SUCCESS(conv_status))
		return (int) (out_cur - out_start);
	if (U_BUFFER_OVERFLOW_ERROR == conv_status)
	{
		if (NULL != dstbuff)
		{	/* Caller's buffer was too small: report a size that is large enough */
			uchar_cnt = (in_len / ucnv_getMinCharSize(from)) + 1;
			*bufflen = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_cnt, ucnv_getMaxCharSize(to));
			return -1;
		}
		/* The translation needs more room than the maximum allowed string size */
		rts_error_csa(NULL, VARLSTCNT(1) ERR_MAXSTRLEN);
	}
	/* Any other failure is reported against the byte just before the converter's source position */
	name_status = U_ZERO_ERROR;
	from_name = ucnv_getName(from, &name_status);
	assert(U_SUCCESS(name_status));
	UTF8_BADCHAR(1,(unsigned char *) (in_cur - 1), NULL,STRLEN(from_name), from_name);
	return (int) (out_cur - out_start);
}
示例#2
0
// Fill in the charset descriptor `cs` for the ICU charset called `charSetName`.
// Returns false when ICU cannot open a converter for that name; otherwise returns
// whether the ICU status is still a success after encoding the space character.
bool CSICU_charset_init(charset* cs,
						const ASCII* charSetName)
{
	UErrorCode status = U_ZERO_ERROR;
	UConverter* conv = ucnv_open(charSetName, &status);

	if (U_FAILURE(status))
		return false;

	// The caller's name buffer is not kept; store a private copy in the descriptor.
	ASCII* nameCopy = new ASCII[strlen(charSetName) + 1];
	strcpy(nameCopy, charSetName);
	cs->charset_name = nameCopy;

	cs->charset_version = CHARSET_VERSION_1;
	cs->charset_flags |= CHARSET_ASCII_BASED;
	cs->charset_fn_destroy = charset_destroy;
	cs->charset_fn_well_formed = NULL;
	cs->charset_min_bytes_per_char = ucnv_getMinCharSize(conv);
	cs->charset_max_bytes_per_char = ucnv_getMaxCharSize(conv);

	// Encode U+0020 in the target charset; the buffer is sized for the widest character.
	const UChar unicodeSpace = 32;
	BYTE* spaceBytes = new BYTE[cs->charset_max_bytes_per_char];
	cs->charset_space_character = spaceBytes;
	cs->charset_space_length = ucnv_fromUChars(conv, reinterpret_cast<char*>(spaceBytes),
		cs->charset_max_bytes_per_char, &unicodeSpace, 1, &status);
	fb_assert(U_SUCCESS(status));

	ucnv_close(conv);

	CVICU_convert_init(cs);

	return U_SUCCESS(status);
}
示例#3
0
/*
 * Convert `srcsiz` bytes at `src` from the charset of `from` to the charset
 * of `to`, going through an intermediate UTF-16 buffer.
 *
 * When `to` is NULL no conversion is done and the input is copied verbatim.
 *
 * On success returns 0, stores a malloc()ed, NUL-terminated buffer in `*dst`
 * (the caller must free() it) and its length, excluding the terminator, in
 * `*dstsiz`.  Returns -1 on empty input, NULL arguments, allocation failure
 * or an ICU conversion error.
 */
inline int
mod_websocket_conv(UConverter *to, UConverter *from,
                   char **dst, size_t *dstsiz,
                   const char *src, size_t srcsiz) {
    UErrorCode err = U_ZERO_ERROR;
    size_t unisiz;
    int minsiz;
    UChar *unibuf, *punibuf, *ppunibuf;
    char *pdst;

    /* Validate the output/input pointers before ANY branch dereferences them
     * (the pass-through branch below used to run ahead of these checks). */
    if (srcsiz == 0 || !dst || !src || !dstsiz) {
        return -1;
    }
    if (!to) {
        if (srcsiz == (size_t)-1) { /* srcsiz + 1 would wrap to 0 */
            return -1;
        }
        *dst = (char *)malloc(srcsiz + 1);
        if (*dst == NULL) {
            return -1;
        }
        memcpy(*dst, src, srcsiz);
        (*dst)[srcsiz] = '\0';
        *dstsiz = srcsiz;
        return 0;
    }
    if (!from) {
        return -1;
    }
    minsiz = ucnv_getMinCharSize(from);
    if (minsiz <= 0) {
        return -1;
    }
    unisiz = srcsiz / (size_t)minsiz;
    if (unisiz >= (size_t)-1 / sizeof(UChar)) { /* size computation would overflow */
        return -1;
    }
    /* Room for unisiz UChars PLUS a full UChar terminator.  The previous
     * `sizeof(UChar) * unisiz + 1` reserved only one extra byte, so writing
     * the terminator could run past the end of the allocation. */
    unibuf = (UChar *)malloc(sizeof(UChar) * (unisiz + 1));
    if (!unibuf) {
        return -1;
    }
    punibuf = unibuf;
    ucnv_toUnicode(from, &punibuf, punibuf + unisiz,
                   &src, src + srcsiz, 0, 0, &err);
    if (U_FAILURE(err)) {
        free(unibuf);
        return -1;
    }
    *punibuf = 0;
    /* Worst case: every UChar expands to the target's widest character */
    *dstsiz = (punibuf - unibuf) * ucnv_getMaxCharSize(to);
    *dst = (char *)malloc(*dstsiz + 1);
    if (!*dst) {
        free(unibuf);
        return -1;
    }
    pdst = *dst;
    ppunibuf = unibuf;
    ucnv_fromUnicode(to, &pdst, pdst + *dstsiz,
                     (const UChar **)&ppunibuf, punibuf, 0, 0, &err);
    free(unibuf);
    if (U_FAILURE(err)) {
        free(*dst);
        *dst = NULL; /* do not hand a dangling pointer back to the caller */
        return -1;
    }
    *pdst = '\0';
    *dstsiz = pdst - *dst;
    return 0;
}
示例#4
0
/*
 * Report the size of `buf` in characters: for a buffered stream, the file
 * size minus the signature length, divided by the converter's minimum bytes
 * per character; otherwise the length of the in-memory buffer.
 * Returns 0 when `buf` is NULL.
 */
U_CAPI int32_t U_EXPORT2
ucbuf_size(UCHARBUF* buf){
    if(!buf){
        return 0;
    }
    if(!buf->isBuffered){
        return (int32_t)(buf->bufLimit - buf->buffer);
    }
    return (T_FileStream_size(buf->in)-buf->signatureLength)/ucnv_getMinCharSize(buf->conv);
}
示例#5
0
// ---------------------------------------------------------------------------
//  ICUTranscoder: Constructors and Destructor
// ---------------------------------------------------------------------------
ICUTranscoder::ICUTranscoder(const  XMLCh* const        encodingName
                            ,       UConverter* const   toAdopt
                            , const unsigned int        blockSize
                            , MemoryManager* const      manager) :

    XMLTranscoder(encodingName, blockSize, manager)
    , fConverter(toAdopt)
    , fFixed(false)
    , fSrcOffsets(0)
{
    // The source offset array is only needed when a block size was given;
    // it holds one XMLUInt32 per block entry.
    if (blockSize != 0)
    {
        const unsigned int offsetBytes = blockSize * sizeof(XMLUInt32);
        fSrcOffsets = (XMLUInt32*) manager->allocate(offsetBytes);
    }

    // An encoding is fixed-width when its widest and narrowest characters
    // occupy the same number of bytes.
    if (ucnv_getMaxCharSize(fConverter) == ucnv_getMinCharSize(fConverter))
        fFixed = true;
    else
        fFixed = false;
}
示例#6
0
/* private function used for buffering input */
/*
 * Refill the UChar buffer of `f` from its underlying file.
 *
 * Unread UChars are moved to the front of the buffer, more bytes are read
 * from the file and converted (through the file's converter, or with the
 * invariant conversion when there is none) and appended after them.
 * Does nothing for string-backed UFILEs, or when reading from stdin while
 * unread data is still buffered.
 */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode  convStatus;
    const char  *src;
    const char  *srcLimit;
    UChar       *dst;
    int32_t     byteBudget;
    int32_t     got;
    int32_t     room;
    int32_t     pending;
    char        rawBytes[UFILE_CHARBUFFER_SIZE];
    u_localized_string *str;

    /* A UFILE without a FILE* is a string: nothing to read. */
    if (f->fFile == NULL) {
        return;
    }

    str = &f->str;
    pending = (int32_t)(str->fLimit - str->fPos);

    /* Avoid blocking on stdin again while buffered data remains. */
    if (f->fFileno == 0 && pending > 0) {
        return;
    }

    /* Slide the unread UChars down to the start of the buffer. */
    if (pending != 0) {
        uprv_memmove(f->fUCBuffer, str->fPos, pending * sizeof(UChar));
    }

    /* Free UChar slots left after the unread data. */
    room = UFILE_UCHARBUFFER_SIZE - pending;

    /* Number of codepage bytes to request; with no converter the invariant
       conversion is used and each byte is one character. */
    if (f->fConverter != NULL) {
        byteBudget = room / (2*ucnv_getMinCharSize(f->fConverter));
    }
    else {
        byteBudget = room;
    }

    if (f->fFileno != 0) {
        /* Ordinary file: read a block of bytes. */
        got = (int32_t)fread(rawBytes,
                             sizeof(char),
                             ufmt_min(byteBudget, UFILE_CHARBUFFER_SIZE),
                             f->fFile);
    }
    else {
        /* stdin: read a single line at a time. */
        char *line = fgets(rawBytes, ufmt_min(byteBudget, UFILE_CHARBUFFER_SIZE), f->fFile);
        got = (int32_t)(line ? uprv_strlen(rawBytes) : 0);
    }

    /* Conversion window: source is what was just read, target starts after
       the data that was kept. */
    convStatus = U_ZERO_ERROR;
    src        = rawBytes;
    srcLimit   = rawBytes + got;
    dst        = f->fUCBuffer + pending;

    if (f->fConverter == NULL) {
        /* No converter: invariant conversion, one UChar per byte. */
        u_charsToUChars(src, dst, got);
        dst += got;
    }
    else {
        /* Flush the converter only once the file has hit end-of-file. */
        ucnv_toUnicode(f->fConverter,
                       &dst,
                       f->fUCBuffer + UFILE_UCHARBUFFER_SIZE,
                       &src,
                       srcLimit,
                       NULL,
                       (UBool)(feof(f->fFile) != 0),
                       &convStatus);
    }

    /* Publish the new read window. */
    str->fPos    = str->fBuffer;
    str->fLimit  = dst;
}
示例#7
0
// Sample 40: read 'data02.bin' (cp37), convert it to UTF-16 with a streaming
// ucnv_toUnicode() loop and write the UChars to 'data40.utf16'.
// Returns U_ZERO_ERROR on success, or U_FILE_ACCESS_ERROR when a file cannot
// be opened or the output cannot be written.
UErrorCode convsample_40()
{
  printf("\n\n==============================================\n"
    "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");

  FILE *f;
  FILE *out;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  int32_t uBufSize = 0;
  UConverter *conv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t inbytes=0, total=0;

  f = fopen("data02.bin", "rb");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  out = fopen("data40.utf16", "wb");
  if(!out)
  {
    fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    fclose(f);
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_openCCSID(37, UCNV_IBM, &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    inbytes += count;

    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    do
    {
        target = uBuf;
        targetLimit = uBuf + uBufSize;
        
        ucnv_toUnicode( conv, &target, targetLimit, 
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                         &status);
      
        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          //  Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates
        // The write must NOT live inside assert(): with NDEBUG defined the
        // whole expression is compiled out and nothing would be written.
        size_t produced = (size_t)(target-uBuf);
        if(fwrite(uBuf, sizeof(uBuf[0]), produced, out) != produced)
        {
          fprintf(stderr, "Couldn't write to file 'data40.utf16'.\n");
          free(uBuf);
          ucnv_close(conv);
          fclose(f);
          fclose(out);
          return U_FILE_ACCESS_ERROR;
        }
        total += (uint32_t)produced;
    } while (source < sourceLimit); // while simply out of space
  }

  // inbytes/total are unsigned, so print them with %u
  printf("%u bytes in,  %u UChars out.\n", inbytes, total);
  
  // ***************************** END SAMPLE ********************
  free(uBuf);   // was leaked before
  ucnv_close(conv);

  fclose(f);
  fclose(out);
  printf("\n");

  return U_ZERO_ERROR;
}
示例#8
0
// Sample 06: read 'data06.txt' as UTF-8 one code point at a time with
// ucnv_getNextUChar(), count letters and the "ie" / "g+U+0127" digraphs, and
// print a frequency table of the code points seen.
// Only code points below charCount (the BMP) are supported.
// Returns U_ZERO_ERROR on success, U_FILE_ACCESS_ERROR if the input cannot be
// opened, U_MEMORY_ALLOCATION_ERROR if the table cannot be allocated, and
// U_UNSUPPORTED_ERROR when a code point outside the table is encountered.
UErrorCode convsample_06()
{
  printf("\n\n==============================================\n"
         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  CharFreqInfo   *info;
  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
  UChar32   p;

  uint32_t ie = 0;
  uint32_t gh = 0;
  UChar32 l = 0;

  f = fopen("data06.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
  if(!info)
  {
    // size_t must not be printed with %d; and we cannot go on without the table
    fprintf(stderr, " Couldn't allocate %lu bytes for freq counter\n",
            (unsigned long)(sizeof(CharFreqInfo)*charCount));
    fclose(f);
    return U_MEMORY_ALLOCATION_ERROR;
  }

  /* reset frequencies */
  for(p=0;p<charCount;p++)
  {
    info[p].codepoint = p;
    info[p].frequency = 0;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  // Informational only: this sample reads code points directly and needs no
  // UChar buffer (the previously allocated one was never used nor freed).
  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    while(source < sourceLimit)
    {
      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      if(U_FAILURE(status))
      {
        fprintf(stderr, "%s @ %u\n", u_errorName(status), total);
        status = U_ZERO_ERROR;
        continue;
      }
      // status is U_ZERO_ERROR (0) on success, so asserting the raw value
      // would fire on every good character; assert success instead.
      assert(U_SUCCESS(status));
      total++;

      if(u_isalpha(p))
        letters++;

      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
        ie++;

      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
        gh++;

      // Valid indices are 0 .. charCount-1: p == charCount is already out of
      // bounds, so the test must be >= (and must reject negative values).
      if(p < 0 || p >= charCount)
      {
        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
        free(info);
        ucnv_close(conv);
        fclose(f);
        return U_UNSUPPORTED_ERROR;
      }
      info[p].frequency++;
      l = p;
    }
  }

  fclose(f);
  ucnv_close(conv);

  printf("%u letters out of %u total UChars.\n", letters, total);
  printf("%u ie digraphs, %u gh digraphs.\n", ie, gh);

  // now, we could sort it..

  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);

  for(p=0;p<charCount;p++)
  {
    if(info[p].frequency)
    {
      printf("% 5d U+%06X ", info[p].frequency, p);
      if(p <= 0xFFFF)
      {
        prettyPrintUChar((UChar)p);
      }
      printf("\n");
    }
  }
  free(info);
  // ***************************** END SAMPLE ********************

  printf("\n");

  return U_ZERO_ERROR;
}
示例#9
0
// Sample 05: read 'data01.txt' as UTF-8, convert it to UChars with a
// streaming ucnv_toUnicode() loop and count how many of them are letters.
// Returns U_ZERO_ERROR on success, or U_FILE_ACCESS_ERROR when the input
// file cannot be opened.
UErrorCode convsample_05()
{
  printf("\n\n==============================================\n"
         "Sample 05: C: count the number of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  UChar *p;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  f = fopen("data01.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) && 
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;
    
    do
    {
        target = uBuf;
        targetLimit = uBuf + uBufSize;
        
        ucnv_toUnicode(conv, &target, targetLimit, 
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                       &status);
      
        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          //  Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates

        for(p = uBuf; p<target; p++)
        {
          if(u_isalpha(*p))
            letters++;
          total++;
        }
    } while (source < sourceLimit); // while simply out of space
  }

  // letters/total are unsigned, so print them with %u
  printf("%u letters out of %u total UChars.\n", letters, total);
  
  // ***************************** END SAMPLE ********************
  free(uBuf);   // was leaked before
  ucnv_close(conv);

  printf("\n");

  fclose(f);

  return U_ZERO_ERROR;
}
示例#10
0
/*
 * Convert the contents of `buffer`, interpreted in the charset named by
 * `encoding`, into a newly palloc'ed UChar string stored in `*uBuf`.
 *
 * When `force` is true the converter is set up to skip bytes it cannot
 * convert, and `*dropped_bytes` reports the flag recorded by the flagging
 * callback.  On success `*uBuf_len` is the converted length plus one for the
 * terminator.  Every failure is reported as a WARNING and the ICU status is
 * returned.  The caller is responsible for pfree'ing `*uBuf`.
 */
UErrorCode
convert_to_unicode(const text* buffer, const text* encoding, UChar** uBuf, int32_t *uBuf_len, bool force, bool* dropped_bytes)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv;
    int32_t converted_len = 0;
    ToUFLAGContext *flag_ctx = NULL;   /* only set when force is true */
    size_t pivot_cap = 0;
    const char *enc_name = text_to_cstring(encoding);

    /* open a converter for the requested encoding */
    conv = ucnv_open(enc_name, &status);
    if (U_FAILURE(status))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("Cannot open %s converter - error: %s.\n", (const char *) enc_name, u_errorName(status))));
        goto cleanup;
    }

    if (force)
    {
        /* First install the stock SKIP callback; the flagging callback set
         * below saves it as its sub-callback and chains to it. */
        ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &status);
        if (U_FAILURE(status))
        {
            ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("Cannot set callback on converter - error: %s.\n", u_errorName(status))));
            goto cleanup;
        }

        /* Now install the flagging callback on top of it */
        flag_ctx = flagCB_toU_openContext();
        ucnv_setToUCallBack(conv,
                            flagCB_toU,
                            flag_ctx,
                            &(flag_ctx->subCallback),
                            &(flag_ctx->subContext),
                            &status
                           );
        if (U_FAILURE(status))
        {
            ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("Cannot set callback on converter - error: %s.\n", u_errorName(status))));
            goto cleanup;
        }
    }

    /* Pivot buffer: one UChar per minimum-size source character, plus one */
    pivot_cap = (VARSIZE_ANY_EXHDR(buffer)/ucnv_getMinCharSize(conv) + 1);
    *uBuf = (UChar*) palloc0(pivot_cap * sizeof(UChar));
    if (*uBuf == NULL)
    {
        status = U_MEMORY_ALLOCATION_ERROR;

        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("Cannot allocate %d bytes for Unicode pivot buffer - error: %s.\n", (int) pivot_cap, u_errorName(status))));
        goto cleanup;
    }

    ereport(DEBUG1,
        (errcode(ERRCODE_SUCCESSFUL_COMPLETION),
            errmsg("Original string: %s\n", (const char*) text_to_cstring(buffer))));

    /* The returned length does not count the NUL terminator */
    converted_len = ucnv_toUChars(conv,
                                  *uBuf,
                                  pivot_cap,
                                  (const char*) text_to_cstring(buffer),
                                  STRING_IS_NULL_TERMINATED,
                                  &status
                                 );

    if (U_FAILURE(status))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU conversion from %s to Unicode failed - error: %s.\n", enc_name, u_errorName(status))));
    }
    else
    {
        /* account for the NUL terminator */
        *uBuf_len = converted_len + 1;

        ereport(DEBUG1,
            (errcode(ERRCODE_SUCCESSFUL_COMPLETION),
                errmsg("Converted string: %s\n", (const char*) *uBuf)));

        /* Read the flag before the converter is closed below */
        if (NULL == flag_ctx)
            *dropped_bytes = false;
        else
            *dropped_bytes = flag_ctx->flag;
    }

cleanup:
    /* Shared exit: release the encoding name, then the converter */
    if (NULL != enc_name)
        pfree((void *) enc_name);

    ucnv_close(conv);
    return status;
}
// Returns the minimum number of bytes per character for the converter that
// `address` refers to, or -1 when the address does not yield a converter.
static jint NativeConverter_getMinBytesPerChar(JNIEnv*, jclass, jlong address) {
    UConverter* converter = toUConverter(address);
    if (converter == NULL) {
        return -1;
    }
    return ucnv_getMinCharSize(converter);
}
/** Fetch information on an encoding
 *
 * Builds a named list with standards_n+7 entries: "Name.friendly",
 * "Name.ICU", one "Name.<standard>" entry per available standard, and then
 * "ASCII.subset", "Unicode.1to1", "CharSize.8bit", "CharSize.min" and
 * "CharSize.max".
 *
 * If the converter's canonical name cannot be obtained, "Name.ICU" is set
 * to NA, a warning is issued and no other entry is filled in.
 *
 * @param enc either NULL or "" for default encoding,
 *        or one string with encoding name
 * @return R list object with many components (see R doc for details)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_info(SEXP enc)
{
   const char* selected_enc = stri__prepare_arg_enc(enc, "enc", true/*default ok*/); /* this is R_alloc'ed */

   STRI__ERROR_HANDLER_BEGIN(0)
   StriUcnv uconv_obj(selected_enc);
   //uconv_obj.setCallBackSubstitute(); // restore default callbacks (no warning)
   UConverter* uconv = uconv_obj.getConverter(false);
   UErrorCode status = U_ZERO_ERROR;

   // get the list of available standards
   vector<const char*> standards = StriUcnv::getStandards();
   R_len_t standards_n = (R_len_t)standards.size();

   // alloc output list
   // layout: 2 leading name slots + one slot per standard + 5 trailing property slots
   SEXP vals;
   SEXP names;
   const int nval = standards_n+2+5;
   STRI__PROTECT(names = Rf_allocVector(STRSXP, nval));
   SET_STRING_ELT(names, 0, Rf_mkChar("Name.friendly"));
   SET_STRING_ELT(names, 1, Rf_mkChar("Name.ICU"));
   for (R_len_t i=0; i<standards_n; ++i) {
      // entries for NULL standards keep whatever Rf_allocVector put there
      if (standards[i])
         SET_STRING_ELT(names, i+2, Rf_mkChar((string("Name.")+standards[i]).c_str()));
   }
   SET_STRING_ELT(names, nval-5, Rf_mkChar("ASCII.subset"));
   SET_STRING_ELT(names, nval-4, Rf_mkChar("Unicode.1to1"));
   SET_STRING_ELT(names, nval-3, Rf_mkChar("CharSize.8bit"));
   SET_STRING_ELT(names, nval-2, Rf_mkChar("CharSize.min"));
   SET_STRING_ELT(names, nval-1, Rf_mkChar("CharSize.max"));

   STRI__PROTECT(vals = Rf_allocVector(VECSXP, nval));


   // get canonical (ICU) name
   status = U_ZERO_ERROR;
   const char* canname = ucnv_getName(uconv, &status);
   if (U_FAILURE(status) || !canname) {
      // without a canonical name nothing else is looked up; only slot 1 is set
      SET_VECTOR_ELT(vals, 1, Rf_ScalarString(NA_STRING));
      Rf_warning(MSG__ENC_ERROR_GETNAME);
   }
   else {
      SET_VECTOR_ELT(vals, 1, stri__make_character_vector_char_ptr(1, canname));

      // friendly name
      const char* frname = StriUcnv::getFriendlyName(canname);
      if (frname)  SET_VECTOR_ELT(vals, 0, stri__make_character_vector_char_ptr(1, frname));
      else         SET_VECTOR_ELT(vals, 0, Rf_ScalarString(NA_STRING));

      // has ASCII as its subset?
      SET_VECTOR_ELT(vals, nval-5, Rf_ScalarLogical((int)uconv_obj.hasASCIIsubset()));

      // min,max character size, is 8bit?
      // "8bit" here means every character takes exactly one byte
      int mincharsize = (int)ucnv_getMinCharSize(uconv);
      int maxcharsize = (int)ucnv_getMaxCharSize(uconv);
      int is8bit = (mincharsize==1 && maxcharsize == 1);
      SET_VECTOR_ELT(vals, nval-3, Rf_ScalarLogical(is8bit));
      SET_VECTOR_ELT(vals, nval-2, Rf_ScalarInteger(mincharsize));
      SET_VECTOR_ELT(vals, nval-1, Rf_ScalarInteger(maxcharsize));

      // is there a one-to-one correspondence with Unicode?
      // only queried for single-byte encodings; reported as NA otherwise
      if (!is8bit)
         SET_VECTOR_ELT(vals, nval-4, Rf_ScalarLogical(NA_LOGICAL));
      else
         SET_VECTOR_ELT(vals, nval-4, Rf_ScalarLogical((int)uconv_obj.is1to1Unicode()));

      // other standard names
      for (R_len_t i=0; i<standards_n; ++i) {
         if (!standards[i]) continue;

         // reset the status so an earlier failed lookup does not affect this one
         status = U_ZERO_ERROR;
         const char* stdname = ucnv_getStandardName(canname, standards[i], &status);
         if (U_FAILURE(status) || !stdname)
            SET_VECTOR_ELT(vals, i+2, Rf_ScalarString(NA_STRING));
         else
            SET_VECTOR_ELT(vals, i+2, stri__make_character_vector_char_ptr(1, stdname));
      }
   }
   Rf_setAttrib(vals, R_NamesSymbol, names);
   STRI__UNPROTECT_ALL
   return vals;

   STRI__ERROR_HANDLER_END({/* no special action on error */})
}
示例#13
0
// Convert everything readable from `in` from the source charset (m_from) to
// the destination charset (m_to) and write the result to `out`.
//
// Input is read in 16-byte chunks, converted to UChars and then to the
// destination charset; both steps loop while ICU reports a buffer overflow.
//
// If `st` is not NULL, a fresh status object is constructed in place at that
// address and its inputBytesRead / outputBytesWritten counters are updated.
//
// Throws exceptions::illegal_byte_sequence_for_charset on invalid, truncated
// or illegal characters, and exceptions::charset_conv_error on any other ICU
// failure.
void charsetConverter_icu::convert
	(utility::inputStream& in, utility::outputStream& out, status* st)
{
	UErrorCode err = U_ZERO_ERROR;

	// Start from a clean state in both converters
	ucnv_reset(m_from);
	ucnv_reset(m_to);

	// Placement-new: (re)initialize the caller's status object in place
	if (st)
		new (st) status();

	// From buffers
	byte_t cpInBuffer[16]; // stream data put here
	// NOTE: outSize is used below as an element count for the UChar vector
	const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar);
	std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here

	// To buffers
	// converted (char) data end up here
	const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize;
	std::vector <char> cpOutBuffer(cpOutBufferSz);

	// Tell ICU what to do when encountering an illegal byte sequence
	if (m_options.silentlyReplaceInvalidSequences)
	{
		// Set replacement chars for when converting from Unicode to codepage
		icu::UnicodeString substString(m_options.invalidSequence.c_str());
		ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);

		if (U_FAILURE(err))
			throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
	}
	else
	{
		// Tell ICU top stop (and return an error) on illegal byte sequences
		ucnv_setToUCallBack
			(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

		if (U_FAILURE(err))
			throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");

		ucnv_setFromUCallBack
			(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

		if (U_FAILURE(err))
			throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
	}

	// Input data available
	while (!in.eof())
	{
		// Read input data into buffer
		size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer));

		// Beginning of read data
		const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]);
		const char* sourceLimit = source + inLength; // end + 1

		UBool flush = in.eof();  // is this last run?

		UErrorCode toErr;

		// Loop until all source has been processed
		do
		{
			// Set up target pointers
			UChar* target = &uOutBuffer[0];
			UChar* targetLimit = &target[0] + outSize;

			toErr = U_ZERO_ERROR;
			ucnv_toUnicode(m_from, &target, targetLimit,
			               &source, sourceLimit, NULL, flush, &toErr);

			// NOTE(review): this adds the offset of `source` from the start of
			// cpInBuffer on every pass, so if this loop repeats after a buffer
			// overflow the bytes of earlier passes look like they are counted
			// again - confirm.
			if (st)
				st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0]));

			if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr))
			{
				if (toErr == U_INVALID_CHAR_FOUND ||
				    toErr == U_TRUNCATED_CHAR_FOUND ||
				    toErr == U_ILLEGAL_CHAR_FOUND)
				{
					// Error will be thrown later (*)
				}
				else
				{
					throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName());
				}
			}

			// The Unicode source is the buffer just written and the limit
			// is where the previous conversion stopped (target is moved in the conversion)
			const UChar* uSource = &uOutBuffer[0];
			UChar* uSourceLimit = &target[0];
			UErrorCode fromErr;

			// Loop until converted chars are fully written
			do
			{
				// The output buffer is reused from its start on every pass
				char* cpTarget = &cpOutBuffer[0];
				const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz;

				fromErr = U_ZERO_ERROR;

				// Write converted bytes (Unicode) to destination codepage
				ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
				                 &uSource, uSourceLimit, NULL, flush, &fromErr);

				if (st)
				{
					// Decrement input bytes count by the number of input bytes in error
					char errBytes[16];
					int8_t errBytesLen = sizeof(errBytes);
					UErrorCode errBytesErr = U_ZERO_ERROR;

	 				ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr);

					st->inputBytesRead -= errBytesLen;
					st->outputBytesWritten += cpTarget - &cpOutBuffer[0];
				}

				// (*) If an error occurred while converting from input charset, throw it now
				// (this happens after the status counters were updated, but before
				// the bytes of this pass are written to `out`)
				if (toErr == U_INVALID_CHAR_FOUND ||
				    toErr == U_TRUNCATED_CHAR_FOUND ||
				    toErr == U_ILLEGAL_CHAR_FOUND)
				{
					throw exceptions::illegal_byte_sequence_for_charset();
				}

				if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
				{
					if (fromErr == U_INVALID_CHAR_FOUND ||
					    fromErr == U_TRUNCATED_CHAR_FOUND ||
					    fromErr == U_ILLEGAL_CHAR_FOUND)
					{
						throw exceptions::illegal_byte_sequence_for_charset();
					}
					else
					{
						throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName());
					}
				}

				// Write to destination stream
				out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0]));

			} while (fromErr == U_BUFFER_OVERFLOW_ERROR);

		} while (toErr == U_BUFFER_OVERFLOW_ERROR);
	}
}
示例#14
0
void charsetFilteredOutputStream_icu::flush()
{
	if (m_from == NULL || m_to == NULL)
		throw exceptions::charset_conv_error("Cannot initialize converters.");

	// Allocate buffer for Unicode chars
	const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar);
	std::vector <UChar> uniBuffer(uniSize);

	// Conversion loop (with flushing)
	UErrorCode toErr = U_ZERO_ERROR;

	const char* uniSource = 0;
	const char* uniSourceLimit = 0;

	do
	{
		// Convert from source charset to Unicode
		UChar* uniTarget = &uniBuffer[0];
		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;

		toErr = U_ZERO_ERROR;

		ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
		               &uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr);

		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
		{
			throw exceptions::charset_conv_error
				("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
		}

		const size_t uniLength = uniTarget - &uniBuffer[0];

		// Allocate buffer for destination charset
		const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength;
		std::vector <char> cpBuffer(cpSize);

		// Convert from Unicode to destination charset
		UErrorCode fromErr = U_ZERO_ERROR;

		const UChar* cpSource = &uniBuffer[0];
		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;

		do
		{
			char* cpTarget = &cpBuffer[0];
			char* cpTargetLimit = &cpBuffer[0] + cpSize;

			fromErr = U_ZERO_ERROR;

			ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
							 &cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr);

			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
			{
				throw exceptions::charset_conv_error
					("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
			}

			const size_t cpLength = cpTarget - &cpBuffer[0];

			// Write successfully converted bytes
			m_stream.write(&cpBuffer[0], cpLength);

		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);

	} while (toErr == U_BUFFER_OVERFLOW_ERROR);

	m_stream.flush();
}
示例#15
0
// Converts 'count' bytes starting at 'data' from the source charset to the
// destination charset and writes the converted bytes to m_stream.
//
// The bytes are first decoded to Unicode with m_from, then encoded with
// m_to. Neither converter is flushed here (flush flag is FALSE).
//
// Throws exceptions::charset_conv_error when a converter is missing or on a
// generic ICU failure, and exceptions::illegal_byte_sequence_for_charset
// when ICU reports U_INVALID_CHAR_FOUND, U_TRUNCATED_CHAR_FOUND or
// U_ILLEGAL_CHAR_FOUND.
void charsetFilteredOutputStream_icu::writeImpl
	(const byte_t* const data, const size_t count)
{
	if (m_from == NULL || m_to == NULL)
		throw exceptions::charset_conv_error("Cannot initialize converters.");

	// Nothing to convert; this also keeps the Unicode buffer below from
	// being empty (an empty vector must not be indexed)
	if (count == 0)
		return;

	// Lower bound for the destination buffer, so that it is never empty
	const size_t minCpSize = 64;

	// Allocate buffer for Unicode chars
	const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar);
	std::vector <UChar> uniBuffer(uniSize);

	// Conversion loop
	UErrorCode toErr = U_ZERO_ERROR;

	const char* uniSource = reinterpret_cast <const char*>(data);
	const char* uniSourceLimit = uniSource + count;

	do
	{
		// Convert from source charset to Unicode
		UChar* uniTarget = &uniBuffer[0];
		UChar* uniTargetLimit = &uniBuffer[0] + uniSize;

		toErr = U_ZERO_ERROR;

		ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit,
		               &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr);

		if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR)
		{
			if (toErr == U_INVALID_CHAR_FOUND ||
			    toErr == U_TRUNCATED_CHAR_FOUND ||
			    toErr == U_ILLEGAL_CHAR_FOUND)
			{
				throw exceptions::illegal_byte_sequence_for_charset();
			}
			else
			{
				throw exceptions::charset_conv_error
					("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'.");
			}
		}

		const size_t uniLength = uniTarget - &uniBuffer[0];

		// Allocate buffer for destination charset.
		// No Unicode char may have been produced at all (uniLength == 0),
		// which would give an empty buffer that must not be indexed and
		// that leaves the overflow loop below no room to make progress.
		// Hence the lower bound.
		const size_t cpEstimate = ucnv_getMinCharSize(m_to) * uniLength;
		const size_t cpSize = (cpEstimate < minCpSize) ? minCpSize : cpEstimate;
		std::vector <char> cpBuffer(cpSize);

		// Convert from Unicode to destination charset
		UErrorCode fromErr = U_ZERO_ERROR;

		const UChar* cpSource = &uniBuffer[0];
		const UChar* cpSourceLimit = &uniBuffer[0] + uniLength;

		do
		{
			// The output buffer is reused from its start on every pass;
			// cpSource keeps advancing through the Unicode chars
			char* cpTarget = &cpBuffer[0];
			char* cpTargetLimit = &cpBuffer[0] + cpSize;

			fromErr = U_ZERO_ERROR;

			ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit,
							 &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr);

			if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr))
			{
				if (fromErr == U_INVALID_CHAR_FOUND ||
				    fromErr == U_TRUNCATED_CHAR_FOUND ||
				    fromErr == U_ILLEGAL_CHAR_FOUND)
				{
					throw exceptions::illegal_byte_sequence_for_charset();
				}
				else
				{
					throw exceptions::charset_conv_error
						("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'.");
				}
			}

			const size_t cpLength = cpTarget - &cpBuffer[0];

			// Write successfully converted bytes
			m_stream.write(&cpBuffer[0], cpLength);

		} while (fromErr == U_BUFFER_OVERFLOW_ERROR);

	} while (toErr == U_BUFFER_OVERFLOW_ERROR);
}
示例#16
0
文件: ustdio.c 项目: gitpan/ponie
/*
 * Private function used for buffering input.
 *
 * UChars not yet consumed (between f->fUCPos and f->fUCLimit) are moved to
 * the front of f->fUCBuffer, a chunk of bytes is read from f->fFile into
 * f->fCharBuffer, and those bytes are converted and appended after the kept
 * UChars: through f->fConverter when there is one, otherwise through the
 * invariant conversion.  On return f->fUCPos points at the start of
 * f->fUCBuffer and f->fUCLimit just past the last UChar written.
 */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode   convStatus;
    const char  *src;
    const char  *srcLimit;
    UChar       *dst;
    UChar       *dstLimit;
    int32_t      pending;       /* UChars still waiting to be consumed */
    int32_t      room;          /* UChars that still fit in the buffer */
    int32_t      bytesPerUnit;  /* divisor used to size the read */
    int32_t      maxBytes;
    int32_t      got;           /* bytes actually read from the file */

    /* keep what has not been consumed yet by sliding it to the front */
    pending = (int32_t)(f->fUCLimit - f->fUCPos);
    if(pending != 0) {
        memmove(f->fUCBuffer, f->fUCPos, pending * sizeof(UChar));
    }

    /* space left behind the kept data */
    room = UFILE_UCHARBUFFER_SIZE - pending;

    /* number of codepage bytes to ask for */
    /* weiv: if converter is NULL, we use invariant converter with charwidth = 1 */
    if(f->fConverter != NULL) {
        bytesPerUnit = 2 * ucnv_getMinCharSize(f->fConverter);
    } else {
        bytesPerUnit = 1;
    }
    maxBytes = room / bytesPerUnit;

    /* read the raw bytes, never more than the char buffer can hold */
    got = (int32_t)fread(f->fCharBuffer,
                         sizeof(char),
                         ufmt_min(maxBytes, UFILE_CHARBUFFER_SIZE),
                         f->fFile);

    /* conversion window: source is what was just read, target follows the kept data */
    convStatus = U_ZERO_ERROR;
    src        = f->fCharBuffer;
    srcLimit   = f->fCharBuffer + got;
    dst        = f->fUCBuffer + pending;
    dstLimit   = f->fUCBuffer + UFILE_UCHARBUFFER_SIZE;

    if(f->fConverter == NULL) {
        /* weiv: do the invariant conversion */
        u_charsToUChars(src, dst, got);
        dst += got;
    } else {
        /* a converter is available: the flush flag is set once the file is at EOF */
        ucnv_toUnicode(f->fConverter,
                       &dst,
                       dstLimit,
                       &src,
                       srcLimit,
                       NULL,
                       (UBool)(feof(f->fFile) != 0),
                       &convStatus);
    }

    /* publish the new read window */
    f->fUCPos   = f->fUCBuffer;
    f->fUCLimit = dst;
}
// Returns the mean of the converter's maximum and minimum character sizes
// as reported by ICU, or -1 when 'address' does not yield a converter.
static jfloat NativeConverter_getAveBytesPerChar(JNIEnv*, jclass, jlong address) {
    UConverter* converter = toUConverter(address);
    if (converter == NULL) {
        return -1;
    }
    const int maxBytes = ucnv_getMaxCharSize(converter);
    const int minBytes = ucnv_getMinCharSize(converter);
    // Divide by a double so the half is not truncated.
    return (maxBytes + minBytes) / 2.0;
}