Пример #1
0
/**
 * Converts a NUL-terminated char string to a heap-allocated, NUL-terminated
 * UChar string using the converter returned by ucnv_open(NULL, ...).
 *
 * The result is stored in the `str` member of a malloc'ed StringStruct
 * (declared outside this function). That struct is pushed onto the singly
 * linked list whose head is kept in *freeHook, so the caller can release all
 * converted strings later by walking the `link` pointers.
 *
 * @param src       Source string; NULL yields NULL.
 * @param freeHook  In/out head of the allocation list; must not be NULL.
 * @return Pointer to the converted string, or NULL if the converter cannot be
 *         opened, memory cannot be allocated, or the conversion fails.
 */
static UChar* toUChar(const char *src, void **freeHook) {
    int32_t    numUChars;
    int32_t    destSize;
    UChar      stackBuf[2000 + sizeof(void *)/sizeof(UChar)];
    StringStruct  *dest;
    UConverter *cnv;

    UErrorCode status = U_ZERO_ERROR;
    if (src == NULL) {
        return NULL;
    }

    cnv = ucnv_open(NULL, &status);
    if(U_FAILURE(status) || cnv == NULL) {
        return NULL;
    }
    ucnv_reset(cnv);

    /* First attempt goes into the stack buffer; it also tells us the length. */
    numUChars = ucnv_toUChars(cnv,
                  stackBuf,
                  2000,
                  src, -1,
                  &status);

    destSize = (numUChars+1) * sizeof(UChar) + sizeof(struct StringStruct);
    dest = (StringStruct *)malloc(destSize);
    if (dest != NULL) {
        if (status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
            /* The stack buffer was too small. ICU functions return immediately
             * when handed a failing status, so it must be cleared before the
             * retry, otherwise dest->str would be left uninitialized. */
            status = U_ZERO_ERROR;
            ucnv_toUChars(cnv, dest->str, numUChars+1, src, -1, &status);
            if (U_FAILURE(status)) {
                free(dest);
                dest = NULL;
            }
        } else if (status == U_ZERO_ERROR) {
            u_strcpy(dest->str, stackBuf);
        } else {
            free(dest);
            dest = NULL;
        }
    }

    ucnv_reset(cnv); /* be good citizens */
    ucnv_close(cnv);
    if (dest == NULL) {
        return NULL;
    }

    /* Push the new allocation onto the caller's free list. */
    dest->link = (StringStruct*)(*freeHook);
    *freeHook = dest;
    return dest->str;
}
Пример #2
0
	/**
	 * Converts a run of TCHARs into an ICU UnicodeString through ICUConverter.
	 *
	 * The destination's internal buffer is used as scratch space: it is opened
	 * with getBuffer(), filled by ucnv_toUChars(), then released with the
	 * converted length.
	 *
	 * @param Source               Source characters.
	 * @param SourceStartIndex     Index of the first TCHAR to convert.
	 * @param SourceLen            Number of TCHARs to convert; values <= 0 clear Destination.
	 * @param Destination          Receives the converted text.
	 * @param ShouldNullTerminate  When true, a 0 unit is written just past the converted text.
	 */
	void FStringConverter::ConvertString(const TCHAR* Source, const int32 SourceStartIndex, const int32 SourceLen, icu::UnicodeString& Destination, const bool ShouldNullTerminate)
	{
		if (SourceLen > 0)
		{
			UErrorCode ICUStatus = U_ZERO_ERROR;

			ucnv_reset(ICUConverter);

			// Get the internal buffer of the string, we're going to use it as scratch space.
			// One extra unit is requested when a terminator is wanted, so that writing it
			// stays in bounds even if the conversion fills all DestinationCapacityUChars.
			const int32_t DestinationCapacityUChars = SourceLen * 2;
			UChar* InternalStringBuffer = Destination.getBuffer(DestinationCapacityUChars + (ShouldNullTerminate ? 1 : 0));

			// Perform the conversion into the string buffer
			const int32_t SourceSizeBytes = SourceLen * sizeof(TCHAR);
			const int32_t DestinationLength = ucnv_toUChars(ICUConverter, InternalStringBuffer, DestinationCapacityUChars, reinterpret_cast<const char*>(Source + SourceStartIndex), SourceSizeBytes, &ICUStatus);

			// Optionally null terminate the string. On failure DestinationLength may exceed
			// the capacity (overflow), so the index is only trusted after a successful call.
			if (ShouldNullTerminate && InternalStringBuffer != nullptr && U_SUCCESS(ICUStatus))
			{
				InternalStringBuffer[DestinationLength] = 0;
			}

			// Size it back down to the correct size and release our lock on the string buffer
			Destination.releaseBuffer(DestinationLength);

			check(U_SUCCESS(ICUStatus));
		}
		else
		{
			Destination.remove();
		}
	}
Пример #3
0
/* params : desc : the document descriptor
 *          buf  : destination buffer for UTF-16 data; the conversion may
 *                 write up to 2*INTERNAL_BUFSIZE UChars into it
 * return : the length of the paragraph (twice the number of UChars produced)
 *          the value returned by getText() unchanged when it is <= 0
 *          (per the original notes: NO_MORE_DATA if there is no more
 *          paragraph, ERR_STREAMFILE if an error occurred)
 *          ERR_ICU if the conversion fails or the scratch buffer cannot be
 *          allocated (no dedicated out-of-memory code is visible here)
 *
 * reads the next paragraph and converts to UTF-16
 */
int p_read_content(struct doc_descriptor *desc, UChar *buf) {
    char *outputbuf;
    int len;
    UErrorCode err;

    /* Zero-filled scratch buffer with one spare byte that getText() is never
     * told about, so strlen() below always finds a terminator even when
     * getText() fills all INTERNAL_BUFSIZE bytes. */
    outputbuf = (char *) calloc(INTERNAL_BUFSIZE + 1, 1);
    if (outputbuf == NULL) {
        fprintf(stderr, "Unable to allocate buffer\n");
        return ERR_ICU;
    }

    /* reading the next paragraph */
    len = getText(desc, outputbuf, INTERNAL_BUFSIZE);

    if (len > 0) {
        (desc->nb_par_read) += 1;

        /* converting to UTF-16 */
        err = U_ZERO_ERROR;
        len = 2 * ucnv_toUChars(desc->conv, buf, 2*INTERNAL_BUFSIZE,
                                outputbuf, strlen(outputbuf), &err);
        if (U_FAILURE(err)) {
            free(outputbuf);
            fprintf(stderr, "Unable to convert buffer\n");
            return ERR_ICU;
        }
    }

    free(outputbuf);
    return len;
}
Пример #4
0
  /**
   * Converts a byte sequence to UChars with iv_uconverter.
   *
   * @param pclTarget          Destination buffer.
   * @param uiTargetMaxLength  Capacity of pclTarget, passed to ucnv_toUChars.
   * @param cpacSource         Source bytes.
   * @param uiSourceLength     Number of source bytes.
   * @return The value returned by ucnv_toUChars. A buffer-overflow status is
   *         tolerated and the value is returned unchanged in that case.
   * @throws CodePageConversionException for any other failing ICU status.
   */
  size_t CodePage2UnicodeConverter::convertToUnicode(UChar * pclTarget,
      size_t uiTargetMaxLength,
      const char * cpacSource,
      size_t uiSourceLength)
  {
    assert(iv_uconverter !=NULL);

    UErrorCode err = static_cast<UErrorCode>(0);
    const size_t uiTargetSize = ucnv_toUChars(iv_uconverter, pclTarget, uiTargetMaxLength, cpacSource, uiSourceLength, &err);

    // Overflow is the only failing status that is not treated as an error.
    const bool bOverflowOnly = (err == U_BUFFER_OVERFLOW_ERROR);
    if (!bOverflowOnly && !U_SUCCESS(err)) {
      cout << "ERROR: convertToUnicode " << err << endl;
      ErrorMessage errMsg = ErrorMessage(UIMA_MSG_ID_CODEPAGE_CONV_ERROR);
      errMsg.addParam(err);
      UIMA_EXC_THROW_NEW(CodePageConversionException,
                         UIMA_ERR_CODEPAGE,
                         errMsg,
                         UIMA_MSG_ID_CODEPAGE_CONV_ERROR,
                         ErrorInfo::unrecoverable);
    }

    // The count is in UChars, not bytes.
    return uiTargetSize;
  }
Пример #5
0
/**
 * Converts `len` bytes of `name` from the encoding of `conv` to a
 * NUL-terminated string produced by the file-level `utf8Conv` converter.
 *
 * The source is first converted to UTF-16, then from UTF-16 to the target.
 * Both buffers are sized from what ICU reports, so neither conversion can
 * write past its buffer.
 *
 * @param conv  Converter for the source encoding.
 * @param name  Source bytes.
 * @param len   Number of source bytes (passed to ICU unchanged).
 * @return A new[]-allocated, NUL-terminated string owned by the caller
 *         (release with delete[]). An empty string is returned when either
 *         conversion fails.
 */
static char*
convertToUtf8(UConverter* conv, const unsigned char* name, int len)
{
    const char* src = reinterpret_cast<const char*>(name);
    char* result = NULL;
    UErrorCode status = U_ZERO_ERROR;

    // Preflight: ask ICU how many UTF-16 units the source needs.
    const int32_t utf16Len = ucnv_toUChars(conv, NULL, 0, src, len, &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        status = U_ZERO_ERROR;      // expected outcome of a preflight call
    }

    if (U_SUCCESS(status)) {
        UChar* utf16 = new UChar[utf16Len + 1];
        ucnv_toUChars(conv, utf16, utf16Len + 1, src, len, &status);

        if (U_SUCCESS(status)) {
            // Worst-case output size for the target converter, plus a terminator.
            const int32_t capacity =
                UCNV_GET_MAX_BYTES_FOR_STRING(utf16Len, ucnv_getMaxCharSize(utf8Conv)) + 1;
            result = new char[capacity];
            const int32_t outLen =
                ucnv_fromUChars(utf8Conv, result, capacity, utf16, utf16Len, &status);
            if (U_SUCCESS(status) && outLen < capacity) {
                result[outLen] = 0;
            } else {
                delete[] result;
                result = NULL;
            }
        }
        delete[] utf16;
    }

    if (result == NULL) {
        // Keep the "always returns a string" contract of the original code.
        result = new char[1];
        result[0] = 0;
    }
    return result;
}
Пример #6
0
/**
 * Sample 12: converts a short Shift-JIS byte string to UChars with the
 * one-shot ucnv_toUChars() call, then prints the source bytes and the
 * converted UChars.
 *
 * @return Always U_ZERO_ERROR; problems are reported through assert() and
 *         U_ASSERT rather than through the return value.
 */
UErrorCode convsample_12()
{
  printf("\n\n==============================================\n"
         "Sample 12: C: simple sjis -> unicode conversion\n");

  // **************************** START SAMPLE *******************

  char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 };
  UChar target[100];
  UErrorCode status = U_ZERO_ERROR;

  // Open a converter for the source encoding.
  UConverter *cnv = ucnv_open("shift_jis", &status);
  assert(U_SUCCESS(status));

  // strlen() is fine here: the source is an 8-bit, NUL-terminated codepage string.
  const size_t sourceLen = strlen(source);

  // Sentinel placed before converting (presumably to make the terminator
  // written by ICU visible when inspecting the buffer).
  target[6] = 0xFDCA;
  const int32_t targetLen = ucnv_toUChars(cnv, target, 100, source, sourceLen, &status);
  U_ASSERT(status);

  // Done with the converter.
  ucnv_close(cnv);

  // ***************************** END SAMPLE ********************

  // Show both sides of the conversion.
  printBytes("src", source, strlen(source) );
  printf("\n");
  printUChars("targ", target, targetLen);

  return U_ZERO_ERROR;
}
Пример #7
0
/**
 * Converts the raw input held by textIn to UChars, using a converter opened
 * for the name returned by getName().
 *
 * @param buf     Destination buffer.
 * @param cap     Capacity of buf, passed to ucnv_toUChars.
 * @param status  ICU error code, threaded through both ICU calls; no check
 *                is made between opening the converter and converting.
 * @return The value returned by ucnv_toUChars.
 */
int32_t CharsetMatch::getUChars(UChar *buf, int32_t cap, UErrorCode *status) const
{
    UConverter *cnv = ucnv_open(getName(), status);

    const char *rawBytes = (const char *) textIn->fRawInput;
    const int32_t converted = ucnv_toUChars(cnv, buf, cap, rawBytes, textIn->fRawLength, status);

    ucnv_close(cnv);
    return converted;
}
Пример #8
0
Файл: icu.cpp Проект: fdbzn/hhvm
/**
 * Converts a UTF-8 byte range to a String whose payload is raw UTF-16 code
 * units (size is set in bytes: units * sizeof(UChar)).
 *
 * The conversion is done in two passes with the request's UTF-8 converter:
 * a preflight call to learn the length, then the real conversion into a
 * buffer of exactly that length plus a terminator.
 *
 * @param u8      Source bytes.
 * @param u8_len  Number of source bytes.
 * @param error   Reset to U_ZERO_ERROR on entry; holds the ICU status on return.
 * @return The converted String, or uninit_null() if ICU reports a failure.
 *         Empty input now yields an empty String rather than null.
 */
String u16(const char *u8, int32_t u8_len, UErrorCode &error) {
  error = U_ZERO_ERROR;
  int32_t outlen = ucnv_toUChars(s_intl_request->utf8(),
                                 nullptr, 0, u8, u8_len, &error);
  // A preflight normally ends in U_BUFFER_OVERFLOW_ERROR. When the output is
  // empty ICU only raises a warning instead, which is not a failure and must
  // not be reported as one.
  if (error != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(error)) {
    return uninit_null();
  }
  String ret = String(sizeof(UChar) * (outlen + 1), ReserveString);
  UChar *out = (UChar*)ret->mutableData();
  error = U_ZERO_ERROR;
  outlen = ucnv_toUChars(s_intl_request->utf8(),
                         out, outlen + 1, u8, u8_len, &error);
  if (U_FAILURE(error)) {
    return uninit_null();
  }
  ret.setSize(outlen * sizeof(UChar));
  return ret;
}
		size_t StringCharsetConverter::getConvertedLengthToUTF16(const char *str, size_t length)
		{
			UErrorCode err = U_ZERO_ERROR;
			int32_t size = ucnv_toUChars(cv, nullptr, 0, str, static_cast<int32_t>(length), &err);
			if(err != U_ZERO_ERROR && err != U_BUFFER_OVERFLOW_ERROR && err != U_STRING_NOT_TERMINATED_WARNING)
				return static_cast<size_t>(-1);
			if(length == numeric_limits<size_t>::max())
				++ size; // Size must includes null character.
			return static_cast<size_t>(size);
		}
Пример #10
0
/*
 * Round-trips one string through BOCU-1 with both the ICU converter and the
 * reference implementation (writeString/readString) and logs an error at
 * the first point where the two disagree, or where either disagrees with
 * the original text.
 *
 * bocu1  - ICU converter used for both directions
 * number - identifies the test string in log messages
 * text   - the UChar string under test
 * length - number of UChars in text
 */
static void
roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
    UChar *refUnicode, *icuUnicode;
    char *refBytes, *icuBytes;

    int32_t refByteCount, icuByteCount, refUnicodeCount, icuUnicodeCount;
    UErrorCode errorCode;

    refUnicode = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    icuUnicode = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    refBytes = malloc(DEFAULT_BUFFER_SIZE);
    icuBytes = malloc(DEFAULT_BUFFER_SIZE);

    /* Single-pass block: "break" leaves it for the shared cleanup below. */
    do {
        /* Unicode -> BOCU-1, reference first, then ICU */
        refByteCount=writeString(text, length, (uint8_t *)refBytes);

        errorCode=U_ZERO_ERROR;
        icuByteCount=ucnv_fromUChars(bocu1, icuBytes, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
            break;
        }

        /* both encoders must agree byte for byte */
        if(refByteCount!=icuByteCount || uprv_memcmp(refBytes, icuBytes, refByteCount)!=0) {
            log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, refByteCount, icuByteCount);
            break;
        }

        /* BOCU-1 -> Unicode, reference first, then ICU */
        refUnicodeCount=readString((uint8_t *)refBytes, refByteCount, refUnicode);
        if(refUnicodeCount<0) {
            break; /* readString() found an error and reported it */
        }

        icuUnicodeCount=ucnv_toUChars(bocu1, icuUnicode, DEFAULT_BUFFER_SIZE, icuBytes, icuByteCount, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
            break;
        }

        /* the reference decoder must reproduce the original text */
        if(length!=refUnicodeCount || u_memcmp(text, refUnicode, length)!=0) {
            log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, refUnicodeCount);
            break;
        }
        /* and the ICU decoder must match the reference decoder */
        if(refUnicodeCount!=icuUnicodeCount || u_memcmp(refUnicode, icuUnicode, refUnicodeCount)!=0) {
            log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, refUnicodeCount, icuUnicodeCount);
            break;
        }
    } while(0);

    free(refUnicode);
    free(icuUnicode);
    free(refBytes);
    free(icuBytes);
}
Пример #11
0
/**
 * Builds a 0-terminated UChar copy of a UTF-8 string in m_utf16.
 *
 * @param src     Source string.
 * @param length  Number of bytes to convert; 0 means "use strlen(src)".
 *
 * The length is obtained with a preflight call, then the text is converted
 * into a buffer of that length plus one terminator. Every ICU status is
 * passed through CHECK_ICU, including a failing preflight, and empty input
 * produces an empty, terminated buffer instead of leaving m_utf16 unset.
 */
FromUTF8::FromUTF8(Char const* src, UInt length)
{
    UErrorCode err = U_ZERO_ERROR;
    UConverter *conv = ucnv_open("utf8", &err);
    CHECK_ICU(err);
    ON_BLOCK_EXIT(ucnv_close, conv);

    if (!length)
        length = static_cast<UInt>(strlen(src));

    // Preflight: overflow is the expected answer to "how long is the result?".
    int32_t utf16_len = ucnv_toUChars(conv, 0, 0, src, length, &err);
    if(err==U_BUFFER_OVERFLOW_ERROR)
        err = U_ZERO_ERROR;
    CHECK_ICU(err);

    err = U_ZERO_ERROR;
    m_utf16.reset(new UChar[utf16_len+1]);
    // Pass the real capacity so ICU has room for the terminator as well.
    ucnv_toUChars(conv, m_utf16.get(), utf16_len+1, src, length, &err);
    CHECK_ICU(err);
    m_utf16[utf16_len]=0;
}
		/**
		 * Converts `fromString` to UChars in `toString` using the converter `cv`
		 * and removes a leading byte order mark (U+FEFF) from the output.
		 *
		 * @param toString        Destination buffer; may be null together with a
		 *                        zero toBufferLength to only measure.
		 * @param toBufferLength  Capacity of toString in characters.
		 * @param fromString      Source bytes.
		 * @param fromLength      Number of source bytes. When it equals the
		 *                        maximum size_t value the returned size is
		 *                        increased by one to include the null character.
		 * @return The size reported by ICU (which can exceed toBufferLength when
		 *         the buffer was too small), minus one if a byte order mark was
		 *         removed; static_cast<size_t>(-1) on any status other than
		 *         U_ZERO_ERROR, U_BUFFER_OVERFLOW_ERROR or
		 *         U_STRING_NOT_TERMINATED_WARNING.
		 */
		size_t StringCharsetConverter::convertToUTF16(wchar_t *toString, size_t toBufferLength, const char *fromString, size_t fromLength)
		{
			UErrorCode err = U_ZERO_ERROR;
			int32_t size = ucnv_toUChars(cv, toString, static_cast<int32_t>(toBufferLength), fromString, static_cast<int32_t>(fromLength), &err);
			if(err != U_ZERO_ERROR && err != U_BUFFER_OVERFLOW_ERROR && err != U_STRING_NOT_TERMINATED_WARNING)
				return static_cast<size_t>(-1);
			if(fromLength == numeric_limits<size_t>::max())
				++ size; // Size must include the null character.

			// Remove the byte order mark. The buffer is only inspected when it
			// exists and holds at least one character, and only the characters
			// that actually fit in it are shifted: on overflow `size` is larger
			// than what was stored.
			if(size > 0 && toString != nullptr && toBufferLength > 0 && toString[0] == L'\uFEFF')
			{
				const size_t reported = static_cast<size_t>(size);
				const size_t stored = reported < toBufferLength ? reported : toBufferLength;
				memmove(toString, toString + 1, (stored - 1) * sizeof(wchar_t));
				-- size;
			}
			return static_cast<size_t>(size);
		}
Пример #13
0
/*
 * Opens desc->conv for the text file behind desc->fd.
 *
 * If the file starts with a Unicode signature (BOM), the converter for that
 * encoding is opened and the file offset is left just after the signature.
 * Otherwise a UTF-8 converter is opened, switched to "stop on invalid input",
 * and the first block of the file is test-converted; the file offset is then
 * rewound to 0.
 *
 * return : OK on success
 *          ERR_ICU if a converter cannot be opened or configured, or if the
 *          first block is not valid for the UTF-8 converter
 */
int initTxt(struct doc_descriptor *desc) {
  UErrorCode err;
  const char *encoding = NULL;
  int len, BOMlength = 0;
  char buf[BUFSIZE];
  UChar outbuf[4*BUFSIZE];


  lseek(desc->fd, 0, SEEK_SET);
  len = read(desc->fd, buf, BUFSIZE);
  if (len < 0) {
    /* read failed: treat the file as empty so that no uninitialized bytes
       are inspected and ICU is never handed a negative length (which it
       would interpret as "NUL-terminated") */
    len = 0;
  }

  /* detect BOM, looking only at the bytes that were actually read */
  err = U_ZERO_ERROR;
  encoding = ucnv_detectUnicodeSignature(buf, len, &BOMlength, &err);
  if(encoding != NULL) {
    lseek(desc->fd, BOMlength, SEEK_SET);

    /* initialize converter to encoding */
    err = U_ZERO_ERROR;
    desc->conv = ucnv_open(encoding, &err);
    if (U_FAILURE(err)) {
      fprintf(stderr, "unable to open ICU converter\n");
      return ERR_ICU;
    }

  } else {
    /* initialize converter to UTF-8 */
    err = U_ZERO_ERROR;
    desc->conv = ucnv_open("utf8", &err);
    if (U_FAILURE(err)) {
      fprintf(stderr, "unable to open ICU converter\n");
      return ERR_ICU;
    }

    /* check the first block: with the STOP callback any invalid sequence
       makes the conversion fail instead of being substituted */
    err = U_ZERO_ERROR;
    ucnv_setToUCallBack(desc->conv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &err);
    if (U_FAILURE(err)) {
      fprintf(stderr, "error setToUCallback\n");
      return ERR_ICU;
    }
    err = U_ZERO_ERROR;
    ucnv_toUChars(desc->conv, outbuf, 4 * BUFSIZE, buf, len, &err);
    if (U_FAILURE(err)) {
      fprintf(stderr, "Unknown encoding\n");
      return ERR_ICU;
    }
    lseek(desc->fd, 0, SEEK_SET);
  }

  return OK;
}
Пример #14
0
/*----------------------------------------------------------------------------------------------
	This method uses an ICU converter to convert a string from UTF-8 to UTF-16.

	Assumptions:
		<text>

	Exit conditions:
		<text>

	Parameters:
		<text>

	Return value:
		The number of characters required to store the fully-converted string
			(which may be greater than targetLen)
----------------------------------------------------------------------------------------------*/
int UnicodeConverter::Convert(const char* source, int sourceLen,
	UChar* target, int targetLen)
{
	UErrorCode status = U_ZERO_ERROR;

	int spaceRequiredForData = ucnv_toUChars(m_converter,
		target, targetLen, source, sourceLen, &status);

	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
	{
		TRACE("Unable to convert from UTF-8 to UTF-16 (" << status << ")\n");
		//throw std::runtime_error("Unable to convert from UTF-8 to UTF-16");
	}

	return spaceRequiredForData;
}
Пример #15
0
/**
 * Convert from an existing encoding to UTF-16
 * @param src the data read from the file
 * @param srclen the length of src in bytes
 * @param dst the destination buffer
 * @param dstlen the length of the destination in UChars
 * @param charset the charset of the src
 * @return the number of BYTES written to dst; never more than
 * dstlen*sizeof(UChar), and 0 if no converter could be opened
 */
int convert_from_encoding( char *src, int srclen, UChar *dst, 
    int dstlen, char *charset )
{
    UConverter *conv = NULL;
    UErrorCode status = U_ZERO_ERROR;
    int32_t len=0;

    conv = ucnv_open( charset, &status );
    /* ucnv_open can succeed with a warning status; U_SUCCESS accepts that,
       so the converter is used and closed instead of being leaked */
    if ( U_SUCCESS(status) )
    {
        status = U_ZERO_ERROR;
        len = ucnv_toUChars( conv, dst, dstlen, src, srclen, &status );
        if ( status != U_ZERO_ERROR )
            fprintf(stderr,"encoding: %s\n",u_errorName(status));
        /* on overflow ICU reports the length it would have needed; only
           dstlen UChars were actually stored */
        if ( len > dstlen )
            len = dstlen;
        len *= sizeof(UChar);
        ucnv_close(conv);
    }
    return len;
}
Пример #16
0
/*
 * Checks whether the given bytes are valid in the given encoding.
 *
 * Returns TRUE for an empty string. Without ICU (NO_ICU defined, or
 * dontUseIcu set at run time) the work is delegated to CodeSetOld_Validate.
 * Otherwise the input is run through a measuring call of ucnv_toUChars with
 * the "stop on invalid input" callback, and TRUE is returned when that call
 * ends in U_BUFFER_OVERFLOW_ERROR.
 */
Bool
CodeSet_Validate(const char *buf,   // IN: the string
                 size_t size,	    // IN: length of string
                 const char *code)  // IN: encoding
{
#if defined(NO_ICU)
   return CodeSetOld_Validate(buf, size, code);
#else
   UConverter *cv;
   UErrorCode uerr;

   // ucnv_toUChars takes 32-bit int size
   ASSERT_NOT_IMPLEMENTED(size <= (size_t) MAX_INT32);

   if (size == 0) {
      return TRUE;
   }

   /*
    * Fallback if necessary.
    */

   if (dontUseIcu) {
      return CodeSetOld_Validate(buf, size, code);
   }

   /*
    * Calling ucnv_toUChars() this way is the idiom to precompute
    * the length of the output.  (See preflighting in the ICU User Guide.)
    * So if the error is not U_BUFFER_OVERFLOW_ERROR, then the input
    * is bad.
    *
    * The setup calls are checked with U_SUCCESS() rather than against
    * U_ZERO_ERROR: ICU may report a warning status for a call that
    * succeeded, and that must not trip the assertion.
    */

   uerr = U_ZERO_ERROR;
   cv = ucnv_open(code, &uerr);
   ASSERT_NOT_IMPLEMENTED(U_SUCCESS(uerr));
   ucnv_setToUCallBack(cv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &uerr);
   ASSERT_NOT_IMPLEMENTED(U_SUCCESS(uerr));

   /* Drop any warning left by the setup so only the conversion is judged. */
   uerr = U_ZERO_ERROR;
   ucnv_toUChars(cv, NULL, 0, buf, size, &uerr);
   ucnv_close(cv);

   return uerr == U_BUFFER_OVERFLOW_ERROR;
#endif
}
Пример #17
0
/**
 * How many UChars are needed to convert from an encoding to utf16?
 * @param src the source in the encoding
 * @param srclen its length in bytes
 * @param encoding the src's encoding
 * @return the number of UCHARS needed, or 0 if the converter cannot be
 * opened or the conversion fails
 */
int measure_from_encoding( char *src, size_t srclen, char *encoding )
{
    UConverter *conv = NULL;
    UErrorCode status = U_ZERO_ERROR;
    int32_t len=0;

    conv = ucnv_open( encoding, &status );
    /* ucnv_open can succeed with a warning status; U_SUCCESS accepts that,
       so the converter is used and closed instead of being leaked */
    if ( U_SUCCESS(status) )
    {
        status = U_ZERO_ERROR;
        /* measuring call: no destination, so overflow is the normal result */
        len = ucnv_toUChars( conv, NULL, 0, src, srclen, &status );
        /* a non-failing status here means the output is empty, which is not
           an error and needs no diagnostic */
        if ( status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status) )
        {
            printf("encoding: %s\n",u_errorName(status));
            len = 0;
        }
        ucnv_close(conv);
    }
    return len;
}
Пример #18
0
/*
 * Tests whether a converter round-trips the euro sign (U+20AC): the
 * character is converted to the converter's charset and back, and the
 * result is compared with the original.
 *
 * Returns TRUE when the round trip reproduces the euro string, FALSE when it
 * does not or when either conversion fails (failures are also logged).
 */
UBool isEuroAware(UConverter* myConv)
{
    static const UChar euroString[2] = { 0x20AC, 0x0000 };
    char encoded[20];
    UChar euroBack[2];
    int32_t encodedSize;
    UErrorCode err = U_ZERO_ERROR;

    /* Unicode -> charset */
    encodedSize = ucnv_fromUChars(myConv, encoded, sizeof(encoded), euroString, -1, &err);
    if (U_FAILURE(err))
    {
      log_err("Failure Occured in ucnv_fromUChars euro roundtrip test\n");
      return FALSE;
    }

    /* charset -> Unicode */
    ucnv_toUChars(myConv, euroBack, 2, encoded, encodedSize, &err);
    if (U_FAILURE(err))
    {
        log_err("Failure Occured in ucnv_toUChars euro roundtrip test\n");
        return FALSE;
    }

    /* u_strcmp() returns 0 for identical strings */
    return u_strcmp(euroString, euroBack) ? FALSE : TRUE;
}
Пример #19
0
// Converts UTF-8 text into a newly malloc'ed UTF-16 buffer.
// The caller owns *UTF16Chars afterwards and must free() it.
// A UTF8Length of -1 means "measure UTF8Chars with strlen".
void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length)
{
#if USE(ICU_UNICODE)
    assert(UTF8Chars || UTF8Length == 0);
    assert(UTF16Chars);

    if (UTF8Length == -1)
        UTF8Length = static_cast<int>(strlen(UTF8Chars));

    // The UTF-16 text is never longer than the UTF-8 text. One extra unit is
    // reserved for the terminator: without it the conversion still works, but
    // the converter raises a warning because the terminator does not fit.
    unsigned int unitCount = UTF8Length + 1;
    NPUTF16* units = 0;

    UErrorCode status = U_ZERO_ERROR;
    UConverter* conv = ucnv_open("utf8", &status);
    if (U_SUCCESS(status)) {
        units = (NPUTF16 *)malloc(sizeof(NPUTF16) * unitCount);
        ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status);
        unitCount = ucnv_toUChars(conv, (::UChar*)units, unitCount, UTF8Chars, UTF8Length, &status);
        ucnv_close(conv);
    }

    // Fallback when the conversion did not succeed.
    // Some plugins return invalid UTF-8 in NPVariantType_String, see <http://bugs.webkit.org/show_bug.cgi?id=5163>
    // There is no "bad data" for latin1, so each byte is widened as-is. The plugin most likely
    // did not mean latin1 and should have sent UTF-8; this only avoids a crash.
    if (!U_SUCCESS(status)) {
        unitCount = UTF8Length;

        if (!units)   // Nothing was allocated above, so allocate now.
            units = (NPUTF16 *)malloc(sizeof(NPUTF16) * unitCount);

        for (unsigned i = 0; i < unitCount; i++)
            units[i] = UTF8Chars[i] & 0xFF;
    }

    *UTF16Chars = units;
    *UTF16Length = unitCount;
#else
    assert(!"Implement me!");    
#endif
}
Пример #20
0
/*
 * Copies the char string s2 into the UChar buffer ucs1, converting it with
 * the default converter. At most MAX_STRLEN is passed to ICU as the
 * destination capacity. If no converter is available or the conversion
 * fails, ucs1 is set to the empty string.
 *
 * Returns ucs1.
 */
U_CAPI UChar*  U_EXPORT2
u_uastrcpy(UChar *ucs1,
          const char *s2 )
{
  UErrorCode err = U_ZERO_ERROR;
  UConverter *cnv = u_getDefaultConverter(&err);

  /* no usable converter: hand back an empty string */
  if(U_FAILURE(err) || cnv == NULL) {
    *ucs1 = 0;
    return ucs1;
  }

  ucnv_toUChars(cnv, ucs1, MAX_STRLEN, s2, (int32_t)uprv_strlen(s2), &err);
  u_releaseDefaultConverter(cnv);

  if(U_FAILURE(err)) {
    *ucs1 = 0;
  }
  return ucs1;
}
Пример #21
0
/**
 * Applies Arabic shaping (letter shaping plus European-to-Arabic-Indic
 * digits) to `text` in place.
 *
 * The text is converted to UChars with `conv`, shaped with u_shapeArabic,
 * and converted back with the same converter.
 *
 * @param text    Text to shape; left untouched if converting or shaping fails.
 * @param key     Values below 2 mark an en-/deciphering pass, which is skipped.
 * @param module  Unused here.
 * @return -1 when the pass is skipped, 0 otherwise.
 */
char UTF8arShaping::processText(SWBuf &text, const SWKey *key, const SWModule *module)
{
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	const int32_t capacity = text.length();
	UChar *ustr = new UChar[capacity];
	UChar *ustr2 = new UChar[capacity];

	// `err` is shared state: a failure left over from an earlier call would
	// turn every ICU call below into a no-op, so start from a clean status.
	err = U_ZERO_ERROR;

	// Convert UTF-8 string to UTF-16 (UChars)
	int32_t len = ucnv_toUChars(conv, ustr, capacity, text.c_str(), -1, &err);

	len = u_shapeArabic(ustr, len, ustr2, capacity, U_SHAPE_LETTERS_SHAPE | U_SHAPE_DIGITS_EN2AN, &err);

	if (U_SUCCESS(err)) {
		// Measure the output first so that `text` is only resized once the
		// required size is known.
		const int32_t needed = ucnv_fromUChars(conv, NULL, 0, ustr2, len, &err);
		if (err == U_BUFFER_OVERFLOW_ERROR)
			err = U_ZERO_ERROR;	// expected outcome of a measuring call

		if (U_SUCCESS(err)) {
			text.setSize(needed);
			len = ucnv_fromUChars(conv, text.getRawData(), needed, ustr2, len, &err);
			text.setSize(len);
		}
	}

	delete [] ustr2;
	delete [] ustr;
	return 0;
}
Пример #22
0
/**
 * Replaces `text` with its NFKD (compatibility decomposition) form.
 *
 * The text is converted to UChars with `conv`, normalized with
 * unorm_normalize, and converted back with the same converter. The working
 * buffers hold 5 + 5 * text.length() UChars.
 *
 * @param text    Text to normalize; left untouched if converting or
 *                normalizing fails (for example when the working buffers
 *                are too small for the decomposed text).
 * @param key     Values below 2 mark an en-/deciphering pass, which is skipped.
 * @param module  Unused here.
 * @return -1 when the pass is skipped, 0 otherwise.
 */
char UTF8NFKD::processText(SWBuf &text, const SWKey *key, const SWModule *module)
{
	if ((unsigned long)key < 2)	// hack, we're en(1)/de(0)ciphering
		return -1;

	int32_t len = 5 + text.length() * 5;
	source = new UChar[len + 1];
	target = new UChar[len + 1];

	// `err` is shared state: a failure left over from an earlier call would
	// turn every ICU call below into a no-op, so start from a clean status.
	err = U_ZERO_ERROR;

	// Convert UTF-8 string to UTF-16 (UChars)
	int32_t ulen = ucnv_toUChars(conv, source, len, text.c_str(), -1, &err);

	// compatibility decomposition
	ulen = unorm_normalize(source, ulen, UNORM_NFKD, 0, target, len, &err);

	if (U_SUCCESS(err)) {
		// Measure the output first so that `text` is only resized once the
		// required size is known.
		const int32_t needed = ucnv_fromUChars(conv, NULL, 0, target, ulen, &err);
		if (err == U_BUFFER_OVERFLOW_ERROR)
			err = U_ZERO_ERROR;	// expected outcome of a measuring call

		if (U_SUCCESS(err)) {
			text.setSize(needed);
			len = ucnv_fromUChars(conv, text.getRawData(), needed, target, ulen, &err);
			text.setSize(len);
		}
	}

	delete [] source;
	delete [] target;

	return 0;
}
Пример #23
0
int
getText (struct doc_descriptor *desc, UChar * buf, int size)
{
  struct meta *meta = NULL;
  char buf2[BUFSIZE];
  UErrorCode err;
  char *src;
  UChar *dest, esc[3];
  UChar name[1024], value[1024];
  int len, i, isMarkup, isJavascript, isMeta, l, j;
  int dangerousCut, fini, r, offset, endOfFile, space_added;

  space_added = 0;
  l = 0;
  fini = 0;
  endOfFile = 0;
  isJavascript = 0;
  dangerousCut = 0;
  isMarkup = 0;
  isMeta = 0;
  len = read (desc->fd, buf2, BUFSIZE);
  while (!fini && len > 0 && 2*l < size - 2)
    {

      /* consuming buffer */
      for (i = 0; 2*l < size - 2 && i < len && !dangerousCut && !fini; i++)
        {

          /* end of buffer are possible points of failure
             if a markup or a token is cut, it will not be
             parsed. */
          if (!endOfFile && i > len - 9 && (!strncmp (buf2 + i, "\x3c", 1) ||
                                            !strncmp (buf2 + i, "\x26", 1)))
            {
              dangerousCut = 1;
              break;
            }

          /* detecting end of javascript */
          if (isJavascript
              && !strncasecmp (buf2 + i, "</script>", 9))
            {
              isJavascript = 0;
              i += 9;
            }

          /* detecting new paragraph */
          if (l > 0 && !isJavascript
              && (!strncasecmp (buf2 + i, "<p", 2)
                  || !strncasecmp (buf2 + i, "<br", 3)
                  || !strncasecmp (buf2 + i, "<div", 4)))
            {
              fini = 1;
              i += 2;
              while (strncmp (buf2 + i, ">", 1))
                {
                  i++;
                }
              lseek (desc->fd, i - len, SEEK_CUR);
              break;
            }

          /* detecting begining of markup */
          if (!isJavascript && !isMarkup && !strncmp (buf2 + i, "\x3c", 1))
            {

              /* detecting begining of javascript */
              if (!strncasecmp (buf2 + i, "<script", 7))
                {
                  isJavascript = 1;

                }
              else if (!strncasecmp (buf2 + i, "<title", 6))
                {
                  err = U_ZERO_ERROR;
                  /* finding last metadata of desc */
                  if (desc->meta == NULL)
                    {
                      meta = (struct meta *) malloc (sizeof (struct meta));
                      desc->meta = meta;
                    }
                  else
                    {
                      meta = desc->meta;
                      while (meta->next != NULL)
                        {
                          meta = meta->next;
                        }
                      meta->next =
                        (struct meta *) malloc (sizeof (struct meta));
                      meta = meta->next;
                    }
                  meta->next = NULL;
                  meta->name = (UChar *) malloc (12);

                  /* filling name field */
                  meta->name_length =
                    2 * ucnv_toUChars (desc->conv, meta->name, 12, "title", 5,
                                       &err);
                  meta->name_length = u_strlen (meta->name);
                  if (U_FAILURE (err))
                    {
                      printf ("error icu\n");
                      return ERR_ICU;
                    }
                  isMeta = 1;

                }
              else if (!strncasecmp (buf2 + i, "<meta", 5))
                {
                  i += 5;
                  if (i >= size - 9)
                    {
                      strncpy (buf2, buf2 + i, len - i);
                      len =
                        read (desc->fd, buf2 + i,
                              BUFSIZE - len + i) + len - i;
                      i = 0;
                    }
                  for (; strncasecmp (buf2 + i, "name=\"", 6)  &&
                       strncmp (buf2 + i, "\x3E", 1); i++)
                    {
                      if (i >= size - 9)
                        {
                          strncpy (buf2, buf2 + i, len - i);
                          len =
                            read (desc->fd, buf2 + i,
                                  BUFSIZE - len + i) + len - i;
                          i = 0;
                        }
                    }
                  if (!strncmp (buf2 + i, "\x3E", 1))
                    {
                      continue;

                    }
                  else
                    {
                      i += 6;
                      /* get metadata name */
                      memset (name, '\x00', 2048);
                      for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1);
                           i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                          if (!strncmp (buf2 + i, "\x26", 1))
                            {
                              memset (esc, '\x00', 6);
                              offset = escapeChar (desc, buf2 + i, esc);
                              memcpy (name + j, esc, 2 * u_strlen (esc));
                              j += u_strlen (esc);
                              i += (offset - 1);
                            }
                          else
                            {

                              /* filling name buffer */
                              dest = name + j;
                              src = buf2 + i;
                              err = U_ZERO_ERROR;
                              ucnv_toUnicode (desc->conv, &dest, name + 1024,
                                              &src, buf2 + i + 1, NULL, FALSE,
                                              &err);
                              if (U_FAILURE (err))
                                {
                                  fprintf (stderr,
                                           "Unable to convert buffer\n");
                                  return ERR_ICU;
                                }
                              j += (dest - name - j);
                            }
                        }

                      /* get metadata value */
                      for (; strncasecmp (buf2 + i, "content=\"", 9) && strncmp (buf2 + i, "\x3E", 1); i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                        }
                      i += 9;
                      if (i >= size - 9)
                        {
                          strncpy (buf2, buf2 + i, len - i);
                          len =
                            read (desc->fd, buf2 + i,
                                  BUFSIZE - len + i) + len - i;
                          i = 0;
                        }
                      memset (value, '\x00', 2048);
                      for (j = 0; len != 0 && strncmp (buf2 + i, "\"", 1);
                           i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                          if (!strncmp (buf2 + i, "\x26", 1))
                            {
                              memset (esc, '\x00', 6);
                              offset = escapeChar (desc, buf2 + i, esc);
                              memcpy (value + j, esc, 2 * u_strlen (esc));
                              j += u_strlen (esc);
                              i += (offset - 1);
                            }
                          else
                            {

                              /* filling value buffer */
                              dest = value + j;
                              src = buf2 + i;
                              err = U_ZERO_ERROR;
                              ucnv_toUnicode (desc->conv, &dest, value + 1024,
                                              &src, buf2 + i + 1, NULL, FALSE,
                                              &err);
                              if (U_FAILURE (err))
                                {
                                  fprintf (stderr,
                                           "Unable to convert buffer\n");
                                  return ERR_ICU;
                                }
                              j += (dest - value - j);
                            }
                        }

                      /* insert metadata in list */
                      if (desc->meta == NULL)
                        {
                          meta =
                            (struct meta *) malloc (sizeof (struct meta));
                          desc->meta = meta;
                        }
                      else
                        {
                          meta = desc->meta;
                          while (meta->next != NULL)
                            {
                              meta = meta->next;
                            }
                          meta->next =
                            (struct meta *) malloc (sizeof (struct meta));
                          meta = meta->next;
                        }
                      meta->next = NULL;
                      meta->name = (UChar *) malloc (2 * u_strlen (name) + 2);
                      meta->value =
                        (UChar *) malloc (2 * u_strlen (value) + 2);
                      memset (meta->name, '\x00', 2 * u_strlen (name) + 2);
                      memset (meta->value, '\x00', 2 * u_strlen (value) + 2);
                      memcpy (meta->name, name, 2 * u_strlen (name));
                      memcpy (meta->value, value, 2 * u_strlen (value));
                      meta->name_length = u_strlen (name);
                      meta->value_length = u_strlen (value);

                      for (; strncmp (buf2 + i, "\x3E", 1); i++)
                        {
                          if (i >= size - 9)
                            {
                              strncpy (buf2, buf2 + i, len - i);
                              len =
                                read (desc->fd, buf2 + i,
                                      BUFSIZE - len + i) + len - i;
                              i = 0;
                            }
                        }
                      continue;
                    }

                }
              else
                {

                  isMarkup = 1;
                }
            }

          /* get metadata value */
          if (!isJavascript && isMeta)
            {
              for (; len != 0 && strncmp (buf2 + i, "\x3E", 1); i++)
                {
                  if (i >= size - 9)
                    {
                      strncpy (buf2, buf2 + i, len - i);
                      len =
                        read (desc->fd, buf2 + i,
                              BUFSIZE - len + i) + len - i;
                      i = 0;
                    }
                }
              i++;
              memset (value, '\x00', 2048);
              for (j = 0; len != 0 && strncmp (buf2 + i, "\x3C", 1); i++)
                {
                  if (i >= size - 9)
                    {
                      strncpy (buf2, buf2 + i, len - i);
                      len =
                        read (desc->fd, buf2 + i,
                              BUFSIZE - len + i) + len - i;
                      i = 0;
                    }
                  if (!strncmp (buf2 + i, "\x26", 1))
                    {
                      memset (esc, '\x00', 6);
                      offset = escapeChar (desc, buf2 + i, esc);
                      memcpy (value + j, esc, 2 * u_strlen (esc));
                      j += u_strlen (esc);
                      i += (offset - 1);
                    }
                  else
                    {

                      /* filling value buffer */
                      dest = value + j;
                      src = buf2 + i;
                      err = U_ZERO_ERROR;
                      ucnv_toUnicode (desc->conv, &dest, value + 1024,
                                      &src, buf2 + i + 1, NULL, FALSE, &err);
                      if (U_FAILURE (err))
                        {
                          fprintf (stderr, "Unable to convert buffer\n");
                          return ERR_ICU;
                        }
                      j += (dest - value - j);
                    }
                }
              meta->value = (UChar *) malloc (2 * (j + 1));
              memcpy (meta->value, value, 2 * u_strlen (value));
              meta->value_length = u_strlen (value);
              isMeta = 0;
              i += 7;
              continue;
            }

          /* detecting end of markup */
          if (!isJavascript && isMarkup && !strncmp (buf2 + i, "\x3e", 1))
            {
              if (!space_added && l > 0)
                {
                  buf[l] = 0x20;
                  l ++;
                  space_added = 1;
                }
              isMarkup = 0;
            }

          /* handling text */
          if (!isJavascript && !isMarkup && strncmp (buf2 + i, "\x3e", 1))
            {

              if (strncmp (buf2 + i, "\n", 1) && strncmp (buf2 + i, "\t", 1) && strncmp (buf2 + i, "\r", 1))
                {

                  /* converting tokens */
                  if (!isJavascript && !isMarkup
                      && !strncmp (buf2 + i, "\x26", 1))
                    {
                      memset (esc, '\x00', 6);
                      offset = escapeChar (desc, buf2 + i, esc);
                      if (memcmp (esc, "\x20\x00", u_strlen (esc)))
                        {
                          memcpy (buf + l, esc, 2 * u_strlen (esc));
                          l += u_strlen (esc);
                          space_added = 0;
                        }
                      else {
                        if (!space_added){
                            buf[l] = 0x20;
                            space_added = 1;
                            l++;
                        }
                      }
                      i += (offset - 1);
                    }
                  else
                    {
                      if (buf2[i] != 0x20 || !space_added){
                        /* filling output buffer */
                        dest = buf + l;
                        src = buf2 + i;
                        err = U_ZERO_ERROR;
                        ucnv_toUnicode (desc->conv, &dest, buf + size / 2,
                                        &src, buf2 + i + 1, NULL, FALSE, &err);
                        if (U_FAILURE (err))
                            {
                            fprintf (stderr, "Unable to convert buffer\n");
                            return ERR_ICU;
                            }
                        l += (dest - buf - l);
                        if (buf2[i] == 0x20) {space_added = 1;} else {space_added=0;}
                      }
                    }
                }
              else
                {
                  /* replace tabs and eol by spaces */
                  if (!space_added){
                    buf[l] = 0x20;
                    space_added = 1;
                    l++;
                  }
                }
            }
        }

      /* filling new buffer correctly */
      if (!fini)
        {
          if (dangerousCut)
            {
              r = len - i;
              strncpy (buf2, buf2 + i, r);
              len = read (desc->fd, buf2 + r, BUFSIZE - r) + r;
              if (len < 9)
                {
                  endOfFile = 1;
                }
              dangerousCut = 0;
            }
          else
            {
              len = read (desc->fd, buf2, BUFSIZE);
            }
        }

    }

  /* ending buffer properly */
  if (l > 0)
    {
      buf[l] = 0x20;
      return 2*l;
    }

  if (len == 0)
    {
      return NO_MORE_DATA;
    }

  return 2*l;
}
Пример #24
0
/*
 * Exercise the invariant-character helpers: char <-> UChar conversion of
 * invariant characters, one-way conversion of variant characters, a
 * round trip through the default converter, and the
 * uprv_isInvariantString()/uprv_isInvariantUString() predicates.
 * Every mismatch is reported through log_err().
 */
static void
TestInvariant() {
    /* every invariant graphic character plus a few control codes (but not \n) */
    const char invariantChars[]=
        "\t\r \"%&'()*+,-./"
        "0123456789:;<=>?"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ_"
        "abcdefghijklmnopqrstuvwxyz";

    /* the expected UChar values for invariantChars, NUL-terminated */
    const UChar invariantUChars[]={
        /* controls, space and punctuation */
        9, 0xd, 0x20, 0x22, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
        /* digits and more punctuation */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
        /* upper-case letters and underscore */
        0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5f,
        /* lower-case letters */
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0
    };

    /* characters that are not invariant, and their expected UChar values */
    const char variantChars[]="\n!#$@[\\]^`{|}~";

    const UChar variantUChars[]={
        0x0a, 0x21, 0x23, 0x24, 0x40, 0x5b, 0x5c, 0x5d, 0x5e, 0x60, 0x7b, 0x7c, 0x7d, 0x7e, 0
    };

    const UChar nonASCIIUChars[]={ 0x80, 0xa0, 0x900, 0xff51 };

    /* scratch buffers for conversion results */
    UChar ubuf[120];
    char cbuf[120];

    int32_t idx, len;

    /* invariant characters must convert in both directions */
    len=sizeof(invariantChars);
    u_charsToUChars(invariantChars, ubuf, len);
    if(0!=u_strcmp(ubuf, invariantUChars)) {
        log_err("u_charsToUChars(invariantChars) failed\n");
    }

    u_UCharsToChars(invariantUChars, cbuf, len);
    if(0!=strcmp(cbuf, invariantChars)) {
        log_err("u_UCharsToChars(invariantUChars) failed\n");
    }

    /*
     * variant characters must convert from source code literals to Unicode,
     * but must not convert back to char *
     */
    len=sizeof(variantChars);
    u_charsToUChars(variantChars, ubuf, len);
    if(0!=u_strcmp(ubuf, variantUChars)) {
        log_err("u_charsToUChars(variantChars) failed\n");
    }

#ifdef NDEBUG
    /*
     * u_UCharsToChars(variantUChars) is only exercised in release mode:
     * it triggers an assertion failure in debug builds.
     */
    u_UCharsToChars(variantUChars, cbuf, len);
    idx=0;
    while(idx<len) {
        if(cbuf[idx]!=0) {
            log_err("u_UCharsToChars(variantUChars) converted the %d-th character to %02x instead of 00\n", idx, cbuf[idx]);
        }
        ++idx;
    }
#endif

    /*
     * Invariant characters must survive a round trip from Unicode through
     * the default converter and back.
     */
    {
        UErrorCode ec=U_ZERO_ERROR;
        UConverter *defaultCnv=ucnv_open(NULL, &ec);

        if(!U_FAILURE(ec)) {
            /* Unicode -> default charset */
            len=ucnv_fromUChars(defaultCnv, cbuf, sizeof(cbuf), invariantUChars, -1, &ec);
            if(U_FAILURE(ec)) {
                log_err("ucnv_fromUChars(invariantUChars) failed - %s\n", u_errorName(ec));
            } else if(len!=sizeof(invariantChars)-1 || strcmp(cbuf, invariantChars)!=0) {
                log_err("ucnv_fromUChars(invariantUChars) failed\n");
            }

            /* default charset -> Unicode, starting from a clean error code */
            ec=U_ZERO_ERROR;
            len=ucnv_toUChars(defaultCnv, ubuf, LENGTHOF(ubuf), invariantChars, -1, &ec);
            if(U_FAILURE(ec)) {
                log_err("ucnv_toUChars(invariantChars) failed - %s\n", u_errorName(ec));
            } else if(len!=LENGTHOF(invariantUChars)-1 || u_strcmp(ubuf, invariantUChars)!=0) {
                log_err("ucnv_toUChars(invariantChars) failed\n");
            }

            ucnv_close(defaultCnv);
        } else {
            log_err("unable to open the default converter\n");
        }
    }

    /* predicate API: whole invariant strings and the terminating NUL are invariant */
    if(!uprv_isInvariantString(invariantChars, -1)) {
        log_err("uprv_isInvariantString(invariantChars) failed\n");
    }
    if(!uprv_isInvariantUString(invariantUChars, -1)) {
        log_err("uprv_isInvariantUString(invariantUChars) failed\n");
    }
    if(!uprv_isInvariantString(invariantChars+strlen(invariantChars), 1)) {
        log_err("uprv_isInvariantString(\"\\0\") failed\n");
    }

    /* predicate API: each variant character must be rejected on its own */
    idx=0;
    while(idx<(sizeof(variantChars)-1)) {
        if(uprv_isInvariantString(variantChars+idx, 1)) {
            log_err("uprv_isInvariantString(variantChars[%d]) failed\n", idx);
        }
        if(uprv_isInvariantUString(variantUChars+idx, 1)) {
            log_err("uprv_isInvariantUString(variantUChars[%d]) failed\n", idx);
        }
        ++idx;
    }

    /* predicate API: each non-ASCII code unit must be rejected as well */
    idx=0;
    while(idx<LENGTHOF(nonASCIIUChars)) {
        if(uprv_isInvariantUString(nonASCIIUChars+idx, 1)) {
            log_err("uprv_isInvariantUString(nonASCIIUChars[%d]) failed\n", idx);
        }
        ++idx;
    }
}
Пример #25
0
//----------------------------------------------------------------------------
//
//  main      for genctd
//
//----------------------------------------------------------------------------
int  main(int argc, char **argv) {
    UErrorCode  status = U_ZERO_ERROR;
    const char *wordFileName;
    const char *outFileName;
    const char *outDir = NULL;
    const char *copyright = NULL;

    //
    // Pick up and check the command line arguments,
    //    using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[0].doesOccur || options[1].doesOccur) {
        //  -? or -h for help.
        usageAndDie(0);
    }

    if (!options[3].doesOccur || argc < 2) {
        fprintf(stderr, "input and output file must both be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    outFileName  = options[3].value;
    wordFileName = argv[1];

    if (options[4].doesOccur) {
        u_setDataDirectory(options[4].value);
    }

    status = U_ZERO_ERROR;

    /* Combine the directory with the file name */
    if(options[5].doesOccur) {
        outDir = options[5].value;
    }
    if (options[6].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO

    UNewDataMemory *pData;
    char msg[1024];

    /* write message with just the name */
    sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    udata_writeBlock(pData, msg, strlen(msg));
    udata_finish(pData, &status);
    return (int)status;

#else
    /* Initialize ICU */
    u_init(&status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "%s: can not initialize ICU.  status = %s\n",
            argv[0], u_errorName(status));
        exit(1);
    }
    status = U_ZERO_ERROR;

    //
    //  Read in the dictionary source file
    //
    long        result;
    long        wordFileSize;
    FILE        *file;
    char        *wordBufferC;
    MutableTrieDictionary *mtd = NULL;
    
    file = fopen(wordFileName, "rb");
    if( file == 0 ) { //cannot find file
        //create 1-line dummy file: ie 1 char, 1 value
        UNewDataMemory *pData;
        char msg[1024];

        /* write message with just the name */
        sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName);
        fprintf(stderr, "%s\n", msg);

        UChar c = 0x0020;
        mtd = new MutableTrieDictionary(c, status, TRUE);
        mtd->addWord(&c, 1, status, 1);

    } else { //read words in from input file
        fseek(file, 0, SEEK_END);
        wordFileSize = ftell(file);
        fseek(file, 0, SEEK_SET);
        wordBufferC = new char[wordFileSize+10];
    
        result = (long)fread(wordBufferC, 1, wordFileSize, file);
        if (result != wordFileSize)  {
            fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
            exit (-1);
        }
        wordBufferC[wordFileSize]=0;
        fclose(file);
    
        //
        // Look for a Unicode Signature (BOM) on the word file
        //
        int32_t        signatureLength;
        const char *   wordSourceC = wordBufferC;
        const char*    encoding = ucnv_detectUnicodeSignature(
                               wordSourceC, wordFileSize, &signatureLength, &status);
        if (U_FAILURE(status)) {
            exit(status);
        }
        if(encoding!=NULL ){
            wordSourceC  += signatureLength;
            wordFileSize -= signatureLength;
        }
    
        //
        // Open a converter to take the rule file to UTF-16
        //
        UConverter* conv;
        conv = ucnv_open(encoding, &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        }
    
        //
        // Convert the words to UChar.
        //  Preflight first to determine required buffer size.
        //
        uint32_t destCap = ucnv_toUChars(conv,
                           NULL,           //  dest,
                           0,              //  destCapacity,
                           wordSourceC,
                           wordFileSize,
                           &status);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        };
    
        status = U_ZERO_ERROR;
        UChar *wordSourceU = new UChar[destCap+1];
        ucnv_toUChars(conv,
                      wordSourceU,     //  dest,
                      destCap+1,
                      wordSourceC,
                      wordFileSize,
                      &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        };
        ucnv_close(conv);
    
        // Get rid of the original file buffer
        delete[] wordBufferC;
    
        // Create a MutableTrieDictionary, and loop through all the lines, inserting
        // words.
    
        // First, pick a median character.
        UChar *current = wordSourceU + (destCap/2);
        UChar uc = *current++;
        UnicodeSet breaks;
        breaks.add(0x000A);     // Line Feed
        breaks.add(0x000D);     // Carriage Return
        breaks.add(0x2028);     // Line Separator
        breaks.add(0x2029);     // Paragraph Separator
    
        do { 
            // Look for line break
            while (uc && !breaks.contains(uc)) {
                uc = *current++;
            }
            // Now skip to first non-line-break
            while (uc && breaks.contains(uc)) {
                uc = *current++;
            }
        }
        while (uc && (breaks.contains(uc) || u_isspace(uc)));
    
        mtd = new MutableTrieDictionary(uc, status);
        
        if (U_FAILURE(status)) {
            fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
            exit(status);
        }
        
        // Now add the words. Words are non-space characters at the beginning of
        // lines, and must be at least one UChar. If a word has an associated value,
        // the value should follow the word on the same line after a tab character.
        current = wordSourceU;
        UChar *candidate = current;
        uc = *current++;
        int32_t length = 0;
        int count = 0;
                
        while (uc) {
            while (uc && !u_isspace(uc)) {
                ++length;
                uc = *current++;
            }
            
            UnicodeString valueString;
            UChar candidateValue;
            if(uc == 0x0009){ //separator is a tab char, read in number after space
            	while (uc && u_isspace(uc)) {
            		uc = *current++;
            	}
                while (uc && !u_isspace(uc)) {
                    valueString.append(uc);
                    uc = *current++;
                }
            }
            
            if (length > 0) {
                count++;
                if(valueString.length() > 0){
                    mtd->setValued(TRUE);
    
                    uint32_t value = 0;
                    char* s = new char[valueString.length()];
                    valueString.extract(0,valueString.length(), s, valueString.length());
                    int n = sscanf(s, "%ud", &value);
                    U_ASSERT(n == 1);
                    U_ASSERT(value >= 0); 
                    mtd->addWord(candidate, length, status, (uint16_t)value);
                    delete[] s;
                } else {
                    mtd->addWord(candidate, length, status);
                }
    
                if (U_FAILURE(status)) {
                    fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n",
                            u_errorName(status), count);
                    exit(status);
                }
            }
    
            // Find beginning of next line
            while (uc && !breaks.contains(uc)) {
                uc = *current++;
            }
            // Find next non-line-breaking character
            while (uc && breaks.contains(uc)) {
                uc = *current++;
            }
            candidate = current-1;
            length = 0;
        }
    
        // Get rid of the Unicode text buffer
        delete[] wordSourceU;
    }

    // Now, create a CompactTrieDictionary from the mutable dictionary
    CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }
    
    // Get rid of the MutableTrieDictionary
    delete mtd;

    //
    //  Get the binary data from the dictionary.
    //
    uint32_t        outDataSize = ctd->dataSize();
    const uint8_t  *outData = (const uint8_t *)ctd->data();

    //
    //  Create the output file
    //
    size_t bytesWritten;
    UNewDataMemory *pData;
    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", 
                         outFileName, u_errorName(status));
        exit(status);
    }


    //  Write the data itself.
    udata_writeBlock(pData, outData, outDataSize);
    // finish up 
    bytesWritten = udata_finish(pData, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
        exit(status);
    }
    
    if (bytesWritten != outDataSize) {
        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
        exit(-1);
    }
    
    // Get rid of the CompactTrieDictionary
    delete ctd;

    u_cleanup();

    printf("genctd: tool completed successfully.\n");
    return 0;

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}
Пример #26
0
//------------------------------------------------------------------------------------------
//
//    readFile          Read a file into memory, and convert it to Unicode.
//
//                      Since this is just a demo program, take the simple minded approach
//                      of always reading the whole file at once.  No intelligent buffering
//                      is done.
//
//------------------------------------------------------------------------------------------
void readFile(const char *name) {

    //
    //  Initialize global file variables
    //
    fileName = name;
    fileLen  = 0;      // zero length prevents processing in case of errors.


    //
    //  Open the file and determine its size.
    //
    FILE *file = fopen(name, "rb");
    if (file == 0 ) {
        fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
        return;
    }
    fseek(file, 0, SEEK_END);
    int rawFileLen = ftell(file);
    fseek(file, 0, SEEK_SET);
    

    //
    //   Read in the file
    //
    charBuf    = (char *)realloc(charBuf, rawFileLen+1);   // Need error checking...
    int t = fread(charBuf, 1, rawFileLen, file);
    if (t != rawFileLen)  {
        fprintf(stderr, "Error reading file \"%s\"\n", fileName);
        fclose(file);
        return;
    }
    charBuf[rawFileLen]=0;
    fclose(file);

    //
    // Look for a Unicode Signature (BOM) in the data
    //
    int32_t        signatureLength;
    const char *   charDataStart = charBuf;
    UErrorCode     status        = U_ZERO_ERROR;
    const char*    encoding      = ucnv_detectUnicodeSignature(
                           charDataStart, rawFileLen, &signatureLength, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
            u_errorName(status));
        return;
    }
    if(encoding!=NULL ){
        charDataStart  += signatureLength;
        rawFileLen     -= signatureLength;
    }

    //
    // Open a converter to take the file to UTF-16
    //
    UConverter* conv;
    conv = ucnv_open(encoding, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
        return;
    }

    //
    // Convert the file data to UChar.
    //  Preflight first to determine required buffer size.
    //
    uint32_t destCap = ucnv_toUChars(conv,
                       NULL,           //  dest,
                       0,              //  destCapacity,
                       charDataStart,
                       rawFileLen,
                       &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        return;
    };
    
    status = U_ZERO_ERROR;
    ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
    ucnv_toUChars(conv,
        ucharBuf,           //  dest,
        destCap+1,
        charDataStart,
        rawFileLen,
        &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        return;
    };
    ucnv_close(conv);
    
    //
    //  Successful conversion.  Set the global size variables so that
    //     the rest of the processing will proceed for this file.
    //
    fileLen = destCap;
}
Пример #27
0
/**
 * Convert a NUL-terminated byte string to a newly allocated 16-bit string
 * using an ICU converter.
 *
 * @param src       Source bytes; a NULL pointer is treated as an empty string.
 * @param Encoding  Encoding name, mapped through getPlatformEncoding(). If no
 *                  converter can be opened for it, "ISO-8859-1" is tried.
 * @return A buffer allocated with new[] holding the NUL-terminated result.
 *         The string is empty when no converter could be opened or when the
 *         conversion did not finish with U_ZERO_ERROR.
 */
U16Char_t* convCharStrToU16Str(const char* src, const char* Encoding)
{
    char const* const fromcode = getPlatformEncoding(Encoding);

    UErrorCode status = U_ZERO_ERROR;

#ifdef ENCCONV_DEBUG
    std::cout << "\t" "convString" << std::endl;
    //std::cout << "\t\t" "tocode   = " << tocode   << std::endl;
    std::cout << "\t\t" "fromcode = " << fromcode << std::endl;
#endif

    // Initializing ICU converter
    UConverter *conv = ucnv_open(fromcode, &status);
#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl;
#endif
    if (conv == NULL)
    {
        // Try the default encoding "ISO-8859-1". The status must be cleared
        // first: ICU functions return immediately when handed a failure code,
        // so without the reset this fallback could never succeed.
        status = U_ZERO_ERROR;
        conv = ucnv_open("ISO-8859-1", &status);
    }

    char const* const srcWrk = (src != NULL) ? src : "";
    const size_t srcSizeInBytes = std::strlen(srcWrk);

    // Capacity counted in 16-bit units (not bytes), including the terminator.
    const size_t dstCapacity = MAX(256, (srcSizeInBytes + 1));
    U16Char_t* dst = new U16Char_t [dstCapacity];
    dst[0] = 0;
    dst[dstCapacity - 1] = 0;

    // If there is still no converter, simply return the blank string.
    if (conv == NULL)
    {
        return dst;
    }

    status = U_ZERO_ERROR;

    // ucnv_toUChars() takes destCapacity as a number of UChars; the last
    // element is held back so the result always has room for its terminator.
    ucnv_toUChars(conv, (UChar *) dst, (int32_t)(dstCapacity - 1),
                  srcWrk, (int32_t)srcSizeInBytes, &status);

#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft iconv: status = " << status << std::endl;
#endif
    if (status != U_ZERO_ERROR )
    {
        // Conversion did not complete cleanly: hand back an empty string.
        dst[0] = 0;
    }

    ucnv_close(conv);

    return dst;
}
Пример #28
0
/**
 * Re-encode a string from one character encoding to another.
 *
 * @param str_to_encode  Bytes to convert (processed up to the first NUL in the ICU build).
 * @param src_enc        Source encoding name, or a positive codepage number
 *                       that is mapped through CodepageToEncoding().
 * @param dst_enc        Destination encoding name or codepage number.
 * @return The converted string. The input is returned unchanged when any
 *         argument is empty (or, in the iconv build, when iconv_open() fails);
 *         an empty string is returned when the conversion itself fails.
 */
std::string ReaderUtil::Recode(const std::string& str_to_encode,
                               const std::string& src_enc,
                               const std::string& dst_enc) {
	std::string src_enc_str = src_enc;
	std::string dst_enc_str = dst_enc;

	if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) {
		return str_to_encode;
	}
	if (atoi(src_enc.c_str()) > 0) {
		src_enc_str = ReaderUtil::CodepageToEncoding(atoi(src_enc.c_str()));
	}
	if (atoi(dst_enc.c_str()) > 0) {
		dst_enc_str = ReaderUtil::CodepageToEncoding(atoi(dst_enc.c_str()));
	}
#ifdef LCF_SUPPORT_ICU
	UErrorCode status = U_ZERO_ERROR;
	UConverter *conv;
	int length;
	std::string result_str;

	// Open the source converter before allocating anything, so the error
	// path below has no buffer to release.
	conv = ucnv_open(src_enc_str.c_str(), &status);

	if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
		fprintf(stderr, "liblcf:  ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status));
		// The handle can be non-NULL when status is only a warning; ucnv_close() accepts NULL.
		ucnv_close(conv);
		return std::string();
	}
	status = U_ZERO_ERROR;

	// Step 1: source encoding -> UTF-16 scratch buffer.
	int size = str_to_encode.size() * 4;
	UChar* unicode_str = new UChar[size];

	length = ucnv_toUChars(conv, unicode_str, size, str_to_encode.c_str(), -1, &status);
	ucnv_close(conv);
	if (status != U_ZERO_ERROR) {
		fprintf(stderr, "liblcf: ucnv_toUChars() error when encoding \"%s\": %s\n", str_to_encode.c_str(), u_errorName(status));
		delete[] unicode_str;
		return std::string();
	}

	// Step 2: UTF-16 -> destination encoding.
	char* result = new char[length * 4];

	conv = ucnv_open(dst_enc_str.c_str(), &status);
	if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
		fprintf(stderr, "liblcf: ucnv_open() error for destination encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status));
		ucnv_close(conv);
		delete[] unicode_str;
		delete[] result;
		return std::string();
	}
	status = U_ZERO_ERROR;

	ucnv_fromUChars(conv, result, length * 4, unicode_str, -1, &status);
	ucnv_close(conv);
	if (status != U_ZERO_ERROR) {
		fprintf(stderr, "liblcf: ucnv_fromUChars() error: %s\n", u_errorName(status));
		delete[] unicode_str;
		delete[] result;
		return std::string();
	}

	// U_ZERO_ERROR means the output was NUL-terminated inside the buffer.
	result_str = result;

	delete[] unicode_str;
	delete[] result;

	return result_str;
#else
	iconv_t cd = iconv_open(dst_enc_str.c_str(), src_enc_str.c_str());
	if (cd == (iconv_t)-1)
		return str_to_encode;
	char *src = const_cast<char *>(str_to_encode.c_str());
	size_t src_left = str_to_encode.size();
	size_t dst_size = str_to_encode.size() * 5 + 10;
	char *dst = new char[dst_size];
	// Keep one byte back for the terminator written after the conversion.
	size_t dst_left = dst_size - 1;
#    ifdef ICONV_CONST
	char ICONV_CONST *p = src;
#    else
	char *p = src;
#    endif
	char *q = dst;
	size_t status = iconv(cd, &p, &src_left, &q, &dst_left);
	iconv_close(cd);
	if (status == (size_t) -1 || src_left > 0) {
		delete[] dst;
		return std::string();
	}
	*q++ = '\0';
	std::string result(dst);
	delete[] dst;
	return result;
#endif
}
Пример #29
0
/*
 * Sample 21: convert `source` from UTF-8 to UChars, then to windows-1252
 * through a clone of a converter that has a chain of from-Unicode callbacks
 * installed (debug1 -> flag -> debug2).
 *
 * Returns the `flag` member read from the cloned flag context after the
 * conversion, or FALSE when that context could not be retrieved from the
 * clone.
 *
 * The intermediate buffers hold 100 elements each, and 100 is the capacity
 * passed to both conversion calls.
 *
 * NOTE(review): U_ASSERT is applied to a UErrorCode throughout; presumably
 * this file defines its own U_ASSERT that checks the status for failure -
 * confirm against the macro definition.
 */
UBool convsample_21_didSubstitute(const char *source)
{
  UChar uchars[100];
  char bytes[100];
  UConverter *conv = NULL, *cloneCnv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t len, len2;
  int32_t  cloneLen;
  UBool  flagVal = FALSE;
  UConverterFromUCallback junkCB;
  
  FromUFLAGContext *flagCtx = NULL, 
                   *cloneFlagCtx = NULL;

  debugCBContext   *debugCtx1 = NULL,
                   *debugCtx2 = NULL,
                   *cloneDebugCtx = NULL;

  printf("\n\n==============================================\n"
         "Sample 21: C: Test for substitution w/ callbacks & clones \n");

  /* print out the original source */
  printBytes("src", source);
  printf("\n");

  /* First, convert from UTF8 to unicode */
  conv = ucnv_open("utf-8", &status);
  U_ASSERT(status);

  len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status);
  U_ASSERT(status);
 
  printUChars("uch", uchars, len);
  printf("\n");

  /* Now, close the converter */
  ucnv_close(conv);

  /* Now, convert to windows-1252 */
  conv = ucnv_open("windows-1252", &status);
  U_ASSERT(status);

  /* Converter starts out with the SUBSTITUTE callback set. */

  /* initialize our callback */
  /* from the 'bottom' innermost, out
   *   CNV ->  debugCtx1[debug]  ->  flagCtx[flag] -> debugCtx2[debug]  */

#if DEBUG_TMI
  printf("flagCB_fromU = %p\n", &flagCB_fromU);
  printf("debugCB_fromU = %p\n", &debugCB_fromU);
#endif

  /* allocate the three contexts that make up the chain */
  debugCtx1 = debugCB_openContext();
   flagCtx  = flagCB_fromU_openContext();
  debugCtx2 = debugCB_openContext();

  debugCtx1->subCallback =  flagCB_fromU;  /* debug1 -> flag */
  debugCtx1->subContext  =  flagCtx;

  flagCtx->subCallback   =  debugCB_fromU; /*  flag -> debug2 */
  flagCtx->subContext    =  debugCtx2;

  /* end of the chain: debug2 falls through to the stock substitute callback */
  debugCtx2->subCallback =  UCNV_FROM_U_CALLBACK_SUBSTITUTE;
  debugCtx2->subContext  = NULL;

  /* Set our special callback */

  /* The last two pointer arguments receive the converter's previous callback
   * and context, so they overwrite the debugCtx2 fields assigned just above. */
  ucnv_setFromUCallBack(conv,
                        debugCB_fromU,
                        debugCtx1,
                        &(debugCtx2->subCallback),
                        &(debugCtx2->subContext),
                        &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n",
         conv, debugCtx1, debugCtx1->subCallback,
         debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback);
#endif

  cloneLen = 1; /* but passing in null so it will clone */
  cloneCnv = ucnv_safeClone(conv,  NULL,  &cloneLen, &status);

  U_ASSERT(status);

#if DEBUG_TMI
  printf("Cloned converter from %p -> %p.  Closing %p.\n", conv, cloneCnv, conv);
#endif
  
  /* the original converter is closed here; only the clone is used from now on */
  ucnv_close(conv);

#if DEBUG_TMI
  printf("%p closed.\n", conv);
#endif 

  U_ASSERT(status);
  /* Now, we have to extract the context */
  cloneDebugCtx = NULL;
  cloneFlagCtx  = NULL;

  /* fetch the clone's outermost context; the callback pointer itself is not needed */
  ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx);
  if(cloneDebugCtx != NULL) {
      /* one step down the chain: debug1's sub-context is taken to be the flag context */
      cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext;
  }

  printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n",
         cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL );

  /* run the actual conversion through the clone and its callback chain */
  len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status);
  U_ASSERT(status);

  /* read the flag before closing the clone */
  if(cloneFlagCtx != NULL) {
      flagVal = cloneFlagCtx->flag;  /* it's about to go away when we close the cnv */
  } else {
      printf("** Warning, couldn't get the subcallback \n");
  }

  ucnv_close(cloneCnv);

  /* print out the converted bytes */
  printBytes("bytes", bytes, len2);

  return flagVal; /* true if callback was called */
}
Пример #30
0
/**
 * Sample 20: run `source` through a "utf-8" converter into UChars, then
 * through a "windows-1252" converter that has flagCB_fromU installed as its
 * from-Unicode callback, and report the flag recorded in the callback context.
 *
 * The source bytes, the intermediate UChars and the resulting bytes are
 * printed along the way.
 *
 * @param source  NUL-terminated input; its length is measured with strlen().
 * @return the `flag` member of the callback context as read after the
 *         conversion (per the original note: true if the callback was called).
 */
UBool convsample_20_didSubstitute(const char *source)
{
  /* Capacity, in elements, of both scratch buffers below. */
  enum { kCapacity = 100 };

  /* NOTE: this variable is intentionally still called `status`; U_ASSERT is a
   * macro defined elsewhere and may print the name of its argument. */
  UErrorCode status = U_ZERO_ERROR;

  printf("\n\n==============================================\n"
         "Sample 20: C: Test for substitution using callbacks\n");

  /* Show the input exactly as received. */
  printBytes("src", source);
  printf("\n");

  /* Step 1: source bytes -> UChars via the "utf-8" converter. */
  UConverter *cnv = NULL;
  cnv = ucnv_open("utf-8", &status);
  U_ASSERT(status);

  UChar unicodeText[kCapacity];
  uint32_t unicodeLen = ucnv_toUChars(cnv, unicodeText, kCapacity,
                                      source, strlen(source), &status);
  U_ASSERT(status);

  printUChars("uch", unicodeText, unicodeLen);
  printf("\n");

  /* Finished with the first converter. */
  ucnv_close(cnv);

  /* Step 2: UChars -> bytes via the "windows-1252" converter.
   * (A freshly opened converter has the SUBSTITUTE callback set.) */
  cnv = ucnv_open("windows-1252", &status);
  U_ASSERT(status);

  /* Create the flag context and install flagCB_fromU; the previously
   * installed callback and its context are stored into the new context's
   * subCallback / subContext slots. */
  FromUFLAGContext *flagCtx = NULL;
  flagCtx = flagCB_fromU_openContext();

  ucnv_setFromUCallBack(cnv,
                        flagCB_fromU,
                        flagCtx,
                        &(flagCtx->subCallback),
                        &(flagCtx->subContext),
                        &status);
  U_ASSERT(status);

  char encoded[kCapacity];
  uint32_t encodedLen = ucnv_fromUChars(cnv, encoded, kCapacity,
                                        unicodeText, unicodeLen, &status);
  U_ASSERT(status);

  /* Read the flag before closing: the context is about to go away when the
   * converter is closed. */
  UBool substituted = flagCtx->flag;

  ucnv_close(cnv);

  /* Show the converted output bytes. */
  printBytes("bytes", encoded, encodedLen);

  return substituted; /* true if callback was called */
}