void VerifyTables(FILE *fp) { fprintf(stderr, "Testing final ITT and STT.\n"); fseek(fp, 0, SEEK_SET); int Value; UTF32 nextcode = ReadCodePoint(fp); UTF32 i; for (i = 0; i <= UNI_MAX_LEGAL_UTF32; i++) { bool bMember; if (i == nextcode) { bMember = true; if (UNI_EOF != nextcode) { nextcode = ReadCodePoint(fp); if (nextcode <= i) { fprintf(stderr, "Codes in file are not in order (U+%04X).\n", nextcode); exit(0); } } } else { bMember = false; } UTF32 Source[2]; Source[0] = i; Source[1] = L'\0'; const UTF32 *pSource = Source; UTF8 Target[5]; UTF8 *pTarget = Target; ConversionResult cr; cr = ConvertUTF32toUTF8(&pSource, pSource+1, &pTarget, pTarget+sizeof(Target)-1, lenientConversion); if (conversionOK == cr) { int iState = PRINT_START_STATE; UTF8 *p = Target; while ( p < pTarget && iState < PRINT_ACCEPTING_STATES_START) { iState = print_stt[iState][print_itt[(unsigned char)*p]]; p++; } bool j = ((iState - PRINT_ACCEPTING_STATES_START) == 1) ? true : false; if (j != bMember) { fprintf(stderr, "Input Translation Table and State Transition Table do not work.\n"); exit(0); } } } }
int main( int argc, char **argv ) { UTF32 data32[] = { 0x00000041, 0x00000042, 0x00000043, 0x00000044, 0x00000045, 0x00000046, 0x00000047, 0x00000048 }; UTF32 *src32 = data32; UTF8 *out8 = (UTF8*)malloc(1024); UTF8 *work8 = out8; memset(out8,0xff,1024); ConversionResult res; res = ConvertUTF32toUTF8( (const UTF32**)&src32, src32+8, &work8, work8+8, 0 ); printf("ptf:%p to %p\n", src32, src32+7); printf("res:%d out:%x %x %x %x %x %x %x %x\n",res,out8[0], out8[1], out8[2], out8[3], out8[4], out8[5], out8[6], out8[7] ); size_t outlen = work8 - out8; printf("len:%d",outlen); }
/// Converts a wide-character string to UTF-8 and stores it in outBuffer.
///
/// outBuffer is cleared first. The wide string is read as UTF-16 when
/// sizeof(wchar_t) is 2 and as UTF-32 otherwise, using strict conversion.
/// A terminating zero byte is written after the converted data.
///
/// @param str        First wide character to convert.
/// @param length     Number of wide characters to convert.
/// @param outBuffer  Receives the UTF-8 bytes; its length is set on success.
/// @return Number of UTF-8 bytes produced, or 0 when length is 0 or the
///         conversion does not report conversionOK.
intp StringParser::ConvertToBuffer(const wchar_t* str, size_t length, HeapString& outBuffer)
{
    outBuffer.Clear();
    if (length == 0)
    {
        return 0;
    }

    constexpr bool isUTF16 = sizeof(wchar_t) == 2;

    // Per-unit factor is 3 for UTF-16 input and 4 for UTF-32 input, plus one
    // byte for the terminator.
    const size_t utf8Size = length * (isUTF16 ? 3 : 4) + 1;
    outBuffer.ReserveSize(utf8Size);

    UTF8* const bufferBegin = reinterpret_cast<UTF8*>(outBuffer.MutableBuffer());
    UTF8* const bufferEnd = bufferBegin + utf8Size;
    UTF8* writePos = bufferBegin;

    ConversionResult res;
    if (isUTF16)
    {
        const UTF16* readPos = reinterpret_cast<const UTF16*>(str);
        const UTF16* const readEnd = readPos + length;
        res = ConvertUTF16toUTF8(&readPos, readEnd, &writePos, bufferEnd, strictConversion);
    }
    else
    {
        const UTF32* readPos = reinterpret_cast<const UTF32*>(str);
        const UTF32* const readEnd = readPos + length;
        res = ConvertUTF32toUTF8(&readPos, readEnd, &writePos, bufferEnd, strictConversion);
    }

    // Terminate at the write position whether or not the conversion succeeded.
    *writePos = 0;

    if (res != conversionOK)
    {
        return 0;
    }

    const intp count = writePos - bufferBegin;
    outBuffer.ForceSetLength(count);
    return count;
}
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { const UTF32 *SourceStart = &Source; const UTF32 *SourceEnd = SourceStart + 1; UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr); UTF8 *TargetEnd = TargetStart + 4; ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd, &TargetStart, TargetEnd, strictConversion); if (CR != conversionOK) return false; ResultPtr = reinterpret_cast<char*>(TargetStart); return true; }
// Converts the wide characters in [*src_start, src_end) to UTF-8, writing
// into the storage utf8 already holds (the caller sizes utf8 beforehand).
// The input is reinterpreted as UTF-32 and converted leniently.
// *src_start is passed to the converter by address. utf8 is resized to the
// number of bytes actually written. The converter's result code is returned
// as a utf8_errors::error_code_enum.
static utf8_errors::error_code_enum convert(wchar_t const** src_start
	, wchar_t const* src_end
	, std::string& utf8)
{
	char* const out_begin = &utf8[0];
	char* out_pos = out_begin;
	UTF8* const out_limit = reinterpret_cast<UTF8*>(out_begin + utf8.size());

	int const ret = ConvertUTF32toUTF8(
		reinterpret_cast<UTF32 const**>(src_start)
		, reinterpret_cast<UTF32 const*>(src_end)
		, reinterpret_cast<UTF8**>(&out_pos)
		, out_limit
		, lenientConversion);

	// shrink to what was written
	utf8.resize(aux::numeric_cast<std::size_t>(out_pos - out_begin));
	return static_cast<utf8_errors::error_code_enum>(ret);
}
/**
 * Converts a wide string to a UTF-8 encoded String.
 *
 * The wide string is read as UTF-16 when sizeof(wchar_t) is 2 and as UTF-32
 * when it is 4, using strict conversion.
 *
 * @param wideString The wide string to convert.
 * @return The UTF-8 string, sized to the number of bytes produced.
 * @throws Exception (ERROR_WIDE_2_UTF8) if the conversion fails or wchar_t
 *         is neither 2 nor 4 bytes wide.
 */
String StringUtils::wideString2utf8String( const WideString& wideString )
{
    const bool wideIsUtf16 = ( sizeof( wchar_t ) == 2 );
    const bool wideIsUtf32 = ( sizeof( wchar_t ) == 4 );
    if ( !wideIsUtf16 && !wideIsUtf32 )
    {
        throw Exception(Exception::ERROR_WIDE_2_UTF8, String("Could not convert from wide string to UTF8."));
    }

    const size_t wideLength = wideString.length();
    const size_t capacity = MAX_UTF8_CHAR_LENGTH * wideLength + 1;

    // Output buffer sized from MAX_UTF8_CHAR_LENGTH; trimmed after converting.
    String utf8;
    utf8.resize( capacity, '\0' );

    UTF8* const outBegin = reinterpret_cast<UTF8*>( &utf8[ 0 ] );
    UTF8* const outEnd = outBegin + capacity;
    UTF8* outPos = outBegin;

    ConversionResult status;
    if ( wideIsUtf16 )
    {
        const UTF16* inPos = reinterpret_cast<const UTF16*>( wideString.c_str() );
        const UTF16* const inEnd = inPos + wideLength;
        status = ConvertUTF16toUTF8( &inPos, inEnd, &outPos, outEnd, strictConversion );
    }
    else
    {
        const UTF32* inPos = reinterpret_cast<const UTF32*>( wideString.c_str() );
        const UTF32* const inEnd = inPos + wideLength;
        status = ConvertUTF32toUTF8( &inPos, inEnd, &outPos, outEnd, strictConversion );
    }

    if ( status != conversionOK )
    {
        throw Exception(Exception::ERROR_WIDE_2_UTF8, String("Could not convert from wide string to UTF8."));
    }

    utf8.resize( outPos - outBegin );
    return utf8;
}
static std::string ToUtf8(const std::wstring& widestring) { size_t widesize = widestring.length(); if (sizeof(wchar_t) == 2) { size_t utf8size = 3 * widesize + 1; char* utf8stringnative = new char[utf8size]; const UTF16* sourcestart = reinterpret_cast<const UTF16*>(widestring.c_str()); const UTF16* sourceend = sourcestart + widesize; UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative); UTF8* targetend = targetstart + utf8size; ConversionResult res = ConvertUTF16toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); if (res != conversionOK) { delete [] utf8stringnative; throw std::exception(); } *targetstart = 0; std::string resultstring(utf8stringnative); delete [] utf8stringnative; return resultstring; } else if (sizeof(wchar_t) == 4) { size_t utf8size = 4 * widesize + 1; char* utf8stringnative = new char[utf8size]; const UTF32* sourcestart = reinterpret_cast<const UTF32*>(widestring.c_str()); const UTF32* sourceend = sourcestart + widesize; UTF8* targetstart = reinterpret_cast<UTF8*>(utf8stringnative); UTF8* targetend = targetstart + utf8size; ConversionResult res = ConvertUTF32toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); if (res != conversionOK) { delete [] utf8stringnative; throw std::exception(); } *targetstart = 0; std::string resultstring(utf8stringnative); delete [] utf8stringnative; return resultstring; } else { throw std::exception(); } return ""; }
std::string ToUtf8(const std::wstring& widestring) { size_t widesize = widestring.length(); if (sizeof(wchar_t) == 2) { size_t utf8size = 3 * widesize + 1; std::string resultstring; resultstring.resize(utf8size, '\0'); const UTF16* sourcestart = reinterpret_cast<const UTF16*>(widestring.c_str()); const UTF16* sourceend = sourcestart + widesize; UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]); UTF8* targetend = targetstart + utf8size; ConversionResult res = ConvertUTF16toUTF8 (&sourcestart, sourceend, &targetstart, targetend, strictConversion); if (res != conversionOK) { throw std::exception("La falla!"); } *targetstart = 0; return resultstring; } else if (sizeof(wchar_t) == 4) { size_t utf8size = 4 * widesize + 1; std::string resultstring; resultstring.resize(utf8size, '\0'); const UTF32* sourcestart = reinterpret_cast<const UTF32*>(widestring.c_str()); const UTF32* sourceend = sourcestart + widesize; UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]); UTF8* targetend = targetstart + utf8size; ConversionResult res = ConvertUTF32toUTF8 (&sourcestart, sourceend, &targetstart, targetend, strictConversion); if (res != conversionOK) { throw std::exception("La falla!"); } *targetstart = 0; return resultstring; } else { throw std::exception("La falla!"); } return ""; }
void TestTable(FILE *fp) { fprintf(stderr, "Testing STT table.\n"); fseek(fp, 0, SEEK_SET); int Value; UTF32 nextcode = ReadCodePoint(fp); UTF32 i; for (i = 0; i <= UNI_MAX_LEGAL_UTF32; i++) { bool bMember; if (i == nextcode) { bMember = true; if (UNI_EOF != nextcode) { nextcode = ReadCodePoint(fp); if (nextcode <= i) { fprintf(stderr, "Codes in file are not in order (U+%04X).\n", static_cast<unsigned int>(nextcode)); exit(0); } } } else { bMember = false; } UTF32 Source[2]; Source[0] = i; Source[1] = L'\0'; const UTF32 *pSource = Source; UTF8 Target[5]; UTF8 *pTarget = Target; ConversionResult cr; cr = ConvertUTF32toUTF8(&pSource, pSource+1, &pTarget, pTarget+sizeof(Target)-1, lenientConversion); if (conversionOK == cr) { sm.TestString(Target, pTarget, bMember); } } }
std::string StringUtils::wstring_To_Utf8(const std::wstring& widestring) { size_t widesize = widestring.length(); if (sizeof(wchar_t) == 2) { size_t utf8size = 3 * widesize + 1; std::string resultstring; resultstring.resize(utf8size, '\0'); const UTF16* sourcestart = reinterpret_cast<const UTF16*>(widestring.c_str()); const UTF16* sourceend = sourcestart + widesize; UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]); UTF8* targetend = targetstart + utf8size; ConversionResult res = ConvertUTF16toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); if (res != conversionOK) { return std::string(widestring.begin(), widestring.end()); } *targetstart = 0; return std::string(resultstring.c_str()); } else if (sizeof(wchar_t) == 4) { size_t utf8size = 4 * widesize + 1; std::string resultstring; resultstring.resize(utf8size, '\0'); const UTF32* sourcestart = reinterpret_cast<const UTF32*>(widestring.c_str()); const UTF32* sourceend = sourcestart + widesize; UTF8* targetstart = reinterpret_cast<UTF8*>(&resultstring[0]); UTF8* targetend = targetstart + utf8size; ConversionResult res = ConvertUTF32toUTF8(&sourcestart, sourceend, &targetstart, targetend, strictConversion); if (res != conversionOK) { return std::string(widestring.begin(), widestring.end()); } *targetstart = 0; return std::string(resultstring.c_str()); } else { assert(false); } return ""; }
/* Returns the UTF-8 string representation of the supplied UTF-32 character;
   performs no unnecessary memory allocations.

   The result lives in a function-local static buffer, so each call
   overwrites the previous result. Conversion is lenient. Returns NULL when
   the conversion does not report conversionOK. */
const char* jx_utf8_character(JxChar in_char)
{
    static char result[7];
    UTF32 codepoint[2];
    UTF32 *read_pos;
    UTF8 *out_begin;
    UTF8 *write_pos;
    ConversionResult status;
    long size;

    memset(result, 0, sizeof(result));

    codepoint[0] = (UTF32)in_char;
    codepoint[1] = 0;
    read_pos = codepoint;

    out_begin = (UTF8*)result;
    write_pos = out_begin;

    status = ConvertUTF32toUTF8((const UTF32**)&read_pos, (const UTF32*)read_pos + 1,
                                (UTF8**)&write_pos, (UTF8*)out_begin + sizeof(result),
                                lenientConversion);
    if (status != conversionOK)
        return NULL;

    /* terminate right after the bytes that were written */
    size = write_pos - out_begin;
    result[size] = '\0';
    return result;
}
const bool ToUTF8(const std::wstring &wcstring, std::string &utf8string) { if(wcstring.size()==0) { utf8string.assign(""); return true; } std::vector<std::wstring::value_type> source(wcstring.begin(),wcstring.end()); if(sizeof(std::wstring::value_type)==2 && sizeof(UTF16)==2) { std::vector<std::string::value_type> dest(wcstring.size()*2,0); const UTF16 *sourcestart=reinterpret_cast<const UTF16 *>(&source[0]); const UTF16 *sourceend=sourcestart+source.size(); UTF8 *deststart=reinterpret_cast<UTF8 *>(&dest[0]); UTF8 *destend=deststart+dest.size(); ConversionResult rval=ConvertUTF16toUTF8(&sourcestart,sourceend,&deststart,destend,lenientConversion); if(rval!=conversionOK) { return false; } utf8string.assign(dest.begin(),dest.end()-(destend-deststart)); } else if(sizeof(std::wstring::value_type)==4 && sizeof(UTF32)==4) { std::vector<std::string::value_type> dest(wcstring.size()*4,0); const UTF32 *sourcestart=reinterpret_cast<const UTF32 *>(&source[0]); const UTF32 *sourceend=sourcestart+source.size(); UTF8 *deststart=reinterpret_cast<UTF8 *>(&dest[0]); UTF8 *destend=deststart+dest.size(); ConversionResult rval=ConvertUTF32toUTF8(&sourcestart,sourceend,&deststart,destend,lenientConversion); if(rval!=conversionOK) { return false; } utf8string.assign(dest.begin(),dest.end()-(destend-deststart)); } else { std::vector<UTF32> source2(wcstring.begin(),wcstring.end()); std::vector<std::string::value_type> dest(wcstring.size()*sizeof(std::wstring::value_type),0); const UTF32 *sourcestart=reinterpret_cast<const UTF32 *>(&source2[0]); const UTF32 *sourceend=sourcestart+source2.size(); UTF8 *deststart=reinterpret_cast<UTF8 *>(&dest[0]); UTF8 *destend=deststart+dest.size(); ConversionResult rval=ConvertUTF32toUTF8(&sourcestart,sourceend,&deststart,destend,lenientConversion); if(rval!=conversionOK) { return false; } utf8string.assign(dest.begin(),dest.end()-(destend-deststart)); } return true; }
void LoadStrings(FILE *fp, FILE *fpBody, FILE *fpInclude) { int cIncluded = 0; int cExcluded = 0; int cErrors = 0; fseek(fp, 0, SEEK_SET); int Value; UTF32 nextcode = ReadCodePoint(fp); UTF32 i; for (i = 0; i <= UNI_MAX_LEGAL_UTF32; i++) { bool bMember; if (i == nextcode) { bMember = true; cIncluded++; if (UNI_EOF != nextcode) { nextcode = ReadCodePoint(fp); if (nextcode <= i) { fprintf(stderr, "Codes in file are not in order (U+%04X).\n", static_cast<unsigned int>(nextcode)); exit(0); } } } else { bMember = false; cExcluded++; } UTF32 Source[2]; Source[0] = i; Source[1] = L'\0'; const UTF32 *pSource = Source; UTF8 Target[5]; UTF8 *pTarget = Target; ConversionResult cr; cr = ConvertUTF32toUTF8(&pSource, pSource+1, &pTarget, pTarget+sizeof(Target)-1, lenientConversion); if (conversionOK == cr) { sm.RecordString(Target, pTarget, bMember); } else { cErrors++; } } fprintf(fpBody, "// %d included, %d excluded, %d errors.\n", cIncluded, cExcluded, cErrors); fprintf(fpInclude, "// %d included, %d excluded, %d errors.\n", cIncluded, cExcluded, cErrors); fprintf(stderr, "%d included, %d excluded, %d errors.\n", cIncluded, cExcluded, cErrors); OutputStatus os; sm.OutputTables(NULL, &os); fprintf(stderr, "%d states, %d columns, %d bytes\n", os.nStates, os.nColumns, os.SizeOfMachine); }
/*
 * IDX_IndexByMA -- builds posting entries for the text in SecVal.
 *
 * Original note: UTF-8 input.
 *
 * SecVal is handed to InitTokenizer() and tokens are pulled with
 * GetNextToken() until it returns -1. Tokens longer than 42 units are
 * skipped. Each token is converted from UTF-32 to UTF-8, and tokens for
 * which IDX_FindStopWord() reports a match are skipped. The remaining
 * tokens are handled by type:
 *
 *   T_HAN          the token is converted to Johab and passed to
 *                  GetIndexFromOneWord(); each resulting index term is
 *                  converted back to UTF-8 and stored. The original word is
 *                  stored as well (with psgNum 7777) unless one of the
 *                  stored terms already equals it.
 *   T_CJK          when HanjaFlag == 1 the token goes through
 *                  Hanja2Hangul_UCS4() and is then indexed like T_HAN
 *                  (without the extra original-word entry); otherwise the
 *                  UTF-8 token is stored as is.
 *   T_DIG, T_CYR   the UTF-8 token is stored as is.
 *   T_LAT          the UTF-8 token is stored, passed through
 *                  IDX_strip_affixes() when StemCheck == 1, and lowercased
 *                  with strlower().
 *
 * Every stored entry fills PostInfo[n].key, .keyLen, .psgNum and .wordNum.
 * Word numbering starts at the external StartWordNum.
 *
 * Returns the number of PostInfo entries written. The function returns early
 * as soon as that count exceeds MAXPOSTINFOSIZE - 1 on the paths that check
 * it.
 *
 * StopCheck is not referenced in the body.
 *
 * NOTE(review): cnvt_res is assigned but never inspected, and u8str_len is
 * used right after each conversion -- presumably the converter always sets
 * it; confirm against this project's ConvertUTF32toUTF8 variant.
 */
int IDX_IndexByMA(char *SecVal, POSTINFO *PostInfo, int StopCheck) { int ret_tok, PostInfoCnt = 0, org_PostInfoCnt; UTF32 token[MAXTOKENLEN], u32_str[MAXTOKENLEN], *u32_ptr; UTF32 hconv_tok[MAXTOKENLEN]; UTF8 u8_str[MAXTOKENLEN], *u8_start_ptr, *u8_end_ptr; UTF8 original_word[MAXTOKENLEN]; int u8str_len; int token_len; JO_CHAR j_hanstr[MAXTOKENLEN]; int j_hanstr_len; int idx_num, i, j, k; JO_INDEX_WORD idx_words; ConversionResult cnvt_res; int wordNum = 1, max_wordNum = 0, org_wordNum; int old_psgNum, firstFlag = 1; extern int StemCheck; extern int HanjaFlag; extern int StartWordNum; wordNum = StartWordNum; InitTokenizer((unsigned char *) SecVal); while ((ret_tok = GetNextToken(token, &token_len, 0)) != -1) { if (token_len > 42) continue; u32_ptr = token; u8_start_ptr = (UTF8 *) u8_str; u8_end_ptr = (UTF8 *) &(u8_str[MAXTOKENLEN]); cnvt_res = ConvertUTF32toUTF8(&u32_ptr, &(token[token_len]), &u8_start_ptr, u8_end_ptr, strictConversion, &u8str_len); u8_str[u8str_len] = '\0'; /* remove stop words */ if (IDX_FindStopWord(u8_str)) continue; strcpy(original_word, u8_str); switch (ret_tok) { case T_HAN: /* Hangul */ /* UCS4 --> Johab */ j_hanstr_len = 0; for (i = 0; i < token_len; i++) { /* ucs2_to_johab((int) token[i], (int *) &(j_hanstr[j_hanstr_len].j_code)); */ j_hanstr[j_hanstr_len].j_code = ucs2_to_johab((int) token[i]); j_hanstr[j_hanstr_len].j_han.sign = 1; j_hanstr_len++; } org_wordNum = wordNum; org_PostInfoCnt = PostInfoCnt; max_wordNum = 0; /* run Johab (combination-type) word indexing */ idx_num = GetIndexFromOneWord(j_hanstr, j_hanstr_len, &idx_words, 1); // printf("\nIDX_ByMa\n"); if (idx_num > 0) { firstFlag = 1; for (j = 0; j < idx_words.nIndex; j++) { /* The duplicate-index-term removal step sets str_len to 0, so this check must be kept. 
*/ if (idx_words.IDX[j].str_len == 0) continue; /* Johab --> UCS4 */ for (k = 0; k < idx_words.IDX[j].str_len; k++) u32_str[k] = johab_to_ucs2((int)(idx_words.IDX[j].str[k].j_code)); /* UCS4 --> UTF-8 */ u32_ptr = u32_str; u8_start_ptr = (UTF8 *) u8_str; u8_end_ptr = (UTF8 *) &(u8_str[MAXTOKENLEN]); cnvt_res = ConvertUTF32toUTF8(&u32_ptr, &(u32_str[k]), &u8_start_ptr, u8_end_ptr, strictConversion, &u8str_len); u8_str[u8str_len] = '\0'; /* * 2003-06-02 stop-word handling * added stop-word handling for morphological-analysis results * stop words are filtered only for single nouns of two or more syllables */ if (idx_words.IDX[j].str_len >= 2 && idx_words.nIndex == 1) { /* remove stop words */ if (IDX_FindStopWord(u8_str)) continue; } strcpy(PostInfo[PostInfoCnt].key, (char *) u8_str); PostInfo[PostInfoCnt].keyLen = u8str_len; PostInfo[PostInfoCnt].psgNum = idx_words.IDX[j].loc; if (firstFlag) { firstFlag = 0; old_psgNum = PostInfo[PostInfoCnt].psgNum; } else { if (old_psgNum != PostInfo[PostInfoCnt].psgNum) { wordNum = org_wordNum; old_psgNum = PostInfo[PostInfoCnt].psgNum; } } if (max_wordNum < wordNum) max_wordNum = wordNum; PostInfo[PostInfoCnt].wordNum = wordNum++; PostInfoCnt++; if (PostInfoCnt > MAXPOSTINFOSIZE - 1) return PostInfoCnt; } if (max_wordNum != 0) wordNum = max_wordNum + 1; ///////////////////////////////////////////////////////////////// // also register the word itself as an index term (2005/02/14) ///////////////////////////////////////////////////////////////// for (k = org_PostInfoCnt; k < PostInfoCnt; k++) if (!strcmp(original_word, PostInfo[k].key)) break; if (k == PostInfoCnt) { strcpy(PostInfo[PostInfoCnt].key, (char *) original_word); PostInfo[PostInfoCnt].keyLen = strlen(original_word); PostInfo[PostInfoCnt].psgNum = 7777; PostInfo[PostInfoCnt].wordNum = org_wordNum; if (org_PostInfoCnt == PostInfoCnt) // word (eojeol) that produced no index terms... 
wordNum++; PostInfoCnt++; } ///////////////////////////////////////////////////////////////// } break; case T_CJK: /* Hanja (CJK ideographs) */ /* convert to Hangul unless the "keep Hanja as is" flag is set */ /* revised: Hanja flag 1 --> convert */ if (HanjaFlag == 1) { Hanja2Hangul_UCS4(token, token_len, hconv_tok); /* perform indexing */ /* UCS4 --> Johab */ j_hanstr_len = 0; for (i = 0; i < token_len; i++) { /* ucs2_to_johab((int) hconv_tok[i], (int *) &(j_hanstr[j_hanstr_len].j_code)); */ j_hanstr[j_hanstr_len].j_code = ucs2_to_johab((int) hconv_tok[i]); j_hanstr[j_hanstr_len].j_han.sign = 1; j_hanstr_len++; } org_wordNum = wordNum; max_wordNum = 0; /* run Johab (combination-type) word indexing */ idx_num = GetIndexFromOneWord(j_hanstr, j_hanstr_len, &idx_words, 1); if (idx_num > 0) { firstFlag = 1; for (j = 0; j < idx_words.nIndex; j++) { /* The duplicate-index-term removal step sets str_len to 0, so this check must be kept. */ if (idx_words.IDX[j].str_len == 0) continue; /* Johab --> UCS4 */ for (k = 0; k < idx_words.IDX[j].str_len; k++) u32_str[k] = johab_to_ucs2((int)(idx_words.IDX[j].str[k].j_code)); /* UCS4 --> UTF-8 */ u32_ptr = u32_str; u8_start_ptr = (UTF8 *) u8_str; u8_end_ptr = (UTF8 *) &(u8_str[MAXTOKENLEN]); cnvt_res = ConvertUTF32toUTF8(&u32_ptr, &(u32_str[k]), &u8_start_ptr, u8_end_ptr, strictConversion, &u8str_len); u8_str[u8str_len] = '\0'; strcpy(PostInfo[PostInfoCnt].key, (char *) u8_str); PostInfo[PostInfoCnt].keyLen = u8str_len; PostInfo[PostInfoCnt].psgNum = idx_words.IDX[j].loc; if (firstFlag) { firstFlag = 0; old_psgNum = PostInfo[PostInfoCnt].psgNum; } else { if (old_psgNum != PostInfo[PostInfoCnt].psgNum) { wordNum = org_wordNum; old_psgNum = PostInfo[PostInfoCnt].psgNum; } } if (max_wordNum < wordNum) max_wordNum = wordNum; PostInfo[PostInfoCnt].wordNum = wordNum++; PostInfoCnt++; if (PostInfoCnt > MAXPOSTINFOSIZE - 1) return PostInfoCnt; } if (max_wordNum != 0) wordNum = max_wordNum + 1; } } else { /* UCS4 --> UTF-8 */ u32_ptr = token; u8_start_ptr = (UTF8 *) u8_str; u8_end_ptr = (UTF8 *) &(u8_str[MAXTOKENLEN]); cnvt_res = ConvertUTF32toUTF8(&u32_ptr, 
&(token[token_len]), &u8_start_ptr, u8_end_ptr, strictConversion, &u8str_len); u8_str[u8str_len] = '\0'; if (strlen(u8_str) > MAXKEYLEN) break; strcpy(PostInfo[PostInfoCnt].key, (char *) u8_str); PostInfo[PostInfoCnt].keyLen = u8str_len; PostInfo[PostInfoCnt].psgNum = 1; PostInfo[PostInfoCnt].wordNum = wordNum++; PostInfoCnt++; if (PostInfoCnt > MAXPOSTINFOSIZE - 1) return PostInfoCnt; } break; case T_DIG: /* digits */ case T_CYR: /* Russian */ /* UCS4 --> UTF-8 */ /* u32_ptr = token; u8_start_ptr = (UTF8 *) u8_str; u8_end_ptr = (UTF8 *) &(u8_str[1024]); cnvt_res = ConvertUTF32toUTF8(&u32_ptr, &(token[token_len]), &u8_start_ptr, u8_end_ptr, strictConversion, &u8str_len); u8_str[u8str_len] = '\0'; */ if (strlen(u8_str) > MAXKEYLEN) break; strcpy(PostInfo[PostInfoCnt].key, (char *) u8_str); PostInfo[PostInfoCnt].keyLen = strlen(u8_str); PostInfo[PostInfoCnt].psgNum = 1; PostInfo[PostInfoCnt].wordNum = wordNum++; PostInfoCnt++; if (PostInfoCnt > MAXPOSTINFOSIZE - 1) return PostInfoCnt; break; case T_LAT: /* English */ /* UCS4 --> UTF-8 */ /* u32_ptr = token; u8_start_ptr = (UTF8 *) u8_str; u8_end_ptr = (UTF8 *) &(u8_str[1024]); cnvt_res = ConvertUTF32toUTF8(&u32_ptr, &(token[token_len]), &u8_start_ptr, u8_end_ptr, strictConversion, &u8str_len); u8_str[u8str_len] = '\0'; */ if (strlen(u8_str) > MAXKEYLEN) break; strcpy(PostInfo[PostInfoCnt].key, (char *) u8_str); /* English stemming */ if (StemCheck == 1) { IDX_strip_affixes(PostInfo[PostInfoCnt].key, &u8str_len); if (u8str_len <= 0) break; } PostInfo[PostInfoCnt].key[u8str_len] = '\0'; strlower(PostInfo[PostInfoCnt].key); PostInfo[PostInfoCnt].keyLen = u8str_len; PostInfo[PostInfoCnt].psgNum = 1; PostInfo[PostInfoCnt].wordNum = wordNum++; PostInfoCnt++; if (PostInfoCnt > MAXPOSTINFOSIZE - 1) return PostInfoCnt; break; default: break; } } return PostInfoCnt; }