const CompactTrieDictionary * ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) { UErrorCode status = U_ZERO_ERROR; // Open root from brkitr tree. char dictnbuff[256]; char ext[4]={'\0'}; UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status); int32_t dictnlength = 0; const UChar *dictfname = ures_getString(b, &dictnlength, &status); if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) { dictnlength = 0; status = U_BUFFER_OVERFLOW_ERROR; } if (U_SUCCESS(status) && dictfname) { UChar* extStart=u_strchr(dictfname, 0x002e); int len = 0; if(extStart!=NULL){ len = extStart-dictfname; u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff u_UCharsToChars(dictfname, dictnbuff, len); } dictnbuff[len]=0; // nul terminate } ures_close(b); UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status); if (U_SUCCESS(status)) { const CompactTrieDictionary *dict = new CompactTrieDictionary( file, status); if (U_SUCCESS(status) && dict == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } if (U_FAILURE(status)) { delete dict; dict = NULL; } return dict; } else if (dictfname != NULL){ //create dummy dict if dictionary filename not valid UChar c = 0x0020; status = U_ZERO_ERROR; MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE); mtd->addWord(&c, 1, status, 1); return new CompactTrieDictionary(*mtd, status); } return NULL; }
//---------------------------------------------------------------------------- // // main for genctd // //---------------------------------------------------------------------------- int main(int argc, char **argv) { UErrorCode status = U_ZERO_ERROR; const char *wordFileName; const char *outFileName; const char *outDir = NULL; const char *copyright = NULL; // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[0].doesOccur || options[1].doesOccur) { // -? or -h for help. usageAndDie(0); } if (!options[3].doesOccur || argc < 2) { fprintf(stderr, "input and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } outFileName = options[3].value; wordFileName = argv[1]; if (options[4].doesOccur) { u_setDataDirectory(options[4].value); } status = U_ZERO_ERROR; /* Combine the directory with the file name */ if(options[5].doesOccur) { outDir = options[5].value; } if (options[6].doesOccur) { copyright = U_COPYRIGHT_STRING; } #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status); return (int)status; #else /* Initialize ICU */ u_init(&status); if (U_FAILURE(status)) { fprintf(stderr, "%s: can not initialize ICU. status = %s\n", argv[0], u_errorName(status)); exit(1); } status = U_ZERO_ERROR; // // Read in the dictionary source file // long result; long wordFileSize; FILE *file; char *wordBufferC; MutableTrieDictionary *mtd = NULL; file = fopen(wordFileName, "rb"); if( file == 0 ) { //cannot find file //create 1-line dummy file: ie 1 char, 1 value UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); fprintf(stderr, "%s\n", msg); UChar c = 0x0020; mtd = new MutableTrieDictionary(c, status, TRUE); mtd->addWord(&c, 1, status, 1); } else { //read words in from input file fseek(file, 0, SEEK_END); wordFileSize = ftell(file); fseek(file, 0, SEEK_SET); wordBufferC = new char[wordFileSize+10]; result = (long)fread(wordBufferC, 1, wordFileSize, file); if (result != wordFileSize) { fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); exit (-1); } wordBufferC[wordFileSize]=0; fclose(file); // // Look for a Unicode Signature (BOM) on the word file // int32_t signatureLength; const char * wordSourceC = wordBufferC; const char* encoding = ucnv_detectUnicodeSignature( wordSourceC, wordFileSize, &signatureLength, &status); if (U_FAILURE(status)) { exit(status); } if(encoding!=NULL ){ wordSourceC += signatureLength; wordFileSize -= signatureLength; } // // Open a converter to take the rule file to UTF-16 // UConverter* conv; conv = ucnv_open(encoding, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // // Convert the words to UChar. // Preflight first to determine required buffer size. // uint32_t destCap = ucnv_toUChars(conv, NULL, // dest, 0, // destCapacity, wordSourceC, wordFileSize, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; status = U_ZERO_ERROR; UChar *wordSourceU = new UChar[destCap+1]; ucnv_toUChars(conv, wordSourceU, // dest, destCap+1, wordSourceC, wordFileSize, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; ucnv_close(conv); // Get rid of the original file buffer delete[] wordBufferC; // Create a MutableTrieDictionary, and loop through all the lines, inserting // words. // First, pick a median character. UChar *current = wordSourceU + (destCap/2); UChar uc = *current++; UnicodeSet breaks; breaks.add(0x000A); // Line Feed breaks.add(0x000D); // Carriage Return breaks.add(0x2028); // Line Separator breaks.add(0x2029); // Paragraph Separator do { // Look for line break while (uc && !breaks.contains(uc)) { uc = *current++; } // Now skip to first non-line-break while (uc && breaks.contains(uc)) { uc = *current++; } } while (uc && (breaks.contains(uc) || u_isspace(uc))); mtd = new MutableTrieDictionary(uc, status); if (U_FAILURE(status)) { fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Now add the words. Words are non-space characters at the beginning of // lines, and must be at least one UChar. If a word has an associated value, // the value should follow the word on the same line after a tab character. current = wordSourceU; UChar *candidate = current; uc = *current++; int32_t length = 0; int count = 0; while (uc) { while (uc && !u_isspace(uc)) { ++length; uc = *current++; } UnicodeString valueString; UChar candidateValue; if(uc == 0x0009){ //separator is a tab char, read in number after space while (uc && u_isspace(uc)) { uc = *current++; } while (uc && !u_isspace(uc)) { valueString.append(uc); uc = *current++; } } if (length > 0) { count++; if(valueString.length() > 0){ mtd->setValued(TRUE); uint32_t value = 0; char* s = new char[valueString.length()]; valueString.extract(0,valueString.length(), s, valueString.length()); int n = sscanf(s, "%ud", &value); U_ASSERT(n == 1); U_ASSERT(value >= 0); mtd->addWord(candidate, length, status, (uint16_t)value); delete[] s; } else { mtd->addWord(candidate, length, status); } if (U_FAILURE(status)) { fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", u_errorName(status), count); exit(status); } } // Find beginning of next line while (uc && !breaks.contains(uc)) { uc = *current++; } // Find next non-line-breaking character while (uc && breaks.contains(uc)) { uc = *current++; } candidate = current-1; length = 0; } // Get rid of the Unicode text buffer delete[] wordSourceU; } // Now, create a CompactTrieDictionary from the mutable dictionary CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); if (U_FAILURE(status)) { fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Get rid of the MutableTrieDictionary delete mtd; // // Get the binary data from the dictionary. // uint32_t outDataSize = ctd->dataSize(); const uint8_t *outData = (const uint8_t *)ctd->data(); // // Create the output file // size_t bytesWritten; UNewDataMemory *pData; pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", outFileName, u_errorName(status)); exit(status); } // Write the data itself. udata_writeBlock(pData, outData, outDataSize); // finish up bytesWritten = udata_finish(pData, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status)); exit(status); } if (bytesWritten != outDataSize) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(-1); } // Get rid of the CompactTrieDictionary delete ctd; u_cleanup(); printf("genctd: tool completed successfully.\n"); return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }