bool icu_regex_traits::isctype(char_type c, char_class_type f) const
{
   // Returns true when code point c belongs to at least one of the
   // character classes selected by the bits of f.
   //
   // The general category reported by u_charType() is turned into a
   // single bit and tested against f first; the remaining classes are
   // "special" masks that need an explicit predicate each.
   const char_class_type category_bit = char_class_type(1u << u_charType(c));
   if((category_bit & f) != 0)
      return true;

   // Special classes, tested only when the corresponding mask is requested:
   if((f & mask_blank) != 0)
   {
      if(u_isblank(c))
         return true;
   }
   if((f & mask_space) != 0)
   {
      if(u_isspace(c))
         return true;
   }
   if((f & mask_xdigit) != 0)
   {
      if(u_digit(c, 16) >= 0)
         return true;
   }
   if((f & mask_unicode) != 0)
   {
      if(c >= 0x100)
         return true;
   }
   if((f & mask_underscore) != 0)
   {
      if(c == '_')
         return true;
   }
   if((f & mask_any) != 0)
   {
      if(c <= 0x10FFFF)
         return true;
   }
   if((f & mask_ascii) != 0)
   {
      if(c <= 0x7F)
         return true;
   }
   if((f & mask_vertical) != 0)
   {
      // Vertical space: a separator, a vertical tab, or a character whose
      // category bit is exactly the line/paragraph separator mask.
      const bool is_vertical = ::boost::re_detail::is_separator(c)
         || (c == static_cast<char_type>('\v'))
         || (category_bit == U_GC_ZL_MASK)
         || (category_bit == U_GC_ZP_MASK);
      if(is_vertical)
         return true;
   }
   if((f & mask_horizontal) != 0)
   {
      // Horizontal space: whitespace that is neither a separator nor a vertical tab.
      const bool is_horizontal = !::boost::re_detail::is_separator(c)
         && u_isspace(c)
         && (c != static_cast<char_type>('\v'));
      if(is_horizontal)
         return true;
   }
   return false;
}
/*
 * Parse a pointer value written as hexadecimal digits in a UChar buffer.
 *
 * buffer  The text to parse.
 * len     In: the number of UChars that may be read from buffer.
 *         Out: the number of UChars consumed (leading zeros/whitespace
 *         plus hex digits).
 *
 * Returns the parsed pointer, or NULL when no digits were found.  When more
 * digits are present than fit in a pointer, only the trailing
 * sizeof(void*)*NIBBLE_PER_BYTE digits contribute to the result.
 */
void *
ufmt_utop(const UChar *buffer, int32_t *len)
{
    int32_t count, resultIdx, incVal, offset;
    /* This union allows the pointer to be written as an array. */
    union {
        void *ptr;
        uint8_t bytes[sizeof(void*)];
    } result;

    /* initialize variables */
    count = 0;
    offset = 0;
    result.ptr = NULL;

    /* Skip the leading zeros and whitespace.
       The bound is tested first so buffer[count] is never read past *len. */
    while(count < *len
        && (buffer[count] == DIGIT_0 || u_isspace(buffer[count])))
    {
        count++;
        offset++;
    }

    /* iterate through buffer, stop when you hit the end.
       The bound must be checked BEFORE the element is read. */
    while(count < *len && ufmt_isdigit(buffer[count], 16)) {
        /* increment the count consumed */
        ++count;
    }

    /* detect overflow: keep only as many trailing digits as fit in a pointer */
    if (count - offset > (int32_t)(sizeof(void*)*NIBBLE_PER_BYTE)) {
        offset = count - (int32_t)(sizeof(void*)*NIBBLE_PER_BYTE);
    }

    /* Initialize the direction of the input: digits are consumed from the
       least significant end, so the byte index starts at the least
       significant byte for the platform's endianness. */
#if U_IS_BIG_ENDIAN
    incVal = -1;
    resultIdx = (int32_t)(sizeof(void*) - 1);
#else
    incVal = 1;
    resultIdx = 0;
#endif

    /* Write how much was consumed. */
    *len = count;
    while(--count >= offset) {
        /* Get the first (low) nibble of the byte */
        uint8_t byte = (uint8_t)ufmt_digitvalue(buffer[count]);

        if (count > offset) {
            /* Get the second (high) nibble of the byte when available */
            byte = (uint8_t)(byte + (ufmt_digitvalue(buffer[--count]) << 4));
        }
        /* Write the byte into the array */
        result.bytes[resultIdx] = byte;
        resultIdx += incVal;
    }

    return result.ptr;
}
/*
 * Advance the tokenizer to the next word and copy it into `word`.
 *
 * Segments that consist of a single punctuation character, or that contain
 * a whitespace character, are skipped.  The accepted segment is converted
 * with u_austrncpy into `word` (at most size-1 characters, always
 * NUL-terminated) and a diagnostic line is printed.
 *
 * Returns 0 when a word was produced, -1 when the break iterator is done.
 */
int tokenizer_next( tokenizer_t *t, char *word, size_t size )
{
    UChar endChar;
    int   idx;
    int   skip;

    /* First call: position the iterator at the start of the text */
    if( t->end == 0 ) {
        t->start = ubrk_first(t->boundary);
    }

    for( ;; ) {
        /* Find next segment */
        t->end = ubrk_next(t->boundary);
        if( t->end == UBRK_DONE ) {
            return -1;
        }

        /* Temporarily NUL-terminate the segment in place */
        endChar = t->str[t->end];
        t->str[t->end] = 0;

        /* Skip punctuation: a one-character segment that is punctuation */
        skip = ( t->end - t->start == 1 && u_ispunct( t->str[t->start] ) );

        /* Skip whitespace: any segment containing a whitespace character */
        for( idx = t->start; !skip && idx < t->end; idx++ ) {
            if( u_isspace( t->str[idx] ) == 1 ) {
                skip = 1;
            }
        }

        if( !skip ) {
            break;
        }

        /* Rejected: restore the overwritten character and try the next segment */
        t->str[t->end] = endChar;
        t->start = t->end;
    }

    /* Copy to C buffer */
    u_austrncpy(word, t->str+t->start, size-1);
    word[size-1] = 0;

    printf("string[%2d..%2d] \"%s\" %d\n", t->start, t->end-1, word, u_isspace( t->str[t->start]));

    /* Restore the overwritten character and remember where the next segment begins */
    t->str[t->end] = endChar;
    t->start = t->end;

    return 0;
}
// Reads one line from f into fileLine, with any '#' comment, trailing
// CR/LF and trailing whitespace removed.
//
// fileLine is set as a read-only alias of the buffer returned by
// ucbuf_readline (setTo with isTerminated=FALSE), so no characters are copied.
//
// Returns FALSE when ucbuf_readline yields no line or errorCode reports
// failure; TRUE otherwise.
static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) {
    int32_t remaining;
    const UChar *text = ucbuf_readline(f, &remaining, errorCode);
    if(text == NULL || errorCode.isFailure()) {
        return FALSE;
    }

    const UChar *hash = u_memchr(text, 0x23, remaining);  // '#'
    if(hash == NULL) {
        // No comment: drop the line terminator characters at the end.
        for(; remaining > 0; --remaining) {
            const UChar last = text[remaining - 1];
            if(last != CARRIAGE_RETURN_CHARACTER && last != LINEFEED_CHARACTER) {
                break;
            }
        }
    } else {
        // Everything from the '#' onwards is a comment.
        remaining = (int32_t)(hash - text);
    }

    // Drop trailing whitespace in front of the comment / line end.
    for(; remaining > 0 && u_isspace(text[remaining - 1]); --remaining) {}

    fileLine.setTo(FALSE, text, remaining);
    return TRUE;
}
bool ICUTransService::isSpace(const XMLCh toCheck) const { // // <TBD> // For now, we short circuit some of the control chars because ICU // is not correctly reporting them as space. Later, when they change // this, we can get rid of this special case. // if ((toCheck == 0x09) || (toCheck == 0x0A) || (toCheck == 0x0D)) { return true; } return (u_isspace(UChar(toCheck)) != 0); }
/// Compares leading/trailing whitespace and the trailing newline of
/// @a source and @a translation.
///
/// The first mismatch found is recorded on @a item as a warning.
///
/// @return true if an issue was set on the item, false otherwise
///         (including when either string is empty).
bool CheckString(CatalogItemPtr item, const wxString& source, const wxString& translation) override
{
    // Nothing to compare against; this also keeps operator[] and Last()
    // below from being used on an empty string.
    if (source.empty() || translation.empty())
        return false;

    if (u_isspace(source[0]) && !u_isspace(translation[0]))
    {
        item->SetIssue(CatalogItem::Issue::Warning, _(L"The translation doesn’t start with a space."));
        return true;
    }

    if (!u_isspace(source[0]) && u_isspace(translation[0]))
    {
        item->SetIssue(CatalogItem::Issue::Warning, _(L"The translation starts with a space, but the source text doesn’t."));
        return true;
    }

    // Newlines are checked before generic whitespace so that the more
    // specific message wins.
    if (source.Last() == '\n' && translation.Last() != '\n')
    {
        item->SetIssue(CatalogItem::Issue::Warning, _(L"The translation is missing a newline at the end."));
        return true;
    }

    if (source.Last() != '\n' && translation.Last() == '\n')
    {
        item->SetIssue(CatalogItem::Issue::Warning, _(L"The translation ends with a newline, but the source text doesn’t."));
        return true;
    }

    if (u_isspace(source.Last()) && !u_isspace(translation.Last()))
    {
        item->SetIssue(CatalogItem::Issue::Warning, _(L"The translation is missing a space at the end."));
        return true;
    }

    if (!u_isspace(source.Last()) && u_isspace(translation.Last()))
    {
        item->SetIssue(CatalogItem::Issue::Warning, _(L"The translation ends with a space, but the source text doesn’t."));
        return true;
    }

    return false;
}
void CountWords(const UnicodeString& ustr, size_t& cnt, size_t& ctrl_cnt, size_t& sp_cnt) { UErrorCode status = U_ZERO_ERROR; boost::scoped_ptr<BreakIterator> bi ( BreakIterator::createWordInstance(Locale::getDefault(), status) ); bi->setText(ustr); int32_t i = bi->first(); while (i < ustr.length()) { ++cnt; UChar32 ch = ustr.char32At(i); if (u_iscntrl(ch)) ++ctrl_cnt; else if(u_isspace(ch)) ++sp_cnt; i = bi->next(); } }
static double collator_u_strtod(const UChar *nptr, UChar **endptr) { const UChar *u = nptr, *nstart; UChar c = *u; int any = 0; while (u_isspace(c)) { c = *++u; } nstart = u; if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) { c = *++u; } while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) { any = 1; c = *++u; } if (c == 0x2E /*'.'*/) { c = *++u; while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) { any = 1; c = *++u; } } if ((c == 0x65 /*'e'*/ || c == 0x45 /*'E'*/) && any) { const UChar *e = u; int any_exp = 0; c = *++u; if (c == 0x2D /*'-'*/ || c == 0x2B /*'+'*/) { c = *++u; } while (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/) { any_exp = 1; c = *++u; } if (!any_exp) { u = e; } } if (any) { char buf[64], *numbuf, *bufpos; int length = u - nstart; double value; if (length < (int)sizeof(buf)) { numbuf = buf; } else { numbuf = (char *) malloc(length + 1); } bufpos = numbuf; while (nstart < u) { *bufpos++ = (char) *nstart++; } *bufpos = '\0'; value = zend_strtod(numbuf, nullptr); if (numbuf != buf) { free(numbuf); } if (endptr != nullptr) { *endptr = (UChar *)u; } return value; } if (endptr != nullptr) { *endptr = (UChar *)nptr; } return 0; }
/*
 * Convert a NUL-terminated UChar string to a long, in the manner of strtol.
 *
 * nptr    The string to parse. When null, errno is set to ERANGE, *endptr
 *         (if given) is set to null and 0 is returned.
 * endptr  If not null, receives a pointer to the first character after the
 *         consumed digits, or nptr when no digit was consumed.
 * base    Numeric base. 0 selects hexadecimal for a "0x"/"0X" prefix, octal
 *         for a leading '0', decimal otherwise; 16 also accepts the
 *         "0x"/"0X" prefix.
 *
 * Returns the converted value. On overflow the result is LONG_MAX (or
 * LONG_MIN for negative input) and errno is set to ERANGE.
 */
static long collator_u_strtol(const UChar *nptr, UChar **endptr, int base)
{
	const UChar *s = nptr;
	unsigned long acc;
	UChar c;
	unsigned long cutoff;
	int neg = 0, any, cutlim;

	if (s == nullptr) {
		errno = ERANGE;
		if (endptr != nullptr) {
			*endptr = nullptr;
		}
		return 0;
	}

	/*
	 * Skip white space and pick up leading +/- sign if any.
	 * If base is 0, allow 0x for hex and 0 for octal, else
	 * assume decimal; if base is already 16, allow 0x.
	 */
	do {
		c = *s++;
	} while (u_isspace(c));
	if (c == 0x2D /*'-'*/) {
		neg = 1;
		c = *s++;
	} else if (c == 0x2B /*'+'*/)
		c = *s++;
	/* c holds the current character; s already points one past it. */
	if ((base == 0 || base == 16) &&
	    (c == 0x30 /*'0'*/)
		 && (*s == 0x78 /*'x'*/ || *s == 0x58 /*'X'*/)) {
		c = s[1];
		s += 2;
		base = 16;
	}
	if (base == 0)
		base = (c == 0x30 /*'0'*/) ? 8 : 10;

	/*
	 * Compute the cutoff value between legal numbers and illegal
	 * numbers.  That is the largest legal value, divided by the
	 * base.  An input number that is greater than this value, if
	 * followed by a legal input character, is too big.  One that
	 * is equal to this value may be valid or not; the limit
	 * between valid and invalid numbers is then based on the last
	 * digit.  For instance, if the range for longs is
	 * [-2147483648..2147483647] and the input base is 10,
	 * cutoff will be set to 214748364 and cutlim to either
	 * 7 (neg==0) or 8 (neg==1), meaning that if we have accumulated
	 * a value > 214748364, or equal but the next digit is > 7 (or 8),
	 * the number is too big, and we will return a range error.
	 *
	 * Set any if any `digits' consumed; make it negative to indicate
	 * overflow.
	 */
	cutoff = neg ? -(unsigned long)LONG_MIN : LONG_MAX;
	cutlim = cutoff % (unsigned long)base;
	cutoff /= (unsigned long)base;
	for (acc = 0, any = 0;; c = *s++) {
		/* Map the character to its digit value; stop at the first non-digit. */
		if (c >= 0x30 /*'0'*/ && c <= 0x39 /*'9'*/)
			c -= 0x30 /*'0'*/;
		else if (c >= 0x41 /*'A'*/ && c <= 0x5A /*'Z'*/)
			c -= 0x41 /*'A'*/ - 10;
		else if (c >= 0x61 /*'a'*/ && c <= 0x7A /*'z'*/)
			c -= 0x61 /*'a'*/ - 10;
		else
			break;
		if (c >= base)
			break;

		/* Once overflow is detected, digits are still consumed but no
		   longer accumulated. */
		if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
			any = -1;
		else {
			any = 1;
			acc *= base;
			acc += c;
		}
	}
	if (any < 0) {
		acc = neg ? LONG_MIN : LONG_MAX;
		errno = ERANGE;
	} else if (neg)
		acc = -acc;
	/* s is one past the character that ended the loop, hence s - 1. */
	if (endptr != nullptr)
		*endptr = (UChar *)(any ? s - 1 : nptr);
	return (acc);
}
/*
 * Returns 1 when codepoint belongs to at least one character class
 * selected in flags, 0 otherwise.
 *
 * With ICU the answer comes from the ICU character property functions.
 * Without ICU, codepoints below 256 are answered from
 * Parrot_iso_8859_1_typetable and a few classes are approximated for the
 * rest; any other class requested for a codepoint >= 256 throws
 * EXCEPTION_LIBRARY_ERROR.
 */
static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
{
    ASSERT_ARGS(u_iscclass)
#if PARROT_HAS_ICU
    UNUSED(interp);
    /* XXX which one
       return u_charDigitValue(codepoint);
     */
    if (((flags & enum_cclass_uppercase)    && u_isupper(codepoint))
    ||  ((flags & enum_cclass_lowercase)    && u_islower(codepoint))
    ||  ((flags & enum_cclass_alphabetic)   && u_isalpha(codepoint))
    ||  ((flags & enum_cclass_numeric)      && u_isdigit(codepoint))
    ||  ((flags & enum_cclass_hexadecimal)  && u_isxdigit(codepoint))
    ||  ((flags & enum_cclass_whitespace)   && u_isspace(codepoint))
    ||  ((flags & enum_cclass_printing)     && u_isprint(codepoint))
    ||  ((flags & enum_cclass_graphical)    && u_isgraph(codepoint))
    ||  ((flags & enum_cclass_blank)        && u_isblank(codepoint))
    ||  ((flags & enum_cclass_control)      && u_iscntrl(codepoint))
    ||  ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint))
    ||  ((flags & enum_cclass_word)
            && (u_isalnum(codepoint) || codepoint == '_'))
    ||  ((flags & enum_cclass_newline)
            && (codepoint == 0x2028 || codepoint == 0x2029
                || u_hasBinaryProperty(codepoint, UCHAR_LINE_BREAK))))
        return 1;

    return 0;
#else
    if (codepoint < 256)
        return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;

    if (flags == enum_cclass_any)
        return 1;

    /* All codepoints from u+0100 to u+02af are alphabetic, so we
     * cheat on the WORD and ALPHABETIC properties to include these
     * (and incorrectly exclude all others).  This is a stopgap until
     * ICU is everywhere, or we have better non-ICU unicode support. */
    if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
        return (codepoint < 0x2b0);

    if (flags & enum_cclass_whitespace) {
        /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
        if (codepoint == 0x1680
        ||  codepoint == 0x180e
        || (codepoint >= 0x2000 && codepoint <= 0x200a)
        ||  codepoint == 0x2028
        ||  codepoint == 0x2029
        ||  codepoint == 0x202f
        ||  codepoint == 0x205f
        ||  codepoint == 0x3000)
            return 1;
    }

    if (flags & enum_cclass_numeric) {
        /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
         * Each entry is the first codepoint of a run of ten digits. */
        static const UINTVAL digit_run_start[] = {
            0x0660, 0x06f0, 0x07c0, 0x0966, 0x09e6, 0x0a66, 0x0ae6, 0x0b66,
            0x0be6, 0x0c66, 0x0ce6, 0x0d66, 0x0e50, 0x0ed0, 0x0f20, 0x1040,
            0x17e0, 0x1810, 0x1946, 0x19d0, 0x1b50, 0xff10
        };
        unsigned int run;

        for (run = 0; run < sizeof (digit_run_start) / sizeof (digit_run_start[0]); run++) {
            if (codepoint >= digit_run_start[run]
            &&  codepoint <= digit_run_start[run] + 9)
                return 1;
        }
    }

    if (flags & enum_cclass_newline) {
        /* from http://www.unicode.org/Public/UNIDATA/extracted/DerivedLineBreak.txt
         * Line_Break=Mandatory_Break */
        if (codepoint == 0x2028 || codepoint == 0x2029)
            return 1;
    }

    /* Any class other than the three approximated above cannot be answered
     * without ICU. */
    if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline))
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_LIBRARY_ERROR,
            "no ICU lib loaded");

    return 0;
#endif
}
//---------------------------------------------------------------------------- // // main for genctd // //---------------------------------------------------------------------------- int main(int argc, char **argv) { UErrorCode status = U_ZERO_ERROR; const char *wordFileName; const char *outFileName; const char *outDir = NULL; const char *copyright = NULL; // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[0].doesOccur || options[1].doesOccur) { // -? or -h for help. usageAndDie(0); } if (!options[3].doesOccur || argc < 2) { fprintf(stderr, "input and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } outFileName = options[3].value; wordFileName = argv[1]; if (options[4].doesOccur) { u_setDataDirectory(options[4].value); } status = U_ZERO_ERROR; /* Combine the directory with the file name */ if(options[5].doesOccur) { outDir = options[5].value; } if (options[6].doesOccur) { copyright = U_COPYRIGHT_STRING; } #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &status); return (int)status; #else /* Initialize ICU */ u_init(&status); if (U_FAILURE(status)) { fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", argv[0], u_errorName(status)); exit(1); } status = U_ZERO_ERROR; // // Read in the dictionary source file // long result; long wordFileSize; FILE *file; char *wordBufferC; MutableTrieDictionary *mtd = NULL; file = fopen(wordFileName, "rb"); if( file == 0 ) { //cannot find file //create 1-line dummy file: ie 1 char, 1 value UNewDataMemory *pData; char msg[1024]; /* write message with just the name */ sprintf(msg, "%s not found, genctd writes dummy %s", wordFileName, outFileName); fprintf(stderr, "%s\n", msg); UChar c = 0x0020; mtd = new MutableTrieDictionary(c, status, TRUE); mtd->addWord(&c, 1, status, 1); } else { //read words in from input file fseek(file, 0, SEEK_END); wordFileSize = ftell(file); fseek(file, 0, SEEK_SET); wordBufferC = new char[wordFileSize+10]; result = (long)fread(wordBufferC, 1, wordFileSize, file); if (result != wordFileSize) { fprintf(stderr, "Error reading file \"%s\"\n", wordFileName); exit (-1); } wordBufferC[wordFileSize]=0; fclose(file); // // Look for a Unicode Signature (BOM) on the word file // int32_t signatureLength; const char * wordSourceC = wordBufferC; const char* encoding = ucnv_detectUnicodeSignature( wordSourceC, wordFileSize, &signatureLength, &status); if (U_FAILURE(status)) { exit(status); } if(encoding!=NULL ){ wordSourceC += signatureLength; wordFileSize -= signatureLength; } // // Open a converter to take the rule file to UTF-16 // UConverter* conv; conv = ucnv_open(encoding, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // // Convert the words to UChar. // Preflight first to determine required buffer size. 
// uint32_t destCap = ucnv_toUChars(conv, NULL, // dest, 0, // destCapacity, wordSourceC, wordFileSize, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; status = U_ZERO_ERROR; UChar *wordSourceU = new UChar[destCap+1]; ucnv_toUChars(conv, wordSourceU, // dest, destCap+1, wordSourceC, wordFileSize, &status); if (U_FAILURE(status)) { fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); exit(status); }; ucnv_close(conv); // Get rid of the original file buffer delete[] wordBufferC; // Create a MutableTrieDictionary, and loop through all the lines, inserting // words. // First, pick a median character. UChar *current = wordSourceU + (destCap/2); UChar uc = *current++; UnicodeSet breaks; breaks.add(0x000A); // Line Feed breaks.add(0x000D); // Carriage Return breaks.add(0x2028); // Line Separator breaks.add(0x2029); // Paragraph Separator do { // Look for line break while (uc && !breaks.contains(uc)) { uc = *current++; } // Now skip to first non-line-break while (uc && breaks.contains(uc)) { uc = *current++; } } while (uc && (breaks.contains(uc) || u_isspace(uc))); mtd = new MutableTrieDictionary(uc, status); if (U_FAILURE(status)) { fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Now add the words. Words are non-space characters at the beginning of // lines, and must be at least one UChar. If a word has an associated value, // the value should follow the word on the same line after a tab character. 
current = wordSourceU; UChar *candidate = current; uc = *current++; int32_t length = 0; int count = 0; while (uc) { while (uc && !u_isspace(uc)) { ++length; uc = *current++; } UnicodeString valueString; UChar candidateValue; if(uc == 0x0009){ //separator is a tab char, read in number after space while (uc && u_isspace(uc)) { uc = *current++; } while (uc && !u_isspace(uc)) { valueString.append(uc); uc = *current++; } } if (length > 0) { count++; if(valueString.length() > 0){ mtd->setValued(TRUE); uint32_t value = 0; char* s = new char[valueString.length()]; valueString.extract(0,valueString.length(), s, valueString.length()); int n = sscanf(s, "%ud", &value); U_ASSERT(n == 1); U_ASSERT(value >= 0); mtd->addWord(candidate, length, status, (uint16_t)value); delete[] s; } else { mtd->addWord(candidate, length, status); } if (U_FAILURE(status)) { fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\" at line %d in input file\n", u_errorName(status), count); exit(status); } } // Find beginning of next line while (uc && !breaks.contains(uc)) { uc = *current++; } // Find next non-line-breaking character while (uc && breaks.contains(uc)) { uc = *current++; } candidate = current-1; length = 0; } // Get rid of the Unicode text buffer delete[] wordSourceU; } // Now, create a CompactTrieDictionary from the mutable dictionary CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status); if (U_FAILURE(status)) { fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status)); exit(status); } // Get rid of the MutableTrieDictionary delete mtd; // // Get the binary data from the dictionary. 
// uint32_t outDataSize = ctd->dataSize(); const uint8_t *outData = (const uint8_t *)ctd->data(); // // Create the output file // size_t bytesWritten; UNewDataMemory *pData; pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n", outFileName, u_errorName(status)); exit(status); } // Write the data itself. udata_writeBlock(pData, outData, outDataSize); // finish up bytesWritten = udata_finish(pData, &status); if(U_FAILURE(status)) { fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status)); exit(status); } if (bytesWritten != outDataSize) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(-1); } // Get rid of the CompactTrieDictionary delete ctd; u_cleanup(); printf("genctd: tool completed successfully.\n"); return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }
//---------------------------------------------------------------------------- // // main for gendict // //---------------------------------------------------------------------------- int main(int argc, char **argv) { // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) { // -? or -h for help. usageAndDie(U_ZERO_ERROR); } UBool verbose = options[ARG_VERBOSE].doesOccur; if (argc < 3) { fprintf(stderr, "input and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } const char *outFileName = argv[2]; const char *wordFileName = argv[1]; startTime = uprv_getRawUTCtime(); // initialize start timer if (options[ARG_ICUDATADIR].doesOccur) { u_setDataDirectory(options[ARG_ICUDATADIR].value); } const char *copyright = NULL; if (options[ARG_COPYRIGHT].doesOccur) { copyright = U_COPYRIGHT_STRING; } if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) { fprintf(stderr, "you must specify exactly one type of trie to output!\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } UBool isBytesTrie = options[ARG_BYTES].doesOccur; if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) { fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } IcuToolErrorCode status("gendict/main()"); #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO const char* outDir=NULL; UNewDataMemory *pData; char msg[1024]; UErrorCode tempstatus = U_ZERO_ERROR; /* write message with just the name */ // potential for a buffer overflow here... 
sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &tempstatus); return (int)tempstatus; #else // Read in the dictionary source file if (verbose) { printf("Opening file %s...\n", wordFileName); } const char *codepage = "UTF-8"; UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status); if (status.isFailure()) { fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName()); exit(status.reset()); } if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); } DataDict dict(isBytesTrie, status); if (status.isFailure()) { fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName()); exit(status.reset()); } if (options[ARG_TRANSFORM].doesOccur) { dict.setTransform(options[ARG_TRANSFORM].value); } UnicodeString fileLine; if (verbose) { puts("Adding words to dictionary..."); } UBool hasValues = FALSE; UBool hasValuelessContents = FALSE; int lineCount = 0; int wordCount = 0; int minlen = 255; int maxlen = 0; UBool isOk = TRUE; while (readLine(f, fileLine, status)) { lineCount++; if (fileLine.isEmpty()) continue; // Parse word [spaces value]. 
int32_t keyLen; for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {} if (keyLen == 0) { fprintf(stderr, "Error: no word on line %i!\n", lineCount); isOk = FALSE; continue; } int32_t valueStart; for (valueStart = keyLen; valueStart < fileLine.length() && u_isspace(fileLine[valueStart]); ++valueStart) {} if (keyLen < valueStart) { int32_t valueLength = fileLine.length() - valueStart; if (valueLength > 15) { fprintf(stderr, "Error: value too long on line %i!\n", lineCount); isOk = FALSE; continue; } char s[16]; fileLine.extract(valueStart, valueLength, s, 16, US_INV); char *end; unsigned long value = uprv_strtoul(s, &end, 0); if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) { fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount); isOk = FALSE; continue; } dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status); hasValues = TRUE; wordCount++; if (keyLen < minlen) minlen = keyLen; if (keyLen > maxlen) maxlen = keyLen; } else { dict.addWord(fileLine.tempSubString(0, keyLen), 0, status); hasValuelessContents = TRUE; wordCount++; if (keyLen < minlen) minlen = keyLen; if (keyLen > maxlen) maxlen = keyLen; } if (status.isFailure()) { fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n", status.errorName(), lineCount); exit(status.reset()); } } if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); } if (!isOk && status.isSuccess()) { status.set(U_ILLEGAL_ARGUMENT_ERROR); } if (hasValues && hasValuelessContents) { fprintf(stderr, "warning: file contained both valued and unvalued strings!\n"); } if (verbose) { printf("Serializing data...isBytesTrie? 
%d\n", isBytesTrie); } int32_t outDataSize; const void *outData; UnicodeString usp; if (isBytesTrie) { StringPiece sp = dict.serializeBytes(status); outDataSize = sp.size(); outData = sp.data(); } else { dict.serializeUChars(usp, status); outDataSize = usp.length() * U_SIZEOF_UCHAR; outData = usp.getBuffer(); } if (status.isFailure()) { fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName()); exit(status.reset()); } if (verbose) { puts("Opening output file..."); } UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status); if (status.isFailure()) { fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName()); exit(status.reset()); } if (verbose) { puts("Writing to output file..."); } int32_t indexes[DictionaryData::IX_COUNT] = { DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0 }; int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; indexes[DictionaryData::IX_RESERVED1_OFFSET] = size; indexes[DictionaryData::IX_RESERVED2_OFFSET] = size; indexes[DictionaryData::IX_TOTAL_SIZE] = size; indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? 
DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS; if (hasValues) { indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES; } indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform(); udata_writeBlock(pData, indexes, sizeof(indexes)); udata_writeBlock(pData, outData, outDataSize); size_t bytesWritten = udata_finish(pData, status); if (status.isFailure()) { fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName()); exit(status.reset()); } if (bytesWritten != (size_t)size) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(U_INTERNAL_PROGRAM_ERROR); } printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); #ifdef TEST_GENDICT if (isBytesTrie) { BytesTrie::Iterator it(outData, outDataSize, status); while (it.hasNext()) { it.next(status); const StringPiece s = it.getString(); int32_t val = it.getValue(); printf("%s -> %i\n", s.data(), val); } } else { UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status); while (it.hasNext()) { it.next(status); const UnicodeString s = it.getString(); int32_t val = it.getValue(); char tmp[1024]; s.extract(0, s.length(), tmp, 1024); printf("%s -> %i\n", tmp, val); } } #endif return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }
/*
 * Exported wrapper that forwards its argument to u_isspace() and returns
 * the result unchanged.
 * NOTE(review): the _4_0 suffix presumably marks an ICU 4.0 versioned
 * symbol name - confirm against the build's symbol renaming scheme.
 *
 * imp: common/uchar.c
 * hdr: common/unicode/uchar.h
 * @stable ICU 2.0
 */
U_CAPI UBool U_EXPORT2
u_isspace_4_0(UChar32 c)
{
    return u_isspace(c);
}
/*
** Extract the next token from a tokenization cursor.
**
** The break iterator is advanced one segment at a time.  Leading
** whitespace is trimmed from each segment and segments that end up empty
** are skipped.  The remaining text is converted to UTF-8 into the cursor's
** buffer, which is grown when it is too small.
**
** Returns SQLITE_OK with the OUT parameters filled in, SQLITE_DONE when the
** iterator is exhausted, or SQLITE_NOMEM when the buffer cannot be grown.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Tokenization cursor */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  /* iStart==iEnd holds on entry, so the body always runs at least once. */
  do {
    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    /* Step over leading whitespace, one code point at a time. */
    while( iStart<iEnd ){
      UChar32 c;
      int iAfter = iStart;
      U16_NEXT(pCsr->aChar, iAfter, pCsr->nChar, c);
      if( !u_isspace(c) ){
        break;
      }
      iStart = iAfter;
    }
    assert(iStart<=iEnd);
  } while( iStart==iEnd );

  /* Convert; when the result did not fit, nByte holds the size needed and
  ** the conversion is repeated with a buffer of that size. */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
// Computes line-start positions for wrapping `input` to roughly
// `maxCharsPerLine` UTF-16 units per line.
//
// Starting from the current line start, the function looks
// `maxCharsPerLine` units ahead, skips whitespace, control characters and
// non-spacing marks, and asks the word break iterator for the last break
// before that point. If no break lies after the current line start, a hard
// break is made at the look-ahead position.
//
// Returns the positions at which new lines begin; when no break was needed
// the result contains the single entry 0.
OSMAND_CORE_API QVector<int> OSMAND_CORE_CALL OsmAnd::ICU::getTextWrapping(const QString& input, const int maxCharsPerLine)
{
    assert(maxCharsPerLine > 0);

    QVector<int> result;
    UErrorCode icuError = U_ZERO_ERROR;
    bool ok = true;

    // Create break iterator
    const auto pBreakIterator = g_pIcuWordBreakIterator->clone();
    if(pBreakIterator == nullptr || !U_SUCCESS(icuError))
    {
        LogPrintf(LogSeverityLevel::Error, "ICU error: %d", icuError);
        if(pBreakIterator != nullptr)
            delete pBreakIterator;
        return (result << 0);
    }

    // Set text for breaking
    pBreakIterator->setText(UnicodeString(reinterpret_cast<const UChar*>(input.unicode()), input.length()));

    // True for characters a line should neither end its look-ahead on nor
    // start with: whitespace, control characters and non-spacing marks.
    const auto isSkippable =
        [&input]
        (const int position) -> bool
        {
            const auto c = static_cast<UChar>(input[position].unicode());
            return u_isspace(c) || u_charType(c) == U_CONTROL_CHAR || u_charType(c) == U_NON_SPACING_MARK;
        };

    auto cursor = 0;
    while(ok && cursor < input.length())
    {
        // Get next desired breaking position; the rest fits on one line
        // when it reaches the end of input
        auto lookAheadCursor = cursor + maxCharsPerLine;
        if(lookAheadCursor >= input.length())
            break;

        // Move forward until a valuable character is found
        while(lookAheadCursor < input.length() && isSkippable(lookAheadCursor))
            lookAheadCursor++;

        // Locate last legal word-break at or before the look-ahead cursor
        const auto lastBreak = pBreakIterator->preceding(lookAheadCursor + 1);

        if(lastBreak > cursor)
        {
            // A legal word-break was found: move there, then place the line
            // start on the next valuable character
            cursor = lastBreak;
            while(cursor < input.length() && isSkippable(cursor))
                cursor++;
            result.push_back(cursor);
        }
        else
        {
            // No legal word-break since the current cursor: hard-break
            result.push_back(lookAheadCursor);
            cursor = lookAheadCursor;
        }
    }
    if(result.isEmpty())
        result.push_back(0);

    delete pBreakIterator;

    if(!ok)
    {
        LogPrintf(LogSeverityLevel::Error, "ICU error: %d", icuError);
        return (result << 0);
    }
    return result;
}
/*
** Extract the next token from a tokenization cursor.
**
** The break iterator is advanced one segment at a time; leading whitespace
** is trimmed from each segment and segments that are empty after trimming
** are skipped.  The remaining text is converted to UTF-8 into the cursor's
** buffer, which is grown when it is too small.
**
** Returns SQLITE_OK with the OUT parameters filled in, SQLITE_DONE when the
** iterator is exhausted, or SQLITE_NOMEM when the buffer cannot be grown.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    while( iStart<iEnd ){
      int iWhite = iStart;
      /* aChar holds UTF-16 code units (it is handed to u_strToUTF8 below),
      ** so it must be decoded with the UTF-16 macro.  The UTF-8 macro would
      ** truncate every unit to 8 bits and misclassify non-Latin-1 text. */
      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  /* Convert; when the result did not fit, nByte holds the size needed and
  ** the conversion is repeated with a buffer of that size. */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
// Appends one UTF-8 field to the CSV output.
//
//   len   number of UTF-8 characters (not bytes) in data
//   data  the UTF-8 text
//
// The pending prefix is written first. In the old output format the field
// is always written quoted and untrimmed. Otherwise leading and trailing
// whitespace is trimmed (a field consisting only of whitespace is left as
// is), and the field is quoted only when it contains the separator, the
// terminator or the quote string; embedded quote strings are doubled.
// Finally the prefix for the next field is set to the separator.
void CSVOutputStream::writeUtf8(size32_t len, const char * data)
{
    append(prefix);
    if (oldOutputFormat)
    {
        append(quote).append(rtlUtf8Size(len, data), data).append(quote);
    }
    else if (len)
    {
        // Byte size of every character, indexed by character position.
        // Small fields use the stack, larger ones the heap.
        MemoryAttr heapSizes;
        size32_t * charSizes;
        if (len>256)
            charSizes = (size32_t *)heapSizes.allocate(sizeof(size32_t)*len);
        else
            charSizes = (size32_t *)alloca(sizeof(size32_t)*len);

        // Measure every character and remember the first and last
        // non-whitespace positions.
        const unsigned notFound = (unsigned)-1;
        unsigned firstKept = notFound;
        unsigned lastKept = 0;
        const byte * cursor = (const byte *)data;
        for (unsigned pos = 0; pos < len; pos++)
        {
            const byte * charStart = cursor;
            UChar next = readUtf8Character(sizeof(UChar), cursor);
            charSizes[pos] = (size32_t)(cursor-charStart);
            if (u_isspace(next))
                continue;
            lastKept = pos;
            if (firstKept == notFound)
                firstKept = pos;
        }
        const byte * limit = cursor;    // one past the last byte to output

        // Trim: advance data/charSizes past the leading whitespace and pull
        // limit back over the trailing whitespace.
        if (firstKept != notFound)
        {
            for (unsigned skipped = 0; skipped < firstKept; skipped++)
                data += *(charSizes++);
            len -= firstKept;
            const unsigned keep = lastKept - firstKept + 1;
            while (keep < len)
                limit -= charSizes[--len];
        }

        // Quoting is needed when the separator, terminator or quote string
        // occurs at any character position.
        const size32_t sepLen = separator.length();
        const size32_t termLen = terminator.length();
        const size32_t quoteLen = quote.length();
        bool mustQuote = false;
        const byte * scan = (const byte *)data;
        for (unsigned pos = 0; (pos < len) && !mustQuote; pos++)
        {
            const size32_t remaining = (size32_t)(limit-scan);
            if (sepLen && (remaining >= sepLen) && (memcmp(separator.get(), scan, sepLen) == 0))
                mustQuote = true;
            else if (termLen && (remaining >= termLen) && (memcmp(terminator.get(), scan, termLen) == 0))
                mustQuote = true;
            else if ((remaining >= quoteLen) && (memcmp(quote.get(), scan, quoteLen) == 0))
                mustQuote = true;
            else
                scan += charSizes[pos];
        }

        if (!mustQuote)
        {
            append((size32_t)(limit-(const byte *)data), data);
        }
        else
        {
            append(quote);
            scan = (const byte *)data;
            for (unsigned pos = 0; pos < len; pos++)
            {
                const size32_t remaining = (size32_t)(limit-scan);
                // Double every embedded quote string.
                if ((remaining >= quoteLen) && (memcmp(quote.get(), scan, quoteLen) == 0))
                    append(quote);
                append(charSizes[pos], (const char *)scan);
                scan += charSizes[pos];
            }
            append(quote);
        }
    }
    prefix = separator;
}