/*
 * Read a hexadecimal code point range of the form "start" or "start..end".
 * White space is skipped before the start, around ".." and before the end.
 *
 * @param s           NUL-terminated input text
 * @param pStart      receives the first code point of the range
 * @param pEnd        receives the last code point (same as *pStart for a single code point)
 * @param terminator  receives a pointer to the character that follows the
 *                    last code point that was read; the caller decides
 *                    whether that character is acceptable
 * @param pErrorCode  in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                    NULL arguments and to U_PARSE_ERROR for malformed input
 * @return the number of code points in the range, or 0 on error
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char *s,
                                   uint32_t *pStart, uint32_t *pEnd,
                                   const char **terminator,
                                   UErrorCode *pErrorCode) {
    char *end;
    unsigned long parsed;
    uint32_t value;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    /* terminator is written unconditionally below, so it must be checked as well */
    if(s==NULL || pStart==NULL || pEnd==NULL || terminator==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* read the start code point */
    s=u_skipWhitespace(s);
    /*
     * Range-check the unsigned long result before narrowing it:
     * casting first would silently truncate oversized input where
     * unsigned long is wider than 32 bits.
     */
    parsed=uprv_strtoul(s, &end, 16);
    if(end<=s || parsed>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    value=(uint32_t)parsed;
    *pStart=*pEnd=value;

    /* is there a "..end"? */
    s=u_skipWhitespace(end);
    if(*s!='.' || s[1]!='.') {
        /* single code point: report the position right after its digits */
        *terminator=end;
        return 1;
    }
    s=u_skipWhitespace(s+2);

    /* read the end code point */
    parsed=uprv_strtoul(s, &end, 16);
    if(end<=s || parsed>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    value=(uint32_t)parsed;
    *pEnd=value;

    /* is this a valid range? */
    if(value<*pStart) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    *terminator=end;
    return value-*pStart+1;
}
/*
 * Line callback for DerivedAge.txt.
 * Field 0 is a code point range, field 1 is either "unassigned" or a
 * version "major[.minor]" with major in 1..15 and minor in 0..15.
 * The version is packed as (major<<4)|minor and stored for the range in
 * the properties vector. Any syntax or storage error terminates the program.
 */
static void U_CALLCONV
ageLineFn(void *context,
          char *fields[][2], int32_t fieldCount,
          UErrorCode *pErrorCode) {
    uint32_t first, last;
    uint32_t major, minor=0;
    char *text, *limit;

    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* "unassigned" lines carry no version; nothing is stored for them */
    text=(char *)u_skipWhitespace(fields[1][0]);
    if(uprv_strncmp(text, "unassigned", 10)==0) {
        return;
    }

    /* major version: 1..15, followed by '.', white space or the end of the string */
    major=(uint32_t)uprv_strtoul(text, &limit, 10);
    if(limit==text || major==0 || 15<major ||
       !(*limit=='.' || *limit==' ' || *limit=='\t' || *limit==0)
    ) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* optional minor version: 0..15, followed by white space or the end of the string */
    if(*limit=='.') {
        text=(char *)u_skipWhitespace(limit+1);
        minor=(uint32_t)uprv_strtoul(text, &limit, 10);
        if(limit==text || 15<minor ||
           !(*limit==' ' || *limit=='\t' || *limit==0)
        ) {
            fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    /* a line covering all of Unicode is extended to UPVEC_MAX_CP */
    if(first==0 && last==0x10ffff) {
        /* Also set bits for initialValue and errorValue. */
        last=UPVEC_MAX_CP;
    }

    upvec_setValue(pv, first, last, 0,
                   ((major<<4)|minor)<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
                   pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/* Return non-zero if c is one of 0-9, a-f, A-F. */
static int
ucm_isHexDigit(char c) {
    return ('0'<=c && c<='9') || ('a'<=c && c<='f') || ('A'<=c && c<='F');
}

/*
 * Parse a sequence of bytes written as \xXX, optionally joined by '+'
 * (\xXX+\xXX or \xXX\xXX), starting at *ps.
 *
 * @param bytes  receives the parsed bytes, at most UCNV_EXT_MAX_BYTES
 * @param line   the complete input line, used only for error messages
 * @param ps     in: start of the byte sequence; out: first character after
 *               it (updated only on success)
 * @return the number of bytes parsed (0 if *ps does not start with a
 *         backslash), or -1 after printing an error message
 */
U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    const char *s=*ps;
    char *end;
    uint8_t byte;
    int8_t bLen;

    bLen=0;
    for(;;) {
        /* skip an optional plus sign */
        if(bLen>0 && *s=='+') {
            ++s;
        }
        if(*s!='\\') {
            break;
        }

        /*
         * Require two literal hex digits before converting:
         * strtoul() on its own skips leading white space and accepts a sign,
         * so "\x 1" or "\x-1" would otherwise be taken as valid bytes.
         * The end!=s+4 test still rejects more than two digits.
         */
        if( s[1]!='x' ||
            !ucm_isHexDigit(s[2]) || !ucm_isHexDigit(s[3]) ||
            (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
        ) {
            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
            return -1;
        }

        if(bLen==UCNV_EXT_MAX_BYTES) {
            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
            return -1;
        }
        bytes[bLen++]=byte;
        s=end;
    }

    *ps=s;
    return bLen;
}
/*
 * Convert the leading number in integerString, written in the given radix,
 * to an integer. Thin wrapper around uprv_strtoul(); the position where
 * parsing stopped is not reported to the caller.
 */
U_CAPI int32_t U_EXPORT2
T_CString_stringToInteger(const char *integerString, int32_t radix) {
    char *unusedLimit;
    unsigned long parsed=uprv_strtoul(integerString, &unusedLimit, radix);

    return (int32_t)parsed;
}
/*
 * Line callback for UnicodeData.txt (genbidi).
 * Reads the code point from field 0 and the Mirrored flag from field 9
 * ("Y" or "N"); for "Y" the is-mirrored bit is set in the properties vector.
 * Any syntax or storage error terminates the program.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    UErrorCode errorCode;
    UChar32 c;

    errorCode=U_ZERO_ERROR;

    /* get the character code, field 0; it must fill the whole field */
    c=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "genbidi: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get Mirrored flag, field 9 */
    if(*fields[9][0]=='Y') {
        upvec_setValue(pv,
                       c, c, 0, U_MASK(UBIDI_IS_MIRRORED_SHIFT), U_MASK(UBIDI_IS_MIRRORED_SHIFT),
                       &errorCode);
        /*
         * upvec_setValue() reports into the local errorCode, so that is
         * the variable to test (not *pErrorCode, which it never touches).
         */
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genbidi error: unable to set 'is mirrored' for U+%04lx, code: %s\n",
                            (long)c, u_errorName(errorCode));
            exit(errorCode);
        }
    } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
        fprintf(stderr, "genbidi: syntax error in field 9 at U+%04lx\n",
            (long)c);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
}
/*
 * Parse a white-space-separated list of hexadecimal code points, up to a
 * ';' or the end of the string, and store them as UTF-16 in
 * dest[destCapacity].
 *
 * @param s            NUL-terminated input text
 * @param dest         output buffer; may be NULL only if destCapacity==0
 * @param destCapacity capacity of dest in UChars
 * @param pFirst       if not NULL, receives the first code point, or
 *                     0xffffffff if the list is empty
 * @param pErrorCode   in/out error code: U_PARSE_ERROR for malformed input,
 *                     U_STRING_NOT_TERMINATED_WARNING if the string exactly
 *                     fills dest, U_BUFFER_OVERFLOW_ERROR if it does not fit
 * @return The length of the string in numbers of UChars
 *         (the required length in case of overflow), or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode) {
    char *end;
    unsigned long parsed;
    uint32_t value;
    int32_t destLength;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pFirst!=NULL) {
        *pFirst=0xffffffff;
    }

    destLength=0;
    for(;;) {
        s=u_skipWhitespace(s);
        if(*s==';' || *s==0) {
            if(destLength<destCapacity) {
                dest[destLength]=0;
            } else if(destLength==destCapacity) {
                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
            } else {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
            return destLength;
        }

        /*
         * Read one code point. The range check is done on the unsigned long
         * result before narrowing it; casting first would silently truncate
         * oversized input where unsigned long is wider than 32 bits.
         */
        parsed=uprv_strtoul(s, &end, 16);
        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || parsed>=0x110000) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }
        value=(uint32_t)parsed;

        /* store the first code point */
        if(pFirst!=NULL) {
            *pFirst=value;
            pFirst=NULL;
        }

        /* append it to the destination array, or just count it once dest is full */
        if((destLength+U16_LENGTH(value))<=destCapacity) {
            U16_APPEND_UNSAFE(dest, destLength, value);
        } else {
            destLength+=U16_LENGTH(value);
        }

        /* go to the following characters */
        s=end;
    }
}
U_CDECL_BEGIN

/*
 * Line callback for a StringPrep profile file.
 * Lines that start with '@' are directives and are skipped. Every other line
 * must have exactly 3 fields: code point or range; mapping; type name.
 * Depending on the type name, the range flags or the mapping are handed to
 * the compare functions. An unknown type name or a wrong field count sets
 * U_INVALID_FORMAT_ERROR.
 */
static void U_CALLCONV
strprepProfileLineFn(void * /*context*/,
                     char *fields[][2], int32_t fieldCount,
                     UErrorCode *pErrorCode) {
    const char *line=u_skipWhitespace(fields[0][0]);
    if(*line=='@') {
        /* a special directive introduced in 4.2 */
        return;
    }
    if(fieldCount!=3) {
        *pErrorCode=U_INVALID_FORMAT_ERROR;
        return;
    }

    const char *typeName=fields[2][0];
    uint32_t rangeStart=0, rangeEnd=0;

    if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL) {
        /* unassigned range: check its flags */
        u_parseCodePointRange(line, &rangeStart, &rangeEnd, pErrorCode);
        compareFlagsForRange(rangeStart, rangeEnd, USPREP_UNASSIGNED);
        return;
    }

    if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL) {
        /* prohibited range: check its flags */
        u_parseCodePointRange(line, &rangeStart, &rangeEnd, pErrorCode);
        compareFlagsForRange(rangeStart, rangeEnd, USPREP_PROHIBITED);
        return;
    }

    if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL) {
        uint32_t mapping[40];
        char *limit;

        /* the character code is in field 0, the mapping string in field 1 */
        uint32_t code=(uint32_t)uprv_strtoul(line, &limit, 16);
        int32_t length=u_parseCodePoints(fields[1][0], mapping, sizeof(mapping)/4, pErrorCode);

        compareMapping(code, mapping, length, USPREP_MAP);
        return;
    }

    *pErrorCode=U_INVALID_FORMAT_ERROR;
}
/*
 * Line callback for NameAliases.txt (gennames).
 * Field 0 is the code point, field 1 the alias. The alias is appended to
 * cpNameAliases[] and its words are passed to parseName(). Entries must be
 * for Unicode characters and in ascending code point order; any violation
 * terminates the program.
 */
static void U_CALLCONV
nameAliasesLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    /* code point of the previously accepted line, kept across calls */
    static uint32_t lastCode=0;

    uint32_t code=0;
    char *alias;
    int16_t aliasLength=0;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /* field 0: the character code */
    code=uprv_strtoul(fields[0][0], NULL, 16);

    /* field 1: the alias; getName() may move the start pointer and returns the length */
    alias=fields[1][0];
    aliasLength=getName(&alias, fields[1][1]);
    if(aliasLength==0 || aliasLength>=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) {
        fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n",
                alias, (unsigned long)code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* aliases are only allowed for actual Unicode characters */
    if(!U_IS_UNICODE_CHAR(code)) {
        fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n",
                (unsigned long)code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* code points must strictly ascend (U+0000 is exempt from the check) */
    if(code>0 && code<=lastCode) {
        fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)code, (unsigned long)lastCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    lastCode=code;

    /* make sure there is room for one more entry */
    if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) {
        fprintf(stderr, "gennames: error - too many name aliases\n");
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* store a NUL-terminated copy of the alias */
    cpNameAliases[cpNameAliasesTop].code=code;
    uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, alias, aliasLength);
    cpNameAliases[cpNameAliasesTop].nameAlias[aliasLength]=0;
    ++cpNameAliasesTop;

    parseName(alias, aliasLength);
}
// Parses s as exactly one hexadecimal code point (0..10FFFF) with nothing
// after it. On malformed input prints a message with the current line
// number, sets errorCode to U_PARSE_ERROR and returns U_SENTINEL.
UChar32
PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
    char *end;
    // Keep the full unsigned long for the range check: narrowing to 32 bits
    // first would let oversized input wrap around into the valid range
    // where unsigned long is wider than 32 bits.
    unsigned long value=uprv_strtoul(s, &end, 16);
    if(end<=s || *end!=0 || value>=0x110000) {
        fprintf(stderr,
                "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
                s, (long)lineNumber);
        errorCode=U_PARSE_ERROR;
        return U_SENTINEL;
    }
    return (UChar32)value;
}
/*
 * Line callback for SpecialCasing.txt (gencase).
 * Field 0 is the code point, fields 1..3 are the lowercase, titlecase and
 * uppercase mappings, and field 4 is an optional condition.
 * Lines with a condition are recorded as "complex" without mappings; for all
 * others the three mappings are parsed, each stored with its length in
 * element [0], and added to the caseSensitive set.
 * Any syntax error or table overflow terminates the program.
 */
static void U_CALLCONV
specialCasingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    char *limit;
    const char *condition;

    /* field 0: the code point; only white space may surround it */
    specialCasings[specialCasingCount].code=
        (UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &limit, 16);
    limit=(char *)u_skipWhitespace(limit);
    if(!(limit>fields[0][0] && limit==fields[0][1])) {
        fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* field 4: empty, or starting with ';' or '#', means there is no condition */
    condition=u_skipWhitespace(fields[4][0]);
    if(*condition==0 || *condition==';' || *condition=='#') {
        /* unconditional mapping: parse and store all three strings */
        specialCasings[specialCasingCount].isComplex=FALSE;

        specialCasings[specialCasingCount].lowerCase[0]=
            (UChar)u_parseString(fields[1][0],
                                 specialCasings[specialCasingCount].lowerCase+1, 31,
                                 NULL, pErrorCode);
        specialCasings[specialCasingCount].upperCase[0]=
            (UChar)u_parseString(fields[3][0],
                                 specialCasings[specialCasingCount].upperCase+1, 31,
                                 NULL, pErrorCode);
        specialCasings[specialCasingCount].titleCase[0]=
            (UChar)u_parseString(fields[2][0],
                                 specialCasings[specialCasingCount].titleCase+1, 31,
                                 NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }

        /* the code point and everything it maps to go into the caseSensitive set */
        uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
        _set_addAll(caseSensitive,
                    specialCasings[specialCasingCount].lowerCase+1,
                    specialCasings[specialCasingCount].lowerCase[0]);
        _set_addAll(caseSensitive,
                    specialCasings[specialCasingCount].upperCase+1,
                    specialCasings[specialCasingCount].upperCase[0]);
        _set_addAll(caseSensitive,
                    specialCasings[specialCasingCount].titleCase+1,
                    specialCasings[specialCasingCount].titleCase[0]);
    } else {
        /* conditional mapping: only flag it, do not store any actual mappings */
        specialCasings[specialCasingCount].isComplex=TRUE;

        specialCasings[specialCasingCount].lowerCase[0]=0;
        specialCasings[specialCasingCount].upperCase[0]=0;
        specialCasings[specialCasingCount].titleCase[0]=0;
    }

    /* move on to the next slot, which must still be inside the table */
    if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
        fprintf(stderr, "gencase: too many special casing mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/**
 * Split a string into fields at the given delimiter character and convert
 * each field from a list of hex code points into a string.
 * For example, "0040 0400;0C00;0899" yields
 * { "\u0040\u0400", "\u0C00", "\u0899" }.
 * Exactly outputLength fields are parsed into output[]; anything after
 * them is ignored. Malformed or missing fields are reported via errln().
 *
 * @return FALSE upon failure
 */
UBool NormalizerConformanceTest::hexsplit(const char *s, char delimiter,
                                          UnicodeString output[], int32_t outputLength) {
    const char *pos = s;

    for (int32_t field = 0; field < outputLength; ++field) {
        // skip white space before the field
        while (*pos == ' ' || *pos == '\t') {
            ++pos;
        }

        // collect the code points of this field
        output[field].remove();
        UBool fieldDone = FALSE;
        while (!fieldDone) {
            char *limit = NULL;
            UChar32 c = (UChar32)uprv_strtoul(pos, &limit, 16);

            // there must be digits, a valid code point, and then a blank or the delimiter
            if (limit == (char *)pos ||
                (uint32_t)c > 0x10ffff ||
                !(*limit == ' ' || *limit == '\t' || *limit == delimiter)
            ) {
                errln(UnicodeString("Bad field ", "") + (field + 1) + " in " + UnicodeString(s, ""));
                return FALSE;
            }
            output[field].append(c);

            // skip white space after the code point
            pos = (const char *)limit;
            while (*pos == ' ' || *pos == '\t') {
                ++pos;
            }

            if (*pos == delimiter) {
                // end of this field
                ++pos;
                fieldDone = TRUE;
            } else if (*pos == 0) {
                // end of the input: fine only if this was the last field
                if ((field + 1) == outputLength) {
                    return TRUE;
                }
                errln(UnicodeString("Missing field(s) in ", "") + s + " only " + (field + 1) + " out of " + outputLength);
                return FALSE;
            }
        }
    }
    return TRUE;
}
/*
 * Line callback for BidiMirroring.txt (genbidi).
 * Fields 0 and 1 must each consist of exactly one hexadecimal code point;
 * the pair is passed to addMirror(). A syntax error terminates the program.
 */
static void U_CALLCONV
mirrorLineFn(void *context,
             char *fields[][2], int32_t fieldCount,
             UErrorCode *pErrorCode) {
    UChar32 source, mirrored;
    char *limit;

    /* field 0: the source code point, which must fill the whole field */
    source=(UChar32)uprv_strtoul(fields[0][0], &limit, 16);
    if(!(limit>fields[0][0] && limit==fields[0][1])) {
        fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* field 1: the mirror code point, same rules */
    mirrored=(UChar32)uprv_strtoul(fields[1][0], &limit, 16);
    if(!(limit>fields[1][0] && limit==fields[1][1])) {
        fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    addMirror(source, mirrored);
}
// sets the desired transformation data. // should be populated from a command line argument // so far the only acceptable format is offset-<hex constant> // eventually others (mask-<hex constant>?) may be enabled // more complex functions may be more difficult void setTransform(const char *t) { if (strncmp(t, "offset-", 7) == 0) { char *end; unsigned long base = uprv_strtoul(t + 7, &end, 16); if (end == (t + 7) || *end != 0 || base > 0x10FF80) { fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } transformType = DictionaryData::TRANSFORM_TYPE_OFFSET; transformConstant = (UChar32)base; } else { fprintf(stderr, "Invalid transform specified: %s\n", t); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } }
/*
 * Derive the platform and CCSID from a converter name.
 * A name that starts with "ibm" (in any letter case), optionally followed
 * by '-', yields UCNV_IBM and the decimal number that follows.
 * Every other name yields UCNV_UNKNOWN and CCSID 0.
 */
static void
getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
    const char *digits;

    if( !(name[0]=='i' || name[0]=='I') ||
        !(name[1]=='b' || name[1]=='B') ||
        !(name[2]=='m' || name[2]=='M')
    ) {
        *pPlatform=UCNV_UNKNOWN;
        *pCCSID=0;
        return;
    }

    /* step over the "ibm" prefix and an optional hyphen */
    digits=name+3;
    if(*digits=='-') {
        ++digits;
    }

    *pPlatform=UCNV_IBM;
    *pCCSID=(int32_t)uprv_strtoul(digits, NULL, 10);
}
/*
 * Line callback for NormalizationCorrections.txt (gensprep).
 * Field 0 is the code point, field 1 the original (erroneous) decomposition,
 * field 3 the Unicode version of the correction.
 * The original decomposition is stored as a mapping only if the correction
 * version is newer than 3.2 (comparing major and minor version); the version
 * is always passed to setUnicodeVersionNC().
 * Any parse error terminates the program.
 */
static void U_CALLCONV
normalizationCorrectionsLineFn(void *context,
                               char *fields[][2], int32_t fieldCount,
                               UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end, *s;
    unsigned long parsed;
    uint32_t code;
    int32_t length;
    UVersionInfo version;
    UVersionInfo thisVersion;

    /* get the character code, field 0 */
    parsed=uprv_strtoul(fields[0][0], &end, 16);
    /*
     * strtoul() never touches *pErrorCode, so the syntax has to be checked
     * here: at least one hex digit, and a value inside the code point range.
     * The range is tested on the unsigned long before it is narrowed.
     */
    if(U_SUCCESS(*pErrorCode) && (end<=fields[0][0] || parsed>0x10ffff)) {
        *pErrorCode=U_PARSE_ERROR;
    }
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    code=(uint32_t)parsed;

    /* Original (erroneous) decomposition */
    s = fields[1][0];

    /* parse the mapping string */
    length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);

    /* ignore corrected decomposition */

    u_versionFromString(version,fields[3][0] );
    u_versionFromString(thisVersion, "3.2.0");

    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n",
                (long)code, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    /* store the mapping only for corrections made after Unicode 3.2 */
    if( version[0] > thisVersion[0] ||
        ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1]))
        ){
        storeMapping(code,mapping, length, USPREP_MAP, pErrorCode);
    }
    setUnicodeVersionNC(version);
}
/*
 * Line callback for a StringPrep profile file; context is the
 * UStringPrepProfile to check against.
 * Fields: 0 = code point or range, 1 = mapping, 2 = type name.
 * Depending on the type name, the range flags or the mapping are handed to
 * the compare functions. An unknown type name sets U_INVALID_FORMAT_ERROR.
 */
static void U_CALLCONV
strprepProfileLineFn(void *context,
                     char *fields[][2], int32_t fieldCount,
                     UErrorCode *pErrorCode) {
    UStringPrepProfile *profile=(UStringPrepProfile *)context;
    const char *kind=fields[2][0];
    uint32_t first=0, last=0;

    if(strstr(kind, usprepTypeNames[USPREP_UNASSIGNED])!=NULL) {
        /* unassigned range: check its flags */
        u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
        compareFlagsForRange(profile, first, last, USPREP_UNASSIGNED);
        return;
    }

    if(strstr(kind, usprepTypeNames[USPREP_PROHIBITED])!=NULL) {
        /* prohibited range: check its flags */
        u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
        compareFlagsForRange(profile, first, last, USPREP_PROHIBITED);
        return;
    }

    if(strstr(kind, usprepTypeNames[USPREP_MAP])!=NULL) {
        uint32_t mapping[40];
        char *limit;
        uint32_t code;
        int32_t length;

        /* the character code is in field 0, the mapping string in field 1 */
        code=(uint32_t)uprv_strtoul(fields[0][0], &limit, 16);
        length=u_parseCodePoints(fields[1][0], mapping, sizeof(mapping)/4, pErrorCode);

        /* compare the mapping */
        compareMapping(profile, code, mapping, length, USPREP_MAP);
        return;
    }

    *pErrorCode=U_INVALID_FORMAT_ERROR;
}
/*
 * Parse a white-space-separated list of hexadecimal code points, up to a
 * ';' or the end of the string, and store them as a UTF-32 string in
 * dest[destCapacity].
 *
 * @param s            NUL-terminated input text
 * @param dest         output array; may be NULL only if destCapacity==0
 * @param destCapacity capacity of dest in code points
 * @param pErrorCode   in/out error code: U_PARSE_ERROR for malformed input,
 *                     U_BUFFER_OVERFLOW_ERROR if there are more code points
 *                     than dest can hold
 * @return the number of code points stored in dest, or 0 on a parse or
 *         argument error
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char *s,
                  uint32_t *dest, int32_t destCapacity,
                  UErrorCode *pErrorCode) {
    char *end;
    unsigned long parsed;
    uint32_t value;
    int32_t count;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    count=0;
    for(;;) {
        s=u_skipWhitespace(s);
        if(*s==';' || *s==0) {
            return count;
        }

        /*
         * Read one code point. The range check is done on the unsigned long
         * result before narrowing it; casting first would silently truncate
         * oversized input where unsigned long is wider than 32 bits.
         */
        parsed=uprv_strtoul(s, &end, 16);
        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || parsed>=0x110000) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }
        value=(uint32_t)parsed;

        /* append it to the destination array; keep validating the rest on overflow */
        if(count<destCapacity) {
            dest[count++]=value;
        } else {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }

        /* go to the following characters */
        s=end;
    }
}
/*
 * Parse a version string such as "3.2.0" into versionArray.
 * Up to U_MAX_VERSION_LENGTH decimal parts separated by U_VERSION_DELIMITER
 * are read; parsing stops at the first part without digits or at the first
 * character that is not the delimiter. All remaining parts are set to 0.
 * A NULL versionString yields all zeros; a NULL versionArray is a no-op.
 */
U_CAPI void U_EXPORT2
u_versionFromString(UVersionInfo versionArray, const char *versionString) {
    uint16_t filled=0;

    if(versionArray==NULL) {
        return;
    }

    while(versionString!=NULL && filled<U_MAX_VERSION_LENGTH) {
        char *limit;

        versionArray[filled]=(uint8_t)uprv_strtoul(versionString, &limit, 10);
        if(limit==versionString) {
            /* no digits: this slot is zeroed together with the rest below */
            break;
        }
        ++filled;
        if(*limit!=U_VERSION_DELIMITER) {
            break;
        }
        versionString=limit+1;
    }

    /* zero-fill whatever was not parsed */
    for(; filled<U_MAX_VERSION_LENGTH; ++filled) {
        versionArray[filled]=0;
    }
}
// Parses one property field of a preparsed-UCD line and stores its value in props.
// The field has one of three forms:
//   "-name"       binary property set to false
//   "name"        binary property set to true
//   "name=value"  enumerated/integer, code point, string or other property
// Properties that were set are added to newValues.
// Unknown property names and unhandled properties are ignored.
// Returns TRUE for "ok to continue parsing fields".
// Returns FALSE after setting errorCode (and printing a message) on a syntax error.
UBool
PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
                            UErrorCode &errorCode) {
    CharString pBuffer;
    const char *p=field;
    const char *v=strchr(p, '=');
    // binaryValue: 0 for "-name", 1 for "name", -1 for "name=value".
    int binaryValue;
    if(*p=='-') {
        if(v!=NULL) {
            fprintf(stderr,
                    "error in preparsed UCD: mix of binary-property-no and "
                    "enum-property syntax '%s' on line %ld\n",
                    field, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return FALSE;
        }
        binaryValue=0;
        ++p;
    } else if(v==NULL) {
        binaryValue=1;
    } else {
        binaryValue=-1;
        // Copy out the property name rather than modifying the field (writing a NUL).
        pBuffer.append(p, (int32_t)(v-p), errorCode);
        p=pBuffer.data();
        ++v;
    }
    // Look up the property name, first via pnames, then in the ppucdProperties table.
    int32_t prop=pnames->getPropertyEnum(p);
    if(prop<0) {
        for(int32_t i=0;; ++i) {
            if(i==UPRV_LENGTHOF(ppucdProperties)) {
                // Ignore unknown property names.
                return TRUE;
            }
            if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
                prop=ppucdProperties[i].prop;
                U_ASSERT(prop>=0);
                break;
            }
        }
    }
    // Dispatch on the property range and on the syntax that was used.
    if(prop<UCHAR_BINARY_LIMIT) {
        if(binaryValue>=0) {
            props.binProps[prop]=(UBool)binaryValue;
        } else {
            // No binary value for a binary property.
            fprintf(stderr,
                    "error in preparsed UCD: enum-property syntax '%s' "
                    "for binary property on line %ld\n",
                    field, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
        }
    } else if(binaryValue>=0) {
        // Binary value for a non-binary property.
        fprintf(stderr,
                "error in preparsed UCD: binary-property syntax '%s' "
                "for non-binary property on line %ld\n",
                field, (long)lineNumber);
        errorCode=U_PARSE_ERROR;
    } else if (prop < UCHAR_INT_START) {
        // The property lies between the binary and the integer ranges.
        fprintf(stderr,
                "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
                prop, (long)lineNumber);
        errorCode=U_PARSE_ERROR;
    } else if(prop<UCHAR_INT_LIMIT) {
        // Enumerated/integer property: map the value name to its integer.
        int32_t value=pnames->getPropertyValueEnum(prop, v);
        if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
            // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
            // Fallback: accept a plain decimal combining class of at most 254.
            char *end;
            unsigned long ccc=uprv_strtoul(v, &end, 10);
            if(v<end && *end==0 && ccc<=254) {
                value=(int32_t)ccc;
            }
        }
        if(value==UCHAR_INVALID_CODE) {
            fprintf(stderr,
                    "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
                    field, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
        } else {
            props.intProps[prop-UCHAR_INT_START]=value;
        }
    } else if(*v=='<') {
        // Do not parse default values like <code point>, just set null values.
        switch(prop) {
        case UCHAR_BIDI_MIRRORING_GLYPH:
            props.bmg=U_SENTINEL;
            break;
        case UCHAR_BIDI_PAIRED_BRACKET:
            props.bpb=U_SENTINEL;
            break;
        case UCHAR_SIMPLE_CASE_FOLDING:
            props.scf=U_SENTINEL;
            break;
        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
            props.slc=U_SENTINEL;
            break;
        case UCHAR_SIMPLE_TITLECASE_MAPPING:
            props.stc=U_SENTINEL;
            break;
        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
            props.suc=U_SENTINEL;
            break;
        case UCHAR_CASE_FOLDING:
            props.cf.remove();
            break;
        case UCHAR_LOWERCASE_MAPPING:
            props.lc.remove();
            break;
        case UCHAR_TITLECASE_MAPPING:
            props.tc.remove();
            break;
        case UCHAR_UPPERCASE_MAPPING:
            props.uc.remove();
            break;
        case UCHAR_SCRIPT_EXTENSIONS:
            props.scx.clear();
            break;
        default:
            fprintf(stderr,
                    "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
                    field, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
        }
    } else {
        // Explicit value for a numeric, string, code point or other property.
        char c;
        switch(prop) {
        case UCHAR_NUMERIC_VALUE:
            props.numericValue=v;
            // A value that is a single ASCII digit also sets digitValue.
            c=*v;
            if('0'<=c && c<='9' && v[1]==0) {
                props.digitValue=c-'0';
            } else {
                props.digitValue=-1;
            }
            break;
        case UCHAR_NAME:
            props.name=v;
            break;
        case UCHAR_AGE:
            u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
            break;
        case UCHAR_BIDI_MIRRORING_GLYPH:
            props.bmg=parseCodePoint(v, errorCode);
            break;
        case UCHAR_BIDI_PAIRED_BRACKET:
            props.bpb=parseCodePoint(v, errorCode);
            break;
        case UCHAR_SIMPLE_CASE_FOLDING:
            props.scf=parseCodePoint(v, errorCode);
            break;
        case UCHAR_SIMPLE_LOWERCASE_MAPPING:
            props.slc=parseCodePoint(v, errorCode);
            break;
        case UCHAR_SIMPLE_TITLECASE_MAPPING:
            props.stc=parseCodePoint(v, errorCode);
            break;
        case UCHAR_SIMPLE_UPPERCASE_MAPPING:
            props.suc=parseCodePoint(v, errorCode);
            break;
        case UCHAR_CASE_FOLDING:
            parseString(v, props.cf, errorCode);
            break;
        case UCHAR_LOWERCASE_MAPPING:
            parseString(v, props.lc, errorCode);
            break;
        case UCHAR_TITLECASE_MAPPING:
            parseString(v, props.tc, errorCode);
            break;
        case UCHAR_UPPERCASE_MAPPING:
            parseString(v, props.uc, errorCode);
            break;
        case PPUCD_NAME_ALIAS:
            props.nameAlias=v;
            break;
        case PPUCD_CONDITIONAL_CASE_MAPPINGS:
        case PPUCD_TURKIC_CASE_FOLDING:
            // No need to parse their values: They are hardcoded in the runtime library.
            break;
        case UCHAR_SCRIPT_EXTENSIONS:
            parseScriptExtensions(v, props.scx, errorCode);
            break;
        default:
            // Ignore unhandled properties.
            return TRUE;
        }
    }
    // Record the property as newly set only if nothing went wrong.
    if(U_SUCCESS(errorCode)) {
        newValues.add((UChar32)prop);
        return TRUE;
    } else {
        return FALSE;
    }
}
/*
 * Parse one state table row into state[256].
 *
 * state table row grammar (ebnf-style):
 * (whitespace is allowed between all tokens)
 *
 * row=[[firstentry ','] entry (',' entry)*]
 * firstentry="initial" | "surrogates"
 *            (initial state (default for state 0), output is all surrogate pairs)
 * entry=range [':' nextstate] ['.' action]
 * range=number ['-' number]
 * nextstate=number
 *           (0..7f)
 * action='u' | 's' | 'p' | 'i'
 *        (unassigned, state change only, surrogate pair, illegal)
 * number=(1- or 2-digit hexadecimal number)
 *
 * @param s      the text of the row
 * @param state  receives the 256 entries of this state; entries that are not
 *               covered by any range remain "illegal"
 * @param pFlags in/out state flags; set here when the row starts with an
 *               "initial" or "surrogates" directive
 * @return NULL if the row was parsed up to its terminating NUL (including
 *         an empty row), otherwise a pointer to where parsing stopped
 */
static const char *
parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
    const char *t;
    uint32_t start, end, i;
    int32_t entry;

    /* initialize the state: all illegal with U+ffff */
    for(i=0; i<256; ++i) {
        state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
    }

    /* skip leading white space */
    s=u_skipWhitespace(s);

    /* is there an "initial" or "surrogates" directive? */
    if(uprv_strncmp("initial", s, 7)==0) {
        *pFlags=MBCS_STATE_FLAG_DIRECT;
        s=u_skipWhitespace(s+7);
        /* a directive must be followed by a comma; otherwise return its position */
        if(*s++!=',') {
            return s-1;
        }
    } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
        /* "surrogates" is only recognized while no flags are set yet */
        *pFlags=MBCS_STATE_FLAG_SURROGATES;
        s=u_skipWhitespace(s+10);
        if(*s++!=',') {
            return s-1;
        }
    } else if(*s==0) {
        /* empty state row: all-illegal */
        return NULL;
    }

    for(;;) {
        /* read an entry, the start of the range first */
        s=u_skipWhitespace(s);
        start=uprv_strtoul(s, (char **)&t, 16);
        if(s==t || 0xff<start) {
            return s;
        }
        s=u_skipWhitespace(t);

        /* read the end of the range if there is one */
        if(*s=='-') {
            s=u_skipWhitespace(s+1);
            end=uprv_strtoul(s, (char **)&t, 16);
            if(s==t || end<start || 0xff<end) {
                return s;
            }
            s=u_skipWhitespace(t);
        } else {
            end=start;
        }

        /* determine the state entrys for this range */
        if(*s!=':' && *s!='.') {
            /* the default is: final state with valid entries */
            entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
        } else {
            /* start from a transition to state 0, then refine it */
            entry=MBCS_ENTRY_TRANSITION(0, 0);
            if(*s==':') {
                /* get the next state, default to 0 */
                s=u_skipWhitespace(s+1);
                i=uprv_strtoul(s, (char **)&t, 16);
                if(s!=t) {
                    if(0x7f<i) {
                        return s;
                    }
                    s=u_skipWhitespace(t);
                    entry=MBCS_ENTRY_SET_STATE(entry, i);
                }
            }

            /* get the state action, default to valid */
            if(*s=='.') {
                /* this is a final state */
                entry=MBCS_ENTRY_SET_FINAL(entry);

                s=u_skipWhitespace(s+1);
                if(*s=='u') {
                    /* unassigned set U+fffe */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
                    s=u_skipWhitespace(s+1);
                } else if(*s=='p') {
                    /* in a "direct" (initial) state, 'p' is treated like a plain valid entry */
                    if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
                    } else {
                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
                    }
                    s=u_skipWhitespace(s+1);
                } else if(*s=='s') {
                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
                    s=u_skipWhitespace(s+1);
                } else if(*s=='i') {
                    /* illegal set U+ffff */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
                    s=u_skipWhitespace(s+1);
                } else {
                    /* default to valid */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
                }
            } else {
                /* this is an intermediate state, nothing to do */
            }
        }

        /* adjust "final valid" states according to the state flags */
        if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
            switch(*pFlags) {
            case 0:
                /* no adjustment */
                break;
            case MBCS_STATE_FLAG_DIRECT:
                /* set the valid-direct code point to "unassigned"==0xfffe */
                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
                break;
            case MBCS_STATE_FLAG_SURROGATES:
                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
                break;
            default:
                break;
            }
        }

        /* set this entry for the range */
        for(i=start; i<=end; ++i) {
            state[i]=entry;
        }

        /* a comma continues the row; anything else ends it (NUL means success) */
        if(*s==',') {
            ++s;
        } else {
            return *s==0 ? NULL : s;
        }
    }
}
/*
 * Parse one mapping line of a .ucm file; the line must not be empty.
 * Expected shape: one or more <UXXXX> code points (optionally joined by
 * '+'), white space, one or more \xXX bytes, and optionally a fallback
 * indicator |0..|4 somewhere in the rest of the line.
 *
 * @param m          receives uLen, bLen and f; m->u is set for a single code
 *                   point, m->b.bytes for at most 4 bytes
 * @param codePoints receives the parsed code points
 * @param bytes      receives the parsed bytes
 * @param line       the NUL-terminated input line
 * @return TRUE on success, FALSE after printing an error message
 */
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
                     const char *line) {
    const char *pos=line;
    char *limit;
    UChar32 cp;
    int32_t u16Length;
    int8_t uLen=0, bLen=0, f;

    /* parse code points */
    while(1) {
        UBool wellFormed=FALSE;

        /* a plus sign may join code points but may not lead the sequence */
        if(uLen>0 && *pos=='+') {
            ++pos;
        }
        if(*pos!='<') {
            break;
        }

        /* expect 'U', at least one hex digit, and the closing '>' */
        if(pos[1]=='U') {
            cp=(UChar32)uprv_strtoul(pos+2, &limit, 16);
            wellFormed=(UBool)(limit!=pos+2 && *limit=='>');
        }
        if(!wellFormed) {
            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
            return FALSE;
        }
        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
            return FALSE;
        }

        if(uLen==UCNV_EXT_MAX_UCHARS) {
            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
            return FALSE;
        }
        codePoints[uLen++]=cp;
        pos=limit+1;
    }

    if(uLen==0) {
        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
        return FALSE;
    }
    if(uLen==1) {
        m->u=codePoints[0];
    } else {
        /* preflight the UTF-16 length of the sequence to check that it fits */
        UErrorCode errorCode=U_ZERO_ERROR;
        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
            u16Length>UCNV_EXT_MAX_UCHARS
        ) {
            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
            return FALSE;
        }
    }

    pos=u_skipWhitespace(pos);

    /* parse bytes */
    bLen=ucm_parseBytes(bytes, line, &pos);
    if(bLen<0) {
        return FALSE;
    }
    if(bLen==0) {
        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
        return FALSE;
    }
    if(bLen<=4) {
        uprv_memcpy(m->b.bytes, bytes, bLen);
    }

    /* skip everything until the fallback indicator, even the start of a comment */
    while(*pos!=0 && *pos!='|') {
        ++pos;
    }
    if(*pos==0) {
        f=-1; /* no fallback indicator */
    } else {
        f=(int8_t)(pos[1]-'0');
        if((uint8_t)f>4) {
            fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
            return FALSE;
        }
    }

    m->uLen=uLen;
    m->bLen=bLen;
    m->f=f;
    return TRUE;
}
/*
 * Line callback for UnicodeData.txt (gennames); context is the Options
 * struct that selects which names to store.
 * Collects up to four names for the code point in field 0:
 *   [0] the character name (field 1), [1] the Unicode 1.0 name (field 10),
 *   [2] the ISO 10646 comment (field 11), [3] a name alias collected earlier
 *   in cpNameAliases[].
 * Lines without any of the first three names are skipped. Non-character
 * code points, out-of-order entries and aliases without a matching
 * UnicodeData entry terminate the program.
 */
static void U_CALLCONV
lineFn(void *context,
       char *fields[][2], int32_t fieldCount,
       UErrorCode *pErrorCode) {
    Options *storeOptions=(Options *)context;
    /*
     * Start with NULL pointers: depending on the options and on whether
     * there is an alias, some slots are never assigned, yet all of them are
     * passed on (with length 0) to parseName() and addLine().
     */
    char *names[4]={ NULL, NULL, NULL, NULL };
    int16_t lengths[4]={ 0, 0, 0, 0 };
    static uint32_t prevCode=0;
    uint32_t code=0;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }
    /* get the character code */
    code=uprv_strtoul(fields[0][0], NULL, 16);

    /* get the character name */
    if(storeOptions->storeNames) {
        names[0]=fields[1][0];
        lengths[0]=getName(names+0, fields[1][1]);
        if(names[0][0]=='<') {
            /* do not store pseudo-names in <> brackets */
            lengths[0]=0;
        }
    }

    /* store 1.0 names */
    /* get the second character name, the one from Unicode 1.0 */
    if(storeOptions->store10Names) {
        names[1]=fields[10][0];
        lengths[1]=getName(names+1, fields[10][1]);
        if(names[1][0]=='<') {
            /* do not store pseudo-names in <> brackets */
            lengths[1]=0;
        }
    }

    /* get the ISO 10646 comment */
    if(storeOptions->storeISOComments) {
        names[2]=fields[11][0];
        lengths[2]=getName(names+2, fields[11][1]);
    }

    /* nothing to store for this line */
    if(lengths[0]+lengths[1]+lengths[2]==0) {
        return;
    }

    /* check for non-character code points */
    if(!U_IS_UNICODE_CHAR(code)) {
        fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
                (unsigned long)code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (code) are in ascending order */
    if(code<=prevCode && code>0) {
        fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=code;

    parseName(names[0], lengths[0]);
    parseName(names[1], lengths[1]);
    parseName(names[2], lengths[2]);

    /* attach the next pending name alias if it belongs to this code point */
    if(cpNameAliasesIndex<cpNameAliasesTop && code>=cpNameAliases[cpNameAliasesIndex].code) {
        if(code==cpNameAliases[cpNameAliasesIndex].code) {
            names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias;
            lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias);
            ++cpNameAliasesIndex;
        } else {
            /* this line is already past the alias's code point */
            fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
                    (unsigned long)code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    /*
     * set the count argument to
     * 1: only store regular names, or only store ISO 10646 comments
     * 2: store regular and 1.0 names
     * 3: store names and ISO 10646 comment
     * 4: also store name alias
     *
     * addLine() will ignore empty trailing names
     */
    if(storeOptions->storeNames) {
        /* store names and comments as parsed according to storeOptions */
        addLine(code, names, lengths, LENGTHOF(names));
    } else {
        /* store only ISO 10646 comments */
        addLine(code, names+2, lengths+2, 1);
    }
}
/*
 * Read a hexadecimal code point range of the form "start" or "start..end"
 * from a field that ends with ';' or the end of the string.
 *
 * Note: a code point must be directly followed by a space, a tab, ';'
 * (or '.' after the start); a code point directly followed by the NUL
 * terminator is rejected here, although NUL is accepted after white space.
 *
 * @param s          NUL-terminated input text
 * @param pStart     receives the first code point of the range
 * @param pEnd       receives the last code point (same as *pStart for a
 *                   single code point)
 * @param pErrorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                   NULL arguments and to U_PARSE_ERROR for malformed input
 * @return the number of code points in the range, or 0 on error
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode) {
    char *end;
    unsigned long parsed;
    uint32_t value;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || pStart==NULL || pEnd==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        /* stop here: continuing would dereference the NULL argument */
        return 0;
    }

    s=u_skipWhitespace(s);
    if(*s==';' || *s==0) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    /*
     * Read the start code point. The range check is done on the unsigned
     * long result before narrowing it; casting first would silently truncate
     * oversized input where unsigned long is wider than 32 bits.
     */
    parsed=uprv_strtoul(s, &end, 16);
    if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || parsed>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    value=(uint32_t)parsed;
    *pStart=*pEnd=value;

    /* is there a "..end"? */
    s=u_skipWhitespace(end);
    if(*s==';' || *s==0) {
        return 1;
    }

    if(*s!='.' || s[1]!='.') {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    s+=2;

    /* read the end code point */
    parsed=uprv_strtoul(s, &end, 16);
    if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || parsed>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    value=(uint32_t)parsed;
    *pEnd=value;

    /* is this a valid range? */
    if(value<*pStart) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    /* no garbage after that? */
    s=u_skipWhitespace(end);
    if(*s==';' || *s==0) {
        return value-*pStart+1;
    } else {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
}
//---------------------------------------------------------------------------- // // main for gendict // //---------------------------------------------------------------------------- int main(int argc, char **argv) { // // Pick up and check the command line arguments, // using the standard ICU tool utils option handling. // U_MAIN_INIT_ARGS(argc, argv); progName = argv[0]; argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); if(argc<0) { // Unrecognized option fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) { // -? or -h for help. usageAndDie(U_ZERO_ERROR); } UBool verbose = options[ARG_VERBOSE].doesOccur; if (argc < 3) { fprintf(stderr, "input and output file must both be specified.\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } const char *outFileName = argv[2]; const char *wordFileName = argv[1]; startTime = uprv_getRawUTCtime(); // initialize start timer if (options[ARG_ICUDATADIR].doesOccur) { u_setDataDirectory(options[ARG_ICUDATADIR].value); } const char *copyright = NULL; if (options[ARG_COPYRIGHT].doesOccur) { copyright = U_COPYRIGHT_STRING; } if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) { fprintf(stderr, "you must specify exactly one type of trie to output!\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } UBool isBytesTrie = options[ARG_BYTES].doesOccur; if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) { fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n"); usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); } IcuToolErrorCode status("gendict/main()"); #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO const char* outDir=NULL; UNewDataMemory *pData; char msg[1024]; UErrorCode tempstatus = U_ZERO_ERROR; /* write message with just the name */ // potential for a buffer overflow here... 
sprintf(msg, "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); fprintf(stderr, "%s\n", msg); /* write the dummy data file */ pData = udata_create(outDir, NULL, outFileName, &dataInfo, NULL, &tempstatus); udata_writeBlock(pData, msg, strlen(msg)); udata_finish(pData, &tempstatus); return (int)tempstatus; #else // Read in the dictionary source file if (verbose) { printf("Opening file %s...\n", wordFileName); } const char *codepage = "UTF-8"; UCHARBUF *f = ucbuf_open(wordFileName, &codepage, TRUE, FALSE, status); if (status.isFailure()) { fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName()); exit(status.reset()); } if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); } DataDict dict(isBytesTrie, status); if (status.isFailure()) { fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName()); exit(status.reset()); } if (options[ARG_TRANSFORM].doesOccur) { dict.setTransform(options[ARG_TRANSFORM].value); } UnicodeString fileLine; if (verbose) { puts("Adding words to dictionary..."); } UBool hasValues = FALSE; UBool hasValuelessContents = FALSE; int lineCount = 0; int wordCount = 0; int minlen = 255; int maxlen = 0; UBool isOk = TRUE; while (readLine(f, fileLine, status)) { lineCount++; if (fileLine.isEmpty()) continue; // Parse word [spaces value]. 
int32_t keyLen; for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {} if (keyLen == 0) { fprintf(stderr, "Error: no word on line %i!\n", lineCount); isOk = FALSE; continue; } int32_t valueStart; for (valueStart = keyLen; valueStart < fileLine.length() && u_isspace(fileLine[valueStart]); ++valueStart) {} if (keyLen < valueStart) { int32_t valueLength = fileLine.length() - valueStart; if (valueLength > 15) { fprintf(stderr, "Error: value too long on line %i!\n", lineCount); isOk = FALSE; continue; } char s[16]; fileLine.extract(valueStart, valueLength, s, 16, US_INV); char *end; unsigned long value = uprv_strtoul(s, &end, 0); if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) { fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount); isOk = FALSE; continue; } dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status); hasValues = TRUE; wordCount++; if (keyLen < minlen) minlen = keyLen; if (keyLen > maxlen) maxlen = keyLen; } else { dict.addWord(fileLine.tempSubString(0, keyLen), 0, status); hasValuelessContents = TRUE; wordCount++; if (keyLen < minlen) minlen = keyLen; if (keyLen > maxlen) maxlen = keyLen; } if (status.isFailure()) { fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n", status.errorName(), lineCount); exit(status.reset()); } } if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); } if (!isOk && status.isSuccess()) { status.set(U_ILLEGAL_ARGUMENT_ERROR); } if (hasValues && hasValuelessContents) { fprintf(stderr, "warning: file contained both valued and unvalued strings!\n"); } if (verbose) { printf("Serializing data...isBytesTrie? 
%d\n", isBytesTrie); } int32_t outDataSize; const void *outData; UnicodeString usp; if (isBytesTrie) { StringPiece sp = dict.serializeBytes(status); outDataSize = sp.size(); outData = sp.data(); } else { dict.serializeUChars(usp, status); outDataSize = usp.length() * U_SIZEOF_UCHAR; outData = usp.getBuffer(); } if (status.isFailure()) { fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName()); exit(status.reset()); } if (verbose) { puts("Opening output file..."); } UNewDataMemory *pData = udata_create(NULL, NULL, outFileName, &dataInfo, copyright, status); if (status.isFailure()) { fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName()); exit(status.reset()); } if (verbose) { puts("Writing to output file..."); } int32_t indexes[DictionaryData::IX_COUNT] = { DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0 }; int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; indexes[DictionaryData::IX_RESERVED1_OFFSET] = size; indexes[DictionaryData::IX_RESERVED2_OFFSET] = size; indexes[DictionaryData::IX_TOTAL_SIZE] = size; indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? 
DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS; if (hasValues) { indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES; } indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform(); udata_writeBlock(pData, indexes, sizeof(indexes)); udata_writeBlock(pData, outData, outDataSize); size_t bytesWritten = udata_finish(pData, status); if (status.isFailure()) { fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName()); exit(status.reset()); } if (bytesWritten != (size_t)size) { fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); exit(U_INTERNAL_PROGRAM_ERROR); } printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); #ifdef TEST_GENDICT if (isBytesTrie) { BytesTrie::Iterator it(outData, outDataSize, status); while (it.hasNext()) { it.next(status); const StringPiece s = it.getString(); int32_t val = it.getValue(); printf("%s -> %i\n", s.data(), val); } } else { UCharsTrie::Iterator it((const UChar *)outData, outDataSize, status); while (it.hasNext()) { it.next(status); const UnicodeString s = it.getString(); int32_t val = it.getValue(); char tmp[1024]; s.extract(0, s.length(), tmp, 1024); printf("%s -> %i\n", tmp, val); } } #endif return 0; #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ }
/*
 * Line callback for the UnicodeData parser: gathers up to three names for
 * one code point and passes them to addLine().
 *
 *   slot 0: field 1,  the character name
 *   slot 1: field 10, the Unicode 1.0 name (kept only if *(UBool *)context is true)
 *   slot 2: field 11, the ISO 10646 comment
 *
 * Pseudo-names in <> brackets are not stored for slots 0 and 1.
 * Does nothing if *pErrorCode already indicates a failure; data errors
 * print a message to stderr and terminate the program via exit().
 */
static void U_CALLCONV
lineFn(void *context,
       char *fields[][2], int32_t fieldCount,
       UErrorCode *pErrorCode) {
    static uint32_t prevCode=0;
    char *names[3];
    int16_t lengths[3];
    uint32_t code=0;
    int32_t slot;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /* field 0: the code point, in hex */
    code=uprv_strtoul(fields[0][0], NULL, 16);

    /* field 1: the character name; drop <pseudo-names> */
    names[0]=fields[1][0];
    lengths[0]=getName(names+0, fields[1][1]);
    if('<'==names[0][0]) {
        lengths[0]=0;
    }

    /* field 10: the Unicode 1.0 name; dropped if not requested or if it is a <pseudo-name> */
    names[1]=fields[10][0];
    lengths[1]=getName(names+1, fields[10][1]);
    if(!*(UBool *)context || names[1][0]=='<') {
        lengths[1]=0;
    }

    /* field 11: the ISO 10646 comment */
    names[2]=fields[11][0];
    lengths[2]=getName(names+2, fields[11][1]);

    /* nothing at all to store for this line */
    if(0==lengths[0]+lengths[1]+lengths[2]) {
        return;
    }

    /* names for non-character code points are an error */
    if(!UTF_IS_UNICODE_CHAR(code)) {
        fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
                (unsigned long)code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* the code points must arrive in strictly ascending order */
    if(code>0 && code<=prevCode) {
        fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=code;

    for(slot=0; slot<3; ++slot) {
        parseName(names[slot], lengths[slot]);
    }

    /*
     * The count argument is always 3 here
     * (1: regular names only, 2: plus 1.0 names, 3: plus ISO 10646 comments).
     */
    addLine(code, names, lengths, 3);
}
/*
 * Line callback for a StringPrep profile file.
 *
 * A line is either
 *  - a directive, "@" followed by NORMALIZE_DIRECTIVE or CHECK_BIDI_DIRECTIVE,
 *    which switches on the corresponding entry of options[], or
 *  - a data line whose field 2 contains one of the usprepTypeNames[]:
 *      UNASSIGNED / PROHIBITED: field 0 is a code point range, passed to storeRange()
 *      MAP:                     field 0 is a single code point, field 1 its
 *                               mapping, passed to storeMapping()
 *
 * context is the name of the file being parsed (used in error messages only).
 * Any other type name sets U_INVALID_FORMAT_ERROR. A failure left in
 * *pErrorCode at the end terminates the program via exit().
 */
static void U_CALLCONV
strprepProfileLineFn(void *context,
                     char *fields[][2], int32_t fieldCount,
                     UErrorCode *pErrorCode) {
    const char *sourceName = (const char *)context;
    const char *profileType;
    const char *cursor;
    char *numberLimit;
    uint32_t codePoints[40];
    uint32_t first = 0, last = 0;
    uint32_t cp;
    int32_t count;

    cursor = u_skipWhitespace(fields[0][0]);

    /* special directive? */
    if ('@' == *cursor) {
        ++cursor;
        count = fields[0][1] - cursor;
        if (count >= NORMALIZE_DIRECTIVE_LEN
            && 0 == uprv_strncmp(cursor, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN)) {
            options[NORMALIZE].doesOccur = TRUE;
            return;
        }
        if (count >= CHECK_BIDI_DIRECTIVE_LEN
            && 0 == uprv_strncmp(cursor, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN)) {
            options[CHECK_BIDI].doesOccur = TRUE;
            return;
        }
        /* unknown directive: report it and carry on with the text after the '@' */
        fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]);
    }

    profileType = fields[2][0];

    if (NULL != uprv_strstr(profileType, usprepTypeNames[USPREP_UNASSIGNED])) {
        u_parseCodePointRange(cursor, &first, &last, pErrorCode);
        if (U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "Could not parse code point range. Error: %s\n", u_errorName(*pErrorCode));
            return;
        }
        /* store the range */
        storeRange(first, last, USPREP_UNASSIGNED, pErrorCode);
    } else if (NULL != uprv_strstr(profileType, usprepTypeNames[USPREP_PROHIBITED])) {
        u_parseCodePointRange(cursor, &first, &last, pErrorCode);
        if (U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "Could not parse code point range. Error: %s\n", u_errorName(*pErrorCode));
            return;
        }
        /* store the range */
        storeRange(first, last, USPREP_PROHIBITED, pErrorCode);
    } else if (NULL != uprv_strstr(profileType, usprepTypeNames[USPREP_MAP])) {
        /* field 0 must consist of exactly one hex code point */
        cp = (uint32_t)uprv_strtoul(cursor, &numberLimit, 16);
        if (numberLimit <= cursor || numberLimit != fields[0][1]) {
            fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]);
            *pErrorCode = U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }

        /* field 1 is the mapping string; parse and store it */
        count = u_parseCodePoints(fields[1][0], codePoints, sizeof(codePoints)/4, pErrorCode);
        storeMapping(cp, codePoints, count, USPREP_MAP, pErrorCode);
    } else {
        *pErrorCode = U_INVALID_FORMAT_ERROR;
    }

    if (U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",
                sourceName, fields[0][0], fields[2][0], u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Line callback for UnicodeData.txt (gennorm): extracts the normalization
 * related data of one code point and passes it to storeNorm().
 *
 *   field 0: code point (hex)
 *   field 3: canonical combining class (decimal, at most 0xff)
 *   field 5: decomposition; a leading <type> marks a compatibility
 *            decomposition, otherwise it is canonical (at most 2 code points)
 *
 * The "<..., First>" / "<..., Last>" range marker lines are skipped.
 * Nothing is stored for code points with combining class 0 and no
 * decomposition. Errors print a message to stderr and call exit().
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    uint32_t decomposition[40];
    Norm norm;
    const char *s;
    char *numberLimit;
    uint32_t code, value;
    int32_t length;
    UBool isCompat;
    UBool hasData=FALSE;

    /* skip the First and Last marker entries of ranges */
    if('<'==*fields[1][0]) {
        length=(int32_t)(fields[1][1]-fields[1][0]);
        if(length>=9 &&
           (uprv_memcmp(", First>", fields[1][1]-8, 8)==0 ||
            uprv_memcmp(", Last>", fields[1][1]-7, 7)==0)
        ) {
            return;
        }
    }

    /* start from all-zero properties */
    uprv_memset(&norm, 0, sizeof(norm));

    /* field 0: the code point; the number must fill the whole field */
    code=(uint32_t)uprv_strtoul(fields[0][0], &numberLimit, 16);
    if(numberLimit<=fields[0][0] || numberLimit!=fields[0][1]) {
        fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* field 3: the canonical combining class */
    value=(uint32_t)uprv_strtoul(fields[3][0], &numberLimit, 10);
    if(numberLimit<=fields[3][0] || numberLimit!=fields[3][1] || value>0xff) {
        fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0) {
        norm.udataCC=(uint8_t)value;
        hasData=TRUE;
    }

    /* field 5: the decomposition, if the field is not empty */
    if(fields[5][0]<fields[5][1]) {
        s=fields[5][0];
        if(*s=='<') {
            /* compatibility decomposition: skip over the "<type>" tag and ignore it */
            isCompat=TRUE;
            ++s;
            for(;;) {
                if(s==fields[5][1]) {
                    /* missing '>' */
                    fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                if(*s++=='>') {
                    break;
                }
            }
        } else {
            isCompat=FALSE;
        }

        /* parse the decomposition string */
        length=u_parseCodePoints(s, decomposition, sizeof(decomposition)/4, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
                    (long)code, u_errorName(*pErrorCode));
            exit(*pErrorCode);
        }

        /* remember the string; norm only points at the local buffer */
        if(length>0) {
            hasData=TRUE;
            if(!isCompat) {
                if(length>2) {
                    fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
                            (long)code, (long)length);
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                norm.lenNFD=(uint8_t)length;
                norm.nfd=decomposition;
            } else {
                norm.lenNFKD=(uint8_t)length;
                norm.nfkd=decomposition;
            }
        }
    }

    /* reject non-character and out-of-range code points */
    if(code>0x10ffff || (uint32_t)(code-0xfdd0)<0x20 || (code&0xfffe)==0xfffe) {
        fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
                (long)code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    if(!hasData) {
        return;
    }

    /* there are normalization values, so store them */
#if 0
    if(beVerbose) {
        printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
               (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
    }
#endif
    storeNorm(code, &norm);
}
/*
 * Line callback for UnicodeData.txt (gencase): builds the Props of one code
 * point and passes them to setProps().
 *
 *   field 0:  code point (hex)
 *   field 2:  general category name, looked up in genCategoryNames[]
 *   field 3:  canonical combining class (decimal, at most 0xff)
 *   field 12: simple uppercase mapping (hex, may be empty)
 *   field 13: simple lowercase mapping (hex, may be empty)
 *   field 14: simple titlecase mapping (hex, may be empty)
 *
 * A mapping is recorded only if it is non-zero and differs from the code
 * point itself; in that case both ends are added to the caseSensitive set.
 * The next pending specialCasings[] / caseFoldings[] entries are attached
 * if they belong to this code point.
 *
 * Code points must be valid (not above U+10FFFF, not non-characters) and
 * arrive in ascending order. Errors print a message to stderr and exit().
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    Props p;
    char *end;
    static UChar32 prevCode=0;
    UChar32 value;
    int32_t i;

    /* reset the properties */
    uprv_memset(&p, 0, sizeof(Props));

    /* get the character code, field 0 */
    p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get general category, field 2 */
    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    if(i>=0) {
        p.gc=(uint8_t)i;
    } else {
        fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n",
                fields[2][0], (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get canonical combining class, field 3 */
    value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
    if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
        fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    p.cc=(uint8_t)value;

    /* get uppercase mapping, field 12 (an empty field parses as 0 == no mapping) */
    value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
    if(end!=fields[12][1]) {
        fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.upperCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get lowercase value, field 13 */
    value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
    if(end!=fields[13][1]) {
        fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.lowerCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* get titlecase value, field 14 */
    value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
    if(end!=fields[14][1]) {
        fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(value!=0 && value!=p.code) {
        p.titleCase=value;
        uset_add(caseSensitive, p.code);
        uset_add(caseSensitive, value);
    }

    /* set additional properties from previously parsed files */
    if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
        p.specialCasing=specialCasings+specialCasingIndex++;
    } else {
        p.specialCasing=NULL;
    }
    if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
        p.caseFolding=caseFoldings+caseFoldingIndex++;

        /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
        if( p.caseFolding->status=='C' &&
            p.caseFolding->simple==p.lowerCase
        ) {
            p.caseFolding=NULL;
        }
    } else {
        p.caseFolding=NULL;
    }

    /*
     * check for non-character code points;
     * the unsigned comparison also rejects values above U+10FFFF (and values
     * that became negative in the conversion to UChar32) before they can
     * reach setProps()
     */
    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20 || (uint32_t)p.code>0x10ffff) {
        fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (p.code) are in ascending order */
    if(p.code<=prevCode && p.code>0) {
        fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)p.code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* properties for a single code point */
    setProps(&p);

    prevCode=p.code;
}
/*
 * Line callback for CaseFolding.txt: parses one mapping into the next free
 * slot caseFoldings[caseFoldingCount] and, unless the line is dropped or
 * merged into the previous entry, keeps it by incrementing caseFoldingCount.
 *
 *   field 0: code point (hex)
 *   field 1: status letter, one of L E C S F I T
 *   field 2: mapping string, parsed by u_parseString() into .full[1..]
 *            (with the length in .full[0]) and .simple
 *
 * context and fieldCount are not used.
 * Errors print a message to stderr, set *pErrorCode and call exit().
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    /* code point of the most recently kept entry, for the ordering check */
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    /* only whitespace may follow the number inside field 0 */
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping: the first non-whitespace character of field 1 */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        /* caseFoldingCount is not incremented, so the slot is reused by the next line */
        return;
    }

    /* get the mapping: full[0] receives the length, full[1..31] the string */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set (not done for 'T' mappings) */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries: the previous entry gets this simple mapping, this slot is not kept */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries: the previous entry gets this full mapping (length + 31 UChars) */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* prevCode is reset so that the ordering check below accepts the same code point again */
            prevCode=0;
            --caseFoldingCount;
        }
        /*
         * store only a marker for special handling for cases like dotless i
         *
         * NOTE(review): if the loop above rewound caseFoldingCount, the slot
         * written here is the earlier entry, whose .status keeps its earlier
         * value rather than 'I'/'T' - confirm that this is intended.
         */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* keep the entry; fail as soon as there is no free slot left for the next line */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
extern int main(int argc, char* argv[]) { UBool sourceTOC, verbose; uint32_t maxSize; U_MAIN_INIT_ARGS(argc, argv); /* preset then read command line options */ argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); /* error handling, printing usage message */ if(argc<0) { fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); } else if(argc<2) { argc=-1; } if(argc<0 || options[0].doesOccur || options[1].doesOccur) { FILE *where = argc < 0 ? stderr : stdout; /* * Broken into chucks because the C89 standard says the minimum * required supported string length is 509 bytes. */ fprintf(where, "%csage: %s [ -h, -?, --help ] [ -v, --verbose ] [ -c, --copyright ] [ -C, --comment comment ] [ -d, --destdir dir ] [ -n, --name filename ] [ -t, --type filetype ] [ -S, --source tocfile ] [ -e, --entrypoint name ] maxsize listfile\n", argc < 0 ? 'u' : 'U', *argv); if (options[0].doesOccur || options[1].doesOccur) { fprintf(where, "\n" "Read the list file (default: standard input) and create a common data\n" "file from specified files. Omit any files larger than maxsize, if maxsize > 0.\n"); fprintf(where, "\n" "Options:\n" "\t-h, -?, --help this usage text\n" "\t-v, --verbose verbose output\n" "\t-c, --copyright include the ICU copyright notice\n" "\t-C, --comment comment include a comment string\n" "\t-d, --destdir dir destination directory\n"); fprintf(where, "\t-n, --name filename output filename, without .type extension\n" "\t (default: " U_ICUDATA_NAME ")\n" "\t-t, --type filetype type of the destination file\n" "\t (default: \" dat \")\n" "\t-S, --source tocfile write a .c source file with the table of\n" "\t contents\n" "\t-e, --entrypoint name override the c entrypoint name\n" "\t (default: \"<name>_<type>\")\n"); } return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } sourceTOC=options[8].doesOccur; verbose = options[2].doesOccur; maxSize=(uint32_t)uprv_strtoul(argv[1], NULL, 0); createCommonDataFile(options[4].doesOccur ? 
options[4].value : NULL, options[6].doesOccur ? options[6].value : NULL, options[9].doesOccur ? options[9].value : options[6].doesOccur ? options[6].value : NULL, options[7].doesOccur ? options[7].value : NULL, options[10].doesOccur ? options[10].value : NULL, options[3].doesOccur ? U_COPYRIGHT_STRING : options[5].doesOccur ? options[5].value : NULL, argc == 2 ? NULL : argv[2], maxSize, sourceTOC, verbose, NULL); return 0; }