// copied from genprops.c
/*
 * Look up the token that s starts with.
 *
 * Leading whitespace in s is skipped. A token matches when s begins with it
 * and the token is followed (after optional whitespace) by ';' or the end of
 * the string. NULL entries in tokens[] are skipped.
 *
 * @param tokens      array of candidate token strings, entries may be NULL
 * @param countTokens number of entries in tokens[]
 * @param s           text to examine
 * @return index of the first matching token, or -1 if none matches
 */
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    int32_t index;

    s=u_skipWhitespace(s);
    for(index=0; index<countTokens; ++index) {
        const char *candidate=tokens[index];
        const char *rest;
        int32_t matched=0;

        if(candidate==NULL) {
            continue;
        }

        /* advance while the candidate has characters left and they agree with s */
        while(candidate[matched]!=0 && s[matched]==candidate[matched]) {
            ++matched;
        }
        if(candidate[matched]!=0) {
            continue; /* mismatch before the end of the candidate */
        }

        /* whole candidate matched: it must be followed by a field end */
        rest=u_skipWhitespace(s+matched);
        if(*rest==';' || *rest==0) {
            return index;
        }
    }
    return -1;
}
/*
 * Line callback for DerivedAge.txt.
 *
 * Field 0 is a code point range; field 1 is either "unassigned" or a version
 * number "major[.minor]". The version is packed as (major<<4)|minor and
 * stored for the whole range with upvec_setValue(). Every syntax or storage
 * error is reported on stderr and terminates the program via exit().
 *
 * context and fieldCount are not used.
 */
static void U_CALLCONV
ageLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode) {
    char *text, *numEnd;
    uint32_t first, last, major, minor, age;

    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    text=(char *)u_skipWhitespace(fields[1][0]);
    if(uprv_strncmp(text, "unassigned", 10)==0) {
        /* nothing to store: the default is already set to 0.0 */
        return;
    }

    /* major version: 1..15, followed by '.', blank, tab or the end of the field */
    major=(uint32_t)uprv_strtoul(text, &numEnd, 10);
    if( numEnd==text || major==0 || major>15 ||
        !(*numEnd=='.' || *numEnd==' ' || *numEnd=='\t' || *numEnd==0)
    ) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    age=major<<4;

    /* optional minor version: 0..15, followed by blank, tab or the end of the field */
    if(*numEnd=='.') {
        text=(char *)u_skipWhitespace(numEnd+1);
        minor=(uint32_t)uprv_strtoul(text, &numEnd, 10);
        if( numEnd==text || minor>15 ||
            !(*numEnd==' ' || *numEnd=='\t' || *numEnd==0)
        ) {
            fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        age|=minor;
    }

    if(first==0 && last==0x10ffff) {
        /* Also set bits for initialValue and errorValue. */
        last=UPVEC_MAX_CP;
    }
    upvec_setValue(pv, first, last, 0, age<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Read a range like start or start..end (hexadecimal code points).
 *
 * Whitespace is skipped before the start value, around the ".." and before
 * the end value. Each value must be below 0x110000 and end must not be less
 * than start. Unlike u_parseCodePointRange(), whatever follows the range is
 * not checked; it is handed back to the caller through *terminator.
 *
 * @param s          text to parse
 * @param pStart     receives the first code point of the range
 * @param pEnd       receives the last code point (equal to *pStart when
 *                   there is no "..end")
 * @param terminator receives a pointer to the character right after the last
 *                   number that was parsed; must not be NULL
 * @param pErrorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                   NULL arguments and to U_PARSE_ERROR for malformed input
 * @return the number of code points in the range, or 0 on error
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char *s,
                                   uint32_t *pStart, uint32_t *pEnd,
                                   const char **terminator,
                                   UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    /* terminator is written unconditionally below, so it needs the same check */
    if(s==NULL || pStart==NULL || pEnd==NULL || terminator==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* read the start code point */
    s=u_skipWhitespace(s);
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if(end<=s || value>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pStart=*pEnd=value;

    /* is there a "..end"? */
    s=u_skipWhitespace(end);
    if(*s!='.' || s[1]!='.') {
        /* single code point: report the position right after the number */
        *terminator=end;
        return 1;
    }
    s=u_skipWhitespace(s+2);

    /* read the end code point */
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if(end<=s || value>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pEnd=value;

    /* is this a valid range? */
    if(value<*pStart) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    *terminator=end;
    return value-*pStart+1;
}
/*
 * Recognize a "# @missing:" prefix.
 *
 * If the string starts with '#', '@', "missing" and ':' (in that order, each
 * optionally preceded by whitespace), return a pointer to the first
 * non-whitespace character after the colon. Otherwise return the original
 * pointer unchanged.
 *
 * Unicode 5.0 adds such lines in some data files to document default
 * property values. This is a poor man's regex for variable amounts of
 * white space.
 */
static const char *
getMissingLimit(const char *s) {
    const char *p;

    p=u_skipWhitespace(s);
    if(*p!='#') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(*p!='@') {
        return s;
    }

    p=u_skipWhitespace(p+1);
    if(strncmp(p, "missing", 7)!=0) {
        return s;
    }

    p=u_skipWhitespace(p+7);
    if(*p!=':') {
        return s;
    }

    return u_skipWhitespace(p+1);
}
/*
 * Line callback for SpecialCasing.txt.
 *
 * Fills specialCasings[specialCasingCount] from one line and then advances
 * specialCasingCount. Field 0 is the code point, fields 1..3 are the
 * lower/title/upper mappings, field 4 optionally holds condition text.
 * Lines with a condition are only flagged as complex and get empty mappings.
 * For unconditional lines the mappings are parsed (length in element [0],
 * text from element [1]) and the code point plus all mapping characters are
 * added to caseSensitive.
 *
 * Errors are printed to stderr and end the program via exit().
 * context and fieldCount are not used.
 */
static void U_CALLCONV
specialCasingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    char *limit;
    const char *condition;

    /* field 0: the code point must fill the field, apart from whitespace */
    specialCasings[specialCasingCount].code=
        (UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &limit, 16);
    limit=(char *)u_skipWhitespace(limit);
    if(limit<=fields[0][0] || limit!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* field 4: anything other than end/';'/'#' is condition text */
    condition=u_skipWhitespace(fields[4][0]);
    if(*condition==0 || *condition==';' || *condition=='#') {
        /* unconditional mapping: clear the "complex" flag and get the case mappings */
        specialCasings[specialCasingCount].isComplex=FALSE;
        specialCasings[specialCasingCount].lowerCase[0]=
            (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].upperCase[0]=
            (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].titleCase[0]=
            (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }

        uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
        _set_addAll(caseSensitive,
                    specialCasings[specialCasingCount].lowerCase+1,
                    specialCasings[specialCasingCount].lowerCase[0]);
        _set_addAll(caseSensitive,
                    specialCasings[specialCasingCount].upperCase+1,
                    specialCasings[specialCasingCount].upperCase[0]);
        _set_addAll(caseSensitive,
                    specialCasings[specialCasingCount].titleCase+1,
                    specialCasings[specialCasingCount].titleCase[0]);
    } else {
        /* there is some condition text in the fifth field */
        specialCasings[specialCasingCount].isComplex=TRUE;

        /* do not store any actual mappings for this */
        specialCasings[specialCasingCount].lowerCase[0]=0;
        specialCasings[specialCasingCount].upperCase[0]=0;
        specialCasings[specialCasingCount].titleCase[0]=0;
    }

    /* the entry is now in use; stop before the table can overflow */
    if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
        fprintf(stderr, "gencase: too many special casing mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/*
 * Parse a whitespace-separated list of hexadecimal code points, up to ';' or
 * the end of the string, and store them as a string in dest[destCapacity].
 *
 * If the result does not fit, nothing more is written but the length keeps
 * being counted, so the return value is the required length and the error
 * code becomes U_BUFFER_OVERFLOW_ERROR. If the text fits exactly but there is
 * no room for the terminating 0, U_STRING_NOT_TERMINATED_WARNING is set.
 *
 * @param s            text to parse
 * @param dest         output buffer, may be NULL only if destCapacity is 0
 * @param destCapacity capacity of dest in UChars
 * @param pFirst       if not NULL, receives the first code point, or
 *                     0xffffffff if the list is empty
 * @param pErrorCode   in/out error code; U_PARSE_ERROR for malformed input
 * @return The length of the string in numbers of UChars; 0 on error.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode) {
    char *numEnd;
    uint32_t cp;
    int32_t length;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pFirst!=NULL) {
        *pFirst=0xffffffff;
    }

    length=0;
    while(*(s=u_skipWhitespace(s))!=';' && *s!=0) {
        /* read one code point; it must be followed by whitespace or a field end */
        cp=(uint32_t)uprv_strtoul(s, &numEnd, 16);
        if( numEnd<=s ||
            !(U_IS_INV_WHITESPACE(*numEnd) || *numEnd==';' || *numEnd==0) ||
            cp>=0x110000
        ) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }

        /* report only the very first code point */
        if(pFirst!=NULL) {
            *pFirst=cp;
            pFirst=NULL;
        }

        /* append it if it fits, otherwise only count its length */
        if((length+U16_LENGTH(cp))<=destCapacity) {
            U16_APPEND_UNSAFE(dest, length, cp);
        } else {
            length+=U16_LENGTH(cp);
        }

        /* go to the following characters */
        s=numEnd;
    }

    /* end of the list: terminate the string or report why that is impossible */
    if(length<destCapacity) {
        dest[length]=0;
    } else if(length==destCapacity) {
        *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
    } else {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return length;
}
U_CDECL_BEGIN

/*
 * Line callback that checks one line of a StringPrep profile against the
 * data under test.
 *
 * Field 0 holds a code point or range, field 1 a mapping string and field 2
 * the type name. Lines whose first field starts with '@' are directives and
 * are ignored. Depending on the type name found in field 2, the parsed range
 * is passed to compareFlagsForRange() or the parsed mapping to
 * compareMapping().
 *
 * Sets *pErrorCode to U_INVALID_FORMAT_ERROR when the line does not have
 * exactly three fields or the type name is not recognized. If parsing a
 * range, code point or mapping fails, the function returns with the failure
 * left in *pErrorCode and does not run the comparison on unparsed values.
 */
static void U_CALLCONV
strprepProfileLineFn(void * /*context*/,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end, *map;
    uint32_t code;
    int32_t length;
    /*UBool* mapWithNorm = (UBool*) context;*/
    const char* typeName;
    uint32_t rangeStart=0,rangeEnd =0;
    const char *s;

    s = u_skipWhitespace(fields[0][0]);
    if (*s == '@') {
        /* a special directive introduced in 4.2 */
        return;
    }

    if(fieldCount != 3){
        *pErrorCode = U_INVALID_FORMAT_ERROR;
        return;
    }

    typeName = fields[2][0];
    map = fields[1][0];

    if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){

        u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
        if(U_FAILURE(*pErrorCode)){
            /* do not compare the still-zero range */
            return;
        }

        /* store the range */
        compareFlagsForRange(rangeStart,rangeEnd,USPREP_UNASSIGNED);

    }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){

        u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
        if(U_FAILURE(*pErrorCode)){
            /* do not compare the still-zero range */
            return;
        }

        /* store the range */
        compareFlagsForRange(rangeStart,rangeEnd,USPREP_PROHIBITED);

    }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){
        /* get the character code, field 0 */
        code=(uint32_t)uprv_strtoul(s, &end, 16);
        if(end<=s){
            /* no hexadecimal digits at all: there is no code point to compare */
            *pErrorCode = U_PARSE_ERROR;
            return;
        }

        /* parse the mapping string */
        length=u_parseCodePoints(map, mapping, sizeof(mapping)/sizeof(mapping[0]), pErrorCode);
        if(U_FAILURE(*pErrorCode)){
            return;
        }

        /* store the mapping */
        compareMapping(code,mapping, length,USPREP_MAP);

    }else{
        *pErrorCode = U_INVALID_FORMAT_ERROR;
    }

}
/*
 * Trim a field in place and NUL-terminate it.
 *
 * Leading whitespace is skipped with u_skipWhitespace(); trailing blanks and
 * tabs before limit are dropped by writing a 0 at the new end.
 *
 * @param s     start of the field
 * @param limit one past the last character of the field
 * @return pointer to the first non-whitespace character of the field
 */
static char *
trimTerminateField(char *s, char *limit) {
    char *first=(char *)u_skipWhitespace(s);
    char *last=limit;

    /* back up over trailing blanks and tabs, but never before the trimmed start */
    while(first<last) {
        char c=last[-1];
        if(c!=' ' && c!='\t') {
            break;
        }
        --last;
    }
    *last=0;

    return first;
}
/* TODO: move to toolutil */
/*
 * Test whether s, after leading whitespace, starts with token and the token
 * is followed (after optional whitespace) by ';' or the end of the string.
 *
 * @return TRUE on such a match, FALSE otherwise
 */
static UBool
isToken(const char *token, const char *s) {
    const char *rest;
    int32_t n=0;

    s=u_skipWhitespace(s);

    /* every character of the token must be present in s */
    while(token[n]!=0) {
        if(s[n]!=token[n]) {
            return FALSE;
        }
        ++n;
    }

    /* the token must not merely be a prefix of a longer word */
    rest=u_skipWhitespace(s+n);
    if(*rest==';' || *rest==0) {
        return TRUE;
    }
    return FALSE;
}
// Parses the whitespace-separated decimal numbers of an "@Reorder:" line
// into ordering[] and sets orderingCount.
// Each number must be below 1000 and be followed by whitespace or the end of
// the string; otherwise an error is logged and FALSE is returned.
UBool BiDiConformanceTest::parseOrdering(const char *start) {
    orderingCount=0;
    for(;;) {
        if(*start==0) {
            break;
        }
        start=u_skipWhitespace(start);
        if(*start==0) {
            break;
        }

        char *numEnd;
        uint32_t index=(uint32_t)strtoul(start, &numEnd, 10);
        UBool hasDigits= numEnd>start;
        if(!hasDigits || !(U_IS_INV_WHITESPACE(*numEnd) || *numEnd==0) || index>=1000) {
            errln("@Reorder: parse error at %s", start);
            return FALSE;
        }
        ordering[orderingCount]=(int32_t)index;
        ++orderingCount;
        start=numEnd;
    }
    return TRUE;
}
/*
 * Get a name, stripping leading and trailing whitespace.
 *
 * @param pStart in: start of the field; out: first non-whitespace character
 * @param limit  one past the last character of the field; the text itself is
 *               not modified
 * @return length of the trimmed name (distance from the new *pStart to the
 *         trimmed end)
 */
static int16_t
getName(char **pStart, char *limit) {
    char *first=(char *)u_skipWhitespace(*pStart);
    char *last=limit;

    /* back up over trailing blanks and tabs */
    while(first<last) {
        char c=last[-1];
        if(c!=' ' && c!='\t') {
            break;
        }
        --last;
    }

    *pStart=first;
    return (int16_t)(last-first);
}
/*
 * Line callback for files that list binary properties (genprops).
 *
 * context points to a Binaries description. Field 0 is a code point range,
 * field 1 a property name. The name is looked up in bin->binaries[];
 * unrecognized names are ignored (and recorded with addIgnoredProp() when
 * beVerbose is set). For a known property the value U_MASK(vecShift) is set
 * in vector word vecWord for the whole range.
 *
 * Errors are printed to stderr and end the program via exit().
 * fieldCount is not used.
 */
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin=(const Binaries *)context;
    char *name;
    uint32_t first, last, bit;
    int32_t index;

    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }

    /* find the binary property name in the table */
    name=(char *)u_skipWhitespace(fields[1][0]);
    index=0;
    while(index!=bin->binariesCount && !isToken(bin->binaries[index].propName, name)) {
        ++index;
    }
    if(index==bin->binariesCount) {
        /* ignore unrecognized properties */
        if(beVerbose) {
            addIgnoredProp(name, fields[1][1]);
        }
        return;
    }

    if(bin->binaries[index].vecShift>=32) {
        fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
                        (int)bin->binaries[index].vecShift, bin->ucdFile, bin->binaries[index].propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
    bit=U_MASK(bin->binaries[index].vecShift);

    if(first==0 && last==0x10ffff) {
        /* Also set bits for initialValue and errorValue. */
        last=UPVEC_MAX_CP;
    }
    /* the same value serves as both value and mask */
    upvec_setValue(pv, first, last, bin->binaries[index].vecWord, bit, bit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set %s code: %s\n",
                        bin->binaries[index].propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Read a range like start or start..end that must be followed, after
 * optional whitespace, by ';' or the end of the string.
 *
 * The parsing itself is done by u_parseCodePointRangeAnyTerminator(); this
 * wrapper only adds the check of what follows the range.
 *
 * @return the value returned by u_parseCodePointRangeAnyTerminator(), or 0
 *         with U_PARSE_ERROR if other text follows the range
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode) {
    const char *after;
    int32_t count;

    count=u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &after, pErrorCode);
    if(!U_SUCCESS(*pErrorCode)) {
        /* pass the result through unchanged when the range itself failed to parse */
        return count;
    }

    after=u_skipWhitespace(after);
    if(*after==';' || *after==0) {
        return count;
    }

    *pErrorCode=U_PARSE_ERROR;
    return 0;
}
/*
 * Parse one line of a mapping table and add the mapping to ucm.
 *
 * Lines that start with '#' in the first column, and lines that contain
 * nothing but whitespace, are ignored and count as success. Every other
 * line is parsed with ucm_parseMappingLine() and the result is added with
 * ucm_addMappingAuto().
 *
 * @return TRUE for ignored lines; otherwise whether both parsing and adding
 *         succeeded
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
    UCMapping m={ 0, {0}, 0, 0, 0, 0 };
    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
    uint8_t bytes[UCNV_EXT_MAX_BYTES];
    const char *text;

    /* ignore comment lines */
    if(line[0]=='#') {
        return TRUE;
    }

    /* ignore empty lines */
    text=u_skipWhitespace(line);
    if(*text==0 || *text=='\n' || *text=='\r') {
        return TRUE;
    }

    /* the parser gets the original line, not the whitespace-skipped one */
    if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
        return FALSE;
    }
    return ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes)!=0;
}
/*
 * Line callback for files that list binary properties (genbidi).
 *
 * context points to a Binaries description. Field 0 is a code point range,
 * field 1 a property name. The name is looked up in bin->binaries[];
 * unrecognized names are silently ignored. For a known property,
 * vecValue/vecMask are set in vector word vecWord for the whole range.
 *
 * Errors are printed to stderr and end the program via exit().
 * fieldCount is not used.
 */
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin=(const Binaries *)context;
    char *name;
    uint32_t first, last;
    int32_t index;

    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genbidi: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }

    /* find the binary property name in the table */
    name=(char *)u_skipWhitespace(fields[1][0]);
    index=0;
    while(index!=bin->binariesCount && !isToken(bin->binaries[index].propName, name)) {
        ++index;
    }
    if(index==bin->binariesCount) {
        /* ignore unrecognized properties */
        return;
    }

    /* an empty mask could not change anything: treat it as a table error */
    if(bin->binaries[index].vecMask==0) {
        fprintf(stderr, "genbidi error: mask value %d==0 for %s %s\n",
                        (int)bin->binaries[index].vecMask, bin->ucdFile, bin->binaries[index].propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }

    upvec_setValue(pv, first, last, bin->binaries[index].vecWord,
                   bin->binaries[index].vecValue, bin->binaries[index].vecMask, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genbidi error: unable to set %s, code: %s\n",
                        bin->binaries[index].propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Parse a whitespace-separated list of hexadecimal code points, up to ';' or
 * the end of the string, and store them as a UTF-32 string in
 * dest[destCapacity].
 *
 * When the buffer is full, further code points are still parsed but not
 * stored or counted, and the error code becomes U_BUFFER_OVERFLOW_ERROR.
 *
 * @param s            text to parse
 * @param dest         output buffer, may be NULL only if destCapacity is 0
 * @param destCapacity capacity of dest in code points
 * @param pErrorCode   in/out error code; U_PARSE_ERROR for malformed input
 * @return the number of code points stored; 0 on a parse or argument error
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char *s,
                  uint32_t *dest, int32_t destCapacity,
                  UErrorCode *pErrorCode) {
    char *numEnd;
    uint32_t cp;
    int32_t stored;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    stored=0;
    while(*(s=u_skipWhitespace(s))!=';' && *s!=0) {
        /* read one code point; it must be followed by whitespace or a field end */
        cp=(uint32_t)uprv_strtoul(s, &numEnd, 16);
        if( numEnd<=s ||
            !(U_IS_INV_WHITESPACE(*numEnd) || *numEnd==';' || *numEnd==0) ||
            cp>=0x110000
        ) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }

        /* append it to the destination array, or note that there is no room */
        if(stored<destCapacity) {
            dest[stored]=cp;
            ++stored;
        } else {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }

        /* go to the following characters */
        s=numEnd;
    }
    return stored;
}
// Parses the whitespace-separated entries of an "@Levels:" line into
// levels[] and sets levelsCount and directionBits.
// An entry is either 'x', stored as UBIDI_DEFAULT_LTR, or a decimal level of
// at most UBIDI_MAX_EXPLICIT_LEVEL+1 followed by whitespace or the end of
// the string. For each numeric level, bit (level&1) is set in directionBits.
// On a malformed entry an error is logged and FALSE is returned.
UBool BiDiConformanceTest::parseLevels(const char *start) {
    directionBits=0;
    levelsCount=0;
    for(;;) {
        if(*start==0) {
            break;
        }
        start=u_skipWhitespace(start);
        if(*start==0) {
            break;
        }

        if(*start=='x') {
            levels[levelsCount]=UBIDI_DEFAULT_LTR;
            ++levelsCount;
            ++start;
            continue;
        }

        char *numEnd;
        uint32_t level=(uint32_t)strtoul(start, &numEnd, 10);
        UBool hasDigits= numEnd>start;
        if( !hasDigits ||
            !(U_IS_INV_WHITESPACE(*numEnd) || *numEnd==0) ||
            level>(UBIDI_MAX_EXPLICIT_LEVEL+1)
        ) {
            errln("@Levels: parse error at %s", start);
            return FALSE;
        }
        levels[levelsCount]=(UBiDiLevel)level;
        ++levelsCount;
        directionBits|=(1<<(level&1));
        start=numEnd;
    }
    return TRUE;
}
/*
 * Line callback for DerivedNormalizationProperties.txt.
 *
 * Field 0 is a code point range, field 1 a property name. Three groups of
 * properties are handled; anything else is ignored:
 * - NF[K]C_xxx / NF[K]D_xxx quick check values: setQCFlags() is called for
 *   every code point of the range
 * - Comp_Ex / Full_Composition_Exclusion: setCompositionExclusion() is
 *   called for every code point of the range
 * - FNC / FC_NFKC: the following field is parsed as a string and passed to
 *   setFNC() for every code point of the range
 *
 * Ranges that start at U+AC00 are skipped entirely.
 * Errors are printed to stderr and end the program via exit().
 * context and fieldCount are not used.
 *
 * NOTE(review): the parser reads from field 1 past its ';' into the next
 * field, so it appears to rely on the delimiters still being present in the
 * line buffer - confirm against the caller.
 */
static void U_CALLCONV
derivedNormalizationPropertiesLineFn(void *context,
                                     char *fields[][2], int32_t fieldCount,
                                     UErrorCode *pErrorCode) {
    UChar string[32];
    char *s;
    uint32_t start, end;
    int32_t count;
    uint8_t qcFlags;

    /* get code point range (count is assigned but not used afterwards) */
    count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* ignore hangul - handle explicitly */
    if(start==0xac00) {
        return;
    }

    /* get property - ignore unrecognized ones */
    s=(char *)u_skipWhitespace(fields[1][0]);
    if(*s=='N' && s[1]=='F') {
        /* quick check flag: start with 0x11 and shift it according to the form */
        qcFlags=0x11;
        s+=2;
        if(*s=='K') {
            /* "NFK...": shift left by one */
            qcFlags<<=1;
            ++s;
        }

        if(*s=='C' && s[1]=='_') {
            s+=2;
        } else if(*s=='D' && s[1]=='_') {
            /* "...D_": shift left by two more */
            qcFlags<<=2;
            s+=2;
        } else {
            return;
        }

        /* "NO" keeps only the low four bits, "MAYBE" only bits 4 and 5 */
        if(0==uprv_memcmp(s, "NO", 2)) {
            qcFlags&=0xf;
        } else if(0==uprv_memcmp(s, "MAYBE", 5)) {
            qcFlags&=0x30;
        } else if(0==uprv_memcmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
            /*
             * Unicode 4.0.1:
             * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
             */
            /* start of the field */
            s=(char *)u_skipWhitespace(s+1);
            if(*s=='N') {
                qcFlags&=0xf;
            } else if(*s=='M') {
                qcFlags&=0x30;
            } else {
                return; /* do nothing for "Yes" because it's the default value */
            }
        } else {
            return; /* do nothing for "Yes" because it's the default value */
        }

        /* set this flag for all code points in this range */
        while(start<=end) {
            setQCFlags(start++, qcFlags);
        }
    } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
        /* full composition exclusion */
        while(start<=end) {
            setCompositionExclusion(start++);
        }
    } else if(
        ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') ||
        (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
    ) {
        /* FC_NFKC_Closure, parse field 2 to get the string */
        char *t;

        /* start of the field (s was left on the ';' by the condition above) */
        s=(char *)u_skipWhitespace(s+1);

        /* find the end of the field and terminate it in place */
        for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
        *t=0;

        /* string[0] holds the length, the text follows from string[1] */
        string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }
        while(start<=end) {
            setFNC(start++, string);
        }
    }
}
/*
 * Read a delimiter-separated text file line by line and call lineFn for
 * every line that contains data.
 *
 * For each data line, fields[i][0] and fields[i][1] are set to the start and
 * limit of field i inside the internal line buffer; the buffer is reused for
 * the next line. Text from '#' on is removed as a comment, except that a
 * leading "# @missing:" prefix is skipped and the rest of such a line is
 * parsed as data, with *pErrorCode preset to U_USING_DEFAULT_WARNING.
 *
 * @param filename   file to read; NULL, "" or "-" selects standard input
 * @param delimiter  character that separates fields
 * @param fields     caller-provided array that receives the field pointers
 * @param fieldCount number of fields expected per line; a line with fewer
 *                   fields is a U_PARSE_ERROR
 * @param lineFn     callback for each data line
 * @param context    passed through to lineFn
 * @param pErrorCode in/out error code. It is reset at the start of every
 *                   line; reading stops at the first failure, whether set
 *                   here or by lineFn.
 */
U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char *filename, char delimiter,
                     char *fields[][2], int32_t fieldCount,
                     UParseLineFn *lineFn, void *context,
                     UErrorCode *pErrorCode) {
    FileStream *file;
    char line[300];
    char *start, *limit;
    int32_t i, length;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    if(fields==NULL || lineFn==NULL || fieldCount<=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* filename==NULL from here on means "reading stdin, do not close it" */
    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
        filename=NULL;
        file=T_FileStream_stdin();
    } else {
        file=T_FileStream_open(filename, "r");
    }
    if(file==NULL) {
        *pErrorCode=U_FILE_ACCESS_ERROR;
        return;
    }

    while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
        /* remove trailing newline characters */
        length=(int32_t)(u_rtrim(line)-line);

        /*
         * detect a line with # @missing:
         * start parsing after that, or else from the beginning of the line
         * set the default warning for @missing lines
         */
        start=(char *)getMissingLimit(line);
        if(start==line) {
            *pErrorCode=U_ZERO_ERROR;
        } else {
            *pErrorCode=U_USING_DEFAULT_WARNING;
        }

        /* skip this line if it is empty or a comment */
        if(*start==0 || *start=='#') {
            continue;
        }

        /* remove in-line comments */
        limit=uprv_strchr(start, '#');
        if(limit!=NULL) {
            /* get white space before the pound sign */
            while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
                --limit;
            }

            /* truncate the line */
            *limit=0;
        }

        /* skip lines with only whitespace */
        if(u_skipWhitespace(start)[0]==0) {
            continue;
        }

        /* split the line: record start and limit of each field */
        for(i=0; i<fieldCount; ++i) {
            /* set the limit pointer of this field */
            limit=start;
            while(*limit!=delimiter && *limit!=0) {
                ++limit;
            }

            /* set the field start and limit in the fields array */
            fields[i][0]=start;
            fields[i][1]=limit;

            /* set start to the beginning of the next field, if any */
            start=limit;
            if(*start!=0) {
                ++start;
            } else if(i+1<fieldCount) {
                /* the line ended before all expected fields were seen */
                *pErrorCode=U_PARSE_ERROR;
                limit=line+length;
                i=fieldCount;
                break;
            }
        }

        /* too few fields on this line? */
        if(U_FAILURE(*pErrorCode)) {
            break;
        }

        /* call the field function */
        lineFn(context, fields, fieldCount, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            break;
        }
    }

    /* close only what was opened here, never stdin */
    if(filename!=NULL) {
        T_FileStream_close(file);
    }
}
/*
 * Line callback for CaseFolding.txt.
 *
 * Fills caseFoldings[caseFoldingCount] from one line: field 0 is the code
 * point, field 1 the status letter (one of L E C S F I T) and field 2 the
 * mapping. The mapping is stored in full[] with its length in full[0];
 * simple receives the first code point of the mapping, or 0 when the mapping
 * is not exactly one code point.
 *
 * Status handling:
 * - 'L' lines are ignored
 * - an 'S' line directly after an 'F' line for the same code point (or the
 *   other way around) is merged into that previous entry
 * - 'I' and 'T' lines remove preceding entries for the same code point and
 *   store only a marker (simple and full[0] both 0)
 *
 * Entries must arrive in ascending code point order; the static prevCode
 * remembers the last code point stored across calls.
 * Errors are printed to stderr and end the program via exit().
 * context and fieldCount are not used.
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point; it must fill field 0 apart from whitespace */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping: the first non-whitespace character of field 1 */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping: length goes to full[0], the text starts at full[1] */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries; the count is not advanced */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries; the count is not advanced */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            /* resetting prevCode lets the ordering check below accept the same code point again */
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    /* the entry is now in use; stop before the table can overflow */
    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/*
 * Read a converter mapping file into data->ucm.
 *
 * The header is read first, then the base mapping table, then - if another
 * "CHARMAP" line follows - one extension table. Empty lines and lines that
 * start with '#' between the two tables are skipped; any other text there
 * is reported on stderr and ends the reading.
 *
 * The file is closed on every path out of this function once it was opened.
 *
 * @param data          receives the newly opened UCMFile in data->ucm
 * @param converterName path of the file to read
 * @param pErrorCode    in/out error code: U_FILE_ACCESS_ERROR if the file
 *                      cannot be opened, U_INVALID_TABLE_FORMAT if the base
 *                      or extension table has mixed flags, or whatever the
 *                      header/table readers set
 * @return TRUE if a base table was read, FALSE for an extension table;
 *         also FALSE when reading stopped early because of an error
 */
static UBool
readFile(ConvData *data, const char* converterName,
         UErrorCode *pErrorCode) {
    char line[1024];
    char *end;
    FileStream *convFile;

    UCMStates *baseStates;
    UBool dataIsBase;

    if(U_FAILURE(*pErrorCode)) {
        return FALSE;
    }

    data->ucm=ucm_open();

    convFile=T_FileStream_open(converterName, "r");
    if(convFile==NULL) {
        *pErrorCode=U_FILE_ACCESS_ERROR;
        return FALSE;
    }

    readHeader(data, convFile, converterName, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        /* do not leak the open file on the error path */
        T_FileStream_close(convFile);
        return FALSE;
    }

    /* a file without a base name is itself a base table */
    if(data->ucm->baseName[0]==0) {
        dataIsBase=TRUE;
        baseStates=&data->ucm->states;
        ucm_processStates(baseStates, IGNORE_SISO_CHECK);
    } else {
        dataIsBase=FALSE;
        baseStates=NULL;
    }

    /* read the base table */
    ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        /* do not leak the open file on the error path */
        T_FileStream_close(convFile);
        return FALSE;
    }

    /* read an extension table if there is one */
    while(T_FileStream_readLine(convFile, line, sizeof(line))) {
        /* strip trailing line ends, blanks and tabs */
        end=uprv_strchr(line, 0);
        while(line<end &&
              (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
            --end;
        }
        *end=0;

        if(line[0]=='#' || u_skipWhitespace(line)==end) {
            continue; /* ignore empty and comment lines */
        }

        if(0==uprv_strcmp(line, "CHARMAP")) {
            /* read the extension table */
            ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
        } else {
            fprintf(stderr, "unexpected text after the base mapping table\n");
        }
        /* only the first non-empty, non-comment line is looked at */
        break;
    }

    T_FileStream_close(convFile);

    if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
        fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
        *pErrorCode=U_INVALID_TABLE_FORMAT;
    }

    return dataIsBase;
}
/*
 * Line callback that stores one line of a StringPrep profile (gensprep).
 *
 * context is the file name, used only in error messages. Field 0 holds a
 * code point or range, field 1 a mapping string and field 2 the type name.
 *
 * A first field that starts with '@' is a directive: the normalization and
 * check-bidi directives set the corresponding entry in options[] and end the
 * handling of the line. An unknown directive is reported on stderr and the
 * line is then processed like a data line, starting after the '@'.
 *
 * Depending on the type name found in field 2, the parsed range is stored
 * with storeRange() or the parsed mapping with storeMapping(); an
 * unrecognized type sets U_INVALID_FORMAT_ERROR. A range that cannot be
 * parsed is reported and the function returns with the error code set;
 * every other failure is reported and ends the program via exit().
 *
 * fieldCount is not used.
 */
static void U_CALLCONV
strprepProfileLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    const char *filename=(const char *)context;
    const char *typeName;
    const char *s;
    char *numEnd, *map;
    uint32_t mapping[40];
    uint32_t code;
    uint32_t first=0, last=0;
    int32_t length;

    s=u_skipWhitespace(fields[0][0]);
    if(*s=='@') {
        /* special directive: compare the text after the '@' */
        ++s;
        length=fields[0][1]-s;
        if( length>=NORMALIZE_DIRECTIVE_LEN &&
            0==uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN)
        ) {
            options[NORMALIZE].doesOccur=TRUE;
            return;
        }
        if( length>=CHECK_BIDI_DIRECTIVE_LEN &&
            0==uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN)
        ) {
            options[CHECK_BIDI].doesOccur=TRUE;
            return;
        }
        /* unknown directive: report it and continue with the line as data */
        fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]);
    }

    typeName=fields[2][0];
    map=fields[1][0];

    if(NULL!=uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])) {

        u_parseCodePointRange(s, &first, &last, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
            return;
        }

        /* store the range */
        storeRange(first, last, USPREP_UNASSIGNED, pErrorCode);

    } else if(NULL!=uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])) {

        u_parseCodePointRange(s, &first, &last, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
            return;
        }

        /* store the range */
        storeRange(first, last, USPREP_PROHIBITED, pErrorCode);

    } else if(NULL!=uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])) {

        /* field 0: the character code must reach exactly to the end of the field */
        code=(uint32_t)uprv_strtoul(s, &numEnd, 16);
        if(numEnd<=s || numEnd!=fields[0][1]) {
            fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }

        /* parse the mapping string */
        length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode);

        /* store the mapping */
        storeMapping(code, mapping, length, USPREP_MAP, pErrorCode);

    } else {
        *pErrorCode=U_INVALID_FORMAT_ERROR;
    }

    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",filename,
               fields[0][0],fields[2][0],u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Read a range like start or start..end (hexadecimal code points).
 *
 * Leading whitespace is skipped. The range must be followed, after optional
 * whitespace, by ';' or the end of the string. Each value must be below
 * 0x110000 and end must not be less than start.
 *
 * @param s          text to parse
 * @param pStart     receives the first code point of the range
 * @param pEnd       receives the last code point (equal to *pStart when
 *                   there is no "..end")
 * @param pErrorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                   NULL arguments and to U_PARSE_ERROR for malformed input
 * @return the number of code points in the range, or 0 on error
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || pStart==NULL || pEnd==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        /* must stop here: everything below dereferences these pointers */
        return 0;
    }

    s=u_skipWhitespace(s);
    if(*s==';' || *s==0) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    /*
     * read the start code point
     * The number may also be ended by the string terminator, which the
     * "..end" check below explicitly treats as a valid end of the range.
     */
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if( end<=s ||
        (*end!=' ' && *end!='\t' && *end!='.' && *end!=';' && *end!=0) ||
        value>=0x110000
    ) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pStart=*pEnd=value;

    /* is there a "..end"? */
    s=u_skipWhitespace(end);
    if(*s==';' || *s==0) {
        return 1;
    }

    if(*s!='.' || s[1]!='.') {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    s+=2;

    /* read the end code point; it may likewise be ended by the string terminator */
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if( end<=s ||
        (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) ||
        value>=0x110000
    ) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pEnd=value;

    /* is this a valid range? */
    if(value<*pStart) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    /* no garbage after that? */
    s=u_skipWhitespace(end);
    if(*s==';' || *s==0) {
        return value-*pStart+1;
    } else {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
}
void BiDiConformanceTest::TestBidiTest() { if(isICUVersionBefore(52, 1)) { // TODO: Update the ICU BiDi code to implement the additions in the Unicode 6.3 BiDi Algorithm, // and reenable the BiDi conformance test. return; } IcuTestErrorCode errorCode(*this, "TestBidiTest"); const char *sourceTestDataPath=getSourceTestData(errorCode); if(errorCode.logIfFailureAndReset("unable to find the source/test/testdata " "folder (getSourceTestData())")) { return; } char bidiTestPath[400]; strcpy(bidiTestPath, sourceTestDataPath); strcat(bidiTestPath, "BidiTest.txt"); LocalStdioFilePointer bidiTestFile(fopen(bidiTestPath, "r")); if(bidiTestFile.isNull()) { errln("unable to open %s", bidiTestPath); return; } LocalUBiDiPointer ubidi(ubidi_open()); ubidi_setClassCallback(ubidi.getAlias(), biDiConfUBiDiClassCallback, NULL, NULL, NULL, errorCode); if(errorCode.logIfFailureAndReset("ubidi_setClassCallback()")) { return; } lineNumber=0; levelsCount=0; orderingCount=0; errorCount=0; while(errorCount<10 && fgets(line, (int)sizeof(line), bidiTestFile.getAlias())!=NULL) { ++lineNumber; // Remove trailing comments and whitespace. char *commentStart=strchr(line, '#'); if(commentStart!=NULL) { *commentStart=0; } u_rtrim(line); const char *start=u_skipWhitespace(line); if(*start==0) { continue; // Skip empty and comment-only lines. } if(*start=='@') { ++start; if(0==strncmp(start, "Levels:", 7)) { if(!parseLevels(start+7)) { return; } } else if(0==strncmp(start, "Reorder:", 8)) { if(!parseOrdering(start+8)) { return; } } // Skip unknown @Xyz: ... } else { if(!parseInputStringFromBiDiClasses(start)) { return; } start=u_skipWhitespace(start); if(*start!=';') { errln("missing ; separator on input line %s", line); return; } start=u_skipWhitespace(start+1); char *end; uint32_t bitset=(uint32_t)strtoul(start, &end, 16); if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0)) { errln("input bitset parse error at %s", start); return; } // Loop over the bitset. 
static const UBiDiLevel paraLevels[]={ UBIDI_DEFAULT_LTR, 0, 1, UBIDI_DEFAULT_RTL }; static const char *const paraLevelNames[]={ "auto/LTR", "LTR", "RTL", "auto/RTL" }; for(int i=0; i<=3; ++i) { if(bitset&(1<<i)) { ubidi_setPara(ubidi.getAlias(), inputString.getBuffer(), inputString.length(), paraLevels[i], NULL, errorCode); const UBiDiLevel *actualLevels=ubidi_getLevels(ubidi.getAlias(), errorCode); if(errorCode.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) { errln("Input line %d: %s", (int)lineNumber, line); return; } if(!checkLevels(actualLevels, ubidi_getProcessedLength(ubidi.getAlias()), paraLevelNames[i])) { // continue outerLoop; does not exist in C++ // so just break out of the inner loop. break; } if(!checkOrdering(ubidi.getAlias(), paraLevelNames[i])) { // continue outerLoop; does not exist in C++ // so just break out of the inner loop. break; } } } } } }
/*
 * Parses a white-space-separated list of short BiDi class names
 * (L, R, EN, LRE, PDF, ...) starting at start, and builds inputString from
 * them by appending charFromBiDiClass[] for each recognized class.
 *
 * Parsing stops at the end of the string or at a ';'.
 * start is advanced past everything that was consumed.
 *
 * Returns TRUE on success. If a name is not recognized, or is only the
 * prefix of a longer word, an error is logged and FALSE is returned.
 *
 * The class names are matched by hand, character by character.
 * A simple parser could terminate or extract the name string and use
 *   int32_t biDiClassInt=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, bidiClassString);
 * but that makes this test take significantly more time.
 */
UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) {
    inputString.remove();
    for(;;) {
        if(*start==0) {
            break;
        }
        start=u_skipWhitespace(start);
        if(*start==0 || *start==';') {
            break;
        }

        // Dispatch on the first letter, then look at the second and third
        // letters only as far as needed to identify a complete short name.
        UCharDirection biDiClass=U_CHAR_DIRECTION_COUNT;
        switch(start[0]) {
        case 'L':
            if(start[1]!='R') {
                biDiClass=U_LEFT_TO_RIGHT;
            } else {
                switch(start[2]) {
                case 'E': biDiClass=U_LEFT_TO_RIGHT_EMBEDDING; break;
                case 'I': biDiClass=U_LEFT_TO_RIGHT_ISOLATE; break;
                case 'O': biDiClass=U_LEFT_TO_RIGHT_OVERRIDE; break;
                default: break;
                }
            }
            break;
        case 'R':
            if(start[1]!='L') {
                biDiClass=U_RIGHT_TO_LEFT;
            } else {
                switch(start[2]) {
                case 'E': biDiClass=U_RIGHT_TO_LEFT_EMBEDDING; break;
                case 'I': biDiClass=U_RIGHT_TO_LEFT_ISOLATE; break;
                case 'O': biDiClass=U_RIGHT_TO_LEFT_OVERRIDE; break;
                default: break;
                }
            }
            break;
        case 'E':
            switch(start[1]) {
            case 'N': biDiClass=U_EUROPEAN_NUMBER; break;
            case 'S': biDiClass=U_EUROPEAN_NUMBER_SEPARATOR; break;
            case 'T': biDiClass=U_EUROPEAN_NUMBER_TERMINATOR; break;
            default: break;
            }
            break;
        case 'A':
            switch(start[1]) {
            case 'L': biDiClass=U_RIGHT_TO_LEFT_ARABIC; break;
            case 'N': biDiClass=U_ARABIC_NUMBER; break;
            default: break;
            }
            break;
        case 'C':
            if(start[1]=='S') {
                biDiClass=U_COMMON_NUMBER_SEPARATOR;
            }
            break;
        case 'B':
            biDiClass= start[1]=='N' ? U_BOUNDARY_NEUTRAL : U_BLOCK_SEPARATOR;
            break;
        case 'S':
            biDiClass=U_SEGMENT_SEPARATOR;
            break;
        case 'W':
            if(start[1]=='S') {
                biDiClass=U_WHITE_SPACE_NEUTRAL;
            }
            break;
        case 'O':
            if(start[1]=='N') {
                biDiClass=U_OTHER_NEUTRAL;
            }
            break;
        case 'P':
            if(start[1]=='D') {
                switch(start[2]) {
                case 'F': biDiClass=U_POP_DIRECTIONAL_FORMAT; break;
                case 'I': biDiClass=U_POP_DIRECTIONAL_ISOLATE; break;
                default: break;
                }
            }
            break;
        case 'N':
            if(start[1]=='S' && start[2]=='M') {
                biDiClass=U_DIR_NON_SPACING_MARK;
            }
            break;
        case 'F':
            if(start[1]=='S' && start[2]=='I') {
                biDiClass=U_FIRST_STRONG_ISOLATE;
            }
            break;
        default:
            break;
        }

        // The name must end here: it must be followed by white space, ';'
        // or the end of the string, not by more letters of a longer word.
        int8_t nameLength=biDiClassNameLengths[biDiClass];
        char terminator=start[nameLength];
        UBool properlyTerminated=
            U_IS_INV_WHITESPACE(terminator) || terminator==';' || terminator==0;
        if(biDiClass==U_CHAR_DIRECTION_COUNT || !properlyTerminated) {
            errln("BiDi class string not recognized at %s", start);
            return FALSE;
        }
        inputString.append(charFromBiDiClass[biDiClass]);
        start+=nameLength;
    }
    return TRUE;
}
/*
 * Read a text file line by line, split each data line into delimiter-separated
 * fields, and call lineFn once per data line.
 *
 * @param filename   File to read; NULL, "" or "-" means standard input.
 * @param delimiter  Field separator character.
 * @param fields     Caller-provided array with fieldCount entries; for each
 *                   field, [0] is set to its start and [1] to its limit
 *                   (pointing at the delimiter or the terminating NUL).
 *                   The pointers refer to a local line buffer and are only
 *                   valid during the lineFn call.
 * @param fieldCount Number of fields expected on each line.
 * @param lineFn     Callback invoked for each data line.
 * @param context    Opaque pointer passed through to lineFn.
 * @param pErrorCode In/out error code. U_ILLEGAL_ARGUMENT_ERROR for bad
 *                   arguments, U_FILE_ACCESS_ERROR if the file cannot be
 *                   opened, U_PARSE_ERROR if a line has too few fields.
 *                   Reading stops as soon as it indicates a failure.
 *
 * Empty lines, white-space-only lines and '#' comments are skipped.
 */
U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char *filename, char delimiter,
                     char *fields[][2], int32_t fieldCount,
                     UParseLineFn *lineFn, void *context,
                     UErrorCode *pErrorCode) {
    FileStream *file;
    char line[300];
    char *fieldStart, *cursor, *comment;
    int32_t fieldIndex, lineLength;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    if(fields==NULL || lineFn==NULL || fieldCount<=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if(filename!=NULL && *filename!=0 && !(*filename=='-' && filename[1]==0)) {
        file=T_FileStream_open(filename, "r");
    } else {
        /* remember that we read from stdin so that we do not close it below */
        filename=NULL;
        file=T_FileStream_stdin();
    }
    if(file==NULL) {
        *pErrorCode=U_FILE_ACCESS_ERROR;
        return;
    }

    while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
        /* chop off trailing CR and LF characters */
        for(lineLength=(int32_t)uprv_strlen(line); lineLength>0; --lineLength) {
            char last=line[lineLength-1];
            if(last!='\r' && last!='\n') {
                break;
            }
            line[lineLength-1]=0;
        }

        /* nothing to do for an empty line or a comment line */
        if(line[0]==0 || line[0]=='#') {
            continue;
        }

        /* cut off an in-line comment together with the white space before it */
        comment=uprv_strchr(line, '#');
        if(comment!=NULL) {
            while(comment>line && (comment[-1]==' ' || comment[-1]=='\t')) {
                --comment;
            }
            *comment=0;
        }

        /* nothing to do if only white space is left */
        if(*u_skipWhitespace(line)==0) {
            continue;
        }

        /* record start and limit of each field */
        fieldStart=line;
        for(fieldIndex=0; fieldIndex<fieldCount; ++fieldIndex) {
            cursor=fieldStart;
            while(*cursor!=0 && *cursor!=delimiter) {
                ++cursor;
            }
            fields[fieldIndex][0]=fieldStart;
            fields[fieldIndex][1]=cursor;

            if(*cursor!=0) {
                /* the next field begins after the delimiter */
                fieldStart=cursor+1;
            } else {
                fieldStart=cursor;
                if(fieldIndex+1<fieldCount) {
                    /* the line ended before all fields were found */
                    *pErrorCode=U_PARSE_ERROR;
                    break;
                }
            }
        }

        /* too few fields on this line? */
        if(U_FAILURE(*pErrorCode)) {
            break;
        }

        /* hand the fields to the caller */
        lineFn(context, fields, fieldCount, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            break;
        }
    }

    if(filename!=NULL) {
        T_FileStream_close(file);
    }
}
/*
 * Parse one state table row and fill in the 256 entries of state[].
 *
 * state table row grammar (ebnf-style):
 * (whitespace is allowed between all tokens)
 *
 * row=[[firstentry ','] entry (',' entry)*]
 * firstentry="initial" | "surrogates"
 *            (initial state (default for state 0), output is all surrogate pairs)
 * entry=range [':' nextstate] ['.' action]
 * range=number ['-' number]
 * nextstate=number
 *           (0..7f)
 * action='u' | 's' | 'p' | 'i'
 *        (unassigned, state change only, surrogate pair, illegal)
 * number=(1- or 2-digit hexadecimal number)
 *
 * @param s      The row string to parse.
 * @param state  Output: first set to all-illegal entries, then overwritten
 *               for each parsed byte range.
 * @param pFlags In/out state flags: set to MBCS_STATE_FLAG_DIRECT for an
 *               "initial" row; set to MBCS_STATE_FLAG_SURROGATES for a
 *               "surrogates" row (only recognized if *pFlags is 0 on entry).
 *               The flags also determine how "final valid" entries are adjusted.
 * @return NULL if the whole row was parsed (or the row is empty);
 *         otherwise a pointer to the position in s where parsing failed.
 */
static const char *
parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
    const char *t;
    uint32_t start, end, i;
    int32_t entry;

    /* initialize the state: all illegal with U+ffff */
    for(i=0; i<256; ++i) {
        state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
    }

    /* skip leading white space */
    s=u_skipWhitespace(s);

    /* is there an "initial" or "surrogates" directive? */
    if(uprv_strncmp("initial", s, 7)==0) {
        *pFlags=MBCS_STATE_FLAG_DIRECT;
        s=u_skipWhitespace(s+7);
        /* the directive must be followed by a comma; on error point at the offending character */
        if(*s++!=',') {
            return s-1;
        }
    } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
        *pFlags=MBCS_STATE_FLAG_SURROGATES;
        s=u_skipWhitespace(s+10);
        /* the directive must be followed by a comma; on error point at the offending character */
        if(*s++!=',') {
            return s-1;
        }
    } else if(*s==0) {
        /* empty state row: all-illegal */
        return NULL;
    }

    for(;;) {
        /* read an entry, the start of the range first */
        s=u_skipWhitespace(s);
        start=uprv_strtoul(s, (char **)&t, 16);
        /* error if no number was parsed or it does not fit into a byte */
        if(s==t || 0xff<start) {
            return s;
        }
        s=u_skipWhitespace(t);

        /* read the end of the range if there is one */
        if(*s=='-') {
            s=u_skipWhitespace(s+1);
            end=uprv_strtoul(s, (char **)&t, 16);
            /* error if no number was parsed, the range is reversed, or the end does not fit into a byte */
            if(s==t || end<start || 0xff<end) {
                return s;
            }
            s=u_skipWhitespace(t);
        } else {
            /* single byte value */
            end=start;
        }

        /* determine the state entry for this range */
        if(*s!=':' && *s!='.') {
            /* the default is: final state with valid entries */
            entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
        } else {
            /* start with a transition entry and modify it according to what follows */
            entry=MBCS_ENTRY_TRANSITION(0, 0);
            if(*s==':') {
                /* get the next state, default to 0 */
                s=u_skipWhitespace(s+1);
                i=uprv_strtoul(s, (char **)&t, 16);
                if(s!=t) {
                    /* a next state number is present; it must not exceed 0x7f */
                    if(0x7f<i) {
                        return s;
                    }
                    s=u_skipWhitespace(t);
                    entry=MBCS_ENTRY_SET_STATE(entry, i);
                }
            }

            /* get the state action, default to valid */
            if(*s=='.') {
                /* this is a final state */
                entry=MBCS_ENTRY_SET_FINAL(entry);

                s=u_skipWhitespace(s+1);
                if(*s=='u') {
                    /* unassigned set U+fffe */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
                    s=u_skipWhitespace(s+1);
                } else if(*s=='p') {
                    /* surrogate pair action, except in a row flagged as "initial" (direct) */
                    if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
                    } else {
                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
                    }
                    s=u_skipWhitespace(s+1);
                } else if(*s=='s') {
                    /* state change only */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
                    s=u_skipWhitespace(s+1);
                } else if(*s=='i') {
                    /* illegal set U+ffff */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
                    s=u_skipWhitespace(s+1);
                } else {
                    /* default to valid; no action letter is consumed */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
                }
            } else {
                /* this is an intermediate state, nothing to do */
            }
        }

        /* adjust "final valid" states according to the state flags */
        if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
            switch(*pFlags) {
            case 0:
                /* no adjustment */
                break;
            case MBCS_STATE_FLAG_DIRECT:
                /* set the valid-direct code point to "unassigned"==0xfffe */
                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
                break;
            case MBCS_STATE_FLAG_SURROGATES:
                /* output is surrogate pairs */
                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
                break;
            default:
                break;
            }
        }

        /* set this entry for the range */
        for(i=start; i<=end; ++i) {
            state[i]=entry;
        }

        /* a comma continues with the next entry; anything else except the end of the string is an error */
        if(*s==',') {
            ++s;
        } else {
            return *s==0 ? NULL : s;
        }
    }
}
/*
 * Parse one line from the header section of a .ucm file.
 *
 * The line is modified in place: a comment, trailing CR/LF and trailing
 * white space are cut off, and the key and value are NUL-terminated.
 *
 * @param ucm    The file object whose states and baseName are updated from
 *               recognized header fields.
 * @param line   The modifiable, NUL-terminated line.
 * @param pKey   Receives a pointer to the key name (without the <>).
 * @param pValue Receives a pointer to the value (without enclosing quotes).
 * @return TRUE if the line was empty or contained a header field that was
 *         recognized and processed here (for an empty line, *pKey and *pValue
 *         are not set);
 *         FALSE if the line starts the "CHARMAP" section (*pKey and *pValue
 *         not set) or if the key is not one of the fields handled here
 *         (*pKey and *pValue are set).
 *
 * Syntax and consistency errors are printed to stderr and terminate the
 * program with exit(U_INVALID_TABLE_FORMAT).
 */
U_CAPI UBool U_EXPORT2
ucm_parseHeaderLine(UCMFile *ucm,
                    char *line, char **pKey, char **pValue) {
    UCMStates *states;
    char *s, *end;
    char c;

    states=&ucm->states;

    /* remove comments and trailing CR and LF and remove whitespace from the end */
    for(end=line; (c=*end)!=0; ++end) {
        if(c=='#' || c=='\r' || c=='\n') {
            break;
        }
    }
    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
        --end;
    }
    *end=0;

    /* skip leading white space and ignore empty lines */
    s=(char *)u_skipWhitespace(line);
    if(*s==0) {
        return TRUE;
    }

    /*
     * stop at the beginning of the mapping section
     * (strncmp rather than memcmp so that we never read beyond the
     * terminating NUL of a line that is shorter than "CHARMAP")
     */
    if(uprv_strncmp(s, "CHARMAP", 7)==0) {
        return FALSE;
    }

    /* get the key name, bracketed in <> */
    if(*s!='<') {
        fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
        exit(U_INVALID_TABLE_FORMAT);
    }
    *pKey=++s;
    while(*s!='>') {
        if(*s==0) {
            fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
            exit(U_INVALID_TABLE_FORMAT);
        }
        ++s;
    }
    *s=0;

    /* get the value string, possibly quoted */
    s=(char *)u_skipWhitespace(s+1);
    if(*s!='"') {
        *pValue=s;
    } else {
        /* remove the quotes */
        *pValue=s+1;
        if(end>*pValue && *(end-1)=='"') {
            *--end=0;
        }
    }

    /* collect the information from the header field, ignore unknown keys */
    if(uprv_strcmp(*pKey, "uconv_class")==0) {
        if(uprv_strcmp(*pValue, "DBCS")==0) {
            states->conversionType=UCNV_DBCS;
        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
            states->conversionType = UCNV_SBCS;
        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
            states->conversionType = UCNV_MBCS;
        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
            states->conversionType = UCNV_EBCDIC_STATEFUL;
        } else {
            fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
            exit(U_INVALID_TABLE_FORMAT);
        }
        return TRUE;
    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
        /* exactly one digit 1..4 */
        c=**pValue;
        if('1'<=c && c<='4' && (*pValue)[1]==0) {
            states->maxCharLength=(int8_t)(c-'0');
            states->outputType=(int8_t)(states->maxCharLength-1);
        } else {
            fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
            exit(U_INVALID_TABLE_FORMAT);
        }
        return TRUE;
    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
        /* exactly one digit 1..4 */
        c=**pValue;
        if('1'<=c && c<='4' && (*pValue)[1]==0) {
            states->minCharLength=(int8_t)(c-'0');
        } else {
            fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
            exit(U_INVALID_TABLE_FORMAT);
        }
        return TRUE;
    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
        switch(states->conversionType) {
        case UCNV_SBCS:
        case UCNV_DBCS:
        case UCNV_EBCDIC_STATEFUL:
            states->conversionType=UCNV_MBCS;
            break;
        case UCNV_MBCS:
            break;
        default:
            fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
            exit(U_INVALID_TABLE_FORMAT);
        }

        if(states->maxCharLength==0) {
            fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
            exit(U_INVALID_TABLE_FORMAT);
        }
        ucm_addState(states, *pValue);
        return TRUE;
    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
        if(**pValue==0) {
            fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
            exit(U_INVALID_TABLE_FORMAT);
        }
        /* NOTE(review): unbounded copy; confirm that baseName is large enough for any value on a line */
        uprv_strcpy(ucm->baseName, *pValue);
        return TRUE;
    }

    return FALSE;
}
/*
 * Parse a mapping line; must not be empty.
 *
 * The line has one or more Unicode code points written as <UXXXX>
 * (optionally joined with '+'), then the bytes (parsed by ucm_parseBytes()),
 * then optionally a fallback indicator |0..|4 somewhere in the rest of the line.
 *
 * @param m          Receives uLen, bLen and f; also u if there is exactly one
 *                   code point, and b.bytes if there are at most 4 bytes.
 * @param codePoints Receives the parsed code points.
 * @param bytes      Receives the parsed bytes.
 * @param line       The NUL-terminated mapping line.
 * @return TRUE on success; FALSE after printing a message to stderr
 *         if the line is malformed.
 */
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
                     const char *line) {
    const char *pos=line;
    char *numberLimit;
    UChar32 cp;
    int32_t utf16Length;
    int8_t cpCount=0, byteCount=0, fallback;
    UBool wellFormed;

    /* collect the code points */
    for(;;) {
        /* code points after the first one may be joined with a plus sign */
        if(cpCount>0 && *pos=='+') {
            ++pos;
        }
        if(*pos!='<') {
            break;
        }

        /* expect 'U', at least one hex digit, and the closing '>' */
        wellFormed=FALSE;
        if(pos[1]=='U') {
            cp=(UChar32)uprv_strtoul(pos+2, &numberLimit, 16);
            wellFormed= numberLimit!=pos+2 && *numberLimit=='>';
        }
        if(!wellFormed) {
            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
            return FALSE;
        }

        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
            return FALSE;
        }

        if(cpCount==UCNV_EXT_MAX_UCHARS) {
            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
            return FALSE;
        }
        codePoints[cpCount++]=cp;
        pos=numberLimit+1;
    }

    switch(cpCount) {
    case 0:
        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
        return FALSE;
    case 1:
        m->u=codePoints[0];
        break;
    default: {
        /* preflight the UTF-16 length of the code point sequence */
        UErrorCode errorCode=U_ZERO_ERROR;
        u_strFromUTF32(NULL, 0, &utf16Length, codePoints, cpCount, &errorCode);
        if( utf16Length>UCNV_EXT_MAX_UCHARS ||
            (errorCode!=U_BUFFER_OVERFLOW_ERROR && U_FAILURE(errorCode))
        ) {
            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
            return FALSE;
        }
        break;
    }
    }

    /* collect the bytes */
    pos=u_skipWhitespace(pos);
    byteCount=ucm_parseBytes(bytes, line, &pos);
    if(byteCount<0) {
        return FALSE;
    }
    if(byteCount==0) {
        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
        return FALSE;
    }
    if(byteCount<=4) {
        uprv_memcpy(m->b.bytes, bytes, byteCount);
    }

    /* skip everything until the fallback indicator, even the start of a comment */
    while(*pos!=0 && *pos!='|') {
        ++pos;
    }
    if(*pos==0) {
        fallback=-1; /* no fallback indicator */
    } else {
        fallback=(int8_t)(pos[1]-'0');
        if((uint8_t)fallback>4) {
            fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
            return FALSE;
        }
    }

    m->uLen=cpCount;
    m->bLen=byteCount;
    m->f=fallback;
    return TRUE;
}
void parseFile(FILE *f, Normalizer2DataBuilder &builder) { IcuToolErrorCode errorCode("gennorm2/parseFile()"); char line[300]; uint32_t startCP, endCP; while(NULL!=fgets(line, (int)sizeof(line), f)) { char *comment=(char *)strchr(line, '#'); if(comment!=NULL) { *comment=0; } u_rtrim(line); if(line[0]==0) { continue; // skip empty and comment-only lines } if(line[0]=='*') { const char *s=u_skipWhitespace(line+1); if(0==strncmp(s, "Unicode", 7)) { s=u_skipWhitespace(s+7); builder.setUnicodeVersion(s); } continue; // reserved syntax } const char *delimiter; int32_t rangeLength= u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); if(errorCode.isFailure()) { fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); exit(errorCode.reset()); } delimiter=u_skipWhitespace(delimiter); if(*delimiter==':') { const char *s=u_skipWhitespace(delimiter+1); char *end; unsigned long value=strtoul(s, &end, 10); if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); exit(U_PARSE_ERROR); } for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { builder.setCC(c, (uint8_t)value); } continue; } if(*delimiter=='-') { if(*u_skipWhitespace(delimiter+1)!=0) { fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); exit(U_PARSE_ERROR); } for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { builder.removeMapping(c); } continue; } if(*delimiter=='=' || *delimiter=='>') { UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode); if(errorCode.isFailure()) { fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); exit(errorCode.reset()); } UnicodeString mapping(FALSE, uchars, length); if(*delimiter=='=') { if(rangeLength!=1) { fprintf(stderr, "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", line); exit(U_PARSE_ERROR); } 
builder.setRoundTripMapping((UChar32)startCP, mapping); } else { for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { builder.setOneWayMapping(c, mapping); } } continue; } fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); exit(U_PARSE_ERROR); } }