Ejemplo n.º 1
0
// copied from genprops.c
static int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *t, *z;
    int32_t i, j;

    s=u_skipWhitespace(s);
    for(i=0; i<countTokens; ++i) {
        t=tokens[i];
        if(t!=NULL) {
            for(j=0;; ++j) {
                if(t[j]!=0) {
                    if(s[j]!=t[j]) {
                        break;
                    }
                } else {
                    z=u_skipWhitespace(s+j);
                    if(*z==';' || *z==0) {
                        return i;
                    } else {
                        break;
                    }
                }
            }
        }
    }
    return -1;
}
Ejemplo n.º 2
0
static void U_CALLCONV
ageLineFn(void *context,
          char *fields[][2], int32_t fieldCount,
          UErrorCode *pErrorCode) {
    char *s, *numberLimit;
    uint32_t value, start, end, version;

    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* ignore "unassigned" (the default is already set to 0.0) */
    s=(char *)u_skipWhitespace(fields[1][0]);
    if(0==uprv_strncmp(s, "unassigned", 10)) {
        return;
    }

    /* parse version number */
    value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
    if(s==numberLimit || value==0 || value>15 || (*numberLimit!='.' && *numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    version=value<<4;

    /* parse minor version number */
    if(*numberLimit=='.') {
        s=(char *)u_skipWhitespace(numberLimit+1);
        value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
        if(s==numberLimit || value>15 || (*numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
            fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        version|=value;
    }

    if(start==0 && end==0x10ffff) {
        /* Also set bits for initialValue and errorValue. */
        end=UPVEC_MAX_CP;
    }
    upvec_setValue(pv, start, end, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
Ejemplo n.º 3
0
/* read a range like start or start..end */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char *s,
                                   uint32_t *pStart, uint32_t *pEnd,
                                   const char **terminator,
                                   UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || pStart==NULL || pEnd==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* read the start code point */
    s=u_skipWhitespace(s);
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if(end<=s || value>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pStart=*pEnd=value;

    /* is there a "..end"? */
    s=u_skipWhitespace(end);
    if(*s!='.' || s[1]!='.') {
        *terminator=end;
        return 1;
    }
    s=u_skipWhitespace(s+2);

    /* read the end code point */
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if(end<=s || value>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pEnd=value;

    /* is this a valid range? */
    if(value<*pStart) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    *terminator=end;
    return value-*pStart+1;
}
Ejemplo n.º 4
0
/*
 * If the string starts with # @missing: then return the pointer to the
 * following non-whitespace character.
 * Otherwise return the original pointer.
 * Unicode 5.0 adds such lines in some data files to document
 * default property values.
 * Poor man's regex for variable amounts of white space.
 */
static const char *
getMissingLimit(const char *s) {
    const char *s0=s;
    if(
        *(s=u_skipWhitespace(s))=='#' &&
        *(s=u_skipWhitespace(s+1))=='@' &&
        0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) &&
        *(s=u_skipWhitespace(s+7))==':'
    ) {
        return u_skipWhitespace(s+1);
    } else {
        return s0;
    }
}
Ejemplo n.º 5
0
static void U_CALLCONV
specialCasingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    char *end;

    /* get code point */
    specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* is this a complex mapping? */
    if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
        /* there is some condition text in the fifth field */
        specialCasings[specialCasingCount].isComplex=TRUE;

        /* do not store any actual mappings for this */
        specialCasings[specialCasingCount].lowerCase[0]=0;
        specialCasings[specialCasingCount].upperCase[0]=0;
        specialCasings[specialCasingCount].titleCase[0]=0;
    } else {
        /* just set the "complex" flag and get the case mappings */
        specialCasings[specialCasingCount].isComplex=FALSE;
        specialCasings[specialCasingCount].lowerCase[0]=
            (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].upperCase[0]=
            (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode);
        specialCasings[specialCasingCount].titleCase[0]=
            (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }

        uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
        _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
    }

    if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
        fprintf(stderr, "gencase: too many special casing mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
Ejemplo n.º 6
0
/*
 * parse a list of code points
 * store them as a string in dest[destCapacity]
 * set the first code point in *pFirst
 * @return The length of the string in numbers of UChars.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;
    int32_t destLength;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if(pFirst!=NULL) {
        *pFirst=0xffffffff;
    }

    destLength=0;
    for(;;) {
        s=u_skipWhitespace(s);
        if(*s==';' || *s==0) {
            if(destLength<destCapacity) {
                dest[destLength]=0;
            } else if(destLength==destCapacity) {
                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
            } else {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
            return destLength;
        }

        /* read one code point */
        value=(uint32_t)uprv_strtoul(s, &end, 16);
        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }

        /* store the first code point */
        if(pFirst!=NULL) {
            *pFirst=value;
            pFirst=NULL;
        }

        /* append it to the destination array */
        if((destLength+U16_LENGTH(value))<=destCapacity) {
            U16_APPEND_UNSAFE(dest, destLength, value);
        } else {
            destLength+=U16_LENGTH(value);
        }

        /* go to the following characters */
        s=end;
    }
}
Ejemplo n.º 7
0
U_CDECL_BEGIN

static void U_CALLCONV
strprepProfileLineFn(void * /*context*/,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end, *map;
    uint32_t code;
    int32_t length;
   /*UBool* mapWithNorm = (UBool*) context;*/
    const char* typeName;
    uint32_t rangeStart=0,rangeEnd =0;
    const char *s;

    s = u_skipWhitespace(fields[0][0]);
    if (*s == '@') {
        /* a special directive introduced in 4.2 */
        return;
    }

    if(fieldCount != 3){
        *pErrorCode = U_INVALID_FORMAT_ERROR;
        return;
    }

    typeName = fields[2][0];
    map = fields[1][0];
   
    if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){

        u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);

        /* store the range */
        compareFlagsForRange(rangeStart,rangeEnd,USPREP_UNASSIGNED);

    }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){

        u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);

        /* store the range */
        compareFlagsForRange(rangeStart,rangeEnd,USPREP_PROHIBITED);

    }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){
        /* get the character code, field 0 */
        code=(uint32_t)uprv_strtoul(s, &end, 16);

        /* parse the mapping string */
        length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode);
        
        /* store the mapping */
        compareMapping(code,mapping, length,USPREP_MAP);

    }else{
        *pErrorCode = U_INVALID_FORMAT_ERROR;
    }

}
Ejemplo n.º 8
0
static char *
trimTerminateField(char *s, char *limit) {
    /* trim leading whitespace */
    s=(char *)u_skipWhitespace(s);

    /* trim trailing whitespace */
    while(s<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
        --limit;
    }
    *limit=0;

    return s;
}
Ejemplo n.º 9
0
/* TODO: move to toolutil */
static UBool
isToken(const char *token, const char *s) {
    const char *z;
    int32_t j;

    s=u_skipWhitespace(s);
    for(j=0;; ++j) {
        if(token[j]!=0) {
            if(s[j]!=token[j]) {
                break;
            }
        } else {
            z=u_skipWhitespace(s+j);
            if(*z==';' || *z==0) {
                return TRUE;
            } else {
                break;
            }
        }
    }

    return FALSE;
}
Ejemplo n.º 10
0
UBool BiDiConformanceTest::parseOrdering(const char *start) {
    orderingCount=0;
    while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
        char *end;
        uint32_t value=(uint32_t)strtoul(start, &end, 10);
        if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>=1000) {
            errln("@Reorder: parse error at %s", start);
            return FALSE;
        }
        ordering[orderingCount++]=(int32_t)value;
        start=end;
    }
    return TRUE;
}
Ejemplo n.º 11
0
/* get a name, strip leading and trailing whitespace */
static int16_t
getName(char **pStart, char *limit) {
    /* strip leading whitespace */
    char *start=(char *)u_skipWhitespace(*pStart);

    /* strip trailing whitespace */
    while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
        --limit;
    }

    /* return results */
    *pStart=start;
    return (int16_t)(limit-start);
}
Ejemplo n.º 12
0
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin;
    char *s;
    uint32_t start, end, uv;
    int32_t i;

    bin=(const Binaries *)context;

    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }

    /* parse binary property name */
    s=(char *)u_skipWhitespace(fields[1][0]);
    for(i=0;; ++i) {
        if(i==bin->binariesCount) {
            /* ignore unrecognized properties */
            if(beVerbose) {
                addIgnoredProp(s, fields[1][1]);
            }
            return;
        }
        if(isToken(bin->binaries[i].propName, s)) {
            break;
        }
    }

    if(bin->binaries[i].vecShift>=32) {
        fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
                        (int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
    uv=U_MASK(bin->binaries[i].vecShift);

    if(start==0 && end==0x10ffff) {
        /* Also set bits for initialValue and errorValue. */
        end=UPVEC_MAX_CP;
    }
    upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set %s code: %s\n",
                        bin->binaries[i].propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
Ejemplo n.º 13
0
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode) {
    const char *terminator;
    int32_t rangeLength=
        u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
    if(U_SUCCESS(*pErrorCode)) {
        terminator=u_skipWhitespace(terminator);
        if(*terminator!=';' && *terminator!=0) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }
    }
    return rangeLength;
}
Ejemplo n.º 14
0
U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
  UCMapping m={ 0, {0}, 0, 0, 0, 0 };
    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
    uint8_t bytes[UCNV_EXT_MAX_BYTES];

    const char *s;

    /* ignore empty and comment lines */
    if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
        return TRUE;
    }

    return
        ucm_parseMappingLine(&m, codePoints, bytes, line) &&
        ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
}
Ejemplo n.º 15
0
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *bin;
    char *s;
    uint32_t start, end;
    int32_t i;

    bin=(const Binaries *)context;

    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genbidi: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }

    /* parse binary property name */
    s=(char *)u_skipWhitespace(fields[1][0]);
    for(i=0;; ++i) {
        if(i==bin->binariesCount) {
            /* ignore unrecognized properties */
            return;
        }
        if(isToken(bin->binaries[i].propName, s)) {
            break;
        }
    }

    if(bin->binaries[i].vecMask==0) {
        fprintf(stderr, "genbidi error: mask value %d==0 for %s %s\n",
                        (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }

    upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genbidi error: unable to set %s, code: %s\n",
                        bin->binaries[i].propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
Ejemplo n.º 16
0
/*
 * parse a list of code points
 * store them as a UTF-32 string in dest[destCapacity]
 * return the number of code points
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePoints(const char *s,
                  uint32_t *dest, int32_t destCapacity,
                  UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;
    int32_t count;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    count=0;
    for(;;) {
        s=u_skipWhitespace(s);
        if(*s==';' || *s==0) {
            return count;
        }

        /* read one code point */
        value=(uint32_t)uprv_strtoul(s, &end, 16);
        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }

        /* append it to the destination array */
        if(count<destCapacity) {
            dest[count++]=value;
        } else {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        }

        /* go to the following characters */
        s=end;
    }
}
Ejemplo n.º 17
0
UBool BiDiConformanceTest::parseLevels(const char *start) {
    directionBits=0;
    levelsCount=0;
    while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
        if(*start=='x') {
            levels[levelsCount++]=UBIDI_DEFAULT_LTR;
            ++start;
        } else {
            char *end;
            uint32_t value=(uint32_t)strtoul(start, &end, 10);
            if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>(UBIDI_MAX_EXPLICIT_LEVEL+1)) {
                errln("@Levels: parse error at %s", start);
                return FALSE;
            }
            levels[levelsCount++]=(UBiDiLevel)value;
            directionBits|=(1<<(value&1));
            start=end;
        }
    }
    return TRUE;
}
Ejemplo n.º 18
0
static void U_CALLCONV
derivedNormalizationPropertiesLineFn(void *context,
                                     char *fields[][2], int32_t fieldCount,
                                     UErrorCode *pErrorCode) {
    UChar string[32];
    char *s;
    uint32_t start, end;
    int32_t count;
    uint8_t qcFlags;

    /* get code point range */
    count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* ignore hangul - handle explicitly */
    if(start==0xac00) {
        return;
    }

    /* get property - ignore unrecognized ones */
    s=(char *)u_skipWhitespace(fields[1][0]);
    if(*s=='N' && s[1]=='F') {
        /* quick check flag */
        qcFlags=0x11;
        s+=2;
        if(*s=='K') {
            qcFlags<<=1;
            ++s;
        }

        if(*s=='C' && s[1]=='_') {
            s+=2;
        } else if(*s=='D' && s[1]=='_') {
            qcFlags<<=2;
            s+=2;
        } else {
            return;
        }

        if(0==uprv_memcmp(s, "NO", 2)) {
            qcFlags&=0xf;
        } else if(0==uprv_memcmp(s, "MAYBE", 5)) {
            qcFlags&=0x30;
        } else if(0==uprv_memcmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') {
            /*
             * Unicode 4.0.1:
             * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc.
             */
            /* start of the field */
            s=(char *)u_skipWhitespace(s+1);
            if(*s=='N') {
                qcFlags&=0xf;
            } else if(*s=='M') {
                qcFlags&=0x30;
            } else {
                return; /* do nothing for "Yes" because it's the default value */
            }
        } else {
            return; /* do nothing for "Yes" because it's the default value */
        }

        /* set this flag for all code points in this range */
        while(start<=end) {
            setQCFlags(start++, qcFlags);
        }
    } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
        /* full composition exclusion */
        while(start<=end) {
            setCompositionExclusion(start++);
        }
    } else if(
        ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') || 
        (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';'))
        
    ) {
        /* FC_NFKC_Closure, parse field 2 to get the string */
        char *t;

        /* start of the field */
        s=(char *)u_skipWhitespace(s+1);

        /* find the end of the field */
        for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {}
        *t=0;

        string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }
        while(start<=end) {
            setFNC(start++, string);
        }
    }
}
Ejemplo n.º 19
0
U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char *filename, char delimiter,
                     char *fields[][2], int32_t fieldCount,
                     UParseLineFn *lineFn, void *context,
                     UErrorCode *pErrorCode) {
    FileStream *file;
    char line[300];
    char *start, *limit;
    int32_t i, length;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    if(fields==NULL || lineFn==NULL || fieldCount<=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
        filename=NULL;
        file=T_FileStream_stdin();
    } else {
        file=T_FileStream_open(filename, "r");
    }
    if(file==NULL) {
        *pErrorCode=U_FILE_ACCESS_ERROR;
        return;
    }

    while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
        /* remove trailing newline characters */
        length=(int32_t)(u_rtrim(line)-line);

        /*
         * detect a line with # @missing:
         * start parsing after that, or else from the beginning of the line
         * set the default warning for @missing lines
         */
        start=(char *)getMissingLimit(line);
        if(start==line) {
            *pErrorCode=U_ZERO_ERROR;
        } else {
            *pErrorCode=U_USING_DEFAULT_WARNING;
        }

        /* skip this line if it is empty or a comment */
        if(*start==0 || *start=='#') {
            continue;
        }

        /* remove in-line comments */
        limit=uprv_strchr(start, '#');
        if(limit!=NULL) {
            /* get white space before the pound sign */
            while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
                --limit;
            }

            /* truncate the line */
            *limit=0;
        }

        /* skip lines with only whitespace */
        if(u_skipWhitespace(start)[0]==0) {
            continue;
        }

        /* for each field, call the corresponding field function */
        for(i=0; i<fieldCount; ++i) {
            /* set the limit pointer of this field */
            limit=start;
            while(*limit!=delimiter && *limit!=0) {
                ++limit;
            }

            /* set the field start and limit in the fields array */
            fields[i][0]=start;
            fields[i][1]=limit;

            /* set start to the beginning of the next field, if any */
            start=limit;
            if(*start!=0) {
                ++start;
            } else if(i+1<fieldCount) {
                *pErrorCode=U_PARSE_ERROR;
                limit=line+length;
                i=fieldCount;
                break;
            }
        }

        /* error in a field function? */
        if(U_FAILURE(*pErrorCode)) {
            break;
        }

        /* call the field function */
        lineFn(context, fields, fieldCount, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            break;
        }
    }

    if(filename!=NULL) {
        T_FileStream_close(file);
    }
}
Ejemplo n.º 20
0
static void U_CALLCONV
caseFoldingLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    char *end;
    static UChar32 prevCode=0;
    int32_t count;
    char status;

    /* get code point */
    caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get the status of this mapping */
    caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
    if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
        fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
    if(status=='L') {
        return;
    }

    /* get the mapping */
    count=caseFoldings[caseFoldingCount].full[0]=
        (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* there is a simple mapping only if there is exactly one code point (count is in UChars) */
    if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
        caseFoldings[caseFoldingCount].simple=0;
    }

    /* update the case-sensitive set */
    if(status!='T') {
        uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
        _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
    }

    /* check the status */
    if(status=='S') {
        /* check if there was a full mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='F'
        ) {
            /* merge the two entries */
            caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
            return;
        }
    } else if(status=='F') {
        /* check if there was a simple mapping for this code point before */
        if( caseFoldingCount>0 &&
            caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
            caseFoldings[caseFoldingCount-1].status=='S'
        ) {
            /* merge the two entries */
            uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
            return;
        }
    } else if(status=='I' || status=='T') {
        /* check if there was a default mapping for this code point before (remove it) */
        while(caseFoldingCount>0 &&
              caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
        ) {
            prevCode=0;
            --caseFoldingCount;
        }
        /* store only a marker for special handling for cases like dotless i */
        caseFoldings[caseFoldingCount].simple=0;
        caseFoldings[caseFoldingCount].full[0]=0;
    }

    /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
    if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
        fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)caseFoldings[caseFoldingCount].code,
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=caseFoldings[caseFoldingCount].code;

    if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
        fprintf(stderr, "gencase: too many case folding mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
Ejemplo n.º 21
0
/* return TRUE if a base table was read, FALSE for an extension table */
static UBool
readFile(ConvData *data, const char* converterName,
         UErrorCode *pErrorCode) {
    char line[1024];
    char *end;
    FileStream *convFile;

    UCMStates *baseStates;
    UBool dataIsBase;

    if(U_FAILURE(*pErrorCode)) {
        return FALSE;
    }

    data->ucm=ucm_open();

    convFile=T_FileStream_open(converterName, "r");
    if(convFile==NULL) {
        *pErrorCode=U_FILE_ACCESS_ERROR;
        return FALSE;
    }

    readHeader(data, convFile, converterName, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return FALSE;
    }

    if(data->ucm->baseName[0]==0) {
        dataIsBase=TRUE;
        baseStates=&data->ucm->states;
        ucm_processStates(baseStates, IGNORE_SISO_CHECK);
    } else {
        dataIsBase=FALSE;
        baseStates=NULL;
    }

    /* read the base table */
    ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return FALSE;
    }

    /* read an extension table if there is one */
    while(T_FileStream_readLine(convFile, line, sizeof(line))) {
        end=uprv_strchr(line, 0);
        while(line<end &&
              (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
            --end;
        }
        *end=0;

        if(line[0]=='#' || u_skipWhitespace(line)==end) {
            continue; /* ignore empty and comment lines */
        }

        if(0==uprv_strcmp(line, "CHARMAP")) {
            /* read the extension table */
            ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
        } else {
            fprintf(stderr, "unexpected text after the base mapping table\n");
        }
        break;
    }

    T_FileStream_close(convFile);

    if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
        fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
        *pErrorCode=U_INVALID_TABLE_FORMAT;
    }

    return dataIsBase;
}
Ejemplo n.º 22
0
static void U_CALLCONV
strprepProfileLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end, *map;
    uint32_t code;
    int32_t length;
   /*UBool* mapWithNorm = (UBool*) context;*/
    const char* typeName;
    uint32_t rangeStart=0,rangeEnd =0;
    const char* filename = (const char*) context;
    const char *s;

    s = u_skipWhitespace(fields[0][0]);
    if (*s == '@') {
        /* special directive */
        s++;
        length = fields[0][1] - s;
        if (length >= NORMALIZE_DIRECTIVE_LEN
            && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) {
            options[NORMALIZE].doesOccur = TRUE;
            return;
        }
        else if (length >= CHECK_BIDI_DIRECTIVE_LEN
            && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) {
            options[CHECK_BIDI].doesOccur = TRUE;
            return;
        }
        else {
            fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]);
        }
    }

    typeName = fields[2][0];
    map = fields[1][0];

    if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){

        u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
        if(U_FAILURE(*pErrorCode)){
            fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
            return;
        }

        /* store the range */
        storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode);

    }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){

        u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode);
        if(U_FAILURE(*pErrorCode)){
            fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
            return;
        }

        /* store the range */
        storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode);

    }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){

        /* get the character code, field 0 */
        code=(uint32_t)uprv_strtoul(s, &end, 16);
        if(end<=s || end!=fields[0][1]) {
            fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }

        /* parse the mapping string */
        length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode);
        
        /* store the mapping */
        storeMapping(code,mapping, length,USPREP_MAP, pErrorCode);

    }else{
        *pErrorCode = U_INVALID_FORMAT_ERROR;
    }
    
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gensprep error parsing  %s line %s at %s. Error: %s\n",filename,
               fields[0][0],fields[2][0],u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

}
Ejemplo n.º 23
0
/* read a range like start or start..end */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode) {
    char *end;
    uint32_t value;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || pStart==NULL || pEnd==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    }

    s=u_skipWhitespace(s);
    if(*s==';' || *s==0) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    /* read the start code point */
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pStart=*pEnd=value;

    /* is there a "..end"? */
    s=u_skipWhitespace(end);
    if(*s==';' || *s==0) {
        return 1;
    }

    if(*s!='.' || s[1]!='.') {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    s+=2;

    /* read the end code point */
    value=(uint32_t)uprv_strtoul(s, &end, 16);
    if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
    *pEnd=value;

    /* is this a valid range? */
    if(value<*pStart) {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }

    /* no garbage after that? */
    s=u_skipWhitespace(end);
    if(*s==';' || *s==0) {
        return value-*pStart+1;
    } else {
        *pErrorCode=U_PARSE_ERROR;
        return 0;
    }
}
Ejemplo n.º 24
0
void BiDiConformanceTest::TestBidiTest() {
    if(isICUVersionBefore(52, 1)) {
        // TODO: Update the ICU BiDi code to implement the additions in the Unicode 6.3 BiDi Algorithm,
        // and reenable the BiDi conformance test.
        return;
    }
    IcuTestErrorCode errorCode(*this, "TestBidiTest");
    const char *sourceTestDataPath=getSourceTestData(errorCode);
    if(errorCode.logIfFailureAndReset("unable to find the source/test/testdata "
                                      "folder (getSourceTestData())")) {
        return;
    }
    char bidiTestPath[400];
    strcpy(bidiTestPath, sourceTestDataPath);
    strcat(bidiTestPath, "BidiTest.txt");
    LocalStdioFilePointer bidiTestFile(fopen(bidiTestPath, "r"));
    if(bidiTestFile.isNull()) {
        errln("unable to open %s", bidiTestPath);
        return;
    }
    LocalUBiDiPointer ubidi(ubidi_open());
    ubidi_setClassCallback(ubidi.getAlias(), biDiConfUBiDiClassCallback, NULL,
                           NULL, NULL, errorCode);
    if(errorCode.logIfFailureAndReset("ubidi_setClassCallback()")) {
        return;
    }
    lineNumber=0;
    levelsCount=0;
    orderingCount=0;
    errorCount=0;
    while(errorCount<10 && fgets(line, (int)sizeof(line), bidiTestFile.getAlias())!=NULL) {
        ++lineNumber;
        // Remove trailing comments and whitespace.
        char *commentStart=strchr(line, '#');
        if(commentStart!=NULL) {
            *commentStart=0;
        }
        u_rtrim(line);
        const char *start=u_skipWhitespace(line);
        if(*start==0) {
            continue;  // Skip empty and comment-only lines.
        }
        if(*start=='@') {
            ++start;
            if(0==strncmp(start, "Levels:", 7)) {
                if(!parseLevels(start+7)) {
                    return;
                }
            } else if(0==strncmp(start, "Reorder:", 8)) {
                if(!parseOrdering(start+8)) {
                    return;
                }
            }
            // Skip unknown @Xyz: ...
        } else {
            if(!parseInputStringFromBiDiClasses(start)) {
                return;
            }
            start=u_skipWhitespace(start);
            if(*start!=';') {
                errln("missing ; separator on input line %s", line);
                return;
            }
            start=u_skipWhitespace(start+1);
            char *end;
            uint32_t bitset=(uint32_t)strtoul(start, &end, 16);
            if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0)) {
                errln("input bitset parse error at %s", start);
                return;
            }
            // Loop over the bitset.
            static const UBiDiLevel paraLevels[]={ UBIDI_DEFAULT_LTR, 0, 1, UBIDI_DEFAULT_RTL };
            static const char *const paraLevelNames[]={ "auto/LTR", "LTR", "RTL", "auto/RTL" };
            for(int i=0; i<=3; ++i) {
                if(bitset&(1<<i)) {
                    ubidi_setPara(ubidi.getAlias(), inputString.getBuffer(), inputString.length(),
                                  paraLevels[i], NULL, errorCode);
                    const UBiDiLevel *actualLevels=ubidi_getLevels(ubidi.getAlias(), errorCode);
                    if(errorCode.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) {
                        errln("Input line %d: %s", (int)lineNumber, line);
                        return;
                    }
                    if(!checkLevels(actualLevels, ubidi_getProcessedLength(ubidi.getAlias()),
                                    paraLevelNames[i])) {
                        // continue outerLoop;  does not exist in C++
                        // so just break out of the inner loop.
                        break;
                    }
                    if(!checkOrdering(ubidi.getAlias(), paraLevelNames[i])) {
                        // continue outerLoop;  does not exist in C++
                        // so just break out of the inner loop.
                        break;
                    }
                }
            }
        }
    }
}
Ejemplo n.º 25
0
UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) {
    inputString.remove();
    /*
     * Lengthy but fast BiDi class parser.
     * A simple parser could terminate or extract the name string and use
     *   int32_t biDiClassInt=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, bidiClassString);
     * but that makes this test take significantly more time.
     */
    while(*start!=0 && *(start=u_skipWhitespace(start))!=0 && *start!=';') {
        UCharDirection biDiClass=U_CHAR_DIRECTION_COUNT;
        // Compare each character once until we have a match on
        // a complete, short BiDi class name.
        if(start[0]=='L') {
            if(start[1]=='R') {
                if(start[2]=='E') {
                    biDiClass=U_LEFT_TO_RIGHT_EMBEDDING;
                } else if(start[2]=='I') {
                    biDiClass=U_LEFT_TO_RIGHT_ISOLATE;
                } else if(start[2]=='O') {
                    biDiClass=U_LEFT_TO_RIGHT_OVERRIDE;
                }
            } else {
                biDiClass=U_LEFT_TO_RIGHT;
            }
        } else if(start[0]=='R') {
            if(start[1]=='L') {
                if(start[2]=='E') {
                    biDiClass=U_RIGHT_TO_LEFT_EMBEDDING;
                } else if(start[2]=='I') {
                    biDiClass=U_RIGHT_TO_LEFT_ISOLATE;
                } else if(start[2]=='O') {
                    biDiClass=U_RIGHT_TO_LEFT_OVERRIDE;
                }
            } else {
                biDiClass=U_RIGHT_TO_LEFT;
            }
        } else if(start[0]=='E') {
            if(start[1]=='N') {
                biDiClass=U_EUROPEAN_NUMBER;
            } else if(start[1]=='S') {
                biDiClass=U_EUROPEAN_NUMBER_SEPARATOR;
            } else if(start[1]=='T') {
                biDiClass=U_EUROPEAN_NUMBER_TERMINATOR;
            }
        } else if(start[0]=='A') {
            if(start[1]=='L') {
                biDiClass=U_RIGHT_TO_LEFT_ARABIC;
            } else if(start[1]=='N') {
                biDiClass=U_ARABIC_NUMBER;
            }
        } else if(start[0]=='C' && start[1]=='S') {
            biDiClass=U_COMMON_NUMBER_SEPARATOR;
        } else if(start[0]=='B') {
            if(start[1]=='N') {
                biDiClass=U_BOUNDARY_NEUTRAL;
            } else {
                biDiClass=U_BLOCK_SEPARATOR;
            }
        } else if(start[0]=='S') {
            biDiClass=U_SEGMENT_SEPARATOR;
        } else if(start[0]=='W' && start[1]=='S') {
            biDiClass=U_WHITE_SPACE_NEUTRAL;
        } else if(start[0]=='O' && start[1]=='N') {
            biDiClass=U_OTHER_NEUTRAL;
        } else if(start[0]=='P' && start[1]=='D') {
            if(start[2]=='F') {
                biDiClass=U_POP_DIRECTIONAL_FORMAT;
            } else if(start[2]=='I') {
                biDiClass=U_POP_DIRECTIONAL_ISOLATE;
            }
        } else if(start[0]=='N' && start[1]=='S' && start[2]=='M') {
            biDiClass=U_DIR_NON_SPACING_MARK;
        } else if(start[0]=='F' && start[1]=='S' && start[2]=='I') {
            biDiClass=U_FIRST_STRONG_ISOLATE;
        }
        // Now we verify that the class name is terminated properly,
        // and not just the start of a longer word.
        int8_t biDiClassNameLength=biDiClassNameLengths[biDiClass];
        char c=start[biDiClassNameLength];
        if(biDiClass==U_CHAR_DIRECTION_COUNT || (!U_IS_INV_WHITESPACE(c) && c!=';' && c!=0)) {
            errln("BiDi class string not recognized at %s", start);
            return FALSE;
        }
        inputString.append(charFromBiDiClass[biDiClass]);
        start+=biDiClassNameLength;
    }
    return TRUE;
}
Ejemplo n.º 26
0
U_CAPI void U_EXPORT2
u_parseDelimitedFile(const char *filename, char delimiter,
                     char *fields[][2], int32_t fieldCount,
                     UParseLineFn *lineFn, void *context,
                     UErrorCode *pErrorCode) {
    FileStream *file;
    char line[300];
    char *start, *limit;
    int32_t i, length;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    if(fields==NULL || lineFn==NULL || fieldCount<=0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
        filename=NULL;
        file=T_FileStream_stdin();
    } else {
        file=T_FileStream_open(filename, "r");
    }
    if(file==NULL) {
        *pErrorCode=U_FILE_ACCESS_ERROR;
        return;
    }

    while(T_FileStream_readLine(file, line, sizeof(line))!=NULL) {
        length=(int32_t)uprv_strlen(line);

        /* remove trailing newline characters */
        while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
            line[--length]=0;
        }

        /* skip this line if it is empty or a comment */
        if(line[0]==0 || line[0]=='#') {
            continue;
        }

        /* remove in-line comments */
        limit=uprv_strchr(line, '#');
        if(limit!=NULL) {
            /* get white space before the pound sign */
            while(limit>line && (*(limit-1)==' ' || *(limit-1)=='\t')) {
                --limit;
            }

            /* truncate the line */
            *limit=0;
        }

        /* skip lines with only whitespace */
        if(u_skipWhitespace(line)[0]==0) {
            continue;
        }

        /* for each field, call the corresponding field function */
        start=line;
        for(i=0; i<fieldCount; ++i) {
            /* set the limit pointer of this field */
            limit=start;
            while(*limit!=delimiter && *limit!=0) {
                ++limit;
            }

            /* set the field start and limit in the fields array */
            fields[i][0]=start;
            fields[i][1]=limit;

            /* set start to the beginning of the next field, if any */
            start=limit;
            if(*start!=0) {
                ++start;
            } else if(i+1<fieldCount) {
                *pErrorCode=U_PARSE_ERROR;
                limit=line+length;
                i=fieldCount;
                break;
            }
        }

        /* error in a field function? */
        if(U_FAILURE(*pErrorCode)) {
            break;
        }

        /* call the field function */
        lineFn(context, fields, fieldCount, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            break;
        }
    }

    if(filename!=NULL) {
        T_FileStream_close(file);
    }
}
Ejemplo n.º 27
0
/*
 * state table row grammar (ebnf-style):
 * (whitespace is allowed between all tokens)
 *
 * row=[[firstentry ','] entry (',' entry)*]
 * firstentry="initial" | "surrogates"
 *            (initial state (default for state 0), output is all surrogate pairs)
 * entry=range [':' nextstate] ['.' action]
 * range=number ['-' number]
 * nextstate=number
 *           (0..7f)
 * action='u' | 's' | 'p' | 'i'
 *        (unassigned, state change only, surrogate pair, illegal)
 * number=(1- or 2-digit hexadecimal number)
 */
static const char *
parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
    const char *t;
    uint32_t start, end, i;
    int32_t entry;

    /* initialize the state: all illegal with U+ffff */
    for(i=0; i<256; ++i) {
        state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
    }

    /* skip leading white space */
    s=u_skipWhitespace(s);

    /* is there an "initial" or "surrogates" directive? */
    if(uprv_strncmp("initial", s, 7)==0) {
        *pFlags=MBCS_STATE_FLAG_DIRECT;
        s=u_skipWhitespace(s+7);
        if(*s++!=',') {
            return s-1;
        }
    } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
        *pFlags=MBCS_STATE_FLAG_SURROGATES;
        s=u_skipWhitespace(s+10);
        if(*s++!=',') {
            return s-1;
        }
    } else if(*s==0) {
        /* empty state row: all-illegal */
        return NULL;
    }

    for(;;) {
        /* read an entry, the start of the range first */
        s=u_skipWhitespace(s);
        start=uprv_strtoul(s, (char **)&t, 16);
        if(s==t || 0xff<start) {
            return s;
        }
        s=u_skipWhitespace(t);

        /* read the end of the range if there is one */
        if(*s=='-') {
            s=u_skipWhitespace(s+1);
            end=uprv_strtoul(s, (char **)&t, 16);
            if(s==t || end<start || 0xff<end) {
                return s;
            }
            s=u_skipWhitespace(t);
        } else {
            end=start;
        }

        /* determine the state entrys for this range */
        if(*s!=':' && *s!='.') {
            /* the default is: final state with valid entries */
            entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
        } else {
            entry=MBCS_ENTRY_TRANSITION(0, 0);
            if(*s==':') {
                /* get the next state, default to 0 */
                s=u_skipWhitespace(s+1);
                i=uprv_strtoul(s, (char **)&t, 16);
                if(s!=t) {
                    if(0x7f<i) {
                        return s;
                    }
                    s=u_skipWhitespace(t);
                    entry=MBCS_ENTRY_SET_STATE(entry, i);
                }
            }

            /* get the state action, default to valid */
            if(*s=='.') {
                /* this is a final state */
                entry=MBCS_ENTRY_SET_FINAL(entry);

                s=u_skipWhitespace(s+1);
                if(*s=='u') {
                    /* unassigned set U+fffe */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
                    s=u_skipWhitespace(s+1);
                } else if(*s=='p') {
                    if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
                    } else {
                        entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
                    }
                    s=u_skipWhitespace(s+1);
                } else if(*s=='s') {
                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
                    s=u_skipWhitespace(s+1);
                } else if(*s=='i') {
                    /* illegal set U+ffff */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
                    s=u_skipWhitespace(s+1);
                } else {
                    /* default to valid */
                    entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
                }
            } else {
                /* this is an intermediate state, nothing to do */
            }
        }

        /* adjust "final valid" states according to the state flags */
        if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
            switch(*pFlags) {
            case 0:
                /* no adjustment */
                break;
            case MBCS_STATE_FLAG_DIRECT:
                /* set the valid-direct code point to "unassigned"==0xfffe */
                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
                break;
            case MBCS_STATE_FLAG_SURROGATES:
                entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
                break;
            default:
                break;
            }
        }

        /* set this entry for the range */
        for(i=start; i<=end; ++i) {
            state[i]=entry;
        }

        if(*s==',') {
            ++s;
        } else {
            return *s==0 ? NULL : s;
        }
    }
}
Ejemplo n.º 28
0
U_CAPI UBool U_EXPORT2
ucm_parseHeaderLine(UCMFile *ucm,
                    char *line, char **pKey, char **pValue) {
    UCMStates *states;
    char *s, *end;
    char c;

    states=&ucm->states;

    /* remove comments and trailing CR and LF and remove whitespace from the end */
    for(end=line; (c=*end)!=0; ++end) {
        if(c=='#' || c=='\r' || c=='\n') {
            break;
        }
    }
    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
        --end;
    }
    *end=0;

    /* skip leading white space and ignore empty lines */
    s=(char *)u_skipWhitespace(line);
    if(*s==0) {
        return TRUE;
    }

    /* stop at the beginning of the mapping section */
    if(uprv_memcmp(s, "CHARMAP", 7)==0) {
        return FALSE;
    }

    /* get the key name, bracketed in <> */
    if(*s!='<') {
        fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
        exit(U_INVALID_TABLE_FORMAT);
    }
    *pKey=++s;
    while(*s!='>') {
        if(*s==0) {
            fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
            exit(U_INVALID_TABLE_FORMAT);
        }
        ++s;
    }
    *s=0;

    /* get the value string, possibly quoted */
    s=(char *)u_skipWhitespace(s+1);
    if(*s!='"') {
        *pValue=s;
    } else {
        /* remove the quotes */
        *pValue=s+1;
        if(end>*pValue && *(end-1)=='"') {
            *--end=0;
        }
    }

    /* collect the information from the header field, ignore unknown keys */
    if(uprv_strcmp(*pKey, "uconv_class")==0) {
        if(uprv_strcmp(*pValue, "DBCS")==0) {
            states->conversionType=UCNV_DBCS;
        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
            states->conversionType = UCNV_SBCS;
        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
            states->conversionType = UCNV_MBCS;
        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
            states->conversionType = UCNV_EBCDIC_STATEFUL;
        } else {
            fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
            exit(U_INVALID_TABLE_FORMAT);
        }
        return TRUE;
    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
        c=**pValue;
        if('1'<=c && c<='4' && (*pValue)[1]==0) {
            states->maxCharLength=(int8_t)(c-'0');
            states->outputType=(int8_t)(states->maxCharLength-1);
        } else {
            fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
            exit(U_INVALID_TABLE_FORMAT);
        }
        return TRUE;
    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
        c=**pValue;
        if('1'<=c && c<='4' && (*pValue)[1]==0) {
            states->minCharLength=(int8_t)(c-'0');
        } else {
            fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
            exit(U_INVALID_TABLE_FORMAT);
        }
        return TRUE;
    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
        switch(states->conversionType) {
        case UCNV_SBCS:
        case UCNV_DBCS:
        case UCNV_EBCDIC_STATEFUL:
            states->conversionType=UCNV_MBCS;
            break;
        case UCNV_MBCS:
            break;
        default:
            fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
            exit(U_INVALID_TABLE_FORMAT);
        }

        if(states->maxCharLength==0) {
            fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
            exit(U_INVALID_TABLE_FORMAT);
        }
        ucm_addState(states, *pValue);
        return TRUE;
    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
        if(**pValue==0) {
            fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
            exit(U_INVALID_TABLE_FORMAT);
        }
        uprv_strcpy(ucm->baseName, *pValue);
        return TRUE;
    }

    return FALSE;
}
Ejemplo n.º 29
0
/* parse a mapping line; must not be empty */
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
                     const char *line) {
    const char *s;
    char *end;
    UChar32 cp;
    int32_t u16Length;
    int8_t uLen, bLen, f;

    s=line;
    uLen=bLen=0;

    /* parse code points */
    for(;;) {
        /* skip an optional plus sign */
        if(uLen>0 && *s=='+') {
            ++s;
        }
        if(*s!='<') {
            break;
        }

        if( s[1]!='U' ||
            (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
            *end!='>'
        ) {
            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
            return FALSE;
        }
        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
            return FALSE;
        }

        if(uLen==UCNV_EXT_MAX_UCHARS) {
            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
            return FALSE;
        }
        codePoints[uLen++]=cp;
        s=end+1;
    }

    if(uLen==0) {
        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
        return FALSE;
    } else if(uLen==1) {
        m->u=codePoints[0];
    } else {
        UErrorCode errorCode=U_ZERO_ERROR;
        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
            u16Length>UCNV_EXT_MAX_UCHARS
        ) {
            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
            return FALSE;
        }
    }

    s=u_skipWhitespace(s);

    /* parse bytes */
    bLen=ucm_parseBytes(bytes, line, &s);

    if(bLen<0) {
        return FALSE;
    } else if(bLen==0) {
        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
        return FALSE;
    } else if(bLen<=4) {
        uprv_memcpy(m->b.bytes, bytes, bLen);
    }

    /* skip everything until the fallback indicator, even the start of a comment */
    for(;;) {
        if(*s==0) {
            f=-1; /* no fallback indicator */
            break;
        } else if(*s=='|') {
            f=(int8_t)(s[1]-'0');
            if((uint8_t)f>4) {
                fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
                return FALSE;
            }
            break;
        }
        ++s;
    }

    m->uLen=uLen;
    m->bLen=bLen;
    m->f=f;
    return TRUE;
}
Ejemplo n.º 30
0
void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
    IcuToolErrorCode errorCode("gennorm2/parseFile()");
    char line[300];
    uint32_t startCP, endCP;
    while(NULL!=fgets(line, (int)sizeof(line), f)) {
        char *comment=(char *)strchr(line, '#');
        if(comment!=NULL) {
            *comment=0;
        }
        u_rtrim(line);
        if(line[0]==0) {
            continue;  // skip empty and comment-only lines
        }
        if(line[0]=='*') {
            const char *s=u_skipWhitespace(line+1);
            if(0==strncmp(s, "Unicode", 7)) {
                s=u_skipWhitespace(s+7);
                builder.setUnicodeVersion(s);
            }
            continue;  // reserved syntax
        }
        const char *delimiter;
        int32_t rangeLength=
            u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
        if(errorCode.isFailure()) {
            fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
            exit(errorCode.reset());
        }
        delimiter=u_skipWhitespace(delimiter);
        if(*delimiter==':') {
            const char *s=u_skipWhitespace(delimiter+1);
            char *end;
            unsigned long value=strtoul(s, &end, 10);
            if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
                fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
                exit(U_PARSE_ERROR);
            }
            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
                builder.setCC(c, (uint8_t)value);
            }
            continue;
        }
        if(*delimiter=='-') {
            if(*u_skipWhitespace(delimiter+1)!=0) {
                fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
                exit(U_PARSE_ERROR);
            }
            for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
                builder.removeMapping(c);
            }
            continue;
        }
        if(*delimiter=='=' || *delimiter=='>') {
            UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
            int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
            if(errorCode.isFailure()) {
                fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
                exit(errorCode.reset());
            }
            UnicodeString mapping(FALSE, uchars, length);
            if(*delimiter=='=') {
                if(rangeLength!=1) {
                    fprintf(stderr,
                            "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
                            line);
                    exit(U_PARSE_ERROR);
                }
                builder.setRoundTripMapping((UChar32)startCP, mapping);
            } else {
                for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
                    builder.setOneWayMapping(c, mapping);
                }
            }
            continue;
        }
        fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
        exit(U_PARSE_ERROR);
    }
}