/*
 * Self-test for the character helpers: walks 'a'..'z', 'A'..'Z' and
 * '0'..'9' and asserts the expected classification and case-mapping result
 * for every character. Prints a success line and returns 0 when no
 * assertion fires.
 */
int main(void)
{
    char ch;

    /* Lowercase letters: alphabetic, lowercase, changed only by to_upper. */
    ch = 'a';
    while (ch <= 'z') {
        assert(is_alpha(ch));
        assert(!is_digit(ch));
        assert(is_lower(ch));
        assert(!is_upper(ch));
        assert(to_lower(ch) == ch);
        assert(to_upper(ch) != ch);
        ++ch;
    }

    /* Uppercase letters: alphabetic, uppercase, changed only by to_lower. */
    ch = 'A';
    while (ch <= 'Z') {
        assert(is_alpha(ch));
        assert(!is_digit(ch));
        assert(!is_lower(ch));
        assert(is_upper(ch));
        assert(to_lower(ch) != ch);
        assert(to_upper(ch) == ch);
        ++ch;
    }

    /* Digits: not alphabetic and left untouched by both case mappings. */
    ch = '0';
    while (ch <= '9') {
        assert(!is_alpha(ch));
        assert(is_digit(ch));
        assert(!is_lower(ch));
        assert(!is_upper(ch));
        assert(to_lower(ch) == ch);
        assert(to_upper(ch) == ch);
        ++ch;
    }

    printf("TEST SUCEEDED\n");
    return 0;
}
// Returns a two-character tag describing the kind of the given list item:
//   "*0"  null item
//   "Lu"  character accepted by is_capital()
//   "Ll"  character accepted by is_lower()
//   "Ol"  any other character
//   "B0"  left markup
//   "N0"  symbol value that casts to SExpressionInt
//   "D0"  symbol value that casts to SExpressionFloat
//   "Wi"  symbol whose text representation passes is_ident()
//   "Wq"  any other symbol
const char *get_type(RfListItem *typed)
{
    if (!typed) {
        return "*0";
    }

    if (typed->IsChar()) {
        if (is_capital(typed->charcode)) {
            return "Lu";
        }
        return is_lower(typed->charcode) ? "Ll" : "Ol";
    }

    if (typed->IsLeftMarkup()) {
        return "B0";
    }

    // Numeric symbols: the integer cast is tried before the float cast.
    if (typed->symb_val.DynamicCastGetPtr<SExpressionInt>()) {
        return "N0";
    }
    if (typed->symb_val.DynamicCastGetPtr<SExpressionFloat>()) {
        return "D0";
    }

    // Everything else is classified by its textual form.
    SString text = typed->symb_val->TextRepresentation();
    return is_ident((char *)text.c_str()) ? "Wi" : "Wq";
}
/*
 * Copies `word` into `correct_word` with a uniform letter case and returns
 * `correct_word`.
 *
 * Characters with codes 65..90 are counted as uppercase; every other
 * character (including non-letters) is counted on the other side. When the
 * uppercase count is strictly larger the copy is upper-cased, otherwise it
 * is lower-cased. `correct_word` must have room for the whole string plus
 * the terminating NUL.
 */
char* correct_words(char* correct_word,char* word)
{
    int upper_total = 0;
    int other_total = 0;
    int to_caps;
    int pos;

    /* First pass: decide which case wins. */
    for (pos = 0; word[pos] != '\0'; pos++) {
        if (word[pos] >= 65 && word[pos] < 91) {
            upper_total += 1;
        } else {
            other_total += 1;
        }
    }
    to_caps = upper_total > other_total;

    /* Second pass: copy while converting the minority case. */
    for (pos = 0; word[pos] != '\0'; pos++) {
        char ch = word[pos];

        if (to_caps) {
            if (is_lower(ch)) {
                ch = to_upper(ch);
            }
        } else {
            if (is_upper(ch)) {
                ch = to_lower(ch);
            }
        }
        correct_word[pos] = ch;
    }
    correct_word[pos] = '\0';

    return correct_word;
}
// Tries to glue the component `cap` on top of the cell `c` as an accent.
//
// Returns  0 when `cap` fails one of the geometric checks (nothing changed),
//         -1 when compose_cell() fails,
//          1 when the cells were composed; `c` then carries a single version
//            whose letter is u_bel or U_bel, chosen by the case of the
//            original first version, with its probability raised by 2
//            (capped at 254).
static int rec_shortu(cell* c, cell * cap) {
    // Remember the letter before composing; it selects the resulting case.
    const uchar base_let = c->vers[0].let;

    if (cap->row > c->row)
        return 0;                       // not a cap
    if (cap->width() * 3 < c->width())
        return 0;                       // not as wide as needed
    if (cap->width() < 4 || cap->height() < 3)
        return 0;                       // just a dot
    if (cap->col < c->col - 2)
        return 0;                       // dust on the left
    if (c->row > cap->row + cap->height()
            && (c->row - (cap->row + cap->height())) > c->height() / 2)
        return 0;                       // dust lies too high

    // 'u' with cap on an erected cell: the cap must cover the cell's middle.
    if ((base_let == (uchar)'\xE3') && (c->pos_inc & erect_rot)
            && (c->col + c->width() / 2 < cap->col
                || c->col + c->width() / 2 > cap->col + cap->width()))
        return 0;                       // dust is not centered

    cell * pair[8];
    pair[0] = c;
    pair[1] = cap;
    if (!compose_cell(2, pair, c))
        return -1;                      // OLEG: new return style of composed

    c->vers[0].let = is_lower(base_let) ? (uchar) u_bel : (uchar) U_bel;
    c->vers[0].prob = MIN(254, c->vers[0].prob + 2);
    c->recsource = 0;                   // artifact
    c->dens = 255;                      // undefined
    c->nvers = 1;
    c->vers[1].prob = 0;
    c->vers[1].let = 0;
    return 1;
}
bool str_is_slot_name(const char* s, fint len) { assert(len >= 0, "shouldn't be negative length"); if (len == 0) { return false; } char c = *s; if (!is_lower(c)) { if (!is_punct(c)) return false; switch (c) { case '^': case '|': case '\\': case '.': if (len == 1) return false; } for (int i = 0; i < len; ) { c = s[i++]; if (! is_punct(c)) return false; switch (c) { case '(': case ')': case '\'': case '\"': case ':': case '[': case ']': return false; } } return true; } for (int i = 1; i < len; ) { c = s[i++]; if (is_id_char(c)) continue; if (c != ':') return false; if (i == len) return true; // this was final ":" if (!is_upper(s[i])) return false; // after ":" must be uppercase if (s[len-1] != ':') return false; // one ":" -> last is ":" } return true; }
/*
 * Appends one character to the NUL-terminated token buffer g_sb_token,
 * converting lowercase characters with _toupper() first. The caller is
 * responsible for the buffer having room for one more character.
 */
static void tokencat ( char c )
{
  char *eot;
  char folded = c;

  if ( is_lower ( c ) )
  {
    folded = _toupper ( c );
  }

  /* Locate the terminator and overwrite it with the new character. */
  eot = strchr ( g_sb_token, '\0' );
  eot[0] = folded;
  eot[1] = '\0';
}
/**
 * Tells whether every character of the string 's' is a lowercase letter
 * with respect to the given alphabet.
 *
 * Returns 1 if so (an empty string also yields 1), 0 as soon as one
 * character is rejected by is_lower.
 */
int is_sequence_of_lowercase_letters(const unichar* s,const Alphabet* alphabet) {
const unichar* cursor;
for (cursor=s;*cursor!='\0';cursor++) {
   if (!is_lower(*cursor,alphabet)) {
      return 0;
   }
}
return 1;
}
// Checks the capitalisation of `word`. The usage is accepted when every
// character after the first is not uppercase, or when the first character is
// uppercase and none of the following characters is lowercase.
bool detectCapitalUse(string word) {
    const bool first = is_upper(word[0]);
    bool tail_has_lower = false;
    bool tail_has_upper = false;

    for (int i = 1; i < word.size(); i++) {
        if (is_lower(word[i])) {
            tail_has_lower = true;
        }
        if (is_upper(word[i])) {
            tail_has_upper = true;
        }
    }

    if (!tail_has_upper) {
        return true;
    }
    return first && !tail_has_lower;
}
/*
 * Sanity checks for a pair of sparse matrices. The costly structural checks
 * go through EXPENSIVE_ASSERT; the dimension and nonzero-count checks are
 * plain asserts:
 *   - L passes is_lower() and check_symbolic_zeros(),
 *   - P passes is_symmetric(),
 *   - L and P have the same N,
 *   - P->nz equals NZ_SYM(L->nz, L->N).
 */
static void validate_matrices(const SparseMatrix *L, const SparseMatrix *P)
{
    /* Structure of L. */
    EXPENSIVE_ASSERT(is_lower(L));
    EXPENSIVE_ASSERT(check_symbolic_zeros(L));

    /* Structure of P. */
    EXPENSIVE_ASSERT(is_symmetric(P));

    /* L and P must agree with each other. */
    assert(P->N == L->N);
    assert(NZ_SYM(L->nz, L->N) == P->nz);
}
/*----------------------------------------------------------------------------- ** Function: init_type() ** Purpose: This is the initialization routine for this file. This ** has to be called before some of the macros in |type.h| ** will work as described. It does no harm to call this ** initialization more than once. It just takes some time. ** ** Note that this function is for internal purposes ** only. The normal user should call |init_bibtool()| ** instead. ** Arguments: none ** Returns: nothing **___________________________________________________ */ void init_type() /* */ { register int i; /* */ /* */ for ( i = 0; i < 256; ++i ) /* */ { trans_lower[i] = is_upper(i)?to_lower(i):i; /* */ trans_upper[i] = is_lower(i)?to_upper(i):i; /* */ trans_id[i] = i; /* */ } /* */ } /*------------------------*/
/**
 * @fn Computes the Caesar-encoded replacement for a symbol, wrapping around
 * inside the symbol's own range.
 *
 * Digits stay within '0'..'9', lowercase letters within 'a'..'z' and
 * uppercase letters within 'A'..'Z' (the wrap-around itself is delegated to
 * cycle()). Any other symbol is returned unchanged.
 */
short offset(short symbol, int counter){
    const short shifted = symbol + counter;
    short result;

    if (is_digit(symbol)){
        result = cycle(shifted, '0', '9');
    } else if (is_lower(symbol)){
        result = cycle(shifted, 'a', 'z');
    } else if (is_upper(symbol)){
        result = cycle(shifted, 'A', 'Z');
    } else {
        result = symbol;
    }
    return result;
}
// Change the case of the current character. First check lower and then upper. If it is not a letter, it gets returned // unchanged. int chcase(int ch) { // Translate lowercase. if(is_lower(ch)) return upcase[ch]; // Translate uppercase. if(is_upper(ch)) return lowcase[ch]; // Let the rest pass. return ch; }
/*
 * Encodes a single character with the given key.
 *
 * Uppercase letters are rotated relative to 'A' and lowercase letters
 * relative to 'a', both over 26 positions; every other character is
 * returned as is.
 */
char encode(char src, int key)
{
    char base;

    if (is_upper(src)) {
        base = 'A';
    } else if (is_lower(src)) {
        base = 'a';
    } else {
        return src;
    }
    return rotate(src, base, key, 26);
}
int minimumNumber(int n, std::string p) { int t[4] = {0}; for (std::size_t i = 0; i < p.size(); ++i) if (is_upper(p[i])) t[0] = 1; else if (is_lower(p[i])) t[1] = 1; else if (is_digit(p[i])) t[2] = 1; else if (is_special(p[i])) t[3] = 1; int s = 0; for (int i = 0; i < 4; ++i) s += t[i]; if (n < 6) return std::max(6 - n, 4 - s); else return 4 - s; }
/*
 * Tries to glue the component `cap` on top of the cell `c` as an accent.
 *
 * Returns  0 when `cap` fails one of the geometric checks (nothing changed),
 *         -1 when compose_cell() fails,
 *          1 when the cells were composed; `c` then carries a single version
 *            whose letter is '\xa9' or '\x89', chosen by the case of the
 *            original first version, with its probability raised by 2
 *            (capped at 254).
 */
int16_t rec_ii(cell* c,cell * cap)
{
 /* Remember the letter before composing; it selects the resulting case. */
 const uchar base_let = c->vers[0].let;
 cell *pair[8];

 if( cap->row > c->row )
    return 0;                           /* not a cap */
 if( cap->w*3 < c->w )
    return 0;                           /* not as wide as needed */
 if( cap->w < 4 || cap->h < 3 )
    return 0;                           /* just a dot */
 if( cap->col < c->col-2 )
    return 0;                           /* dust on the left */
 if( c->row > cap->row+cap->h &&
     (c->row-(cap->row+cap->h)) > c->h/2 )
    return 0;                           /* dust lies too high */

 /* Any letter other than r_cu_u, or r_cu_u on an erected cell: the cap
    must cover the horizontal middle of the cell. */
 if( (base_let != r_cu_u || (c->pos_inc&erect_rot)) &&
     (c->col+c->w/2 < cap->col || c->col+c->w/2 > cap->col+cap->w) )
    return 0;                           /* dust is not centered */

 /* r_cu_u ('u' with cap): the cap must at least overlap the cell. */
 if( base_let == r_cu_u &&
     (c->col+c->w < cap->col || c->col > cap->col+cap->w) )
    return 0;                           /* dust is not centered */

 if(0&&!p2_active)                      /* OLEG: check switched off */
 if( base_let==r_cu_u || base_let==(uchar)'\xa8' /* small i */ )
    {
    B_LINES bl;
    get_b_lines(c,&bl);
    if( cap->row+cap->h<=bl.b1+1 )
        return 0;
    }

 pair[0]=c;
 pair[1]=cap;
 if( !compose_cell(2,pair,c) )
    return -1;                          /* OLEG: new return style of composed */

 c->vers[0].let  = is_lower(base_let) ? (uchar)'\xa9' /* small short i */
                                      : (uchar)'\x89' /* capital short i */;
 c->vers[0].prob = MIN(254,c->vers[0].prob+2);
 c->recsource    = 0;                   /* artifact */
 c->dens         = 255;                 /* undefined */
 c->nvers        = 1;
 c->vers[1].prob = 0;
 c->vers[1].let  = 0;
 return 1;
}
//input: *c=='[' **pc==':' static u16 bracket_class(u8 *c,u8 **pc,u8 **sc,u8 not,u8 sc_folded) { u8 char_class[CHAR_CLASS_MAX+1];//don't forget the 0 terminating char u16 r=bracket_char_class_get(c,pc,not,sc_folded,&char_class[0]); if(r!=OK) return r; if((STREQ(char_class,"alnum")&&is_alnum(**sc)) ||(STREQ(char_class,"alpha")&&is_alpha(**sc)) ||(STREQ(char_class,"blank")&&is_blank(**sc)) ||(STREQ(char_class,"cntrl")&&is_cntrl(**sc)) ||(STREQ(char_class,"digit")&&is_digit(**sc)) ||(STREQ(char_class,"graph")&&is_graph(**sc)) ||(STREQ(char_class,"lower")&&is_lower(**sc)) ||(STREQ(char_class,"print")&&is_print(**sc)) ||(STREQ(char_class,"punct")&&is_punct(**sc)) ||(STREQ(char_class,"space")&&is_space(**sc)) ||(STREQ(char_class,"upper")&&is_upper(**sc)) ||(STREQ(char_class,"xdigit")&&is_xdigit(**sc))) return bracket_matched(c,pc,not); *c=*(*pc)++; return OK; }
/*
 * Converts the leading digits of a string to an unsigned long.
 *
 * cp:   string to convert
 * endp: when non-NULL, receives a pointer to the first unconverted character
 * base: number base; 0 selects it from the prefix ("0x" followed by a hex
 *       digit -> 16, leading "0" -> 8, anything else -> 10)
 *
 * Conversion stops at the first character that is not a hex digit or whose
 * value is not below the base. No overflow checking is done.
 */
unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base)
{
	unsigned long acc = 0;

	if (base == 0) {
		if (*cp != '0') {
			base = 10;
		} else {
			cp++;
			if ((*cp == 'x') && is_xdigit(cp[1])) {
				cp++;
				base = 16;
			} else {
				base = 8;
			}
		}
	}

	while (is_xdigit(*cp)) {
		unsigned long digit;

		if (is_digit(*cp))
			digit = *cp - '0';
		else
			digit = (is_lower(*cp) ? toupper(*cp) : *cp) - 'A' + 10;

		/* A digit that does not fit the base ends the number. */
		if (digit >= base)
			break;

		acc = acc * base + digit;
		cp++;
	}

	if (endp)
		*endp = (char *)cp;
	return acc;
}
/* A character is alphabetic when it is a lowercase or an uppercase letter. */
bool is_alpha(char ch)
{
    if (is_lower(ch)) {
        return true;
    }
    return is_upper(ch);
}
// Tell whether a character is a letter. A letter is anything found in either
// the upper or the lower case table (even when it translates to itself).
bool isletter(int ch) {

	if(is_upper(ch))
		return true;
	return is_lower(ch);
	}
bool SkFontConfigInterfaceDirect::matchFamilySet(const char inFamilyName[], SkString* outFamilyName, SkTArray<FontIdentity>* ids) { SkAutoMutexAcquire ac(mutex_); #if 0 SkString familyStr(familyName ? familyName : ""); if (familyStr.size() > kMaxFontFamilyLength) { return false; } SkAutoMutexAcquire ac(mutex_); FcPattern* pattern = FcPatternCreate(); if (familyName) { FcPatternAddString(pattern, FC_FAMILY, (FcChar8*)familyName); } FcPatternAddBool(pattern, FC_SCALABLE, FcTrue); FcConfigSubstitute(NULL, pattern, FcMatchPattern); FcDefaultSubstitute(pattern); // Font matching: // CSS often specifies a fallback list of families: // font-family: a, b, c, serif; // However, fontconfig will always do its best to find *a* font when asked // for something so we need a way to tell if the match which it has found is // "good enough" for us. Otherwise, we can return NULL which gets piped up // and lets WebKit know to try the next CSS family name. However, fontconfig // configs allow substitutions (mapping "Arial -> Helvetica" etc) and we // wish to support that. // // Thus, if a specific family is requested we set @family_requested. Then we // record two strings: the family name after config processing and the // family name after resolving. If the two are equal, it's a good match. // // So consider the case where a user has mapped Arial to Helvetica in their // config. // requested family: "Arial" // post_config_family: "Helvetica" // post_match_family: "Helvetica" // -> good match // // and for a missing font: // requested family: "Monaco" // post_config_family: "Monaco" // post_match_family: "Times New Roman" // -> BAD match // // However, we special-case fallback fonts; see IsFallbackFontAllowed(). 
const char* post_config_family = get_name(pattern, FC_FAMILY); FcResult result; FcFontSet* font_set = FcFontSort(0, pattern, 0, 0, &result); if (!font_set) { FcPatternDestroy(pattern); return false; } FcPattern* match = MatchFont(font_set, post_config_family, familyStr); if (!match) { FcPatternDestroy(pattern); FcFontSetDestroy(font_set); return false; } FcPatternDestroy(pattern); // From here out we just extract our results from 'match' if (FcPatternGetString(match, FC_FAMILY, 0, &post_config_family) != FcResultMatch) { FcFontSetDestroy(font_set); return false; } FcChar8* c_filename; if (FcPatternGetString(match, FC_FILE, 0, &c_filename) != FcResultMatch) { FcFontSetDestroy(font_set); return false; } int face_index; if (FcPatternGetInteger(match, FC_INDEX, 0, &face_index) != FcResultMatch) { FcFontSetDestroy(font_set); return false; } FcFontSetDestroy(font_set); if (outIdentity) { outIdentity->fTTCIndex = face_index; outIdentity->fString.set((const char*)c_filename); } if (outFamilyName) { outFamilyName->set((const char*)post_config_family); } if (outStyle) { *outStyle = GetFontStyle(match); } return true; //////////////////// int count; FcPattern** match = MatchFont(font_set, post_config_family, &count); if (!match) { FcPatternDestroy(pattern); FcFontSetDestroy(font_set); return NULL; } FcPatternDestroy(pattern); SkTDArray<FcPattern*> trimmedMatches; for (int i = 0; i < count; ++i) { const char* justName = find_just_name(get_name(match[i], FC_FILE)); if (!is_lower(*justName)) { *trimmedMatches.append() = match[i]; } } SkFontStyleSet_FC* sset = new SkFontStyleSet_FC (trimmedMatches.begin(), trimmedMatches.count()); #endif return false; }
/**
 * Builds the control byte that summarizes the characteristics of the
 * given token. The bits that can be set are:
 *
 * - for a token starting with a letter (considered to be a word):
 *   MOT, NOT_DIC if the token is listed in 'err', then either
 *   PRE (first letter uppercase) plus MAJ (no lowercase letter at all),
 *   or MIN (no uppercase letter at all);
 * - for a tag token like {today,.ADV}, except {S} and {STOP}:
 *   MOT, DIC and TDIC, the PRE/MAJ/MIN bits computed on the inflected form
 *   of the tag, and CDIC if the inflected form is not a simple word.
 *
 * Any other token gets 0. A NULL or empty token is a fatal error.
 */
unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) {
unsigned char flags=0;
int found;
int k;
if (token==NULL || token[0]=='\0') {
   fatal_error("NULL or empty token in get_control_byte\n");
}
/* A token that starts with a letter is considered to be a word */
if (is_letter(token[0],alph)) {
   set_bit_mask(&flags,MOT_TOKEN_BIT_MASK);
   /* For a word, we look it up in the 'err' word list in order to answer
    * the question <!DIC>. This avoids taking "priori" as an unknown word
    * if the compound "a priori" is in the text. */
   if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) {
      set_bit_mask(&flags,NOT_DIC_TOKEN_BIT_MASK);
   }
   if (is_upper(token[0],alph)) {
      set_bit_mask(&flags,PRE_TOKEN_BIT_MASK);
      /* All uppercase means: no lowercase character anywhere */
      found=0;
      for (k=0;token[k]!='\0';k++) {
         if (is_lower(token[k],alph)) {
            found=1;
            break;
         }
      }
      if (!found) {
         set_bit_mask(&flags,MAJ_TOKEN_BIT_MASK);
      }
      return flags;
   }
   /* All lowercase means: no uppercase character anywhere */
   found=0;
   for (k=0;token[k]!='\0';k++) {
      if (is_upper(token[k],alph)) {
         found=1;
         break;
      }
   }
   if (!found) {
      set_bit_mask(&flags,MIN_TOKEN_BIT_MASK);
   }
   return flags;
}
/* The token doesn't start with a letter: the only remaining interesting
 * case is a tag like {today,.ADV}; {S} and {STOP} are not such tags */
if (token[0]!='{' || !u_strcmp(token,"{S}") || !u_strcmp(token,"{STOP}")) {
   return flags;
}
/* Anyway, such a tag is classed as verifying <MOT> and <DIC> */
set_bit_mask(&flags,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK);
struct dela_entry* entry=tokenize_tag_token(token);
found=0;
if (is_upper(entry->inflected[0],alph)) {
   set_bit_mask(&flags,PRE_TOKEN_BIT_MASK);
   for (k=0;entry->inflected[k]!='\0';k++) {
      if (is_letter(entry->inflected[k],alph) && is_lower(entry->inflected[k],alph)) {
         found=1;
         break;
      }
   }
   if (!found) {
      set_bit_mask(&flags,MAJ_TOKEN_BIT_MASK);
   }
} else {
   for (k=0;entry->inflected[k]!='\0';k++) {
      if (is_letter(entry->inflected[k],alph) && is_upper(entry->inflected[k],alph)) {
         found=1;
         break;
      }
   }
   if (!found) {
      set_bit_mask(&flags,MIN_TOKEN_BIT_MASK);
   }
}
if (!is_a_simple_word(entry->inflected,tokenization_policy,alph)) {
   /* A tag that is a compound word verifies the <CDIC> pattern */
   set_bit_mask(&flags,CDIC_TOKEN_BIT_MASK);
}
free_dela_entry(entry);
return flags;
}
/**
 * Appends to 'dest', starting at index 'j', the variants that go with the
 * lowercase letter 'letter', and returns the new write index.
 *
 * Without an alphabet (a==NULL), the single variant is u_toupper(letter).
 * With an alphabet, the string registered for the letter in
 * a->t_array_collection is copied (presumably its uppercase equivalents --
 * to be confirmed against the Alphabet loader); nothing is appended when
 * the letter has no entry.
 */
static int append_letter_variants(const Alphabet* a,unichar letter,unichar* dest,int j) {
if (a==NULL) {
   /* If there is no alphabet file, we just consider the unique
    * uppercase variant of the letter */
   dest[j++]=u_toupper(letter);
   return j;
}
/* If there is an alphabet file, we use it. Index 0 means "no entry". */
int pos_in_collection=a->pos_in_represent_list[letter];
if (pos_in_collection!=0) {
   const unichar* variants=a->t_array_collection[pos_in_collection];
   if (variants!=NULL) {
      while (*variants!='\0') {
         dest[j++]=*(variants++);
      }
   }
}
return j;
}


/**
 * Takes a given unicode string 'src' and
 * replaces any lowercase letter by the set made of itself and
 * its uppercase equivalent, surrounded with square brackets if
 * the letter was not already between square brackets.
 * Examples:
 *
 * "For" => "F[oO][rR]"
 * "F[ao]r" => "F[aAoO][rR]"
 * "[a-d]" => "[a-dA-D]"
 *
 * The output is stored in 'dest'. The function assumes that 'dest' is
 * wide enough.
 *
 * A backslash followed by a lowercase letter is dropped and the letter is
 * expanded as above; any other backslash sequence is copied unchanged.
 * Regular expression metacharacters and the digits 1-9 are copied as is.
 *
 * This function is used for morphological filter regular expressions.
 */
void replace_letter_by_letter_set(const Alphabet* a,unichar* dest,const unichar* src) {
int i=0,j=0;
char inside_a_set=0;
while (src[i]!='\0') {
   switch (src[i]) {
      case '\\':
         if (src[i+1]=='\0') {
            // there is nothing after a backslash, then we stop,
            // and the RE compiler may indicate an error
            dest[j++] = src[i++];
            dest[j] = src[i];
            return;
         }
         if (is_lower(src[i+1],a)) {
            // this is a lowercase letter in Unitex alphabet :
            // we don't need "\" and we make expansion "[eE]"
            ++i;
            if (!inside_a_set) dest[j++]='[';
            dest[j++]=src[i];
            j=append_letter_variants(a,src[i],dest,j);
            if (!inside_a_set) dest[j++]=']';
            i++;
         } else {
            // others cases :
            // we keep the "\" and the letter
            dest[j++] = src[i++];
            dest[j++] = src[i++];
         }
         break;
      case '[': dest[j++]=src[i++]; inside_a_set=1; break;
      case ']': dest[j++]=src[i++]; inside_a_set=0; break;
      case '.': case '*': case '+': case '?': case '|': case '^': case '$':
      case ':': case '(': case ')': case '{': case '}':
      case '1': case '2': case '3': case '4': case '5': case '6':
      case '7': case '8': case '9':
         dest[j++]=src[i++];
         break;
      default:
         if (is_lower(src[i],a)) {
            if (!inside_a_set) dest[j++]='[';
            dest[j++]=src[i];
            /* The range form needs an end character: without the
             * src[i+2] test, a string ending with "x-" would make us
             * step over the terminating '\0'. */
            if (inside_a_set && src[i+1]=='-' && src[i+2]!='\0') {
               /* Special case:
                * if we had [a-d], we don't want to turn it into
                * [aA-dD], but rather into [a-dA-D]. In such a case,
                * we just use u_toupper */
               i=i+2;
               dest[j++]='-';
               dest[j++]=src[i++];
               /* src[i-3] is the range start and src[i-1] the range end.
                * The start must be read from 'src': 'dest' is indexed by
                * j, not i, and the two diverge as soon as any earlier
                * letter has been expanded. */
               dest[j++]=u_toupper(src[i-3]);
               dest[j++]='-';
               dest[j++]=u_toupper(src[i-1]);
               continue;
            }
            j=append_letter_variants(a,src[i],dest,j);
            if (!inside_a_set) dest[j++]=']';
            i++;
         } else {
            /* Not a lower case letter */
            dest[j++]=src[i++];
         }
   }
}
dest[j]='\0';
}
// Numeric value of a digit character: '0'..'9' give 0..9, lowercase letters
// give 10 + distance from 'a', and everything else is treated as an
// uppercase letter and gives 10 + distance from 'A'. No validation is done.
inline fint asnum(fint c) {
  if (is_digit(c)) {
    return c - '0';
  }
  if (is_lower(c)) {
    return c - 'a' + 10;
  }
  return c - 'A' + 10;
}
bool SkParsePath::FromSVGString(const char data[], SkPath* result) { SkPath path; SkPoint f = {0, 0}; SkPoint c = {0, 0}; SkPoint lastc = {0, 0}; SkPoint points[3]; char op = '\0'; char previousOp = '\0'; bool relative = false; for (;;) { data = skip_ws(data); if (data[0] == '\0') { break; } char ch = data[0]; if (is_digit(ch) || ch == '-' || ch == '+') { if (op == '\0') { return false; } } else { op = ch; relative = false; if (is_lower(op)) { op = (char) to_upper(op); relative = true; } data++; data = skip_sep(data); } switch (op) { case 'M': data = find_points(data, points, 1, relative, &c); path.moveTo(points[0]); op = 'L'; c = points[0]; break; case 'L': data = find_points(data, points, 1, relative, &c); path.lineTo(points[0]); c = points[0]; break; case 'H': { SkScalar x; data = find_scalar(data, &x, relative, c.fX); path.lineTo(x, c.fY); c.fX = x; } break; case 'V': { SkScalar y; data = find_scalar(data, &y, relative, c.fY); path.lineTo(c.fX, y); c.fY = y; } break; case 'C': data = find_points(data, points, 3, relative, &c); goto cubicCommon; case 'S': data = find_points(data, &points[1], 2, relative, &c); points[0] = c; if (previousOp == 'C' || previousOp == 'S') { points[0].fX -= lastc.fX - c.fX; points[0].fY -= lastc.fY - c.fY; } cubicCommon: path.cubicTo(points[0], points[1], points[2]); lastc = points[1]; c = points[2]; break; case 'Q': // Quadratic Bezier Curve data = find_points(data, points, 2, relative, &c); goto quadraticCommon; case 'T': data = find_points(data, &points[1], 1, relative, &c); points[0] = points[1]; if (previousOp == 'Q' || previousOp == 'T') { points[0].fX = c.fX * 2 - lastc.fX; points[0].fY = c.fY * 2 - lastc.fY; } quadraticCommon: path.quadTo(points[0], points[1]); lastc = points[0]; c = points[1]; break; case 'Z': path.close(); #if 0 // !!! still a bug? if (fPath.isEmpty() && (f.fX != 0 || f.fY != 0)) { c.fX -= SkScalar.Epsilon; // !!! enough? 
fPath.moveTo(c); fPath.lineTo(f); fPath.close(); } #endif c = f; op = '\0'; break; case '~': { SkPoint args[2]; data = find_points(data, args, 2, false, NULL); path.moveTo(args[0].fX, args[0].fY); path.lineTo(args[1].fX, args[1].fY); } break; default: return false; } if (previousOp == 0) { f = c; } previousOp = op; } // we're good, go ahead and swap in the result result->swap(path); return true; }
/*
 * Returns 1 when `ch` is an uppercase or a lowercase letter, 0 otherwise.
 */
unsigned int is_letter(unsigned char ch)
{
    if (is_upper(ch)) {
        return 1;
    }
    return is_lower(ch) ? 1 : 0;
}
static void output_tables (const char *filename, const char *version) { FILE *stream; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "escape_char /\n"); fprintf (stream, "comment_char %%\n"); fprintf (stream, "\n"); fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n", version); fprintf (stream, "\n"); fprintf (stream, "LC_IDENTIFICATION\n"); fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version); fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n"); fprintf (stream, "address \"\"\n"); fprintf (stream, "contact \"\"\n"); fprintf (stream, "email \"[email protected]\"\n"); fprintf (stream, "tel \"\"\n"); fprintf (stream, "fax \"\"\n"); fprintf (stream, "language \"\"\n"); fprintf (stream, "territory \"Earth\"\n"); fprintf (stream, "revision \"%s\"\n", version); { time_t now; char date[11]; now = time (NULL); strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now)); fprintf (stream, "date \"%s\"\n", date); } fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n"); fprintf (stream, "END LC_IDENTIFICATION\n"); fprintf (stream, "\n"); /* Verifications. */ for (ch = 0; ch < 0x110000; ch++) { /* toupper restriction: "Only characters specified for the keywords lower and upper shall be specified. */ if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch))) fprintf (stderr, "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n", ucs_symbol (ch), ch, to_upper (ch)); /* tolower restriction: "Only characters specified for the keywords lower and upper shall be specified. */ if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch))) fprintf (stderr, "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n", ucs_symbol (ch), ch, to_lower (ch)); /* alpha restriction: "Characters classified as either upper or lower shall automatically belong to this class. 
*/ if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch)) fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch)); /* alpha restriction: "No character specified for the keywords cntrl, digit, punct or space shall be specified." */ if (is_alpha (ch) && is_cntrl (ch)) fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch)); if (is_alpha (ch) && is_digit (ch)) fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch)); if (is_alpha (ch) && is_punct (ch)) fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch)); if (is_alpha (ch) && is_space (ch)) fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch)); /* space restriction: "No character specified for the keywords upper, lower, alpha, digit, graph or xdigit shall be specified." upper, lower, alpha already checked above. */ if (is_space (ch) && is_digit (ch)) fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch)); if (is_space (ch) && is_graph (ch)) fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch)); if (is_space (ch) && is_xdigit (ch)) fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch)); /* cntrl restriction: "No character specified for the keywords upper, lower, alpha, digit, punct, graph, print or xdigit shall be specified." upper, lower, alpha already checked above. */ if (is_cntrl (ch) && is_digit (ch)) fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_punct (ch)) fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_graph (ch)) fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_print (ch)) fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch)); if (is_cntrl (ch) && is_xdigit (ch)) fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch)); /* punct restriction: "No character specified for the keywords upper, lower, alpha, digit, cntrl, xdigit or as the <space> character shall be specified." upper, lower, alpha, cntrl already checked above. 
*/ if (is_punct (ch) && is_digit (ch)) fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch)); if (is_punct (ch) && is_xdigit (ch)) fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch)); if (is_punct (ch) && (ch == 0x0020)) fprintf (stderr, "%s is punct\n", ucs_symbol (ch)); /* graph restriction: "No character specified for the keyword cntrl shall be specified." Already checked above. */ /* print restriction: "No character specified for the keyword cntrl shall be specified." Already checked above. */ /* graph - print relation: differ only in the <space> character. How is this possible if there are more than one space character?! I think susv2/xbd/locale.html should speak of "space characters", not "space character". */ if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch))) fprintf (stderr, "%s is print but not graph|<space>\n", ucs_symbol (ch)); if (!is_print (ch) && (is_graph (ch) || ch == 0x0020)) fprintf (stderr, "%s is graph|<space> but not print\n", ucs_symbol (ch)); } fprintf (stream, "LC_CTYPE\n"); output_charclass (stream, "upper", is_upper); output_charclass (stream, "lower", is_lower); output_charclass (stream, "alpha", is_alpha); output_charclass (stream, "digit", is_digit); output_charclass (stream, "outdigit", is_outdigit); output_charclass (stream, "blank", is_blank); output_charclass (stream, "space", is_space); output_charclass (stream, "cntrl", is_cntrl); output_charclass (stream, "punct", is_punct); output_charclass (stream, "xdigit", is_xdigit); output_charclass (stream, "graph", is_graph); output_charclass (stream, "print", is_print); output_charclass (stream, "class \"combining\";", is_combining); output_charclass (stream, "class \"combining_level3\";", is_combining_level3); output_charmap (stream, "toupper", to_upper); output_charmap (stream, "tolower", to_lower); output_charmap (stream, "map \"totitle\";", to_title); output_widthmap (stream); fprintf (stream, "END LC_CTYPE\n"); if (ferror (stream) || fclose 
(stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } }
/*
 * Maps a lowercase character to its uppercase counterpart by shifting it by
 * the distance between 'A' and 'a'. Any other character is returned as is.
 */
char to_upper(char c)
{
    return is_lower(c) ? (char)(c + 'A' - 'a') : c;
}
/*
 * Branch-free upper-casing: subtracts 32 times the value of is_lower(x)
 * from x, so the result is only meaningful when is_lower() yields exactly
 * 0 or 1.
 */
char to_upper(char x)
{
    return x - (is_lower(x) * 32);
}
/* Tells whether `c` may start an identifier: a letter or an underscore. */
static bool is_identifier_first(char c)
{
    if (is_upper(c)) {
        return true;
    }
    if (is_lower(c)) {
        return true;
    }
    return c == '_';
}
/*
 * scan_graph - recursively explores state `e` of graph `n_graph` in p->fst2
 * against the text in p->buffer, starting at offset pos + p->current_origin,
 * and builds the produced output on p->stack.
 *
 * Parameters:
 *   n_graph             number of the current graph; 0 is treated as the
 *                       main graph when a final state is reached.
 *   e                   number of the current state (index into
 *                       p->fst2->states and p->token_tree).
 *   pos                 current position in the text, relative to
 *                       p->current_origin.
 *   depth               recursion depth of the caller; incremented here and
 *                       compared with MAX_DEPTH.
 *   liste_arrivee       list that receives the matches found while inside a
 *                       subgraph (filled through insert_if_absent). It is
 *                       dereferenced without a NULL check in the subgraph
 *                       final-state case.
 *   mot_token_buffer    scratch buffer into which the letter/digit sequences
 *                       read from the text are copied.
 *   p                   Fst2Txt state: grammar, text buffer, output stack,
 *                       variables, alphabet and policies.
 *   prv_alloc_recycle   allocator handed to insert_if_absent and
 *                       free_parsing_info.
 *
 * Behaviour, in order:
 *   1. If depth exceeds MAX_DEPTH, an error is reported, p->output is
 *      cleared, p->input_length is reset to 0, the stack is emptied, every
 *      element of *liste_arrivee is freed, and the function returns so that
 *      matching can resume at the next position.
 *   2. If the current state is final, the stack is NUL-terminated. In the
 *      main graph the stack content is copied to p->output when
 *      pos >= p->input_length (and p->input_length becomes pos); in a
 *      subgraph the match is recorded in *liste_arrivee.
 *   3. The function returns when pos + p->current_origin equals
 *      p->text_buffer->size (end of text).
 *   4. If p->token_tree[e] has a transition array, a token is read from the
 *      text and every transition returned by get_matching_tags is followed
 *      recursively, using a freshly malloc'ed token buffer of
 *      MOT_BUFFER_TOKEN_SIZE unichars that is freed afterwards. Each
 *      returned Transition node is freed after use.
 *   5. Every transition of the state is then tried, with the stack pointer
 *      reset before each one:
 *        - negative tag number: subgraph call. The stack is saved with
 *          u_strdup, the subgraph is explored from its initial state, and
 *          for each match found the exploration continues from
 *          t->state_number; the stack is then restored.
 *        - otherwise the tag is dispatched on etiq->type (variable start
 *          and end tags; output-variable tags are fatal errors) and on
 *          etiq->input compared with the ETIQ_* constants: any letter
 *          sequence, digit sequence, upper-case sequence, lower-case
 *          sequence, sequence starting with an upper-case letter,
 *          punctuation (including "..."), empty sequence, new line, '#'
 *          (no space), mandatory space, single letter, and finally a
 *          literal sequence matched either exactly (RESPECT_CASE_TAG_BIT_MASK)
 *          or with the case variation allowed by is_equal_or_uppercase.
 *      In most branches a leading space in the text is skipped and, when
 *      p->output_policy is MERGE_OUTPUTS, pushed on the stack; in that mode
 *      the recognised input is also pushed after the tag output.
 *
 * NOTE(review): every recursive call passes seven arguments and omits
 * prv_alloc_recycle; presumably a prototype elsewhere supplies a default
 * value. If that default differs from the allocator received here, items
 * created in the recursion are freed with a different allocator - confirm.
 * NOTE(review): the single-letter branch tests a length of 3 but calls
 * u_trymatch_superfast5 with ETIQ_L_LN3, whereas the other length-3 branches
 * call u_trymatch_superfast3 - confirm this is intended.
 * NOTE(review): several branches read p->buffer[pos2+p->current_origin]
 * after a loop that may have stopped at p->text_buffer->size - confirm the
 * buffer is padded or terminated.
 */
void scan_graph(int n_graph, // number of current graph int e, // number of current state int pos, // int depth, struct parsing_info** liste_arrivee, unichar* mot_token_buffer, struct fst2txt_parameters* p,Abstract_allocator prv_alloc_recycle) { Fst2State etat_courant=p->fst2->states[e]; if (depth > MAX_DEPTH) { error( "\n" "Maximal stack size reached in graph %i!\n" "Recognized more than %i tokens starting from:\n" " ", n_graph, MAX_DEPTH); for (int i=0; i<60; i++) { error("%S",p->buffer[p->current_origin+i]); } error("\nSkipping match at this position, trying from next token!\n"); p->output[0] = '\0'; // clear output p->input_length = 0; // reset taille_entree empty(p->stack); // clear output stack if (liste_arrivee != NULL) { while (*liste_arrivee != NULL) { // free list of subgraph matches struct parsing_info* la_tmp=*liste_arrivee; *liste_arrivee=(*liste_arrivee)->next; la_tmp->next=NULL; // to don't free the next item free_parsing_info(la_tmp, prv_alloc_recycle); } } return; // exit(1); // don't exit, try at next position } depth++; if (is_final_state(etat_courant)) { // if we are in a final state p->stack->stack[p->stack->stack_pointer+1]='\0'; if (n_graph == 0) { // in main graph if (pos>=p->input_length/*sommet>u_strlen(output)*/) { // and if the recognized input is longer than the current one, it replaces it u_strcpy(p->output,p->stack->stack); p->input_length=(pos); } } else { // in a subgraph (*liste_arrivee)=insert_if_absent(pos,-1,-1,(*liste_arrivee),p->stack->stack_pointer+1, p->stack->stack,p->variables,NULL,NULL,-1,-1,NULL,-1, prv_alloc_recycle); } } if (pos+p->current_origin==p->text_buffer->size) { // if we are at the end of the text, we return return; } int SOMMET=p->stack->stack_pointer+1; int pos2; /* If there are some letter sequence transitions like %hello, we process them */ if (p->token_tree[e]->transition_array!=NULL) { if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} /* we 
don't keep this line because of problems occur in sentence tokenizing * if the return sequence is defautly considered as a separator like space else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} */ else pos2=pos; int position=0; unichar *token=mot_token_buffer; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || (is_letter(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) { /* If we are in character by character mode */ while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { token[position++]=p->buffer[(pos2++)+p->current_origin]; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION) { break; } } token[position]='\0'; if (position!=0 && (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || !(is_letter(token[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)))) { // we proceed only if we have exactly read the contenu sequence // in both modes MERGE and REPLACE, we process the transduction if any int SOMMET2=p->stack->stack_pointer; Transition* RES=get_matching_tags(token,p->token_tree[e],p->alphabet); Transition* TMP; unichar* mot_token_new_recurse_buffer=NULL; if (RES!=NULL) { // we allocate a new mot_token_buffer for the scan_graph recursin because we need preserve current // token=mot_token_buffer mot_token_new_recurse_buffer=(unichar*)malloc(MOT_BUFFER_TOKEN_SIZE*sizeof(unichar)); if (mot_token_new_recurse_buffer==NULL) { fatal_alloc_error("scan_graph"); } } while (RES!=NULL) { p->stack->stack_pointer=SOMMET2; Fst2Tag etiq=p->fst2->tags[RES->tag_number]; traiter_transduction(p,etiq->output); int longueur=u_strlen(etiq->input); unichar C=token[longueur]; token[longueur]='\0'; if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have 
read push_input_string(p->stack,token,0); } token[longueur]=C; scan_graph(n_graph,RES->state_number,pos2-(position-longueur),depth,liste_arrivee,mot_token_new_recurse_buffer,p); TMP=RES; RES=RES->next; free(TMP); } if (mot_token_new_recurse_buffer!=NULL) { free(mot_token_new_recurse_buffer); } } } } Transition* t=etat_courant->transitions; while (t!=NULL) { p->stack->stack_pointer=SOMMET-1; // we process the transition of the current state int n_etiq=t->tag_number; if (n_etiq<0) { // case of a sub-graph struct parsing_info* liste=NULL; unichar* pile_old; p->stack->stack[p->stack->stack_pointer+1]='\0'; pile_old = u_strdup(p->stack->stack); scan_graph((((unsigned)n_etiq)-1),p->fst2->initial_states[-n_etiq],pos,depth,&liste,mot_token_buffer,p); while (liste!=NULL) { p->stack->stack_pointer=liste->stack_pointer-1; u_strcpy(p->stack->stack,liste->stack); scan_graph(n_graph,t->state_number,liste->position,depth,liste_arrivee,mot_token_buffer,p); struct parsing_info* l_tmp=liste; liste=liste->next; l_tmp->next=NULL; // to don't free the next item free_parsing_info(l_tmp, prv_alloc_recycle); } u_strcpy(p->stack->stack,pile_old); free(pile_old); p->stack->stack_pointer=SOMMET-1; } else { // case of a normal tag Fst2Tag etiq=p->fst2->tags[n_etiq]; unichar* contenu=etiq->input; int contenu_len_possible_match=u_len_possible_match(contenu); if (etiq->type==BEGIN_OUTPUT_VAR_TAG) { fatal_error("Unsupported $|XXX( tags in Fst2Txt\n"); } if (etiq->type==END_OUTPUT_VAR_TAG) { fatal_error("Unsupported $|XXX) tags in Fst2Txt\n"); } if (etiq->type==BEGIN_VAR_TAG) { // case of a $a( variable tag //int old; struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable); if (L==NULL) { fatal_error("Unknown variable: %S\n",etiq->variable); } //old=L->start; if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) { pos2=pos+1; if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' '); } //else if (buffer[pos+origine_courante]==0x0d) 
{pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; L->start_in_tokens=pos2; scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); //L->start=old; } else if (etiq->type==END_VAR_TAG) { // case of a $a) variable tag //int old; struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable); if (L==NULL) { fatal_error("Unknown variable: %S\n",etiq->variable); } //old=L->end; if (pos>0) L->end_in_tokens=pos-1; else L->end_in_tokens=pos; // BUG: when the buffer changes, handle the case where start is in the old buffer and end in the new one scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p); //L->end=old; } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MOT_LN5))) { // case of transition by any sequence of letters if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) { pos2=pos+1; if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' '); } //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) { while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0) { // we proceed only if we have read a letter sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_output_string(p->stack,mot); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if 
((contenu_len_possible_match==4) && (!u_trymatch_superfast4(contenu,ETIQ_NB_LN4))) { // case of transition by any sequence of digits if (p->buffer[pos+p->current_origin]==' ') { pos2=pos+1; if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' '); } //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; while (pos2+p->current_origin<p->text_buffer->size && (p->buffer[pos2+p->current_origin]>='0') && (p->buffer[pos2+p->current_origin]<='9')) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0) { // we proceed only if we have read a letter sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_output_string(p->stack,mot); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MAJ_LN5))) { // case of upper case letter sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) { while (pos2+p->current_origin<p->text_buffer->size && is_upper(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // we proceed only if we have read an upper case letter sequence // which is not 
followed by a lower case letter // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MIN_LN5))) { // case of lower case letter sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) { while (pos2+p->current_origin<p->text_buffer->size && is_lower(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // we proceed only if we have read a lower case letter sequence // which is not followed by an upper case letter // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PRE_LN5))) { // case of a sequence beginning by an upper case letter if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if 
(p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || (is_upper(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) { while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // we proceed only if we have read a letter sequence // which is not followed by a letter // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PNC_LN5))) { // case of a punctuation sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar C=p->buffer[pos2+p->current_origin]; if (C==';' || C=='!' || C=='?' 
|| C==':' || C==0xbf || C==0xa1 || C==0x0e4f || C==0x0e5a || C==0x0e5b || C==0x3001 || C==0x3002 || C==0x30fb) { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,C); } scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p); } else { // we consider the case of ... // BUG: if ... appears at the end of the buffer if (C=='.') { if ((pos2+p->current_origin+2)<p->text_buffer->size && p->buffer[pos2+p->current_origin+1]=='.' && p->buffer[pos2+p->current_origin+2]=='.') { traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the ... we have read push(p->stack,C);push(p->stack,C);push(p->stack,C); } scan_graph(n_graph,t->state_number,pos2+3,depth,liste_arrivee,mot_token_buffer,p); } else { // we consider the . 
as a normal punctuation sign traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,C); } scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p); } } } } else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_E_LN3))) { // case of an empty sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p); } else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_CIRC_LN3))) { // case of a new line sequence if (p->buffer[pos+p->current_origin]=='\n') { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,'\n'); } scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,'#')) && (!(etiq->control&RESPECT_CASE_TAG_BIT_MASK))) { // case of a no space condition if (p->buffer[pos+p->current_origin]!=' ') { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,' '))) { // case of an obligatory space if (p->buffer[pos+p->current_origin]==' ') { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we 
are in MERGE mode, we add to ouput the char we have read push(p->stack,' '); } scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast5(contenu,ETIQ_L_LN3))) { // case of a single letter if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; if (is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,p->buffer[pos2+p->current_origin]); } scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p); } } else { // case of a normal letter sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; if (etiq->control&RESPECT_CASE_TAG_BIT_MASK) { // case of exact case match int position=0; while (pos2+p->current_origin<p->text_buffer->size && p->buffer[pos2+p->current_origin]==contenu[position]) { pos2++; position++; } if (contenu[position]=='\0' && position!=0 && !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) { // we proceed only if we have exactly read the contenu sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read 
push_input_string(p->stack,contenu,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } else { // case of variable case match // the letter sequences may have been caught by the arbre_etiquette structure int position=0; unichar* mot=mot_token_buffer; while (pos2+p->current_origin<p->text_buffer->size && is_equal_or_uppercase(contenu[position],p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (contenu[position]=='\0' && position!=0 && !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) { // we proceed only if we have exactly read the contenu sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } } t=t->next; } }