/** * Loads the initial keyword list from a tok_by_freq.txt file, * and turns all those tokens in a list whose primary key is the * lower case token: * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50) */ struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return NULL; Ustring* line=new_Ustring(128); Ustring* lower=new_Ustring(128); struct string_hash_ptr* res=new_string_hash_ptr(1024); int val,pos; /* We skip the first line of the file, containing the number * of tokens */ if (EOF==readline(line,f)) { fatal_error("Invalid empty file %s\n",name); } while (EOF!=readline(line,f)) { if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) { fatal_error("Invalid line in file %s:\n%S\n",name,line->str); } u_strcpy(lower,line->str+pos); u_tolower(lower->str); int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)res->value[index]; res->value[index]=new_KeyWord(val,line->str+pos,value); } free_Ustring(line); free_Ustring(lower); u_fclose(f); return res; }
/**
 * Checks that every tag of the given .fst2 carries a valid sentence
 * automaton output.
 *
 * Tag #0 must be <E> with no output; anything else is reported through
 * fatal_error. Every other tag must satisfy:
 *  - it is not of the form <E>/XYZ (condition 3);
 *  - it has an output, <E> being the only tag allowed to have none
 *    (condition 4);
 *  - its output scans as exactly 6 integers, with the 1st and 3rd ones
 *    >=0 and the 2nd and 4th ones >=-1 (condition 5).
 *
 * Returns 1 if all tags are valid; 0 otherwise.
 */
static int valid_outputs(Fst2* fst2) {
    if (u_strcmp(fst2->tags[0]->input,"<E>") || fst2->tags[0]->output!=NULL) {
        /* Should never happen */
        fatal_error("valid_outputs: the first tag of the .fst2 should be <E>\n");
    }
    int n=1;
    while (n<fst2->number_of_tags) {
        /* Condition 3: no tag of the form <E>/XYZ */
        if (!u_strcmp(fst2->tags[n]->input,"<E>") && fst2->tags[n]->output!=NULL) {
            return 0;
        }
        /* Condition 4: <E> must be the only tag without output */
        if (fst2->tags[n]->output==NULL) {
            return 0;
        }
        /* Condition 5: the trailing %c is only there to detect extra
         * content; a result of exactly 6 means the 6 integers were read
         * and the %c was not assigned */
        int v1,v2,v3,v4,v5,v6;
        char extra;
        int assigned=u_sscanf(fst2->tags[n]->output,"%d %d %d %d %d %d%c",&v1,&v2,&v3,&v4,&v5,&v6,&extra);
        if (assigned!=6) {
            /* If the output is not made of 6 integers */
            return 0;
        }
        /* Only the first four values are range-checked */
        if (!(v1>=0 && v2>=-1 && v3>=0 && v4>=-1)) {
            return 0;
        }
        n++;
    }
    return 1;
}
/** * This function loads concordance lines from 'f1' and/or 'f2' and prints them to * 'output' in the given color. */ void print_diff_matches(U_FILE* output,U_FILE* f1,U_FILE* f2,const char* color, unichar* match1,unichar* match2) { unichar left1[MAX_CONTEXT_IN_UNITS]; unichar middle1[MAX_CONTEXT_IN_UNITS]; unichar right1[MAX_CONTEXT_IN_UNITS]; unichar indices1[MAX_CONTEXT_IN_UNITS]; unichar left2[MAX_CONTEXT_IN_UNITS]; unichar middle2[MAX_CONTEXT_IN_UNITS]; unichar right2[MAX_CONTEXT_IN_UNITS]; unichar indices2[MAX_CONTEXT_IN_UNITS]; if (f1!=NULL) { read_concordance_line(f1,left1,middle1,right1,indices1,match1==NULL?0:u_strlen(match1)); } if (f2!=NULL) { read_concordance_line(f2,left2,middle2,right2,indices2,match2==NULL?0:u_strlen(match2)); } if (match1!=NULL) u_strcpy(middle1,match1); if (match2!=NULL) u_strcpy(middle2,match2); if (!strcmp(color,RED)) { /* If we have one match included in the another, we want to align * them. We do that by adjusting their left contexts */ int pos1,pos2; u_sscanf(indices1,"%d",&pos1); u_sscanf(indices2,"%d",&pos2); if (pos1<pos2) { adjust(left1,left2,middle1,pos2-pos1); } else if (pos1>pos2) { adjust(left2,left1,middle2,pos1-pos2); } /* Nothing to adjust if pos1==pos2 */ } /* We print the line from the first file, if needed */ u_fprintf(output,"<tr><td nowrap bgcolor=\"#FFE4C4\"><font color=\"%s\">",color); if (f1!=NULL) { u_fprintf(output,"%HS<a href=\"%S\" style=\"color:%s\">%HS</a>%HS",left1,indices1,color,middle1,right1); } else { u_fprintf(output," "); } u_fprintf(output,"</font></td></tr>\n"); u_fprintf(output,"<tr><td nowrap bgcolor=\"#90EE90\"><font color=\"%s\">",color); /* We print the line from the second file, if needed */ if (f2!=NULL) { u_fprintf(output,"%HS<a href=\"%S\" style=\"color:%s\">%HS</a>%HS",left2,indices2,color,middle2,right2); } else { u_fprintf(output," "); } u_fprintf(output,"</font></td></tr>\n"); }
/**
 * Reads the start and end positions of each token stored in the file
 * produced by Tokenize's --output_offsets option.
 *
 * Each line must contain three integers. The first one is read but not
 * stored; the second and third ones are appended, in that order, to the
 * returned vector, so that every line contributes two consecutive values.
 *
 * Returns NULL if the file cannot be opened. A line that does not contain
 * three integers is reported through fatal_error instead of silently
 * adding unparsed values to the vector.
 */
vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) {
    U_FILE* f=u_fopen(vec,name,U_READ);
    if (f==NULL) {
        return NULL;
    }
    vector_int* v=new_vector_int();
    Ustring* line=new_Ustring();
    int a,b,c;
    while (EOF!=readline(line,f)) {
        /* Without this check, b and c would be stale or uninitialized
         * on a malformed line */
        if (3!=u_sscanf(line->str,"%d%d%d",&a,&b,&c)) {
            fatal_error("Invalid line in file %s:\n%S\n",name,line->str);
        }
        vector_int_add(v,b);
        vector_int_add(v,c);
    }
    free_Ustring(line);
    u_fclose(f);
    /* The int vector is handed back under the vector_uima_offset type */
    return (vector_uima_offset*)v;
}
void Tag::parseNumeric() { if (tag.size() >= 256) { return; } UChar tkey[256]; UChar top[256]; UChar txval[256]; UChar spn[] = { '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 }; tkey[0] = 0; top[0] = 0; txval[0] = 0; if (u_sscanf(tag.c_str(), "%*[<]%[^<>=:!]%[<>=:!]%[-MAXIN0-9]%*[>]", &tkey, &top, &txval) == 3 && top[0] && txval[0]) { int32_t tval = 0; int32_t r = u_strspn(txval, spn); if (txval[0] == 'M' && txval[1] == 'A' && txval[2] == 'X' && txval[3] == 0) { tval = std::numeric_limits<int32_t>::max(); } else if (txval[0] == 'M' && txval[1] == 'I' && txval[2] == 'N' && txval[3] == 0) { tval = std::numeric_limits<int32_t>::min(); } else if (txval[r] || u_sscanf(txval, "%d", &tval) != 1) { return; } if (top[0] == '<') { comparison_op = OP_LESSTHAN; } else if (top[0] == '>') { comparison_op = OP_GREATERTHAN; } else if (top[0] == '=' || top[0] == ':') { comparison_op = OP_EQUALS; } else if (top[0] == '!') { comparison_op = OP_NOTEQUALS; } if (top[1]) { if (top[1] == '=' || top[1] == ':') { if (comparison_op == OP_GREATERTHAN) { comparison_op = OP_GREATEREQUALS; } else if (comparison_op == OP_LESSTHAN) { comparison_op = OP_LESSEQUALS; } else if (comparison_op == OP_NOTEQUALS) { comparison_op = OP_NOTEQUALS; } } else if (top[1] == '>') { if (comparison_op == OP_EQUALS) { comparison_op = OP_GREATEREQUALS; } else if (comparison_op == OP_LESSTHAN) { comparison_op = OP_NOTEQUALS; } } else if (top[1] == '<') { if (comparison_op == OP_EQUALS) { comparison_op = OP_LESSEQUALS; } else if (comparison_op == OP_GREATERTHAN) { comparison_op = OP_NOTEQUALS; } } } comparison_val = tval; comparison_hash = hash_value(tkey); type |= T_NUMERICAL; } }