/**
 * Loads the initial keyword list from a tok_by_freq.txt file,
 * and turns all those tokens in a list whose primary key is the
 * lower case token:
 * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50)
 */
struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) return NULL;
Ustring* line=new_Ustring(128);
Ustring* lower=new_Ustring(128);
struct string_hash_ptr* res=new_string_hash_ptr(1024);
int val,pos;
/* We skip the first line of the file, containing the number
 * of tokens
 */
if (EOF==readline(line,f)) {
	fatal_error("Invalid empty file %s\n",name);
}
while (EOF!=readline(line,f)) {
	if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) {
		fatal_error("Invalid line in file %s:\n%S\n",name,line->str);
	}
	u_strcpy(lower,line->str+pos);
	u_tolower(lower->str);
	int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL);
	if (index==-1) {
		fatal_error("Internal error in load_tokens_by_freq\n");
	}
	KeyWord* value=(KeyWord*)res->value[index];
	res->value[index]=new_KeyWord(val,line->str+pos,value);
}
free_Ustring(line);
free_Ustring(lower);
u_fclose(f);
return res;
}
/**
 * Tells whether every tag of the given .fst2 carries a valid sentence
 * automaton output. Returns 1 if so, 0 as soon as one tag is rejected.
 *
 * Tag #0 is expected to be <E> without output; anything else is reported
 * with fatal_error.
 */
static int valid_outputs(Fst2* fst2) {
if (u_strcmp(fst2->tags[0]->input,"<E>") || fst2->tags[0]->output!=NULL) {
   /* Should never happen */
   fatal_error("valid_outputs: the first tag of the .fst2 should be <E>\n");
}
for (int t=1;t<fst2->number_of_tags;t++) {
   const int is_epsilon=!u_strcmp(fst2->tags[t]->input,"<E>");
   /* Condition 3: no tag of the form <E>/XYZ
    * Condition 4: <E> must be the only tag without output
    * Taken together: any other <E> tag is rejected whether or not it
    * has an output, and so is any tag that has no output */
   if (is_epsilon || fst2->tags[t]->output==NULL) {
      return 0;
   }
   /* Condition 5: the output must be made of exactly 6 integers. The
    * trailing %c would raise the count to 7 if something followed them */
   int field[6];
   char trailing;
   const int n_read=u_sscanf(fst2->tags[t]->output,"%d %d %d %d %d %d%c",
         &field[0],&field[1],&field[2],&field[3],&field[4],&field[5],&trailing);
   if (n_read!=6) {
      return 0;
   }
   /* Only the first four integers are range-checked: the 1st and 3rd
    * must be >=0, the 2nd and 4th must be >=-1 */
   if (!(field[0]>=0 && field[1]>=-1 && field[2]>=0 && field[3]>=-1)) {
      return 0;
   }
}
return 1;
}
Beispiel #3
0
/**
 * Loads one concordance line from 'f1' and/or 'f2' and prints them to
 * 'output' as two HTML table rows in the given color.
 *
 * output: file that receives the HTML rows
 * f1,f2:  concordance files; either one may be NULL, in which case no line
 *         is read from it and its row only contains "&nbsp;"
 * color:  value used for the font color and for the link color
 * match1: if not NULL, replaces the middle part read from 'f1'; its length
 *         is also given to read_concordance_line
 * match2: same as match1, for 'f2'
 *
 * When 'color' is RED and both files are present, the left contexts are
 * adjusted so that the two matches are aligned.
 */
void print_diff_matches(U_FILE* output,U_FILE* f1,U_FILE* f2,const char* color,
        unichar* match1,unichar* match2) {
unichar left1[MAX_CONTEXT_IN_UNITS];
unichar middle1[MAX_CONTEXT_IN_UNITS];
unichar right1[MAX_CONTEXT_IN_UNITS];
unichar indices1[MAX_CONTEXT_IN_UNITS];
unichar left2[MAX_CONTEXT_IN_UNITS];
unichar middle2[MAX_CONTEXT_IN_UNITS];
unichar right2[MAX_CONTEXT_IN_UNITS];
unichar indices2[MAX_CONTEXT_IN_UNITS];
if (f1!=NULL) {
   read_concordance_line(f1,left1,middle1,right1,indices1,match1==NULL?0:u_strlen(match1));
}
if (f2!=NULL) {
   read_concordance_line(f2,left2,middle2,right2,indices2,match2==NULL?0:u_strlen(match2));
}
/* NOTE(review): these copies assume that the matches fit in
 * MAX_CONTEXT_IN_UNITS units - to be confirmed against the callers */
if (match1!=NULL) u_strcpy(middle1,match1);
if (match2!=NULL) u_strcpy(middle2,match2);
/* The alignment needs both lines: the buffers of an absent file were
 * never filled, so they must not be read */
if (!strcmp(color,RED) && f1!=NULL && f2!=NULL) {
    /* If we have one match included in the another, we want to align
     * them. We do that by adjusting their left contexts */
    int pos1=0,pos2=0;
    /* We only adjust when both start positions were actually read, so
     * that we never compare values that u_sscanf did not set */
    if (1==u_sscanf(indices1,"%d",&pos1) && 1==u_sscanf(indices2,"%d",&pos2)) {
        if (pos1<pos2) {
            adjust(left1,left2,middle1,pos2-pos1);
        } else if (pos1>pos2) {
            adjust(left2,left1,middle2,pos1-pos2);
        } /*  Nothing to adjust if pos1==pos2 */
    }
}
/* We print the line from the first file, if needed */
u_fprintf(output,"<tr><td nowrap bgcolor=\"#FFE4C4\"><font color=\"%s\">",color);
if (f1!=NULL) {
   /* The indices string is used as the target of the link that wraps
    * the match */
   u_fprintf(output,"%HS<a href=\"%S\" style=\"color:%s\">%HS</a>%HS",left1,indices1,color,middle1,right1);
} else {
    u_fprintf(output,"&nbsp;");
}
u_fprintf(output,"</font></td></tr>\n");
u_fprintf(output,"<tr><td nowrap bgcolor=\"#90EE90\"><font color=\"%s\">",color);
/* We print the line from the second file, if needed */
if (f2!=NULL) {
   u_fprintf(output,"%HS<a href=\"%S\" style=\"color:%s\">%HS</a>%HS",left2,indices2,color,middle2,right2);
} else {
    u_fprintf(output,"&nbsp;");
}
u_fprintf(output,"</font></td></tr>\n");
}
Beispiel #4
0
/**
 * Reads the start and end positions of each token stored in the file
 * produced by Tokenize's --output_offsets option.
 *
 * Each line is expected to hold three integers. The first one is parsed
 * but not stored; the second and third ones are appended, in that order,
 * to the result, which therefore contains two values per accepted line.
 *
 * Lines that do not provide three integers (blank or malformed lines) are
 * skipped, so that no unset or left-over value is ever stored.
 *
 * Returns NULL if the file cannot be opened. Otherwise, returns the
 * vector of integers cast to vector_uima_offset*.
 */
vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) {
   return NULL;
}
vector_int* v=new_vector_int();
Ustring* line=new_Ustring();
while (EOF!=readline(line,f)) {
	int a=0,b=0,c=0;
	if (3!=u_sscanf(line->str,"%d%d%d",&a,&b,&c)) {
		/* Incomplete line: storing b and c here would record values
		 * that were not read from this line */
		continue;
	}
	vector_int_add(v,b);
	vector_int_add(v,c);
}
free_Ustring(line);
u_fclose(f);
return (vector_uima_offset*)v;
}
Beispiel #5
0
void Tag::parseNumeric() {
	if (tag.size() >= 256) {
		return;
	}
	UChar tkey[256];
	UChar top[256];
	UChar txval[256];
	UChar spn[] = { '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 };
	tkey[0] = 0;
	top[0] = 0;
	txval[0] = 0;
	if (u_sscanf(tag.c_str(), "%*[<]%[^<>=:!]%[<>=:!]%[-MAXIN0-9]%*[>]", &tkey, &top, &txval) == 3 && top[0] && txval[0]) {
		int32_t tval = 0;
		int32_t r = u_strspn(txval, spn);
		if (txval[0] == 'M' && txval[1] == 'A' && txval[2] == 'X' && txval[3] == 0) {
			tval = std::numeric_limits<int32_t>::max();
		}
		else if (txval[0] == 'M' && txval[1] == 'I' && txval[2] == 'N' && txval[3] == 0) {
			tval = std::numeric_limits<int32_t>::min();
		}
		else if (txval[r] || u_sscanf(txval, "%d", &tval) != 1) {
			return;
		}
		if (top[0] == '<') {
			comparison_op = OP_LESSTHAN;
		}
		else if (top[0] == '>') {
			comparison_op = OP_GREATERTHAN;
		}
		else if (top[0] == '=' || top[0] == ':') {
			comparison_op = OP_EQUALS;
		}
		else if (top[0] == '!') {
			comparison_op = OP_NOTEQUALS;
		}
		if (top[1]) {
			if (top[1] == '=' || top[1] == ':') {
				if (comparison_op == OP_GREATERTHAN) {
					comparison_op = OP_GREATEREQUALS;
				}
				else if (comparison_op == OP_LESSTHAN) {
					comparison_op = OP_LESSEQUALS;
				}
				else if (comparison_op == OP_NOTEQUALS) {
					comparison_op = OP_NOTEQUALS;
				}
			}
			else if (top[1] == '>') {
				if (comparison_op == OP_EQUALS) {
					comparison_op = OP_GREATEREQUALS;
				}
				else if (comparison_op == OP_LESSTHAN) {
					comparison_op = OP_NOTEQUALS;
				}
			}
			else if (top[1] == '<') {
				if (comparison_op == OP_EQUALS) {
					comparison_op = OP_LESSEQUALS;
				}
				else if (comparison_op == OP_GREATERTHAN) {
					comparison_op = OP_NOTEQUALS;
				}
			}
		}
		comparison_val = tval;
		comparison_hash = hash_value(tkey);
		type |= T_NUMERICAL;
	}
}