Пример #1
0
/**
 * Reads the start and end positions of each token stored in the file
 * produced by Tokenize's --output_offsets option.
 *
 * Every line is parsed as three integers. The first one is read but not
 * stored; the second and the third ones are appended, in that order, to
 * the result vector, so that the returned vector contains two values per
 * accepted line.
 *
 * Lines from which three integers cannot be read (blank or malformed lines)
 * are ignored instead of pushing indeterminate values into the vector.
 *
 * @param vec  encoding configuration passed to u_fopen
 * @param name name of the offset file to load
 * @return a newly allocated vector, or NULL if the file cannot be opened
 */
vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) {
   return NULL;
}
vector_int* v=new_vector_int();
Ustring* line=new_Ustring();
while (EOF!=readline(line,f)) {
	int a=0,b=0,c=0;
	if (3!=u_sscanf(line->str,"%d%d%d",&a,&b,&c)) {
		/* Not a complete "number start end" triple: storing b and c here
		 * would record values that were never read from the file */
		continue;
	}
	vector_int_add(v,b);
	vector_int_add(v,c);
}
free_Ustring(line);
u_fclose(f);
return (vector_uima_offset*)v;
}
Пример #2
0
/**
 * Looks up the token 's' in the hash table and returns its number.
 * When the token is met for the first time, a copy of it is appended to
 * 'tokens', the index returned by vector_ptr_add becomes its number and a
 * zero occurrence counter is appended to 'n_occur'. In every case, the
 * occurrence counter of the token is then increased by one.
 */
int get_token_number(unichar* s,vector_ptr* tokens,struct hash_table* hashtable,vector_int* n_occur) {
int status;
struct any* entry=get_value(hashtable,s,HT_INSERT_IF_NEEDED,&status);
if (status==HT_KEY_ADDED) {
   /* Brand new key: register the token and create its counter */
   entry->_int=vector_ptr_add(tokens,u_strdup(s));
   vector_int_add(n_occur,0);
}
const int token_number=entry->_int;
/* One more occurrence of this token */
n_occur->tab[token_number]+=1;
return token_number;
}
Пример #3
0
/*
 * Test driver for the typed vector API: mostly vector_int, plus a few calls
 * on vector_double, vector_test_t and vector_char. Each step calls one
 * operation, usually prints a banner and the vector contents on stdout, and
 * checks the expected size, capacity and element values with assert().
 * Returns 0 when every assertion holds.
 *
 * NOTE(review): only 'order' and 'haha' are destroyed before returning.
 * 'a' is destroyed once but then initialized again and never destroyed;
 * the other vectors are never destroyed at all. Confirm whether this is
 * acceptable for the test (e.g. under a leak checker).
 */
int main()
{
	/* --- construction: 3 elements, the first one is expected to be 0 --- */
	vector_int a;
	vector_int_init(&a,3);
	printf("<<<init &a 3>>>\n");
	print_vector_int(&a, stdout);
	assert(3 == vector_int_size(&a));
	assert(3 == vector_int_capacity(&a));
	assert(0 == vector_int_e(&a, 0));

	/* --- insertion of 10 at index 2: size and capacity go from 3 to 4 --- */
	vector_int_insert(&a, 2, 10);
	printf("<<<insert &a, 2, 10>>>\n");
	assert(4 == vector_int_capacity(&a));
	assert(4 == vector_int_size(&a));
	assert(10 == vector_int_e(&a, 2));
	assert(10 == *vector_int_e_ptr(&a, 2));
	print_vector_int(&a, stdout);

	/* --- push_back on a full vector: capacity is expected to jump from 4 to 8 --- */
	vector_int_push_back(&a, 60);
	printf("<<<push back &a, 60>>>\n");
	assert(60 == vector_int_tail(&a));
	assert(8 == vector_int_capacity(&a));
	assert(5 == vector_int_size(&a));
	assert(60 == vector_int_max(&a));
	assert(4 == vector_int_which_max(&a));
	assert(1 == vector_int_contains(&a, 60));
	assert(0 == vector_int_contains(&a, 100));
	print_vector_int(&a, stdout);

	/* --- resize_min: capacity is expected to shrink to the size (5) --- */
	vector_int_resize_min(&a);
	printf("<<<resize min &a>>>\n");
	assert(60 == vector_int_tail(&a));
	assert(5 == vector_int_capacity(&a));
	assert(5 == vector_int_size(&a));
	print_vector_int(&a, stdout);

	/* --- pop_back returns the last element and leaves the capacity unchanged --- */
	printf("<<<pop_back &a>>>\n");
	assert(60 == vector_int_pop_back(&a));
	assert(5 == vector_int_capacity(&a));
	assert(4 == vector_int_size(&a));
	print_vector_int(&a, stdout);

	/* --- set + min/which_min.
	 * NOTE(review): the banner says 12 while the value stored is -12 --- */
	vector_int_set(&a, 2, -12);
	printf("<<<set &a 2 12>>>\n");
	assert(-12 == VECTOR(a)[2]);
	assert(5 == vector_int_capacity(&a));
	assert(4 == vector_int_size(&a));
	assert(-12 == vector_int_min(&a));
	assert(2 == vector_int_which_min(&a));
	print_vector_int(&a, stdout);


	/* --- combined min/max queries, by value and by index --- */
	vector_int_set(&a, 1, 12);
	int min, max;
	int which_min, which_max;
	vector_int_minmax(&a,&min, &max);
	vector_int_which_minmax(&a, &which_min, &which_max);
	assert(-12 == min);
	assert(12 == max);
	assert(2 == which_min);
	assert(1 == which_max);

	/* --- reserve changes the capacity only --- */
	vector_int_reserve(&a, 10);
	printf("<<<reserve &a 10>>>\n");
	assert(10 == vector_int_capacity(&a));
	assert(4 == vector_int_size(&a));
	print_vector_int(&a, stdout);

	/* --- null: element 2 is expected to become 0, size and capacity unchanged --- */
	vector_int_null(&a);
	printf("<<<null &a>>>\n");
	assert(10 == vector_int_capacity(&a));
	assert(4 == vector_int_size(&a));
	assert(0 == VECTOR(a)[2]);
	print_vector_int(&a, stdout);

	/* --- fill: element 2 is expected to become 15 --- */
	vector_int_fill(&a, 15);
	printf("<<<fill &a 15>>>\n");
	assert(10 == vector_int_capacity(&a));
	assert(4 == vector_int_size(&a));
	assert(15 == VECTOR(a)[2]);
	print_vector_int(&a, stdout);

	/* --- clear: size drops to 0, capacity is kept --- */
	vector_int_clear(&a);
	printf("<<<clear &a>>>\n");
	assert(10 == vector_int_capacity(&a));
	assert(0 == vector_int_size(&a));
	print_vector_int(&a, stdout);

	/* --- destroy; the state after destruction is deliberately not checked --- */
	vector_int_destroy(&a);
	printf("<<<destroy &a>>>\n");
//	assert(0 == vector_int_capacity(&a));
//	assert(0 == vector_int_size(&a));
//	print_vector_int(&a, stdout);

	/* --- 'a' is initialized again, this time from a plain C array --- */
	int hehe[5] = {1,2,3,4,5};
	vector_int_init_copy(&a, hehe, 5);
	printf("<<<init_copy &a {1,2,3,4,5}>>>\n");
	assert(5 == vector_int_capacity(&a));
	assert(5 == vector_int_size(&a));
	assert(3 == VECTOR(a)[2]);
	print_vector_int(&a, stdout);

	/* --- copy_to: round trip back into a plain C array --- */
	int hehe2[5];
	vector_int_copy_to(&a, hehe2);
	for (int i = 0; i < 5; i++)
		assert(hehe[i] == hehe2[i]);

	/* --- arithmetic: after add_constant, 'a' is expected to hold {2,3,4,5,6} --- */
	vector_int_add_constant(&a, 1);
	printf("<<<add_constant &a 1>>>\n");
	assert(4 == VECTOR(a)[2]);
	print_vector_int(&a, stdout);
	vector_int a2;
	vector_int af;
	vector_int_init(&af,0);
	vector_int_init_value(&a2, 5, 5,4,3,2,1);
	/* a2 += a: element 2 is expected to be 3 + 4 */
	vector_int_add(&a2, &a);
	assert(7 == VECTOR(a2)[2]);
	print_vector_int(&a2, stdout);
	/* The frequency computation is run twice on the same input; the results
	 * are only printed, not asserted */
	vector_int_ele_freq_min_max(&af, &a2, &min, &max);
	print_vector_int(&af,stdout);
	vector_int_ele_freq_min_max(&af, &a2, &min, &max);
	print_vector_int(&af,stdout);
	/* a2 -= a: back to the initial value 3 at index 2 */
	vector_int_sub(&a2, &a);
	assert(3 == VECTOR(a2)[2]);
	print_vector_int(&a2, stdout);
	/* The last element of the cumulative sum of 'a' must equal the sum of 'a' */
	vector_int_cumsum(&a2, &a);
	assert(20 == VECTOR(a2)[4]);
	print_vector_int(&a2, stdout);
	int sum = vector_int_sum(&a);
	assert(sum == VECTOR(a2)[4]);


	/* --- copy into 'b', which has not been initialized beforehand --- */
	vector_int b;
	printf("<<<copy &a &b>>>\n");
	print_vector_int(&a, stdout);
	vector_int_copy(&b, &a);
	for (int i = 0; i < vector_int_size(&b); i++)
		assert(VECTOR(a)[i] == VECTOR(b)[i]);
	print_vector_int(&b, stdout);

	/* --- init_value.
	 * NOTE(review): a count of 5 is given but six values are passed; the
	 * assertions below expect a size of 5 --- */
	vector_int b2;
	vector_int_init_value(&b2, 5, 0,1,2,3,4,5);
	printf("<<<init_value &b2 5 0,1,2,3,4,5>>>\n");
	assert(5 == vector_int_capacity(&b2));
	assert(5 == vector_int_size(&b2));
	assert(2 == VECTOR(b2)[2]);
	print_vector_int(&b2, stdout);

	/* --- remove_section(1, 3): two elements are removed, so that the value 3
	 * ends up at index 1 --- */
	vector_int_remove_section(&b2, 1, 3);
	printf("<<<remove_section &b2 1 3>>>\n");
	assert(5 == vector_int_capacity(&b2));
	assert(3 == vector_int_size(&b2));
	assert(3 == VECTOR(b2)[1]);
	print_vector_int(&b2, stdout);

	/* --- remove of the single element at index 1 --- */
	vector_int_remove(&b2, 1);
	printf("<<<remove &b2, 1>>>\n");
	assert(5 == vector_int_capacity(&b2));
	assert(2 == vector_int_size(&b2));
	assert(4 == VECTOR(b2)[1]);
	print_vector_int(&b2, stdout);

	/* --- init_value_end: the values located before the end marker 14 are
	 * expected to be stored (size 2 here) --- */
	vector_int c;
	vector_int_init_value_end(&c, 14, 1,2,14,4);
	printf("<<<init_value_end &c 14 1,2,14,4>>>\n");
	assert(2 == vector_int_capacity(&c));
	assert(2 == vector_int_size(&c));
	assert(2 == VECTOR(c)[1]);
	print_vector_int(&c, stdout);


	/* --- ordering on two keys and tie scanning.
	 * NOTE(review): the banner below says 7 while 8 is passed to both calls;
	 * the meaning of this last argument is not visible here --- */
	vector_int v1,v2,res,res2;
	vector_int_init_value(&v1, 8, 4,5,2,3,7,1,6,1);
	vector_int_init_value(&v2, 8, 3,4,1,7,2,5,1,3);
	vector_int_init(&res, 1);
	vector_int_init(&res2, 1);
	print_vector_int(&res2, stdout);
	vector_int_order_inc2(&v1, &v2, &res, 8);
	vector_int_scan_tie(&res2, &v1, &res, 8);
	printf("<<<order&v1, &v2, &res, 7>>>\n");
	print_vector_int(&v1, stdout);
	print_vector_int(&v2, stdout);
	print_vector_int(&res, stdout);
	print_vector_int(&res2, stdout);
	/* Expected content of 'res' */
	assert(7 == VECTOR(res)[0]);
	assert(5 == VECTOR(res)[1]);
	assert(2 == VECTOR(res)[2]);
	assert(3 == VECTOR(res)[3]);
	assert(0 == VECTOR(res)[4]);
	assert(1 == VECTOR(res)[5]);
	assert(6 == VECTOR(res)[6]);
	assert(4 == VECTOR(res)[7]);

	/* Expected content of 'res2' */
	assert(0 == VECTOR(res2)[0]);
	assert(0 == VECTOR(res2)[1]);
	assert(2 == VECTOR(res2)[2]);
	assert(3 == VECTOR(res2)[3]);
	assert(4 == VECTOR(res2)[4]);
	assert(5 == VECTOR(res2)[5]);
	assert(6 == VECTOR(res2)[6]);
	assert(7 == VECTOR(res2)[7]);


	/* --- vector_double: the three constructors are only printed, not asserted --- */
	vector_double d;
	vector_double_init(&d, 3);
	print_vector_double(&d, stdout);

	vector_double e;
	vector_double_init_value(&e, 3, 1.0, 1.2, 1.3);
	print_vector_double(&e, stdout);

	vector_double f;
	vector_double_init_value_end(&f, 1.4, 1.0, 1.3, 1.4, 10.0);
	print_vector_double(&f, stdout);

	/* --- vector of test_t: lookup through the test_t_op callback --- */
	vector_test_t g;
	vector_test_t_init(&g, 3);
	test_t tmp_test_t = {1,3};
	vector_test_t_set(&g, 1, tmp_test_t);
	print_vector_test_t(&g, stdout);
	assert(1 == vector_test_t_contains_op(&g, tmp_test_t, test_t_op));

	/* --- vector_char: the storage is filled by hand and printed as a
	 * NUL-terminated C string --- */
	vector_char aaa;
	vector_char_init(&aaa, 4);
	VECTOR(aaa)[0] = 'a';
	VECTOR(aaa)[1] = 'a';
	VECTOR(aaa)[2] = 'a';
	VECTOR(aaa)[3] = '\0';
	printf("%s\n",VECTOR(aaa));

	/* --- increasing and decreasing orders; the results are only printed --- */
	vector_int haha;
	vector_int_init_value_end(&haha, -1, 3,4,7,1,6,5, -1);
	print_vector_int(&haha, stdout);
	vector_int order;
	vector_int_init(&order, 0);
	vector_int_order_inc(&haha, &order,8); 
	print_vector_int(&order, stdout);
	vector_int_order_dec(&haha, &order,8);
	print_vector_int(&order, stdout);
	vector_int_destroy(&order);
	vector_int_destroy(&haha);
	return 0;
}
Пример #4
0
/**
 * This function removes all non ambiguous outputs from the given match list.
 * A match is kept if and only if are_ambiguous() holds between it and the
 * match that immediately precedes or follows it in the original list. Every
 * other cell is unlinked and released with free_match_list_element(). On
 * return, *list is the head of the filtered list, or NULL if nothing was kept.
 *
 * If renumber is non NULL, we have renumber[x]=y, where x is the position
 * of a match in the filtered list, and y its corresponding number in the
 * unfiltered original one. If renumber is NULL, no numbering is recorded.
 */
void filter_unambiguous_outputs(struct match_list* *list,vector_int* renumber) {
if (list==NULL || *list==NULL) return;
/* 'link' always designates the pointer that must receive the next kept
 * cell: the list head at first, then the 'next' field of the last kept cell */
struct match_list* *link=list;
struct match_list* l=*list;
int ambiguous_with_previous=0;
int original_match_number=0;
while (l!=NULL) {
  /* We save the successor now, since the current cell may be freed */
  struct match_list* next=l->next;
  /* Each pair of neighbours is compared only once: the result is reused
   * at the next iteration as 'ambiguous_with_previous' */
  int ambiguous_with_next=(next!=NULL && are_ambiguous(l,next));
  if (ambiguous_with_previous || ambiguous_with_next) {
    /* We have to keep the current cell */
    *link=l;
    link=&(l->next);
    if (renumber!=NULL) {
      vector_int_add(renumber,original_match_number);
    }
  } else {
    /* The cell is ambiguous with none of its neighbours: we delete it */
    free_match_list_element(l);
  }
  ambiguous_with_previous=ambiguous_with_next;
  l=next;
  original_match_number++;
}
/* We terminate the filtered list. This also sets *list to NULL when
 * no cell at all has been kept */
*link=NULL;
}
Пример #5
0
/**
 * Explores the given dictionary to match the given word.
 *
 * The function reads the dictionary state located at 'offset' and recursively
 * follows its transitions. Besides exact matches (the transition character is
 * equal to word[pos_word], or word[pos_word] is its uppercase form), it tries
 * the spelling error operations allowed by 'cfg': SP_SWAP, SP_CHANGE,
 * SP_SUPPR and SP_INSERT. Each operation is tried only while the global
 * error counter and the counter of this operation have not reached their
 * maximum values.
 *
 * - offset:    offset of the dictionary state to explore
 * - word:      the word being matched
 * - pos_word:  current position in 'word'
 * - d:         the dictionary to explore
 * - cfg:       error counters and limits, plus 'pairs', a vector to which
 *              (position, operation code) couples are appended for each
 *              error hypothesis being explored
 * - output:    output being built while reading transitions; it is restored
 *              with restore_output before returning
 * - list:      passed to deal_with_matches when a match is found
 * - base:      passed to deal_with_matches; set to output->len once a final
 *              state has been met
 * - inflected: receives the characters of the transitions being followed
 *
 * cfg->pairs->nbelems and the error counters modified here are set back to
 * their entry values after each attempt.
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg,
		Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) {
/* Entry values, needed by the SP_SUPPR and SP_INSERT cases, which must
 * not see the modifications made to 'offset' and 'base' below */
int original_offset=offset;
int original_base=base;
int final,n_transitions,inf_code;
/* State of the output before any transition of this state is read */
int z=save_output(output);
/* Size of the pairs vector at entry, used to discard what is added here */
int size_pairs=cfg->pairs->nbelems;
offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code);
if (final) {
	if (word[pos_word]=='\0') {
		/* If we have a match */
		deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list);
	}
	base=output->len;
}
/* If we are at the end of the token, then we stop */
if (word[pos_word]=='\0') {
	return;
}
/* Length of 'inflected' at this level, used to undo the u_strcat calls */
unsigned int l2=inflected->len;
unichar c;
int dest_offset;
for (int i=0;i<n_transitions;i++) {
	restore_output(z,output);
	offset=read_dictionary_transition(d,offset,&c,&dest_offset,output);
	/* For backup_output, see comment below */
	int backup_output=save_output(output);
	if (c==word[pos_word] || word[pos_word]==u_toupper(c)) {
		/* Normal case: the transition matches the current character */
		u_strcat(inflected,c);
		explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
	} else {
		/* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */
		if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP
				&& is_letter_swap(cfg,word,pos_word,inflected,c)) {
			/* We don't modify the number of errors since we override an existing
			 * SP_CHANGE_XXX one */
			cfg->current_SP_SWAP++;
			/* We override the previous change */
			int a=cfg->pairs->tab[cfg->pairs->nbelems-2];
			int b=cfg->pairs->tab[cfg->pairs->nbelems-1];
			cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT;
			u_strcat(inflected,c);
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* We put back the couple that was overridden */
			cfg->pairs->tab[cfg->pairs->nbelems-2]=a;
			cfg->pairs->tab[cfg->pairs->nbelems-1]=b;
			cfg->current_SP_SWAP--;
		} else /* We deal with the SP_CHANGE case */
		       if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE
				/* We want letters, not spaces or anything else */
				&& is_letter(c,NULL)
		        /* We do not allow the replacement of a lowercase letter by an uppercase
		         * letter at the beginning of the word like Niserable, unless the whole word
		         * is in uppercase or the letter is the same, modulo the case */
		        && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) {
			cfg->current_errors++;
			cfg->current_SP_CHANGE++;
			/* Now we test all possible kinds of change */
			vector_int_add(cfg->pairs,pos_word);
			u_strcat(inflected,c);
			/* We always add the default case */
			vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT);
			/* Size of the pairs vector once the couple of this change has
			 * been added; each variant below starts again from this size and
			 * only replaces the operation code stored in the last cell */
			int n_elem=cfg->pairs->nbelems;
			explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			/* Then we test the accent case */
			if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) {
				/* After a call to explore_dic, we must restore the output.
				 * But, when dealing with SP_CHANGE_XXX ops, we must restore the
				 * output including the output associated to the current transition,
				 * which is why we don't use z (output before the current transition)
				 * but backup_output */
				restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
			    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC;
			    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And the case variations */
			if (u_tolower(c)==u_tolower(word[pos_word])) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			/* And finally the position on keyboard */
			if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) {
			    restore_output(backup_output,output);
			    cfg->pairs->nbelems=n_elem;
				cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD;
				explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
			}
			cfg->pairs->nbelems=size_pairs;
			cfg->current_errors--;
			cfg->current_SP_CHANGE--;
			/* End of the SP_CHANGE case */
		}
	}
    restore_output(backup_output,output);
	truncate(inflected,l2);
	/* Now we deal with the SP_SUPPR case */
	if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR
		/* We want letters, not spaces or anything else */
		&& is_letter(c,NULL)) {
		cfg->current_errors++;
		cfg->current_SP_SUPPR++;
		vector_int_add(cfg->pairs,pos_word);
		if (pos_word>=1 && c==word[pos_word-1]) {
			vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE);
		} else {
			vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT);
		}
		u_strcat(inflected,c);
		/* The transition is followed without consuming any character of
		 * the word: pos_word is left unchanged */
		explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected);
		truncate(inflected,l2);
		cfg->pairs->nbelems=size_pairs;
		cfg->current_errors--;
		cfg->current_SP_SUPPR--;
	}
}
restore_output(z,output);
/* Finally, we deal with the SP_INSERT case, by calling again the current
 * function with the same parameters, except pos_word that will be increased of 1 */
if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT
	/* We want letters, not spaces or anything else */
	&& is_letter(word[pos_word],NULL)
	/* We do not allow the insertion of a capital letter at the beginning of
	 * the word like Astreet, unless the whole word is in uppercase like ASTREET */
    && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) {
	cfg->current_errors++;
	cfg->current_SP_INSERT++;
	vector_int_add(cfg->pairs,pos_word);
	if (pos_word>=1 && word[pos_word]==word[pos_word-1]) {
		vector_int_add(cfg->pairs,SP_INSERT_DOUBLE);
	} else {
		vector_int_add(cfg->pairs,SP_INSERT_DEFAULT);
	}
	explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected);
	truncate(inflected,l2);
	cfg->pairs->nbelems=size_pairs;
	cfg->current_errors--;
	cfg->current_SP_INSERT--;
}
/* Finally, we restore the output as it was when we enter the function */
restore_output(z,output);
}
Пример #6
0
/**
 * Tokenizes the text read from 'f', considering that every character is
 * a token, with two exceptions:
 * - a sequence of separators (space, 0x0d, 0x0a) is turned into a single
 *   " " token. If the sequence contains a new-line, the position of this
 *   token is added to 'n_enter_pos';
 * - a tag {...} is read as one token. "{S}" increases *SENTENCES. Any
 *   tag other than "{S}" and "{STOP}" must be accepted by check_tag_token.
 *
 * The number of each token, as returned by get_token_number, is written
 * as a 4-byte value into 'coded_text'. Once the whole input has been
 * read, the tokens are written into 'output', one per line.
 *
 * *TOKENS_TOTAL is increased for each token. For single character tokens,
 * *WORDS_TOTAL is increased if the character is a letter of 'alph', else
 * *DIGITS_TOTAL is increased if it is an ASCII digit.
 *
 * A fatal error is raised if a tag contains a new-line, has no ending }
 * (including when the end of the file is reached inside the tag), is too
 * long to fit in MAX_TAG_LENGTH characters, or is invalid.
 */
void char_by_char_tokenization(U_FILE* f,U_FILE* coded_text,U_FILE* output,Alphabet* alph,
                               vector_ptr* tokens,struct hash_table* hashtable,
                               vector_int* n_occur,vector_int* n_enter_pos,
                               int *SENTENCES,int *TOKENS_TOTAL,int *WORDS_TOTAL,
                               int *DIGITS_TOTAL) {
int c;
unichar s[MAX_TAG_LENGTH];
int n;
int COUNT=0;
int current_megabyte=0;
c=u_fgetc(f);
while (c!=EOF) {
   COUNT++;
   /* COUNT is a number of characters: the progression is computed with
    * 1024*512 characters per unit */
   if ((COUNT/(1024*512))!=current_megabyte) {
      current_megabyte++;
      u_printf("%d megabytes read...         \r",(COUNT/(1024*512)));
   }
   if (c==' ' || c==0x0d || c==0x0a) {
      char ENTER=(c=='\n');
      // if the char is a separator, we jump all the separators
      while ((c=u_fgetc(f))==' ' || c==0x0d || c==0x0a) {
         if (c=='\n') ENTER=1;
         COUNT++;
      }
      s[0]=' ';
      s[1]='\0';
      n=get_token_number(s,tokens,hashtable,n_occur);
      /* If there is a \n, we note it */
      if (ENTER==1) {
         vector_int_add(n_enter_pos,*TOKENS_TOTAL);
      }
      (*TOKENS_TOTAL)++;
      fwrite(&n,4,1,coded_text);
      /* Here, c already contains the first character after the separators */
   }
   else if (c=='{') {
     s[0]='{';
     int z=1;
     /* We must stop on EOF as well, otherwise (unichar)EOF would be stored
      * again and again until the buffer is full */
     while (z<(MAX_TAG_LENGTH-1) && (c=u_fgetc(f))!=EOF && c!='}' && c!='{' && c!='\n') {
        s[z++]=(unichar)c;
        COUNT++;
     }
     if (c=='\n') {
        // if the tag contains a return
        fatal_error("Error: a tag containing a new-line sequence has been found\n");
     }
     if (z==(MAX_TAG_LENGTH-1) || c!='}') {
        // if the tag has no ending }
        if (z==(MAX_TAG_LENGTH-1)) {z--;}
        s[z]='\0';
        fatal_error("Error: a tag without ending } has been found:\n==>%S<==\n",s);
     }
     /* Here z<=MAX_TAG_LENGTH-2, so there is room for both } and \0 */
     s[z]='}';
     s[z+1]='\0';
     if (!u_strcmp(s,"{S}")) {
        // if we have found a sentence delimiter
        (*SENTENCES)++;
     } else {
        if (u_strcmp(s,"{STOP}") && !check_tag_token(s)) {
           // if a tag is incorrect, we exit
           fatal_error("The text contains an invalid tag. Unitex cannot process it.");
        }
     }
     n=get_token_number(s,tokens,hashtable,n_occur);
     (*TOKENS_TOTAL)++;
     fwrite(&n,4,1,coded_text);
     c=u_fgetc(f);
   }
   else {
      s[0]=(unichar)c;
      s[1]='\0';
      n=get_token_number(s,tokens,hashtable,n_occur);
      (*TOKENS_TOTAL)++;
      if (is_letter((unichar)c,alph)) (*WORDS_TOTAL)++;
      else if (c>='0' && c<='9') (*DIGITS_TOTAL)++;
      fwrite(&n,4,1,coded_text);
      c=u_fgetc(f);
   }
}
/* We save the tokens, one per line */
for (n=0;n<tokens->nbelems;n++) {
   u_fprintf(output,"%S\n",tokens->tab[n]);
}
}
Пример #7
0
/**
 * Appends one token shift to the given snt offsets, as three consecutive
 * integers: the token position, the shift before it and the shift after it.
 */
void add_snt_offsets(vector_int* snt_offsets,int token_pos,int shift_before,int shift_after) {
const int triple[3]={token_pos,shift_before,shift_after};
for (int i=0;i<3;i++) {
   vector_int_add(snt_offsets,triple[i]);
}
}