コード例 #1
0
/* Shorthands for the corpus lookups repeated throughout the counting loop.
   They expand to exactly the expressions the loop spelled out before; all
   three corpora are indexed by the same token position. */
#define CORPUS_WORD(i)  ((char *)Darray_get(word_corpus,(i)))
#define CORRECT_TAG(i)  ((char *)Darray_get(correct_tag_corpus,(i)))
#define GUESS_RAW(i)    ((char *)Darray_get(guess_tag_corpus,(i)))
#define GUESS_TAG(i)    first_tag_nospace(GUESS_RAW(i))

/*
 * Contextual-rule learner driver.
 *
 * Arguments, as they are used below:
 *   argv[1]  correctly tagged corpus; one sentence per line, tokens are
 *            "word/tag"
 *   argv[2]  current guess tagging of the same corpus; it is re-read on
 *            every pass and replaced by the output of fix-kbest-rule-learn
 *            after each learned rule
 *   argv[3]  rule output file; the best rule of each pass is appended
 *   argv[4]  allowed-move file; each line is "word tag1 tag2 ..."
 *
 * Each pass of the outer loop:
 *   1. reads the guess file and counts (guess tag, correct tag) error pairs,
 *   2. sorts the pairs by frequency through a scratch file and sort(1),
 *   3. for every pair seen more than THRESHOLD3 times, counts the contexts
 *      of positions the pair would fix ("BADMATCH") and of positions it
 *      would break ("GOODMATCH") and hands the paired tables to
 *      check_counts(),
 *   4. applies the best rule to argv[2] and appends it to argv[3].
 * The loop stops once the best score is no longer above THRESHOLD.
 *
 * NOTE(review): localbest, localbestthing, localdif, wrong, right, staart
 * and the three corpus Darrays are declared outside this function;
 * presumably check_counts() is what updates localbest/localbestthing/
 * localdif - confirm against its definition.
 *
 * Returns 0 on normal termination, 1 on a usage or I/O error.
 */
int main(int argc, char *argv[])
{

  char onewdbfr[512],onewdaft[512];
  char onetagbfr[512],twotagbfr[512],onetagaft[512],twotagaft[512];
  char *freshcharvar;
  Darray errorlist,temperrorkey,temperrorval;
  Registry errorlistcount,SEENTAGGING,WORDS;

  FILE *correct_file, *guess_file, *error_list,*correct_out;
  char line[5000];  /* input line buffer */
  char **split_ptr,**split_ptr2;

  char wdpair[1024];
  char *tempstr,*tempstr2;
  float CONTINUE = 10000.0;
  int count2,numwrong,lengthcount;
  unsigned int count;
  char globalprint[500];
  char systemcall[500];
  char forpasting[500];
  char forpasting2[500];
  float globalbest = 0.0;
  char flag[20];
  Registry currentwd,currentwd2;
  Registry always,always2;
  Registry wdnexttag,wdnexttag2,wdprevtag,wdprevtag2;
  Registry rbigram,lbigram,rbigram2,lbigram2;
  Registry next1tag,next1tag2,prev1tag,prev1tag2;
  Registry next1or2tag,next1or2tag2,prev1or2tag,prev1or2tag2;
  Registry next1or2or3tag,next1or2or3tag2,prev1or2or3tag,prev1or2or3tag2;
  Registry next1wd,next1wd2,prev1wd,prev1wd2;
  Registry next1or2wd,next1or2wd2,prev1or2wd,prev1or2wd2;
  Registry nextbigram,nextbigram2,prevbigram,prevbigram2;
  Registry surroundtag,surroundtag2;
  Registry next2tag,next2tag2,prev2tag,prev2tag2;
  Registry next2wd,next2wd2,prev2wd,prev2wd2;
  char globaldif[20];
  int printscore;
  FILE *allowedmovefile;
  char **perl_split_ptr,**perl_split_ptr2,*atempstr,atempstr2[1024];
  char space[500];
  size_t linelen;
  int fieldidx;

  /* All four file arguments are dereferenced unconditionally below. */
  if (argc < 5) {
    fprintf(stderr,
            "usage: %s correct-corpus guess-corpus rule-output allowed-moves\n",
            (argc > 0 && argv[0] != NULL) ? argv[0] : "rule-learner");
    return 1;
  }

  /* globaldif is printed before the first rule is scored, so it must hold
     a valid (empty) string from the start. */
  globaldif[0] = '\0';

  SEENTAGGING = Registry_create(Registry_strcmp,Registry_strhash);
  Registry_size_hint(SEENTAGGING,GUESSNUMWORDS);
  WORDS = Registry_create(Registry_strcmp,Registry_strhash);
  Registry_size_hint(WORDS,GUESSNUMWORDS);

  /* ---- allowed moves: WORDS holds every listed word, SEENTAGGING every
          "word tag" pair listed for it ---- */
  allowedmovefile = fopen(argv[4], "r");
  if (allowedmovefile == NULL) {
    perror(argv[4]);
    return 1;
  }
  while(fgets(line,sizeof(line),allowedmovefile) != NULL) {
    if (not_just_blank(line)) {
      /* Drop the newline only if there is one; the last line of a file
         may lack it and must not lose a real character. */
      linelen = strlen(line);
      if (linelen > 0 && line[linelen - 1] == '\n')
        line[linelen - 1] = '\0';

      perl_split_ptr = perl_split(line);
      perl_split_ptr2 = perl_split_ptr;   /* head: the word, kept for freeing */
      ++perl_split_ptr;
      atempstr= mystrdup(*perl_split_ptr2);
      Registry_add(WORDS,atempstr,(char *)1);
      while(*perl_split_ptr != NULL) {
        /* NOTE(review): space[] is 500 bytes while line[] is 5000; an
           over-long "word tag" pair would overflow this buffer. */
        sprintf(space,"%s %s",*perl_split_ptr2,
                *perl_split_ptr);
        atempstr=mystrdup(space);

        Registry_add(SEENTAGGING,atempstr,(char *)1);
        ++perl_split_ptr;
      }
      /* The block is released by freeing the first element and then the
         pointer array. */
      free(*perl_split_ptr2);
      free(perl_split_ptr2);
    }
  }
  fclose(allowedmovefile);


  /* Remove any scratch file left behind by an earlier run. */
  system("/bin/rm AANEWRESTRJUNKKK");
  correct_tag_corpus = Darray_create();
  Darray_hint(correct_tag_corpus,100,400000);
  word_corpus = Darray_create();
  Darray_hint(word_corpus,100,400000);


  /* ---- correct corpus: fills word_corpus and correct_tag_corpus ---- */
  correct_file = fopen(argv[1],"r");
  if (correct_file == NULL) {
    perror(argv[1]);
    return 1;
  }

  while(fgets(line,sizeof(line),correct_file) != NULL) {
    /* Two start markers precede every sentence in both arrays. */
    Darray_addh(correct_tag_corpus,staart);
    Darray_addh(correct_tag_corpus,staart);
    Darray_addh(word_corpus,staart);
    Darray_addh(word_corpus,staart);
    linelen = strlen(line);
    if (linelen > 0 && line[linelen - 1] == '\n')
      line[linelen - 1] = '\0';
    split_ptr = perl_split_independent(line);
    split_ptr2 = split_ptr;   /* head of the token array, freed below */
    while (*split_ptr != NULL) {
      /* The token string becomes the word entry; it is cut at the first
         '/' found after its first character and the remainder becomes the
         tag entry. */
      Darray_addh(word_corpus,*split_ptr);
      while ((*(++*split_ptr)) != '/') {
        if (**split_ptr == '\0') {
          fprintf(stderr,"%s: token without a /tag part\n",argv[1]);
          return 1;
        }
      }
      **split_ptr = '\0';
      Darray_addh(correct_tag_corpus,++*split_ptr);
      ++split_ptr;
    }
    /* Only the pointer array is released; the token strings now belong to
       the corpus arrays. */
    free(split_ptr2);
  }
  fclose(correct_file);


  printf("READ IN CORRECT FILE\n");


  while(CONTINUE > THRESHOLD) {

    /* ---- current guess tagging ---- */
    guess_tag_corpus = Darray_create();
    Darray_hint(guess_tag_corpus,100,400000);
    guess_file  = fopen(argv[2],"r");
    if (guess_file == NULL) {
      perror(argv[2]);
      return 1;
    }
    while(fgets(line,sizeof(line),guess_file) != NULL) {

      Darray_addh(guess_tag_corpus,staart);
      Darray_addh(guess_tag_corpus,staart);
      linelen = strlen(line);
      if (linelen > 0 && line[linelen - 1] == '\n')
        line[linelen - 1] = '\0';
      split_ptr = perl_split_independent(line);
      split_ptr2 = split_ptr;
      while (*split_ptr != NULL) {
        /* Second '/'-separated field of the token is the guess entry. */
        tempstr = strtok(*split_ptr,"/");
        tempstr = strtok(NULL,"/");
        if (tempstr == NULL) {
          fprintf(stderr,"%s: token without a /tag part\n",argv[2]);
          return 1;
        }
        tempstr2  = mystrdup(tempstr);
        Darray_addh(guess_tag_corpus,tempstr2);
        free(*split_ptr);
        ++split_ptr;
      }
      free(split_ptr2);
    }
    fclose(guess_file);

    printf("READ IN BAD FILE\n");

    errorlist = Darray_create();
    Darray_hint(errorlist,10,500);
    temperrorkey = Darray_create();
    temperrorval = Darray_create();
    Darray_hint(temperrorkey,10,500);
    Darray_hint(temperrorval,10,500);

    init_hash(&errorlistcount,500);


    /* ---- count "guess correct" pairs at every mis-tagged position ----
       NOTE(review): correct_tag_corpus is indexed up to the length of
       guess_tag_corpus; the two files are assumed to be token-aligned. */
    printscore=0;
    for(count=0;count<Darray_len(guess_tag_corpus);++count) {
      if (! is_tagged_with(CORRECT_TAG(count),GUESS_RAW(count))) {
        ++printscore;
        freshcharvar = mystrdup(GUESS_TAG(count));
        sprintf(forpasting,"%s %s",freshcharvar,CORRECT_TAG(count));
        increment_array_create(&errorlistcount,forpasting);
        /* The copy was only needed to build the key. */
        free(freshcharvar);
      }
    }

    /* ---- write pairs above THRESHOLD to the scratch file ---- */
    error_list = fopen("AANEWRESTRJUNKKK","a");
    if (error_list == NULL) {
      perror("AANEWRESTRJUNKKK");
      return 1;
    }
    Registry_fetch_contents(errorlistcount,temperrorkey,temperrorval);
    for (count=0;count<Darray_len(temperrorkey);++count) {
      if (*(int *)(char *)Darray_get(temperrorval,count) > THRESHOLD) {
        fprintf(error_list,"%d %s\n",*(int *)(char *)Darray_get(temperrorval,count),
                (char *)Darray_get(temperrorkey,count));
      }
      free((char *)Darray_get(temperrorval,count));
      free((char *)Darray_get(temperrorkey,count));
    }
    fclose(error_list);
    Darray_destroy(temperrorval);
    Darray_destroy(temperrorkey);
    Registry_destroy(errorlistcount);

    printf("NUM ERRORS: %d\n",printscore);

    /* Sort the error list, most frequent pair first, via sort(1). */
    system("cat AANEWRESTRJUNKKK | sort -rn > AANEWRESTRJUNKKK2");
    system("mv AANEWRESTRJUNKKK2 AANEWRESTRJUNKKK");

    error_list = fopen("AANEWRESTRJUNKKK","r");
    if (error_list == NULL) {
      perror("AANEWRESTRJUNKKK");
      return 1;
    }
    while(fgets(line,sizeof(line),error_list) != NULL) {
      linelen = strlen(line);
      if (linelen > 0 && line[linelen - 1] == '\n')
        line[linelen - 1] = '\0';
      tempstr = mystrdup(line);
      Darray_addh(errorlist,tempstr);
    }
    fclose(error_list);
    system("/bin/rm AANEWRESTRJUNKKK");

    globalbest= 0;
    strcpy(globalprint,"");


    /* ---- score candidate rules for every frequent error pair ---- */
    for (count=0;count<Darray_len(errorlist);++count) {

      localbest =0;
      strcpy(localbestthing,"");

      /* Entry layout is "count guess-tag correct-tag". */
      split_ptr = perl_split_independent((char *)Darray_get(errorlist,count));
      wrong = split_ptr[1];
      right = split_ptr[2];
      numwrong = atoi(split_ptr[0]);
      if (numwrong > THRESHOLD3) {

        printf("WRONG,RI: %s %s\n",wrong,right);
        printf("GLOBALBEST, GLOBALPRINT, GLOBALDIF: %f %s %s\n",globalbest,globalprint,globaldif);

        /* One table per context template; the "...2" twin collects the
           GOODMATCH counts for the same template.
           NOTE(review): these registries are created for every error
           entry and never released in this function. */
        init_hash(&always,NUMTAGS/2);
        init_hash(&always2,NUMTAGS/2);
        init_hash(&rbigram,(NUMWDS*NUMWDS)/4);
        init_hash(&lbigram,(NUMWDS*NUMWDS)/4);
        init_hash(&rbigram2,(NUMWDS*NUMWDS)/4);
        init_hash(&lbigram2,(NUMWDS*NUMWDS)/4);
        init_hash(&wdnexttag,(NUMWDS*NUMTAGS)/4);
        init_hash(&wdnexttag2,(NUMWDS*NUMTAGS)/4);
        init_hash(&wdprevtag,(NUMWDS*NUMTAGS)/4);
        init_hash(&wdprevtag2,(NUMWDS*NUMTAGS)/4);
        init_hash(&next1tag,NUMTAGS/2);
        init_hash(&next1tag2,NUMTAGS/2);
        init_hash(&prev1tag,NUMTAGS/2);
        init_hash(&prev1tag2,NUMTAGS/2);
        init_hash(&next1or2tag,NUMTAGS/2);
        init_hash(&next1or2tag2,NUMTAGS/2);
        init_hash(&prev1or2tag,NUMTAGS/2);
        init_hash(&prev1or2tag2,NUMTAGS/2);
        init_hash(&next1wd,NUMWDS/2);
        init_hash(&next1wd2,NUMWDS/2);
        init_hash(&prev1wd,NUMWDS/2);
        init_hash(&prev1wd2,NUMWDS/2);
        init_hash(&currentwd,NUMWDS/2);
        init_hash(&currentwd2,NUMWDS/2);
        init_hash(&next1or2wd,NUMWDS/2);
        init_hash(&next1or2wd2,NUMWDS/2);
        init_hash(&prev1or2wd,NUMWDS/2);
        init_hash(&prev1or2wd2,NUMWDS/2);
        init_hash(&next1or2or3tag,NUMTAGS/2);
        init_hash(&next1or2or3tag2,NUMTAGS/2);
        init_hash(&prev1or2or3tag,NUMTAGS/2);
        init_hash(&prev1or2or3tag2,NUMTAGS/2);
        init_hash(&nextbigram,NUMTAGS);
        init_hash(&nextbigram2,NUMTAGS);
        init_hash(&prevbigram,NUMTAGS);
        init_hash(&prevbigram2,NUMTAGS);
        init_hash(&surroundtag,NUMTAGS);
        init_hash(&surroundtag2,NUMTAGS);
        init_hash(&next2tag,NUMTAGS/2);
        init_hash(&next2tag2,NUMTAGS/2);
        init_hash(&prev2tag,NUMTAGS/2);
        init_hash(&prev2tag2,NUMTAGS/2);
        init_hash(&next2wd,NUMWDS/2);
        init_hash(&next2wd2,NUMWDS/2);
        init_hash(&prev2wd,NUMWDS/2);
        init_hash(&prev2wd2,NUMWDS/2);


        lengthcount = Darray_len(correct_tag_corpus);
        for(count2=0;count2<lengthcount;++count2){
          /* Classify the position:
               NOMATCH   - the word is in WORDS but "word right" is not an
                           allowed pair, or the guess does not start with
                           `wrong`, or the guess already contains `right`;
               BADMATCH  - guess starts with `wrong`, lacks `right`, and
                           the correct tag is `right`;
               GOODMATCH - same guess condition, but the correct tag is
                           not `right`. */
          sprintf(atempstr2,"%s %s",CORPUS_WORD(count2),right);
          if (Registry_get(WORDS,CORPUS_WORD(count2)) &&
              ! Registry_get(SEENTAGGING,atempstr2))
            strcpy(flag,"NOMATCH");
          else if
            (strcmp(CORRECT_TAG(count2),right) == 0 &&
             (strcmp(GUESS_TAG(count2),wrong) ==  0)
             &&
             (! is_tagged_with(right,GUESS_RAW(count2))))
            strcpy(flag,"BADMATCH");
          else if
            (strcmp(CORRECT_TAG(count2),right) != 0 &&
             (strcmp(GUESS_TAG(count2),wrong) ==  0)
             &&
             (! is_tagged_with(right,GUESS_RAW(count2))))
            strcpy(flag,"GOODMATCH");
          else
            strcpy(flag,"NOMATCH");

          if (strcmp(flag,"BADMATCH") == 0) {
            increment_array(&always,"DUMMY");
            increment_array(&currentwd,CORPUS_WORD(count2));
            /* one position to the right */
            if (count2 != lengthcount-1) {
              strcpy(onewdaft,CORPUS_WORD(count2+1));
              strcpy(onetagaft,GUESS_TAG(count2+1));
              sprintf(wdpair,"%s %s",CORPUS_WORD(count2),
                      CORPUS_WORD(count2+1));
              increment_array_create(&rbigram,wdpair);
              sprintf(wdpair,"%s %s",CORPUS_WORD(count2),
                      GUESS_TAG(count2+1));
              increment_array_create(&wdnexttag,wdpair);
              increment_array_create(&next1or2tag,GUESS_TAG(count2+1));
              increment_array_create(&next1or2or3tag,GUESS_TAG(count2+1));
              increment_array_create(&next1tag,GUESS_TAG(count2+1));
              increment_array(&next1wd,CORPUS_WORD(count2+1));
              increment_array(&next1or2wd,CORPUS_WORD(count2+1));
            }
            /* two positions to the right; the "1or2" tables are only
               bumped when the value differs from the one already counted
               at distance one */
            if (count2 < lengthcount-2) {
              strcpy(twotagaft,GUESS_TAG(count2+2));
              strcpy(forpasting2,GUESS_TAG(count2+1));
              sprintf(forpasting,"%s %s",forpasting2,GUESS_TAG(count2+2));
              increment_array_create(&nextbigram,forpasting);
              increment_array_create(&next2tag,GUESS_TAG(count2+2));
              increment_array(&next2wd,CORPUS_WORD(count2+2));
              if (strcmp(GUESS_TAG(count2+2),onetagaft) != 0)
              {
                increment_array_create(&next1or2tag,GUESS_TAG(count2+2));

                increment_array_create(&next1or2or3tag,GUESS_TAG(count2+2));
              }
              if (strcmp(CORPUS_WORD(count2+2),onewdaft) != 0)
                increment_array(&next1or2wd,CORPUS_WORD(count2+2));
            }
            /* three positions to the right */
            if (count2 < lengthcount-3) {
              if (strcmp(GUESS_TAG(count2+3),onetagaft) != 0
                  &&
                  strcmp(GUESS_TAG(count2+3),twotagaft) != 0)
                increment_array_create(&next1or2or3tag,GUESS_TAG(count2+3));
            }
            /* one position to the left */
            if (count2 != 0) {
              strcpy(onewdbfr,CORPUS_WORD(count2-1));
              strcpy(onetagbfr,GUESS_TAG(count2-1));
              sprintf(wdpair,"%s %s",CORPUS_WORD(count2-1),
                      CORPUS_WORD(count2));
              increment_array_create(&lbigram,wdpair);
              sprintf(wdpair,"%s %s",GUESS_TAG(count2-1),
                      CORPUS_WORD(count2));
              increment_array_create(&wdprevtag,wdpair);
              increment_array_create(&prev1tag,GUESS_TAG(count2-1));
              increment_array(&prev1wd,CORPUS_WORD(count2-1));
              increment_array(&prev1or2wd,CORPUS_WORD(count2-1));
              increment_array_create(&prev1or2tag,GUESS_TAG(count2-1));
              increment_array_create(&prev1or2or3tag,GUESS_TAG(count2-1));
              if (count2 < lengthcount-1) {
                strcpy(forpasting2,GUESS_TAG(count2-1));
                sprintf(forpasting,"%s %s",forpasting2,GUESS_TAG(count2+1));
                increment_array_create(&surroundtag,forpasting);
              }
            }
            /* two positions to the left */
            if (count2 > 1) {
              strcpy(twotagbfr,GUESS_TAG(count2-2));
              strcpy(forpasting2,GUESS_TAG(count2-2));
              sprintf(forpasting,"%s %s",forpasting2,GUESS_TAG(count2-1));
              increment_array_create(&prevbigram,forpasting);
              increment_array_create(&prev2tag,GUESS_TAG(count2-2));
              increment_array(&prev2wd,CORPUS_WORD(count2-2));
              if (strcmp(GUESS_TAG(count2-2),onetagbfr) != 0){
                increment_array_create(&prev1or2tag,GUESS_TAG(count2-2));
                increment_array_create(&prev1or2or3tag,GUESS_TAG(count2-2));
              }
              if (strcmp(CORPUS_WORD(count2-2),onewdbfr) != 0)
                increment_array(&prev1or2wd,CORPUS_WORD(count2-2));
            }
            /* three positions to the left */
            if (count2 > 2) {
              if (strcmp(GUESS_TAG(count2-3),onetagbfr) != 0
                  &&
                  strcmp(GUESS_TAG(count2-3),twotagbfr) != 0)
                increment_array_create(&prev1or2or3tag,GUESS_TAG(count2-3));
            }
          }


          else if (strcmp(flag,"GOODMATCH") == 0) {
            increment_array(&always2,"DUMMY");
            increment_array(&currentwd2,CORPUS_WORD(count2));
            /* one position to the right */
            if (count2 != lengthcount-1) {
              strcpy(onewdaft,CORPUS_WORD(count2+1));
              strcpy(onetagaft,GUESS_TAG(count2+1));
              sprintf(wdpair,"%s %s",CORPUS_WORD(count2),
                      CORPUS_WORD(count2+1));
              increment_array_create(&rbigram2,wdpair);
              sprintf(wdpair,"%s %s",CORPUS_WORD(count2),
                      GUESS_TAG(count2+1));
              increment_array_create(&wdnexttag2,wdpair);
              increment_array_create(&next1tag2,GUESS_TAG(count2+1));
              increment_array(&next1wd2,CORPUS_WORD(count2+1));
              increment_array(&next1or2wd2,CORPUS_WORD(count2+1));
              increment_array_create(&next1or2tag2,GUESS_TAG(count2+1));
              increment_array_create(&next1or2or3tag2,GUESS_TAG(count2+1));
            }
            /* two positions to the right */
            if (count2 < lengthcount-2) {
              strcpy(twotagaft,GUESS_TAG(count2+2));
              strcpy(forpasting2,GUESS_TAG(count2+1));
              sprintf(forpasting,"%s %s",forpasting2,GUESS_TAG(count2+2));
              increment_array_create(&nextbigram2,forpasting);
              increment_array_create(&next2tag2,GUESS_TAG(count2+2));
              increment_array(&next2wd2,CORPUS_WORD(count2+2));
              if (strcmp(GUESS_TAG(count2+2),onetagaft) !=0) {
                increment_array_create(&next1or2tag2,GUESS_TAG(count2+2));
                increment_array_create(&next1or2or3tag2,GUESS_TAG(count2+2));
              }
              if (strcmp(CORPUS_WORD(count2+2),onewdaft) !=0)
                increment_array(&next1or2wd2,CORPUS_WORD(count2+2));
            }
            /* three positions to the right */
            if (count2 < lengthcount-3) {
              if (strcmp(GUESS_TAG(count2+3),onetagaft) !=0
                  &&
                  strcmp(GUESS_TAG(count2+3),twotagaft) !=0 )
                increment_array_create(&next1or2or3tag2,GUESS_TAG(count2+3));
            }
            /* one position to the left */
            if (count2 != 0) {
              strcpy(onewdbfr,CORPUS_WORD(count2-1));
              /* Store the first tag, as the BADMATCH branch does:
                 onetagbfr is compared against first_tag_nospace() results
                 below, so the raw guess entry would never compare equal
                 whenever it differs from its first tag. */
              strcpy(onetagbfr,GUESS_TAG(count2-1));
              sprintf(wdpair,"%s %s",CORPUS_WORD(count2-1),
                      CORPUS_WORD(count2));
              increment_array_create(&lbigram2,wdpair);
              sprintf(wdpair,"%s %s",GUESS_TAG(count2-1),
                      CORPUS_WORD(count2));
              increment_array_create(&wdprevtag2,wdpair);
              increment_array_create(&prev1tag2,GUESS_TAG(count2-1));
              increment_array(&prev1wd2,CORPUS_WORD(count2-1));
              increment_array(&prev1or2wd2,CORPUS_WORD(count2-1));
              increment_array_create(&prev1or2tag2,GUESS_TAG(count2-1));
              increment_array_create(&prev1or2or3tag2,GUESS_TAG(count2-1));
              if (count2 < lengthcount-1) {
                strcpy(forpasting2,GUESS_TAG(count2-1));
                sprintf(forpasting,"%s %s",forpasting2,GUESS_TAG(count2+1));
                increment_array_create(&surroundtag2,forpasting);
              }
            }
            /* two positions to the left */
            if (count2 >1 ) {
              strcpy(twotagbfr,GUESS_TAG(count2-2));
              strcpy(forpasting2,GUESS_TAG(count2-2));
              sprintf(forpasting,"%s %s",forpasting2,GUESS_TAG(count2-1));
              increment_array_create(&prevbigram2,forpasting);
              increment_array_create(&prev2tag2,GUESS_TAG(count2-2));
              increment_array(&prev2wd2,CORPUS_WORD(count2-2));
              if (strcmp(GUESS_TAG(count2-2),onetagbfr) != 0){
                increment_array_create(&prev1or2tag2,GUESS_TAG(count2-2));
                increment_array_create(&prev1or2or3tag2,GUESS_TAG(count2-2));
              }
              if (strcmp(CORPUS_WORD(count2-2),onewdbfr) != 0)
                increment_array(&prev1or2wd2,CORPUS_WORD(count2-2));
            }
            /* three positions to the left */
            if (count2 > 2) {
              if (strcmp(GUESS_TAG(count2-3),onetagbfr) != 0
                  &&
                  strcmp(GUESS_TAG(count2-3),twotagbfr) != 0)
                increment_array_create(&prev1or2or3tag2,GUESS_TAG(count2-3));
            }
          }
        }


        /* Hand every template's pair of tables to check_counts() together
           with the template name. */
        check_counts(&always,&always2,"ALWAYS");
        check_counts(&prev1tag,&prev1tag2,"PREVTAG");
        check_counts(&next1tag,&next1tag2,"NEXTTAG");
        check_counts(&next1or2tag,&next1or2tag2,"NEXT1OR2TAG");
        check_counts(&prev1or2tag,&prev1or2tag2,"PREV1OR2TAG");
        check_counts(&next1wd,&next1wd2,"NEXTWD");
        check_counts(&currentwd,&currentwd2,"CURRENTWD");
        check_counts(&prev1wd,&prev1wd2,"PREVWD");
        check_counts(&rbigram,&rbigram2,"RBIGRAM");
        check_counts(&lbigram,&lbigram2,"LBIGRAM");
        check_counts(&wdnexttag,&wdnexttag2,"WDNEXTTAG");
        check_counts(&wdprevtag,&wdprevtag2,"WDPREVTAG");
        check_counts(&next1or2wd,&next1or2wd2,"NEXT1OR2WD");
        check_counts(&prev1or2wd,&prev1or2wd2,"PREV1OR2WD");
        check_counts(&next1or2or3tag,&next1or2or3tag2,"NEXT1OR2OR3TAG");
        check_counts(&prev1or2or3tag,&prev1or2or3tag2,"PREV1OR2OR3TAG");
        check_counts(&prevbigram,&prevbigram2,"PREVBIGRAM");
        check_counts(&nextbigram,&nextbigram2,"NEXTBIGRAM");
        check_counts(&surroundtag,&surroundtag2,"SURROUNDTAG");
        check_counts(&next2tag,&next2tag2,"NEXT2TAG");
        check_counts(&prev2tag,&prev2tag2,"PREV2TAG");
        check_counts(&next2wd,&next2wd2,"NEXT2WD");
        check_counts(&prev2wd,&prev2wd2,"PREV2WD");


        if (localbest > globalbest) {
          globalbest = localbest;
          strcpy(globaldif,localdif);
          strcpy(globalprint,localbestthing);
        }
      }

      /* Release this entry's fields before the next entry is split.
         Freeing once after the loop leaked every entry but the last and,
         with an empty error list, freed a stale pointer left over from
         the guess-file loop. `wrong` and `right` point into these fields
         and are reassigned at the top of the next iteration. */
      for (fieldidx = 0; split_ptr[fieldidx] != NULL; ++fieldidx)
        free(split_ptr[fieldidx]);
      free(split_ptr);
    }

    /* ---- apply the winning rule to the guess file ----
       Single quotes in the rule are swapped for backspace characters while
       the rule sits inside the single-quoted shell argument, then restored.
       NOTE(review): the command is built from argv[2]/argv[4] without
       quoting and systemcall[] is a fixed 500-byte buffer. */
    for (count=0;count<strlen(globalprint);++count)
      if (*(globalprint+count) == '\'')
        *(globalprint+count) = '\b';
    sprintf(systemcall,"cat %s | fix-kbest-rule-learn \'%s\' %s > aanewmynewtagggs",
            argv[2],globalprint,argv[4]);
    system(systemcall);
    for (count=0;count<strlen(globalprint);++count)
      if (*(globalprint+count) == '\b')
        *(globalprint+count) = '\'';
    sprintf(systemcall,"mv aanewmynewtagggs %s",argv[2]);
    system(systemcall);

    /* ---- record the rule ---- */
    correct_out = fopen(argv[3],"a");
    if (correct_out == NULL) {
      perror(argv[3]);
      return 1;
    }
    fprintf(correct_out,"%s\n",globalprint);
    fclose(correct_out);
    CONTINUE = globalbest;

    /* ---- release per-pass data; entries equal to "STAART" are the shared
            start marker and are not freed ---- */
    for (count=0;count<Darray_len(guess_tag_corpus);++count)
      if (strcmp((tempstr=GUESS_RAW(count)),"STAART") != 0)
        free(tempstr);
    Darray_destroy(guess_tag_corpus);
    for (count=0;count<Darray_len(errorlist);++count)
      free((char *)Darray_get(errorlist,count));
    Darray_destroy(errorlist);

  }
  return 0;
}

#undef GUESS_TAG
#undef GUESS_RAW
#undef CORRECT_TAG
#undef CORPUS_WORD
コード例 #2
0
ファイル: tagger.c プロジェクト: kylebgorman/pposttl
/* Remove one trailing newline, if present. A final line without a newline
   (or a line longer than the read buffer) keeps its last character. */
static void tagger_strip_newline(char *s)
{
	size_t len = strlen(s);

	if (len > 0 && s[len - 1] == '\n')
		s[len - 1] = '\0';
}

/*
 * Load all tagger resources from already-open streams.
 *
 * Inputs:
 *   lexicon    lines of "word lemma tag ..."; read three times (size pass,
 *              lexicon/lemma pass, seen-tagging pass) with fseek() back to
 *              the start in between, so the stream must be seekable
 *   bigrams    bigram file, stored line by line
 *   lRuleFile  lexical rule file
 *   cRuleFile  contextual rule file
 *
 * Outputs (all created here and owned by the caller afterwards):
 *   *lexicon_hash     word -> third field of its lexicon line
 *   *lemma_hash       word -> second field of its lexicon line
 *   *good_right_hash  strings named by "goodright"/"fgoodright" rules
 *   *good_left_hash   strings named by "goodleft"/"fgoodleft" rules
 *   *seenTagging      "word field" for every field after the word
 *   *bigramArray, *lRuleArray, *cRuleArray  copies of the non-blank lines
 *
 * NOTE(review): word/lemma/tag are filled with unbounded %s conversions;
 * a field longer than MAXWORDLEN/MAXTAGLEN overflows its buffer.
 */
void Tagger(FILE * lexicon, FILE * bigrams, FILE * lRuleFile,
	    FILE * cRuleFile, Registry * lexicon_hash,
	    Registry * lemma_hash, Registry * good_right_hash,
	    Registry * good_left_hash, Registry * seenTagging,
	    Darray * bigramArray, Darray * lRuleArray, Darray * cRuleArray)
{
	char line[MAXLINELEN];
	char space[500];
	char word[MAXWORDLEN], tag[MAXTAGLEN];
	/* lemma support added by Golam Mortuza Hossain */
	char lemma[MAXWORDLEN];
	char **fields, **cursor;
	int numLexiconEntries;
	int written;

	*lemma_hash = Registry_create(Registry_strcmp, Registry_strhash);

	/* Benjamin Han 100400: time for creativity! */
	*lexicon_hash = Registry_create(Registry_strcmp, Registry_strhash);
	*good_right_hash =
	    Registry_create(Registry_strcmp, Registry_strhash);
	*good_left_hash =
	    Registry_create(Registry_strcmp, Registry_strhash);
	*seenTagging = Registry_create(Registry_strcmp, Registry_strhash);
	*lRuleArray = Darray_create();
	*cRuleArray = Darray_create();
	*bigramArray = Darray_create();

	/* lexicon hash stores the most likely tag for all known words.
	   We can have a separate wordlist and lexicon file because
	   unsupervised learning can add to the wordlist while not adding to
	   the lexicon. For example, if a big untagged corpus is about to be
	   tagged, the wordlist can be extended to include words in that
	   corpus, while the lexicon remains static. Lexicon is a file of the
	   form:
	   word t1 t2 ... tn
	   where t1 is the most likely tag for the word, and t2...tn are
	   alternate tags, in no particular order. */

	/* Pass 1: total the per-line num_words() results for the size hints. */
	for (numLexiconEntries = 0;
	     fgets(line, sizeof(line), lexicon) != NULL;
	     numLexiconEntries += num_words(line))
		if (not_just_blank(line))
			tagger_strip_newline(line);

	fseek(lexicon, (long) 0, SEEK_SET);

	/* Benjamin Han 100400: originally this was hinted by the number of
	   lines in the lexicon file. */
	Registry_size_hint(*lexicon_hash, numLexiconEntries);
	Registry_size_hint(*lemma_hash, numLexiconEntries);

	/* Pass 2: word -> lemma and word -> tag. */
	while (fgets(line, sizeof(line), lexicon) != NULL) {
		if (!not_just_blank(line))
			continue;
		tagger_strip_newline(line);
		/* A line with fewer than three fields would leave lemma and/or
		   tag holding the previous line's values (or garbage on the
		   first line), so it is reported and skipped. */
		if (sscanf(line, "%s%s%s", word, lemma, tag) != 3) {
			fprintf(stderr,
				"Tagger: skipping malformed lexicon line: %s\n",
				line);
			continue;
		}
		/* It would have been much better to just use a struct and put
		   the lemma in the lexicon hash, but that did not work by
		   simple hacking (g.m.h). */
		Registry_add(*lemma_hash, (char *) mystrdup(word),
			     (char *) mystrdup(lemma));
		Registry_add(*lexicon_hash,
			     (char *) mystrdup(word),
			     (char *) mystrdup(tag));
	}

	/* Lexical rules: every line is kept verbatim; the four rule kinds
	   below also feed the good-right / good-left registries. */
	while (fgets(line, sizeof(line), lRuleFile) != NULL) {
		if (!not_just_blank(line))
			continue;
		tagger_strip_newline(line);
		Darray_addh(*lRuleArray, mystrdup(line));
		fields = perl_split(line);
		/* fields[1] and fields[2] are NULL on short lines; each is
		   checked before use. The test order matches the original
		   else-if chain. */
		if (fields[1] != NULL) {
			if (strcmp(fields[1], "goodright") == 0)
				Registry_add(*good_right_hash,
					     mystrdup(fields[0]), (char *) 1);
			else if (fields[2] != NULL
				 && strcmp(fields[2], "fgoodright") == 0)
				Registry_add(*good_right_hash,
					     mystrdup(fields[1]), (char *) 1);
			else if (strcmp(fields[1], "goodleft") == 0)
				Registry_add(*good_left_hash,
					     mystrdup(fields[0]), (char *) 1);
			else if (fields[2] != NULL
				 && strcmp(fields[2], "fgoodleft") == 0)
				Registry_add(*good_left_hash,
					     mystrdup(fields[1]), (char *) 1);
		}
		free(*fields);
		free(fields);
	}

	/* Benjamin Han 100400: the bigram file contents are stored in
	   bigramArray so no file I/O is needed every time the
	   start-state-tagger is invoked. */
	while (fgets(line, sizeof(line), bigrams) != NULL) {
		if (not_just_blank(line)) {
			tagger_strip_newline(line);
			Darray_addh(*bigramArray, mystrdup(line));
		}
	}

	fseek(lexicon, (long) 0, SEEK_SET);

	/* Pass 3: lexicon for the final-state-tagger. */
	Registry_size_hint(*seenTagging, numLexiconEntries);

	/* Benjamin Han 100500: MISSING RESTRICT_MOVE section?
	   Answer: Brill used registry WORDS while I use lexicon_hash to
	   replace his WORDS (see POST::Run) - the only difference is that in
	   WORDS every value is 1 while in lexicon_hash a value is the first
	   tag following the word in the lexicon file. */
	/* NOTE(review): every field after the word is paired with it here, so
	   with the "word lemma tag" layout read in pass 2 the lemma is
	   registered as well - confirm that is intended. */
	while (fgets(line, sizeof(line), lexicon) != NULL) {
		if (!not_just_blank(line))
			continue;
		tagger_strip_newline(line);
		fields = perl_split(line);
		for (cursor = fields + 1; *cursor != NULL; ++cursor) {
			written = snprintf(space, sizeof(space), "%s %s",
					   *fields, *cursor);
			/* A pair that does not fit is skipped rather than
			   stored truncated. */
			if (written < 0 || (size_t) written >= sizeof(space))
				continue;
			Registry_add(*seenTagging, mystrdup(space),
				     (char *) 1);
		}
		free(*fields);
		free(fields);
	}

	/* Contextual rules: kept verbatim. */
	while (fgets(line, sizeof(line), cRuleFile) != NULL)
		if (not_just_blank(line)) {
			tagger_strip_newline(line);
			Darray_addh(*cRuleArray, mystrdup(line));
		}
}