int main(int argc, char *argv[]) { char onewdbfr[512],onewdaft[512]; char onetagbfr[512],twotagbfr[512],onetagaft[512],twotagaft[512]; char *freshcharvar; Darray errorlist,temperrorkey,temperrorval; Registry errorlistcount,SEENTAGGING,WORDS; FILE *correct_file, *guess_file, *error_list,*correct_out; char line[5000]; /* input line buffer */ char **split_ptr,**split_ptr2; char wdpair[1024]; char *tempstr,*tempstr2; float CONTINUE = 10000.0; int count2,numwrong,lengthcount; unsigned int count; char globalprint[500]; char systemcall[500]; char forpasting[500]; char forpasting2[500]; float globalbest = 0.0; char flag[20]; Registry currentwd,currentwd2; Registry always,always2; Registry wdnexttag,wdnexttag2,wdprevtag,wdprevtag2; Registry rbigram,lbigram,rbigram2,lbigram2; Registry next1tag,next1tag2,prev1tag,prev1tag2; Registry next1or2tag,next1or2tag2,prev1or2tag,prev1or2tag2; Registry next1or2or3tag,next1or2or3tag2,prev1or2or3tag,prev1or2or3tag2; Registry next1wd,next1wd2,prev1wd,prev1wd2; Registry next1or2wd,next1or2wd2,prev1or2wd,prev1or2wd2; Registry nextbigram,nextbigram2,prevbigram,prevbigram2; Registry surroundtag,surroundtag2; Registry next2tag,next2tag2,prev2tag,prev2tag2; Registry next2wd,next2wd2,prev2wd,prev2wd2; char globaldif[20]; int printscore; FILE *allowedmovefile; char **perl_split_ptr,**perl_split_ptr2,*atempstr,atempstr2[1024]; char space[500]; SEENTAGGING = Registry_create(Registry_strcmp,Registry_strhash); Registry_size_hint(SEENTAGGING,GUESSNUMWORDS); WORDS = Registry_create(Registry_strcmp,Registry_strhash); Registry_size_hint(WORDS,GUESSNUMWORDS); allowedmovefile = fopen(argv[4], "r"); while(fgets(line,sizeof(line),allowedmovefile) != NULL) { if (not_just_blank(line)) { line[strlen(line) - 1] = '\0'; perl_split_ptr = perl_split(line); perl_split_ptr2 = perl_split_ptr; ++perl_split_ptr; atempstr= mystrdup(*perl_split_ptr2); Registry_add(WORDS,atempstr,(char *)1); while(*perl_split_ptr != NULL) { sprintf(space,"%s %s",*perl_split_ptr2, *perl_split_ptr); atempstr=mystrdup(space); Registry_add(SEENTAGGING,atempstr,(char *)1); ++perl_split_ptr; } free(*perl_split_ptr2); free(perl_split_ptr2); } } system("/bin/rm AANEWRESTRJUNKKK"); correct_tag_corpus = Darray_create(); Darray_hint(correct_tag_corpus,100,400000); word_corpus = Darray_create(); Darray_hint(word_corpus,100,400000); correct_file = fopen(argv[1],"r"); while(fgets(line,sizeof(line),correct_file) != NULL) { Darray_addh(correct_tag_corpus,staart); Darray_addh(correct_tag_corpus,staart); Darray_addh(word_corpus,staart); Darray_addh(word_corpus,staart); line[strlen(line)-1] = '\0'; split_ptr = perl_split_independent(line); while (*split_ptr != NULL) { Darray_addh(word_corpus,*split_ptr); while ((*(++*split_ptr)) != '/') { } **split_ptr = '\0'; Darray_addh(correct_tag_corpus,++*split_ptr); ++split_ptr; } } fclose(correct_file); printf("READ IN CORRECT FILE\n"); while(CONTINUE > THRESHOLD) { guess_tag_corpus = Darray_create(); Darray_hint(guess_tag_corpus,100,400000); guess_file = fopen(argv[2],"r"); while(fgets(line,sizeof(line),guess_file) != NULL) { Darray_addh(guess_tag_corpus,staart); Darray_addh(guess_tag_corpus,staart); line[strlen(line)-1] = '\0'; split_ptr = perl_split_independent(line); split_ptr2 = split_ptr; while (*split_ptr != NULL) { tempstr = strtok(*split_ptr,"/"); tempstr = strtok(NULL,"/"); tempstr2 = mystrdup(tempstr); Darray_addh(guess_tag_corpus,tempstr2); free(*split_ptr); ++split_ptr; } free(split_ptr2); } fclose(guess_file); printf("READ IN BAD FILE\n"); errorlist = Darray_create(); Darray_hint(errorlist,10,500); temperrorkey = Darray_create(); temperrorval = Darray_create(); Darray_hint(temperrorkey,10,500); Darray_hint(temperrorval,10,500); init_hash(&errorlistcount,500); printscore=0; for(count=0;count<Darray_len(guess_tag_corpus);++count) { if (! is_tagged_with((char *)Darray_get(correct_tag_corpus,count),(char *)Darray_get(guess_tag_corpus,count))) { ++printscore; freshcharvar = mystrdup(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count))); sprintf(forpasting,"%s %s",freshcharvar, (char *)Darray_get(correct_tag_corpus,count)); increment_array_create(&errorlistcount,forpasting); } } error_list = fopen("AANEWRESTRJUNKKK","a"); Registry_fetch_contents(errorlistcount,temperrorkey,temperrorval); for (count=0;count<Darray_len(temperrorkey);++count) { if (*(int *)(char *)Darray_get(temperrorval,count) > THRESHOLD) /*Darray_addh(errorlist,tempstr);*/ fprintf(error_list,"%d %s\n",*(int *)(char *)Darray_get(temperrorval,count), (char *)Darray_get(temperrorkey,count)); free((char *)Darray_get(temperrorval,count)); free((char *)Darray_get(temperrorkey,count)); } fclose(error_list); Darray_destroy(temperrorval); Darray_destroy(temperrorkey); Registry_destroy(errorlistcount); printf("NUM ERRORS: %d\n",printscore); /* shoud sort error list !!!!!!!*/ system("cat AANEWRESTRJUNKKK | sort -rn > AANEWRESTRJUNKKK2"); system("mv AANEWRESTRJUNKKK2 AANEWRESTRJUNKKK"); error_list = fopen("AANEWRESTRJUNKKK","r"); while(fgets(line,sizeof(line),error_list) != NULL) { line[strlen(line)-1] = '\0'; tempstr = mystrdup(line); Darray_addh(errorlist,tempstr); } fclose(error_list); system("/bin/rm AANEWRESTRJUNKKK"); globalbest= 0; strcpy(globalprint,""); for (count=0;count<Darray_len(errorlist);++count) { localbest =0; strcpy(localbestthing,""); /*printf("ERROR LIST GUY: %s\n",(char *)Darray_get(errorlist,count)); */ split_ptr = perl_split_independent((char *)Darray_get(errorlist,count)); /*printf("ERRORLISTGUY: %s %s %s\n",split_ptr[0],split_ptr[1],split_ptr[2]);*/ wrong = split_ptr[1]; right = split_ptr[2]; numwrong = atoi(split_ptr[0]); if (numwrong > THRESHOLD3) { printf("WRONG,RI: %s %s\n",wrong,right); printf("GLOBALBEST, GLOBALPRINT, GLOBALDIF: %f %s %s\n",globalbest,globalprint,globaldif); init_hash(&always,NUMTAGS/2); init_hash(&always2,NUMTAGS/2); init_hash(&rbigram,(NUMWDS*NUMWDS)/4); init_hash(&lbigram,(NUMWDS*NUMWDS)/4); init_hash(&rbigram2,(NUMWDS*NUMWDS)/4); init_hash(&lbigram2,(NUMWDS*NUMWDS)/4); init_hash(&wdnexttag,(NUMWDS*NUMTAGS)/4); init_hash(&wdnexttag2,(NUMWDS*NUMTAGS)/4); init_hash(&wdprevtag,(NUMWDS*NUMTAGS)/4); init_hash(&wdprevtag2,(NUMWDS*NUMTAGS)/4); init_hash(&next1tag,NUMTAGS/2); init_hash(&next1tag2,NUMTAGS/2); init_hash(&prev1tag,NUMTAGS/2); init_hash(&prev1tag2,NUMTAGS/2); init_hash(&next1or2tag,NUMTAGS/2); init_hash(&next1or2tag2,NUMTAGS/2); init_hash(&prev1or2tag,NUMTAGS/2); init_hash(&prev1or2tag2,NUMTAGS/2); init_hash(&next1wd,NUMWDS/2); init_hash(&next1wd2,NUMWDS/2); init_hash(&prev1wd,NUMWDS/2); init_hash(&prev1wd2,NUMWDS/2); init_hash(¤twd,NUMWDS/2); init_hash(¤twd2,NUMWDS/2); init_hash(&next1or2wd,NUMWDS/2); init_hash(&next1or2wd2,NUMWDS/2); init_hash(&prev1or2wd,NUMWDS/2); init_hash(&prev1or2wd2,NUMWDS/2); init_hash(&next1or2or3tag,NUMTAGS/2); init_hash(&next1or2or3tag2,NUMTAGS/2); init_hash(&prev1or2or3tag,NUMTAGS/2); init_hash(&prev1or2or3tag2,NUMTAGS/2); init_hash(&nextbigram,NUMTAGS); init_hash(&nextbigram2,NUMTAGS); init_hash(&prevbigram,NUMTAGS); init_hash(&prevbigram2,NUMTAGS); init_hash(&surroundtag,NUMTAGS); init_hash(&surroundtag2,NUMTAGS); init_hash(&next2tag,NUMTAGS/2); init_hash(&next2tag2,NUMTAGS/2); init_hash(&prev2tag,NUMTAGS/2); init_hash(&prev2tag2,NUMTAGS/2); init_hash(&next2wd,NUMWDS/2); init_hash(&next2wd2,NUMWDS/2); init_hash(&prev2wd,NUMWDS/2); init_hash(&prev2wd2,NUMWDS/2); lengthcount = Darray_len(correct_tag_corpus); for(count2=0;count2<lengthcount;++count2){ sprintf(atempstr2,"%s %s",(char *)Darray_get(word_corpus,count2),right); if (Registry_get(WORDS,(char *)Darray_get(word_corpus,count2)) && ! Registry_get(SEENTAGGING,atempstr2)) strcpy(flag,"NOMATCH"); else if (strcmp((char *)Darray_get(correct_tag_corpus,count2),right) == 0 && (strcmp (first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2)),wrong) == 0) && (! is_tagged_with(right,(char *)Darray_get(guess_tag_corpus,count2)))) strcpy(flag,"BADMATCH"); else if (strcmp((char *)Darray_get(correct_tag_corpus,count2),right) != 0 && (strcmp (first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2)),wrong) == 0) && (! is_tagged_with(right,(char *)Darray_get(guess_tag_corpus,count2)))) strcpy(flag,"GOODMATCH"); else strcpy(flag,"NOMATCH"); if (strcmp(flag,"BADMATCH") == 0) { increment_array(&always,"DUMMY"); increment_array(¤twd,(char *)Darray_get(word_corpus,count2)); if (count2 != lengthcount-1) { strcpy(onewdaft,(char *)Darray_get(word_corpus,count2+1)); strcpy(onetagaft, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); sprintf(wdpair,"%s %s",(char *)Darray_get(word_corpus,count2), (char *)Darray_get(word_corpus,count2+1)); increment_array_create(&rbigram,wdpair); sprintf(wdpair,"%s %s",(char *)Darray_get(word_corpus,count2), first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); increment_array_create(&wdnexttag,wdpair); increment_array_create(&next1or2tag, first_tag_nospace( (char *)Darray_get(guess_tag_corpus,count2+1))); increment_array_create(&next1or2or3tag, first_tag_nospace( (char *)Darray_get(guess_tag_corpus,count2+1))); increment_array_create(&next1tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); increment_array(&next1wd,(char *)Darray_get(word_corpus,count2+1)); increment_array(&next1or2wd,(char *)Darray_get(word_corpus,count2+1)); } if (count2 < lengthcount-2) { strcpy(twotagaft,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); strcpy(forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); sprintf(forpasting,"%s %s",forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); increment_array_create(&nextbigram,forpasting); increment_array_create(&next2tag,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); increment_array(&next2wd,(char *)Darray_get(word_corpus,count2+2)); if (strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2)), onetagaft) != 0) { increment_array_create(&next1or2tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); increment_array_create(&next1or2or3tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); } if (strcmp((char *)Darray_get(word_corpus,count2+2),onewdaft) != 0) increment_array(&next1or2wd,(char *)Darray_get(word_corpus,count2+2)); } if (count2 < lengthcount-3) { if (strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+3)),onetagaft) != 0 && strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+3)),twotagaft) != 0) increment_array_create(&next1or2or3tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+3))); } if (count2 != 0) { strcpy(onewdbfr,(char *)Darray_get(word_corpus,count2-1)); strcpy(onetagbfr,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); sprintf(wdpair,"%s %s",(char *)Darray_get(word_corpus,count2-1), (char *)Darray_get(word_corpus,count2)); increment_array_create(&lbigram,wdpair); sprintf(wdpair,"%s %s",first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1)), (char *)Darray_get(word_corpus,count2)); increment_array_create(&wdprevtag,wdpair); increment_array_create(&prev1tag,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); increment_array(&prev1wd,(char *)Darray_get(word_corpus,count2-1)); increment_array(&prev1or2wd,(char *)Darray_get(word_corpus,count2-1)); increment_array_create(&prev1or2tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); increment_array_create(&prev1or2or3tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); if (count2 < lengthcount-1) { strcpy(forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); sprintf(forpasting,"%s %s",forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); increment_array_create(&surroundtag,forpasting); } } if (count2 > 1) { strcpy(twotagbfr,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); strcpy(forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); sprintf(forpasting,"%s %s",forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); increment_array_create(&prevbigram,forpasting); increment_array_create(&prev2tag,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); increment_array(&prev2wd,(char *)Darray_get(word_corpus,count2-2)); if (strcmp(first_tag_nospace( (char *)Darray_get(guess_tag_corpus,count2-2)),onetagbfr) != 0){ increment_array_create(&prev1or2tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); increment_array_create(&prev1or2or3tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); } if (strcmp((char *)Darray_get(word_corpus,count2-2),onewdbfr) != 0) increment_array(&prev1or2wd,(char *)Darray_get(word_corpus,count2-2)); } if (count2 > 2) { if (strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-3)),onetagbfr) != 0 && strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-3)),twotagbfr) != 0) increment_array_create(&prev1or2or3tag, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-3))); } } else if (strcmp(flag,"GOODMATCH") == 0) { increment_array(&always2,"DUMMY"); increment_array(¤twd2,(char *)Darray_get(word_corpus,count2)); if (count2 != lengthcount-1) { strcpy(onewdaft,(char *)Darray_get(word_corpus,count2+1)); strcpy(onetagaft,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); sprintf(wdpair,"%s %s",(char *)Darray_get(word_corpus,count2), (char *)Darray_get(word_corpus,count2+1)); increment_array_create(&rbigram2,wdpair); sprintf(wdpair,"%s %s",(char *)Darray_get(word_corpus,count2), first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); increment_array_create(&wdnexttag2,wdpair); increment_array_create(&next1tag2,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); increment_array(&next1wd2,(char *)Darray_get(word_corpus,count2+1)); increment_array(&next1or2wd2,(char *)Darray_get(word_corpus,count2+1)); increment_array_create(&next1or2tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); increment_array_create(&next1or2or3tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); } if (count2 < lengthcount-2) { strcpy(twotagaft,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); strcpy(forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); sprintf(forpasting,"%s %s",forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); increment_array_create(&nextbigram2,forpasting); increment_array_create(&next2tag2,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); increment_array(&next2wd2,(char *)Darray_get(word_corpus,count2+2)); if (strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2)),onetagaft) !=0) { increment_array_create(&next1or2tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); increment_array_create(&next1or2or3tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+2))); } if (strcmp((char *)Darray_get(word_corpus,count2+2),onewdaft) !=0) increment_array(&next1or2wd2,(char *)Darray_get(word_corpus,count2+2)); } if (count2 < lengthcount-3) { if (strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+3)),onetagaft) !=0 && strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+3)),twotagaft) !=0 ) increment_array_create(&next1or2or3tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+3))); } if (count2 != 0) { strcpy(onewdbfr,(char *)Darray_get(word_corpus,count2-1)); strcpy(onetagbfr,(char *)Darray_get(guess_tag_corpus,count2-1)); sprintf(wdpair,"%s %s",(char *)Darray_get(word_corpus,count2-1), (char *)Darray_get(word_corpus,count2)); increment_array_create(&lbigram2,wdpair); sprintf(wdpair,"%s %s",first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1)), (char *)Darray_get(word_corpus,count2)); increment_array_create(&wdprevtag2,wdpair); increment_array_create(&prev1tag2,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); increment_array(&prev1wd2,(char *)Darray_get(word_corpus,count2-1)); increment_array(&prev1or2wd2,(char *)Darray_get(word_corpus,count2-1)); increment_array_create(&prev1or2tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); increment_array_create(&prev1or2or3tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); if (count2 < lengthcount-1) { strcpy(forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); sprintf(forpasting,"%s %s",forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2+1))); increment_array_create(&surroundtag2,forpasting); } } if (count2 >1 ) { strcpy(twotagbfr,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); strcpy(forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); sprintf(forpasting,"%s %s",forpasting2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-1))); increment_array_create(&prevbigram2,forpasting); increment_array_create(&prev2tag2,first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); increment_array(&prev2wd2,(char *)Darray_get(word_corpus,count2-2)); if (strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2)),onetagbfr) != 0){ increment_array_create(&prev1or2tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); increment_array_create(&prev1or2or3tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-2))); } if (strcmp((char *)Darray_get(word_corpus,count2-2),onewdbfr) != 0) increment_array(&prev1or2wd2,(char *)Darray_get(word_corpus,count2-2)); } if (count2 > 2) { if (strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-3)),onetagbfr) != 0 && strcmp(first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-3)),twotagbfr) != 0) increment_array_create(&prev1or2or3tag2, first_tag_nospace((char *)Darray_get(guess_tag_corpus,count2-3))); } } } check_counts(&always,&always2,"ALWAYS"); check_counts(&prev1tag,&prev1tag2,"PREVTAG"); check_counts(&next1tag,&next1tag2,"NEXTTAG"); check_counts(&next1or2tag,&next1or2tag2,"NEXT1OR2TAG"); check_counts(&prev1or2tag,&prev1or2tag2,"PREV1OR2TAG"); check_counts(&next1wd,&next1wd2,"NEXTWD"); check_counts(¤twd,¤twd2,"CURRENTWD"); check_counts(&prev1wd,&prev1wd2,"PREVWD"); check_counts(&rbigram,&rbigram2,"RBIGRAM"); check_counts(&lbigram,&lbigram2,"LBIGRAM"); check_counts(&wdnexttag,&wdnexttag2,"WDNEXTTAG"); check_counts(&wdprevtag,&wdprevtag2,"WDPREVTAG"); check_counts(&next1or2wd,&next1or2wd2,"NEXT1OR2WD"); check_counts(&prev1or2wd,&prev1or2wd2,"PREV1OR2WD"); check_counts(&next1or2or3tag,&next1or2or3tag2,"NEXT1OR2OR3TAG"); check_counts(&prev1or2or3tag,&prev1or2or3tag2,"PREV1OR2OR3TAG"); check_counts(&prevbigram,&prevbigram2,"PREVBIGRAM"); check_counts(&nextbigram,&nextbigram2,"NEXTBIGRAM"); check_counts(&surroundtag,&surroundtag2,"SURROUNDTAG"); check_counts(&next2tag,&next2tag2,"NEXT2TAG"); check_counts(&prev2tag,&prev2tag2,"PREV2TAG"); check_counts(&next2wd,&next2wd2,"NEXT2WD"); check_counts(&prev2wd,&prev2wd2,"PREV2WD"); if (localbest > globalbest) { globalbest = localbest; strcpy(globaldif,localdif); strcpy(globalprint,localbestthing);} } } free(split_ptr[0]); free(split_ptr[1]); free(split_ptr[2]); free(split_ptr); for (count=0;count<strlen(globalprint);++count) if (*(globalprint+count) == '\'') *(globalprint+count) = '\b'; sprintf(systemcall,"cat %s | fix-kbest-rule-learn \'%s\' %s > aanewmynewtagggs", argv[2],globalprint,argv[4]); system(systemcall); for (count=0;count<strlen(globalprint);++count) if (*(globalprint+count) == '\b') *(globalprint+count) = '\''; sprintf(systemcall,"mv aanewmynewtagggs %s",argv[2]); system(systemcall); correct_out = fopen(argv[3],"a"); fprintf(correct_out,"%s\n",globalprint); /* fprintf(correct_out,"%d %s %s\n",globalbest,globalprint,globaldif);*/ fclose(correct_out); CONTINUE = globalbest; for (count=0;count<Darray_len(guess_tag_corpus);++count) if (strcmp((tempstr=(char *)Darray_get(guess_tag_corpus,count)),"STAART") != 0) free(tempstr); Darray_destroy(guess_tag_corpus); for (count=0;count<Darray_len(errorlist);++count) free((char *)Darray_get(errorlist,count)); Darray_destroy(errorlist); } return 0; }
void Tagger(FILE * lexicon, FILE * bigrams, FILE * lRuleFile, FILE * cRuleFile, Registry * lexicon_hash, Registry * lemma_hash, Registry * good_right_hash, Registry * good_left_hash, Registry * seenTagging, Darray * bigramArray, Darray * lRuleArray, Darray * cRuleArray) { char line[MAXLINELEN]; char space[500]; char word[MAXWORDLEN], tag[MAXTAGLEN]; char bigram1[MAXWORDLEN], bigram2[MAXWORDLEN]; char **perl_split_ptr, **perl_split_ptr2, *atempstr, **temp_perl_split_ptr; char *tempruleptr; char bigram_space[MAXWORDLEN * 2]; int numLexiconEntries; /*Added by Golam Mortuza Hossain */ char lemma[MAXWORDLEN]; *lemma_hash = Registry_create(Registry_strcmp, Registry_strhash); /* g.m.h */ /* Benjamin Han 100400: time for creativity! */ *lexicon_hash = Registry_create(Registry_strcmp, Registry_strhash); *good_right_hash = Registry_create(Registry_strcmp, Registry_strhash); *good_left_hash = Registry_create(Registry_strcmp, Registry_strhash); *seenTagging = Registry_create(Registry_strcmp, Registry_strhash); *lRuleArray = Darray_create(); *cRuleArray = Darray_create(); *bigramArray = Darray_create(); /* lexicon hash stores the most likely tag for all known words. we can have a separate wordlist and lexicon file because unsupervised learning can add to wordlist, while not adding to lexicon. For example, if a big untagged corpus is about to be tagged, the wordlist can be extended to include words in that corpus, while the lexicon remains static. Lexicon is file of form: word t1 t2 ... tn where t1 is the most likely tag for the word, and t2...tn are alternate tags, in no particular order. */ /* read through once to get size */ for (numLexiconEntries = 0; fgets(line, sizeof(line), lexicon) != NULL; numLexiconEntries += num_words(line)) if (not_just_blank(line)) line[strlen(line) - 1] = '\0'; fseek(lexicon, (long) 0, SEEK_SET); /* just need word and most likely tag from lexicon (first tag entry) */ /* Benjamin Han 100400: originally it's hinted by the # of lines in lexicon file */ Registry_size_hint(*lexicon_hash, numLexiconEntries); /*Added by Golam Mortuza Hossain */ Registry_size_hint(*lemma_hash, numLexiconEntries); /* g.m.h */ while (fgets(line, sizeof(line), lexicon) != NULL) { if (not_just_blank(line)) { line[strlen(line) - 1] = '\0'; /*Added by Golam Mortuza Hossain */ sscanf(line, "%s%s%s", word, lemma, tag); // if ( strcmp ( word, lemma) != 0 ) Registry_add(*lemma_hash, (char *) mystrdup(word), (char *) mystrdup(lemma)); /* It would have been much better to just use * "struct" and put "lemma" in lexicon hash. But * it does not seem to be working by simple hacking*/ /* g.m.h */ Registry_add(*lexicon_hash, (char *) mystrdup(word), (char *) mystrdup(tag)); } } /* read in lexical rule file */ while (fgets(line, sizeof(line), lRuleFile) != NULL) { if (not_just_blank(line)) { line[strlen(line) - 1] = '\0'; Darray_addh(*lRuleArray, mystrdup(line)); perl_split_ptr = perl_split(line); temp_perl_split_ptr = perl_split_ptr; if (strcmp(perl_split_ptr[1], "goodright") == 0) { tempruleptr = mystrdup(perl_split_ptr[0]); Registry_add(*good_right_hash, tempruleptr, (char *) 1); } else if (strcmp(perl_split_ptr[2], "fgoodright") == 0) { tempruleptr = mystrdup(perl_split_ptr[1]); Registry_add(*good_right_hash, tempruleptr, (char *) 1); } else if (strcmp(perl_split_ptr[1], "goodleft") == 0) { tempruleptr = mystrdup(perl_split_ptr[0]); Registry_add(*good_left_hash, tempruleptr, (char *) 1); } else if (strcmp(perl_split_ptr[2], "fgoodleft") == 0) { tempruleptr = mystrdup(perl_split_ptr[1]); Registry_add(*good_left_hash, tempruleptr, (char *) 1); } free(*perl_split_ptr); free(perl_split_ptr); } } /* read in bigram file */ /* Benjamin Han 100400: I store the contents in bigramArray so we don't have to do file IO everytime the start-state-tagger is invoked. */ while (fgets(line, sizeof(line), bigrams) != NULL) { if (not_just_blank(line)) { line[strlen(line) - 1] = '\0'; atempstr = (char *) malloc(sizeof(char) * (strlen(line) + 1)); strcpy(atempstr, line); Darray_addh(*bigramArray, atempstr); } } fseek(lexicon, (long) 0, SEEK_SET); /* read in the lexicon for the final-state-tagger */ Registry_size_hint(*seenTagging, numLexiconEntries); /* Benjamin Han 100500: MISSING RESTRICT_MOVE section? Answer: Brill used registry WORDS while I use lexicon_hash to replace his WORDS (see POST::Run) - the only difference is in WORDS every value is 1 while in lexicon_hash a values is the first tag following the word in the lexicon file. */ while (fgets(line, sizeof(line), lexicon) != NULL) { if (not_just_blank(line)) { line[strlen(line) - 1] = '\0'; perl_split_ptr = perl_split(line); perl_split_ptr2 = perl_split_ptr; ++perl_split_ptr; while (*perl_split_ptr != NULL) { sprintf(space, "%s %s", *perl_split_ptr2, *perl_split_ptr); Registry_add(*seenTagging, mystrdup(space), (char *) 1); ++perl_split_ptr; } free(*perl_split_ptr2); free(perl_split_ptr2); } } /* read in contextual rule */ while (fgets(line, sizeof(line), cRuleFile) != NULL) if (not_just_blank(line)) { line[strlen(line) - 1] = '\0'; Darray_addh(*cRuleArray, mystrdup(line)); } }