int main(int argc, char ** argv) { char *modelfile = NULL; char *morphtablefile = NULL; if(argc < 2) { cerr("usage: test model_file morph_table\n"); return 1; } modelfile = argv[1]; if(argc > 2) { morphtablefile = argv[2]; } int error = 0; Hunpos hp = hunpos_tagger_new(modelfile, morphtablefile, 3, 1000, &error); if(error) { cerr("Failed to load model file\n"); hunpos_tagger_destroy(hp, &error); return 1; } char* tokens[MAX_SENT_LENGTH]; int i, n; for (i=0; i<MAX_SENT_LENGTH;i++) { tokens[i] = (char *) malloc(MAX_TOKEN_LENGTH * sizeof(char)); } while((n = read_sentence(tokens)) > 0) { error = 0; //int j; //for(j = 0; j < 10000000; j++) hunpos_tagger_tag(hp, n, tokens, get_token, tokens, add_tag, &error); printf ("\n"); } }
/*
 * read_corpus() reads every sentence available on `in` into a newly
 * allocated corpus and returns it.
 *
 * The stream may start with a header of the form "S = <n>" giving the
 * expected number of sentences; it is only used to size the initial
 * sentence array.  When the header is absent or declares a non-positive
 * count, the array starts with room for one sentence and grows by doubling.
 *
 * Sentences with Px == 0 but a non-zero number of parses are skipped.
 * When flags is non-NULL and flags->Px_propto_g is set, each kept
 * sentence's Px is rescaled by nsentences * g / sum_g.
 *
 * Allocation failures and a stream that is already at EOF are treated as
 * fatal via assert().
 */
corpus_type *read_corpus(corpusflags_type *flags, FILE *in)
{
  sentence_type s;
  feature_type fmax = 0;
  int nread, i = 0, maxnparses = 0, nloserparses = 0;
  int declared = 0;              /* sentence count announced by the header */
  Float sum_g = 0;

  /* allocate feature counts */
  read_parse_nfc_max = MIN_NFC;
  read_parse_fcp = MALLOC(read_parse_nfc_max*sizeof(fc_type));
  assert(read_parse_fcp != NULL);

  /* allocate features w/ 1 count */
  read_parse_nf_max = MIN_NF;
  read_parse_fp = MALLOC(read_parse_nf_max*sizeof(feature_type));
  assert(read_parse_fp != NULL);

  corpus_type *c = SMALLOC(sizeof(corpus_type));
  assert(c != NULL);

  /* Read the header into a plain int so that the %d conversion matches its
     argument whatever size_type is defined as. */
  nread = fscanf(in, " S = %d ", &declared);
  assert(nread != EOF);

  /* A missing header (nread == 0) or a non-positive count must not leave
     the capacity undefined or zero: doubling zero would never grow it. */
  size_type nsentences = (nread == 1 && declared > 0) ? declared : 1;

  c->sentence = MALLOC(nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);

  while (read_sentence(flags, in, &s, &fmax, &maxnparses) != EOF) {
    if (i >= nsentences) {
      nsentences *= 2;
      c->sentence = REALLOC(c->sentence, nsentences*sizeof(sentence_type));
      assert(c->sentence != NULL);
    }
    assert(i < nsentences);

    /* skip sentences with no winners but some parses -- these are
       typically parse failures. */
    if (s.Px == 0.0 && s.nparses != 0)
      continue;

    c->sentence[i++] = s;
    sum_g += s.g;
    if (s.Px > 0)
      nloserparses += s.nparses - 1;
  }

  /* shrink the array to the number of sentences actually kept */
  c->nsentences = i;
  c->sentence = SREALLOC(c->sentence, nsentences*sizeof(sentence_type),
                         c->nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);
  c->nfeatures = fmax+1;
  c->maxnparses = maxnparses;
  c->nloserparses = nloserparses;

  if (flags && flags->Px_propto_g)
    for (i = 0; i < c->nsentences; ++i)   /* normalize Px */
      c->sentence[i].Px *= c->nsentences * c->sentence[i].g / sum_g;

  /* release the feature buffers allocated at the top of this function */
  FREE(read_parse_fcp);
  FREE(read_parse_fp);

  return c;
}  /* read_corpus() */
/*
 * Fills s[0], s[1], ... by calling read_sentence() on `file` once per slot.
 * Reading stops after `max` sentences or at the first read_sentence() call
 * that returns 0, whichever comes first.
 *
 * Returns the number of slots filled.
 */
int read_sentences(FILE *file,sentence_type *s,int max)
{
    int count;

    for (count = 0; count < max; count++) {
        if (read_sentence(file, s + count) == 0) {
            break;
        }
    }
    return count;
}
/*
 * read_corpus() reads every sentence available on `in` into a newly
 * allocated corpus and returns it.
 *
 * nsentences is the caller's estimate of the number of sentences.  If the
 * stream starts with a header of the form "S = <n>", the header value
 * replaces the estimate.  Either way the value only sizes the initial
 * sentence array, which grows by doubling when needed; values below 1 are
 * raised to 1 so that doubling can make progress.
 *
 * When flags is non-NULL and flags->Px_propto_g is set, each sentence's Px
 * is rescaled by nsentences * g / sum_g.
 *
 * Allocation failures and a stream that is already at EOF are treated as
 * fatal via assert().
 */
corpus_type *read_corpus(corpusflags_type *flags, FILE *in, int nsentences)
{
  sentence_type s;
  feature_type fmax = 0;
  int nread, i = 0, maxnparses = 0;
  Float sum_g = 0;

  corpus_type *c = SMALLOC(sizeof(corpus_type));
  assert(c != NULL);

  /* On a failed match fscanf leaves nsentences at the caller's estimate. */
  nread = fscanf(in, " S = %d ", &nsentences);
  assert(nread != EOF);

  /* A zero or negative capacity would never grow under "*= 2". */
  if (nsentences < 1)
    nsentences = 1;

  c->sentence = MALLOC(nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);

  while (read_sentence(flags, in, &s, &fmax, &maxnparses) != EOF) {
    if (i >= nsentences) {
      nsentences *= 2;
      c->sentence = REALLOC(c->sentence, nsentences*sizeof(sentence_type));
      assert(c->sentence != NULL);
    }
    assert(i < nsentences);
    c->sentence[i++] = s;
    sum_g += s.g;
  }

  /* shrink the array to the number of sentences actually read */
  c->nsentences = i;
  c->sentence = SREALLOC(c->sentence, nsentences*sizeof(sentence_type),
                         c->nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);
  c->nfeatures = fmax+1;
  c->maxnparses = maxnparses;

  if (flags && flags->Px_propto_g)
    for (i = 0; i < c->nsentences; ++i)   /* normalize Px */
      c->sentence[i].Px *= c->nsentences * c->sentence[i].g / sum_g;

  return c;
}  /* read_corpus() */
/**
 * Rates a group of GSV sentences by the SNR values they contain.
 *
 * The second token of the first sentence gives the number of sentences in
 * the group. The remaining sentences are read from the stream and
 * tokenised, then every sentence is scanned: the token at index 7 and every
 * fourth token after it is treated as an SNR value. Empty SNR tokens are
 * ignored.
 *
 * @param in_sentence first sentence of the group, already tokenised
 * @param stream stream from which the remaining sentences are read
 * @return 2 if at least three SNR values are >= 35; 1 if at least three
 *         are >= 30; 0 otherwise
 */
int make_gsv(list_ptr in_sentence, stream_ptr stream) {
    /* Sentence count announced by the second token of the first sentence. */
    int total_lines = atoi(get_head(&in_sentence)->next->node_data);
    int strong = 0;   /* SNR values of 35 or more */
    int usable = 0;   /* SNR values from 30 to 34 */

    /* Collect every sentence of the group, starting with the one passed in. */
    list_ptr group;
    init_list(&group);

    node_ptr first;
    init_node(&first, in_sentence);
    add_to_list(&first, &group);

    int line_no;
    for (line_no = 1; line_no < total_lines; line_no++) {
        node_ptr extra;
        init_node(&extra, parseSentence(read_sentence(stream)));
        add_to_list(&extra, &group);
    }

    node_ptr line;
    for (line = get_head(&group); line != NULL; line = line->next) {
        int next_snr_index = 7;   /* index of the first SNR token in a line */
        int index = 0;
        node_ptr token;

        for (token = get_head((list_ptr *) &line->node_data);
             token != NULL;
             token = token->next, index++) {
            if (index != next_snr_index) {
                continue;
            }
            next_snr_index += 4;  /* SNR values repeat every four tokens */

            if (strcmp(token->node_data, "") == 0) {
                continue;         /* no SNR reported in this slot */
            }

            int snr = atoi(token->node_data);
            if (snr >= 35) {
                strong++;
            } else if (snr >= 30) {
                usable++;
            }
        }
    }

    if (strong >= 3) {
        return 2;
    }
    if (strong + usable >= 3) {
        return 1;
    }
    return 0;
}
/*
 * Builds one output sentence per candidate group by choosing, for every
 * token position, the (head, dependency label) pair with the largest total
 * weight among the candidates.
 *
 * Input is read from "<ap->out_file>.top" and results are written to
 * "<ap->out_file>.mbr_ps".  Each group is a candidate count followed by
 * that many (log-probability, sentence) pairs; reading stops when
 * read_cand_num() returns a value <= 0.
 *
 * Candidate j is weighted by exp(lprob[j] - lprob[last]).  For each
 * position t in 1..len, the gain of candidate i is the summed weight of
 * all candidates whose head and label at t match candidate i's; the first
 * candidate with the highest gain wins.  The output sentence starts as a
 * byte copy of the first candidate, and only its head, s_deprel and deprel
 * entries are overwritten.
 *
 * Exits with status 1 if a file name does not fit in MAX_NAME bytes or if
 * a group holds more than ap->cand_num candidates.
 */
void process_mbr_pred_str(APPROX_PARAMS *ap, MODEL_PARAMS *mp)
{
    char cand_fname[MAX_NAME];
    char out_file[MAX_NAME];
    int written;

    /* Bounded formatting: an over-long out_file must not overrun the buffers. */
    written = snprintf(cand_fname, sizeof cand_fname, "%s.top", ap->out_file);
    if (written < 0 || (size_t) written >= sizeof cand_fname) {
        fprintf(stderr, "Error: candidate file name is too long\n");
        exit(1);
    }
    written = snprintf(out_file, sizeof out_file, "%s.mbr_ps", ap->out_file);
    if (written < 0 || (size_t) written >= sizeof out_file) {
        fprintf(stderr, "Error: output file name is too long\n");
        exit(1);
    }

    DEF_FOPEN(ifp, cand_fname, "r");
    DEF_FOPEN(ofp, out_file, "w");

    SENTENCE *sents[ap->cand_num];
    double lprobs[ap->cand_num];
    double weights[ap->cand_num];   /* exp() of the shifted log-probabilities */
    int ind_num, c = 0;

    printf("[MBR] Reranking...");
    while ((ind_num = read_cand_num(ifp) ) > 0) {
        if (ind_num > ap->cand_num) {
            fprintf(stderr, "Error: Number of candiates in file (%d) in file exceeded maximum (CAND_NUM = %d)\n", ind_num, ap->cand_num);
            exit(1);
        }

        int i;
        for (i = 0; i < ind_num; i++) {
            lprobs[i] = read_lprob(ifp);
            sents[i] = read_sentence(mp, ifp, 1);
            ASSERT(sents[i] != NULL);
        }

        /* Shift by the last candidate's log-probability, then exponentiate
         * once per candidate instead of once per pair and token. */
        double del = lprobs[ind_num - 1];
        for (i = 0; i < ind_num; i++) {
            lprobs[i] -= del;
            weights[i] = exp(lprobs[i]);
        }

        DEF_ALLOC(res_sent, SENTENCE);
        /* NOTE(review): byte copy; assumes SENTENCE needs no deep copy. */
        memcpy(res_sent, sents[0], sizeof(SENTENCE));

        int t;
        for (t = 1; t < res_sent->len + 1; t++) {
            double max_gain = -1;
            int best_id = -1;
            for (i = 0; i < ind_num; i++) {
                double gain = 0;
                int j;
                for (j = 0; j < ind_num; j++) {
                    int equals = (sents[i]->head[t] == sents[j]->head[t])
                        && (strcmp(sents[i]->s_deprel[t], sents[j]->s_deprel[t]) == 0);
                    gain += equals * weights[j];
                }
                /* Strict comparison: the earliest candidate wins ties. */
                if (gain > max_gain) {
                    max_gain = gain;
                    best_id = i;
                }
            }
            ASSERT(best_id >= 0);
            res_sent->head[t] = sents[best_id]->head[t];
            strcpy(res_sent->s_deprel[t], sents[best_id]->s_deprel[t]);
            res_sent->deprel[t] = sents[best_id]->deprel[t];
        }

        save_sentence(ofp, res_sent, 1);

        free(res_sent);
        for (i = 0; i < ind_num; i++) {
            free(sents[i]);
            sents[i] = NULL;
        }

        /* One progress dot per group after the first. */
        if (c != 0) {
            printf(".");
        }
        fflush(stdout);
        c++;
    }
    printf("done. Processed %d sentences\n", c);

    fclose(ifp);
    fclose(ofp);
}
/*
 * Picks, for every candidate group, the single candidate sentence with the
 * highest weighted agreement with all candidates in the group, and saves it.
 *
 * Input is read from "<ap->out_file>.top" and results are written to
 * "<ap->out_file>.mbr_rr".  Each group is a candidate count followed by
 * that many (log-probability, sentence) pairs; reading stops when
 * read_cand_num() returns a value <= 0.
 *
 * Candidate j is weighted by exp((lprob[j] - lprob[last]) * ap->mbr_coeff).
 * The gain of candidate i is the sum over j of
 * get_matched_syntax(sents[i], sents[j], 1) * weight[j]; the first
 * candidate with the highest gain is written out.
 *
 * Exits with status 1 if a file name does not fit in MAX_NAME bytes or if
 * a group holds more than ap->cand_num candidates.
 */
void process_mbr_rerank(APPROX_PARAMS *ap, MODEL_PARAMS *mp)
{
    char cand_fname[MAX_NAME];
    char out_file[MAX_NAME];
    int written;

    /* Bounded formatting: an over-long out_file must not overrun the buffers. */
    written = snprintf(cand_fname, sizeof cand_fname, "%s.top", ap->out_file);
    if (written < 0 || (size_t) written >= sizeof cand_fname) {
        fprintf(stderr, "Error: candidate file name is too long\n");
        exit(1);
    }
    written = snprintf(out_file, sizeof out_file, "%s.mbr_rr", ap->out_file);
    if (written < 0 || (size_t) written >= sizeof out_file) {
        fprintf(stderr, "Error: output file name is too long\n");
        exit(1);
    }

    DEF_FOPEN(ifp, cand_fname, "r");
    DEF_FOPEN(ofp, out_file, "w");

    SENTENCE *sents[ap->cand_num];
    double lprobs[ap->cand_num];
    double weights[ap->cand_num];   /* exp() of the scaled log-probabilities */
    int ind_num, c = 0;

    printf("[MBR] Reranking...");
    while ((ind_num = read_cand_num(ifp) ) > 0) {
        if (ind_num > ap->cand_num) {
            fprintf(stderr, "Error: Number of candiates in file (%d) in file exceeded maximum (CAND_NUM = %d)\n", ind_num, ap->cand_num);
            exit(1);
        }

        int i;
        for (i = 0; i < ind_num; i++) {
            lprobs[i] = read_lprob(ifp);
            sents[i] = read_sentence(mp, ifp, 1);
            ASSERT(sents[i] != NULL);
        }

        /* Shift by the last candidate's log-probability, then compute each
         * weight once instead of once per candidate pair. */
        double del = lprobs[ind_num - 1];
        for (i = 0; i < ind_num; i++) {
            lprobs[i] -= del;
            weights[i] = exp(lprobs[i] * ap->mbr_coeff);
        }

        double max_gain = -1;
        int best_id = -1;
        for (i = 0; i < ind_num; i++) {
            double gain = 0;
            int j;
            for (j = 0; j < ind_num; j++) {
                gain += get_matched_syntax(sents[i], sents[j], 1) * weights[j];
            }
            /* Strict comparison: the earliest candidate wins ties. */
            if (gain > max_gain) {
                max_gain = gain;
                best_id = i;
            }
        }
        ASSERT(best_id >= 0);

        save_sentence(ofp, sents[best_id], 1);

        for (i = 0; i < ind_num; i++) {
            free(sents[i]);
            sents[i] = NULL;
        }

        /* One progress dot per group after the first. */
        if (c != 0) {
            printf(".");
        }
        fflush(stdout);
        c++;
    }
    printf("done. Processed %d sentences\n", c);

    fclose(ifp);
    fclose(ofp);
}