Пример #1
0
/*
 * Command-line driver for the hunpos tagger.
 *
 * argv[1] is the model file (required) and argv[2] the morph table
 * (optional).  After the tagger is created, sentences are fetched with
 * read_sentence() and handed to hunpos_tagger_tag() until read_sentence()
 * returns a non-positive token count.
 *
 * Returns 0 on success and 1 on a usage error, a model-load failure or an
 * allocation failure.
 */
int main(int argc, char ** argv)
{
  char *modelfile = NULL;
  char *morphtablefile = NULL;
  if(argc < 2)
  {
    cerr("usage: test model_file morph_table\n");
    return 1;
  }
  modelfile = argv[1];
  if(argc > 2) {
    morphtablefile = argv[2];
  }
  int error = 0;
  Hunpos hp = hunpos_tagger_new(modelfile, morphtablefile, 3, 1000, &error);
  if(error) {
    cerr("Failed to load model file\n");
    hunpos_tagger_destroy(hp, &error);
    return 1;
  }

  char* tokens[MAX_SENT_LENGTH];
  int i, n;
  int status = 0;

  /* Start from all-NULL so the cleanup loop below is safe even when an
     allocation fails part-way through. */
  for (i=0; i<MAX_SENT_LENGTH;i++)
  {
    tokens[i] = NULL;
  }
  for (i=0; i<MAX_SENT_LENGTH;i++)
  {
    tokens[i] =  (char *) malloc(MAX_TOKEN_LENGTH * sizeof(char));
    if (tokens[i] == NULL)
    {
      cerr("Out of memory\n");
      status = 1;
      break;
    }
  }

  if (status == 0)
  {
    while((n = read_sentence(tokens)) > 0)
    {
      error = 0;
      hunpos_tagger_tag(hp, n, tokens, get_token, tokens, add_tag, &error);
      if (error)
      {
        /* Report the failure but keep going with the next sentence. */
        cerr("Failed to tag sentence\n");
      }
      printf ("\n");
    }
  }

  /* Release everything acquired above; free(NULL) is a no-op. */
  for (i=0; i<MAX_SENT_LENGTH;i++)
  {
    free(tokens[i]);
  }
  hunpos_tagger_destroy(hp, &error);
  return status;
}
Пример #2
0
/* read_corpus() reads a whole corpus from in.
 *
 * The stream must start with a header of the form "S = <count>", where
 * <count> is the expected number of sentences; it is only used as the
 * initial capacity of the sentence array, which grows by doubling.
 * Sentences are then read with read_sentence() until it returns EOF.
 * Sentences with Px == 0 but a non-zero number of parses are skipped.
 *
 * If flags is non-NULL and flags->Px_propto_g is set, each sentence's Px is
 * rescaled by nsentences * g / sum_g once all sentences have been read.
 *
 * The file-scope scratch buffers read_parse_fcp and read_parse_fp are
 * allocated on entry and freed before returning.
 *
 * Returns the newly allocated corpus.  Allocation failures and a missing or
 * malformed header are caught by assert().
 */
corpus_type *read_corpus(corpusflags_type *flags, FILE *in) {
  sentence_type s;
  feature_type fmax = 0;
  int nread, i = 0, maxnparses = 0, nloserparses = 0;
  int declared = 0;   /* sentence count announced by the header */
  Float sum_g = 0;

  /* allocate feature counts */
  read_parse_nfc_max = MIN_NFC;
  read_parse_fcp = MALLOC(read_parse_nfc_max*sizeof(fc_type));
  assert(read_parse_fcp != NULL);
  /* allocate features w/ 1 count */
  read_parse_nf_max = MIN_NF;
  read_parse_fp = MALLOC(read_parse_nf_max*sizeof(feature_type));
  assert(read_parse_fp != NULL);

  corpus_type *c = SMALLOC(sizeof(corpus_type));
  assert(c != NULL);

  /* Read the count into a plain int so that "%d" matches its argument
     whatever size_type is, and insist on exactly one converted item: a
     header that does not match yields 0, not EOF. */
  nread = fscanf(in, " S = %d ", &declared);
  assert(nread == 1);
  assert(declared >= 0);

  /* Keep the capacity at least 1; a capacity of 0 could never grow by
     doubling. */
  size_type nsentences = declared > 0 ? (size_type) declared : 1;
  c->sentence = MALLOC(nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);

  while (read_sentence(flags, in, &s, &fmax, &maxnparses) != EOF) {
    if (i >= nsentences) {
      nsentences *= 2;
      c->sentence = REALLOC(c->sentence, nsentences*sizeof(sentence_type));
      assert(c->sentence != NULL);
    }
    assert(i < nsentences);

    /* skip sentences with no winners but some parses -- these are
       typically parse failures. */
    if (s.Px == 0.0 && s.nparses != 0)
      continue;

    c->sentence[i++] = s;
    sum_g += s.g;
    if (s.Px > 0)
      nloserparses += s.nparses - 1;
  }

  /* shrink the array to the number of sentences actually kept */
  c->nsentences = i;
  c->sentence = SREALLOC(c->sentence, nsentences*sizeof(sentence_type),
			 c->nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);
  c->nfeatures = fmax+1;
  c->maxnparses = maxnparses;
  c->nloserparses = nloserparses;

  if (flags && flags->Px_propto_g)
    for (i = 0; i < c->nsentences; ++i)  /* normalize Px */
      c->sentence[i].Px *= c->nsentences * c->sentence[i].g / sum_g;

  FREE(read_parse_fcp);
  FREE(read_parse_fp);

  return c;
}  /* read_corpus() */
Пример #3
0
/* Fills s[0..max-1] by calling read_sentence() repeatedly on file.
 * Stops at the first call that returns 0 or once max entries have been
 * filled, whichever comes first.
 * Returns the number of entries for which read_sentence() returned non-zero.
 */
int read_sentences(FILE *file,sentence_type *s,int max)
{
  int count;

  for (count = 0; count < max; count++) {
    if (read_sentence(file, s + count) == 0)
      break;
  }

  return count;
}
Пример #4
0
/* read_corpus() reads a whole corpus from in.
 *
 * nsentences is the initial capacity of the sentence array.  A header of
 * the form "S = <count>" at the start of the stream overrides it; when the
 * header does not match, fscanf() leaves the caller-supplied value
 * untouched and that value is used instead.  The array grows by doubling
 * while read_sentence() keeps returning something other than EOF.
 *
 * If flags is non-NULL and flags->Px_propto_g is set, each sentence's Px is
 * rescaled by nsentences * g / sum_g once all sentences have been read.
 *
 * Returns the newly allocated corpus.  Allocation failures and end-of-file
 * while reading the header are caught by assert().
 */
corpus_type *read_corpus(corpusflags_type *flags, FILE *in, int nsentences) {
  sentence_type s;
  feature_type fmax = 0;
  int nread, i = 0, maxnparses = 0;
  Float sum_g = 0;

  corpus_type *c = SMALLOC(sizeof(corpus_type));
  assert(c != NULL);

  nread = fscanf(in, " S = %d ", &nsentences);
  assert(nread != EOF);

  /* A zero or negative capacity can never grow by doubling (and a negative
     one would request a huge allocation), so start from at least one slot. */
  if (nsentences < 1)
    nsentences = 1;

  c->sentence = MALLOC(nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);

  while (read_sentence(flags, in, &s, &fmax, &maxnparses) != EOF) {
    if (i >= nsentences) {
      nsentences *= 2;
      c->sentence = REALLOC(c->sentence, nsentences*sizeof(sentence_type));
      assert(c->sentence != NULL);
    }
    assert(i < nsentences);
    c->sentence[i++] = s;
    sum_g += s.g;
  }

  /* shrink the array to the number of sentences actually read */
  c->nsentences = i;
  c->sentence = SREALLOC(c->sentence, nsentences*sizeof(sentence_type),
			 c->nsentences*sizeof(sentence_type));
  assert(c->sentence != NULL);
  c->nfeatures = fmax+1;
  c->maxnparses = maxnparses;

  /* flags may be NULL; in that case no normalization is requested */
  if (flags && flags->Px_propto_g)
    for (i = 0; i < c->nsentences; ++i)  /* normalize Px */
      c->sentence[i].Px *= c->nsentences * c->sentence[i].g / sum_g;

  return c;
}  /* read_corpus() */
/**
 * Rates fix quality from the SNR values of a group of GSV sentences.
 * The second token of the first sentence is read as the number of sentences
 * in the group; the remaining sentences are read directly from the stream
 * and tokenised. Every sentence is then scanned for SNR values, which sit
 * at token index 7 (zero-based) and every 4th token after that. Empty SNR
 * tokens are ignored.
 * @param in_sentence tokenised first sentence of the group
 * @param stream the stream that generated the sentence; further sentences
 *        of the group are read from it
 * @return 2 if at least three SNR values are >= 35, 1 if at least three SNR
 *         values are >= 30, otherwise 0. 0 is also returned when the first
 *         sentence has fewer than two tokens.
 */
int make_gsv(list_ptr in_sentence, stream_ptr stream) {

  const int first_snr_token = 7; // index of the first SNR token in a line
  const int snr_token_stride = 4; // distance between consecutive SNR tokens
  const int good_snr = 35; // lower bound of a "good" SNR value
  const int min_snr = 30; // lower bound of a "minimum" SNR value
  const int required_count = 3; // SNR values needed to grant a fix level

  //The line count lives in the second token; without it there is nothing
  //to rate, so report "no fix" instead of dereferencing a missing node
  node_ptr first_token = get_head(&in_sentence);
  if (first_token == NULL || first_token->next == NULL) {
    return 0;
  }

  //Get the number of GSV lines from second token in the sentence
  int num_lines = atoi(first_token->next->node_data);
  int good_snr_count = 0;
  int min_snr_count = 0;

  //Make a list to hold GSVs, up to 3
  list_ptr gsv_lines;
  init_list(&gsv_lines);

  //passed in sentence tokens added to GSV list
  node_ptr original_line;
  init_node(&original_line, in_sentence);
  add_to_list(&original_line, &gsv_lines);

  //Add the other gsv lines to the list
  int i;
  for (i = 1; i < num_lines; i++) {
    node_ptr gsv_line;
    init_node(&gsv_line, parseSentence(read_sentence(stream)));
    add_to_list(&gsv_line, &gsv_lines);
  }

  //Iterator for the GSV lines
  node_ptr gsv_iterator = get_head(&gsv_lines);

  //Iterator for sentence tokens in the GSV lines
  node_ptr sentence_token;
  while (gsv_iterator != NULL) {

    int tokencount = 0; //track number of tokens encountered
    int snr_value_token = first_snr_token; //index of the next SNR token

    //get first token in sentence
    sentence_token = get_head((list_ptr *) & gsv_iterator->node_data);

    //go through each sentence and count values for good/min fix
    while (sentence_token != NULL) {

      if (tokencount == snr_value_token) {

        //an empty token means no SNR was reported for this satellite
        if (!(strcmp(sentence_token->node_data, "") == 0)) {
          int snr = atoi(sentence_token->node_data);

          if (snr >= good_snr) {
            good_snr_count++;
          }
          if ((snr >= min_snr) && (snr < good_snr)) {
            min_snr_count++;
          }
        }

        snr_value_token += snr_token_stride;
      }

      sentence_token = sentence_token->next;
      tokencount++;
    }
    gsv_iterator = gsv_iterator -> next;
  }

  if (good_snr_count >= required_count) {
    return 2;
  } else if ((good_snr_count + min_snr_count) >= required_count) {
    return 1;
  } else {
    return 0;
  }

}
Пример #6
0
/*
 * Builds one output sentence per candidate list by choosing, position by
 * position, the (head, s_deprel) pair with the largest gain.
 *
 * Candidates are read from "<ap->out_file>.top" and the result is written
 * to "<ap->out_file>.mbr_ps".  For each list, read_cand_num() gives the
 * number of candidates; the loop ends when it returns a value <= 0.  Every
 * candidate's log-probability is shifted by that of the last candidate in
 * the list.  For each position t in 1..len, the gain of candidate i is the
 * sum of exp(lprob[j]) over all candidates j that agree with i on head[t]
 * and s_deprel[t].  The winner's head, s_deprel and deprel at t are copied
 * into the result, which starts as a byte copy of the first candidate.
 *
 * Exits the process with status 1 if a file name does not fit in MAX_NAME
 * bytes or if a list holds more than ap->cand_num candidates.
 */
void process_mbr_pred_str(APPROX_PARAMS *ap, MODEL_PARAMS *mp) {
    /* Build both file names with bounded formatting; an over-long
       ap->out_file would otherwise overflow the fixed-size buffers. */
    char cand_fname[MAX_NAME];
    int name_len = snprintf(cand_fname, sizeof cand_fname, "%s.top", ap->out_file);
    if (name_len < 0 || (size_t) name_len >= sizeof cand_fname) {
        fprintf(stderr, "Error: candidate file name for '%s' is too long\n", ap->out_file);
        exit(1);
    }

    char out_file[MAX_NAME];
    name_len = snprintf(out_file, sizeof out_file, "%s.mbr_ps", ap->out_file);
    if (name_len < 0 || (size_t) name_len >= sizeof out_file) {
        fprintf(stderr, "Error: output file name for '%s' is too long\n", ap->out_file);
        exit(1);
    }

    DEF_FOPEN(ifp, cand_fname, "r");
    DEF_FOPEN(ofp, out_file, "w");

    SENTENCE *sents[ap->cand_num];
    double lprobs[ap->cand_num];
    int ind_num, c = 0;
    printf("[MBR] Reranking...");
    while ((ind_num = read_cand_num(ifp) ) > 0) {
        if (ind_num > ap->cand_num) {
            fprintf(stderr, "Error: Number of candiates in file (%d) in file exceeded maximum (CAND_NUM = %d)\n", ind_num, ap->cand_num);
            exit(1);
        }
        int i;
        for (i = 0; i < ind_num; i++) {
            lprobs[i] =  read_lprob(ifp);
            sents[i] = read_sentence(mp, ifp, 1);
            ASSERT(sents[i] != NULL);
        }

        /* shift all log-probabilities by the last candidate's value */
        double del = lprobs[ind_num - 1];
        for (i = 0; i < ind_num; i++) {
            lprobs[i] -= del;
        }

        /* the result starts as a byte copy of the first candidate */
        DEF_ALLOC(res_sent, SENTENCE);
        memcpy(res_sent, sents[0], sizeof(SENTENCE));

        int t;
        for (t = 1; t < res_sent->len + 1; t++) {
            /* gains are sums of exp() values, so -1 is below any real gain */
            double max_gain = -1; int best_id = -1;
            for(i = 0; i < ind_num; i++) {
                double gain = 0;
                int j;
                for (j = 0; j < ind_num; j++) {
                    int equals = (sents[i]->head[t] == sents[j]->head[t])
                        && (strcmp(sents[i]->s_deprel[t], sents[j]->s_deprel[t]) == 0);
                    gain += equals * exp(lprobs[j]);
                }
                if (gain > max_gain) {
                    max_gain = gain;
                    best_id = i;
                }
            }

            ASSERT(best_id >= 0);
            res_sent->head[t] = sents[best_id]->head[t];
            strcpy(res_sent->s_deprel[t], sents[best_id]->s_deprel[t]);
            res_sent->deprel[t] = sents[best_id]->deprel[t];
        }

        save_sentence(ofp, res_sent, 1);
        free(res_sent);
        for (i = 0; i < ind_num; i++) {
            free(sents[i]);
            sents[i] = NULL;
        }

        /* one progress dot per sentence after the first */
        if (c != 0) {
            printf(".");
        }
        fflush(stdout);
        c++;
    }
    printf("done. Processed %d sentences\n", c);
    fclose(ifp);
    fclose(ofp);
}
Пример #7
0
/*
 * Reranks candidate lists: for each list, writes out the candidate with the
 * largest gain.
 *
 * Candidates are read from "<ap->out_file>.top" and the winners are written
 * to "<ap->out_file>.mbr_rr".  For each list, read_cand_num() gives the
 * number of candidates; the loop ends when it returns a value <= 0.  Every
 * candidate's log-probability is shifted by that of the last candidate in
 * the list.  The gain of candidate i is the sum over all candidates j of
 * get_matched_syntax(sents[i], sents[j], 1) * exp(lprob[j] * ap->mbr_coeff).
 *
 * Exits the process with status 1 if a file name does not fit in MAX_NAME
 * bytes or if a list holds more than ap->cand_num candidates.
 */
void process_mbr_rerank(APPROX_PARAMS *ap, MODEL_PARAMS *mp) {
    /* Build both file names with bounded formatting; an over-long
       ap->out_file would otherwise overflow the fixed-size buffers. */
    char cand_fname[MAX_NAME];
    int name_len = snprintf(cand_fname, sizeof cand_fname, "%s.top", ap->out_file);
    if (name_len < 0 || (size_t) name_len >= sizeof cand_fname) {
        fprintf(stderr, "Error: candidate file name for '%s' is too long\n", ap->out_file);
        exit(1);
    }

    char out_file[MAX_NAME];
    name_len = snprintf(out_file, sizeof out_file, "%s.mbr_rr", ap->out_file);
    if (name_len < 0 || (size_t) name_len >= sizeof out_file) {
        fprintf(stderr, "Error: output file name for '%s' is too long\n", ap->out_file);
        exit(1);
    }

    DEF_FOPEN(ifp, cand_fname, "r");
    DEF_FOPEN(ofp, out_file, "w");

    SENTENCE *sents[ap->cand_num];
    double lprobs[ap->cand_num];
    int ind_num, c = 0;
    printf("[MBR] Reranking...");
    while ((ind_num = read_cand_num(ifp) ) > 0) {
        if (ind_num > ap->cand_num) {
            fprintf(stderr, "Error: Number of candiates in file (%d) in file exceeded maximum (CAND_NUM = %d)\n", ind_num, ap->cand_num);
            exit(1);
        }
        int i;
        for (i = 0; i < ind_num; i++) {
            lprobs[i] =  read_lprob(ifp);
            sents[i] = read_sentence(mp, ifp, 1);
            ASSERT(sents[i] != NULL);
        }

        /* shift all log-probabilities by the last candidate's value */
        double del = lprobs[ind_num - 1];
        for (i = 0; i < ind_num; i++) {
            lprobs[i] -= del;
        }

        /* -1 is the "no candidate chosen yet" starting point for the gain */
        double max_gain = -1; int best_id = -1;
        for(i = 0; i < ind_num; i++) {
            double gain = 0;
            int j;
            for (j = 0; j < ind_num; j++) {
                gain += get_matched_syntax(sents[i], sents[j], 1) * exp(lprobs[j] * ap->mbr_coeff);
            }
            if (gain > max_gain) {
                max_gain = gain;
                best_id = i;
            }
        }

        ASSERT(best_id >= 0);
        save_sentence(ofp, sents[best_id], 1);

        for (i = 0; i < ind_num; i++) {
            free(sents[i]);
            sents[i] = NULL;
        }

        /* one progress dot per sentence after the first */
        if (c != 0) {
            printf(".");
        }
        fflush(stdout);
        c++;
    }
    printf("done. Processed %d sentences\n", c);
    fclose(ifp);
    fclose(ofp);
}