Exemplo n.º 1
0
  /// Creates a tagger for the requested type of tagging and loads all the
  /// SENNA resources (hash tables, models and tokenizer) found under the
  /// directory named by SennaPath.
  ///
  /// @param type value stored in type_.
  Tagger::Tagger(uint8_t type)
    : type_(type) {
    SENNA_set_verbose_mode(false);

    // Every loader below is given the same base directory.
    const char* const base_dir = SennaPath.c_str();

    // Hash tables describing the input words.
    word_hash_ = SENNA_Hash_new(base_dir, "hash/words.lst");
    caps_hash_ = SENNA_Hash_new(base_dir, "hash/caps.lst");
    suff_hash_ = SENNA_Hash_new(base_dir, "hash/suffix.lst");
    gazt_hash_ = SENNA_Hash_new(base_dir, "hash/gazetteer.lst");

    // Gazetteer hash tables, each restricted by its admissible-keys file.
    gazl_hash_ = SENNA_Hash_new_with_admissible_keys(
        base_dir, "hash/ner.loc.lst", "data/ner.loc.dat");
    gazm_hash_ = SENNA_Hash_new_with_admissible_keys(
        base_dir, "hash/ner.msc.lst", "data/ner.msc.dat");
    gazo_hash_ = SENNA_Hash_new_with_admissible_keys(
        base_dir, "hash/ner.org.lst", "data/ner.org.dat");
    gazp_hash_ = SENNA_Hash_new_with_admissible_keys(
        base_dir, "hash/ner.per.lst", "data/ner.per.dat");

    // Hash tables holding the label names of each task.
    pos_hash_ = SENNA_Hash_new(base_dir, "hash/pos.lst");
    chk_hash_ = SENNA_Hash_new(base_dir, "hash/chk.lst");
    pt0_hash_ = SENNA_Hash_new(base_dir, "hash/pt0.lst");
    ner_hash_ = SENNA_Hash_new(base_dir, "hash/ner.lst");
    vbs_hash_ = SENNA_Hash_new(base_dir, "hash/vbs.lst");
    srl_hash_ = SENNA_Hash_new(base_dir, "hash/srl.lst");
    psg_left_hash_ = SENNA_Hash_new(base_dir, "hash/psg-left.lst");
    psg_right_hash_ = SENNA_Hash_new(base_dir, "hash/psg-right.lst");

    // One model per task.
    pos_ = SENNA_POS_new(base_dir, "data/pos.dat");
    chk_ = SENNA_CHK_new(base_dir, "data/chk.dat");
    pt0_ = SENNA_PT0_new(base_dir, "data/pt0.dat");
    ner_ = SENNA_NER_new(base_dir, "data/ner.dat");
    vbs_ = SENNA_VBS_new(base_dir, "data/vbs.dat");
    srl_ = SENNA_SRL_new(base_dir, "data/srl.dat");
    psg_ = SENNA_PSG_new(base_dir, "data/psg.dat");

    // The tokenizer works on the input hash tables loaded above; the last
    // argument (false) is the same value the original call passed.
    tokenizer_ = SENNA_Tokenizer_new(word_hash_,
                                     caps_hash_,
                                     suff_hash_,
                                     gazt_hash_,
                                     gazl_hash_,
                                     gazm_hash_,
                                     gazo_hash_,
                                     gazp_hash_,
                                     false);
  }
Exemplo n.º 2
0
/*
 * Allocates a SENNA context and fills it in: the input hash tables, the POS
 * and PSG label tables and models, the tokenizer, an empty "last sentence"
 * cache and a 512-byte string buffer.
 *
 * The chk/pt0/ner/vbs/srl label tables and models are not loaded here.
 *
 * opt_path is handed unchanged to every SENNA loader.
 * The returned pointer has to be freed using freeSenna.
 */
SENNA* sennaCreate(const char * opt_path) {
    /* initial capacity, in bytes, of the string buffer */
    enum { STRBUF_INITIAL_SIZE = 512 };

    SENNA* senna = (SENNA*) malloc(sizeof(SENNA));
    CHECK_ALLOC(senna);

    /* input hash tables */
    senna->word_hash = SENNA_Hash_new(opt_path, "hash/words.lst");
    senna->caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst");
    senna->suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst");
    senna->gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst");

    /* gazetteer hash tables, each with its admissible-keys file */
    senna->gazl_hash = SENNA_Hash_new_with_admissible_keys(
                           opt_path, "hash/ner.loc.lst", "data/ner.loc.dat");
    senna->gazm_hash = SENNA_Hash_new_with_admissible_keys(
                           opt_path, "hash/ner.msc.lst", "data/ner.msc.dat");
    senna->gazo_hash = SENNA_Hash_new_with_admissible_keys(
                           opt_path, "hash/ner.org.lst", "data/ner.org.dat");
    senna->gazp_hash = SENNA_Hash_new_with_admissible_keys(
                           opt_path, "hash/ner.per.lst", "data/ner.per.dat");

    /* label hash tables (POS and PSG only) */
    senna->pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst");
    senna->psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst");
    senna->psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst");

    /* models (POS and PSG only) */
    senna->pos = SENNA_POS_new(opt_path, "data/pos.dat");
    senna->psg = SENNA_PSG_new(opt_path, "data/psg.dat");

    senna->tokenizer = SENNA_Tokenizer_new(senna->word_hash, senna->caps_hash,
                                           senna->suff_hash, senna->gazt_hash,
                                           senna->gazl_hash, senna->gazm_hash,
                                           senna->gazo_hash, senna->gazp_hash,
                                           0);

    /* no sentence has been processed yet */
    senna->lastSentence.tokens = NULL;
    senna->lastSentence.pos_labels = NULL;
    senna->lastSentence.psg_labels = NULL;

    /* string buffer: empty, with its initial capacity */
    senna->strbuf.ptr = (char *) malloc(STRBUF_INITIAL_SIZE * sizeof(char));
    CHECK_ALLOC(senna->strbuf.ptr);
    senna->strbuf.length = STRBUF_INITIAL_SIZE;
    senna->strbuf.pos = 0;

    return senna;
}
Exemplo n.º 3
0
/*
 * Command-line front end of the tagger.
 *
 * Parses the options, loads every SENNA hash table and model from opt_path,
 * then reads the input one line (one sentence) at a time with fgets and
 * prints one line per token, followed by an empty line after each sentence.
 *
 * Flags:            -verbose -notokentags -offsettags -iobtags -brackettags
 *                   -posvbs -usrtokens -pos -chk -ner -srl -psg
 * Options + value:  -path <dir>  -usrvbs <file>  -maxsentsize <n>
 *                   -input_pipe <name>  -output_pipe <name>
 *
 * When none of -pos/-chk/-ner/-srl/-psg is given, all five are enabled.
 * With -input_pipe the read/print cycle is repeated forever; the output pipe
 * (if any) is reopened at the start of each cycle and closed at its end.
 *
 * NOTE(review): the argument checks rely on SENNA_error not returning
 * (presumably it terminates the program) - confirm.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  int i, j;

  /* options */
  char *opt_path = NULL;
  int opt_verbose = 0;
  int opt_notokentags = 0;
  int opt_offsettags = 0;
  int opt_iobtags = 0;
  int opt_brackettags = 0;
  int opt_posvbs = 0;
  int opt_usrtokens = 0;
  int opt_pos = 0;
  int opt_chk = 0;
  int opt_ner = 0;
  int opt_srl = 0;
  int opt_psg = 0;
  FILE *opt_usrvbs = NULL;
  FILE *senna_input = stdin;
  FILE *senna_output = stdout;
  int  pipe_mode = 0;
  char *output_pipe = NULL;

  for(i = 1; i < argc; i++)
  {
    if(!strcmp(argv[i], "-verbose"))
      opt_verbose = 1;
    else if(!strcmp(argv[i], "-notokentags"))
      opt_notokentags = 1;
    else if(!strcmp(argv[i], "-offsettags"))
      opt_offsettags = 1;
    else if(!strcmp(argv[i], "-iobtags"))
      opt_iobtags = 1;
    else if(!strcmp(argv[i], "-brackettags"))
      opt_brackettags = 1;
    else if(!strcmp(argv[i], "-path"))
    {
      if(i+1 >= argc)
        SENNA_error("please provide a path for the -path option");
      opt_path = argv[i+1];
      i++;
    }
    else if(!strcmp(argv[i], "-posvbs"))
      opt_posvbs = 1;
    else if(!strcmp(argv[i], "-usrtokens"))
      opt_usrtokens = 1;
    else if(!strcmp(argv[i], "-usrvbs"))
    {
      if(i+1 >= argc)
        SENNA_error("please provide a filename for the -usrvbs option");
      opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb");
      i++;
    }
    else if(!strcmp(argv[i], "-pos"))
      opt_pos = 1;
    else if(!strcmp(argv[i], "-chk"))
      opt_chk = 1;
    else if(!strcmp(argv[i], "-ner"))
      opt_ner = 1;
    else if(!strcmp(argv[i], "-srl"))
      opt_srl = 1;
    else if(!strcmp(argv[i], "-psg"))
      opt_psg = 1;
    else if(!strcmp(argv[i], "-maxsentsize")) {
      if(i+1 >= argc)
        SENNA_error("please provide a sentence size for the -maxsentsize option");
      max_sent_size = atol(argv[i+1]);
      /* zero is rejected too: fgets with a 1-byte buffer never consumes any
         input, so the read loop below would spin forever on empty strings */
      if(max_sent_size <= 0)
        SENNA_error("provide a positive value for the -maxsentsize option");
      i++;
    } else if(!strcmp(argv[i], "-input_pipe")){
      if(i+1 >= argc)
        SENNA_error("please provide the name of the input pipe");
      senna_input = fopen(argv[i+1], "r");
      if (senna_input == NULL) {
        SENNA_error("cannot open the input named pipe");
      }
      pipe_mode = 1;
      i++;
    } else if(!strcmp(argv[i], "-output_pipe")){
      if(i+1 >= argc)
        SENNA_error("please provide the name of the outputpipe");
      output_pipe = argv[i+1];
      i++;
    } else {
      printf("invalid argument: %s\n", argv[i]);
      help(argv[0]);
    }
  }

  SENNA_set_verbose_mode(opt_verbose);

  if(!opt_pos && !opt_chk && !opt_ner && !opt_srl && !opt_psg) /* the user does not know what he wants... */
    opt_pos = opt_chk = opt_ner = opt_srl = opt_psg = 1;     /* so give him everything (aren't we insane?) */


  /* the real thing */
  {
    char *sentence = NULL;
    char target_vb[MAX_TARGET_VB_SIZE];
    int *chk_labels = NULL;
    int *pt0_labels = NULL;
    int *pos_labels = NULL;
    int *ner_labels = NULL;
    int *vbs_labels = NULL;
    int **srl_labels = NULL;
    int *psg_labels = NULL;
    int n_psg_level = 0;
    int is_psg_one_segment = 0;
    /* label index that SENNA_VBS_forward yields for "not a verb";
       presumably tied to the content of hash/vbs.lst - TODO confirm */
    int vbs_hash_novb_idx = 22;
    int n_verbs = 0;

    sentence = malloc(max_sent_size + 1);
    if(sentence == NULL)
      SENNA_error("unable to allocate the sentence buffer");
    SENNA_message("Maximum sentence size %ld", max_sent_size);
    
    /* inputs */
    SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst");
    SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst");
    SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst");
    SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst");

    SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat");
    SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat");
    SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat");
    SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat");

    /* labels */
    SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst");
    SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst");
    SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst");
    SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst");
    SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst");
    SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst");
    SENNA_Hash *psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst");
    SENNA_Hash *psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst");

    SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat");
    SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat");
    SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat");
    SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat");
    SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat");
    SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat");
    SENNA_PSG *psg = SENNA_PSG_new(opt_path, "data/psg.dat");

    SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens);

    /* optionally rewrite the label names of the chunk/NER/SRL tables */
    if(opt_iobtags)
    {
      SENNA_Hash_convert_IOBES_to_IOB(chk_hash);
      SENNA_Hash_convert_IOBES_to_IOB(ner_hash);
      SENNA_Hash_convert_IOBES_to_IOB(srl_hash);
    }
    else if(opt_brackettags)
    {
      SENNA_Hash_convert_IOBES_to_brackets(chk_hash);
      SENNA_Hash_convert_IOBES_to_brackets(ner_hash);
      SENNA_Hash_convert_IOBES_to_brackets(srl_hash);
    }

    SENNA_message("ready");

    do {
      if (output_pipe) {
        senna_output = fopen(output_pipe, "w");
        if (senna_output == NULL) {
          SENNA_error("cannot open the output named pipe");
        }
      } else {
        senna_output = stdout;
      }

      while(fgets(sentence, max_sent_size + 1, senna_input))
      {
        SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence);
      
        if(tokens->n == 0)
          continue;

        /* POS tags are always computed: the other taggers take them as input */
        pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n);
        if(opt_chk)
          chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        if(opt_srl)
          pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        if(opt_ner)
          ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n);
        if(opt_srl)
        {
          /* vbs_labels[i] becomes 1 when token i is a verb, 0 otherwise;
             n_verbs counts the verbs of the sentence */
          if(opt_usrvbs)
          {
            /* one line of the user file per token, then one empty line */
            vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs))
                SENNA_error("invalid user verbs file\n");
              /* a lone "-" means "not a verb"; isspace needs an unsigned char value */
              vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace((unsigned char)target_vb[1])) );
              n_verbs += vbs_labels[i];
            }
            if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs))
              SENNA_error("invalid user verbs file\n");
            if(strlen(target_vb) > 0)
              SENNA_error("sentence size does not match in user verbs file");
          }
          else if(opt_posvbs)
          {
            /* a token is a verb when its POS tag name starts with 'V' */
            vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V');
              n_verbs += vbs_labels[i];
            }
          }
          else
          {
            /* labels come from the verb model and are turned into 0/1 in place */
            vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx);
              n_verbs += vbs_labels[i];
            }
          }
        }

        if(opt_srl)
          srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n);

        if(opt_psg)
        {
          SENNA_PSG_forward(psg, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n, &psg_labels, &n_psg_level);

          /* check if top level takes the full sentence */
          {
            /* psg_labels is indexed as level*tokens->n + token */
            int *psg_top_labels = psg_labels + (n_psg_level-1)*tokens->n;

            if(tokens->n == 1)
              is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */
            else
              is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[tokens->n-1]-1) % 4 == 2); /* B- or E- ? */

            for(i = 1; is_psg_one_segment && (i < tokens->n-1); i++)
            {
              if((psg_top_labels[i]-1) % 4 != 1) /* I- ? */
                is_psg_one_segment = 0;
            }
          }
        }

        /* one output line per token, columns separated by tabs */
        for(i = 0; i < tokens->n; i++)
        {
          if(!opt_notokentags)
            fprintf(senna_output, "%15s", tokens->words[i]);
          if(opt_offsettags)
            fprintf(senna_output, "\t%d %d", tokens->start_offset[i], tokens->end_offset[i]);
          if(opt_pos)
            fprintf(senna_output, "\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i]));
          if(opt_chk)
            fprintf(senna_output, "\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i]));
          if(opt_ner)
            fprintf(senna_output, "\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i]));
          if(opt_srl)
          {
            /* the verb column, then one SRL column per verb of the sentence */
            fprintf(senna_output, "\t%15s", (vbs_labels[i] ? tokens->words[i] : "-"));
            for(j = 0; j < n_verbs; j++)
              fprintf(senna_output, "\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i]));
          }
          if(opt_psg) /* last, can be long */
          {
            fprintf(senna_output, "\t");
            if(i == 0)
            {
              fprintf(senna_output, "(S1");
              if(!is_psg_one_segment)
                fprintf(senna_output, "(S");
            }
            for(j = n_psg_level-1; j >= 0; j--)
              fprintf(senna_output, "%s", SENNA_Hash_key(psg_left_hash, psg_labels[j*tokens->n+i]));
            fprintf(senna_output, "*");
            for(j = 0; j < n_psg_level; j++)
              fprintf(senna_output, "%s", SENNA_Hash_key(psg_right_hash, psg_labels[j*tokens->n+i]));
            if(i == tokens->n-1)
            {
              if(!is_psg_one_segment)
                fprintf(senna_output, ")");
              fprintf(senna_output, ")");
            }
          }
          fprintf(senna_output, "\n");
        }
        fprintf(senna_output, "\n"); /* end of sentence */
      }
      if (output_pipe) {
        fclose(senna_output);
      }
      /* the end-of-file state of a stdio stream can be sticky: reset it so
         that the next cycle can read what a new writer sends to the pipe */
      if (pipe_mode)
        clearerr(senna_input);
    } while (pipe_mode);

    /* vbs_labels was allocated here (SENNA_realloc) only in the -usrvbs and
       -posvbs cases; otherwise it is the array returned by SENNA_VBS_forward,
       which is not freed here. Free it once, even when both options are
       given. */
    if(opt_posvbs || opt_usrvbs)
      SENNA_free(vbs_labels);

    if(opt_usrvbs)
      SENNA_fclose(opt_usrvbs);

    SENNA_Tokenizer_free(tokenizer);

    SENNA_POS_free(pos);
    SENNA_CHK_free(chk);
    SENNA_PT0_free(pt0);
    SENNA_NER_free(ner);
    SENNA_VBS_free(vbs);
    SENNA_SRL_free(srl);
    SENNA_PSG_free(psg);

    SENNA_Hash_free(word_hash);
    SENNA_Hash_free(caps_hash);
    SENNA_Hash_free(suff_hash);
    SENNA_Hash_free(gazt_hash);

    SENNA_Hash_free(gazl_hash);
    SENNA_Hash_free(gazm_hash);
    SENNA_Hash_free(gazo_hash);
    SENNA_Hash_free(gazp_hash);

    SENNA_Hash_free(pos_hash);
    SENNA_Hash_free(chk_hash);
    SENNA_Hash_free(pt0_hash);
    SENNA_Hash_free(ner_hash);
    SENNA_Hash_free(vbs_hash);
    SENNA_Hash_free(srl_hash);
    SENNA_Hash_free(psg_left_hash);
    SENNA_Hash_free(psg_right_hash);

    free(sentence);
  }

  return 0;
}
Exemplo n.º 4
0
/*
 * Command-line front end that tags the single sentence given with -sentence.
 *
 * Parses the options, loads the SENNA hash tables and models from opt_path
 * and, when a sentence was given and has at least one token, prints one line
 * per token on stdout followed by an empty line.
 *
 * Flags:            -verbose -notokentags -iobtags -brackettags -posvbs
 *                   -usrtokens -pos -chk -ner -srl
 * Options + value:  -sentence <text>  -path <dir>  -usrvbs <file>
 *
 * When none of -pos/-chk/-ner/-srl is given, all four are enabled.
 *
 * NOTE(review): the argument checks rely on SENNA_error not returning
 * (presumably it terminates the program) - confirm.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  int i, j;

  /* options */
  char * sentence = NULL;
  char *opt_path = NULL;
  int opt_verbose = 0;
  int opt_notokentags = 0;
  int opt_iobtags = 0;
  int opt_brackettags = 0;
  int opt_posvbs = 0;
  int opt_usrtokens = 0;
  int opt_pos = 0;
  int opt_chk = 0;
  int opt_ner = 0;
  int opt_srl = 0;
  FILE *opt_usrvbs = NULL;

  for(i = 1; i < argc; i++)
  {
    if(!strcmp(argv[i], "-sentence")){
      if(i+1 >= argc)
        SENNA_error("please provide a sentence for the -sentence option");
      sentence = argv[i+1];
      i++;
    }else if(!strcmp(argv[i], "-verbose"))
      opt_verbose = 1;
    else if(!strcmp(argv[i], "-notokentags"))
      opt_notokentags = 1;
    else if(!strcmp(argv[i], "-iobtags"))
      opt_iobtags = 1;
    else if(!strcmp(argv[i], "-brackettags"))
      opt_brackettags = 1;
    else if(!strcmp(argv[i], "-path"))
    {
      if(i+1 >= argc)
        SENNA_error("please provide a path for the -path option");
      opt_path = argv[i+1];
      i++;
    }
    else if(!strcmp(argv[i], "-posvbs"))
      opt_posvbs = 1;
    else if(!strcmp(argv[i], "-usrtokens"))
      opt_usrtokens = 1;
    else if(!strcmp(argv[i], "-usrvbs"))
    {
      if(i+1 >= argc)
        SENNA_error("please provide a filename for the -usrvbs option");
      opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb");
      i++;
    }
    else if(!strcmp(argv[i], "-pos"))
      opt_pos = 1;
    else if(!strcmp(argv[i], "-chk"))
      opt_chk = 1;
    else if(!strcmp(argv[i], "-ner"))
      opt_ner = 1;
    else if(!strcmp(argv[i], "-srl"))
      opt_srl = 1;
    else
    {
      printf("invalid argument: %s\n", argv[i]);
      help(argv[0]);
    }
  }

  SENNA_set_verbose_mode(opt_verbose);

  if(!opt_pos && !opt_chk && !opt_ner && !opt_srl) /* the user does not know what he wants... */
    opt_pos = opt_chk = opt_ner = opt_srl = 1;     /* so give him everything */


  /* the real thing */
  {
    char target_vb[MAX_TARGET_VB_SIZE];
    int *chk_labels = NULL;
    int *pt0_labels = NULL;
    int *pos_labels = NULL;
    int *ner_labels = NULL;
    int *vbs_labels = NULL;
    int **srl_labels = NULL;
    /* label index that SENNA_VBS_forward yields for "not a verb";
       presumably tied to the content of hash/vbs.lst - TODO confirm */
    int vbs_hash_novb_idx = 22;
    int n_verbs = 0;
    
    /* inputs */
    SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst");
    SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst");
    SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst");
    SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst");

    SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat");
    SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat");
    SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat");
    SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat");

    /* labels */
    SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst");
    SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst");
    SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst");
    SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst");
    SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst");
    SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst");

    SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat");
    SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat");
    SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat");
    SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat");
    SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat");
    SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat");

    SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens);

    /* optionally rewrite the label names of the chunk/NER/SRL tables */
    if(opt_iobtags)
    {
      SENNA_Hash_convert_IOBES_to_IOB(chk_hash);
      SENNA_Hash_convert_IOBES_to_IOB(ner_hash);
      SENNA_Hash_convert_IOBES_to_IOB(srl_hash);
    }
    else if(opt_brackettags)
    {
      SENNA_Hash_convert_IOBES_to_brackets(chk_hash);
      SENNA_Hash_convert_IOBES_to_brackets(ner_hash);
      SENNA_Hash_convert_IOBES_to_brackets(srl_hash);
    }

    SENNA_message("ready");

    if(sentence != NULL)
    {
      SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence);

      /* a sentence without tokens produces no output at all (this used to be
         a `continue`, which is not valid outside of a loop) */
      if(tokens->n != 0)
      {
        /* POS tags are always computed: the other taggers take them as input */
        pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n);
        if(opt_chk)
          chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        if(opt_srl)
          pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        if(opt_ner)
          ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n);
        if(opt_srl)
        {
          /* vbs_labels[i] becomes 1 when token i is a verb, 0 otherwise;
             n_verbs counts the verbs of the sentence */
          if(opt_usrvbs)
          {
            /* one line of the user file per token, then one empty line */
            vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs))
                SENNA_error("invalid user verbs file\n");
              /* a lone "-" means "not a verb"; isspace needs an unsigned char value */
              vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace((unsigned char)target_vb[1])) );
              n_verbs += vbs_labels[i];
            }
            if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs))
              SENNA_error("invalid user verbs file\n");
            if(strlen(target_vb) > 0)
              SENNA_error("sentence size does not match in user verbs file");
          }
          else if(opt_posvbs)
          {
            /* a token is a verb when its POS tag name starts with 'V' */
            vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V');
              n_verbs += vbs_labels[i];
            }
          }
          else
          {
            /* labels come from the verb model and are turned into 0/1 in place */
            vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx);
              n_verbs += vbs_labels[i];
            }
          }
        }

        if(opt_srl)
          srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n);

        /* one output line per token, columns separated by tabs */
        for(i = 0; i < tokens->n; i++)
        {
          if(!opt_notokentags)
            printf("%15s", tokens->words[i]);
          if(opt_pos)
            printf("\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i]));
          if(opt_chk)
            printf("\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i]));
          if(opt_ner)
            printf("\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i]));
          if(opt_srl)
          {
            /* the verb column, then one SRL column per verb of the sentence */
            printf("\t%15s", (vbs_labels[i] ? tokens->words[i] : "-"));
            for(j = 0; j < n_verbs; j++)
              printf("\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i]));
          }
          printf("\n");
        }
        printf("\n"); /* end of sentence */
      }
    }

    /* vbs_labels was allocated here (SENNA_realloc) only in the -usrvbs and
       -posvbs cases; otherwise it is the array returned by SENNA_VBS_forward,
       which is not freed here. Free it once, even when both options are
       given. */
    if(opt_posvbs || opt_usrvbs)
      SENNA_free(vbs_labels);

    if(opt_usrvbs)
      SENNA_fclose(opt_usrvbs);

    SENNA_Tokenizer_free(tokenizer);

    SENNA_POS_free(pos);
    SENNA_CHK_free(chk);
    SENNA_PT0_free(pt0);
    SENNA_NER_free(ner);
    SENNA_VBS_free(vbs);
    SENNA_SRL_free(srl);

    SENNA_Hash_free(word_hash);
    SENNA_Hash_free(caps_hash);
    SENNA_Hash_free(suff_hash);
    SENNA_Hash_free(gazt_hash);

    SENNA_Hash_free(gazl_hash);
    SENNA_Hash_free(gazm_hash);
    SENNA_Hash_free(gazo_hash);
    SENNA_Hash_free(gazp_hash);

    SENNA_Hash_free(pos_hash);
    SENNA_Hash_free(chk_hash);
    SENNA_Hash_free(pt0_hash);
    SENNA_Hash_free(ner_hash);
    SENNA_Hash_free(vbs_hash);
    SENNA_Hash_free(srl_hash);
  }

  return 0;
}
Exemplo n.º 5
0
/*
 * Builds a small training set from the sentences of "training.dat" and
 * trains a FANN network on it, saving the result to "asds.net".
 *
 * Each line of the file is analysed with SENNA (tokens, POS, verbs, semantic
 * roles) and turned into two inputs, both -1 by default:
 *   input 0 is set to 1 when the word "want" carries the role label "S-V";
 *   input 1 is set to 1 when the word "pizza" carries the role label "E-A1".
 * The expected output of line number k is targets[k]. At most as many lines
 * as there are targets are read, since train_data holds that many samples.
 *
 * argc and argv are not used.
 * Returns 0 on success, 1 when "training.dat" cannot be opened.
 *
 * NOTE(review): train_data and the SENNA objects created here are never
 * released before returning - consider freeing them.
 */
int main_tr4(int argc, char *argv[])
{
    int i, j;

    /**************************************************
       SENNA setup
    **************************************************/
    /* options */
    char *opt_path = NULL;
    int opt_usrtokens = 0;
    /* label index that SENNA_VBS_forward yields for "not a verb";
       presumably tied to the content of hash/vbs.lst - TODO confirm */
    int vbs_hash_novb_idx = 22;

    /* inputs */
    SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst");
    SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst");
    SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst");
    SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst");

    SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat");
    SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat");
    SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat");
    SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat");

    SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens);
    SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat");

    /* labels */
    SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst");
    SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst");

    SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat");
    SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat");
    SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat");

    /* FANN  setup */
    const unsigned int num_input = 2;
    const unsigned int num_output = 1;
    const unsigned int num_layers = 3;
    const unsigned int num_neurons_hidden = 3;
    const float desired_error = (const float) 0.00001;
    const unsigned int max_epochs = 500000;
    const unsigned int epochs_between_reports = 100000;

    /* expected output of each training line, in file order */
    const int targets[] = {1, 1, 1, -1};
    const int num_samples = (int) (sizeof(targets) / sizeof(targets[0]));

    struct fann *ann = fann_create_standard(num_layers, num_input, num_neurons_hidden, num_output);

    fann_set_activation_function_hidden(ann, FANN_SIGMOID_SYMMETRIC);
    fann_set_activation_function_output(ann, FANN_SIGMOID_SYMMETRIC);

    /* one sample per target; the entries are overwritten in the loop below */
    struct fann_train_data *train_data;
    train_data = fann_create_train_from_callback(num_samples, num_input, num_output, &data_callback);

    /**************************************************
       main program
    **************************************************/

    /* Read the training file line by line */
    FILE * fp;
    char * line = NULL;   /* buffer owned by getline, freed after the loop */
    size_t len = 0;
    int id = 0;           /* index of the current training sample */

    fp = fopen("training.dat", "r");
    if (fp == NULL) {
        perror("training.dat");
        fann_destroy(ann);
        return 1;
    }

    /* stop after num_samples lines: neither targets nor train_data has room
       for more */
    while (id < num_samples && getline(&line, &len, fp) != -1) {

        SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, line);
        int *pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n);
        int *pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        int n_verbs = 0;
        int *vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        /* turn the verb labels into 0/1 flags and count the verbs */
        for(i = 0; i < tokens->n; i++){
            vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx);
            n_verbs += vbs_labels[i];
        }
        int **srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n);

        /* Logic: both features are off unless found in the sentence */
        train_data->input[id][0] = -1;
        train_data->input[id][1] = -1;

        for(i = 0; i < tokens->n; i++){
            printf("%s %s ",tokens->words[i], SENNA_Hash_key(pos_hash, pos_labels[i]));
            printf("%s", (vbs_labels[i] ? tokens->words[i] : "-"));
            for(j = 0; j < n_verbs; j++){
                const char *role = SENNA_Hash_key(srl_hash, srl_labels[j][i]);

                printf(" '%s'", role);

                if(strcmp(tokens->words[i],"want") == 0 && strcmp(role,"S-V") == 0){
                    train_data->input[id][0] = 1;
                }
                else if(strcmp(tokens->words[i],"pizza") == 0 && strcmp(role,"E-A1") == 0){
                    train_data->input[id][1] = 1;
                }
            }
            printf("\n");
        }

        train_data->output[id][0] = targets[id];

        printf("%s", line);
        printf("Input: %f %f, Output: %d\n\n", train_data->input[id][0], train_data->input[id][1], targets[id]);
        id++;

    }

    free(line);
    fclose(fp);

    /* Train a classifier */
    fann_train_on_data(ann, train_data, max_epochs, epochs_between_reports, desired_error);
    fann_save(ann, "asds.net");

    fann_destroy(ann);

    return 0;
}
Exemplo n.º 6
0
/*
 * Prints, for each sentence of "training.dat", its SENNA analysis (tokens,
 * POS tags, verbs, semantic roles) together with two features and the
 * expected target of that line. Nothing is trained here.
 *
 * The two features are -1 by default:
 *   x0 is set to 1 when the word "want" carries the role label "S-V";
 *   x1 is set to 1 when the word "pizza" carries the role label "E-A1".
 * The expected target of line number k is targets[k]; at most as many lines
 * as there are targets are read.
 *
 * argc and argv are not used.
 * Returns 0 on success, 1 when "training.dat" cannot be opened.
 *
 * NOTE(review): the SENNA objects created here are never released before
 * returning - consider freeing them.
 */
int main_tr3(int argc, char *argv[])
{
    int i, j;

    /**************************************************
       SENNA setup
    **************************************************/
    /* options */
    char *opt_path = NULL;
    int opt_usrtokens = 0;
    /* label index that SENNA_VBS_forward yields for "not a verb";
       presumably tied to the content of hash/vbs.lst - TODO confirm */
    int vbs_hash_novb_idx = 22;

    /* inputs */
    SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst");
    SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst");
    SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst");
    SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst");

    SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat");
    SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat");
    SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat");
    SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat");

    SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens);
    SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat");

    /* labels */
    SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst");
    SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst");

    SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat");
    SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat");
    SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat");

    /**************************************************
       main program
    **************************************************/

    /* Read the training file line by line */
    FILE * fp;
    char * line = NULL;   /* buffer owned by getline, freed after the loop */
    size_t len = 0;

    /* expected target of each line, in file order */
    const int targets[] = {-1, 1, -1, -1};
    const int num_targets = (int) (sizeof(targets) / sizeof(targets[0]));
    int id = 0;           /* index of the current line */

    fp = fopen("training.dat", "r");
    if (fp == NULL) {
        perror("training.dat");
        return 1;
    }

    /* stop after num_targets lines: targets has no entry for further ones */
    while (id < num_targets && getline(&line, &len, fp) != -1) {

        SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, line);
        int *pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n);
        int *pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        int n_verbs = 0;
        int *vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        /* turn the verb labels into 0/1 flags and count the verbs */
        for(i = 0; i < tokens->n; i++){
            vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx);
            n_verbs += vbs_labels[i];
        }
        int **srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n);

        /* Logic: both features are off unless found in the sentence */
        int x0=-1, x1=-1;
        for(i = 0; i < tokens->n; i++){
            printf("%s %s ",tokens->words[i], SENNA_Hash_key(pos_hash, pos_labels[i]));
            printf("%s", (vbs_labels[i] ? tokens->words[i] : "-"));
            for(j = 0; j < n_verbs; j++){
                const char *role = SENNA_Hash_key(srl_hash, srl_labels[j][i]);

                printf(" %s", role);
                /* strcmp returns 0 on equality: the role tests need "== 0",
                   otherwise they fire on every label except the wanted one */
                if(strcmp(tokens->words[i],"want") == 0 && strcmp(role,"S-V") == 0){
                    x0 = 1;
                }
                else if(strcmp(tokens->words[i],"pizza") == 0 && strcmp(role,"E-A1") == 0){
                    x1 = 1;
                }
            }
            printf("\n");
        }

        printf("%s", line);
        printf("Input: %d %d, Output: %d\n\n", x0, x1, targets[id]);
        id++;

    }

    free(line);
    fclose(fp);

    return 0;
}