Esempio n. 1
0
/*
 * Tokenize `sentence` and run the analyses selected by `options`, storing the
 * results in senna->lastSentence.
 *
 *   senna    - handle holding the tokenizer and the POS/PSG models.
 *   sentence - NUL-terminated text to analyse.
 *   options  - bit mask; GENERATE_POS requests part-of-speech labels and
 *              GENERATE_PSG requests the parse labels (which also forces POS,
 *              because the PSG step takes the POS labels as input).
 *
 * On return, lastSentence.pos_labels / lastSentence.psg_labels are NULL when
 * the corresponding analysis was not requested.
 */
void sennaParseSentence(SENNA *senna, const char *sentence, unsigned int options) {
    // Tokenize
    SENNA_Tokens *tokens = SENNA_Tokenizer_tokenize(senna->tokenizer, sentence);
    assert(tokens);
    senna->lastSentence.tokens = tokens;

    // Pos (also needed as an input of the PSG step)
    if ((options & GENERATE_POS) || (options & GENERATE_PSG)) {
        senna->lastSentence.pos_labels = SENNA_POS_forward(
                                             senna->pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n);
    } else {
        senna->lastSentence.pos_labels = NULL;   // indicate they've not been generated
    }

    // Psg
    if (options & GENERATE_PSG) {
        int n_psg_level;
        int is_psg_one_segment = 0;
        int i;

        assert(senna->lastSentence.pos_labels);
        SENNA_PSG_forward(senna->psg, tokens->word_idx, tokens->caps_idx,
                          senna->lastSentence.pos_labels, tokens->n, &senna->lastSentence.psg_labels,
                          &senna->lastSentence.n_psg_level);
        n_psg_level = senna->lastSentence.n_psg_level;

        /* Check if the top level takes the full sentence (logic taken from
         * SENNA_main.c). psg_labels is indexed as level*tokens->n + token, so
         * the top level is the last row. The row is only inspected when it
         * exists: with no tokens or no levels the indices below would fall
         * outside the label buffer. */
        if (tokens->n > 0 && n_psg_level > 0) {
            const int *psg_top_labels = senna->lastSentence.psg_labels + (n_psg_level-1)*tokens->n;

            if (tokens->n == 1)
                is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */
            else
                is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[tokens->n-1]-1) % 4 == 2); /* B- or E- ? */

            /* every token strictly between the first and the last must be I- */
            for (i = 1; is_psg_one_segment && (i < tokens->n-1); i++) {
                if ((psg_top_labels[i]-1) % 4 != 1) /* I- ? */
                    is_psg_one_segment = 0;
            }
        }
        senna->lastSentence.is_psg_one_segment = is_psg_one_segment;
    } else {  // don't generate psg
        senna->lastSentence.psg_labels = NULL;   // indicate they haven't been generated
        /* do not leave values from a previous sentence behind */
        senna->lastSentence.n_psg_level = 0;
        senna->lastSentence.is_psg_one_segment = 0;
    }
}
Esempio n. 2
0
/*
 * Command-line driver for SENNA.
 *
 * Parses the options, loads the hash tables and models from `-path`, then
 * reads one sentence per line from the input (stdin, or the file/FIFO given
 * with -input_pipe) and writes one line per token, followed by an empty line,
 * to the output (stdout, or the file/FIFO given with -output_pipe).
 *
 * When none of -pos/-chk/-ner/-srl/-psg is given, all of them are enabled.
 * With -input_pipe the read loop is restarted forever (pipe_mode), so the
 * clean-up code after it is only reached when reading from stdin.
 *
 * NOTE(review): the code relies on SENNA_error() and help() not returning
 * (e.g. argv[i+1] is used right after the "missing value" error) -- confirm.
 */
int main(int argc, char *argv[])
{
  int i, j;

  /* options */
  char *opt_path = NULL;
  int opt_verbose = 0;
  int opt_notokentags = 0;
  int opt_offsettags = 0;
  int opt_iobtags = 0;
  int opt_brackettags = 0;
  int opt_posvbs = 0;
  int opt_usrtokens = 0;
  int opt_pos = 0;
  int opt_chk = 0;
  int opt_ner = 0;
  int opt_srl = 0;
  int opt_psg = 0;
  FILE *opt_usrvbs = NULL;
  FILE *senna_input = stdin;
  FILE *senna_output = stdout;
  int  pipe_mode = 0;
  char *output_pipe = NULL;

  for(i = 1; i < argc; i++)
  {
    if(!strcmp(argv[i], "-verbose"))
      opt_verbose = 1;
    else if(!strcmp(argv[i], "-notokentags"))
      opt_notokentags = 1;
    else if(!strcmp(argv[i], "-offsettags"))
      opt_offsettags = 1;
    else if(!strcmp(argv[i], "-iobtags"))
      opt_iobtags = 1;
    else if(!strcmp(argv[i], "-brackettags"))
      opt_brackettags = 1;
    else if(!strcmp(argv[i], "-path"))
    {
      if(i+1 >= argc)
        SENNA_error("please provide a path for the -path option");
      opt_path = argv[i+1];
      i++;
    }
    else if(!strcmp(argv[i], "-posvbs"))
      opt_posvbs = 1;
    else if(!strcmp(argv[i], "-usrtokens"))
      opt_usrtokens = 1;
    else if(!strcmp(argv[i], "-usrvbs"))
    {
      if(i+1 >= argc)
        SENNA_error("please provide a filename for the -usrvbs option");
      opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb");
      i++;
    }
    else if(!strcmp(argv[i], "-pos"))
      opt_pos = 1;
    else if(!strcmp(argv[i], "-chk"))
      opt_chk = 1;
    else if(!strcmp(argv[i], "-ner"))
      opt_ner = 1;
    else if(!strcmp(argv[i], "-srl"))
      opt_srl = 1;
    else if(!strcmp(argv[i], "-psg"))
      opt_psg = 1;
    else if(!strcmp(argv[i], "-maxsentsize")) {
      if(i+1 >= argc)
        SENNA_error("please provide a sentence size for the -maxsentsize option");
      max_sent_size = atol(argv[i+1]);
      /* 0 is rejected as well: it would leave fgets() a 1-byte buffer, which
         can never hold any input (atol also yields 0 for non-numeric text) */
      if(max_sent_size <= 0)
        SENNA_error("provide a positive value for the -maxsentsize option");
      i++;
    } else if(!strcmp(argv[i], "-input_pipe")){
      if(i+1 >= argc)
        SENNA_error("please provide the name of the input pipe");
      senna_input = fopen(argv[i+1], "r");
      if (senna_input == NULL) {
        SENNA_error("cannot open the input named pipe");
      }
      pipe_mode = 1;
      i++;
    } else if(!strcmp(argv[i], "-output_pipe")){
      if(i+1 >= argc)
        SENNA_error("please provide the name of the outputpipe");
      /* only remembered here; the pipe is (re)opened inside the main loop */
      output_pipe = argv[i+1];
      i++;
    } else {
      printf("invalid argument: %s\n", argv[i]);
      help(argv[0]);
    }
  }

  SENNA_set_verbose_mode(opt_verbose);

  if(!opt_pos && !opt_chk && !opt_ner && !opt_srl && !opt_psg) /* the user does not know what he wants... */
    opt_pos = opt_chk = opt_ner = opt_srl = opt_psg = 1;     /* so give him everything (aren't we insane?) */


  /* the real thing */
  {
    char *sentence = NULL;
    char target_vb[MAX_TARGET_VB_SIZE];
    int *chk_labels = NULL;
    int *pt0_labels = NULL;
    int *pos_labels = NULL;
    int *ner_labels = NULL;
    int *vbs_labels = NULL;
    int **srl_labels = NULL;
    int *psg_labels = NULL;
    int n_psg_level = 0;
    int is_psg_one_segment = 0;
    int vbs_hash_novb_idx = 22; /* hard-coded VBS label index treated as "not a verb" below */
    int n_verbs = 0;

    sentence = malloc(max_sent_size + 1);
    if(sentence == NULL)
      SENNA_error("cannot allocate the sentence buffer");
    SENNA_message("Maximum sentence size %ld", max_sent_size);
    
    /* inputs */
    SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst");
    SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst");
    SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst");
    SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst");

    SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat");
    SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat");
    SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat");
    SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat");

    /* labels */
    SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst");
    SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst");
    SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst");
    SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst");
    SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst");
    SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst");
    SENNA_Hash *psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst");
    SENNA_Hash *psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst");

    /* models */
    SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat");
    SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat");
    SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat");
    SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat");
    SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat");
    SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat");
    SENNA_PSG *psg = SENNA_PSG_new(opt_path, "data/psg.dat");

    SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens);

    /* optionally rewrite the label names; PSG labels are not converted */
    if(opt_iobtags)
    {
      SENNA_Hash_convert_IOBES_to_IOB(chk_hash);
      SENNA_Hash_convert_IOBES_to_IOB(ner_hash);
      SENNA_Hash_convert_IOBES_to_IOB(srl_hash);
    }
    else if(opt_brackettags)
    {
      SENNA_Hash_convert_IOBES_to_brackets(chk_hash);
      SENNA_Hash_convert_IOBES_to_brackets(ner_hash);
      SENNA_Hash_convert_IOBES_to_brackets(srl_hash);
    }

    SENNA_message("ready");

    /* One pass of this loop consumes the input until end-of-file. Without
       -input_pipe it runs once; with it, it runs forever. */
    do {
      if (output_pipe) {
        senna_output = fopen(output_pipe, "w");
        if (senna_output == NULL) {
          SENNA_error("cannot open the output named pipe");
        }
      } else {
        senna_output = stdout;
      }

      /* The end-of-file indicator set by the previous pass stays set until
         it is cleared; without this, fgets() may keep failing even after new
         data became available on the input pipe. */
      if (pipe_mode)
        clearerr(senna_input);

      while(fgets(sentence, max_sent_size + 1, senna_input))
      {
        SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence);
      
        if(tokens->n == 0)
          continue;

        /* POS is always computed: CHK, PT0, VBS and PSG take it as input */
        pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n);
        if(opt_chk)
          chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        if(opt_srl)
          pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
        if(opt_ner)
          ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n);
        if(opt_srl)
        {
          /* vbs_labels[i] ends up as 1 when token i is a target verb, else 0;
             n_verbs counts the 1s. Three possible sources follow. */
          if(opt_usrvbs)
          {
            /* user file: one line per token ("-" means "not a verb"), then
               an empty line closing the sentence */
            vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs))
                SENNA_error("invalid user verbs file\n");
              vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) );
              n_verbs += vbs_labels[i];          
            }
            if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs))
              SENNA_error("invalid user verbs file\n");
            if(strlen(target_vb) > 0)
              SENNA_error("sentence size does not match in user verbs file");
          }
          else if(opt_posvbs)
          {
            /* every token whose POS label starts with 'V' is a verb */
            vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V');
              n_verbs += vbs_labels[i];
            }
          }
          else
          {
            /* VBS model; its labels are turned into 0/1 flags in place */
            vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n);
            n_verbs = 0;
            for(i = 0; i < tokens->n; i++)
            {
              vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx);
              n_verbs += vbs_labels[i];
            }
          }
        }

        if(opt_srl)
          srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n);

        if(opt_psg)
        {
          SENNA_PSG_forward(psg, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n, &psg_labels, &n_psg_level);

          /* check if top level takes the full sentence */
          {
            /* psg_labels is indexed as level*tokens->n + token */
            int *psg_top_labels = psg_labels + (n_psg_level-1)*tokens->n;

            if(tokens->n == 1)
              is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */
            else
              is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[tokens->n-1]-1) % 4 == 2); /* B- or E- ? */

            for(i = 1; is_psg_one_segment && (i < tokens->n-1); i++)
            {
              if((psg_top_labels[i]-1) % 4 != 1) /* I- ? */
                is_psg_one_segment = 0;
            }
          }
        }

        /* one output line per token, columns in a fixed order */
        for(i = 0; i < tokens->n; i++)
        {
          if(!opt_notokentags)
            fprintf(senna_output, "%15s", tokens->words[i]);
          if(opt_offsettags)
            fprintf(senna_output, "\t%d %d", tokens->start_offset[i], tokens->end_offset[i]);
          if(opt_pos)
            fprintf(senna_output, "\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i]));
          if(opt_chk)
            fprintf(senna_output, "\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i]));
          if(opt_ner)
            fprintf(senna_output, "\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i]));
          if(opt_srl)
          {
            /* verb column, then one SRL column per target verb */
            fprintf(senna_output, "\t%15s", (vbs_labels[i] ? tokens->words[i] : "-"));
            for(j = 0; j < n_verbs; j++)
              fprintf(senna_output, "\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i]));
          }
          if(opt_psg) /* last, can be long */
          {
            fprintf(senna_output, "\t");
            if(i == 0)
            {
              /* root bracket, plus an extra "(S" when the top level does
                 not already span the whole sentence */
              fprintf(senna_output, "(S1");
              if(!is_psg_one_segment)
                fprintf(senna_output, "(S");
            }
            /* opening brackets from the top level down, closing ones bottom up */
            for(j = n_psg_level-1; j >= 0; j--)
              fprintf(senna_output, "%s", SENNA_Hash_key(psg_left_hash, psg_labels[j*tokens->n+i]));
            fprintf(senna_output, "*");
            for(j = 0; j < n_psg_level; j++)
              fprintf(senna_output, "%s", SENNA_Hash_key(psg_right_hash, psg_labels[j*tokens->n+i]));
            if(i == tokens->n-1)
            {
              if(!is_psg_one_segment)
                fprintf(senna_output, ")");
              fprintf(senna_output, ")");
            }
          }
          fprintf(senna_output, "\n");
        }
        fprintf(senna_output, "\n"); /* end of sentence */
      }
      if (output_pipe) {
        fclose(senna_output);
      }
    } while (pipe_mode);

    /* vbs_labels was obtained from SENNA_realloc only with -usrvbs/-posvbs
       (otherwise it is whatever SENNA_VBS_forward returned and is not freed
       here). Free it exactly once, even when both options were given. */
    if(opt_usrvbs || opt_posvbs)
      SENNA_free(vbs_labels);

    if(opt_usrvbs)
      SENNA_fclose(opt_usrvbs);

    SENNA_Tokenizer_free(tokenizer);

    SENNA_POS_free(pos);
    SENNA_CHK_free(chk);
    SENNA_PT0_free(pt0);
    SENNA_NER_free(ner);
    SENNA_VBS_free(vbs);
    SENNA_SRL_free(srl);
    SENNA_PSG_free(psg);

    SENNA_Hash_free(word_hash);
    SENNA_Hash_free(caps_hash);
    SENNA_Hash_free(suff_hash);
    SENNA_Hash_free(gazt_hash);

    SENNA_Hash_free(gazl_hash);
    SENNA_Hash_free(gazm_hash);
    SENNA_Hash_free(gazo_hash);
    SENNA_Hash_free(gazp_hash);

    SENNA_Hash_free(pos_hash);
    SENNA_Hash_free(chk_hash);
    SENNA_Hash_free(pt0_hash);
    SENNA_Hash_free(ner_hash);
    SENNA_Hash_free(vbs_hash);
    SENNA_Hash_free(srl_hash);
    SENNA_Hash_free(psg_left_hash);
    SENNA_Hash_free(psg_right_hash);

    free(sentence);
  }

  return 0;
}
  // Tokenizes `text` with SENNA and returns the tags of every analysis enabled
  // in type_ (kPos, kChk, kNer, kSrl, kPsg). For each enabled analysis one Tag
  // per token is appended, in token order, so the result is grouped by
  // analysis. Returns an empty vector when `text` is empty or yields no tokens.
  //
  // Side effects visible in this function: word_to_pos is updated for every
  // token, n_verbs is overwritten when kSrl is set, n_psg_level and
  // is_psg_one_segment are overwritten when kPsg is set, and a progress line
  // is printed to std::cout for POS, chunking, NER and PSG.
  vector<Tagger::Tag> Tagger::Tags(const string& text) const {
    vector<Tag> tags;
    if (text.empty()) {
      return tags;
    }
    // TODO(esawin): Is this thread-safe?
    SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer_, text.c_str());
    if (tokens->n == 0) {
      //LOG(WARNING) << "Tokenizer failed.";
      return tags;
    }
    const int n_tokens = tokens->n;

    // TODO(esawin): Is this thread-safe?
    // POS tagging is always carried out because every other analysis takes the
    // POS labels as input.
    int* pos_labels = SENNA_POS_forward(pos_, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, n_tokens);

    // Cut the text of each token out of `text` once; every analysis below
    // attaches the same string to its tags.
    vector<string> words;
    words.reserve(n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
      words.push_back(text.substr(tokens->start_offset[i], tokens->end_offset[i] - tokens->start_offset[i]));
      word_to_pos[words[i]] = pos_hash_->keys[pos_labels[i]];
    }
    tags.reserve(n_tokens);

    if (type_ & kPos) {
      std::cout << "POS ..." << std::endl;
      // Part-of-speech tagging.
      for (int i = 0; i < n_tokens; ++i) {
        tags.push_back(Tag(words[i], kPos, pos_hash_->keys[pos_labels[i]]));
      }
    }
    if (type_ & kChk) {
      std::cout << "Chunking ..." << std::endl;
      // Chunking.
      int* chk_labels = SENNA_CHK_forward(chk_, tokens->word_idx, tokens->caps_idx, pos_labels, n_tokens);
      for (int i = 0; i < n_tokens; ++i) {
        tags.push_back(Tag(words[i], kChk, chk_hash_->keys[chk_labels[i]]));
      }
    }
    if (type_ & kNer) {
      std::cout << "NER ..." << std::endl;
      // Named entity recognition.
      int* ner_labels = SENNA_NER_forward(ner_, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, n_tokens);
      for (int i = 0; i < n_tokens; ++i) {
        tags.push_back(Tag(words[i], kNer, ner_hash_->keys[ner_labels[i]]));
      }
    }
    if (type_ & kSrl) {
      // Semantic Role Labeling.
      int* pt0_labels = SENNA_PT0_forward(pt0_, tokens->word_idx, tokens->caps_idx, pos_labels, n_tokens);
      int* vbs_labels = SENNA_VBS_forward(vbs_, tokens->word_idx, tokens->caps_idx, pos_labels, n_tokens);
      // Turn the VBS labels into 0/1 "is a target verb" flags in place and
      // count the verbs.
      n_verbs = 0;
      for (int i = 0; i < n_tokens; ++i) {
        vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx);
        n_verbs += vbs_labels[i];
      }

      int** srl_labels = SENNA_SRL_forward(srl_, tokens->word_idx,
                                           tokens->caps_idx, pt0_labels, vbs_labels, n_tokens);

      // Label layout: "<verb or ->|<role for verb 0>|<role for verb 1>|..."
      for (int i = 0; i < n_tokens; ++i) {
        string label = (vbs_labels[i] ? tokens->words[i] : "-");
        for (int j = 0; j < n_verbs; ++j) {
          label += '|';
          label += srl_hash_->keys[srl_labels[j][i]];
        }
        tags.push_back(Tag(words[i], kSrl, label));
      }
    }
    if (type_ & kPsg) {
      std::cout << "PSG ..." << std::endl;
      // Probabilistic Parsing.
      int* psg_labels;
      SENNA_PSG_forward(psg_, tokens->word_idx, tokens->caps_idx, pos_labels, n_tokens, &psg_labels, &n_psg_level);

      // Check whether the top level spans the whole sentence. psg_labels is
      // indexed as level*n_tokens + token, so the top level is the last row.
      int* psg_top_labels = psg_labels + (n_psg_level-1)*n_tokens;
      if (n_tokens == 1)
        is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */
      else
        is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[n_tokens-1]-1) % 4 == 2); /* B- or E- ? */
      for (int i = 1; is_psg_one_segment && (i < n_tokens-1); ++i) {
        if ((psg_top_labels[i]-1) % 4 != 1) /* I- ? */
          is_psg_one_segment = 0;
      }

      for (int i = 0; i < n_tokens; ++i) {
        string label;
        if (i == 0) {
          // Root bracket first, then an extra "(S" when the top level does not
          // already span the sentence. This mirrors the closing side below,
          // which emits ")" for the extra level and ")" for the root.
          label += "(S1";
          if (!is_psg_one_segment)
            label += "(S";
        }
        // Opening brackets from the top level down, closing ones bottom up.
        for (int j = n_psg_level-1; j >= 0; j--)
          label += psg_left_hash_->keys[psg_labels[j*n_tokens+i]];
        label += "*";
        for (int j = 0; j < n_psg_level; j++)
          label += psg_right_hash_->keys[psg_labels[j*n_tokens+i]];
        if (i == n_tokens-1)
          label += (!is_psg_one_segment ? "))" : ")");
        tags.push_back(Tag(words[i], kPsg, label));
      }
    }
    return tags;
  }