/*
 * Tokenize `sentence` and run the requested SENNA analyses on it, storing
 * every result in senna->lastSentence.
 *
 *   senna    - wrapper holding the tokenizer, the POS and PSG models and the
 *              lastSentence result slot that this function fills in.
 *   sentence - NUL-terminated text handed to the tokenizer.
 *   options  - bit mask; GENERATE_POS requests POS labels, GENERATE_PSG
 *              requests the parse (which also forces POS, since the PSG
 *              model takes the POS labels as input).
 *
 * On return:
 *   lastSentence.tokens      - tokenizer output (always set).
 *   lastSentence.pos_labels  - POS labels, or NULL when not generated.
 *   lastSentence.psg_labels  - PSG labels, or NULL when not generated.
 *   lastSentence.n_psg_level / is_psg_one_segment - written only when the
 *              parse was generated (or reset to 0 for an empty sentence).
 *
 * An empty sentence (no tokens) generates nothing: both label pointers are
 * set to NULL.  Without that guard the top-level span check below would
 * index psg_top_labels[0] and psg_top_labels[-1] on an empty label array.
 */
void sennaParseSentence(SENNA *senna, const char *sentence, unsigned int options)
{
  const int want_psg = (options & GENERATE_PSG) != 0;
  /* The parser consumes POS labels, so POS runs if either one is requested. */
  const int want_pos = want_psg || (options & GENERATE_POS) != 0;

  /* Tokenize */
  SENNA_Tokens *tokens = SENNA_Tokenizer_tokenize(senna->tokenizer, sentence);
  senna->lastSentence.tokens = tokens;
  assert(tokens);

  /* Nothing to label: report "not generated" instead of running the models
     and reading label arrays that have no elements. */
  if (tokens->n <= 0) {
    senna->lastSentence.pos_labels = NULL;
    senna->lastSentence.psg_labels = NULL;
    senna->lastSentence.n_psg_level = 0;
    senna->lastSentence.is_psg_one_segment = 0;
    return;
  }

  /* Pos */
  if (want_pos) {
    senna->lastSentence.pos_labels =
        SENNA_POS_forward(senna->pos,
                          tokens->word_idx, tokens->caps_idx, tokens->suff_idx,
                          tokens->n);
  } else {
    senna->lastSentence.pos_labels = NULL; /* indicate they've not been generated */
  }

  /* Psg */
  if (!want_psg) {
    senna->lastSentence.psg_labels = NULL; /* indicate they haven't been generated */
    return;
  }

  assert(senna->lastSentence.pos_labels);
  SENNA_PSG_forward(senna->psg,
                    tokens->word_idx, tokens->caps_idx,
                    senna->lastSentence.pos_labels, tokens->n,
                    &senna->lastSentence.psg_labels,
                    &senna->lastSentence.n_psg_level);

  /* Check whether the top parse level is a single segment spanning the whole
     sentence.  psg_labels holds n_psg_level rows of tokens->n labels; the top
     level is the last row.  (label-1) % 4 selects the position tag:
     0 = B-, 1 = I-, 2 = E-, 3 = S-. */
  {
    const int n = tokens->n;
    const int *top_labels =
        senna->lastSentence.psg_labels + (senna->lastSentence.n_psg_level - 1) * n;
    int one_segment;
    int i;

    if (n == 1)
      one_segment = ((top_labels[0] - 1) % 4 == 3);            /* S- ? */
    else
      one_segment = ((top_labels[0] - 1) % 4 == 0) &&
                    ((top_labels[n - 1] - 1) % 4 == 2);        /* B- ... E- ? */

    /* every interior token must continue the same segment */
    for (i = 1; one_segment && (i < n - 1); i++) {
      if ((top_labels[i] - 1) % 4 != 1)                        /* I- ? */
        one_segment = 0;
    }

    senna->lastSentence.is_psg_one_segment = one_segment;
  }
}
int main(int argc, char *argv[]) { int i, j; /* options */ char *opt_path = NULL; int opt_verbose = 0; int opt_notokentags = 0; int opt_offsettags = 0; int opt_iobtags = 0; int opt_brackettags = 0; int opt_posvbs = 0; int opt_usrtokens = 0; int opt_pos = 0; int opt_chk = 0; int opt_ner = 0; int opt_srl = 0; int opt_psg = 0; FILE *opt_usrvbs = NULL; FILE *senna_input = stdin; FILE *senna_output = stdout; int pipe_mode = 0; char *output_pipe = NULL; for(i = 1; i < argc; i++) { if(!strcmp(argv[i], "-verbose")) opt_verbose = 1; else if(!strcmp(argv[i], "-notokentags")) opt_notokentags = 1; else if(!strcmp(argv[i], "-offsettags")) opt_offsettags = 1; else if(!strcmp(argv[i], "-iobtags")) opt_iobtags = 1; else if(!strcmp(argv[i], "-brackettags")) opt_brackettags = 1; else if(!strcmp(argv[i], "-path")) { if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); opt_path = argv[i+1]; i++; } else if(!strcmp(argv[i], "-posvbs")) opt_posvbs = 1; else if(!strcmp(argv[i], "-usrtokens")) opt_usrtokens = 1; else if(!strcmp(argv[i], "-usrvbs")) { if(i+1 >= argc) SENNA_error("please provide a filename for the -usrvbs option"); opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb"); i++; } else if(!strcmp(argv[i], "-pos")) opt_pos = 1; else if(!strcmp(argv[i], "-chk")) opt_chk = 1; else if(!strcmp(argv[i], "-ner")) opt_ner = 1; else if(!strcmp(argv[i], "-srl")) opt_srl = 1; else if(!strcmp(argv[i], "-psg")) opt_psg = 1; else if(!strcmp(argv[i], "-maxsentsize")) { if(i+1 >= argc) SENNA_error("please provide a sentence size for the -maxsentsize option"); max_sent_size = atol(argv[i+1]); if(max_sent_size<0) SENNA_error("provide a positive value for the -maxsentsize option"); i++; } else if(!strcmp(argv[i], "-input_pipe")){ if(i+1 >= argc) SENNA_error("please provide the name of the input pipe"); senna_input = fopen(argv[i+1], "r"); if (senna_input == NULL) { SENNA_error("cannot open the input named pipe"); } pipe_mode = 1; i++; } else if(!strcmp(argv[i], "-output_pipe")){ 
if(i+1 >= argc) SENNA_error("please provide the name of the outputpipe"); output_pipe = argv[i+1]; i++; } else { printf("invalid argument: %s\n", argv[i]); help(argv[0]); } } SENNA_set_verbose_mode(opt_verbose); if(!opt_pos && !opt_chk && !opt_ner && !opt_srl && !opt_psg) /* the user does not know what he wants... */ opt_pos = opt_chk = opt_ner = opt_srl = opt_psg = 1; /* so give him everything (aren't we insane?) */ /* the real thing */ { char *sentence = NULL; char target_vb[MAX_TARGET_VB_SIZE]; int *chk_labels = NULL; int *pt0_labels = NULL; int *pos_labels = NULL; int *ner_labels = NULL; int *vbs_labels = NULL; int **srl_labels = NULL; int *psg_labels = NULL; int n_psg_level = 0; int is_psg_one_segment = 0; int vbs_hash_novb_idx = 22; int n_verbs = 0; sentence = malloc(max_sent_size + 1); SENNA_message("Maximum sentence size %ld", max_sent_size); /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); 
SENNA_Hash *psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst"); SENNA_Hash *psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); SENNA_PSG *psg = SENNA_PSG_new(opt_path, "data/psg.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); if(opt_iobtags) { SENNA_Hash_convert_IOBES_to_IOB(chk_hash); SENNA_Hash_convert_IOBES_to_IOB(ner_hash); SENNA_Hash_convert_IOBES_to_IOB(srl_hash); } else if(opt_brackettags) { SENNA_Hash_convert_IOBES_to_brackets(chk_hash); SENNA_Hash_convert_IOBES_to_brackets(ner_hash); SENNA_Hash_convert_IOBES_to_brackets(srl_hash); } SENNA_message("ready"); do { if (output_pipe) { senna_output = fopen(output_pipe, "w"); if (senna_output == NULL) { SENNA_error("cannot open the output named pipe"); } } else { senna_output = stdout; } while(fgets(sentence, max_sent_size + 1, senna_input)) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence); if(tokens->n == 0) continue; pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); if(opt_chk) chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_srl) pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_ner) ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n); if(opt_srl) { if(opt_usrvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < 
tokens->n; i++) { if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) ); n_verbs += vbs_labels[i]; } if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); if(strlen(target_vb) > 0) SENNA_error("sentence size does not match in user verbs file"); } else if(opt_posvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V'); n_verbs += vbs_labels[i]; } } else { vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } } } if(opt_srl) srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); if(opt_psg) { SENNA_PSG_forward(psg, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n, &psg_labels, &n_psg_level); /* check if top level takes the full sentence */ { int *psg_top_labels = psg_labels + (n_psg_level-1)*tokens->n; if(tokens->n == 1) is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */ else is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[tokens->n-1]-1) % 4 == 2); /* B- or E- ? */ for(i = 1; is_psg_one_segment && (i < tokens->n-1); i++) { if((psg_top_labels[i]-1) % 4 != 1) /* I- ? 
*/ is_psg_one_segment = 0; } } } for(i = 0; i < tokens->n; i++) { if(!opt_notokentags) fprintf(senna_output, "%15s", tokens->words[i]); if(opt_offsettags) fprintf(senna_output, "\t%d %d", tokens->start_offset[i], tokens->end_offset[i]); if(opt_pos) fprintf(senna_output, "\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i])); if(opt_chk) fprintf(senna_output, "\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i])); if(opt_ner) fprintf(senna_output, "\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i])); if(opt_srl) { fprintf(senna_output, "\t%15s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++) fprintf(senna_output, "\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); } if(opt_psg) /* last, can be long */ { fprintf(senna_output, "\t"); if(i == 0) { fprintf(senna_output, "(S1"); if(!is_psg_one_segment) fprintf(senna_output, "(S"); } for(j = n_psg_level-1; j >= 0; j--) fprintf(senna_output, "%s", SENNA_Hash_key(psg_left_hash, psg_labels[j*tokens->n+i])); fprintf(senna_output, "*"); for(j = 0; j < n_psg_level; j++) fprintf(senna_output, "%s", SENNA_Hash_key(psg_right_hash, psg_labels[j*tokens->n+i])); if(i == tokens->n-1) { if(!is_psg_one_segment) fprintf(senna_output, ")"); fprintf(senna_output, ")"); } } fprintf(senna_output, "\n"); } fprintf(senna_output, "\n"); /* end of sentence */ } if (output_pipe) { fclose(senna_output); } } while (pipe_mode); if(opt_posvbs) SENNA_free(vbs_labels); if(opt_usrvbs) { SENNA_free(vbs_labels); SENNA_fclose(opt_usrvbs); } SENNA_Tokenizer_free(tokenizer); SENNA_POS_free(pos); SENNA_CHK_free(chk); SENNA_PT0_free(pt0); SENNA_NER_free(ner); SENNA_VBS_free(vbs); SENNA_SRL_free(srl); SENNA_PSG_free(psg); SENNA_Hash_free(word_hash); SENNA_Hash_free(caps_hash); SENNA_Hash_free(suff_hash); SENNA_Hash_free(gazt_hash); SENNA_Hash_free(gazl_hash); SENNA_Hash_free(gazm_hash); SENNA_Hash_free(gazo_hash); SENNA_Hash_free(gazp_hash); SENNA_Hash_free(pos_hash); SENNA_Hash_free(chk_hash); SENNA_Hash_free(pt0_hash); 
SENNA_Hash_free(ner_hash); SENNA_Hash_free(vbs_hash); SENNA_Hash_free(srl_hash); SENNA_Hash_free(psg_left_hash); SENNA_Hash_free(psg_right_hash); free(sentence); } return 0; }
// Runs the SENNA pipeline on `text` and returns the resulting tags.
//
// For every analysis enabled in type_ (kPos, kChk, kNer, kSrl, kPsg) one Tag
// per token is appended, in that order of analyses.  Each Tag is built from
// the token's surface text (the substring of `text` delimited by the
// tokenizer offsets), the analysis kind and the label string.
//
// Returns an empty vector when `text` is empty or yields no tokens.
//
// Side effects visible in this function: word_to_pos is updated with the POS
// label of every token, n_verbs / n_psg_level / is_psg_one_segment are
// overwritten by the SRL and PSG steps (all declared outside this function),
// and a progress line is written to std::cout for POS, chunking, NER and PSG.
vector<Tagger::Tag> Tagger::Tags(const string& text) const {
  vector<Tag> tags;
  if (text.empty()) {
    return tags;
  }

  // TODO(esawin): Is this thread-safe?
  SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer_, text.c_str());
  if (tokens->n == 0) {
    // LOG(WARNING) << "Tokenizer failed.";
    return tags;
  }
  const int n_tokens = tokens->n;

  // TODO(esawin): Is this thread-safe?
  // POS tagging is always carried out because CHK, PT0, VBS and PSG all take
  // the POS labels as input.
  int* pos_labels = SENNA_POS_forward(pos_, tokens->word_idx, tokens->caps_idx,
                                      tokens->suff_idx, n_tokens);

  // Surface form of every token, extracted once and shared by all analyses.
  vector<string> surface;
  surface.reserve(n_tokens);
  for (int i = 0; i < n_tokens; ++i) {
    surface.push_back(text.substr(
        tokens->start_offset[i],
        tokens->end_offset[i] - tokens->start_offset[i]));
    word_to_pos[surface.back()] = pos_hash_->keys[pos_labels[i]];
  }

  // One tag per token for every enabled analysis.
  vector<Tag>::size_type n_analyses = 0;
  if (type_ & kPos) ++n_analyses;
  if (type_ & kChk) ++n_analyses;
  if (type_ & kNer) ++n_analyses;
  if (type_ & kSrl) ++n_analyses;
  if (type_ & kPsg) ++n_analyses;
  tags.reserve(surface.size() * n_analyses);

  if (type_ & kPos) {
    std::cout << "POS ..." << std::endl;
    // Part-of-speech tagging.
    for (int i = 0; i < n_tokens; ++i) {
      tags.push_back(Tag(surface[i], kPos, pos_hash_->keys[pos_labels[i]]));
    }
  }

  if (type_ & kChk) {
    std::cout << "Chunking ..." << std::endl;
    // Chunking.
    int* chk_labels = SENNA_CHK_forward(chk_, tokens->word_idx,
                                        tokens->caps_idx, pos_labels, n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
      tags.push_back(Tag(surface[i], kChk, chk_hash_->keys[chk_labels[i]]));
    }
  }

  if (type_ & kNer) {
    std::cout << "NER ..." << std::endl;
    // Named entity recognition.
    int* ner_labels = SENNA_NER_forward(ner_, tokens->word_idx,
                                        tokens->caps_idx, tokens->gazl_idx,
                                        tokens->gazm_idx, tokens->gazo_idx,
                                        tokens->gazp_idx, n_tokens);
    for (int i = 0; i < n_tokens; ++i) {
      tags.push_back(Tag(surface[i], kNer, ner_hash_->keys[ner_labels[i]]));
    }
  }

  if (type_ & kSrl) {
    // Semantic Role Labeling.
    int* pt0_labels = SENNA_PT0_forward(pt0_, tokens->word_idx,
                                        tokens->caps_idx, pos_labels, n_tokens);
    int* vbs_labels = SENNA_VBS_forward(vbs_, tokens->word_idx,
                                        tokens->caps_idx, pos_labels, n_tokens);

    // Turn the VBS output into 0/1 verb flags in place and count the verbs.
    n_verbs = 0;
    for (int i = 0; i < n_tokens; ++i) {
      vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx);
      n_verbs += vbs_labels[i];
    }

    int** srl_labels = SENNA_SRL_forward(srl_, tokens->word_idx,
                                         tokens->caps_idx, pt0_labels,
                                         vbs_labels, n_tokens);

    // Label layout: "<verb or ->|<role for verb 0>|<role for verb 1>|..."
    for (int i = 0; i < n_tokens; ++i) {
      string label = (vbs_labels[i] ? tokens->words[i] : "-");
      for (int j = 0; j < n_verbs; ++j) {
        label += "|";
        label += string(srl_hash_->keys[srl_labels[j][i]]);
      }
      tags.push_back(Tag(surface[i], kSrl, label));
    }
  }

  if (type_ & kPsg) {
    std::cout << "PSG ..." << std::endl;
    // Probabilistic Parsing.
    int* psg_labels;
    SENNA_PSG_forward(psg_, tokens->word_idx, tokens->caps_idx, pos_labels,
                      n_tokens, &psg_labels, &n_psg_level);

    // Check whether the top level is one segment covering the whole sentence.
    // psg_labels holds n_psg_level rows of n_tokens labels, top level last;
    // (label-1) % 4: 0 = B-, 1 = I-, 2 = E-, 3 = S-.
    int* psg_top_labels = psg_labels + (n_psg_level - 1) * n_tokens;
    if (n_tokens == 1) {
      is_psg_one_segment = ((psg_top_labels[0] - 1) % 4 == 3);  /* S- ? */
    } else {
      is_psg_one_segment = ((psg_top_labels[0] - 1) % 4 == 0) &&
                           ((psg_top_labels[n_tokens - 1] - 1) % 4 == 2);  /* B- or E- ? */
    }
    for (int i = 1; is_psg_one_segment && (i < n_tokens - 1); ++i) {
      if ((psg_top_labels[i] - 1) % 4 != 1) {  /* I- ? */
        is_psg_one_segment = 0;
      }
    }

    for (int i = 0; i < n_tokens; ++i) {
      string label = "";
      if (i == 0) {
        // Root wrapper "(S1", plus an extra "(S" when the top level is not
        // already a single full-sentence segment.  The closing side below
        // adds the matching ")" / "))".
        label += (!is_psg_one_segment ? "(S1(S" : "(S1");
      }
      // Opening brackets from the top level down, closing ones bottom up.
      for (int j = n_psg_level - 1; j >= 0; j--) {
        label += psg_left_hash_->keys[psg_labels[j * n_tokens + i]];
      }
      label += "*";
      for (int j = 0; j < n_psg_level; j++) {
        label += psg_right_hash_->keys[psg_labels[j * n_tokens + i]];
      }
      if (i == n_tokens - 1) {
        label += (!is_psg_one_segment ? "))" : ")");
      }
      tags.push_back(Tag(surface[i], kPsg, label));
    }
  }

  return tags;
}