/*
 * Destructor: hands every SENNA handle held by this Tagger back to its
 * matching SENNA_*_free routine. The tokenizer goes first, then the eight
 * task objects, then the sixteen hash tables -- the same order in which the
 * calls have always been issued.
 */
Tagger::~Tagger()
{
  SENNA_Tokenizer_free(tokenizer_);

  /* task objects */
  SENNA_POS_free(pos_);
  SENNA_CHK_free(chk_);
  SENNA_PT0_free(pt0_);
  SENNA_NER_free(ner_);
  SENNA_VBS_free(vbs_);
  SENNA_SRL_free(srl_);
  SENNA_PSG_free(psg_);

  /* hash tables, listed in release order */
  SENNA_Hash *const owned_hashes[] = {
    word_hash_, caps_hash_, suff_hash_, gazt_hash_,
    gazl_hash_, gazm_hash_, gazo_hash_, gazp_hash_,
    pos_hash_,  chk_hash_,  pt0_hash_,  ner_hash_,
    vbs_hash_,  srl_hash_,  psg_left_hash_, psg_right_hash_
  };
  const int n_owned_hashes = (int)(sizeof(owned_hashes) / sizeof(owned_hashes[0]));

  for(int k = 0; k < n_owned_hashes; k++)
    SENNA_Hash_free(owned_hashes[k]);
}
int main(int argc, char *argv[]) { int i, j; /* options */ char *opt_path = NULL; int opt_verbose = 0; int opt_notokentags = 0; int opt_offsettags = 0; int opt_iobtags = 0; int opt_brackettags = 0; int opt_posvbs = 0; int opt_usrtokens = 0; int opt_pos = 0; int opt_chk = 0; int opt_ner = 0; int opt_srl = 0; int opt_psg = 0; FILE *opt_usrvbs = NULL; FILE *senna_input = stdin; FILE *senna_output = stdout; int pipe_mode = 0; char *output_pipe = NULL; for(i = 1; i < argc; i++) { if(!strcmp(argv[i], "-verbose")) opt_verbose = 1; else if(!strcmp(argv[i], "-notokentags")) opt_notokentags = 1; else if(!strcmp(argv[i], "-offsettags")) opt_offsettags = 1; else if(!strcmp(argv[i], "-iobtags")) opt_iobtags = 1; else if(!strcmp(argv[i], "-brackettags")) opt_brackettags = 1; else if(!strcmp(argv[i], "-path")) { if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); opt_path = argv[i+1]; i++; } else if(!strcmp(argv[i], "-posvbs")) opt_posvbs = 1; else if(!strcmp(argv[i], "-usrtokens")) opt_usrtokens = 1; else if(!strcmp(argv[i], "-usrvbs")) { if(i+1 >= argc) SENNA_error("please provide a filename for the -usrvbs option"); opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb"); i++; } else if(!strcmp(argv[i], "-pos")) opt_pos = 1; else if(!strcmp(argv[i], "-chk")) opt_chk = 1; else if(!strcmp(argv[i], "-ner")) opt_ner = 1; else if(!strcmp(argv[i], "-srl")) opt_srl = 1; else if(!strcmp(argv[i], "-psg")) opt_psg = 1; else if(!strcmp(argv[i], "-maxsentsize")) { if(i+1 >= argc) SENNA_error("please provide a sentence size for the -maxsentsize option"); max_sent_size = atol(argv[i+1]); if(max_sent_size<0) SENNA_error("provide a positive value for the -maxsentsize option"); i++; } else if(!strcmp(argv[i], "-input_pipe")){ if(i+1 >= argc) SENNA_error("please provide the name of the input pipe"); senna_input = fopen(argv[i+1], "r"); if (senna_input == NULL) { SENNA_error("cannot open the input named pipe"); } pipe_mode = 1; i++; } else if(!strcmp(argv[i], "-output_pipe")){ 
if(i+1 >= argc) SENNA_error("please provide the name of the outputpipe"); output_pipe = argv[i+1]; i++; } else { printf("invalid argument: %s\n", argv[i]); help(argv[0]); } } SENNA_set_verbose_mode(opt_verbose); if(!opt_pos && !opt_chk && !opt_ner && !opt_srl && !opt_psg) /* the user does not know what he wants... */ opt_pos = opt_chk = opt_ner = opt_srl = opt_psg = 1; /* so give him everything (aren't we insane?) */ /* the real thing */ { char *sentence = NULL; char target_vb[MAX_TARGET_VB_SIZE]; int *chk_labels = NULL; int *pt0_labels = NULL; int *pos_labels = NULL; int *ner_labels = NULL; int *vbs_labels = NULL; int **srl_labels = NULL; int *psg_labels = NULL; int n_psg_level = 0; int is_psg_one_segment = 0; int vbs_hash_novb_idx = 22; int n_verbs = 0; sentence = malloc(max_sent_size + 1); SENNA_message("Maximum sentence size %ld", max_sent_size); /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); 
SENNA_Hash *psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst"); SENNA_Hash *psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); SENNA_PSG *psg = SENNA_PSG_new(opt_path, "data/psg.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); if(opt_iobtags) { SENNA_Hash_convert_IOBES_to_IOB(chk_hash); SENNA_Hash_convert_IOBES_to_IOB(ner_hash); SENNA_Hash_convert_IOBES_to_IOB(srl_hash); } else if(opt_brackettags) { SENNA_Hash_convert_IOBES_to_brackets(chk_hash); SENNA_Hash_convert_IOBES_to_brackets(ner_hash); SENNA_Hash_convert_IOBES_to_brackets(srl_hash); } SENNA_message("ready"); do { if (output_pipe) { senna_output = fopen(output_pipe, "w"); if (senna_output == NULL) { SENNA_error("cannot open the output named pipe"); } } else { senna_output = stdout; } while(fgets(sentence, max_sent_size + 1, senna_input)) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence); if(tokens->n == 0) continue; pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); if(opt_chk) chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_srl) pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_ner) ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n); if(opt_srl) { if(opt_usrvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < 
tokens->n; i++) { if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) ); n_verbs += vbs_labels[i]; } if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); if(strlen(target_vb) > 0) SENNA_error("sentence size does not match in user verbs file"); } else if(opt_posvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V'); n_verbs += vbs_labels[i]; } } else { vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } } } if(opt_srl) srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); if(opt_psg) { SENNA_PSG_forward(psg, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n, &psg_labels, &n_psg_level); /* check if top level takes the full sentence */ { int *psg_top_labels = psg_labels + (n_psg_level-1)*tokens->n; if(tokens->n == 1) is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */ else is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[tokens->n-1]-1) % 4 == 2); /* B- or E- ? */ for(i = 1; is_psg_one_segment && (i < tokens->n-1); i++) { if((psg_top_labels[i]-1) % 4 != 1) /* I- ? 
*/ is_psg_one_segment = 0; } } } for(i = 0; i < tokens->n; i++) { if(!opt_notokentags) fprintf(senna_output, "%15s", tokens->words[i]); if(opt_offsettags) fprintf(senna_output, "\t%d %d", tokens->start_offset[i], tokens->end_offset[i]); if(opt_pos) fprintf(senna_output, "\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i])); if(opt_chk) fprintf(senna_output, "\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i])); if(opt_ner) fprintf(senna_output, "\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i])); if(opt_srl) { fprintf(senna_output, "\t%15s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++) fprintf(senna_output, "\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); } if(opt_psg) /* last, can be long */ { fprintf(senna_output, "\t"); if(i == 0) { fprintf(senna_output, "(S1"); if(!is_psg_one_segment) fprintf(senna_output, "(S"); } for(j = n_psg_level-1; j >= 0; j--) fprintf(senna_output, "%s", SENNA_Hash_key(psg_left_hash, psg_labels[j*tokens->n+i])); fprintf(senna_output, "*"); for(j = 0; j < n_psg_level; j++) fprintf(senna_output, "%s", SENNA_Hash_key(psg_right_hash, psg_labels[j*tokens->n+i])); if(i == tokens->n-1) { if(!is_psg_one_segment) fprintf(senna_output, ")"); fprintf(senna_output, ")"); } } fprintf(senna_output, "\n"); } fprintf(senna_output, "\n"); /* end of sentence */ } if (output_pipe) { fclose(senna_output); } } while (pipe_mode); if(opt_posvbs) SENNA_free(vbs_labels); if(opt_usrvbs) { SENNA_free(vbs_labels); SENNA_fclose(opt_usrvbs); } SENNA_Tokenizer_free(tokenizer); SENNA_POS_free(pos); SENNA_CHK_free(chk); SENNA_PT0_free(pt0); SENNA_NER_free(ner); SENNA_VBS_free(vbs); SENNA_SRL_free(srl); SENNA_PSG_free(psg); SENNA_Hash_free(word_hash); SENNA_Hash_free(caps_hash); SENNA_Hash_free(suff_hash); SENNA_Hash_free(gazt_hash); SENNA_Hash_free(gazl_hash); SENNA_Hash_free(gazm_hash); SENNA_Hash_free(gazo_hash); SENNA_Hash_free(gazp_hash); SENNA_Hash_free(pos_hash); SENNA_Hash_free(chk_hash); SENNA_Hash_free(pt0_hash); 
SENNA_Hash_free(ner_hash); SENNA_Hash_free(vbs_hash); SENNA_Hash_free(srl_hash); SENNA_Hash_free(psg_left_hash); SENNA_Hash_free(psg_right_hash); free(sentence); } return 0; }
int main(int argc, char *argv[]) { int i, j; /* options */ char * sentence = NULL; char *opt_path = NULL; int opt_verbose = 0; int opt_notokentags = 0; int opt_iobtags = 0; int opt_brackettags = 0; int opt_posvbs = 0; int opt_usrtokens = 0; int opt_pos = 0; int opt_chk = 0; int opt_ner = 0; int opt_srl = 0; FILE *opt_usrvbs = NULL; for(i = 1; i < argc; i++) { if(!strcmp(argv[i], "-sentence")){ if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); sentence = argv[i+1]; i++; }else if(!strcmp(argv[i], "-verbose")) opt_verbose = 1; else if(!strcmp(argv[i], "-notokentags")) opt_notokentags = 1; else if(!strcmp(argv[i], "-iobtags")) opt_iobtags = 1; else if(!strcmp(argv[i], "-brackettags")) opt_brackettags = 1; else if(!strcmp(argv[i], "-path")) { if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); opt_path = argv[i+1]; i++; } else if(!strcmp(argv[i], "-posvbs")) opt_posvbs = 1; else if(!strcmp(argv[i], "-usrtokens")) opt_usrtokens = 1; else if(!strcmp(argv[i], "-usrvbs")) { if(i+1 >= argc) SENNA_error("please provide a filename for the -usrvbs option"); opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb"); i++; } else if(!strcmp(argv[i], "-pos")) opt_pos = 1; else if(!strcmp(argv[i], "-chk")) opt_chk = 1; else if(!strcmp(argv[i], "-ner")) opt_ner = 1; else if(!strcmp(argv[i], "-srl")) opt_srl = 1; else { printf("invalid argument: %s\n", argv[i]); help(argv[0]); } } SENNA_set_verbose_mode(opt_verbose); if(!opt_pos && !opt_chk && !opt_ner && !opt_srl) /* the user does not know what he wants... 
*/ opt_pos = opt_chk = opt_ner = opt_srl = 1; /* so give him everything */ /* the real thing */ { //char sentence[MAX_SENTENCE_SIZE]; char target_vb[MAX_TARGET_VB_SIZE]; int *chk_labels = NULL; int *pt0_labels = NULL; int *pos_labels = NULL; int *ner_labels = NULL; int *vbs_labels = NULL; int **srl_labels = NULL; int vbs_hash_novb_idx = 22; int n_verbs = 0; /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); 
if(opt_iobtags) { SENNA_Hash_convert_IOBES_to_IOB(chk_hash); SENNA_Hash_convert_IOBES_to_IOB(ner_hash); SENNA_Hash_convert_IOBES_to_IOB(srl_hash); } else if(opt_brackettags) { SENNA_Hash_convert_IOBES_to_brackets(chk_hash); SENNA_Hash_convert_IOBES_to_brackets(ner_hash); SENNA_Hash_convert_IOBES_to_brackets(srl_hash); } SENNA_message("ready"); // while(fgets(sentence, MAX_SENTENCE_SIZE, stdin)) if(sentence != NULL) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence); if(tokens->n == 0) continue; pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); if(opt_chk) chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_srl) pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_ner) ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n); if(opt_srl) { if(opt_usrvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) ); n_verbs += vbs_labels[i]; } if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); if(strlen(target_vb) > 0) SENNA_error("sentence size does not match in user verbs file"); } else if(opt_posvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V'); n_verbs += vbs_labels[i]; } } else { vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (vbs_labels[i] != 
vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } } } if(opt_srl) srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); for(i = 0; i < tokens->n; i++) { if(!opt_notokentags) printf("%15s", tokens->words[i]); if(opt_pos) printf("\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i])); if(opt_chk) printf("\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i])); if(opt_ner) printf("\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i])); if(opt_srl) { printf("\t%15s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++) printf("\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); } printf("\n"); } printf("\n"); /* end of sentence */ } if(opt_posvbs) SENNA_free(vbs_labels); if(opt_usrvbs) { SENNA_free(vbs_labels); SENNA_fclose(opt_usrvbs); } SENNA_Tokenizer_free(tokenizer); SENNA_POS_free(pos); SENNA_CHK_free(chk); SENNA_PT0_free(pt0); SENNA_NER_free(ner); SENNA_VBS_free(vbs); SENNA_SRL_free(srl); SENNA_Hash_free(word_hash); SENNA_Hash_free(caps_hash); SENNA_Hash_free(suff_hash); SENNA_Hash_free(gazt_hash); SENNA_Hash_free(gazl_hash); SENNA_Hash_free(gazm_hash); SENNA_Hash_free(gazo_hash); SENNA_Hash_free(gazp_hash); SENNA_Hash_free(pos_hash); SENNA_Hash_free(chk_hash); SENNA_Hash_free(pt0_hash); SENNA_Hash_free(ner_hash); SENNA_Hash_free(vbs_hash); SENNA_Hash_free(srl_hash); } return 0; }