// Constructs a tagger: stores the requested tagging type, silences SENNA's
// verbose output, then loads every hash table and model from the directory
// named by SennaPath and builds the tokenizer on top of the input hashes.
Tagger::Tagger(uint8_t type) : type_(type)
{
  SENNA_set_verbose_mode(false);

  // Directory that holds the "hash/" and "data/" resources.
  const char* const senna_dir = SennaPath.c_str();

  // Input feature hashes.
  word_hash_ = SENNA_Hash_new(senna_dir, "hash/words.lst");
  caps_hash_ = SENNA_Hash_new(senna_dir, "hash/caps.lst");
  suff_hash_ = SENNA_Hash_new(senna_dir, "hash/suffix.lst");
  gazt_hash_ = SENNA_Hash_new(senna_dir, "hash/gazetteer.lst");

  // Gazetteer hashes, each restricted to its admissible keys.
  gazl_hash_ = SENNA_Hash_new_with_admissible_keys(senna_dir, "hash/ner.loc.lst", "data/ner.loc.dat");
  gazm_hash_ = SENNA_Hash_new_with_admissible_keys(senna_dir, "hash/ner.msc.lst", "data/ner.msc.dat");
  gazo_hash_ = SENNA_Hash_new_with_admissible_keys(senna_dir, "hash/ner.org.lst", "data/ner.org.dat");
  gazp_hash_ = SENNA_Hash_new_with_admissible_keys(senna_dir, "hash/ner.per.lst", "data/ner.per.dat");

  // Output label hashes.
  pos_hash_       = SENNA_Hash_new(senna_dir, "hash/pos.lst");
  chk_hash_       = SENNA_Hash_new(senna_dir, "hash/chk.lst");
  pt0_hash_       = SENNA_Hash_new(senna_dir, "hash/pt0.lst");
  ner_hash_       = SENNA_Hash_new(senna_dir, "hash/ner.lst");
  vbs_hash_       = SENNA_Hash_new(senna_dir, "hash/vbs.lst");
  srl_hash_       = SENNA_Hash_new(senna_dir, "hash/srl.lst");
  psg_left_hash_  = SENNA_Hash_new(senna_dir, "hash/psg-left.lst");
  psg_right_hash_ = SENNA_Hash_new(senna_dir, "hash/psg-right.lst");

  // Models, one per task.
  pos_ = SENNA_POS_new(senna_dir, "data/pos.dat");
  chk_ = SENNA_CHK_new(senna_dir, "data/chk.dat");
  pt0_ = SENNA_PT0_new(senna_dir, "data/pt0.dat");
  ner_ = SENNA_NER_new(senna_dir, "data/ner.dat");
  vbs_ = SENNA_VBS_new(senna_dir, "data/vbs.dat");
  srl_ = SENNA_SRL_new(senna_dir, "data/srl.dat");
  psg_ = SENNA_PSG_new(senna_dir, "data/psg.dat");

  // The last argument is the tokenizer's "user tokens" flag (disabled here).
  tokenizer_ = SENNA_Tokenizer_new(word_hash_, caps_hash_, suff_hash_, gazt_hash_,
                                   gazl_hash_, gazm_hash_, gazo_hash_, gazp_hash_,
                                   false);
}
/* * allocates hash tables labels and provides a pointer to the data structure * has to be freed using freeSenna */ SENNA* sennaCreate(const char * opt_path) { SENNA* senna = (SENNA*) malloc(sizeof(SENNA)); CHECK_ALLOC(senna); senna->word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); senna->caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); senna->suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); senna->gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); senna->gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); senna->gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); senna->gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); senna->gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); // labels senna->pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); // senna->chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); // senna->pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); // senna->ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); // senna->vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); // senna->srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); senna->psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst"); senna->psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst"); senna->pos = SENNA_POS_new(opt_path, "data/pos.dat"); // senna->chk = SENNA_CHK_new(opt_path, "data/chk.dat"); // senna->pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); // senna->ner = SENNA_NER_new(opt_path, "data/ner.dat"); // senna->vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); // senna->srl = SENNA_SRL_new(opt_path, "data/srl.dat"); senna->psg = SENNA_PSG_new(opt_path, "data/psg.dat"); senna->tokenizer = SENNA_Tokenizer_new( senna->word_hash, senna->caps_hash, senna->suff_hash, senna->gazt_hash, senna->gazl_hash, senna->gazm_hash, senna->gazo_hash, 
senna->gazp_hash, 0); senna->lastSentence.tokens = NULL; senna->lastSentence.pos_labels = NULL; senna->lastSentence.psg_labels = NULL; senna->strbuf.ptr = (char *) malloc(sizeof(char) * 512); CHECK_ALLOC(senna->strbuf.ptr); senna->strbuf.length = 512; senna->strbuf.pos = 0; return senna; }
int main(int argc, char *argv[]) { int i, j; /* options */ char *opt_path = NULL; int opt_verbose = 0; int opt_notokentags = 0; int opt_offsettags = 0; int opt_iobtags = 0; int opt_brackettags = 0; int opt_posvbs = 0; int opt_usrtokens = 0; int opt_pos = 0; int opt_chk = 0; int opt_ner = 0; int opt_srl = 0; int opt_psg = 0; FILE *opt_usrvbs = NULL; FILE *senna_input = stdin; FILE *senna_output = stdout; int pipe_mode = 0; char *output_pipe = NULL; for(i = 1; i < argc; i++) { if(!strcmp(argv[i], "-verbose")) opt_verbose = 1; else if(!strcmp(argv[i], "-notokentags")) opt_notokentags = 1; else if(!strcmp(argv[i], "-offsettags")) opt_offsettags = 1; else if(!strcmp(argv[i], "-iobtags")) opt_iobtags = 1; else if(!strcmp(argv[i], "-brackettags")) opt_brackettags = 1; else if(!strcmp(argv[i], "-path")) { if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); opt_path = argv[i+1]; i++; } else if(!strcmp(argv[i], "-posvbs")) opt_posvbs = 1; else if(!strcmp(argv[i], "-usrtokens")) opt_usrtokens = 1; else if(!strcmp(argv[i], "-usrvbs")) { if(i+1 >= argc) SENNA_error("please provide a filename for the -usrvbs option"); opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb"); i++; } else if(!strcmp(argv[i], "-pos")) opt_pos = 1; else if(!strcmp(argv[i], "-chk")) opt_chk = 1; else if(!strcmp(argv[i], "-ner")) opt_ner = 1; else if(!strcmp(argv[i], "-srl")) opt_srl = 1; else if(!strcmp(argv[i], "-psg")) opt_psg = 1; else if(!strcmp(argv[i], "-maxsentsize")) { if(i+1 >= argc) SENNA_error("please provide a sentence size for the -maxsentsize option"); max_sent_size = atol(argv[i+1]); if(max_sent_size<0) SENNA_error("provide a positive value for the -maxsentsize option"); i++; } else if(!strcmp(argv[i], "-input_pipe")){ if(i+1 >= argc) SENNA_error("please provide the name of the input pipe"); senna_input = fopen(argv[i+1], "r"); if (senna_input == NULL) { SENNA_error("cannot open the input named pipe"); } pipe_mode = 1; i++; } else if(!strcmp(argv[i], "-output_pipe")){ 
if(i+1 >= argc) SENNA_error("please provide the name of the outputpipe"); output_pipe = argv[i+1]; i++; } else { printf("invalid argument: %s\n", argv[i]); help(argv[0]); } } SENNA_set_verbose_mode(opt_verbose); if(!opt_pos && !opt_chk && !opt_ner && !opt_srl && !opt_psg) /* the user does not know what he wants... */ opt_pos = opt_chk = opt_ner = opt_srl = opt_psg = 1; /* so give him everything (aren't we insane?) */ /* the real thing */ { char *sentence = NULL; char target_vb[MAX_TARGET_VB_SIZE]; int *chk_labels = NULL; int *pt0_labels = NULL; int *pos_labels = NULL; int *ner_labels = NULL; int *vbs_labels = NULL; int **srl_labels = NULL; int *psg_labels = NULL; int n_psg_level = 0; int is_psg_one_segment = 0; int vbs_hash_novb_idx = 22; int n_verbs = 0; sentence = malloc(max_sent_size + 1); SENNA_message("Maximum sentence size %ld", max_sent_size); /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); 
SENNA_Hash *psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst"); SENNA_Hash *psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); SENNA_PSG *psg = SENNA_PSG_new(opt_path, "data/psg.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); if(opt_iobtags) { SENNA_Hash_convert_IOBES_to_IOB(chk_hash); SENNA_Hash_convert_IOBES_to_IOB(ner_hash); SENNA_Hash_convert_IOBES_to_IOB(srl_hash); } else if(opt_brackettags) { SENNA_Hash_convert_IOBES_to_brackets(chk_hash); SENNA_Hash_convert_IOBES_to_brackets(ner_hash); SENNA_Hash_convert_IOBES_to_brackets(srl_hash); } SENNA_message("ready"); do { if (output_pipe) { senna_output = fopen(output_pipe, "w"); if (senna_output == NULL) { SENNA_error("cannot open the output named pipe"); } } else { senna_output = stdout; } while(fgets(sentence, max_sent_size + 1, senna_input)) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence); if(tokens->n == 0) continue; pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); if(opt_chk) chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_srl) pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_ner) ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n); if(opt_srl) { if(opt_usrvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < 
tokens->n; i++) { if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) ); n_verbs += vbs_labels[i]; } if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); if(strlen(target_vb) > 0) SENNA_error("sentence size does not match in user verbs file"); } else if(opt_posvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V'); n_verbs += vbs_labels[i]; } } else { vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } } } if(opt_srl) srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); if(opt_psg) { SENNA_PSG_forward(psg, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n, &psg_labels, &n_psg_level); /* check if top level takes the full sentence */ { int *psg_top_labels = psg_labels + (n_psg_level-1)*tokens->n; if(tokens->n == 1) is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */ else is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[tokens->n-1]-1) % 4 == 2); /* B- or E- ? */ for(i = 1; is_psg_one_segment && (i < tokens->n-1); i++) { if((psg_top_labels[i]-1) % 4 != 1) /* I- ? 
*/ is_psg_one_segment = 0; } } } for(i = 0; i < tokens->n; i++) { if(!opt_notokentags) fprintf(senna_output, "%15s", tokens->words[i]); if(opt_offsettags) fprintf(senna_output, "\t%d %d", tokens->start_offset[i], tokens->end_offset[i]); if(opt_pos) fprintf(senna_output, "\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i])); if(opt_chk) fprintf(senna_output, "\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i])); if(opt_ner) fprintf(senna_output, "\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i])); if(opt_srl) { fprintf(senna_output, "\t%15s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++) fprintf(senna_output, "\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); } if(opt_psg) /* last, can be long */ { fprintf(senna_output, "\t"); if(i == 0) { fprintf(senna_output, "(S1"); if(!is_psg_one_segment) fprintf(senna_output, "(S"); } for(j = n_psg_level-1; j >= 0; j--) fprintf(senna_output, "%s", SENNA_Hash_key(psg_left_hash, psg_labels[j*tokens->n+i])); fprintf(senna_output, "*"); for(j = 0; j < n_psg_level; j++) fprintf(senna_output, "%s", SENNA_Hash_key(psg_right_hash, psg_labels[j*tokens->n+i])); if(i == tokens->n-1) { if(!is_psg_one_segment) fprintf(senna_output, ")"); fprintf(senna_output, ")"); } } fprintf(senna_output, "\n"); } fprintf(senna_output, "\n"); /* end of sentence */ } if (output_pipe) { fclose(senna_output); } } while (pipe_mode); if(opt_posvbs) SENNA_free(vbs_labels); if(opt_usrvbs) { SENNA_free(vbs_labels); SENNA_fclose(opt_usrvbs); } SENNA_Tokenizer_free(tokenizer); SENNA_POS_free(pos); SENNA_CHK_free(chk); SENNA_PT0_free(pt0); SENNA_NER_free(ner); SENNA_VBS_free(vbs); SENNA_SRL_free(srl); SENNA_PSG_free(psg); SENNA_Hash_free(word_hash); SENNA_Hash_free(caps_hash); SENNA_Hash_free(suff_hash); SENNA_Hash_free(gazt_hash); SENNA_Hash_free(gazl_hash); SENNA_Hash_free(gazm_hash); SENNA_Hash_free(gazo_hash); SENNA_Hash_free(gazp_hash); SENNA_Hash_free(pos_hash); SENNA_Hash_free(chk_hash); SENNA_Hash_free(pt0_hash); 
SENNA_Hash_free(ner_hash); SENNA_Hash_free(vbs_hash); SENNA_Hash_free(srl_hash); SENNA_Hash_free(psg_left_hash); SENNA_Hash_free(psg_right_hash); free(sentence); } return 0; }
int main(int argc, char *argv[]) { int i, j; /* options */ char * sentence = NULL; char *opt_path = NULL; int opt_verbose = 0; int opt_notokentags = 0; int opt_iobtags = 0; int opt_brackettags = 0; int opt_posvbs = 0; int opt_usrtokens = 0; int opt_pos = 0; int opt_chk = 0; int opt_ner = 0; int opt_srl = 0; FILE *opt_usrvbs = NULL; for(i = 1; i < argc; i++) { if(!strcmp(argv[i], "-sentence")){ if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); sentence = argv[i+1]; i++; }else if(!strcmp(argv[i], "-verbose")) opt_verbose = 1; else if(!strcmp(argv[i], "-notokentags")) opt_notokentags = 1; else if(!strcmp(argv[i], "-iobtags")) opt_iobtags = 1; else if(!strcmp(argv[i], "-brackettags")) opt_brackettags = 1; else if(!strcmp(argv[i], "-path")) { if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); opt_path = argv[i+1]; i++; } else if(!strcmp(argv[i], "-posvbs")) opt_posvbs = 1; else if(!strcmp(argv[i], "-usrtokens")) opt_usrtokens = 1; else if(!strcmp(argv[i], "-usrvbs")) { if(i+1 >= argc) SENNA_error("please provide a filename for the -usrvbs option"); opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb"); i++; } else if(!strcmp(argv[i], "-pos")) opt_pos = 1; else if(!strcmp(argv[i], "-chk")) opt_chk = 1; else if(!strcmp(argv[i], "-ner")) opt_ner = 1; else if(!strcmp(argv[i], "-srl")) opt_srl = 1; else { printf("invalid argument: %s\n", argv[i]); help(argv[0]); } } SENNA_set_verbose_mode(opt_verbose); if(!opt_pos && !opt_chk && !opt_ner && !opt_srl) /* the user does not know what he wants... 
*/ opt_pos = opt_chk = opt_ner = opt_srl = 1; /* so give him everything */ /* the real thing */ { //char sentence[MAX_SENTENCE_SIZE]; char target_vb[MAX_TARGET_VB_SIZE]; int *chk_labels = NULL; int *pt0_labels = NULL; int *pos_labels = NULL; int *ner_labels = NULL; int *vbs_labels = NULL; int **srl_labels = NULL; int vbs_hash_novb_idx = 22; int n_verbs = 0; /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); 
if(opt_iobtags) { SENNA_Hash_convert_IOBES_to_IOB(chk_hash); SENNA_Hash_convert_IOBES_to_IOB(ner_hash); SENNA_Hash_convert_IOBES_to_IOB(srl_hash); } else if(opt_brackettags) { SENNA_Hash_convert_IOBES_to_brackets(chk_hash); SENNA_Hash_convert_IOBES_to_brackets(ner_hash); SENNA_Hash_convert_IOBES_to_brackets(srl_hash); } SENNA_message("ready"); // while(fgets(sentence, MAX_SENTENCE_SIZE, stdin)) if(sentence != NULL) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence); if(tokens->n == 0) continue; pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); if(opt_chk) chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_srl) pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_ner) ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n); if(opt_srl) { if(opt_usrvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) ); n_verbs += vbs_labels[i]; } if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); if(strlen(target_vb) > 0) SENNA_error("sentence size does not match in user verbs file"); } else if(opt_posvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V'); n_verbs += vbs_labels[i]; } } else { vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (vbs_labels[i] != 
vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } } } if(opt_srl) srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); for(i = 0; i < tokens->n; i++) { if(!opt_notokentags) printf("%15s", tokens->words[i]); if(opt_pos) printf("\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i])); if(opt_chk) printf("\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i])); if(opt_ner) printf("\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i])); if(opt_srl) { printf("\t%15s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++) printf("\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); } printf("\n"); } printf("\n"); /* end of sentence */ } if(opt_posvbs) SENNA_free(vbs_labels); if(opt_usrvbs) { SENNA_free(vbs_labels); SENNA_fclose(opt_usrvbs); } SENNA_Tokenizer_free(tokenizer); SENNA_POS_free(pos); SENNA_CHK_free(chk); SENNA_PT0_free(pt0); SENNA_NER_free(ner); SENNA_VBS_free(vbs); SENNA_SRL_free(srl); SENNA_Hash_free(word_hash); SENNA_Hash_free(caps_hash); SENNA_Hash_free(suff_hash); SENNA_Hash_free(gazt_hash); SENNA_Hash_free(gazl_hash); SENNA_Hash_free(gazm_hash); SENNA_Hash_free(gazo_hash); SENNA_Hash_free(gazp_hash); SENNA_Hash_free(pos_hash); SENNA_Hash_free(chk_hash); SENNA_Hash_free(pt0_hash); SENNA_Hash_free(ner_hash); SENNA_Hash_free(vbs_hash); SENNA_Hash_free(srl_hash); } return 0; }
int main_tr4(int argc, char *argv[]) { int i, j; /************************************************** SENNA setup **************************************************/ /* options */ char *opt_path = NULL; int opt_usrtokens = 0; int vbs_hash_novb_idx = 22; /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); /* FANN setup */ const unsigned int num_input = 2; const unsigned int num_output = 1; const unsigned int num_layers = 3; const unsigned int num_neurons_hidden = 3; const float desired_error = (const float) 0.00001; const unsigned int max_epochs = 500000; const unsigned int epochs_between_reports = 100000; struct fann *ann = fann_create_standard(num_layers, num_input, num_neurons_hidden, num_output); fann_set_activation_function_hidden(ann, FANN_SIGMOID_SYMMETRIC); 
fann_set_activation_function_output(ann, FANN_SIGMOID_SYMMETRIC); struct fann_train_data *train_data; train_data = fann_create_train_from_callback(4, 2, 1, &data_callback); /************************************************** main program **************************************************/ /* Read the training file line by line*/ FILE * fp; char * line = NULL; size_t len = 0; ssize_t read; fp = fopen("training.dat", "r"); int targets[] = {1, 1, 1, -1}; int id=0; while ((read = getline(&line, &len, fp)) != -1) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, line); int *pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); int *pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); int n_verbs = 0; char target_vb[MAX_TARGET_VB_SIZE]; int *vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); for(i = 0; i < tokens->n; i++){ vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } int **srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); /* Logic */ train_data->input[id][0] = -1; train_data->input[id][1] = -1; for(i = 0; i < tokens->n; i++){ printf("%s %s ",tokens->words[i], SENNA_Hash_key(pos_hash, pos_labels[i])); printf("%s", (vbs_labels[i] ? 
tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++){ printf(" '%s'", SENNA_Hash_key(srl_hash, srl_labels[j][i])); //printf("%s %s %i %i", tokens->words[i], SENNA_Hash_key(srl_hash, srl_labels[j][i]), strcmp(tokens->words[i],"want")); if(strcmp(tokens->words[i],"want") == 0 && strcmp(SENNA_Hash_key(srl_hash, srl_labels[j][i]),"S-V") == 0){ train_data->input[id][0] = 1; } else if(strcmp(tokens->words[i],"pizza") == 0 && strcmp(SENNA_Hash_key(srl_hash, srl_labels[j][i]),"E-A1") == 0){ train_data->input[id][1] = 1; } } printf("\n"); } train_data->output[id][0] = targets[id]; printf("%s", line); printf("Input: %f %f, Output: %d\n\n", train_data->input[id][0], train_data->input[id][1], targets[id]); id++; } /* Train a classifier */ fann_train_on_data(ann, train_data, max_epochs, epochs_between_reports, desired_error); fann_save(ann, "asds.net"); fann_destroy(ann); fclose(fp); return 0; }
int main_tr3(int argc, char *argv[]) { int i, j; /************************************************** SENNA setup **************************************************/ /* options */ char *opt_path = NULL; int opt_usrtokens = 0; int vbs_hash_novb_idx = 22; /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); /************************************************** main program **************************************************/ /* Read the training file line by line*/ FILE * fp; char * line = NULL; size_t len = 0; ssize_t read; fp = fopen("training.dat", "r"); int targets[] = {-1, 1, -1, -1}; int id=0; while ((read = getline(&line, &len, fp)) != -1) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, line); int *pos_labels = SENNA_POS_forward(pos, tokens->word_idx, 
tokens->caps_idx, tokens->suff_idx, tokens->n); int *pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); int n_verbs = 0; char target_vb[MAX_TARGET_VB_SIZE]; int *vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); for(i = 0; i < tokens->n; i++){ vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } int **srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); /* Logic */ int x0=-1, x1=-1; for(i = 0; i < tokens->n; i++){ printf("%s %s ",tokens->words[i], SENNA_Hash_key(pos_hash, pos_labels[i])); printf("%s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++){ printf(" %s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); if(strcmp(tokens->words[i],"want") == 0 && strcmp(SENNA_Hash_key(srl_hash, srl_labels[j][i]),"S-V")){ x0 = 1; } else if(strcmp(tokens->words[i],"pizza") == 0 && strcmp(SENNA_Hash_key(srl_hash, srl_labels[j][i]),"E-A1")){ x1 = 1; } } printf("\n"); } printf("%s", line); printf("Input: %d %d, Output: %d\n\n", x0, x1, targets[id]); id++; } fclose(fp); return 0; }