SENNA_Hash* SENNA_Hash_new(const char *path, const char *filename) { FILE *f; SENNA_Hash *hash; char **keys = NULL; int n_keys; char key[MAX_KEY_SIZE]; //@AureDi temporary array of key, which stores the characters from filenames. int i; SENNA_message("loading hash: %s%s", (path ? path : ""), (filename ? filename : "")); //@AureDi Count the number of key (maximum is 256 characters) f = SENNA_fopen(path, filename, "rt"); /* the t is to comply with Windows */ n_keys = 0; while(fgets(key, MAX_KEY_SIZE, f)) n_keys++; //@AureDi Remember reading how many times SENNA_fclose(f); keys = SENNA_malloc(n_keys, sizeof(char*)); //@AureDi f = SENNA_fopen(path, filename, "rt"); /* the t is to comply with Windows */ n_keys = 0; while(fgets(key, MAX_KEY_SIZE, f)) { int key_size = strlen(key); //@ We can assume that the length of each line is not exceeded 255. key[key_size-1] = '\0'; /* discard the newline */ //@AureDi \0 is the sign of the end. Because this method is applied to process single word, so we can assume the end word is newline character . keys[n_keys] = SENNA_malloc(key_size, sizeof(char)); strcpy(keys[n_keys], key); n_keys++; } SENNA_fclose(f); hash = SENNA_malloc(sizeof(SENNA_Hash), 1); hash->keys = keys; //@AureDi keys is the pointer of second array. hash->size = n_keys; //@AureDi n_keys is the length of second aaay hash->is_admissible_key = NULL; //@AUreDi admissible /* sorted or unsorted hash ? */ /* (unsorted cannot return an index for a key) */ hash->is_sorted = 1; // uper is roght for(i = 0; i < n_keys-1; i++) { if(strcmp(keys[i], keys[i+1]) >= 0) { hash->is_sorted = 0; break; } } return hash; }
SENNA_CHK *SENNA_CHK_new(const char *path, const char *subpath) { SENNA_CHK *chk = SENNA_malloc(sizeof(SENNA_CHK), 1); FILE *f; float dummy; memset(chk, 0, sizeof(SENNA_CHK)); f = SENNA_fopen(path, subpath, "rb"); SENNA_fread(&chk->window_size, sizeof(int), 1, f); SENNA_fread_tensor_2d(&chk->ll_word_weight, &chk->ll_word_size, &chk->ll_word_max_idx, f); SENNA_fread_tensor_2d(&chk->ll_caps_weight, &chk->ll_caps_size, &chk->ll_caps_max_idx, f); SENNA_fread_tensor_2d(&chk->ll_posl_weight, &chk->ll_posl_size, &chk->ll_posl_max_idx, f); SENNA_fread_tensor_2d(&chk->l1_weight, &chk->input_state_size, &chk->hidden_state_size, f); SENNA_fread_tensor_1d(&chk->l1_bias, &chk->hidden_state_size, f); SENNA_fread_tensor_2d(&chk->l2_weight, &chk->hidden_state_size, &chk->output_state_size, f); SENNA_fread_tensor_1d(&chk->l2_bias, &chk->output_state_size, f); SENNA_fread_tensor_1d(&chk->viterbi_score_init, &chk->output_state_size, f); SENNA_fread_tensor_2d(&chk->viterbi_score_trans, &chk->output_state_size, &chk->output_state_size, f); SENNA_fread(&chk->ll_word_padding_idx, sizeof(int), 1, f); SENNA_fread(&chk->ll_caps_padding_idx, sizeof(int), 1, f); SENNA_fread(&chk->ll_posl_padding_idx, sizeof(int), 1, f); SENNA_fread(&dummy, sizeof(float), 1, f); SENNA_fclose(f); if ((int)dummy != 777) SENNA_error("chk: data corrupted (or not IEEE floating computer)"); chk->input_state = NULL; chk->hidden_state = SENNA_malloc(sizeof(float), chk->hidden_state_size); chk->output_state = NULL; chk->labels = NULL; /* some info if you want verbose */ SENNA_message("chk: window size: %d", chk->window_size); SENNA_message("chk: vector size in word lookup table: %d", chk->ll_word_size); SENNA_message("chk: word lookup table size: %d", chk->ll_word_max_idx); SENNA_message("chk: vector size in caps lookup table: %d", chk->ll_caps_size); SENNA_message("chk: caps lookup table size: %d", chk->ll_caps_max_idx); SENNA_message("chk: vector size in pos lookup table: %d", chk->ll_posl_size); SENNA_message("chk: pos lookup table size: %d", chk->ll_posl_max_idx); SENNA_message("chk: number of hidden units: %d", chk->hidden_state_size); SENNA_message("chk: number of classes: %d", chk->output_state_size); return chk; }
SENNA_PSG* SENNA_PSG_new(const char *path, const char *subpath) { SENNA_PSG *psg = SENNA_malloc(sizeof(SENNA_PSG), 1); FILE *f; float dummy; memset(psg, 0, sizeof(SENNA_PSG)); f = SENNA_fopen(path, subpath, "rb"); SENNA_fread_tensor_2d(&psg->ll_word_weight, &psg->ll_word_size, &psg->ll_word_max_idx, f); SENNA_fread_tensor_2d(&psg->ll_caps_weight, &psg->ll_caps_size, &psg->ll_caps_max_idx, f); SENNA_fread_tensor_2d(&psg->ll_posl_weight, &psg->ll_posl_size, &psg->ll_posl_max_idx, f); SENNA_fread_tensor_2d(&psg->ll_psgl_weight, &psg->ll_psgl_size, &psg->ll_psgl_max_idx, f); SENNA_fread_tensor_2d(&psg->l1_weight, &psg->input_state_size, &psg->l1_state_size, f); SENNA_fread_tensor_1d(&psg->l1_bias, &psg->l1_state_size, f); SENNA_fread_tensor_2d(&psg->l2_bias, &psg->l1_state_size, &psg->window_size, f); SENNA_fread_tensor_2d(&psg->l3_weight, &psg->l2_state_size, &psg->l3_state_size, f); SENNA_fread_tensor_1d(&psg->l3_bias, &psg->l3_state_size, f); SENNA_fread_tensor_2d(&psg->l4_weight, &psg->l3_state_size, &psg->l4_state_size, f); SENNA_fread_tensor_1d(&psg->l4_bias, &psg->l4_state_size, f); SENNA_fread_tensor_1d(&psg->viterbi_score_init, &psg->l4_state_size, f); SENNA_fread_tensor_2d(&psg->viterbi_score_trans, &psg->l4_state_size, &psg->l4_state_size, f); SENNA_fread(&psg->ll_word_padding_idx, sizeof(int), 1, f); SENNA_fread(&psg->ll_caps_padding_idx, sizeof(int), 1, f); SENNA_fread(&psg->ll_posl_padding_idx, sizeof(int), 1, f); SENNA_fread(&psg->ll_psgl_padding_idx, sizeof(int), 1, f); SENNA_fread(&dummy, sizeof(float), 1, f); SENNA_fclose(f); if((int)dummy != 777) SENNA_error("psg: data corrupted (or not IEEE floating computer)"); psg->input_state = NULL; psg->l1_state = NULL; psg->l2_state = NULL; psg->l3_state = NULL; psg->l4_state = NULL; psg->labels = NULL; psg->treillis = SENNA_Treillis_new(); return psg; }
SENNA_Hash *SENNA_Hash_new_with_admissible_keys(const char *path, const char *filename, const char *admissible_keys_filename) { SENNA_Hash *hash = SENNA_Hash_new(path, filename); FILE *f; int admissiblekeyssize = 0; f = SENNA_fopen(path, admissible_keys_filename, "rb"); //@Aure b means that the file is binary file. SENNA_fseek(f, 0, SEEK_END); //@Aure #define SEEK_END 2 Reposition stream position indicator admissiblekeyssize = SENNA_ftell(f); //@ Get current position in stream if(admissiblekeyssize != hash->size) SENNA_error("inconsistent hash and admissible key files"); SENNA_fseek(f, 0, SEEK_SET); hash->is_admissible_key = SENNA_malloc(sizeof(char), admissiblekeyssize); SENNA_fread(hash->is_admissible_key, 1, admissiblekeyssize, f); //@ Read block of data from stream to char array is_admissible_key. SENNA_fclose(f); return hash; }
int main(int argc, char *argv[]) { int i, j; /* options */ char *opt_path = NULL; int opt_verbose = 0; int opt_notokentags = 0; int opt_offsettags = 0; int opt_iobtags = 0; int opt_brackettags = 0; int opt_posvbs = 0; int opt_usrtokens = 0; int opt_pos = 0; int opt_chk = 0; int opt_ner = 0; int opt_srl = 0; int opt_psg = 0; FILE *opt_usrvbs = NULL; FILE *senna_input = stdin; FILE *senna_output = stdout; int pipe_mode = 0; char *output_pipe = NULL; for(i = 1; i < argc; i++) { if(!strcmp(argv[i], "-verbose")) opt_verbose = 1; else if(!strcmp(argv[i], "-notokentags")) opt_notokentags = 1; else if(!strcmp(argv[i], "-offsettags")) opt_offsettags = 1; else if(!strcmp(argv[i], "-iobtags")) opt_iobtags = 1; else if(!strcmp(argv[i], "-brackettags")) opt_brackettags = 1; else if(!strcmp(argv[i], "-path")) { if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); opt_path = argv[i+1]; i++; } else if(!strcmp(argv[i], "-posvbs")) opt_posvbs = 1; else if(!strcmp(argv[i], "-usrtokens")) opt_usrtokens = 1; else if(!strcmp(argv[i], "-usrvbs")) { if(i+1 >= argc) SENNA_error("please provide a filename for the -usrvbs option"); opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb"); i++; } else if(!strcmp(argv[i], "-pos")) opt_pos = 1; else if(!strcmp(argv[i], "-chk")) opt_chk = 1; else if(!strcmp(argv[i], "-ner")) opt_ner = 1; else if(!strcmp(argv[i], "-srl")) opt_srl = 1; else if(!strcmp(argv[i], "-psg")) opt_psg = 1; else if(!strcmp(argv[i], "-maxsentsize")) { if(i+1 >= argc) SENNA_error("please provide a sentence size for the -maxsentsize option"); max_sent_size = atol(argv[i+1]); if(max_sent_size<0) SENNA_error("provide a positive value for the -maxsentsize option"); i++; } else if(!strcmp(argv[i], "-input_pipe")){ if(i+1 >= argc) SENNA_error("please provide the name of the input pipe"); senna_input = fopen(argv[i+1], "r"); if (senna_input == NULL) { SENNA_error("cannot open the input named pipe"); } pipe_mode = 1; i++; } else if(!strcmp(argv[i], "-output_pipe")){ if(i+1 >= argc) SENNA_error("please provide the name of the outputpipe"); output_pipe = argv[i+1]; i++; } else { printf("invalid argument: %s\n", argv[i]); help(argv[0]); } } SENNA_set_verbose_mode(opt_verbose); if(!opt_pos && !opt_chk && !opt_ner && !opt_srl && !opt_psg) /* the user does not know what he wants... */ opt_pos = opt_chk = opt_ner = opt_srl = opt_psg = 1; /* so give him everything (aren't we insane?) */ /* the real thing */ { char *sentence = NULL; char target_vb[MAX_TARGET_VB_SIZE]; int *chk_labels = NULL; int *pt0_labels = NULL; int *pos_labels = NULL; int *ner_labels = NULL; int *vbs_labels = NULL; int **srl_labels = NULL; int *psg_labels = NULL; int n_psg_level = 0; int is_psg_one_segment = 0; int vbs_hash_novb_idx = 22; int n_verbs = 0; sentence = malloc(max_sent_size + 1); SENNA_message("Maximum sentence size %ld", max_sent_size); /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); SENNA_Hash *psg_left_hash = SENNA_Hash_new(opt_path, "hash/psg-left.lst"); SENNA_Hash *psg_right_hash = SENNA_Hash_new(opt_path, "hash/psg-right.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); SENNA_PSG *psg = SENNA_PSG_new(opt_path, "data/psg.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); if(opt_iobtags) { SENNA_Hash_convert_IOBES_to_IOB(chk_hash); SENNA_Hash_convert_IOBES_to_IOB(ner_hash); SENNA_Hash_convert_IOBES_to_IOB(srl_hash); } else if(opt_brackettags) { SENNA_Hash_convert_IOBES_to_brackets(chk_hash); SENNA_Hash_convert_IOBES_to_brackets(ner_hash); SENNA_Hash_convert_IOBES_to_brackets(srl_hash); } SENNA_message("ready"); do { if (output_pipe) { senna_output = fopen(output_pipe, "w"); if (senna_output == NULL) { SENNA_error("cannot open the output named pipe"); } } else { senna_output = stdout; } while(fgets(sentence, max_sent_size + 1, senna_input)) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence); if(tokens->n == 0) continue; pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); if(opt_chk) chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_srl) pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_ner) ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n); if(opt_srl) { if(opt_usrvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) ); n_verbs += vbs_labels[i]; } if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); if(strlen(target_vb) > 0) SENNA_error("sentence size does not match in user verbs file"); } else if(opt_posvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V'); n_verbs += vbs_labels[i]; } } else { vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } } } if(opt_srl) srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); if(opt_psg) { SENNA_PSG_forward(psg, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n, &psg_labels, &n_psg_level); /* check if top level takes the full sentence */ { int *psg_top_labels = psg_labels + (n_psg_level-1)*tokens->n; if(tokens->n == 1) is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 3); /* S- ? */ else is_psg_one_segment = ((psg_top_labels[0]-1) % 4 == 0) && ((psg_top_labels[tokens->n-1]-1) % 4 == 2); /* B- or E- ? */ for(i = 1; is_psg_one_segment && (i < tokens->n-1); i++) { if((psg_top_labels[i]-1) % 4 != 1) /* I- ? */ is_psg_one_segment = 0; } } } for(i = 0; i < tokens->n; i++) { if(!opt_notokentags) fprintf(senna_output, "%15s", tokens->words[i]); if(opt_offsettags) fprintf(senna_output, "\t%d %d", tokens->start_offset[i], tokens->end_offset[i]); if(opt_pos) fprintf(senna_output, "\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i])); if(opt_chk) fprintf(senna_output, "\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i])); if(opt_ner) fprintf(senna_output, "\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i])); if(opt_srl) { fprintf(senna_output, "\t%15s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++) fprintf(senna_output, "\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); } if(opt_psg) /* last, can be long */ { fprintf(senna_output, "\t"); if(i == 0) { fprintf(senna_output, "(S1"); if(!is_psg_one_segment) fprintf(senna_output, "(S"); } for(j = n_psg_level-1; j >= 0; j--) fprintf(senna_output, "%s", SENNA_Hash_key(psg_left_hash, psg_labels[j*tokens->n+i])); fprintf(senna_output, "*"); for(j = 0; j < n_psg_level; j++) fprintf(senna_output, "%s", SENNA_Hash_key(psg_right_hash, psg_labels[j*tokens->n+i])); if(i == tokens->n-1) { if(!is_psg_one_segment) fprintf(senna_output, ")"); fprintf(senna_output, ")"); } } fprintf(senna_output, "\n"); } fprintf(senna_output, "\n"); /* end of sentence */ } if (output_pipe) { fclose(senna_output); } } while (pipe_mode); if(opt_posvbs) SENNA_free(vbs_labels); if(opt_usrvbs) { SENNA_free(vbs_labels); SENNA_fclose(opt_usrvbs); } SENNA_Tokenizer_free(tokenizer); SENNA_POS_free(pos); SENNA_CHK_free(chk); SENNA_PT0_free(pt0); SENNA_NER_free(ner); SENNA_VBS_free(vbs); SENNA_SRL_free(srl); SENNA_PSG_free(psg); SENNA_Hash_free(word_hash); SENNA_Hash_free(caps_hash); SENNA_Hash_free(suff_hash); SENNA_Hash_free(gazt_hash); SENNA_Hash_free(gazl_hash); SENNA_Hash_free(gazm_hash); SENNA_Hash_free(gazo_hash); SENNA_Hash_free(gazp_hash); SENNA_Hash_free(pos_hash); SENNA_Hash_free(chk_hash); SENNA_Hash_free(pt0_hash); SENNA_Hash_free(ner_hash); SENNA_Hash_free(vbs_hash); SENNA_Hash_free(srl_hash); SENNA_Hash_free(psg_left_hash); SENNA_Hash_free(psg_right_hash); free(sentence); } return 0; }
int main(int argc, char *argv[]) { int i, j; /* options */ char * sentence = NULL; char *opt_path = NULL; int opt_verbose = 0; int opt_notokentags = 0; int opt_iobtags = 0; int opt_brackettags = 0; int opt_posvbs = 0; int opt_usrtokens = 0; int opt_pos = 0; int opt_chk = 0; int opt_ner = 0; int opt_srl = 0; FILE *opt_usrvbs = NULL; for(i = 1; i < argc; i++) { if(!strcmp(argv[i], "-sentence")){ if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); sentence = argv[i+1]; i++; }else if(!strcmp(argv[i], "-verbose")) opt_verbose = 1; else if(!strcmp(argv[i], "-notokentags")) opt_notokentags = 1; else if(!strcmp(argv[i], "-iobtags")) opt_iobtags = 1; else if(!strcmp(argv[i], "-brackettags")) opt_brackettags = 1; else if(!strcmp(argv[i], "-path")) { if(i+1 >= argc) SENNA_error("please provide a path for the -path option"); opt_path = argv[i+1]; i++; } else if(!strcmp(argv[i], "-posvbs")) opt_posvbs = 1; else if(!strcmp(argv[i], "-usrtokens")) opt_usrtokens = 1; else if(!strcmp(argv[i], "-usrvbs")) { if(i+1 >= argc) SENNA_error("please provide a filename for the -usrvbs option"); opt_usrvbs = SENNA_fopen(NULL, argv[i+1], "rb"); i++; } else if(!strcmp(argv[i], "-pos")) opt_pos = 1; else if(!strcmp(argv[i], "-chk")) opt_chk = 1; else if(!strcmp(argv[i], "-ner")) opt_ner = 1; else if(!strcmp(argv[i], "-srl")) opt_srl = 1; else { printf("invalid argument: %s\n", argv[i]); help(argv[0]); } } SENNA_set_verbose_mode(opt_verbose); if(!opt_pos && !opt_chk && !opt_ner && !opt_srl) /* the user does not know what he wants... */ opt_pos = opt_chk = opt_ner = opt_srl = 1; /* so give him everything */ /* the real thing */ { //char sentence[MAX_SENTENCE_SIZE]; char target_vb[MAX_TARGET_VB_SIZE]; int *chk_labels = NULL; int *pt0_labels = NULL; int *pos_labels = NULL; int *ner_labels = NULL; int *vbs_labels = NULL; int **srl_labels = NULL; int vbs_hash_novb_idx = 22; int n_verbs = 0; /* inputs */ SENNA_Hash *word_hash = SENNA_Hash_new(opt_path, "hash/words.lst"); SENNA_Hash *caps_hash = SENNA_Hash_new(opt_path, "hash/caps.lst"); SENNA_Hash *suff_hash = SENNA_Hash_new(opt_path, "hash/suffix.lst"); SENNA_Hash *gazt_hash = SENNA_Hash_new(opt_path, "hash/gazetteer.lst"); SENNA_Hash *gazl_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.loc.lst", "data/ner.loc.dat"); SENNA_Hash *gazm_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.msc.lst", "data/ner.msc.dat"); SENNA_Hash *gazo_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.org.lst", "data/ner.org.dat"); SENNA_Hash *gazp_hash = SENNA_Hash_new_with_admissible_keys(opt_path, "hash/ner.per.lst", "data/ner.per.dat"); /* labels */ SENNA_Hash *pos_hash = SENNA_Hash_new(opt_path, "hash/pos.lst"); SENNA_Hash *chk_hash = SENNA_Hash_new(opt_path, "hash/chk.lst"); SENNA_Hash *pt0_hash = SENNA_Hash_new(opt_path, "hash/pt0.lst"); SENNA_Hash *ner_hash = SENNA_Hash_new(opt_path, "hash/ner.lst"); SENNA_Hash *vbs_hash = SENNA_Hash_new(opt_path, "hash/vbs.lst"); SENNA_Hash *srl_hash = SENNA_Hash_new(opt_path, "hash/srl.lst"); SENNA_POS *pos = SENNA_POS_new(opt_path, "data/pos.dat"); SENNA_CHK *chk = SENNA_CHK_new(opt_path, "data/chk.dat"); SENNA_PT0 *pt0 = SENNA_PT0_new(opt_path, "data/pt0.dat"); SENNA_NER *ner = SENNA_NER_new(opt_path, "data/ner.dat"); SENNA_VBS *vbs = SENNA_VBS_new(opt_path, "data/vbs.dat"); SENNA_SRL *srl = SENNA_SRL_new(opt_path, "data/srl.dat"); SENNA_Tokenizer *tokenizer = SENNA_Tokenizer_new(word_hash, caps_hash, suff_hash, gazt_hash, gazl_hash, gazm_hash, gazo_hash, gazp_hash, opt_usrtokens); if(opt_iobtags) { SENNA_Hash_convert_IOBES_to_IOB(chk_hash); SENNA_Hash_convert_IOBES_to_IOB(ner_hash); SENNA_Hash_convert_IOBES_to_IOB(srl_hash); } else if(opt_brackettags) { SENNA_Hash_convert_IOBES_to_brackets(chk_hash); SENNA_Hash_convert_IOBES_to_brackets(ner_hash); SENNA_Hash_convert_IOBES_to_brackets(srl_hash); } SENNA_message("ready"); // while(fgets(sentence, MAX_SENTENCE_SIZE, stdin)) if(sentence != NULL) { SENNA_Tokens* tokens = SENNA_Tokenizer_tokenize(tokenizer, sentence); if(tokens->n == 0) continue; pos_labels = SENNA_POS_forward(pos, tokens->word_idx, tokens->caps_idx, tokens->suff_idx, tokens->n); if(opt_chk) chk_labels = SENNA_CHK_forward(chk, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_srl) pt0_labels = SENNA_PT0_forward(pt0, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); if(opt_ner) ner_labels = SENNA_NER_forward(ner, tokens->word_idx, tokens->caps_idx, tokens->gazl_idx, tokens->gazm_idx, tokens->gazo_idx, tokens->gazp_idx, tokens->n); if(opt_srl) { if(opt_usrvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); vbs_labels[i] = !( (target_vb[0] == '-') && ( (target_vb[1] == '\0') || isspace(target_vb[1])) ); n_verbs += vbs_labels[i]; } if(!SENNA_fgetline(target_vb, MAX_TARGET_VB_SIZE, opt_usrvbs)) SENNA_error("invalid user verbs file\n"); if(strlen(target_vb) > 0) SENNA_error("sentence size does not match in user verbs file"); } else if(opt_posvbs) { vbs_labels = SENNA_realloc(vbs_labels, sizeof(int), tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (SENNA_Hash_key(pos_hash, pos_labels[i])[0] == 'V'); n_verbs += vbs_labels[i]; } } else { vbs_labels = SENNA_VBS_forward(vbs, tokens->word_idx, tokens->caps_idx, pos_labels, tokens->n); n_verbs = 0; for(i = 0; i < tokens->n; i++) { vbs_labels[i] = (vbs_labels[i] != vbs_hash_novb_idx); n_verbs += vbs_labels[i]; } } } if(opt_srl) srl_labels = SENNA_SRL_forward(srl, tokens->word_idx, tokens->caps_idx, pt0_labels, vbs_labels, tokens->n); for(i = 0; i < tokens->n; i++) { if(!opt_notokentags) printf("%15s", tokens->words[i]); if(opt_pos) printf("\t%10s", SENNA_Hash_key(pos_hash, pos_labels[i])); if(opt_chk) printf("\t%10s", SENNA_Hash_key(chk_hash, chk_labels[i])); if(opt_ner) printf("\t%10s", SENNA_Hash_key(ner_hash, ner_labels[i])); if(opt_srl) { printf("\t%15s", (vbs_labels[i] ? tokens->words[i] : "-")); for(j = 0; j < n_verbs; j++) printf("\t%10s", SENNA_Hash_key(srl_hash, srl_labels[j][i])); } printf("\n"); } printf("\n"); /* end of sentence */ } if(opt_posvbs) SENNA_free(vbs_labels); if(opt_usrvbs) { SENNA_free(vbs_labels); SENNA_fclose(opt_usrvbs); } SENNA_Tokenizer_free(tokenizer); SENNA_POS_free(pos); SENNA_CHK_free(chk); SENNA_PT0_free(pt0); SENNA_NER_free(ner); SENNA_VBS_free(vbs); SENNA_SRL_free(srl); SENNA_Hash_free(word_hash); SENNA_Hash_free(caps_hash); SENNA_Hash_free(suff_hash); SENNA_Hash_free(gazt_hash); SENNA_Hash_free(gazl_hash); SENNA_Hash_free(gazm_hash); SENNA_Hash_free(gazo_hash); SENNA_Hash_free(gazp_hash); SENNA_Hash_free(pos_hash); SENNA_Hash_free(chk_hash); SENNA_Hash_free(pt0_hash); SENNA_Hash_free(ner_hash); SENNA_Hash_free(vbs_hash); SENNA_Hash_free(srl_hash); } return 0; }
SENNA_SRL *SENNA_SRL_new(const char *path, const char *subpath) { SENNA_SRL *srl = SENNA_malloc(sizeof(SENNA_SRL), 1); FILE *f; float dummy; int dummy_size; f = SENNA_fopen(path, subpath, "rb"); SENNA_fread(&srl->window_size, sizeof(int), 1, f); SENNA_fread_tensor_2d(&srl->ll_word_weight, &srl->ll_word_size, &srl->ll_word_max_idx, f); SENNA_fread_tensor_2d(&srl->ll_caps_weight, &srl->ll_caps_size, &srl->ll_caps_max_idx, f); SENNA_fread_tensor_2d(&srl->ll_chkl_weight, &srl->ll_chkl_size, &srl->ll_chkl_max_idx, f); SENNA_fread_tensor_2d(&srl->ll_posv_weight, &srl->ll_posv_size, &srl->ll_posv_max_idx, f); SENNA_fread_tensor_2d(&srl->ll_posw_weight, &srl->ll_posw_size, &srl->ll_posw_max_idx, f); SENNA_fread_tensor_2d(&srl->l1_weight_wcc, &dummy_size, &srl->hidden_state1_size, f); SENNA_fread_tensor_2d(&srl->l1_weight_pv, &dummy_size, &srl->hidden_state1_size, f); SENNA_fread_tensor_2d(&srl->l1_weight_pw, &dummy_size, &srl->hidden_state1_size, f); SENNA_fread_tensor_1d(&srl->l1_bias, &srl->hidden_state1_size, f); SENNA_fread_tensor_2d(&srl->l3_weight, &srl->hidden_state1_size, &srl->hidden_state3_size, f); SENNA_fread_tensor_1d(&srl->l3_bias, &srl->hidden_state3_size, f); SENNA_fread_tensor_2d(&srl->l4_weight, &srl->hidden_state3_size, &srl->output_state_size, f); SENNA_fread_tensor_1d(&srl->l4_bias, &srl->output_state_size, f); SENNA_fread_tensor_1d(&srl->viterbi_score_init, &srl->output_state_size, f); SENNA_fread_tensor_2d(&srl->viterbi_score_trans, &srl->output_state_size, &srl->output_state_size, f); SENNA_fread(&srl->ll_word_padding_idx, sizeof(int), 1, f); SENNA_fread(&srl->ll_caps_padding_idx, sizeof(int), 1, f); SENNA_fread(&srl->ll_chkl_padding_idx, sizeof(int), 1, f); SENNA_fread(&dummy, sizeof(float), 1, f); SENNA_fclose(f); if ((int)dummy != 777) SENNA_error("srl: data corrupted (or not IEEE floating computer)"); /* states */ srl->sentence_posv = NULL; srl->sentence_posw = NULL; srl->input_state = NULL; srl->input_state_wcc = NULL; srl->input_state_pv = NULL; srl->input_state_pw = NULL; srl->hidden_state1 = NULL; srl->hidden_state1_wcc = NULL; srl->hidden_state1_pv = NULL; srl->hidden_state1_pw = NULL; srl->hidden_state2 = NULL; srl->hidden_state3 = NULL; srl->output_state = NULL; srl->labels = NULL; srl->labels_size = 0; srl->service = false; srl->debug = false; srl->calls = 0; srl->dnntime = 0; srl->apptime = 0; /* some info if you want verbose */ SENNA_message("srl: window size: %d", srl->window_size); SENNA_message("srl: vector size in word lookup table: %d", srl->ll_word_size); SENNA_message("srl: word lookup table size: %d", srl->ll_word_max_idx); SENNA_message("srl: vector size in caps lookup table: %d", srl->ll_caps_size); SENNA_message("srl: caps lookup table size: %d", srl->ll_caps_max_idx); SENNA_message("srl: vector size in verb position lookup table: %d", srl->ll_posv_size); SENNA_message("srl: verb position lookup table size: %d", srl->ll_posv_max_idx); SENNA_message("srl: vector size in word position lookup table: %d", srl->ll_posw_size); SENNA_message("srl: word position lookup table size: %d", srl->ll_posw_max_idx); SENNA_message("srl: number of hidden units (convolution): %d", srl->hidden_state1_size); SENNA_message("srl: number of hidden units (hidden layer): %d", srl->hidden_state3_size); SENNA_message("srl: number of classes: %d", srl->output_state_size); return srl; }