int pad_answer( const string &mode, const V2_STR &one_sent, V2_STR &sent_feats ) { vector<string> answer_tag(1, "DUMMY"); if( mode == "tag" ) { // no answer tag for( V2_STR_citr i_row = one_sent.begin(); i_row != one_sent.end(); ++i_row ) { sent_feats.push_back( answer_tag ); } }else if( mode == "learn" ) { for( V2_STR_citr i_row = one_sent.begin(); i_row != one_sent.end(); ++i_row ) { answer_tag[ 0 ] = (*i_row)[ 0 ]; sent_feats.push_back( answer_tag ); } }else { cerr << "invalid mode option! " << endl; exit(1); } return 0; }
void print_data( const V2_STR &one_sent ) { for( V2_STR::const_iterator i_row = one_sent.begin(); i_row != one_sent.end(); ++i_row ) { for( V1_STR::const_iterator i_col = i_row->begin(); i_col != i_row->end(); ++i_col ) { cout << *(i_col); if( (i_col + 1) != i_row->end() ) cout << "\t"; } cout << endl; } cout << endl; }
/**
 * Annotate every token row of data with its character span in raw_sent.
 *
 * The last column of a row is taken as the token text. Two new columns are
 * inserted directly in front of it: the begin position and the end position
 * (begin + token length), each shifted by init_offset and rendered as a
 * decimal string. The search for each token resumes at the end position of
 * the previous one. Rows without any column are left untouched.
 *
 * NOTE(review): the lookup uses find_first_of(), i.e. it finds the first
 * character at or after the previous end that matches ANY character of the
 * token, not the token as a substring - confirm this is intended.
 * NOTE(review): a failed lookup (string::npos) is not handled specially; the
 * positions are then computed with unsigned wrap-around.
 *
 * @param raw_sent     text in which the tokens are located
 * @param data         token rows, modified in place
 * @param init_offset  value added to both reported positions
 */
void Tokenizer::mark_pos( const string &raw_sent, V2_STR &data, const size_t init_offset )
{
	size_t	beg = 0, end = 0;
	char	chr_pos[128];

	for( V2_STR::iterator i_row = data.begin(); i_row != data.end(); ++i_row ) {
		// back() and end() - 1 are undefined on an empty row, and there is
		// no token text to locate anyway.
		if( i_row->empty() )
			continue;

		beg = raw_sent.find_first_of( i_row->back(), end );
		end = beg + i_row->back().length();

		// The positions are size_t; printing them with "%d" mismatches the
		// argument type, so convert explicitly and use a matching specifier.
		snprintf( chr_pos, sizeof(chr_pos), "%lu", static_cast<unsigned long>( init_offset + beg ) );
		i_row->insert( i_row->end() - 1, chr_pos );	// becomes the column before the token

		snprintf( chr_pos, sizeof(chr_pos), "%lu", static_cast<unsigned long>( init_offset + end ) );
		i_row->insert( i_row->end() - 1, chr_pos );	// lands between begin and the token
	}
}
int main(int argc, char* argv[]) { if( argc == 2 ) { string arg2 = argv[1]; if( arg2 == "--help" ) { cerr << "Usage: " << argv[0] << " < a sentence-per-line file" << endl; return 0; } } TOKENIZER tokenizer; string line = ""; V2_STR data; int n_lines = 1; while( getline(cin, line ) ) { data.clear(); if( line.empty() ) // Ignore blank lines continue; tokenizer.tokenize( line, data, 0 ); for( V2_STR::iterator i_row = data.begin(); i_row != data.end(); ++i_row) { for( V1_STR::iterator i_col = i_row->begin(); i_col != i_row->end(); ++i_col) { cout << *i_col; if( (i_col + 1) != i_row->end() ) cout << "\t"; } cout << endl; } cout << endl; ++n_lines; } return n_lines; }
/** * CRF tagging functions */ int tag_crfsuite(V2_STR &one_sent, V2_STR &sent_feat, crf_model_t *model, map<string, int> &term_idx, const COLUMN_INFO &COL_INFO, const nersuite_optparse &ner_opt) { int N = 0, L = 0, ret = 0, lid = -1; clock_t clk0, clk1; crf_sequence_t inst; crf_item_t item; crf_content_t cont; crf_output_t output; crf_evaluation_t eval; // _iwa_token_t *token = (_iwa_token_t*)malloc(sizeof(_iwa_token_t)); crf_tagger_t *tagger = NULL; crf_dictionary_t *attrs = NULL, *labels = NULL; /* Obtain the dictionary interface representing the labels in the model. */ if ((ret = model->get_labels(model, &labels))) { goto tag_crf_force_exit; } /* Obtain the dictionary interface representing the attributes in the model. */ if ((ret = model->get_attrs(model, &attrs))) { goto tag_crf_force_exit; } /* Obtain the tagger interface. */ if ((ret = model->get_tagger(model, &tagger))) { goto tag_crf_force_exit; } /* Initialize the objects for instance and evaluation. */ L = labels->num(labels); crf_sequence_init(&inst); crf_evaluation_init(&eval, L); /* Read the input data and assign labels. */ clk0 = clock(); // Initialize the item variable crf_item_init(&item); for(V2_STR::iterator i = sent_feat.begin(); i != sent_feat.end(); ++i) { // Label part (first column) lid = labels->to_id(labels, (*(i->begin())).c_str() ); if(lid < 0) lid = L; // Attribute part (second ~ last-1 column) for(vector<string>::iterator j = (i->begin() + 1); j != i->end(); ++j) { size_t pos = j->find_first_of(":"); string _attr, _value; if(pos == string::npos) { _attr = *j; }else { _attr = j->substr(0, pos); _value = j->substr(pos + 1, j->length() - pos - 1); } /* Fields after the first field present attributes. */ int aid = attrs->to_id(attrs, _attr.c_str()); /* Ignore attributes 'unknown' to the model. */ if (0 <= aid) { /* Associate the attribute with the current item. 
*/ if (pos != string::npos) { crf_content_set(&cont, aid, atof(_value.c_str())); } else{ crf_content_set(&cont, aid, 1.0); } crf_item_append_content(&item, &cont); } } // End the item variable (last column) crf_sequence_append(&inst, &item, lid); crf_item_finish(&item); } if (!crf_sequence_empty(&inst)) { /* Initialize the object to receive the tagging result. */ crf_output_init(&output); /* Tag the instance. */ if ((ret = tagger->tag(tagger, &inst, &output))) { goto tag_crf_force_exit; } ++N; if (ner_opt.is_standoff == true) { output_result_standoff(stdout, &output, labels, one_sent, term_idx, COL_INFO); }else { output_result_conll(stdout, &output, labels, one_sent, COL_INFO); } crf_output_finish(&output); crf_sequence_finish(&inst); } clk1 = clock(); tag_crf_force_exit: crf_sequence_finish(&inst); crf_evaluation_finish(&eval); SAFE_RELEASE(tagger); SAFE_RELEASE(attrs); SAFE_RELEASE(labels); return ret; }