Esempio n. 1
0
void print_data( const V2_STR &one_sent )
{
	for( V2_STR::const_iterator i_row = one_sent.begin(); i_row != one_sent.end(); ++i_row ) {
		for( V1_STR::const_iterator i_col = i_row->begin(); i_col != i_row->end(); ++i_col ) {
			cout << *(i_col);
			if( (i_col + 1) != i_row->end() )
				cout << "\t";
		}
		cout << endl;
	}
	cout << endl;
}
Esempio n. 2
0
	/**
	 * Add begin/end character offsets to every token row.
	 *
	 * The last column of each row in `data` is taken as the token text. The
	 * token is searched for in `raw_sent`, starting where the previous token
	 * ended, and two new columns are inserted directly in front of the token
	 * column: first the begin offset, then the end offset (begin + token
	 * length). A row `[..., token]` therefore becomes `[..., beg, end, token]`.
	 * Both offsets are shifted by `init_offset` and stored as decimal text.
	 *
	 * Rows without any column are skipped, since they carry no token to locate.
	 *
	 * @param raw_sent     sentence text the tokens are searched in
	 * @param data         token rows; modified in place
	 * @param init_offset  value added to every offset that is written
	 */
	void Tokenizer::mark_pos( const string &raw_sent, V2_STR &data, const size_t init_offset )
	{
		size_t	beg = 0, end = 0;
		char	chr_pos[128];

		for( V2_STR::iterator i_row = data.begin(); i_row != data.end(); ++i_row ) {
			// back() and end() - 1 are undefined on an empty row
			if( i_row->empty() )
				continue;

			// NOTE(review): find_first_of() stops at the first character that occurs
			// anywhere in the token, not at the token as a whole. That equals the token
			// start only while nothing skipped between two tokens also occurs in the next
			// token - confirm against the tokenizer, or switch to find().
			// If nothing matches, beg is string::npos and the offsets written below are
			// meaningless.
			beg = raw_sent.find_first_of( i_row->back(), end );
			end = beg + i_row->back().length();

			// size_t must not be passed to "%d"; convert explicitly to the type "%lu" expects
			sprintf(chr_pos, "%lu", static_cast<unsigned long>(init_offset + beg));
			i_row->insert( i_row->end() - 1, chr_pos );

			sprintf(chr_pos, "%lu", static_cast<unsigned long>(init_offset + end));
			i_row->insert( i_row->end() - 1, chr_pos );
		}
	}
Esempio n. 3
0
/**
 * Command-line entry point: tokenize standard input, one sentence per line.
 *
 * When the only argument is `--help`, a usage line is written to stderr and
 * the program ends. Otherwise every non-empty input line is passed to the
 * tokenizer (with an initial offset of 0) and the resulting rows are printed
 * to stdout: one row per line, columns separated by TAB, and an empty line
 * after each sentence. Blank input lines are ignored.
 *
 * @return 0 on normal termination. The exit status no longer carries the
 *         line counter, which made every successful run look like a failure
 *         to the calling shell.
 */
int main(int argc, char* argv[])
{
	if( argc == 2 ) {
		string	arg2 = argv[1];

		if( arg2 == "--help" ) {
			cerr << "Usage: " << argv[0] << " < a sentence-per-line file" << endl;
			return 0;
		}
	}

	TOKENIZER	tokenizer;

	string	line = "";
	V2_STR	data;

	while( getline(cin, line ) ) {
		if( line.empty() )		// Ignore blank lines
			continue;

		// Start every sentence from an empty table
		data.clear();
		tokenizer.tokenize( line, data, 0 );

		for( V2_STR::iterator i_row = data.begin(); i_row != data.end(); ++i_row) {
			for( V1_STR::iterator i_col = i_row->begin(); i_col != i_row->end(); ++i_col) {
				cout << *i_col;
				if( (i_col + 1) != i_row->end() )
					cout << "\t";
			}
			cout << endl;
		}
		// Empty line marks the end of the sentence
		cout << endl;
	}

	return 0;
}
Esempio n. 4
0
	int Tokenizer::splitter( const string &trimmed_sent, V2_STR &data )
	{
		size_t	beg = 0, end = find_token_end( trimmed_sent, 0 ), n_tokens = 0;

		V1_STR	one_row;
		one_row.push_back( "token" );		

		while( end != string::npos ) {
			string	token = trimmed_sent.substr( beg, end - beg );

			if( token != " " ) {				// Put all tokens except a space token into the 'data' container
				one_row.back() = token;
				data.push_back( one_row );
				++n_tokens;
			}

			beg = end;
			end = find_token_end( trimmed_sent, beg );
		}

		return n_tokens;
	}
Esempio n. 5
0
int pad_answer( const string &mode, const V2_STR &one_sent, V2_STR &sent_feats )
{
        vector<string>                  answer_tag(1, "DUMMY");

        if( mode == "tag" ) {            // no answer tag
                for( V2_STR_citr i_row = one_sent.begin(); i_row != one_sent.end(); ++i_row ) {
                        sent_feats.push_back( answer_tag );
                }
        }else if( mode == "learn" ) {
                for( V2_STR_citr i_row = one_sent.begin(); i_row != one_sent.end(); ++i_row ) {
                        answer_tag[ 0 ] = (*i_row)[ 0 ];
                        sent_feats.push_back( answer_tag );
                }
        }else {
                cerr << "invalid mode option! " << endl;
                exit(1);
        }

        return 0;
}
Esempio n. 6
0
/**
 * CRF tagging functions
 *
 * Tag one sentence with a CRF model and print the result to stdout.
 *
 * Every row of `sent_feat` describes one token. Column 0 is the label string;
 * each following column is an attribute written either as `name` (weight 1.0)
 * or as `name:value`, split at the first ':' with the value parsed by atof().
 * Attributes the model does not know (negative id) are dropped. A label the
 * model does not know is mapped to the id L, the number of labels in the model.
 *
 * If the resulting sequence is not empty it is tagged and written with
 * output_result_standoff() or output_result_conll(), depending on
 * `ner_opt.is_standoff`.
 *
 * @param one_sent  token table handed to the output routine
 * @param sent_feat feature table converted into the CRF instance
 * @param model     model providing the label/attribute dictionaries and the tagger
 * @param term_idx  handed to output_result_standoff()
 * @param COL_INFO  column layout handed to the output routine
 * @param ner_opt   options; only `is_standoff` is read here
 * @return 0 on success, otherwise the non-zero status returned by the failing
 *         model or tagger call
 */
int tag_crfsuite(V2_STR &one_sent, V2_STR &sent_feat, crf_model_t *model, map<string, int> &term_idx, const COLUMN_INFO &COL_INFO, const nersuite_optparse &ner_opt)
{
    int                 L = 0, ret = 0, lid = -1;
    bool                eval_ready = false;
    crf_sequence_t      inst;
    crf_item_t          item;
    crf_content_t       cont;
    crf_output_t        output;
    crf_evaluation_t    eval;
    crf_tagger_t        *tagger = NULL;
    crf_dictionary_t    *attrs = NULL, *labels = NULL;

    /*
     * The cleanup code at the end runs on every path, including the early
     * error exits below, so the instance must be initialised before the
     * first goto.
     */
    crf_sequence_init(&inst);

    /* Obtain the dictionary interface representing the labels in the model. */
    if ((ret = model->get_labels(model, &labels))) {
        goto tag_crf_force_exit;
    }

    /* Obtain the dictionary interface representing the attributes in the model. */
    if ((ret = model->get_attrs(model, &attrs))) {
        goto tag_crf_force_exit;
    }

    /* Obtain the tagger interface. */
    if ((ret = model->get_tagger(model, &tagger))) {
        goto tag_crf_force_exit;
    }

    /*
     * The evaluation object needs the label count, so it can only be set up
     * here. Remember that it happened so the cleanup path never finishes an
     * uninitialised object.
     */
    L = labels->num(labels);
    crf_evaluation_init(&eval, L);
    eval_ready = true;

    /* Initialize the item variable. */
    crf_item_init(&item);

    /* Convert every feature row into one item of the instance. */
    for (V2_STR::iterator i = sent_feat.begin(); i != sent_feat.end(); ++i) {
        /* Label part (first column); unknown labels get the id L. */
        lid = labels->to_id(labels, (*(i->begin())).c_str());
        if (lid < 0)
            lid = L;

        /* Attribute part (all remaining columns). */
        for (vector<string>::iterator j = (i->begin() + 1); j != i->end(); ++j) {
            size_t pos = j->find_first_of(":");
            string _attr, _value;

            if (pos == string::npos) {
                _attr = *j;
            } else {
                _attr = j->substr(0, pos);
                _value = j->substr(pos + 1, j->length() - pos - 1);
            }

            /* Fields after the first field present attributes. */
            int aid = attrs->to_id(attrs, _attr.c_str());

            /* Ignore attributes 'unknown' to the model. */
            if (0 <= aid) {
                /* Associate the attribute with the current item. */
                if (pos != string::npos) {
                    crf_content_set(&cont, aid, atof(_value.c_str()));
                } else {
                    crf_content_set(&cont, aid, 1.0);
                }
                crf_item_append_content(&item, &cont);
            }
        }

        /* Hand the finished item to the instance, then reset it for the next row. */
        crf_sequence_append(&inst, &item, lid);
        crf_item_finish(&item);
    }

    if (!crf_sequence_empty(&inst)) {
        /* Initialize the object to receive the tagging result. */
        crf_output_init(&output);

        /* Tag the instance; a non-zero status is passed on to the caller. */
        ret = tagger->tag(tagger, &inst, &output);
        if (ret == 0) {
            if (ner_opt.is_standoff == true) {
                output_result_standoff(stdout, &output, labels, one_sent, term_idx, COL_INFO);
            } else {
                output_result_conll(stdout, &output, labels, one_sent, COL_INFO);
            }
        }

        /* Finish the output on success and on failure alike. */
        crf_output_finish(&output);
    }

tag_crf_force_exit:
    crf_sequence_finish(&inst);
    if (eval_ready) {
        crf_evaluation_finish(&eval);
    }

    SAFE_RELEASE(tagger);
    SAFE_RELEASE(attrs);
    SAFE_RELEASE(labels);

    return ret;
}