Пример #1
0
std::vector<SVMData> read_data(std::string file_name){
	std::vector<SVMData> output;

	std::ifstream f_in(file_name, std::ios_base::in);
	if(!f_in.good())
		return output;

	char line[256];
	while(!f_in.eof() && f_in.getline(line, 256)){
		SVMData svmData;
		if (analyze_features(line, svmData))
			output.push_back(svmData);
	}

	return output;
}
Пример #2
0
void Tagger::Init(int argc, char * argv[])
{

	setlocale(LC_CTYPE, "iso_8858_1");
#ifdef SpecialMalloc
	/* Force fast allocation */
	set_small_allocation(100);
#endif

	/* Clear data structures */
	InitDict(dict);
	InitDict(skip_dict);
	InitTrans(trans);
	InitTrans(c_newtrans);
	odictfile = otranfile = NULL;

	/* Verify command line */
	if (argc <= 2)
	  error_exit("Usage: label corpus options\n");

	/* Form options */
	InitOptions;

	set_up_options(argc, argv, &iterations, &initialise, &dict_size,
				dictname, tranname, odictname, otranname, outname, mapname,
				skipname, reducename, fsmname, grammarname, infername, ukwname,
				ofeaturesname, obadwordname, bdbmname, runkstatname, wunkstatname);

	any_output = !no_output || Option(report_stats) || OutOpt(prob_dist);


	/* Open BDBM dictionary */
	if (Option(bdbm)){
		/* Berkeley DB: first of all need to create the dbp data structure*/
		if((ret = db_create(&dbp, NULL, 0)) != 0) {
			fprintf(stderr, "db_create: %s\n", db_strerror(ret));
			exit (1);
		}
		/* Berkeley DB: Then you open it, readonly  */
		if((ret = dbp->open(dbp,bdbmname, NULL, DB_BTREE, DB_RDONLY, 0777)) != 0) {
			dbp->err(dbp, ret, "%s", bdbmname);
			exit(1);
		} 
	}

	/* Read mappings */
	if (Option(verbose)) printf("Read mappings\n");
	read_mapping(mapname);

	/* Read tag reduction mappings */
	if (Option(reduced_tags))
	{
		if (Option(verbose)) printf("Read reduced tag set\n");
		read_reduce_mapping(reducename);
	}

#ifdef Use_Parser
	/* Read parse rules */
	if (Option(use_parser))
	{
		if (Option(verbose)) printf("Read parse rules\n");
		parser_read_named(grammarname);
	}
#endif
#ifdef Use_FSM
	/* Read FSM definitions */
	if (Option(use_fsm))
	{
		if (Option(verbose)) printf("Read FSMs\n");
		fsm_read_named(fsmname);
	}
#endif

	/* Read skip list */
	if (Option(skip_list))
	{
		if (Option(verbose)) printf("Read skip list\n");
		read_named_dict(skipname, &skip_dict, -1);
	}

	/* Read unknown word rules */
	if (Option(unknown_rules))
	{
		if (Option(verbose)) printf("Read unknown word rules\n");
		read_unknown_rules(ukwname);
	}

	/* Set up dictionary [note]:it costs a few seconds*/
	if (dictname[0] == 0)
	{
		create_dict(&dict, dict_size);
		clear_dict(&dict);
	}
	else
	{
		if (Option(verbose)) printf("Read dictionary\n");
		read_named_dict(dictname, &dict, -1);
		if (infername[0] != 0)
		{
			if (Option(verbose)) printf("Read inference rules\n");
			infer_tags((char *)infername, &dict);

		}
	}

	/* Set up transitions [note] it costs a few seconds*/
	if (tranname[0] == 0)
	{
		create_trans(&trans, tags_all);
		clear_trans_all(&trans);
	}
	else
	{
		if (Option(verbose)) printf("Read transitions\n");
		read_named_ascii_trans(tranname, &trans);

		/* Analyze selected features of lexicon to generate tag probabilities for unknown words. */
		if ( Option(unknown_morph) || Option(unknown_rules))
		{
			/* Initialize feature values */

			Allocate(features->featuretags, sizeof(FeatureTagSt), "features->featuretags: main");
			features->featuretags->next_open_slot = 0;

			features->gamma = trans.gamma;

			if ( features->maxsuffix == 0 )
			  features->maxsuffix = MinSuffixLen;
			if ( features->maxunkwords == 0 )
			  features->maxunkwords = MAXUNKWORDS;
			if ( features->maxprefcut == 0 )
			  features->maxprefcut = MinPrefixLen;
			if ( features->maxsuffcut == 0 )
			  features->maxsuffcut = MinSuffixLen;

			unknown_word_handling_initialization();
			gather_unigram_freqs( &dict );
		}

		if ( Option(unknown_morph) )
		{
			analyze_features( &dict, ofeaturesname, obadwordname, &trans, dbp, &dict, runkstatname );
		}
	}

	set_special_words(&dict, features );

	/* Create space for re-estimation or training */
	if (Option(reestimate) || Option(training))
	{
		c_newtrans.gamma = trans.gamma; /* Share arrays */
		create_trans(&c_newtrans, tags_all);
	}

	if (odictname[0] != 0)
	  odictfile = open_file(odictname, "w");
	if (otranname[0] != 0)
	  otranfile = open_file(otranname, "w");

	/* Set up anchor word */
	set_anchor(&dict);

	adjust_dict(&dict, trans.gamma, FALSE);
	adjust_trans(&trans, NULL);
}