示例#1
0
bool 
make_stable_training_sets_by_personal(const list <Record> & all_records,
                                      const unsigned int limit,
                                      const vector <string> & training_filenames) {

    //if ( training_filenames.size() != 2 )
        //throw cException_Other("Training: there should be 2 changeable training sets.");

    cGroup_Value rare_firstname_set;
    cGroup_Value rare_lastname_set;

    std::ofstream outfile;
    cPrint_Pair do_print(outfile, cUnique_Record_ID::static_get_class_name());
    const char * current_file;
    vector<cGroup_Value *> rare_pointer_vec;
    rare_pointer_vec.push_back(&rare_firstname_set);
    rare_pointer_vec.push_back(&rare_lastname_set);
    const vector< const cGroup_Value * > const_rare_pointer_vec(rare_pointer_vec.begin(), rare_pointer_vec.end());

    list < const Record*> record_pointers;
    for ( list<Record>::const_iterator p = all_records.begin(); p != all_records.end(); ++p )
        record_pointers.push_back(&(*p));

    find_rare_names_v2(rare_pointer_vec, record_pointers);
    list<pointer_pairs> pair_list;
    vector <string> rare_column_names;
    rare_column_names.push_back(string(cFirstname::static_get_class_name()));
    rare_column_names.push_back(string(cLastname::static_get_class_name()));

    //xset03
    pair_list.clear();
    create_xset03(pair_list, record_pointers, const_rare_pointer_vec, limit);
    current_file = training_filenames.at(0).c_str();
    outfile.open(current_file);
    if ( ! outfile.good() )
        throw cException_File_Not_Found(current_file);
    std::cout << "Creating " << current_file << " ..." << std::endl;
    std::for_each(pair_list.begin(), pair_list.end(), do_print);
    outfile.close();
    std::cout << "Done" << std::endl;


    //tset02
    pair_list.clear();
    create_tset02(pair_list, record_pointers, rare_column_names, const_rare_pointer_vec, limit);

    current_file = training_filenames.at(1).c_str();
    outfile.open(current_file);
    if ( ! outfile.good() )
        throw cException_File_Not_Found(current_file);
    std::cout << "Creating " << current_file << " ..." << std::endl;
    std::for_each(pair_list.begin(), pair_list.end(), do_print);
    outfile.close();
    std::cout << "Done" << std::endl;

    return true;
}
示例#2
0
// Reads training pairs from a comma-delimited text file and appends one
// TrainingPair per usable line to `trainpairs`.
//
// Each line is expected to look like "<first>,<second>[,...]". The first two
// fields form the pair; anything after a second delimiter is ignored. Lines
// that contain no delimiter at all (blank lines, for instance) cannot form a
// pair and are skipped.
//
//   trainpairs  container the parsed pairs are appended to
//   txt_file    path of the file to read
//
// Throws cException_File_Not_Found when the file cannot be opened.
void
cRatioComponent::read_train_pairs(TrainingPairs & trainpairs,
                                  const char * txt_file) const {

    std::cout << "Reading training pairs from " << txt_file
              << ", " << __FILE__ << ":" << __LINE__ << std::endl;

    static const char * delim = ",";
    static const size_t delim_size = strlen(delim);

    // sync_with_stdio is a static member of ios_base, so this call is not
    // specific to the stream opened below.
    std::ifstream::sync_with_stdio(false);
    std::ifstream infile(txt_file);

    if ( ! infile.good() )
        throw cException_File_Not_Found(txt_file);

    string filedata;
    while (getline(infile, filedata)) {

        const size_t first_end = filedata.find(delim);
        if ( first_end == string::npos )
            continue;   // no delimiter: adding delim_size to npos would wrap around

        const size_t second_begin = first_end + delim_size;
        const size_t second_end = filedata.find(delim, second_begin);

        // substr takes a length, not an end position; npos means "to end of line".
        const size_t second_len = ( second_end == string::npos )
                                  ? string::npos
                                  : second_end - second_begin;

        string firststring = filedata.substr(0, first_end);
        string secondstring = filedata.substr(second_begin, second_len);
        trainpairs.push_back(TrainingPair(firststring, secondstring));
    }

    std::cout << txt_file << " has been loaded as the "
              << attrib_group << " part of the training sets."<< std::endl;
}
示例#3
0
void
ClusterSet::read_from_file(const char * filename,
                           const map <string, const Record*> & uid_tree) {

    unsigned int count = 0;
    const unsigned int base = 100000;
    const unsigned int primary_delim_size = strlen(ClusterInfo::primary_delim);
    const unsigned int secondary_delim_size = strlen(ClusterInfo::secondary_delim);
    std::ifstream infile ( filename);

    if (infile.good()) {

        string filedata;
        while ( getline(infile, filedata)) {

            register size_t pos = 0, prev_pos = 0;
            pos = filedata.find(ClusterInfo::primary_delim, prev_pos);
            string keystring = filedata.substr( prev_pos, pos - prev_pos);
            const Record * key = retrieve_record_pointer_by_unique_id( keystring, uid_tree );
            prev_pos = pos + primary_delim_size;

            pos = filedata.find(ClusterInfo::primary_delim, prev_pos);
            double val = 0;
            if ( true ) {
                string cohesionstring = filedata.substr( prev_pos, pos - prev_pos);
                val = atof(cohesionstring.c_str());
            }
            prev_pos = pos + primary_delim_size;


            RecordPList tempv;
            while ( ( pos = filedata.find(ClusterInfo::secondary_delim, prev_pos) )!= string::npos){
                string valuestring = filedata.substr( prev_pos, pos - prev_pos);
                const Record * value = retrieve_record_pointer_by_unique_id( valuestring, uid_tree);
                tempv.push_back(value);
                prev_pos = pos + secondary_delim_size;
            }

            ClusterHead th(key, val);
            Cluster tempc(th, tempv);
            tempc.self_repair();
            this->consolidated.push_back(tempc);

            ++count;
            if ( count % base == 0 ) {
                std::cout << count << " records have been loaded from the cluster file. " << std::endl;
            }
        }
        std::cout << "Totally, " << count << " records have been loaded from " << filename << std::endl;
    }
    else {
        throw cException_File_Not_Found(filename);
    }
}
示例#4
0
// Loads a final-ratios file into this object.
//
// The first line lists attribute names, each terminated by secondary_delim;
// they are appended to attrib_names. Every following line carries a similarity
// profile (integers, each terminated by secondary_delim) and, after
// primary_delim, the ratio value; each such line is inserted into
// final_ratios. Lines without primary_delim carry no value and are skipped.
// After loading, the comparators named in attrib_names are activated.
//
//   filename  path of the ratios file
//
// Throws cException_File_Not_Found when the file cannot be opened.
void
cRatios::read_ratios_file(const char * filename) {

    // sync_with_stdio is a static member of ios_base, so this call is not
    // specific to the stream opened below.
    std::ifstream::sync_with_stdio(false);
    std::ifstream infile (filename);
    const size_t primary_delim_size = strlen(primary_delim);
    const size_t secondary_delim_size = strlen(secondary_delim);

    if (!infile.good()) throw cException_File_Not_Found(filename);

    string filedata;
    size_t pos, prev_pos;

    // Header line: attribute names.
    getline(infile, filedata);
    pos = prev_pos = 0;
    while ((pos = filedata.find(secondary_delim, prev_pos)) != string::npos) {
        attrib_names.push_back(filedata.substr(prev_pos, pos - prev_pos));
        prev_pos = pos + secondary_delim_size;
    }

    SimilarityProfile key;
    while (getline(infile, filedata)) {

        // Locate the value first: without primary_delim there is nothing to
        // store, and adding the delimiter length to npos would wrap around.
        const size_t value_marker = filedata.find(primary_delim, 0);
        if (value_marker == string::npos)
            continue;

        // TODO: replace this with a templated callback
        key.clear();
        pos = prev_pos = 0;
        while ((pos = filedata.find(secondary_delim, prev_pos)) != string::npos) {
            key.push_back(atoi(filedata.substr(prev_pos, pos - prev_pos).c_str()));
            prev_pos = pos + secondary_delim_size;
        }

        const double value = atof(filedata.substr(value_marker + primary_delim_size).c_str());

        final_ratios.insert(std::pair<SimilarityProfile, double>(key, value));
    }

    // TODO: This should probably not go here, invoke from calling function.
    Record::activate_comparators_by_name(attrib_names);

    std::cout << filename << " has been loaded as the final ratios file"<< std::endl;
    std::cout << "Resetting similarity profiles ... ..." << std::endl;
    std::cout << "-----Similarity Profiles reset.-------" << std::endl;
}
bool make_changable_training_sets_by_assignee(const list <const cRecord*> & record_pointers, const vector<string >& blocking_column_names,
						const vector < const cString_Manipulator *> & pstring_oper, const unsigned int limit, const vector <string> & training_filenames) {


	if ( training_filenames.size() != 2 )
		throw cException_Other("Training: there should be 2 changeable training sets.");

	const string uid_identifier = cUnique_Record_ID::static_get_class_name();
	cBlocking_For_Training bft (record_pointers, blocking_column_names, pstring_oper, uid_identifier, limit);


	cString_Remain_Same donotchange;

	cString_NoSpace_Truncate operator_truncate_firstname;
	cString_NoSpace_Truncate operator_truncate_lastname;


	vector <const cString_Manipulator*> t_extract_equal, t_extract_nonequal, x_extract_equal, x_extract_nonequal;

	std::ofstream outfile;
	//xset01
	/*
	x_extract_nonequal.push_back(& donotchange);
	x_extract_equal.push_back(&donotchange);

	const string xset01_equal_name_array[] = {cApplyYear::static_get_class_name() };
	const string xset01_nonequal_name_array[] = { cCity::static_get_class_name() };
	const vector <string> xset01_equal_name_vec (xset01_equal_name_array, xset01_equal_name_array + sizeof(xset01_equal_name_array)/sizeof(string));
	const vector <string> xset01_nonequal_name_vec (xset01_nonequal_name_array, xset01_nonequal_name_array + sizeof(xset01_nonequal_name_array)/sizeof(string));


	bft.create_set(&cBlocking_For_Training::create_xset01_on_block, xset01_equal_name_vec, x_extract_equal, xset01_nonequal_name_vec, x_extract_nonequal);
	const char * current_file = training_filenames.at(0).c_str();
	outfile.open(current_file);
	if ( ! outfile.good() )
		throw cException_File_Not_Found(current_file);
	std::cout << "Creating " << current_file << " ..." << std::endl;
	bft.print(outfile, uid_identifier);
	outfile.close();
	std::cout << "Done" << std::endl;
	*/

	outfile.open(training_filenames.at(0).c_str());
	list < std::pair< const cRecord*, const cRecord*> > chosen_pairs;
	cPrint_Pair do_print(outfile, cUnique_Record_ID::static_get_class_name());
	create_xset01( chosen_pairs, record_pointers, limit);
	std::for_each(chosen_pairs.begin(), chosen_pairs.end(), do_print);
	outfile.close();
	std::cout << "Done" << std::endl;

	// tset05

	operator_truncate_firstname.set_truncater(0, 1, true);
	operator_truncate_lastname.set_truncater(0, 2, true);
	t_extract_equal.push_back(& operator_truncate_firstname);
	t_extract_equal.push_back(& operator_truncate_lastname);


	bft.reset(blocking_column_names.size());
	const string tset05_equal_name_array[] = { cFirstname::static_get_class_name(), cLastname::static_get_class_name()};
	const string tset05_nonequal_name_array[] = {};
	const vector <string> tset05_equal_name_vec (tset05_equal_name_array, tset05_equal_name_array + sizeof(tset05_equal_name_array)/sizeof(string));
	const vector <string> tset05_nonequal_name_vec (tset05_nonequal_name_array, tset05_nonequal_name_array + sizeof(tset05_nonequal_name_array)/sizeof(string));

	bft.create_set(&cBlocking_For_Training::create_tset05_on_block, tset05_equal_name_vec, t_extract_equal, tset05_nonequal_name_vec, t_extract_nonequal );

	const char * current_file = training_filenames.at(1).c_str();
	outfile.open(current_file);
	if ( ! outfile.good() )
		throw cException_File_Not_Found(current_file);
	std::cout << "Creating " << current_file << " ..." << std::endl;
	bft.print(outfile, uid_identifier);
	outfile.close();
	std::cout << "Done" << std::endl;

	return true;
}
// Builds the two "changeable" training sets for the patent stage and writes
// them to disk.
//
//   training_filenames[0]  xset01: built per block by create_xset01_on_block,
//                          with equal application year and differing assignee
//                          number and city
//   training_filenames[1]  tset05: built per block by create_tset05_on_block
//                          with no extra equal / non-equal columns
//
// The coauthor and class comparators are activated for the duration of the
// call if they were not already active, and returned to their previous state
// on every exit path, including exceptions.
//
//   record_pointers        records to draw pairs from
//   blocking_column_names  columns used to build the training blocks
//   pstring_oper           string manipulators paired with the blocking columns
//   limit                  forwarded to the blocking object
//   training_filenames     exactly two output paths
//
// Always returns true. Throws cException_Other when training_filenames does
// not hold exactly two entries, and cException_File_Not_Found when either
// output file cannot be opened.
bool make_changable_training_sets_by_patent(const list <const cRecord*> & record_pointers, const vector<string >& blocking_column_names,
						const vector < const cString_Manipulator *> & pstring_oper, const unsigned int limit, const vector <string> & training_filenames) {

	// Scope guard: deactivates the comparators this function switched on.
	struct cComparator_Restorer {
		const bool deactivate_coauthor;
		const bool deactivate_class;
		cComparator_Restorer(const bool coauthor, const bool klass)
			: deactivate_coauthor(coauthor), deactivate_class(klass) {}
		~cComparator_Restorer() {
			if ( deactivate_coauthor )
				cCoauthor::static_deactivate_comparator();
			if ( deactivate_class )
				cClass::static_deactivate_comparator();
		}
	};


	if ( training_filenames.size() != 2 )
		throw cException_Other("Training: there should be 2 changeable training sets.");


	const bool is_coauthor_active = cCoauthor::static_is_comparator_activated();
	const bool is_class_active = cClass::static_is_comparator_activated();

	if ( ! is_coauthor_active )
		cCoauthor::static_activate_comparator();

	if ( ! is_class_active )
		cClass::static_activate_comparator();

	// From here on, any return or throw restores the original activation state.
	const cComparator_Restorer restore_comparators( ! is_coauthor_active, ! is_class_active );

	const string uid_identifier = cUnique_Record_ID::static_get_class_name();
	cBlocking_For_Training bft (record_pointers, blocking_column_names, pstring_oper, uid_identifier, limit);

	// One manipulator per column name: one "equal" column, two "non-equal" columns.
	cString_Remain_Same donotchange;
	vector <const cString_Manipulator*> t_extract_equal, t_extract_nonequal, x_extract_equal, x_extract_nonequal;
	x_extract_equal.push_back(& donotchange);
	x_extract_nonequal.push_back(& donotchange);
	x_extract_nonequal.push_back(&donotchange);

	std::ofstream outfile;
	//xset01
	const string xset01_equal_name_array[] = {cApplyYear::static_get_class_name() };
	const string xset01_nonequal_name_array[] = { cAsgNum::static_get_class_name(), cCity::static_get_class_name() };
	const vector <string> xset01_equal_name_vec (xset01_equal_name_array, xset01_equal_name_array + sizeof(xset01_equal_name_array)/sizeof(string));
	const vector <string> xset01_nonequal_name_vec (xset01_nonequal_name_array, xset01_nonequal_name_array + sizeof(xset01_nonequal_name_array)/sizeof(string));

	bft.create_set(&cBlocking_For_Training::create_xset01_on_block, xset01_equal_name_vec, x_extract_equal, xset01_nonequal_name_vec, x_extract_nonequal);
	const char * current_file = training_filenames.at(0).c_str();
	outfile.open(current_file);
	if ( ! outfile.good() )
		throw cException_File_Not_Found(current_file);
	std::cout << "Creating " << current_file << " ..." << std::endl;
	bft.print(outfile, uid_identifier);
	outfile.close();
	std::cout << "Done" << std::endl;

	// tset05
	bft.reset(blocking_column_names.size());
	// Neither list has entries for this set. Built directly as empty vectors
	// because a zero-length array is not valid standard C++.
	const vector <string> tset05_equal_name_vec = vector <string>();
	const vector <string> tset05_nonequal_name_vec = vector <string>();

	bft.create_set(&cBlocking_For_Training::create_tset05_on_block, tset05_equal_name_vec, t_extract_equal, tset05_nonequal_name_vec, t_extract_nonequal );

	current_file = training_filenames.at(1).c_str();
	outfile.open(current_file);
	if ( ! outfile.good() )
		throw cException_File_Not_Found(current_file);
	std::cout << "Creating " << current_file << " ..." << std::endl;
	bft.print(outfile, uid_identifier);
	outfile.close();
	std::cout << "Done" << std::endl;

	return true;
}
bool dump_match ( const char * sqlite3_target, const char * tablename, const char * txt_source, const string & unique_record_name, const string & unique_inventor_name) {


	sqlite3* pDB;
	int sqlres;
	std::cout << "Dumping " << txt_source << " to file << " << sqlite3_target << " >>, tablename << " << tablename << " >> ......" << std::endl;


	sqlres = sqlite3_open_v2(sqlite3_target,&pDB,SQLITE_OPEN_READWRITE ,NULL);
	if (SQLITE_OK != sqlres ) {
		std::cout << "SQL DB open error." <<sqlres<< std::endl;
		throw cException_SQLITE3();
	}

	std::ifstream::sync_with_stdio(false);
	std::ifstream infile(txt_source);
	const unsigned int primary_delim_size = strlen(cCluster_Info::primary_delim);
	const unsigned int secondary_delim_size = strlen(cCluster_Info::secondary_delim);
	map < string, string > update_dict;
	map < string, string >::iterator pm;

	if (infile.good()) {
		string filedata;
		register size_t pos, prev_pos;
		while ( getline(infile, filedata)) {
			pos = prev_pos = 0;
			pos = filedata.find(cCluster_Info::primary_delim, prev_pos);
			string valuestring = filedata.substr( prev_pos, pos - prev_pos);
			prev_pos = pos + primary_delim_size;

			pos = filedata.find(cCluster_Info::primary_delim, prev_pos);
			prev_pos = pos + primary_delim_size;


			while ( ( pos = filedata.find(cCluster_Info::secondary_delim, prev_pos) )!= string::npos){
				string keystring = filedata.substr( prev_pos, pos - prev_pos);
				pm = update_dict.find(keystring);
				if ( pm != update_dict.end() )
					throw cException_Duplicate_Attribute_In_Tree(keystring.c_str());
				update_dict.insert(std::pair<string,string>(keystring, valuestring));
				prev_pos = pos + secondary_delim_size;
			}
		}
		std::cout << txt_source << " is ready to be dumped into "<< sqlite3_target << std::endl;
	}
	else {
		throw cException_File_Not_Found(txt_source);
	}

	sqlite3_exec(pDB, "BEGIN TRANSACTION", NULL, NULL, NULL);
	std::ifstream::sync_with_stdio(true);

	const unsigned int buff_size = 512;
	char buffer[buff_size];
	sqlite3_stmt *statement;
	sqlite3_exec(pDB, "PRAGMA synchronous = OFF", NULL, NULL, NULL);
	sprintf(buffer, "CREATE INDEX IF NOT EXISTS index_%s_on_%s on %s(%s);", unique_record_name.c_str(), tablename, tablename, unique_record_name.c_str());
	std::cout << "Creating index ......" << std::endl;
	sqlite3_exec(pDB, buffer, NULL, NULL, NULL);
	std::cout << "Index created." << std::endl;


	sprintf(buffer, "UPDATE %s set %s = @VAL WHERE %s = @KEY;", tablename, unique_inventor_name.c_str(), unique_record_name.c_str());
	sqlres = sqlite3_prepare_v2(pDB,  buffer, -1, &statement, NULL);
	if ( sqlres != SQLITE_OK )
		throw cException_SQLITE3();
	//char *zSQL;
	const unsigned int base = 100000;
	unsigned int count = 0;
	for ( map<string, string>::const_iterator cpm = update_dict.begin(); cpm != update_dict.end(); ++cpm) {
		sqlite3_bind_text(statement, 1, cpm->second.c_str(), -1, SQLITE_TRANSIENT);
		sqlite3_bind_text(statement, 2, cpm->first.c_str(), -1, SQLITE_TRANSIENT);

		sqlres = sqlite3_step(statement);
		if ( sqlres != SQLITE_DONE )
			throw cException_SQLITE3();
		sqlite3_clear_bindings(statement);
		sqlite3_reset(statement);
		++count;
		if ( count % base == 0 )
			std::cout << count << " records has been updated. " << std::endl;
	}

	sqlite3_exec(pDB, "END TRANSACTION", NULL, NULL, NULL);
	sqlite3_finalize(statement);
	sqlite3_close(pDB);

	std::cout << "Dumping complete. " << std::endl;
	return true;
}