bool make_stable_training_sets_by_personal(const list <Record> & all_records, const unsigned int limit, const vector <string> & training_filenames) { //if ( training_filenames.size() != 2 ) //throw cException_Other("Training: there should be 2 changeable training sets."); cGroup_Value rare_firstname_set; cGroup_Value rare_lastname_set; std::ofstream outfile; cPrint_Pair do_print(outfile, cUnique_Record_ID::static_get_class_name()); const char * current_file; vector<cGroup_Value *> rare_pointer_vec; rare_pointer_vec.push_back(&rare_firstname_set); rare_pointer_vec.push_back(&rare_lastname_set); const vector< const cGroup_Value * > const_rare_pointer_vec(rare_pointer_vec.begin(), rare_pointer_vec.end()); list < const Record*> record_pointers; for ( list<Record>::const_iterator p = all_records.begin(); p != all_records.end(); ++p ) record_pointers.push_back(&(*p)); find_rare_names_v2(rare_pointer_vec, record_pointers); list<pointer_pairs> pair_list; vector <string> rare_column_names; rare_column_names.push_back(string(cFirstname::static_get_class_name())); rare_column_names.push_back(string(cLastname::static_get_class_name())); //xset03 pair_list.clear(); create_xset03(pair_list, record_pointers, const_rare_pointer_vec, limit); current_file = training_filenames.at(0).c_str(); outfile.open(current_file); if ( ! outfile.good() ) throw cException_File_Not_Found(current_file); std::cout << "Creating " << current_file << " ..." << std::endl; std::for_each(pair_list.begin(), pair_list.end(), do_print); outfile.close(); std::cout << "Done" << std::endl; //tset02 pair_list.clear(); create_tset02(pair_list, record_pointers, rare_column_names, const_rare_pointer_vec, limit); current_file = training_filenames.at(1).c_str(); outfile.open(current_file); if ( ! outfile.good() ) throw cException_File_Not_Found(current_file); std::cout << "Creating " << current_file << " ..." 
<< std::endl; std::for_each(pair_list.begin(), pair_list.end(), do_print); outfile.close(); std::cout << "Done" << std::endl; return true; }
/**
 * Reads training pairs from a comma-delimited text file and appends them to
 * trainpairs.
 *
 * Only the first two comma-separated fields of each line are used; any
 * further fields are ignored. Lines that contain no delimiter at all cannot
 * form a pair and are skipped (non-empty ones are reported on std::cerr).
 *
 * @param trainpairs  container that receives one TrainingPair per valid line
 * @param txt_file    path of the file to read
 * @throws cException_File_Not_Found if the file cannot be opened
 */
void cRatioComponent::read_train_pairs(TrainingPairs & trainpairs, const char * txt_file) const {

    std::cout << "Reading training pairs from " << txt_file
              << ", " << __FILE__ << ":" << __LINE__ << std::endl;

    static const char * delim = ",";
    static const uint32_t delim_size = strlen(delim);

    // NOTE(review): this changes a process-wide stream setting after output
    // has already been written; kept because other loaders in this file do
    // the same.
    std::ifstream::sync_with_stdio(false);
    std::ifstream infile(txt_file);

    if ( ! infile.good() )
        throw cException_File_Not_Found(txt_file);

    string filedata;
    while (getline(infile, filedata)) {
        const size_t first_end = filedata.find(delim);
        if ( first_end == string::npos ) {
            // Without this guard "npos + delim_size" wraps around and the
            // whole line would be stored as both members of the pair.
            if ( ! filedata.empty() )
                std::cerr << "Warning: skipping malformed training pair line in "
                          << txt_file << ": " << filedata << std::endl;
            continue;
        }

        const size_t second_begin = first_end + delim_size;
        const size_t second_end = filedata.find(delim, second_begin);
        // substr() takes a length, not an end position.
        const size_t second_len = ( second_end == string::npos )
                                ? string::npos
                                : second_end - second_begin;

        const string firststring = filedata.substr(0, first_end);
        const string secondstring = filedata.substr(second_begin, second_len);
        trainpairs.push_back(TrainingPair(firststring, secondstring));
    }

    std::cout << txt_file << " has been loaded as the " << attrib_group << " part of the training sets."<< std::endl;
}
void ClusterSet::read_from_file(const char * filename, const map <string, const Record*> & uid_tree) { unsigned int count = 0; const unsigned int base = 100000; const unsigned int primary_delim_size = strlen(ClusterInfo::primary_delim); const unsigned int secondary_delim_size = strlen(ClusterInfo::secondary_delim); std::ifstream infile ( filename); if (infile.good()) { string filedata; while ( getline(infile, filedata)) { register size_t pos = 0, prev_pos = 0; pos = filedata.find(ClusterInfo::primary_delim, prev_pos); string keystring = filedata.substr( prev_pos, pos - prev_pos); const Record * key = retrieve_record_pointer_by_unique_id( keystring, uid_tree ); prev_pos = pos + primary_delim_size; pos = filedata.find(ClusterInfo::primary_delim, prev_pos); double val = 0; if ( true ) { string cohesionstring = filedata.substr( prev_pos, pos - prev_pos); val = atof(cohesionstring.c_str()); } prev_pos = pos + primary_delim_size; RecordPList tempv; while ( ( pos = filedata.find(ClusterInfo::secondary_delim, prev_pos) )!= string::npos){ string valuestring = filedata.substr( prev_pos, pos - prev_pos); const Record * value = retrieve_record_pointer_by_unique_id( valuestring, uid_tree); tempv.push_back(value); prev_pos = pos + secondary_delim_size; } ClusterHead th(key, val); Cluster tempc(th, tempv); tempc.self_repair(); this->consolidated.push_back(tempc); ++count; if ( count % base == 0 ) { std::cout << count << " records have been loaded from the cluster file. " << std::endl; } } std::cout << "Totally, " << count << " records have been loaded from " << filename << std::endl; } else { throw cException_File_Not_Found(filename); } }
void cRatios::read_ratios_file(const char * filename) { std::ifstream::sync_with_stdio(false); std::ifstream infile (filename); const uint32_t primary_delim_size = strlen(primary_delim); const uint32_t secondary_delim_size = strlen(secondary_delim); if (!infile.good()) throw cException_File_Not_Found(filename); string filedata; register size_t pos, prev_pos; getline(infile, filedata); pos = prev_pos = 0; while ((pos = filedata.find(secondary_delim, prev_pos)) != string::npos) { attrib_names.push_back(filedata.substr(prev_pos, pos - prev_pos)); prev_pos = pos + secondary_delim_size; } SimilarityProfile key; while (getline(infile, filedata)) { // TODO: replace this with a templated callback key.clear(); pos = prev_pos = 0; while ((pos = filedata.find(secondary_delim, prev_pos)) != string::npos) { key.push_back(atoi(filedata.substr(prev_pos, pos - prev_pos).c_str())); prev_pos = pos + secondary_delim_size; } pos = filedata.find(primary_delim, 0); pos += primary_delim_size; const double value = atof(filedata.substr(pos).c_str()); final_ratios.insert(std::pair<SimilarityProfile, double>(key, value)); } // TODO: This should probably not go here, invoke from calling function. Record::activate_comparators_by_name(attrib_names); std::cout << filename << " has been loaded as the final ratios file"<< std::endl; std::cout << "Resetting similarity profiles ... ..." << std::endl; std::cout << "-----Similarity Profiles reset.-------" << std::endl; }
bool make_changable_training_sets_by_assignee(const list <const cRecord*> & record_pointers, const vector<string >& blocking_column_names, const vector < const cString_Manipulator *> & pstring_oper, const unsigned int limit, const vector <string> & training_filenames) { if ( training_filenames.size() != 2 ) throw cException_Other("Training: there should be 2 changeable training sets."); const string uid_identifier = cUnique_Record_ID::static_get_class_name(); cBlocking_For_Training bft (record_pointers, blocking_column_names, pstring_oper, uid_identifier, limit); cString_Remain_Same donotchange; cString_NoSpace_Truncate operator_truncate_firstname; cString_NoSpace_Truncate operator_truncate_lastname; vector <const cString_Manipulator*> t_extract_equal, t_extract_nonequal, x_extract_equal, x_extract_nonequal; std::ofstream outfile; //xset01 /* x_extract_nonequal.push_back(& donotchange); x_extract_equal.push_back(&donotchange); const string xset01_equal_name_array[] = {cApplyYear::static_get_class_name() }; const string xset01_nonequal_name_array[] = { cCity::static_get_class_name() }; const vector <string> xset01_equal_name_vec (xset01_equal_name_array, xset01_equal_name_array + sizeof(xset01_equal_name_array)/sizeof(string)); const vector <string> xset01_nonequal_name_vec (xset01_nonequal_name_array, xset01_nonequal_name_array + sizeof(xset01_nonequal_name_array)/sizeof(string)); bft.create_set(&cBlocking_For_Training::create_xset01_on_block, xset01_equal_name_vec, x_extract_equal, xset01_nonequal_name_vec, x_extract_nonequal); const char * current_file = training_filenames.at(0).c_str(); outfile.open(current_file); if ( ! outfile.good() ) throw cException_File_Not_Found(current_file); std::cout << "Creating " << current_file << " ..." 
<< std::endl; bft.print(outfile, uid_identifier); outfile.close(); std::cout << "Done" << std::endl; */ outfile.open(training_filenames.at(0).c_str()); list < std::pair< const cRecord*, const cRecord*> > chosen_pairs; cPrint_Pair do_print(outfile, cUnique_Record_ID::static_get_class_name()); create_xset01( chosen_pairs, record_pointers, limit); std::for_each(chosen_pairs.begin(), chosen_pairs.end(), do_print); outfile.close(); std::cout << "Done" << std::endl; // tset05 operator_truncate_firstname.set_truncater(0, 1, true); operator_truncate_lastname.set_truncater(0, 2, true); t_extract_equal.push_back(& operator_truncate_firstname); t_extract_equal.push_back(& operator_truncate_lastname); bft.reset(blocking_column_names.size()); const string tset05_equal_name_array[] = { cFirstname::static_get_class_name(), cLastname::static_get_class_name()}; const string tset05_nonequal_name_array[] = {}; const vector <string> tset05_equal_name_vec (tset05_equal_name_array, tset05_equal_name_array + sizeof(tset05_equal_name_array)/sizeof(string)); const vector <string> tset05_nonequal_name_vec (tset05_nonequal_name_array, tset05_nonequal_name_array + sizeof(tset05_nonequal_name_array)/sizeof(string)); bft.create_set(&cBlocking_For_Training::create_tset05_on_block, tset05_equal_name_vec, t_extract_equal, tset05_nonequal_name_vec, t_extract_nonequal ); const char * current_file = training_filenames.at(1).c_str(); outfile.open(current_file); if ( ! outfile.good() ) throw cException_File_Not_Found(current_file); std::cout << "Creating " << current_file << " ..." << std::endl; bft.print(outfile, uid_identifier); outfile.close(); std::cout << "Done" << std::endl; return true; }
bool make_changable_training_sets_by_patent(const list <const cRecord*> & record_pointers, const vector<string >& blocking_column_names, const vector < const cString_Manipulator *> & pstring_oper, const unsigned int limit, const vector <string> & training_filenames) { if ( training_filenames.size() != 2 ) throw cException_Other("Training: there should be 2 changeable training sets."); const bool is_coauthor_active = cCoauthor::static_is_comparator_activated(); const bool is_class_active = cClass::static_is_comparator_activated(); if ( ! is_coauthor_active ) cCoauthor::static_activate_comparator(); if ( ! is_class_active ) cClass::static_activate_comparator(); const string uid_identifier = cUnique_Record_ID::static_get_class_name(); cBlocking_For_Training bft (record_pointers, blocking_column_names, pstring_oper, uid_identifier, limit); cString_Remain_Same donotchange; vector <const cString_Manipulator*> t_extract_equal, t_extract_nonequal, x_extract_equal, x_extract_nonequal; x_extract_equal.push_back(& donotchange); x_extract_nonequal.push_back(& donotchange); x_extract_nonequal.push_back(&donotchange); std::ofstream outfile; //xset01 const string xset01_equal_name_array[] = {cApplyYear::static_get_class_name() }; const string xset01_nonequal_name_array[] = { cAsgNum::static_get_class_name(), cCity::static_get_class_name() }; const vector <string> xset01_equal_name_vec (xset01_equal_name_array, xset01_equal_name_array + sizeof(xset01_equal_name_array)/sizeof(string)); const vector <string> xset01_nonequal_name_vec (xset01_nonequal_name_array, xset01_nonequal_name_array + sizeof(xset01_nonequal_name_array)/sizeof(string)); bft.create_set(&cBlocking_For_Training::create_xset01_on_block, xset01_equal_name_vec, x_extract_equal, xset01_nonequal_name_vec, x_extract_nonequal); const char * current_file = training_filenames.at(0).c_str(); outfile.open(current_file); if ( ! 
outfile.good() ) throw cException_File_Not_Found(current_file); std::cout << "Creating " << current_file << " ..." << std::endl; bft.print(outfile, uid_identifier); outfile.close(); std::cout << "Done" << std::endl; // tset05 bft.reset(blocking_column_names.size()); const string tset05_equal_name_array[] = {}; const string tset05_nonequal_name_array[] = {}; const vector <string> tset05_equal_name_vec (tset05_equal_name_array, tset05_equal_name_array + sizeof(tset05_equal_name_array)/sizeof(string)); const vector <string> tset05_nonequal_name_vec (tset05_nonequal_name_array, tset05_nonequal_name_array + sizeof(tset05_nonequal_name_array)/sizeof(string)); bft.create_set(&cBlocking_For_Training::create_tset05_on_block, tset05_equal_name_vec, t_extract_equal, tset05_nonequal_name_vec, t_extract_nonequal ); current_file = training_filenames.at(1).c_str(); outfile.open(current_file); if ( ! outfile.good() ) throw cException_File_Not_Found(current_file); std::cout << "Creating " << current_file << " ..." << std::endl; bft.print(outfile, uid_identifier); outfile.close(); std::cout << "Done" << std::endl; if ( ! is_coauthor_active ) cCoauthor::static_deactivate_comparator(); if ( ! is_class_active ) cClass::static_deactivate_comparator(); return true; }
bool dump_match ( const char * sqlite3_target, const char * tablename, const char * txt_source, const string & unique_record_name, const string & unique_inventor_name) { sqlite3* pDB; int sqlres; std::cout << "Dumping " << txt_source << " to file << " << sqlite3_target << " >>, tablename << " << tablename << " >> ......" << std::endl; sqlres = sqlite3_open_v2(sqlite3_target,&pDB,SQLITE_OPEN_READWRITE ,NULL); if (SQLITE_OK != sqlres ) { std::cout << "SQL DB open error." <<sqlres<< std::endl; throw cException_SQLITE3(); } std::ifstream::sync_with_stdio(false); std::ifstream infile(txt_source); const unsigned int primary_delim_size = strlen(cCluster_Info::primary_delim); const unsigned int secondary_delim_size = strlen(cCluster_Info::secondary_delim); map < string, string > update_dict; map < string, string >::iterator pm; if (infile.good()) { string filedata; register size_t pos, prev_pos; while ( getline(infile, filedata)) { pos = prev_pos = 0; pos = filedata.find(cCluster_Info::primary_delim, prev_pos); string valuestring = filedata.substr( prev_pos, pos - prev_pos); prev_pos = pos + primary_delim_size; pos = filedata.find(cCluster_Info::primary_delim, prev_pos); prev_pos = pos + primary_delim_size; while ( ( pos = filedata.find(cCluster_Info::secondary_delim, prev_pos) )!= string::npos){ string keystring = filedata.substr( prev_pos, pos - prev_pos); pm = update_dict.find(keystring); if ( pm != update_dict.end() ) throw cException_Duplicate_Attribute_In_Tree(keystring.c_str()); update_dict.insert(std::pair<string,string>(keystring, valuestring)); prev_pos = pos + secondary_delim_size; } } std::cout << txt_source << " is ready to be dumped into "<< sqlite3_target << std::endl; } else { throw cException_File_Not_Found(txt_source); } sqlite3_exec(pDB, "BEGIN TRANSACTION", NULL, NULL, NULL); std::ifstream::sync_with_stdio(true); const unsigned int buff_size = 512; char buffer[buff_size]; sqlite3_stmt *statement; sqlite3_exec(pDB, "PRAGMA synchronous = OFF", NULL, NULL, 
NULL); sprintf(buffer, "CREATE INDEX IF NOT EXISTS index_%s_on_%s on %s(%s);", unique_record_name.c_str(), tablename, tablename, unique_record_name.c_str()); std::cout << "Creating index ......" << std::endl; sqlite3_exec(pDB, buffer, NULL, NULL, NULL); std::cout << "Index created." << std::endl; sprintf(buffer, "UPDATE %s set %s = @VAL WHERE %s = @KEY;", tablename, unique_inventor_name.c_str(), unique_record_name.c_str()); sqlres = sqlite3_prepare_v2(pDB, buffer, -1, &statement, NULL); if ( sqlres != SQLITE_OK ) throw cException_SQLITE3(); //char *zSQL; const unsigned int base = 100000; unsigned int count = 0; for ( map<string, string>::const_iterator cpm = update_dict.begin(); cpm != update_dict.end(); ++cpm) { sqlite3_bind_text(statement, 1, cpm->second.c_str(), -1, SQLITE_TRANSIENT); sqlite3_bind_text(statement, 2, cpm->first.c_str(), -1, SQLITE_TRANSIENT); sqlres = sqlite3_step(statement); if ( sqlres != SQLITE_DONE ) throw cException_SQLITE3(); sqlite3_clear_bindings(statement); sqlite3_reset(statement); ++count; if ( count % base == 0 ) std::cout << count << " records has been updated. " << std::endl; } sqlite3_exec(pDB, "END TRANSACTION", NULL, NULL, NULL); sqlite3_finalize(statement); sqlite3_close(pDB); std::cout << "Dumping complete. " << std::endl; return true; }