void one_step_postprocess(const list < Record > & all_records, const char * last_disambig_result, const char * outputfile) { // TODO: document valid keys for this dictionary. map <string, const Record *> uid_dict; const string uid_identifier = cUnique_Record_ID::static_get_class_name(); // uid_dict is probably the return value from create_btree_uid2record_pointer create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); // instantiate a map map < const Record *, RecordPList, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name())); build_patent_tree(patent_tree , all_records); ClusterSet cs; //cs.convert_from_ClusterInfo(&match); // Read results from last disambiguation cs.read_from_file(last_disambig_result, uid_dict); map < const Record *, const Record *> uid2uinv; const list < Cluster > & full_list = cs.get_set(); for (list < Cluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t) { t->add_uid2uinv(uid2uinv); } const char * suffix = ".pplog"; const string logfile = string(outputfile) + suffix ; //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile); post_polish(cs, uid2uinv, patent_tree, logfile); cs.output_results(outputfile); }
void one_step_prostprocess(const list < Record > & all_records, const char * last_disambig_result, const char * outputfile) { map <string, const Record *> uid_dict; const string uid_identifier = cUnique_Record_ID::static_get_class_name(); create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); map < const Record *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name())); build_patent_tree( patent_tree , all_records ); cCluster_Set cs; //cs.convert_from_ClusterInfo(&match); cs.read_from_file(last_disambig_result, uid_dict); map < const Record *, const Record *> uid2uinv; const list < cCluster > & full_list = cs.get_set(); for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t ) t->add_uid2uinv(uid2uinv); const char * suffix = ".pplog"; const string logfile = string(outputfile) + suffix ; //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile); post_polish( cs, uid2uinv, patent_tree, logfile); cs.output_results(outputfile); }
void one_step_prostprocess( const list < cRecord > & all_records, const char * last_disambig_result, const char * outputfile) { map <string, const cRecord *> uid_dict; const string uid_identifier = cUnique_Record_ID::static_get_class_name(); create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); map < const cRecord *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name())); build_patent_tree( patent_tree , all_records ); #if 0 list < const cRecord *> all_rec_pointers; for ( list<cRecord>::const_iterator p = all_records.begin(); p != all_records.end(); ++p ) all_rec_pointers.push_back(&(*p)); cCluster_Info match ( uid_dict, true, true, false); const unsigned int num_coauthors_to_group = 2; cBlocking_Operation_By_Coauthors blocker_coauthor( all_rec_pointers, num_coauthors_to_group ); cString_NoSpace_Truncate operator_truncate_firstname; cString_NoSpace_Truncate operator_truncate_lastname; cString_NoSpace_Truncate operator_truncate_middlename; vector <const cString_Manipulator*> pstring_oper; pstring_oper.push_back(& operator_truncate_firstname); pstring_oper.push_back(& operator_truncate_middlename); pstring_oper.push_back(& operator_truncate_lastname); const string blocking_names[] = {cFirstname::static_get_class_name(), cMiddlename::static_get_class_name(), cLastname::static_get_class_name()}; vector < string > blocking_column_names(blocking_names, blocking_names + sizeof(blocking_names)/sizeof(string) ); vector < unsigned int > blocking_column_data_indice ( blocking_column_names.size(), 0 ); //blocking_column_data_indice.at(0) = 1; //blocking_column_data_indice.at(1) = 1; cBlocking_Operation_Multiple_Column_Manipulate blocker(pstring_oper, blocking_column_names, blocking_column_data_indice); operator_truncate_firstname.set_truncater(0, 0, true); operator_truncate_middlename.set_truncater(0, 0, false); operator_truncate_lastname.set_truncater(0, 0, true); match.reset_blocking( blocker , last_disambig_result ); 
blocker_coauthor.build_uid2uinv_tree(match); #endif cCluster_Set cs; //cs.convert_from_ClusterInfo(&match); cs.read_from_file(last_disambig_result, uid_dict); map < const cRecord *, const cRecord *> uid2uinv; const list < cCluster > & full_list = cs.get_set(); for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t ) t->add_uid2uinv(uid2uinv); const char * suffix = ".pplog"; const string logfile = string(outputfile) + suffix ; //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile); post_polish( cs, uid2uinv, patent_tree, logfile); cs.output_results(outputfile); }
/**
 * Constructs the coauthor-based blocking operation.
 *
 * Initialises the patent tree with a cSort_by_attrib comparator built from the
 * cPatent class name, stores the coauthor count, calls too_many_coauthors on
 * that count, builds the patent tree from the supplied record pointers, and
 * extends `infoless` with delimiters for each coauthor.
 *
 * @param all_rec_pointers  Record pointers forwarded to build_patent_tree.
 * @param coauthors         Stored in num_coauthors; also the number of
 *                          delimiter groups appended to `infoless`.
 */
cBlocking_Operation_By_Coauthors::cBlocking_Operation_By_Coauthors(
        const RecordPList & all_rec_pointers,
        const uint32_t coauthors)
    : patent_tree(cSort_by_attrib(cPatent::static_get_class_name())),
      num_coauthors(coauthors) {

    too_many_coauthors(num_coauthors);
    build_patent_tree(all_rec_pointers);

    // Each coauthor contributes two appends of (delim + delim).
    // NOTE(review): the doubled append is reproduced from the original;
    // confirm it is intended and not a copy/paste duplicate.
    uint32_t remaining = num_coauthors;
    while (remaining > 0) {
        infoless += cBlocking_Operation::delim + cBlocking_Operation::delim;
        infoless += cBlocking_Operation::delim + cBlocking_Operation::delim;
        --remaining;
    }
}