void one_step_postprocess(const list < Record > & all_records, const char * last_disambig_result, const char * outputfile) { // TODO: document valid keys for this dictionary. map <string, const Record *> uid_dict; const string uid_identifier = cUnique_Record_ID::static_get_class_name(); // uid_dict is probably the return value from create_btree_uid2record_pointer create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); // instantiate a map map < const Record *, RecordPList, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name())); build_patent_tree(patent_tree , all_records); ClusterSet cs; //cs.convert_from_ClusterInfo(&match); // Read results from last disambiguation cs.read_from_file(last_disambig_result, uid_dict); map < const Record *, const Record *> uid2uinv; const list < Cluster > & full_list = cs.get_set(); for (list < Cluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t) { t->add_uid2uinv(uid2uinv); } const char * suffix = ".pplog"; const string logfile = string(outputfile) + suffix ; //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile); post_polish(cs, uid2uinv, patent_tree, logfile); cs.output_results(outputfile); }
void one_step_prostprocess(const list < Record > & all_records, const char * last_disambig_result, const char * outputfile) { map <string, const Record *> uid_dict; const string uid_identifier = cUnique_Record_ID::static_get_class_name(); create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); map < const Record *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name())); build_patent_tree( patent_tree , all_records ); cCluster_Set cs; //cs.convert_from_ClusterInfo(&match); cs.read_from_file(last_disambig_result, uid_dict); map < const Record *, const Record *> uid2uinv; const list < cCluster > & full_list = cs.get_set(); for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t ) t->add_uid2uinv(uid2uinv); const char * suffix = ".pplog"; const string logfile = string(outputfile) + suffix ; //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile); post_polish( cs, uid2uinv, patent_tree, logfile); cs.output_results(outputfile); }
void one_step_prostprocess( const list < cRecord > & all_records, const char * last_disambig_result, const char * outputfile) { map <string, const cRecord *> uid_dict; const string uid_identifier = cUnique_Record_ID::static_get_class_name(); create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); map < const cRecord *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name())); build_patent_tree( patent_tree , all_records ); #if 0 list < const cRecord *> all_rec_pointers; for ( list<cRecord>::const_iterator p = all_records.begin(); p != all_records.end(); ++p ) all_rec_pointers.push_back(&(*p)); cCluster_Info match ( uid_dict, true, true, false); const unsigned int num_coauthors_to_group = 2; cBlocking_Operation_By_Coauthors blocker_coauthor( all_rec_pointers, num_coauthors_to_group ); cString_NoSpace_Truncate operator_truncate_firstname; cString_NoSpace_Truncate operator_truncate_lastname; cString_NoSpace_Truncate operator_truncate_middlename; vector <const cString_Manipulator*> pstring_oper; pstring_oper.push_back(& operator_truncate_firstname); pstring_oper.push_back(& operator_truncate_middlename); pstring_oper.push_back(& operator_truncate_lastname); const string blocking_names[] = {cFirstname::static_get_class_name(), cMiddlename::static_get_class_name(), cLastname::static_get_class_name()}; vector < string > blocking_column_names(blocking_names, blocking_names + sizeof(blocking_names)/sizeof(string) ); vector < unsigned int > blocking_column_data_indice ( blocking_column_names.size(), 0 ); //blocking_column_data_indice.at(0) = 1; //blocking_column_data_indice.at(1) = 1; cBlocking_Operation_Multiple_Column_Manipulate blocker(pstring_oper, blocking_column_names, blocking_column_data_indice); operator_truncate_firstname.set_truncater(0, 0, true); operator_truncate_middlename.set_truncater(0, 0, false); operator_truncate_lastname.set_truncater(0, 0, true); match.reset_blocking( blocker , last_disambig_result ); blocker_coauthor.build_uid2uinv_tree(match); #endif cCluster_Set cs; //cs.convert_from_ClusterInfo(&match); cs.read_from_file(last_disambig_result, uid_dict); map < const cRecord *, const cRecord *> uid2uinv; const list < cCluster > & full_list = cs.get_set(); for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t ) t->add_uid2uinv(uid2uinv); const char * suffix = ".pplog"; const string logfile = string(outputfile) + suffix ; //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile); post_polish( cs, uid2uinv, patent_tree, logfile); cs.output_results(outputfile); }
int unique_inventors_per_period ( unsigned int starting_year, unsigned int interval, const char * wholedatabase, const char * disambigresult, const char * outputfile) { typedef std::pair< const cRecord *, set < const cRecord *> > cUINV2UCOAUTHOR; list <cRecord> all_records; const string columns[] = {"Unique_Record_ID", "Patent", "ApplyYear"}; const vector <string> column_vec(columns, columns + sizeof(columns)/sizeof(string) ); bool is_success = fetch_records_from_txt(all_records, wholedatabase, column_vec); if (not is_success) return 1; list < const cRecord *> all_rec_pointers; for ( list<cRecord>::const_iterator p = all_records.begin(); p != all_records.end(); ++p ) all_rec_pointers.push_back(&(*p)); cString_Remain_Same manobj; cBlocking_Operation_Column_Manipulate tempblocker (manobj, "ApplyYear"); map <string, const cRecord *> uid_dict; create_btree_uid2record_pointer(uid_dict, all_records, cUnique_Record_ID::static_get_class_name()); cCluster_Set all_clusters; //all_clusters.convert_from_ClusterInfo(&ci); all_clusters.read_from_file(disambigresult, uid_dict); map < const cRecord *, const cRecord *> uid2uinv; const list < cCluster > & full_list = all_clusters.get_set(); for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t ) t->add_uid2uinv(uid2uinv); const map < const cRecord *, cGroup_Value, cSort_by_attrib > & patent_tree = bocobj.get_patent_tree(); const string & beginyearstring = ci.get_cluster_map().begin()->first; const string & endyearstring = ci.get_cluster_map().rbegin()->first; const unsigned int endyear = atoi( endyearstring.c_str() ); const unsigned int appyearindex = cRecord::get_index_by_name(cApplyYear::static_get_class_name()); std::cout << "Begin year = "<<beginyearstring << " , End year = " << endyearstring << std::endl; map < unsigned int, unsigned int > unique_coauthor_year_chunk; map < unsigned int, unsigned int > unique_inventor_year_chunk; for ( unsigned int y = starting_year; y <= endyear; y += interval ) { unsigned int unique_inventors = 0; unsigned int unique_coauthors = 0; for ( list< cCluster >::const_iterator puinv = full_list.begin() ; puinv != full_list.end(); ++puinv) { std::pair< const cRecord * , set< const cRecord *> > kk = ones_temporal_unique_coauthors ( *puinv, uid2uinv, patent_tree, y, y + interval, appyearindex ); if ( kk.first != NULL ) { ++ unique_inventors; unique_coauthors += kk.second.size(); } } unique_coauthor_year_chunk.insert(std::pair<unsigned int, unsigned int>(y, unique_coauthors)); unique_inventor_year_chunk.insert(std::pair<unsigned int, unsigned int>(y, unique_inventors)); std::cout << "Year " << y << " done." << std::endl; } std::ostream & os = std::cout; const string space_delim = " "; os << "Year Chunk:" << space_delim << "Number of Unique Inventors:" << space_delim << "Number of Unique Coauthors:" << std::endl; for ( map < unsigned int, unsigned int >::const_iterator p = unique_inventor_year_chunk.begin(); p != unique_inventor_year_chunk.end(); ++p ) { os << p->first << space_delim << p->second << space_delim << unique_coauthor_year_chunk.find(p->first)->second << std::endl; } return 0; }