Example #1
0
void
one_step_postprocess(const list < Record > & all_records,
                     const char * last_disambig_result,
                     const char * outputfile) {

    // TODO: document valid keys for this dictionary.
    map <string, const Record *> uid_dict;

    const string uid_identifier = cUnique_Record_ID::static_get_class_name();
    // uid_dict is probably the return value from create_btree_uid2record_pointer
    create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier);
    // instantiate a map
    map < const Record *, RecordPList, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name()));
    build_patent_tree(patent_tree , all_records);
    ClusterSet cs;
    //cs.convert_from_ClusterInfo(&match);
    // Read results from last disambiguation 
    cs.read_from_file(last_disambig_result, uid_dict);
    map < const Record *, const Record *> uid2uinv;
    const list < Cluster > & full_list = cs.get_set();

    for (list < Cluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t) {
        t->add_uid2uinv(uid2uinv);
    }

    const char * suffix = ".pplog";
    const string logfile = string(outputfile) + suffix ;
    //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile);
    post_polish(cs, uid2uinv, patent_tree, logfile);
    cs.output_results(outputfile);
}
Example #2
0
void 
one_step_prostprocess(const list < Record > & all_records, 
                      const char * last_disambig_result, 
                      const char * outputfile) {

    map <string, const Record *> uid_dict;
    const string uid_identifier = cUnique_Record_ID::static_get_class_name();
    create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier);
    map < const Record *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name()));
    build_patent_tree(  patent_tree , all_records );
    cCluster_Set cs;
    //cs.convert_from_ClusterInfo(&match);
    cs.read_from_file(last_disambig_result, uid_dict);
    map < const Record *, const Record *> uid2uinv;
    const list < cCluster > & full_list = cs.get_set();

    for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t )
        t->add_uid2uinv(uid2uinv);

    const char * suffix = ".pplog";
    const string logfile = string(outputfile) + suffix ;
    //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile);
    post_polish( cs, uid2uinv, patent_tree, logfile);
    cs.output_results(outputfile);
}
void one_step_prostprocess( const list < cRecord > & all_records, const char * last_disambig_result, const char * outputfile) {
	map <string, const cRecord *> uid_dict;
	const string uid_identifier = cUnique_Record_ID::static_get_class_name();
	create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier);
	map < const cRecord *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name()));
	build_patent_tree(  patent_tree , all_records );
#if 0
	list < const cRecord *> all_rec_pointers;
	for ( list<cRecord>::const_iterator p = all_records.begin(); p != all_records.end(); ++p )
		all_rec_pointers.push_back(&(*p));

	cCluster_Info match ( uid_dict, true, true, false);

	const unsigned int num_coauthors_to_group = 2;
	cBlocking_Operation_By_Coauthors blocker_coauthor( all_rec_pointers, num_coauthors_to_group );

	cString_NoSpace_Truncate operator_truncate_firstname;
	cString_NoSpace_Truncate operator_truncate_lastname;
	cString_NoSpace_Truncate operator_truncate_middlename;

	vector <const cString_Manipulator*> pstring_oper;
	pstring_oper.push_back(& operator_truncate_firstname);
	pstring_oper.push_back(& operator_truncate_middlename);
	pstring_oper.push_back(& operator_truncate_lastname);

	const string blocking_names[] = {cFirstname::static_get_class_name(), cMiddlename::static_get_class_name(), cLastname::static_get_class_name()};
	vector < string > blocking_column_names(blocking_names, blocking_names + sizeof(blocking_names)/sizeof(string) );
	vector < unsigned int > blocking_column_data_indice ( blocking_column_names.size(), 0 );
	//blocking_column_data_indice.at(0) = 1;
	//blocking_column_data_indice.at(1) = 1;
	cBlocking_Operation_Multiple_Column_Manipulate blocker(pstring_oper, blocking_column_names, blocking_column_data_indice);

	operator_truncate_firstname.set_truncater(0, 0, true);
	operator_truncate_middlename.set_truncater(0, 0, false);
	operator_truncate_lastname.set_truncater(0, 0, true);

	match.reset_blocking( blocker , last_disambig_result );

	blocker_coauthor.build_uid2uinv_tree(match);
#endif
	cCluster_Set cs;
	//cs.convert_from_ClusterInfo(&match);
	cs.read_from_file(last_disambig_result, uid_dict);
	map < const cRecord *, const cRecord *> uid2uinv;
	const list < cCluster > & full_list = cs.get_set();
	for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t )
		t->add_uid2uinv(uid2uinv);
	const char * suffix = ".pplog";
	const string logfile = string(outputfile) + suffix ;
	//post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile);
	post_polish( cs, uid2uinv, patent_tree, logfile);
	cs.output_results(outputfile);
}
// Tallies, for consecutive chunks of years, how many clusters yield a non-NULL
// result from ones_temporal_unique_coauthors and how many coauthor entries
// those results carry in total, then prints one table row per chunk to
// std::cout.
//
// Parameters:
//   starting_year  - first year of the first chunk.
//   interval       - chunk width; each chunk is queried as [y, y + interval)
//                    or similar (exact bound semantics live in
//                    ones_temporal_unique_coauthors). Must be non-zero.
//   wholedatabase  - path passed to fetch_records_from_txt.
//   disambigresult - path passed to cCluster_Set::read_from_file.
//   outputfile     - NOTE(review): not used anywhere in this body; the table is
//                    written to std::cout only.
//
// Returns 0 on success, 1 if interval is zero or the records cannot be fetched.
int unique_inventors_per_period ( unsigned int starting_year, unsigned int interval, const char * wholedatabase, const char * disambigresult, const char * outputfile) {
	typedef std::pair< const cRecord *, set < const cRecord *> > cUINV2UCOAUTHOR;

	// The year loop below advances by `interval`; zero would never terminate.
	if ( interval == 0 ) {
		std::cerr << "unique_inventors_per_period: interval must be greater than 0." << std::endl;
		return 1;
	}

	list <cRecord> all_records;
	const string columns[] = {"Unique_Record_ID", "Patent",  "ApplyYear"};
	const vector <string> column_vec(columns, columns + sizeof(columns)/sizeof(string) );

	bool is_success = fetch_records_from_txt(all_records, wholedatabase, column_vec);
	if (not is_success) return 1;

	// NOTE(review): all_rec_pointers, manobj and tempblocker are not consumed
	// by any statement visible in this function. They look like inputs for the
	// `ci` / `bocobj` objects referenced further down - confirm before removing.
	list < const cRecord *> all_rec_pointers;
	for ( list<cRecord>::const_iterator p = all_records.begin(); p != all_records.end(); ++p )
		all_rec_pointers.push_back(&(*p));

	cString_Remain_Same manobj;
	cBlocking_Operation_Column_Manipulate tempblocker (manobj, "ApplyYear");

	// Lookup from a string key to its record; filled in place by the helper.
	map <string, const cRecord *> uid_dict;
	create_btree_uid2record_pointer(uid_dict, all_records, cUnique_Record_ID::static_get_class_name());

	// Load the clusters and let each one contribute to the uid2uinv map.
	cCluster_Set all_clusters;
	//all_clusters.convert_from_ClusterInfo(&ci);
	all_clusters.read_from_file(disambigresult, uid_dict);
	map < const cRecord *, const cRecord *> uid2uinv;
	const list < cCluster > & full_list = all_clusters.get_set();
	for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t )
		t->add_uid2uinv(uid2uinv);

	// NOTE(review): `bocobj` and `ci` are not declared in this function.
	// Unless they exist at a wider scope, their definitions are missing and
	// must be restored for this to compile.
	const map < const cRecord *, cGroup_Value, cSort_by_attrib > & patent_tree = bocobj.get_patent_tree();

	// First and last keys of the cluster map are taken as the year range;
	// presumably the map is keyed by application year - verify against `ci`.
	const string & beginyearstring = ci.get_cluster_map().begin()->first;
	const string & endyearstring = ci.get_cluster_map().rbegin()->first;
	const unsigned int endyear = atoi( endyearstring.c_str() );

	const unsigned int appyearindex = cRecord::get_index_by_name(cApplyYear::static_get_class_name());
	std::cout << "Begin year = "<<beginyearstring << " , End year = " << endyearstring << std::endl;

	// Both maps are keyed by the first year of each chunk.
	map < unsigned int, unsigned int > unique_coauthor_year_chunk;
	map < unsigned int, unsigned int > unique_inventor_year_chunk;
	for ( unsigned int y = starting_year; y <= endyear; y += interval ) {
		unsigned int unique_inventors = 0;
		unsigned int unique_coauthors = 0;
		for ( list< cCluster >::const_iterator puinv = full_list.begin() ; puinv != full_list.end(); ++puinv) {
			const cUINV2UCOAUTHOR kk =
					ones_temporal_unique_coauthors ( *puinv, uid2uinv, patent_tree, y, y + interval, appyearindex );
			// A NULL first member means this cluster is not counted for the chunk.
			if ( kk.first != NULL ) {
				++ unique_inventors;
				unique_coauthors += kk.second.size();
			}
		}
		unique_coauthor_year_chunk.insert(std::pair<unsigned int, unsigned int>(y, unique_coauthors));
		unique_inventor_year_chunk.insert(std::pair<unsigned int, unsigned int>(y, unique_inventors));
		std::cout << "Year " << y << " done." << std::endl;
	}

	// Print the table; every key in the inventor map was also inserted into the
	// coauthor map above, so the find() below always succeeds.
	std::ostream & os = std::cout;
	const string space_delim = "          ";
	os << "Year Chunk:" << space_delim << "Number of Unique Inventors:" << space_delim << "Number of Unique Coauthors:" << std::endl;
	for ( map < unsigned int, unsigned int >::const_iterator p = unique_inventor_year_chunk.begin(); p != unique_inventor_year_chunk.end(); ++p ) {
		os << p->first << space_delim << p->second << space_delim << unique_coauthor_year_chunk.find(p->first)->second << std::endl;
	}

	return 0;
}