Exemplo n.º 1
0
void
one_step_postprocess(const list < Record > & all_records,
                     const char * last_disambig_result,
                     const char * outputfile) {

    // TODO: document valid keys for this dictionary.
    map <string, const Record *> uid_dict;

    const string uid_identifier = cUnique_Record_ID::static_get_class_name();
    // uid_dict is probably the return value from create_btree_uid2record_pointer
    create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier);
    // instantiate a map
    map < const Record *, RecordPList, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name()));
    build_patent_tree(patent_tree , all_records);
    ClusterSet cs;
    //cs.convert_from_ClusterInfo(&match);
    // Read results from last disambiguation 
    cs.read_from_file(last_disambig_result, uid_dict);
    map < const Record *, const Record *> uid2uinv;
    const list < Cluster > & full_list = cs.get_set();

    for (list < Cluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t) {
        t->add_uid2uinv(uid2uinv);
    }

    const char * suffix = ".pplog";
    const string logfile = string(outputfile) + suffix ;
    //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile);
    post_polish(cs, uid2uinv, patent_tree, logfile);
    cs.output_results(outputfile);
}
Exemplo n.º 2
0
void 
one_step_prostprocess(const list < Record > & all_records, 
                      const char * last_disambig_result, 
                      const char * outputfile) {

    map <string, const Record *> uid_dict;
    const string uid_identifier = cUnique_Record_ID::static_get_class_name();
    create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier);
    map < const Record *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name()));
    build_patent_tree(  patent_tree , all_records );
    cCluster_Set cs;
    //cs.convert_from_ClusterInfo(&match);
    cs.read_from_file(last_disambig_result, uid_dict);
    map < const Record *, const Record *> uid2uinv;
    const list < cCluster > & full_list = cs.get_set();

    for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t )
        t->add_uid2uinv(uid2uinv);

    const char * suffix = ".pplog";
    const string logfile = string(outputfile) + suffix ;
    //post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile);
    post_polish( cs, uid2uinv, patent_tree, logfile);
    cs.output_results(outputfile);
}
void one_step_prostprocess( const list < cRecord > & all_records, const char * last_disambig_result, const char * outputfile) {
	map <string, const cRecord *> uid_dict;
	const string uid_identifier = cUnique_Record_ID::static_get_class_name();
	create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier);
	map < const cRecord *, cGroup_Value, cSort_by_attrib > patent_tree(cSort_by_attrib(cPatent::static_get_class_name()));
	build_patent_tree(  patent_tree , all_records );
#if 0
	list < const cRecord *> all_rec_pointers;
	for ( list<cRecord>::const_iterator p = all_records.begin(); p != all_records.end(); ++p )
		all_rec_pointers.push_back(&(*p));

	cCluster_Info match ( uid_dict, true, true, false);

	const unsigned int num_coauthors_to_group = 2;
	cBlocking_Operation_By_Coauthors blocker_coauthor( all_rec_pointers, num_coauthors_to_group );

	cString_NoSpace_Truncate operator_truncate_firstname;
	cString_NoSpace_Truncate operator_truncate_lastname;
	cString_NoSpace_Truncate operator_truncate_middlename;

	vector <const cString_Manipulator*> pstring_oper;
	pstring_oper.push_back(& operator_truncate_firstname);
	pstring_oper.push_back(& operator_truncate_middlename);
	pstring_oper.push_back(& operator_truncate_lastname);

	const string blocking_names[] = {cFirstname::static_get_class_name(), cMiddlename::static_get_class_name(), cLastname::static_get_class_name()};
	vector < string > blocking_column_names(blocking_names, blocking_names + sizeof(blocking_names)/sizeof(string) );
	vector < unsigned int > blocking_column_data_indice ( blocking_column_names.size(), 0 );
	//blocking_column_data_indice.at(0) = 1;
	//blocking_column_data_indice.at(1) = 1;
	cBlocking_Operation_Multiple_Column_Manipulate blocker(pstring_oper, blocking_column_names, blocking_column_data_indice);

	operator_truncate_firstname.set_truncater(0, 0, true);
	operator_truncate_middlename.set_truncater(0, 0, false);
	operator_truncate_lastname.set_truncater(0, 0, true);

	match.reset_blocking( blocker , last_disambig_result );

	blocker_coauthor.build_uid2uinv_tree(match);
#endif
	cCluster_Set cs;
	//cs.convert_from_ClusterInfo(&match);
	cs.read_from_file(last_disambig_result, uid_dict);
	map < const cRecord *, const cRecord *> uid2uinv;
	const list < cCluster > & full_list = cs.get_set();
	for ( list < cCluster >::const_iterator t = full_list.begin(); t != full_list.end(); ++t )
		t->add_uid2uinv(uid2uinv);
	const char * suffix = ".pplog";
	const string logfile = string(outputfile) + suffix ;
	//post_polish( cs, blocker_coauthor.get_uid2uinv_tree(), blocker_coauthor.get_patent_tree(), logfile);
	post_polish( cs, uid2uinv, patent_tree, logfile);
	cs.output_results(outputfile);
}
/**
 * Constructs a coauthor-based blocking operation.
 *
 * patent_tree is created with a cSort_by_attrib comparator built from
 * cPatent's class name, and num_coauthors is initialised from coauthors.
 * The body then calls too_many_coauthors(num_coauthors), fills the patent
 * tree from all_rec_pointers, and extends infoless once per coauthor.
 *
 * @param all_rec_pointers records passed on to build_patent_tree
 * @param coauthors        value stored in num_coauthors; also the number of
 *                         rounds in which infoless is extended
 *
 * NOTE(review): too_many_coauthors is presumably a sanity check on the
 * requested count -- confirm what it does when the count is too large.
 */
cBlocking_Operation_By_Coauthors::cBlocking_Operation_By_Coauthors(
    const RecordPList & all_rec_pointers,
    const uint32_t coauthors)
    : patent_tree(cSort_by_attrib(cPatent::static_get_class_name())),
      num_coauthors(coauthors) {

    too_many_coauthors(num_coauthors);
    build_patent_tree(all_rec_pointers);

    // For every coauthor, append `delim + delim` to infoless twice.
    // NOTE(review): infoless presumably stands for the blocking string of a
    // record that carries no coauthor information -- confirm.
    uint32_t appended = 0;
    while (appended < num_coauthors) {
        infoless += cBlocking_Operation::delim + cBlocking_Operation::delim;
        infoless += cBlocking_Operation::delim + cBlocking_Operation::delim;
        ++appended;
    }
}