Ejemplo n.º 1
0
// Reads one trait from a whitespace-delimited phenotype file.
//
// Every row is parsed as: family ID, individual ID, then one value per trait.
// The number of traits is taken from the first row (return value of
// StrFunc::split_string minus the two ID columns); the first row is then read
// again as ordinary data.
//
// phen_file: path of the phenotype file.
// phen_ID:   output, cleared first; receives "FID:IID" for every individual
//            whose selected trait is neither "-9" nor "NA".
// phen_buf:  output, cleared first; receives the matching raw trait strings.
// mphen:     1-based index of the trait to keep.
//
// Throws a std::string message when the file cannot be opened, when no trait
// column is found, or when mphen exceeds the number of traits.
// If _id_map is empty, the individuals kept here also become _fid/_pid,
// _indi_num is updated and init_keep() is called.
void gcta::read_phen(string phen_file, vector<string> &phen_ID, vector<string> &phen_buf, int mphen)
{
    // Read phenotype data
    ifstream in_phen(phen_file.c_str());
    if(!in_phen) throw("Error: can not open the file ["+phen_file+"] to read.");

    int i=0;
    vector<string> fid, pid, vs_buf;
    string str_buf, fid_buf, pid_buf;
    phen_ID.clear();
    phen_buf.clear();
    cout<<"Reading phenotypes from ["+phen_file+"]."<<endl;

    // Look at the first row only to count the columns.
    getline(in_phen, str_buf);
    int phen_num=StrFunc::split_string(str_buf, vs_buf)-2;
    if(phen_num<=0) throw("Error: no phenotype data is found.");
    if(phen_num>1) cout<<"There are "<<phen_num<<" traits specified in the file ["+phen_file+"]."<<endl;
    if(mphen>phen_num){
        stringstream errmsg;
        errmsg<<"Error: can not find the "<<mphen<<"th trait in the file ["+phen_file+"].";
        throw(errmsg.str());
    }
    if(phen_num>1) cout<<"The "<<mphen<<"th trait is included for analysis."<<endl;

    // Rewind to the start of the file. The two-argument seekg is required:
    // seekg(ios::beg) would select the seekg(pos_type) overload and seek to
    // whatever integer value the seekdir enum happens to have. The state is
    // cleared first because getline sets eofbit on a one-line file that has
    // no trailing newline.
    in_phen.clear();
    in_phen.seekg(0, ios::beg);

    mphen--; // switch to a 0-based column index
    while(in_phen){
        in_phen>>fid_buf;
        if(in_phen.eof()) break;
        in_phen>>pid_buf;
        // All trait columns of the row are consumed; only the selected one is kept.
        for(i=0; i<phen_num; i++){
            in_phen>>str_buf;
            if(i==mphen && str_buf!="-9" && str_buf!="NA"){
                phen_ID.push_back(fid_buf+":"+pid_buf);
                fid.push_back(fid_buf);
                pid.push_back(pid_buf);
                phen_buf.push_back(str_buf);
            }
        }
    }
    in_phen.close();
    cout<<"Nonmissing phenotypes of "<<phen_buf.size()<<" individuals are included from ["+phen_file+"]."<<endl;

    // No individual list is known yet: adopt the one found in this file.
    if(_id_map.empty()){
        _fid=fid;
        _pid=pid;
        _indi_num=_fid.size();
        init_keep();
    }
}
Ejemplo n.º 2
0
// Loads a PLINK FAM file into the per-individual members.
//
// The file is consumed as a stream of whitespace-separated tokens, six per
// individual: family ID, individual ID, father ID, mother ID, sex, phenotype.
// The last two are converted with atoi().
//
// famfile: path of the FAM file.
//
// Throws a std::string message when the file cannot be opened.
// On success _fid, _pid, _fa_id, _mo_id, _sex and _pheno are replaced,
// _indi_num is set to the number of individuals and init_keep() is called.
void gcta::read_famfile(string famfile)
{
    ifstream fam_in(famfile.c_str());
    if(!fam_in) throw("Error: can not open the file ["+famfile+"] to read.");
    cout<<"Reading PLINK FAM file from ["+famfile+"]."<<endl;

    // Discard anything stored by an earlier call.
    _fid.clear();
    _pid.clear();
    _fa_id.clear();
    _mo_id.clear();
    _sex.clear();
    _pheno.clear();

    // One buffer is reused for every field of every record.
    string field;
    for(;;){
        if(fam_in.fail()) break;

        // Family ID; reaching end-of-file here terminates the loop.
        fam_in>>field;
        if(fam_in.eof()) break;
        _fid.push_back(field);

        // Individual ID
        fam_in>>field;
        _pid.push_back(field);

        // Father ID
        fam_in>>field;
        _fa_id.push_back(field);

        // Mother ID
        fam_in>>field;
        _mo_id.push_back(field);

        // Sex code
        fam_in>>field;
        const int sex_code=atoi(field.c_str());
        _sex.push_back(sex_code);

        // Phenotype code
        fam_in>>field;
        const int pheno_code=atoi(field.c_str());
        _pheno.push_back(pheno_code);
    }
    fam_in.clear();
    fam_in.close();

    _indi_num=_fid.size();
    cout<<_indi_num<<" individuals to be included from ["+famfile+"]."<<endl;

    // Initialize _keep
    init_keep();
}
Ejemplo n.º 3
0
// Reads imputed dosage scores from a gzip-readable file in SNP-major layout
// (announced to the user as "BEAGLE output").
//
// Header line: the first three fields are skipped; every remaining field is an
// individual ID, used for both _fid and _pid.
// Data lines: SNP name, two skipped fields, then one dosage per individual.
// The SNP name of every retained line must equal the corresponding entry of
// _snp_name.
//
// zdosefile:      path of the dosage file.
// kp_indi_file:   if non-empty, passed to keep_indi().
// rm_indi_file:   if non-empty, passed to remove_indi().
// blup_indi_file: if non-empty, passed to read_indi_blup().
//
// Throws a const char* message when no SNP is retained, and a std::string
// message when the file cannot be opened, when a SNP name does not match, or
// when the dosage file holds more SNP lines than get_rsnp() reported.
//
// NOTE(review): unlike read_imp_dose_mach, _fid is appended to without being
// cleared first - confirm callers guarantee it is empty on entry.
void gcta::read_imp_dose_beagle(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file)
{
    if(_include.size()==0) throw("Error: No SNP is retained for analysis.");
    int i=0, j=0;
    vector<int> rsnp;
    get_rsnp(rsnp);

    // The line buffer is heap-allocated: a 10 MB automatic array is larger
    // than common default stack limits.
    const int MAX_LINE_LENGTH=10000000;
    vector<char> line_buf(MAX_LINE_LENGTH);
    char *buf=&line_buf[0];
    string str_buf;

    gzifstream zinf;
    zinf.open( zdosefile.c_str() );
    if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read.");
    cout<<"Reading imputed dosage scores (BEAGLE output) ..."<<endl;

    // Header: skip three fields, the rest are the individual IDs.
    zinf.getline(buf, MAX_LINE_LENGTH, '\n');
    stringstream ss(buf);
    for(i=0; i<3; i++) ss>>str_buf;
    while(ss>>str_buf){ _fid.push_back(str_buf); }
    _pid=_fid;
    _indi_num=_fid.size();
    _fa_id.resize(_indi_num);
    _mo_id.resize(_indi_num);
    _sex.resize(_indi_num);
    _pheno.resize(_indi_num);
    cout<<_indi_num<<" individuals to be included from ["+zdosefile+"]."<<endl;
    init_keep();
    if(!kp_indi_file.empty()) keep_indi(kp_indi_file);
    if(!blup_indi_file.empty()) read_indi_blup(blup_indi_file);
    if(!rm_indi_file.empty()) remove_indi(rm_indi_file);

    // One row per kept individual, one column per included SNP.
    _geno_dose.resize(_keep.size());
    for(i=0; i<_keep.size(); i++) _geno_dose[i].resize(_include.size());

    vector<int> rindi;
    get_rindi(rindi);

    int line=0; // index of the current SNP line in the file
    int k=0;    // column of _geno_dose filled by the current retained SNP
    double d_buf=0.0;
    while(1){
        zinf.getline(buf, MAX_LINE_LENGTH, '\n');
        // Stop only when nothing could be read. eofbit on its own means the
        // last line had no trailing newline but was still read completely
        // (assuming gzifstream follows the std::istream state semantics).
        if(zinf.fail()) break;

        // Never index rsnp past its end; blank surplus lines are tolerated.
        if(line>=static_cast<int>(rsnp.size())){
            if(buf[0]=='\0') continue;
            throw(string("Error: the dosage file contains more SNPs than the summary file."));
        }

        if(!rsnp[line++]) continue;
        stringstream ss(buf);
        ss>>str_buf;
        if(str_buf!=_snp_name[line-1]){
            stringstream errmsg;
            errmsg<<"Error: the "<<line<<" th SNP ["+_snp_name[line-1]+"] in the summary file doesn't match to that in the dosage file."<<endl;
            throw(errmsg.str());
        }
        ss>>str_buf>>str_buf; // two fields between the SNP name and the dosages
        for(i=0, j=0; i<_indi_num; i++){
            ss>>d_buf;
            if(rindi[i]){ _geno_dose[j][k]=d_buf; j++; }
        }
        k++;
    }
    zinf.clear();
    zinf.close();
}
Ejemplo n.º 4
0
// Reads imputed dosage data from a gzip-readable file in individual-major
// layout (one line per individual).
//
// The file is read twice:
//   pass 1 - the first field of every line is turned into an "FID:IID" key
//            and checked against the optional keep / blup / remove lists;
//   pass 2 - for every retained line two leading fields are skipped and
//            _snp_num dosage values are read; values of SNPs flagged in the
//            vector filled by get_rsnp() are stored in _geno_dose. "X" and
//            "NA" are stored as 1e6 and reported once as missing.
//
// zdosefile:      path of the dosage file.
// kp_indi_file:   if non-empty, only individuals listed in it are retained.
// rm_indi_file:   if non-empty, individuals listed in it are dropped.
// blup_indi_file: if non-empty, only individuals listed in it are retained;
//                 it is also passed to read_indi_blup() at the end.
//
// Throws a const char* message when no SNP or no individual is retained, and
// a std::string message when the file cannot be opened, when a family ID is
// missing, or when a retained line holds fewer than _snp_num dosage values.
//
// On success _fid, _pid, _geno_dose, _fa_id, _mo_id, _sex, _pheno and
// _indi_num describe the retained individuals; father/mother IDs are "0",
// sex and phenotype are -9.
void gcta::read_imp_dose_mach(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file)
{
    if(_include.size()==0) throw("Error: No SNP is retained for analysis.");

    int i=0, j=0, k=0, line=0;
    vector<int> rsnp;
    get_rsnp(rsnp);

    // The line buffer is heap-allocated: a 10 MB automatic array is larger
    // than common default stack limits.
    const int MAX_LINE_LENGTH = 10000000;
    vector<char> line_buf(MAX_LINE_LENGTH);
    char *buf=&line_buf[0];
    gzifstream zinf;
    zinf.open(zdosefile.c_str());
    if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read.");

    // Build a lookup map for each list that was actually supplied.
    vector<string> indi_ls;
    map<string, int> kp_id_map, blup_id_map, rm_id_map;
    bool kp_indi_flag=!kp_indi_file.empty(), blup_indi_flag=!blup_indi_file.empty(), rm_indi_flag=!rm_indi_file.empty();
    if(kp_indi_flag){
        read_indi_list(kp_indi_file, indi_ls);
        for(i=0; i<indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i));
    }
    if(blup_indi_flag){
        read_indi_list(blup_indi_file, indi_ls);
        for(i=0; i<indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i));
    }
    if(rm_indi_flag){
        read_indi_list(rm_indi_file, indi_ls);
        for(i=0; i<indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i));
    }

    bool missing=false;
    string str_buf, id_buf, err_msg="Error: reading dosage data failed. Are the map file and the dosage file matched?";
    double f_buf=0.0;
    vector<string> kept_id, vs_buf;
    cout<<"Reading dosage data from ["+zdosefile+"] in individual-major format (Note: may use huge RAM)."<<endl;
    _fid.clear();
    _pid.clear();
    _geno_dose.clear();

    // Pass 1: decide for every line whether the individual is retained.
    vector<int> kp_it; // one flag per line of the file: 1 = retained, 0 = skipped
    while(1){
        bool kp_flag=true;
        zinf.getline(buf, MAX_LINE_LENGTH, '\n');
        stringstream ss(buf);
        if(!(ss>>str_buf)) break;
        // The ID field is split on '>'; with two parts the last character of
        // the family part is dropped (presumably the '-' of an "FID->IID"
        // separator). With a single part it serves as both family and
        // individual ID.
        int ibuf=StrFunc::split_string(str_buf, vs_buf, ">");
        if(ibuf>1){
            if(vs_buf[0].empty()) throw("Error: family ID of the individual ["+str_buf+"] is missing.");
            else vs_buf[0].erase(vs_buf[0].end()-1);
        }
        else if(ibuf==1) vs_buf.push_back(vs_buf[0]);
        else break;
        id_buf=vs_buf[0]+":"+vs_buf[1];
        if(kp_indi_flag && kp_id_map.find(id_buf)==kp_id_map.end()) kp_flag=false;
        if(kp_flag && blup_indi_flag && blup_id_map.find(id_buf)==blup_id_map.end()) kp_flag=false;
        if(kp_flag && rm_indi_flag && rm_id_map.find(id_buf)!=rm_id_map.end()) kp_flag=false;
        if(kp_flag){
            kp_it.push_back(1);
            _fid.push_back(vs_buf[0]);
            _pid.push_back(vs_buf[1]);
            kept_id.push_back(id_buf);
        }
        else kp_it.push_back(0);
        // Checked after processing so that a last line without a trailing
        // newline is still counted.
        if(zinf.fail() || !zinf.good()) break;
    }
    zinf.clear();
    zinf.close();
    cout<<"(Imputed dosage data for "<<kp_it.size()<<" individuals detected)."<<endl;
    _indi_num=_fid.size();

    // Pass 2: read the dosages of the retained individuals.
    zinf.open(zdosefile.c_str());
    if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read.");
    _geno_dose.resize(_indi_num);
    for(line=0; line<_indi_num; line++) _geno_dose[line].resize(_include.size());
    for(line=0, k=0; line<kp_it.size(); line++){
        zinf.getline(buf, MAX_LINE_LENGTH, '\n');
        if(kp_it[line]==0) continue;
        stringstream ss(buf);
        // Skip the ID field and the field that follows it.
        if(!(ss>>str_buf)) break;
        if(!(ss>>str_buf)) break;
        for(i=0, j=0; i<_snp_num; i++){
            // A line with fewer than _snp_num values would otherwise silently
            // reuse the previous token.
            if(!(ss>>str_buf)) throw(err_msg);
            if(str_buf=="X" || str_buf=="NA"){
                if(!missing){
                    cout<<"Warning: missing values detected in the dosage data."<<endl;
                    missing=true;
                }
                f_buf=1e6; // marker stored for a missing dosage
            }
            else f_buf=atof(str_buf.c_str());
            if(rsnp[i]){ _geno_dose[k][j]=(f_buf); j++; }
        }
        k++;
    }
    zinf.clear();
    zinf.close();

    cout<<"Imputed dosage data for "<<kept_id.size()<<" individuals are included from ["<<zdosefile<<"]."<<endl;
    _fa_id.resize(_indi_num);
    _mo_id.resize(_indi_num);
    _sex.resize(_indi_num);
    _pheno.resize(_indi_num);
    for(i=0; i<_indi_num; i++){
        _fa_id[i]=_mo_id[i]="0";
        _sex[i]=-9;
        _pheno[i]=-9;
    }

    // initialize keep
    init_keep();
    update_id_map_kp(kept_id, _id_map, _keep);
    if(_keep.size()==0) throw("Error: No individual is retained for analysis.");

    if(blup_indi_flag) read_indi_blup(blup_indi_file);

    // update data
    update_bim(rsnp);
}