Esempio n. 1
0
// Reads per-individual values from `blup_indi_file` into _varcmp_Py.
//
// Each record is parsed as two whitespace-separated ID tokens (joined as
// "first:second") followed by the rest of the line, which is split into value
// columns. The IDs are intersected with the current sample through
// update_id_map_kp(), then every second value column (0, 2, 4, ...) of each
// matching record is converted with atof() and stored in _varcmp_Py, which has
// one row per kept individual and col_num/2 columns.
//
// Throws a std::string if the file cannot be opened.
void gcta::read_indi_blup(string blup_indi_file)
{
    ifstream i_indi_blup(blup_indi_file.c_str());
    if(!i_indi_blup) throw("Error: can not open the file ["+blup_indi_file+"] to read.");

    vector< vector<string> > g_buf; // value columns of every accepted record
    vector<string> id, vs_buf;
    string str_buf, id_buf;
    int i=0, j=0, k=0;
    int col_num=0; // value-column count of the most recent accepted record
    while(i_indi_blup){
        i_indi_blup>>str_buf;
        if(i_indi_blup.eof()) break;
        id_buf=str_buf+":";
        i_indi_blup>>str_buf;
        id_buf+=str_buf;
        getline(i_indi_blup, str_buf);
        int n_field=StrFunc::split_string(str_buf, vs_buf);
        // A record without value columns is skipped and must not reset the
        // column count, otherwise a trailing ID-only line would leave
        // _varcmp_Py with zero columns.
        if(n_field<1) continue;
        col_num=n_field;
        id.push_back(id_buf);
        g_buf.push_back(vs_buf);
    }
    i_indi_blup.close();

    update_id_map_kp(id, _id_map, _keep);
    map<string, int> uni_id_map;
    map<string, int>::iterator iter;
    for(i=0; i<_keep.size(); i++) uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i));

    // Only complete pairs of columns are stored, so an odd column count can
    // no longer write one column past the end of the matrix.
    const int n_comp=col_num/2;
    _varcmp_Py.setZero(_keep.size(), n_comp);
    for(i=0; i<id.size(); i++){
        iter=uni_id_map.find(id[i]);
        if(iter==uni_id_map.end()) continue;
        const vector<string> &row=g_buf[i];
        for(k=0; k<n_comp; k++){
            j=2*k;
            // Records shorter than the reference width keep the zeros written
            // by setZero() instead of reading past the end of the row.
            if(j>=static_cast<int>(row.size())) break;
            _varcmp_Py(iter->second,k)=atof(row[j].c_str());
        }
    }
    cout<<"BLUP solution to the total genetic effects for "<<_keep.size()<<" individuals have been read from ["+blup_indi_file+"]."<<endl;
}
Esempio n. 2
0
// Narrows the SNP inclusion list to the single SNP called `snpname`.
//
// A one-element name list is passed to update_id_map_kp() together with
// _snp_name_map and _include. If _include is empty afterwards a std::string
// error is thrown; otherwise a confirmation line is printed to stdout.
void gcta::extract_single_snp(string snpname)
{
    vector<string> wanted(1, snpname);
    update_id_map_kp(wanted, _snp_name_map, _include);

    if(!_include.empty()){
        cout<<"Only the SNP ["+snpname+"] is included in the analysis."<<endl;
        return;
    }
    throw("Error: can not find the SNP ["+snpname+"] in the data.");
}
Esempio n. 3
0
// Reads imputed dosage data from the gzip-compressed, individual-major file
// `zdosefile` into _geno_dose, and rebuilds the individual-level members
// (_fid, _pid, _fa_id, _mo_id, _sex, _pheno, _indi_num, _keep).
//
// The file is read twice:
//   1. the first pass looks only at the leading ID token of each line and
//      decides which individuals to keep;
//   2. the second pass reads the dosage values of the kept individuals.
//
// Parameters:
//   zdosefile      - dosage file; one individual per line.
//   kp_indi_file   - optional; if non-empty only individuals listed in it are kept.
//   rm_indi_file   - optional; if non-empty individuals listed in it are dropped.
//   blup_indi_file - optional; if non-empty only individuals listed in it are
//                    kept, and read_indi_blup() is called on it at the end.
//
// Missing dosages ("X" or "NA") are stored as 1e6.
//
// Throws a const char* or std::string message when no SNP is included, the
// file cannot be opened, a family ID is missing, a kept individual has fewer
// dosage values than _snp_num, or no individual is retained.
void gcta::read_imp_dose_mach(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file)
{
    if(_include.size()==0) throw("Error: No SNP is retained for analysis.");

    int i=0, j=0, k=0, line=0;
    vector<int> rsnp;
    get_rsnp(rsnp);

    // The line buffer is ~10 MB, which is larger than typical default thread
    // stacks, so it lives on the heap rather than in an automatic array.
    const int MAX_LINE_LENGTH = 10000000;
    vector<char> line_buf(MAX_LINE_LENGTH);
    char *buf=&line_buf[0];
    gzifstream zinf;
    zinf.open(zdosefile.c_str());
    if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read.");

    // Build one lookup table per optional ID list. indi_ls is cleared before
    // each read so the tables never mix IDs from different files, whatever
    // read_indi_list() does with existing content.
    vector<string> indi_ls;
    map<string, int> kp_id_map, blup_id_map, rm_id_map;
    bool kp_indi_flag=!kp_indi_file.empty(), blup_indi_flag=!blup_indi_file.empty(), rm_indi_flag=!rm_indi_file.empty();
    if(kp_indi_flag){
        indi_ls.clear();
        read_indi_list(kp_indi_file, indi_ls);
        for(i=0; i<indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i));
    }
    if(blup_indi_flag){
        indi_ls.clear();
        read_indi_list(blup_indi_file, indi_ls);
        for(i=0; i<indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i));
    }
    if(rm_indi_flag){
        indi_ls.clear();
        read_indi_list(rm_indi_file, indi_ls);
        for(i=0; i<indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i));
    }

    bool missing=false;
    string str_buf, id_buf, err_msg="Error: reading dosage data failed. Are the map file and the dosage file matched?";
    double f_buf=0.0;
    vector<string> kept_id, vs_buf;
    cout<<"Reading dosage data from ["+zdosefile+"] in individual-major format (Note: may use huge RAM)."<<endl;
    _fid.clear();
    _pid.clear();
    _geno_dose.clear();

    // Pass 1: one entry in kp_it per line, 1 = keep, 0 = skip.
    vector<int> kp_it;
    while(1){
        bool kp_flag=true;
        zinf.getline(buf, MAX_LINE_LENGTH, '\n');
        stringstream ss(buf);
        if(!(ss>>str_buf)) break;
        // The ID token is split on '>'. With more than one piece the last
        // character of the first piece is dropped (presumably the '-' of a
        // "FID->IID" separator - TODO confirm against the file format).
        // With a single piece the same token serves as both IDs.
        int ibuf=StrFunc::split_string(str_buf, vs_buf, ">");
        if(ibuf>1){
            if(vs_buf[0].empty()) throw("Error: family ID of the individual ["+str_buf+"] is missing.");
            else vs_buf[0].erase(vs_buf[0].end()-1);
        }
        else if(ibuf==1) vs_buf.push_back(vs_buf[0]);
        else break;
        id_buf=vs_buf[0]+":"+vs_buf[1];
        if(kp_indi_flag && kp_id_map.find(id_buf)==kp_id_map.end()) kp_flag=false;
        if(kp_flag && blup_indi_flag && blup_id_map.find(id_buf)==blup_id_map.end()) kp_flag=false;
        if(kp_flag && rm_indi_flag && rm_id_map.find(id_buf)!=rm_id_map.end()) kp_flag=false;
        if(kp_flag){
            kp_it.push_back(1);
            _fid.push_back(vs_buf[0]);
            _pid.push_back(vs_buf[1]);
            kept_id.push_back(id_buf);
        }
        else kp_it.push_back(0);
        if(zinf.fail() || !zinf.good()) break;
    }
    zinf.clear();
    zinf.close();
    cout<<"(Imputed dosage data for "<<kp_it.size()<<" individuals detected)."<<endl;
    _indi_num=_fid.size();

    // Pass 2: re-read the file and store the dosages of the kept individuals.
    zinf.open(zdosefile.c_str());
    _geno_dose.resize(_indi_num);
    for(line=0; line<_indi_num; line++) _geno_dose[line].resize(_include.size());
    for(line=0, k=0; line<kp_it.size(); line++){
        zinf.getline(buf, MAX_LINE_LENGTH, '\n');
        if(kp_it[line]==0) continue;
        stringstream ss(buf);
        // Skip the two leading non-dosage tokens of the line.
        if(!(ss>>str_buf)) break;
        if(!(ss>>str_buf)) break;
        for(i=0, j=0; i<_snp_num; i++){
            // A failed extraction leaves str_buf unchanged, which would
            // silently repeat the previous dosage; report the mismatch instead.
            if(!(ss>>str_buf)) throw(err_msg);
            f_buf=atof(str_buf.c_str());
            if(str_buf=="X" || str_buf=="NA"){
                if(!missing){
                    cout<<"Warning: missing values detected in the dosage data."<<endl;
                    missing=true;
                }
                f_buf=1e6;
            }
            // Only SNPs flagged in rsnp are stored; j indexes the stored ones.
            if(rsnp[i]){ _geno_dose[k][j]=(f_buf); j++; }
        }
        k++;
    }
    zinf.clear();
    zinf.close();

    cout<<"Imputed dosage data for "<<kept_id.size()<<" individuals are included from ["<<zdosefile<<"]."<<endl;
    // The dosage file carries no pedigree, sex or phenotype information, so
    // these members are filled with placeholder values.
    _fa_id.resize(_indi_num);
    _mo_id.resize(_indi_num);
    _sex.resize(_indi_num);
    _pheno.resize(_indi_num);
    for(i=0; i<_indi_num; i++){
        _fa_id[i]=_mo_id[i]="0";
        _sex[i]=-9;
        _pheno[i]=-9;
    }

    // initialize keep
    init_keep();
    update_id_map_kp(kept_id, _id_map, _keep);
    if(_keep.size()==0) throw("Error: No individual is retained for analysis.");

    if(blup_indi_flag) read_indi_blup(blup_indi_file);

    // update data
    update_bim(rsnp);
}
Esempio n. 4
0
// Prepares all inputs for a REML analysis and then runs reml().
//
// Steps performed here:
//   1. read the GRM(s), phenotype, covariates and G-by-E factors, narrowing
//      _keep to the individuals present in each input via update_id_map_kp();
//   2. build the phenotype vector _y and the matrix _A, which holds one
//      _n x _n block per random-effect component side by side;
//   3. append G-by-E blocks to _A as element-wise products of each GRM block
//      with the outer product of the environmental factor;
//   4. build the fixed-effect design via construct_X() and the component
//      names in _var_name / _hsq_name;
//   5. call reml().
//
// Selected parameters (as used in this function):
//   grm_file        - a single GRM file, or, when m_grm_flag is true, a file
//                     listing several GRM files.
//   mphen           - forwarded to read_phen().
//   grm_cutoff      - if > -1.0, rm_cor_indi() is called (single-GRM mode only).
//   adj_grm_fac     - if > -1.0, adj_grm() is called on each GRM.
//   dosage_compen   - if > -1, dc() is called on each GRM.
//   drop            - forwarded to drop_comp() unless no_lrt is set.
//   mlmassoc        - when true, keep/remove lists are not applied here and GRM
//                     rows are located with StrFunc::match() instead of _keep.
//   remaining flags - forwarded unchanged to reml().
//
// Throws a const char* message if no individual is common to all inputs.
void gcta::fit_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string qGE_file, string GE_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, bool no_constrain, bool mlmassoc)
{
    _reml_mtd=reml_mtd;
    _reml_max_iter=MaxIter;
    int i=0, j=0, k=0;
    bool grm_flag=(!grm_file.empty());
    bool qcovar_flag=(!qcovar_file.empty());
    bool covar_flag=(!covar_file.empty());
    bool GE_flag=(!GE_file.empty());
    bool qGE_flag=(!qGE_file.empty());
    // Multi-GRM mode takes precedence: grm_file is then a list of GRM files.
    if(m_grm_flag) grm_flag=false;

    // Read data
    // NOTE(review): this errmsg is not used in this function; the one inside
    // the GE loop below shadows it.
    stringstream errmsg;
    int qcovar_num=0, covar_num=0, qE_fac_num=0, E_fac_num=0;
    vector<string> phen_ID, phen_buf, qcovar_ID, covar_ID, qGE_ID, GE_ID, grm_id, grm_files;
    vector< vector<string> > qcovar, covar, GE, qGE; // save individuals by column

    if(grm_flag){
        read_grm_gz(grm_file, grm_id);
        update_id_map_kp(grm_id, _id_map, _keep);
        grm_files.push_back(grm_file);
    }
    else if(m_grm_flag){
        read_grm_filenames(grm_file, grm_files, false);
        // First sweep over the GRM files only narrows _keep to the IDs they share.
        // NOTE(review): the extra (false, true) arguments presumably request an
        // ID-only / quiet read - confirm against read_grm_gz().
        for(i=0; i<grm_files.size(); i++){
            read_grm_gz(grm_files[i], grm_id, false, true);
            update_id_map_kp(grm_id, _id_map, _keep);
        }
    }
    read_phen(phen_file, phen_ID, phen_buf, mphen);
    update_id_map_kp(phen_ID, _id_map, _keep);
    if(qcovar_flag){
        qcovar_num=read_covar(qcovar_file, qcovar_ID, qcovar, true);
        update_id_map_kp(qcovar_ID, _id_map, _keep);
    }
    if(covar_flag){
        covar_num=read_covar(covar_file, covar_ID, covar, false);
        update_id_map_kp(covar_ID, _id_map, _keep);
    }
    if(qGE_flag){
        qE_fac_num=read_GE(qGE_file, qGE_ID, qGE, true);
        update_id_map_kp(qGE_ID, _id_map, _keep);
    }
    if(GE_flag){
        E_fac_num=read_GE(GE_file, GE_ID, GE, false);
        update_id_map_kp(GE_ID, _id_map, _keep);
    }
    if(!mlmassoc){
        if(!keep_indi_file.empty()) keep_indi(keep_indi_file);
        if(!remove_indi_file.empty()) remove_indi(remove_indi_file);       
    }
    if(grm_flag){
        if(grm_cutoff>-1.0) rm_cor_indi(grm_cutoff);
        if(!sex_file.empty()) update_sex(sex_file);
        if(adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
        if(dosage_compen>-1) dc(dosage_compen);
        // _grm_N is not needed beyond this point; shrink it to 1x1.
        _grm_N.resize(1,1);
    }

    // Map "FID:IID" of each kept individual to its row index in the model.
    vector<string> uni_id;
	map<string, int> uni_id_map;
    map<string, int>::iterator iter;
	for(i=0; i<_keep.size(); i++){
	    uni_id.push_back(_fid[_keep[i]]+":"+_pid[_keep[i]]);
	    uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i));
	}
    _n=_keep.size();
    if(_n<1) throw("Error: no individual is in common in the input files.");

    // construct model terms
    _y.setZero(_n);
    for(i=0; i<phen_ID.size(); i++){
        iter=uni_id_map.find(phen_ID[i]);
        if(iter==uni_id_map.end()) continue;
        _y[iter->second]=atof(phen_buf[i].c_str());
    }

    // pos counts the _n x _n blocks of _A that have been filled so far.
    int pos=0;
    _r_indx.clear();
    eigenMatrix A_N(_n, _n);
    vector<int> kp;
    if(grm_flag){
        // One component for the GRM plus one per G-by-E factor.
        for(i=0; i<1+qE_fac_num+E_fac_num; i++) _r_indx.push_back(i);
        if(!no_lrt) drop_comp(drop);
        _A=eigenMatrix::Zero(_n, _r_indx.size()*_n);
        if(mlmassoc) StrFunc::match(uni_id, grm_id, kp);
        else kp=_keep;
        // Copy the kept sub-matrix of _grm into block 0 of _A, mirroring each
        // (i,j) entry into (j,i).
        for(i=0; i<_n; i++){
            for(j=0; j<=i; j++) (_A.block(0,0,_n,_n))(j,i)=(_A.block(0,0,_n,_n))(i,j)=_grm(kp[i],kp[j]);
        }
        pos++;
        // _grm has been copied; shrink it to 1x1.
        _grm.resize(1,1);
    }
    else if(m_grm_flag){
        if(!sex_file.empty()) update_sex(sex_file);
        for(i=0; i<(1+qE_fac_num+E_fac_num)*grm_files.size(); i++) _r_indx.push_back(i);
        if(!no_lrt) drop_comp(drop);
        _A=eigenMatrix::Zero(_n, _r_indx.size()*_n);
        // NOTE(review): prev_file and prev_grm_id are assigned but never read
        // in this function.
        string prev_file=grm_files[0];
        vector<string> prev_grm_id(grm_id);
        cout<<"There are "<<grm_files.size()<<" GRM file names specified in the file ["+grm_file+"]."<<endl;
        // Second sweep: load each GRM in turn and copy it into block `pos` of _A.
        for(i=0; i<grm_files.size(); i++, pos++){
            cout<<"Reading the GRM from the "<<i+1<<"th file ..."<<endl;
            read_grm_gz(grm_files[i], grm_id);
            if(adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
            if(dosage_compen>-1) dc(dosage_compen);
            StrFunc::match(uni_id, grm_id, kp);
            int pos_n=pos*_n;
            for(j=0; j<_n; j++){
                for(k=0; k<=j; k++){
                    // _grm is always indexed with the larger index first
                    // (presumably only its lower triangle is filled - TODO confirm).
                    if(kp[j]>=kp[k]) (_A.block(0,pos_n,_n,_n))(k,j)=(_A.block(0,pos_n,_n,_n))(j,k)=_grm(kp[j],kp[k]);
                    else (_A.block(0,pos_n,_n,_n))(k,j)=(_A.block(0,pos_n,_n,_n))(j,k)=_grm(kp[k],kp[j]);
                }
            }
            prev_file=grm_files[i];
            prev_grm_id=grm_id;
       }
        _grm_N.resize(1,1);
        _grm.resize(1,1);
    }

    // GE interaction
    vector<eigenMatrix> E_float(E_fac_num);
    eigenMatrix qE_float, mbuf;
    if(qGE_flag){
        qE_float.resize(_n, qE_fac_num);
        for(i=0; i<qGE_ID.size(); i++){
            iter=uni_id_map.find(qGE_ID[i]);
            if(iter==uni_id_map.end()) continue;
            for(j=0; j<qE_fac_num; j++) qE_float(iter->second,j)=atof(qGE[i][j].c_str());
        }
        for(j=0; j<qE_fac_num; j++){
            // Outer product of factor column j with itself.
            mbuf=((qE_float.block(0,j,_n,1))*(qE_float.block(0,j,_n,1)).transpose());
            // New block = GRM block i multiplied element-wise by mbuf.
            for(i=0; i<grm_files.size(); i++, pos++) (_A.block(0,pos*_n,_n,_n))=(_A.block(0,i*_n,_n,_n)).array()*mbuf.array();
        }
    }
    if(GE_flag){
        // Re-order the factor values by model row: E_str[factor][individual].
        vector< vector<string> > E_str(E_fac_num);
        for(i=0; i<E_fac_num; i++) E_str[i].resize(_n);
        for(i=0; i<GE_ID.size(); i++){
            iter=uni_id_map.find(GE_ID[i]);
            if(iter!=uni_id_map.end()){
                for(j=0; j<E_fac_num; j++) E_str[j][iter->second]=GE[i][j];
            }
        }
        for(j=0; j<E_fac_num; j++){
            // Two error messages are prepared up front and handed to coeff_mat().
            stringstream errmsg;
            errmsg<<"Error: too many classes for the "<<j+1<<"th environmental factor. \nPlease make sure you input a discrete variable as the environmental factor.";
            string errmsg1=errmsg.str();
            errmsg.str("");
            errmsg<<"Error: the "<<j+1<<"th envronmental factor has only one class.";
            string errmsg2=errmsg.str();
            coeff_mat(E_str[j], E_float[j], errmsg1, errmsg2);
            mbuf=((E_float[j])*(E_float[j]).transpose());
            // New block = GRM block i multiplied element-wise by mbuf.
            for(i=0; i<grm_files.size(); i++, pos++) (_A.block(0,pos*_n,_n,_n))=(_A.block(0,i*_n,_n,_n)).array()*mbuf.array();
        }
    }

    // construct X matrix
    construct_X(uni_id_map, qcovar_flag, qcovar_num, qcovar_ID, qcovar, covar_flag, covar_num, covar_ID, covar, E_float, qE_float);

    // names of variance component
    // Numeric suffixes are added only when there is more than one GRM or
    // more than one factor of the given kind. The naming order (GRMs, then
    // GxqE, then GxE) mirrors the order in which the blocks of _A were filled.
    for(i=0; i<grm_files.size(); i++){
        stringstream strstrm;
        if(grm_files.size()==1) strstrm<<"";
        else strstrm<<i+1;
        _var_name.push_back("V(G"+strstrm.str()+")");
        _hsq_name.push_back("V(G"+strstrm.str()+")/Vp");
    }
    for(j=0; j<qE_fac_num; j++){
        for(i=0; i<grm_files.size(); i++){
            stringstream strstrm1,strstrm2;
            if(grm_files.size()==1) strstrm1<<"";
            else  strstrm1<<i+1;
            if(qE_fac_num==1) strstrm2<<"";
            else strstrm2<<j+1;
            _var_name.push_back("V(G"+strstrm1.str()+"xqE"+strstrm2.str()+")");
            _hsq_name.push_back("V(G"+strstrm1.str()+"xqE"+strstrm2.str()+")"+"/Vp");
        }
    }
    for(j=0; j<E_fac_num; j++){
        for(i=0; i<grm_files.size(); i++){
            stringstream strstrm1,strstrm2;
            if(grm_files.size()==1) strstrm1<<"";
            else  strstrm1<<i+1;
            if(E_fac_num==1) strstrm2<<"";
            else strstrm2<<j+1;
            _var_name.push_back("V(G"+strstrm1.str()+"xE"+strstrm2.str()+")");
            _hsq_name.push_back("V(G"+strstrm1.str()+"xE"+strstrm2.str()+")"+"/Vp");
        }
    }
    // The residual component is named last and has no _hsq_name entry.
    _var_name.push_back("V(e)");

    // run REML algorithm
    cout<<_n<<" individuals are in common in these files."<<endl;
	reml(pred_rand_eff, est_fix_eff, reml_priors, reml_priors_var, prevalence, no_constrain, no_lrt, mlmassoc);
}