// Reads per-individual BLUP values from a text file into _varcmp_Py.
//
// Each usable row is: <family ID> <individual ID> <value columns...>.
// The two IDs are joined as "FID:IID". Rows without any value column are
// skipped. _keep is then narrowed via update_id_map_kp() to the IDs found in
// the file, and for every retained individual the values at even offsets
// (0, 2, 4, ...) of the value columns are stored in consecutive columns of
// _varcmp_Py (one row per entry of _keep).
//
// Throws a std::string message if the file cannot be opened.
void gcta::read_indi_blup(string blup_indi_file)
{
    ifstream i_indi_blup(blup_indi_file.c_str());
    if(!i_indi_blup) throw("Error: can not open the file ["+blup_indi_file+"] to read.");

    vector<string> id, vs_buf;
    vector< vector<string> > g_buf;
    string fid_buf, pid_buf, line_buf;

    // Number of value columns that every accepted row is guaranteed to have.
    // It is the smallest count seen among accepted rows, so the indexed reads
    // below can never run past the end of a row, and a trailing row without
    // values can no longer reset the count to zero.
    int col_num=0;
    bool col_num_set=false;

    // Stop as soon as either ID or the rest of the line cannot be read, so a
    // truncated final record is dropped instead of being built from stale
    // buffer contents.
    while((i_indi_blup>>fid_buf>>pid_buf) && getline(i_indi_blup, line_buf)){
        const int val_num=StrFunc::split_string(line_buf, vs_buf);
        if(val_num<1) continue;
        if(!col_num_set || val_num<col_num){
            col_num=val_num;
            col_num_set=true;
        }
        id.push_back(fid_buf+":"+pid_buf);
        g_buf.push_back(vs_buf);
    }
    i_indi_blup.close();

    update_id_map_kp(id, _id_map, _keep);

    // Map "FID:IID" -> row index within the retained individuals.
    map<string, int> uni_id_map;
    const int keep_num=static_cast<int>(_keep.size());
    for(int i=0; i<keep_num; i++){
        uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i));
    }

    // Only complete pairs of columns contribute an output column; bounding the
    // copy loop by comp_num keeps every write inside _varcmp_Py even when the
    // number of value columns is odd.
    const int comp_num=col_num/2;
    _varcmp_Py.setZero(_keep.size(), comp_num);
    const int row_num=static_cast<int>(id.size());
    for(int i=0; i<row_num; i++){
        map<string, int>::iterator iter=uni_id_map.find(id[i]);
        if(iter==uni_id_map.end()) continue;
        for(int k=0; k<comp_num; k++){
            _varcmp_Py(iter->second, k)=atof(g_buf[i][2*k].c_str());
        }
    }

    cout<<"BLUP solution to the total genetic effects for "<<_keep.size()<<" individuals have been read from ["+blup_indi_file+"]."<<endl;
}
// Narrows the SNP inclusion list (_include) to the single SNP `snpname` by
// passing a one-element name list to update_id_map_kp().
//
// Throws a std::string message if _include is empty afterwards; otherwise
// reports the selection on stdout.
void gcta::extract_single_snp(string snpname)
{
    vector<string> single_snp(1, snpname);
    update_id_map_kp(single_snp, _snp_name_map, _include);

    if(_include.empty()){
        throw("Error: can not find the SNP ["+snpname+"] in the data.");
    }
    cout<<"Only the SNP ["+snpname+"] is included in the analysis."<<endl;
}
void gcta::read_imp_dose_mach(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) { if(_include.size()==0) throw("Error: No SNP is retained for analysis."); int i=0, j=0, k=0, line=0; vector<int> rsnp; get_rsnp(rsnp); const int MAX_LINE_LENGTH = 10000000; char buf[MAX_LINE_LENGTH]; gzifstream zinf; zinf.open(zdosefile.c_str()); if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read."); vector<string> indi_ls; map<string, int> kp_id_map, blup_id_map, rm_id_map; bool kp_indi_flag=!kp_indi_file.empty(), blup_indi_flag=!blup_indi_file.empty(), rm_indi_flag=!rm_indi_file.empty(); if(kp_indi_flag) read_indi_list(kp_indi_file, indi_ls); for(i=0; i<indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i)); if(blup_indi_flag) read_indi_list(blup_indi_file, indi_ls); for(i=0; i<indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i)); if(rm_indi_flag) read_indi_list(rm_indi_file, indi_ls); for(i=0; i<indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i)); bool missing=false; string str_buf, id_buf, err_msg="Error: reading dosage data failed. 
Are the map file and the dosage file matched?"; double f_buf=0.0; vector<string> kept_id, vs_buf; cout<<"Reading dosage data from ["+zdosefile+"] in individual-major format (Note: may use huge RAM)."<<endl; _fid.clear(); _pid.clear(); _geno_dose.clear(); vector<int> kp_it; while(1){ bool kp_flag=true; zinf.getline(buf, MAX_LINE_LENGTH, '\n'); stringstream ss(buf); if(!(ss>>str_buf)) break; int ibuf=StrFunc::split_string(str_buf, vs_buf, ">"); if(ibuf>1){ if(vs_buf[0].empty()) throw("Error: family ID of the individual ["+str_buf+"] is missing."); else vs_buf[0].erase(vs_buf[0].end()-1); } else if(ibuf==1) vs_buf.push_back(vs_buf[0]); else break; id_buf=vs_buf[0]+":"+vs_buf[1]; if(kp_indi_flag && kp_id_map.find(id_buf)==kp_id_map.end()) kp_flag=false; if(kp_flag && blup_indi_flag && blup_id_map.find(id_buf)==blup_id_map.end()) kp_flag=false; if(kp_flag && rm_indi_flag && rm_id_map.find(id_buf)!=rm_id_map.end()) kp_flag=false; if(kp_flag){ kp_it.push_back(1); _fid.push_back(vs_buf[0]); _pid.push_back(vs_buf[1]); kept_id.push_back(id_buf); } else kp_it.push_back(0); if(zinf.fail() || !zinf.good()) break; } zinf.clear(); zinf.close(); cout<<"(Imputed dosage data for "<<kp_it.size()<<" individuals detected)."<<endl; _indi_num=_fid.size(); zinf.open(zdosefile.c_str()); _geno_dose.resize(_indi_num); for(line=0; line<_indi_num; line++) _geno_dose[line].resize(_include.size()); for(line=0, k=0; line<kp_it.size(); line++){ zinf.getline(buf, MAX_LINE_LENGTH, '\n'); if(kp_it[line]==0) continue; stringstream ss(buf); if(!(ss>>str_buf)) break; if(!(ss>>str_buf)) break; for(i=0, j=0; i<_snp_num; i++){ ss>>str_buf; f_buf=atof(str_buf.c_str()); if(str_buf=="X" || str_buf=="NA"){ if(!missing){ cout<<"Warning: missing values detected in the dosage data."<<endl; missing=true; } f_buf=1e6; } if(rsnp[i]){ _geno_dose[k][j]=(f_buf); j++; } } k++; } zinf.clear(); zinf.close(); cout<<"Imputed dosage data for "<<kept_id.size()<<" individuals are included from ["<<zdosefile<<"]."<<endl; 
_fa_id.resize(_indi_num); _mo_id.resize(_indi_num); _sex.resize(_indi_num); _pheno.resize(_indi_num); for(i=0; i<_indi_num; i++){ _fa_id[i]=_mo_id[i]="0"; _sex[i]=-9; _pheno[i]=-9; } // initialize keep init_keep(); update_id_map_kp(kept_id, _id_map, _keep); if(_keep.size()==0) throw("Error: No individual is retained for analysis."); if(blup_indi_flag) read_indi_blup(blup_indi_file); // update data update_bim(rsnp); }
void gcta::fit_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string qGE_file, string GE_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, bool no_constrain, bool mlmassoc) { _reml_mtd=reml_mtd; _reml_max_iter=MaxIter; int i=0, j=0, k=0; bool grm_flag=(!grm_file.empty()); bool qcovar_flag=(!qcovar_file.empty()); bool covar_flag=(!covar_file.empty()); bool GE_flag=(!GE_file.empty()); bool qGE_flag=(!qGE_file.empty()); if(m_grm_flag) grm_flag=false; // Read data stringstream errmsg; int qcovar_num=0, covar_num=0, qE_fac_num=0, E_fac_num=0; vector<string> phen_ID, phen_buf, qcovar_ID, covar_ID, qGE_ID, GE_ID, grm_id, grm_files; vector< vector<string> > qcovar, covar, GE, qGE; // save individuals by column if(grm_flag){ read_grm_gz(grm_file, grm_id); update_id_map_kp(grm_id, _id_map, _keep); grm_files.push_back(grm_file); } else if(m_grm_flag){ read_grm_filenames(grm_file, grm_files, false); for(i=0; i<grm_files.size(); i++){ read_grm_gz(grm_files[i], grm_id, false, true); update_id_map_kp(grm_id, _id_map, _keep); } } read_phen(phen_file, phen_ID, phen_buf, mphen); update_id_map_kp(phen_ID, _id_map, _keep); if(qcovar_flag){ qcovar_num=read_covar(qcovar_file, qcovar_ID, qcovar, true); update_id_map_kp(qcovar_ID, _id_map, _keep); } if(covar_flag){ covar_num=read_covar(covar_file, covar_ID, covar, false); update_id_map_kp(covar_ID, _id_map, _keep); } if(qGE_flag){ qE_fac_num=read_GE(qGE_file, qGE_ID, qGE, true); update_id_map_kp(qGE_ID, _id_map, _keep); } if(GE_flag){ E_fac_num=read_GE(GE_file, GE_ID, GE, false); update_id_map_kp(GE_ID, _id_map, _keep); } if(!mlmassoc){ if(!keep_indi_file.empty()) keep_indi(keep_indi_file); if(!remove_indi_file.empty()) 
remove_indi(remove_indi_file); } if(grm_flag){ if(grm_cutoff>-1.0) rm_cor_indi(grm_cutoff); if(!sex_file.empty()) update_sex(sex_file); if(adj_grm_fac>-1.0) adj_grm(adj_grm_fac); if(dosage_compen>-1) dc(dosage_compen); _grm_N.resize(1,1); } vector<string> uni_id; map<string, int> uni_id_map; map<string, int>::iterator iter; for(i=0; i<_keep.size(); i++){ uni_id.push_back(_fid[_keep[i]]+":"+_pid[_keep[i]]); uni_id_map.insert(pair<string,int>(_fid[_keep[i]]+":"+_pid[_keep[i]], i)); } _n=_keep.size(); if(_n<1) throw("Error: no individual is in common in the input files."); // construct model terms _y.setZero(_n); for(i=0; i<phen_ID.size(); i++){ iter=uni_id_map.find(phen_ID[i]); if(iter==uni_id_map.end()) continue; _y[iter->second]=atof(phen_buf[i].c_str()); } int pos=0; _r_indx.clear(); eigenMatrix A_N(_n, _n); vector<int> kp; if(grm_flag){ for(i=0; i<1+qE_fac_num+E_fac_num; i++) _r_indx.push_back(i); if(!no_lrt) drop_comp(drop); _A=eigenMatrix::Zero(_n, _r_indx.size()*_n); if(mlmassoc) StrFunc::match(uni_id, grm_id, kp); else kp=_keep; for(i=0; i<_n; i++){ for(j=0; j<=i; j++) (_A.block(0,0,_n,_n))(j,i)=(_A.block(0,0,_n,_n))(i,j)=_grm(kp[i],kp[j]); } pos++; _grm.resize(1,1); } else if(m_grm_flag){ if(!sex_file.empty()) update_sex(sex_file); for(i=0; i<(1+qE_fac_num+E_fac_num)*grm_files.size(); i++) _r_indx.push_back(i); if(!no_lrt) drop_comp(drop); _A=eigenMatrix::Zero(_n, _r_indx.size()*_n); string prev_file=grm_files[0]; vector<string> prev_grm_id(grm_id); cout<<"There are "<<grm_files.size()<<" GRM file names specified in the file ["+grm_file+"]."<<endl; for(i=0; i<grm_files.size(); i++, pos++){ cout<<"Reading the GRM from the "<<i+1<<"th file ..."<<endl; read_grm_gz(grm_files[i], grm_id); if(adj_grm_fac>-1.0) adj_grm(adj_grm_fac); if(dosage_compen>-1) dc(dosage_compen); StrFunc::match(uni_id, grm_id, kp); int pos_n=pos*_n; for(j=0; j<_n; j++){ for(k=0; k<=j; k++){ if(kp[j]>=kp[k]) (_A.block(0,pos_n,_n,_n))(k,j)=(_A.block(0,pos_n,_n,_n))(j,k)=_grm(kp[j],kp[k]); 
else (_A.block(0,pos_n,_n,_n))(k,j)=(_A.block(0,pos_n,_n,_n))(j,k)=_grm(kp[k],kp[j]); } } prev_file=grm_files[i]; prev_grm_id=grm_id; } _grm_N.resize(1,1); _grm.resize(1,1); } // GE interaction vector<eigenMatrix> E_float(E_fac_num); eigenMatrix qE_float, mbuf; if(qGE_flag){ qE_float.resize(_n, qE_fac_num); for(i=0; i<qGE_ID.size(); i++){ iter=uni_id_map.find(qGE_ID[i]); if(iter==uni_id_map.end()) continue; for(j=0; j<qE_fac_num; j++) qE_float(iter->second,j)=atof(qGE[i][j].c_str()); } for(j=0; j<qE_fac_num; j++){ mbuf=((qE_float.block(0,j,_n,1))*(qE_float.block(0,j,_n,1)).transpose()); for(i=0; i<grm_files.size(); i++, pos++) (_A.block(0,pos*_n,_n,_n))=(_A.block(0,i*_n,_n,_n)).array()*mbuf.array(); } } if(GE_flag){ vector< vector<string> > E_str(E_fac_num); for(i=0; i<E_fac_num; i++) E_str[i].resize(_n); for(i=0; i<GE_ID.size(); i++){ iter=uni_id_map.find(GE_ID[i]); if(iter!=uni_id_map.end()){ for(j=0; j<E_fac_num; j++) E_str[j][iter->second]=GE[i][j]; } } for(j=0; j<E_fac_num; j++){ stringstream errmsg; errmsg<<"Error: too many classes for the "<<j+1<<"th environmental factor. 
\nPlease make sure you input a discrete variable as the environmental factor."; string errmsg1=errmsg.str(); errmsg.str(""); errmsg<<"Error: the "<<j+1<<"th envronmental factor has only one class."; string errmsg2=errmsg.str(); coeff_mat(E_str[j], E_float[j], errmsg1, errmsg2); mbuf=((E_float[j])*(E_float[j]).transpose()); for(i=0; i<grm_files.size(); i++, pos++) (_A.block(0,pos*_n,_n,_n))=(_A.block(0,i*_n,_n,_n)).array()*mbuf.array(); } } // construct X matrix construct_X(uni_id_map, qcovar_flag, qcovar_num, qcovar_ID, qcovar, covar_flag, covar_num, covar_ID, covar, E_float, qE_float); // names of variance component for(i=0; i<grm_files.size(); i++){ stringstream strstrm; if(grm_files.size()==1) strstrm<<""; else strstrm<<i+1; _var_name.push_back("V(G"+strstrm.str()+")"); _hsq_name.push_back("V(G"+strstrm.str()+")/Vp"); } for(j=0; j<qE_fac_num; j++){ for(i=0; i<grm_files.size(); i++){ stringstream strstrm1,strstrm2; if(grm_files.size()==1) strstrm1<<""; else strstrm1<<i+1; if(qE_fac_num==1) strstrm2<<""; else strstrm2<<j+1; _var_name.push_back("V(G"+strstrm1.str()+"xqE"+strstrm2.str()+")"); _hsq_name.push_back("V(G"+strstrm1.str()+"xqE"+strstrm2.str()+")"+"/Vp"); } } for(j=0; j<E_fac_num; j++){ for(i=0; i<grm_files.size(); i++){ stringstream strstrm1,strstrm2; if(grm_files.size()==1) strstrm1<<""; else strstrm1<<i+1; if(E_fac_num==1) strstrm2<<""; else strstrm2<<j+1; _var_name.push_back("V(G"+strstrm1.str()+"xE"+strstrm2.str()+")"); _hsq_name.push_back("V(G"+strstrm1.str()+"xE"+strstrm2.str()+")"+"/Vp"); } } _var_name.push_back("V(e)"); // run REML algorithm cout<<_n<<" individuals are in common in these files."<<endl; reml(pred_rand_eff, est_fix_eff, reml_priors, reml_priors_var, prevalence, no_constrain, no_lrt, mlmassoc); }