void gcta::read_phen(string phen_file, vector<string> &phen_ID, vector<string> &phen_buf, int mphen) { // Read phenotype data ifstream in_phen(phen_file.c_str()); if(!in_phen) throw("Error: can not open the file ["+phen_file+"] to read."); int i=0; vector<string> fid, pid, vs_buf; string str_buf, fid_buf, pid_buf; phen_ID.clear(); phen_buf.clear(); cout<<"Reading phenotypes from ["+phen_file+"]."<<endl; getline(in_phen, str_buf); int phen_num=StrFunc::split_string(str_buf, vs_buf)-2; if(phen_num<=0) throw("Error: no phenotype data is found."); if(phen_num>1) cout<<"There are "<<phen_num<<" traits specified in the file ["+phen_file+"]."<<endl; if(mphen>phen_num){ stringstream errmsg; errmsg<<"Error: can not find the "<<mphen<<"th trait in the file ["+phen_file+"]."; throw(errmsg.str()); } if(phen_num>1) cout<<"The "<<mphen<<"th trait is included for analysis."<<endl; in_phen.seekg(ios::beg); mphen--; while(in_phen){ in_phen>>fid_buf; if(in_phen.eof()) break; in_phen>>pid_buf; for(i=0; i<phen_num; i++){ in_phen>>str_buf; if(i==mphen && str_buf!="-9" && str_buf!="NA"){ phen_ID.push_back(fid_buf+":"+pid_buf); fid.push_back(fid_buf); pid.push_back(pid_buf); phen_buf.push_back(str_buf); } } } in_phen.close(); cout<<"Nonmissing phenotypes of "<<phen_buf.size()<<" individuals are included from ["+phen_file+"]."<<endl; if(_id_map.empty()){ _fid=fid; _pid=pid; _indi_num=_fid.size(); init_keep(); } }
void gcta::read_famfile(string famfile) { ifstream Fam(famfile.c_str()); if(!Fam) throw("Error: can not open the file ["+famfile+"] to read."); cout<<"Reading PLINK FAM file from ["+famfile+"]."<<endl; int i=0; string str_buf; _fid.clear(); _pid.clear(); _fa_id.clear(); _mo_id.clear(); _sex.clear(); _pheno.clear(); while(Fam){ Fam>>str_buf; if(Fam.eof()) break; _fid.push_back(str_buf); Fam>>str_buf; _pid.push_back(str_buf); Fam>>str_buf; _fa_id.push_back(str_buf); Fam>>str_buf; _mo_id.push_back(str_buf); Fam>>str_buf; _sex.push_back(atoi(str_buf.c_str())); Fam>>str_buf; _pheno.push_back(atoi(str_buf.c_str())); } Fam.clear(); Fam.close(); _indi_num=_fid.size(); cout<<_indi_num<<" individuals to be included from ["+famfile+"]."<<endl; // Initialize _keep init_keep(); }
void gcta::read_imp_dose_beagle(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) { if(_include.size()==0) throw("Error: No SNP is retained for analysis."); int i=0, j=0; vector<int> rsnp; get_rsnp(rsnp); const int MAX_LINE_LENGTH=10000000; char buf[MAX_LINE_LENGTH]; string str_buf; gzifstream zinf; zinf.open( zdosefile.c_str() ); if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read."); cout<<"Reading imputed dosage scores (BEAGLE output) ..."<<endl; zinf.getline(buf, MAX_LINE_LENGTH, '\n'); stringstream ss(buf); for(i=0; i<3; i++) ss>>str_buf; while(ss>>str_buf){ _fid.push_back(str_buf); } _pid=_fid; _indi_num=_fid.size(); _fa_id.resize(_indi_num); _mo_id.resize(_indi_num); _sex.resize(_indi_num); _pheno.resize(_indi_num); cout<<_indi_num<<" individuals to be included from ["+zdosefile+"]."<<endl; init_keep(); if(!kp_indi_file.empty()) keep_indi(kp_indi_file); if(!blup_indi_file.empty()) read_indi_blup(blup_indi_file); if(!rm_indi_file.empty()) remove_indi(rm_indi_file); _geno_dose.resize(_keep.size()); for(i=0; i<_keep.size(); i++) _geno_dose[i].resize(_include.size()); vector<int> rindi; get_rindi(rindi); int line=0; int k=0; double d_buf=0.0; while(1){ zinf.getline(buf, MAX_LINE_LENGTH, '\n'); if(zinf.fail() || !zinf.good()) break; if(!rsnp[line++]) continue; stringstream ss(buf); ss>>str_buf; if(str_buf!=_snp_name[line-1]){ stringstream errmsg; errmsg<<"Error: the "<<line<<" th SNP ["+_snp_name[line-1]+"] in the summary file doesn't match to that in the dosage file."<<endl; throw(errmsg.str()); } ss>>str_buf>>str_buf; for(i=0, j=0; i<_indi_num; i++){ ss>>d_buf; if(rindi[i]){ _geno_dose[j][k]=d_buf; j++; } } k++; } zinf.clear(); zinf.close(); }
void gcta::read_imp_dose_mach(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file) { if(_include.size()==0) throw("Error: No SNP is retained for analysis."); int i=0, j=0, k=0, line=0; vector<int> rsnp; get_rsnp(rsnp); const int MAX_LINE_LENGTH = 10000000; char buf[MAX_LINE_LENGTH]; gzifstream zinf; zinf.open(zdosefile.c_str()); if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read."); vector<string> indi_ls; map<string, int> kp_id_map, blup_id_map, rm_id_map; bool kp_indi_flag=!kp_indi_file.empty(), blup_indi_flag=!blup_indi_file.empty(), rm_indi_flag=!rm_indi_file.empty(); if(kp_indi_flag) read_indi_list(kp_indi_file, indi_ls); for(i=0; i<indi_ls.size(); i++) kp_id_map.insert(pair<string, int>(indi_ls[i], i)); if(blup_indi_flag) read_indi_list(blup_indi_file, indi_ls); for(i=0; i<indi_ls.size(); i++) blup_id_map.insert(pair<string, int>(indi_ls[i], i)); if(rm_indi_flag) read_indi_list(rm_indi_file, indi_ls); for(i=0; i<indi_ls.size(); i++) rm_id_map.insert(pair<string, int>(indi_ls[i], i)); bool missing=false; string str_buf, id_buf, err_msg="Error: reading dosage data failed. Are the map file and the dosage file matched?"; double f_buf=0.0; vector<string> kept_id, vs_buf; cout<<"Reading dosage data from ["+zdosefile+"] in individual-major format (Note: may use huge RAM)."<<endl; _fid.clear(); _pid.clear(); _geno_dose.clear(); vector<int> kp_it; while(1){ bool kp_flag=true; zinf.getline(buf, MAX_LINE_LENGTH, '\n'); stringstream ss(buf); if(!(ss>>str_buf)) break; int ibuf=StrFunc::split_string(str_buf, vs_buf, ">"); if(ibuf>1){ if(vs_buf[0].empty()) throw("Error: family ID of the individual ["+str_buf+"] is missing."); else vs_buf[0].erase(vs_buf[0].end()-1); } else if(ibuf==1) vs_buf.push_back(vs_buf[0]); else break; id_buf=vs_buf[0]+":"+vs_buf[1]; if(kp_indi_flag && kp_id_map.find(id_buf)==kp_id_map.end()) kp_flag=false; if(kp_flag && blup_indi_flag && blup_id_map.find(id_buf)==blup_id_map.end()) kp_flag=false; if(kp_flag && rm_indi_flag && rm_id_map.find(id_buf)!=rm_id_map.end()) kp_flag=false; if(kp_flag){ kp_it.push_back(1); _fid.push_back(vs_buf[0]); _pid.push_back(vs_buf[1]); kept_id.push_back(id_buf); } else kp_it.push_back(0); if(zinf.fail() || !zinf.good()) break; } zinf.clear(); zinf.close(); cout<<"(Imputed dosage data for "<<kp_it.size()<<" individuals detected)."<<endl; _indi_num=_fid.size(); zinf.open(zdosefile.c_str()); _geno_dose.resize(_indi_num); for(line=0; line<_indi_num; line++) _geno_dose[line].resize(_include.size()); for(line=0, k=0; line<kp_it.size(); line++){ zinf.getline(buf, MAX_LINE_LENGTH, '\n'); if(kp_it[line]==0) continue; stringstream ss(buf); if(!(ss>>str_buf)) break; if(!(ss>>str_buf)) break; for(i=0, j=0; i<_snp_num; i++){ ss>>str_buf; f_buf=atof(str_buf.c_str()); if(str_buf=="X" || str_buf=="NA"){ if(!missing){ cout<<"Warning: missing values detected in the dosage data."<<endl; missing=true; } f_buf=1e6; } if(rsnp[i]){ _geno_dose[k][j]=(f_buf); j++; } } k++; } zinf.clear(); zinf.close(); cout<<"Imputed dosage data for "<<kept_id.size()<<" individuals are included from ["<<zdosefile<<"]."<<endl; _fa_id.resize(_indi_num); _mo_id.resize(_indi_num); _sex.resize(_indi_num); _pheno.resize(_indi_num); for(i=0; i<_indi_num; i++){ _fa_id[i]=_mo_id[i]="0"; _sex[i]=-9; _pheno[i]=-9; } // initialize keep init_keep(); update_id_map_kp(kept_id, _id_map, _keep); if(_keep.size()==0) throw("Error: No individual is retained for analysis."); if(blup_indi_flag) read_indi_blup(blup_indi_file); // update data update_bim(rsnp); }