Пример #1
0
// Read imputed dosage scores from a BEAGLE-style dosage file opened through gzifstream.
//
// Header line: the first three tokens are skipped; every remaining token is stored as
// an individual ID (used for both _fid and _pid).
// Data lines: one per SNP, in the same order as _snp_name. The first token must equal the
// expected SNP name, the next two tokens are skipped, and the rest are one dosage value
// per individual in header order.
//
// Parameters:
//   zdosefile      - path of the dosage file to read.
//   kp_indi_file   - if non-empty, passed to keep_indi().
//   rm_indi_file   - if non-empty, passed to remove_indi().
//   blup_indi_file - if non-empty, passed to read_indi_blup().
//
// Side effects: fills _fid, _pid, _indi_num, resizes _fa_id/_mo_id/_sex/_pheno and fills
// _geno_dose as [kept individual][retained SNP].
//
// Throws: a string literal if no SNP is retained; a std::string if the file cannot be
// opened or a SNP name in the file differs from the expected one.
void gcta::read_imp_dose_beagle(string zdosefile, string kp_indi_file, string rm_indi_file, string blup_indi_file)
{
	if(_include.size()==0) throw("Error: No SNP is retained for analysis.");
    int i=0, j=0;
    vector<int> rsnp;
    get_rsnp(rsnp);

    // The line buffer lives on the heap: a 10 MB automatic array exceeds common default
    // stack limits and can crash before a single byte is read.
    const int MAX_LINE_LENGTH=10000000;
    vector<char> line_buf(MAX_LINE_LENGTH);
    char *buf=&line_buf[0];
    string str_buf;

    gzifstream zinf;
    zinf.open( zdosefile.c_str() );
    if(! zinf.is_open()) throw("Error: can not open the file ["+zdosefile+"] to read.");
    cout<<"Reading imputed dosage scores (BEAGLE output) ..."<<endl;

    // Header: skip three leading tokens, the remainder are individual IDs.
    zinf.getline(buf, MAX_LINE_LENGTH, '\n');
    stringstream header_ss(buf);
    for(i=0; i<3; i++) header_ss>>str_buf;
    while(header_ss>>str_buf){ _fid.push_back(str_buf); }
    _pid=_fid;
    _indi_num=_fid.size();
    _fa_id.resize(_indi_num);
    _mo_id.resize(_indi_num);
    _sex.resize(_indi_num);
    _pheno.resize(_indi_num);
    cout<<_indi_num<<" individuals to be included from ["+zdosefile+"]."<<endl;
    init_keep();
    if(!kp_indi_file.empty()) keep_indi(kp_indi_file);
    if(!blup_indi_file.empty()) read_indi_blup(blup_indi_file);
    if(!rm_indi_file.empty()) remove_indi(rm_indi_file);

    _geno_dose.resize(_keep.size());
    for(i=0; i<_keep.size(); i++) _geno_dose[i].resize(_include.size());

    vector<int> rindi;
    get_rindi(rindi);

    int line=0;       // index of the current data line (SNP) in the file
    int k=0;          // column of _geno_dose for the current retained SNP
    double d_buf=0.0;
    while(1){
        zinf.getline(buf, MAX_LINE_LENGTH, '\n');
        if(zinf.fail() || !zinf.good()) break;
        // Stop instead of indexing past the per-SNP flags when the file has more data
        // lines than expected (e.g. a trailing blank line).
        if(line>=static_cast<int>(rsnp.size())) break;
        if(!rsnp[line++]) continue; // SNP not flagged by get_rsnp(): skip the line
        stringstream line_ss(buf);
        line_ss>>str_buf;
        if(str_buf!=_snp_name[line-1]){
            stringstream errmsg;
            errmsg<<"Error: the "<<line<<" th SNP ["+_snp_name[line-1]+"] in the summary file doesn't match to that in the dosage file."<<endl;
            throw(errmsg.str());
        }
        line_ss>>str_buf>>str_buf; // two tokens after the SNP name are not used
        for(i=0, j=0; i<_indi_num; i++){
            line_ss>>d_buf;
            // Only individuals flagged by get_rindi() get a row; j counts those rows.
            if(rindi[i]){ _geno_dose[j][k]=d_buf; j++; }
        }
        k++;
    }
    zinf.clear();
    zinf.close();
}
Пример #2
0
// Load the inputs of a REML analysis, build the model terms and call reml().
//
// Steps performed here:
//   1. Read the GRM(s), phenotype, covariates and GxE factor files; after each read,
//      update_id_map_kp() is called with that file's IDs (presumably narrowing _keep to
//      the individuals common to all inputs - confirm against update_id_map_kp()).
//   2. Fill _y from the phenotype column selected by mphen.
//   3. Fill _A with one _n x _n block per variance component: the GRM block(s) first,
//      then, for each quantitative and each categorical environmental factor, the
//      element-wise product of every GRM block with the factor's outer product.
//   4. Build the fixed-effect design through construct_X().
//   5. Append the component names to _var_name / _hsq_name and run reml().
//
// Parameters (as used by this function):
//   grm_file          - a single GRM prefix, or, when m_grm_flag is set, a file listing
//                       several GRM files.
//   phen_file, mphen  - phenotype file and the phenotype selector passed to read_phen().
//   qcovar_file, covar_file - optional covariate files (read_covar with true / false).
//   qGE_file, GE_file - optional environmental factor files (read_GE with true / false).
//   keep_indi_file, remove_indi_file - optional ID lists; ignored when mlmassoc is set.
//   sex_file          - optional; passed to update_sex().
//   grm_cutoff, adj_grm_fac, dosage_compen - applied only when greater than -1
//                       (grm_cutoff only in the single-GRM case).
//   drop, no_lrt      - drop_comp(drop) is called unless no_lrt is set.
//   remaining arguments are stored (_reml_mtd, _reml_max_iter) or forwarded to reml().
//
// Throws: a string literal when no individual remains after combining the inputs;
// whatever the called readers/helpers throw propagates unchanged.
void gcta::fit_reml(string grm_file, string phen_file, string qcovar_file, string covar_file, string qGE_file, string GE_file, string keep_indi_file, string remove_indi_file, string sex_file, int mphen, double grm_cutoff, double adj_grm_fac, int dosage_compen, bool m_grm_flag, bool pred_rand_eff, bool est_fix_eff, int reml_mtd, int MaxIter, vector<double> reml_priors, vector<double> reml_priors_var, vector<int> drop, bool no_lrt, double prevalence, bool no_constrain, bool mlmassoc)
{
    _reml_mtd=reml_mtd;
    _reml_max_iter=MaxIter;
    int i=0, j=0, k=0;
    bool grm_flag=(!grm_file.empty());
    bool qcovar_flag=(!qcovar_file.empty());
    bool covar_flag=(!covar_file.empty());
    bool GE_flag=(!GE_file.empty());
    bool qGE_flag=(!qGE_file.empty());
    // With multiple GRMs, grm_file is a list of file names, not a GRM itself.
    if(m_grm_flag) grm_flag=false;

    // Read data
    int qcovar_num=0, covar_num=0, qE_fac_num=0, E_fac_num=0;
    vector<string> phen_ID, phen_buf, qcovar_ID, covar_ID, qGE_ID, GE_ID, grm_id, grm_files;
    vector< vector<string> > qcovar, covar, GE, qGE; // save individuals by column

    if(grm_flag){
        read_grm_gz(grm_file, grm_id);
        update_id_map_kp(grm_id, _id_map, _keep);
        grm_files.push_back(grm_file);
    }
    else if(m_grm_flag){
        read_grm_filenames(grm_file, grm_files, false);
        // First pass over the GRM files: only the IDs are needed here; the matrices
        // themselves are read again, one at a time, when _A is filled below.
        for(i=0; i<grm_files.size(); i++){
            read_grm_gz(grm_files[i], grm_id, false, true);
            update_id_map_kp(grm_id, _id_map, _keep);
        }
    }
    read_phen(phen_file, phen_ID, phen_buf, mphen);
    update_id_map_kp(phen_ID, _id_map, _keep);
    if(qcovar_flag){
        qcovar_num=read_covar(qcovar_file, qcovar_ID, qcovar, true);
        update_id_map_kp(qcovar_ID, _id_map, _keep);
    }
    if(covar_flag){
        covar_num=read_covar(covar_file, covar_ID, covar, false);
        update_id_map_kp(covar_ID, _id_map, _keep);
    }
    if(qGE_flag){
        qE_fac_num=read_GE(qGE_file, qGE_ID, qGE, true);
        update_id_map_kp(qGE_ID, _id_map, _keep);
    }
    if(GE_flag){
        E_fac_num=read_GE(GE_file, GE_ID, GE, false);
        update_id_map_kp(GE_ID, _id_map, _keep);
    }
    if(!mlmassoc){
        if(!keep_indi_file.empty()) keep_indi(keep_indi_file);
        if(!remove_indi_file.empty()) remove_indi(remove_indi_file);
    }
    if(grm_flag){
        if(grm_cutoff>-1.0) rm_cor_indi(grm_cutoff);
        if(!sex_file.empty()) update_sex(sex_file);
        if(adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
        if(dosage_compen>-1) dc(dosage_compen);
        _grm_N.resize(1,1);
    }

    // "FID:PID" key of every kept individual, plus key -> row index in the model.
    vector<string> uni_id;
    map<string, int> uni_id_map;
    map<string, int>::iterator iter;
    for(i=0; i<_keep.size(); i++){
        const string id_key=_fid[_keep[i]]+":"+_pid[_keep[i]];
        uni_id.push_back(id_key);
        uni_id_map.insert(pair<string,int>(id_key, i));
    }
    _n=_keep.size();
    if(_n<1) throw("Error: no individual is in common in the input files.");

    // construct model terms
    _y.setZero(_n);
    for(i=0; i<phen_ID.size(); i++){
        iter=uni_id_map.find(phen_ID[i]);
        if(iter==uni_id_map.end()) continue;
        _y[iter->second]=atof(phen_buf[i].c_str());
    }

    int pos=0; // index of the next _n x _n block of _A to be filled
    _r_indx.clear();
    vector<int> kp;
    if(grm_flag){
        for(i=0; i<1+qE_fac_num+E_fac_num; i++) _r_indx.push_back(i);
        if(!no_lrt) drop_comp(drop);
        _A=eigenMatrix::Zero(_n, _r_indx.size()*_n);
        if(mlmassoc) StrFunc::match(uni_id, grm_id, kp);
        else kp=_keep;
        for(i=0; i<_n; i++){
            for(j=0; j<=i; j++){
                // Always read _grm with row index >= column index, as the multi-GRM
                // branch below does; identical to the plain (kp[i],kp[j]) read whenever
                // kp is ascending or _grm is stored symmetrically.
                if(kp[i]>=kp[j]) _A(j,i)=_A(i,j)=_grm(kp[i],kp[j]);
                else _A(j,i)=_A(i,j)=_grm(kp[j],kp[i]);
            }
        }
        pos++;
        _grm.resize(1,1); // release the GRM storage once it is copied into _A
    }
    else if(m_grm_flag){
        if(!sex_file.empty()) update_sex(sex_file);
        for(i=0; i<(1+qE_fac_num+E_fac_num)*grm_files.size(); i++) _r_indx.push_back(i);
        if(!no_lrt) drop_comp(drop);
        _A=eigenMatrix::Zero(_n, _r_indx.size()*_n);
        cout<<"There are "<<grm_files.size()<<" GRM file names specified in the file ["+grm_file+"]."<<endl;
        for(i=0; i<grm_files.size(); i++, pos++){
            cout<<"Reading the GRM from the "<<i+1<<"th file ..."<<endl;
            read_grm_gz(grm_files[i], grm_id);
            if(adj_grm_fac>-1.0) adj_grm(adj_grm_fac);
            if(dosage_compen>-1) dc(dosage_compen);
            StrFunc::match(uni_id, grm_id, kp);
            int pos_n=pos*_n; // first column of this GRM's block inside _A
            for(j=0; j<_n; j++){
                for(k=0; k<=j; k++){
                    if(kp[j]>=kp[k]) _A(k,pos_n+j)=_A(j,pos_n+k)=_grm(kp[j],kp[k]);
                    else _A(k,pos_n+j)=_A(j,pos_n+k)=_grm(kp[k],kp[j]);
                }
            }
        }
        _grm_N.resize(1,1);
        _grm.resize(1,1);
    }

    // GE interaction
    vector<eigenMatrix> E_float(E_fac_num);
    eigenMatrix qE_float, mbuf;
    if(qGE_flag){
        // Zero-initialise so a kept individual without a qGE record cannot contribute
        // indeterminate values.
        qE_float=eigenMatrix::Zero(_n, qE_fac_num);
        for(i=0; i<qGE_ID.size(); i++){
            iter=uni_id_map.find(qGE_ID[i]);
            if(iter==uni_id_map.end()) continue;
            for(j=0; j<qE_fac_num; j++) qE_float(iter->second,j)=atof(qGE[i][j].c_str());
        }
        for(j=0; j<qE_fac_num; j++){
            // Outer product of factor j with itself, multiplied element-wise into a copy
            // of every GRM block.
            mbuf=((qE_float.block(0,j,_n,1))*(qE_float.block(0,j,_n,1)).transpose());
            for(i=0; i<grm_files.size(); i++, pos++) (_A.block(0,pos*_n,_n,_n))=(_A.block(0,i*_n,_n,_n)).array()*mbuf.array();
        }
    }
    if(GE_flag){
        vector< vector<string> > E_str(E_fac_num);
        for(i=0; i<E_fac_num; i++) E_str[i].resize(_n);
        for(i=0; i<GE_ID.size(); i++){
            iter=uni_id_map.find(GE_ID[i]);
            if(iter!=uni_id_map.end()){
                for(j=0; j<E_fac_num; j++) E_str[j][iter->second]=GE[i][j];
            }
        }
        for(j=0; j<E_fac_num; j++){
            // Two messages are prepared up front and handed to coeff_mat().
            stringstream errmsg;
            errmsg<<"Error: too many classes for the "<<j+1<<"th environmental factor. \nPlease make sure you input a discrete variable as the environmental factor.";
            string errmsg1=errmsg.str();
            errmsg.str("");
            errmsg<<"Error: the "<<j+1<<"th envronmental factor has only one class.";
            string errmsg2=errmsg.str();
            coeff_mat(E_str[j], E_float[j], errmsg1, errmsg2);
            mbuf=((E_float[j])*(E_float[j]).transpose());
            for(i=0; i<grm_files.size(); i++, pos++) (_A.block(0,pos*_n,_n,_n))=(_A.block(0,i*_n,_n,_n)).array()*mbuf.array();
        }
    }

    // construct X matrix
    construct_X(uni_id_map, qcovar_flag, qcovar_num, qcovar_ID, qcovar, covar_flag, covar_num, covar_ID, covar, E_float, qE_float);

    // names of variance component (numeric suffixes are only added when there is more
    // than one GRM / more than one factor of that kind)
    for(i=0; i<grm_files.size(); i++){
        stringstream strstrm;
        if(grm_files.size()==1) strstrm<<"";
        else strstrm<<i+1;
        _var_name.push_back("V(G"+strstrm.str()+")");
        _hsq_name.push_back("V(G"+strstrm.str()+")/Vp");
    }
    for(j=0; j<qE_fac_num; j++){
        for(i=0; i<grm_files.size(); i++){
            stringstream strstrm1,strstrm2;
            if(grm_files.size()==1) strstrm1<<"";
            else  strstrm1<<i+1;
            if(qE_fac_num==1) strstrm2<<"";
            else strstrm2<<j+1;
            _var_name.push_back("V(G"+strstrm1.str()+"xqE"+strstrm2.str()+")");
            _hsq_name.push_back("V(G"+strstrm1.str()+"xqE"+strstrm2.str()+")"+"/Vp");
        }
    }
    for(j=0; j<E_fac_num; j++){
        for(i=0; i<grm_files.size(); i++){
            stringstream strstrm1,strstrm2;
            if(grm_files.size()==1) strstrm1<<"";
            else  strstrm1<<i+1;
            if(E_fac_num==1) strstrm2<<"";
            else strstrm2<<j+1;
            _var_name.push_back("V(G"+strstrm1.str()+"xE"+strstrm2.str()+")");
            _hsq_name.push_back("V(G"+strstrm1.str()+"xE"+strstrm2.str()+")"+"/Vp");
        }
    }
    _var_name.push_back("V(e)");

    // run REML algorithm
    cout<<_n<<" individuals are in common in these files."<<endl;
	reml(pred_rand_eff, est_fix_eff, reml_priors, reml_priors_var, prevalence, no_constrain, no_lrt, mlmassoc);
}