// Matrix product: returns (*this) * M.
// Throws error(OUFFF_ERROR_DIMENSIONDISAGREE, ...) when the number of rows of M
// differs from the number of columns of this matrix.
Matrix Matrix::operator* (Matrix M)
{
    // Inner dimensions must agree: this is (nrow x ncol), M is (rhsRows x rhsCols).
    int rhsRows, rhsCols;
    M.Dimension(&rhsRows, &rhsCols);
    if (rhsRows != ncol)
    {
        throw error(OUFFF_ERROR_DIMENSIONDISAGREE, "Matrix::*(matrix)", "Try to make product between matrix of bad size");
    }

    // The result has as many rows as this matrix and as many columns as M.
    Matrix product(nrow, rhsCols);
    for (int r = 0; r < nrow; ++r)
    {
        for (int c = 0; c < rhsCols; ++c)
        {
            // Entry (r, c) accumulates row r of this matrix times column c of M.
            product(r, c) = 0;
            for (int t = 0; t < ncol; ++t)
            {
                product(r, c) += value[r][t] * M(t, c);
            }
        }
    }
    return product;
}
void LoadMatrix(const char* fn, Matrix& m) { LineReader lr(fn); std::vector<std::string> s; int lineNo = 0; while (lr.readLineBySep(&s, " \t")) { lineNo++; m.Dimension(lineNo, s.size()); for (int j = 0; j < s.size(); j++) { m(lineNo - 1, j) = atof(s[j].c_str()); } } };
// Element-wise sum: returns (*this) + M.
// Throws error(OUFFF_ERROR_DIMENSIONDISAGREE, ...) unless M has exactly the
// same number of rows and columns as this matrix.
Matrix Matrix::operator+ (Matrix M)
{
    int rhsRows, rhsCols;
    M.Dimension(&rhsRows, &rhsCols);
    const bool sameShape = (rhsRows == nrow) && (rhsCols == ncol);
    if (!sameShape)
    {
        throw error(OUFFF_ERROR_DIMENSIONDISAGREE, "Matrix::+(matrix)", "Try to make sum with matrix of different size");
    }

    Matrix sum(nrow, ncol);
    // All nrow*ncol entries are visited through the single-index accessors.
    // value[0][idx] indexes past the first row, which presumably relies on the
    // rows being stored contiguously -- TODO confirm against the class layout.
    for (int idx = 0; idx < nrow * ncol; ++idx)
    {
        sum(idx) = value[0][idx] + M(idx);
    }
    return sum;
}
/**
 * Split matrix x into one extracted column and the remaining columns.
 *
 * @param x     input matrix (read only here)
 * @param col   zero-based index of the column to extract
 * @param xnull output: x without column `col`; dimensioned to
 *              x.rows by (x.cols - 1), remaining columns keep their order
 * @param xcol  output: column `col` of x; dimensioned to x.rows
 *
 * A warning is printed when x has fewer than two columns (xnull then has no
 * columns); processing continues as before.
 * When `col` is not a valid column index of x, a message is printed and the
 * function returns without touching xnull or xcol. Previously such a `col`
 * led to out-of-range indexing of xnull.
 */
void LinearRegressionPermutationTest::splitMatrix(Matrix& x, int col, Matrix& xnull, Vector& xcol) {
  if (x.cols < 2) {
    printf("input matrix has too few cols!\n");
  }
  if (col < 0 || col >= x.cols) {
    printf("column index %d is out of range [0, %d)!\n", col, x.cols);
    return;
  }

  xnull.Dimension(x.rows, x.cols - 1);
  xcol.Dimension(x.rows);
  for (int i = 0; i < x.rows; i++) {
    // columns left of `col` keep their index
    for (int j = 0; j < col; j++) {
      xnull[i][j] = x[i][j];
    }
    // the extracted column
    xcol[i] = x[i][col];
    // columns right of `col` shift one position to the left
    for (int j = col + 1; j < x.cols; j++) {
      xnull[i][j - 1] = x[i][j];
    }
  }
}
void HashErrorModel::setDataforPrediction(Matrix & X, Vector & succ, Vector & total,bool binarizeFlag) { int i = 0; BaseData data; if(ourUseFast) { return; } for(HashMatch::const_iterator it = mismatchTable.begin(); it != mismatchTable.end(); ++it) { data.parseKey(it->first); Covariates cov; cov.setCovariates(data); if(i == 0) { uint32_t rows = mismatchTable.size(); succ.Dimension(rows); total.Dimension(rows); X.Dimension(rows, cov.covariates.size() + 1); X.Zero(); } // The first column of the design matrix is constant one, for the slope X[i][0] = 1.0; int j = 0; //binarize a couple of co-variates for(std::vector<uint16_t>::const_iterator itv = cov.covariates.begin(); itv != cov.covariates.end(); ++itv) { if(binarizeFlag) { //hardcoded pos is 2 if(j==1){ uint16_t pos = (uint16_t)*itv; // hardcoded pos += 7; X[i][pos] = 1; } else { if(j>1) X[i][j] = (uint16_t)*itv; else X[i][j+1] = (uint16_t)*itv; } j++; } else { X[i][j+1] = (uint16_t)*itv; j++; } } total[i] = it->second.mm + it->second.m; succ[i] = it->second.m; i++; } }
int main(int argc, char** argv){ time_t currentTime = time(0); fprintf(stderr, "Analysis started at: %s", ctime(¤tTime)); //////////////////////////////////////////////// BEGIN_PARAMETER_LIST(pl) ADD_PARAMETER_GROUP(pl, "Input/Output") ADD_STRING_PARAMETER(pl, inVcf, "--inVcf", "input VCF File") ADD_STRING_PARAMETER(pl, outMerlin, "--outMerlin", "output prefix") ADD_PARAMETER_GROUP(pl, "People Filter") ADD_STRING_PARAMETER(pl, peopleIncludeID, "--peopleIncludeID", "give IDs of people that will be included in study") ADD_STRING_PARAMETER(pl, peopleIncludeFile, "--peopleIncludeFile", "from given file, set IDs of people that will be included in study") ADD_STRING_PARAMETER(pl, peopleExcludeID, "--peopleExcludeID", "give IDs of people that will be included in study") ADD_STRING_PARAMETER(pl, peopleExcludeFile, "--peopleExcludeFile", "from given file, set IDs of people that will be included in study") ADD_PARAMETER_GROUP(pl, "Site Filter") ADD_STRING_PARAMETER(pl, rangeList, "--rangeList", "Specify some ranges to use, please use chr:begin-end format.") ADD_STRING_PARAMETER(pl, rangeFile, "--rangeFile", "Specify the file containing ranges, please use chr:begin-end format.") END_PARAMETER_LIST(pl) ; pl.Read(argc, argv); pl.Status(); if (FLAG_REMAIN_ARG.size() > 0){ fprintf(stderr, "Unparsed arguments: "); for (unsigned int i = 0; i < FLAG_REMAIN_ARG.size(); i++){ fprintf(stderr, " %s", FLAG_REMAIN_ARG[i].c_str()); } fprintf(stderr, "\n"); abort(); } REQUIRE_STRING_PARAMETER(FLAG_inVcf, "Please provide input file using: --inVcf"); const char* fn = FLAG_inVcf.c_str(); VCFInputFile vin(fn); // set range filters here // e.g. 
// vin.setRangeList("1:69500-69600"); vin.setRangeList(FLAG_rangeList.c_str()); vin.setRangeFile(FLAG_rangeFile.c_str()); // set people filters here if (FLAG_peopleIncludeID.size() || FLAG_peopleIncludeFile.size()) { vin.excludeAllPeople(); vin.includePeople(FLAG_peopleIncludeID.c_str()); vin.includePeopleFromFile(FLAG_peopleIncludeFile.c_str()); } vin.excludePeople(FLAG_peopleExcludeID.c_str()); vin.excludePeopleFromFile(FLAG_peopleExcludeFile.c_str()); // let's write it out. FILE* fMap; // CHROMOSOME MARKER POSITION FILE* fDat; // A some_disease\n // T some_trait // M some_marker // M another_marker FILE* fPed; // first 5 column: FID, IID, PID, MID, SEX; then follow Dat file FILE* fPid; // Person ID file, (extra for Merlin), including all people ID as they are in PED file. fMap = fopen( (FLAG_outMerlin + ".map").c_str(), "wt"); fDat = fopen( (FLAG_outMerlin + ".dat").c_str(), "wt"); fPed = fopen( (FLAG_outMerlin + ".ped").c_str(), "wt"); fPid = fopen( (FLAG_outMerlin + ".pid").c_str(), "wt"); assert(fMap && fDat && fPed && fPid); std::string marker; // marker x people std::vector<std::string> allMarker; Matrix geno; fputs("CHROMOSOME\tMARKER\tPOSITION\n", fMap); while (vin.readRecord()){ VCFRecord& r = vin.getVCFRecord(); VCFPeople& people = r.getPeople(); VCFIndividual* indv; // write map file marker = r.getID(); if ( marker == "." ) { fprintf(fMap, "%s\t%s:%d\t%d\n", r.getChrom(), r.getChrom(), r.getPos(), r.getPos()); fprintf(fDat, "M\t%s:%d\n", r.getChrom(), r.getPos()); } else { fprintf(fMap, "%s\t%s\t%d\n", r.getChrom(), marker.c_str(), r.getPos()); fprintf(fDat, "M\t%s\n", marker.c_str()); } allMarker.push_back(marker); geno.Dimension(allMarker.size(), people.size()); // e.g.: get TAG from INFO field // fprintf(stderr, "%s\n", r.getInfoTag("ANNO")); int m = allMarker.size() - 1; // e.g.: Loop each (selected) people in the same order as in the VCF for (int i = 0; i < people.size(); i++) { indv = people[i]; // get GT index. 
if you are sure the index will not change, call this function only once! int GTidx = r.getFormatIndex("GT"); if (GTidx >= 0) { geno[m][i] = (*indv)[GTidx].getGenotype(); }else { fprintf(stderr, "Cannot find GT field!\n"); abort(); } } }; VCFHeader* h = vin.getVCFHeader(); std::vector< std::string> peopleId; h->getPeopleName(&peopleId); // dump PED and PID file for (int p = 0; p < peopleId.size(); p++){ fprintf(fPed, "%s\t%s\t0\t0\t0", peopleId[p].c_str(), peopleId[p].c_str()); for (int m = 0; m < allMarker.size(); m++){ int g = (int)geno[m][p]; switch (g){ case 0: fputs("\t0/0", fPed); break; case 1: fputs("\t0/1", fPed); break; case 2: fputs("\t1/1", fPed); break; default: fputs("x/x", fPed); break; } } fputs("\n", fPed); fprintf(fPid, "%s\n", peopleId[p].c_str()); } return 0; };
/**
 * Read all remaining records from this->vin and collect their genotypes.
 *
 * For each record, one value per person is taken from the FORMAT field named
 * by dosageTag (when set) or from "GT" otherwise. Values failing checkGD /
 * checkGQ are replaced by MISSING_GENOTYPE. Records whose minor allele
 * frequency (from GenotypeCounter) falls outside the positive freqMin /
 * freqMax cutoffs are skipped.
 *
 * Side effects: this->hemiRegion is cleared and then receives one flag per
 * kept record; one GenotypeCounter per kept record is appended to
 * this->counter.
 *
 * @param g output: people x marker matrix; column i is labelled "chrom:pos"
 *          of the i-th kept record
 * @return SUCCEED, or -1 when a record lacks the requested FORMAT field
 *         (g is not written in that case)
 */
int GenotypeExtractor::extractMultipleGenotype(Matrix* g) {
  // Static to reduce memory allocation. It is shared by every call, so values
  // from earlier calls can still be present: each cell used below must be
  // assigned explicitly.
  static Matrix m;
  int row = 0;
  std::vector<std::string> colNames;
  std::string name;
  this->hemiRegion.clear();
  GenotypeCounter genoCounter;
  while (this->vin->readRecord()) {
    VCFRecord& r = this->vin->getVCFRecord();
    VCFPeople& people = r.getPeople();
    VCFIndividual* indv;

    // Row `row` is reused when the previous record was filtered out.
    m.Dimension(row + 1, people.size());
    genoCounter.reset();

    const bool useDosage = (!this->dosageTag.empty());
    const int genoIdx =
        r.getFormatIndex(useDosage ? dosageTag.c_str() : "GT");
    const int GDidx = r.getFormatIndex("GD");
    const int GQidx = r.getFormatIndex("GQ");

    assert(this->parRegion);
    // Named differently from the member vector this->hemiRegion.
    const bool inHemiRegion =
        this->parRegion->isHemiRegion(r.getChrom(), r.getPos());

    // e.g.: Loop each (selected) people in the same order as in the VCF
    const int numPeople = (int)people.size();
    for (int i = 0; i < numPeople; i++) {
      indv = people[i];
      if (genoIdx < 0) {
        logger->error("Cannot find %s field!",
                      this->dosageTag.empty() ? "GT" : dosageTag.c_str());
        return -1;
      }

      if (useDosage) {
        if (!inHemiRegion) {
          m[row][i] = indv->justGet(genoIdx).toDouble();
        } else if ((*sex)[i] == PLINK_MALE) {
          // for male hemi region, imputated dosage is usually between 0 and 1
          // need to multiply by 2.0
          m[row][i] = indv->justGet(genoIdx).toDouble() * 2.0;
        } else if ((*sex)[i] == PLINK_FEMALE) {
          // Same treatment as the GT branch below: females are used as-is.
          // Previously this cell was left unassigned (stale value).
          m[row][i] = indv->justGet(genoIdx).toDouble();
        } else {
          // Unknown sex: missing, as in the GT branch below.
          m[row][i] = MISSING_GENOTYPE;
        }
      } else {
        if (!inHemiRegion) {
          m[row][i] = indv->justGet(genoIdx).getGenotype();
        } else if ((*sex)[i] == PLINK_MALE) {
          m[row][i] = indv->justGet(genoIdx).getMaleNonParGenotype02();
        } else if ((*sex)[i] == PLINK_FEMALE) {
          m[row][i] = indv->justGet(genoIdx).getGenotype();
        } else {
          m[row][i] = MISSING_GENOTYPE;
        }
      }

      if (!checkGD(indv, GDidx) || !checkGQ(indv, GQidx)) {
        m[row][i] = MISSING_GENOTYPE;
      }
      genoCounter.add(m[row][i]);
    }

    // check frequency cutoffs (a cutoff <= 0 is not applied)
    const double maf = genoCounter.getMAF();
    if (this->freqMin > 0. && this->freqMin > maf) continue;
    if (this->freqMax > 0. && this->freqMax < maf) continue;

    // store genotype results
    name = r.getChrom();
    name += ":";
    name += r.getPosStr();
    colNames.push_back(name);
    ++row;

    this->hemiRegion.push_back(inHemiRegion);
    this->counter.push_back(genoCounter);
  }  // end while (this->vin->readRecord())

  // delete rows (ugly code here, as we may allocate extra row in previous
  // loop)
  m.Dimension(row, m.cols);
  // now transpose (marker by people -> people by marker)
  g->Transpose(m);
  for (int i = 0; i < row; ++i) {
    g->SetColumnLabel(i, colNames[i].c_str());
  }
  return SUCCEED;
}  // end GenotypeExtractor