/**
 * Write depth clusters to a tab-delimited text file.
 *
 * A cluster is a maximal run of consecutive indices of `depth` whose value is
 * at least `minDepth`. A run is reported only if it covers at least
 * `minClusterSize` positions and its mean depth is at least `minAvgDepth`.
 * Each output line holds: chromosome, start, end, mean depth, and the value
 * returned by CalcRegionGCContent() for the run.
 *
 * The chromosome name and start are parsed from the "name:position" string
 * filled in by referencegenome.getChromosomeAndIndex() for the first index of
 * the run; end is start plus the run length minus one.
 *
 * @param outFile        path of the file to create or overwrite
 * @param minDepth       per-position depth needed for a position to join a run
 * @param minAvgDepth    minimum mean depth of a reported run
 * @param minClusterSize minimum number of positions in a reported run
 */
void GenomeRegionSeqStats::OutputClusters(String &outFile, int minDepth, double minAvgDepth, int minClusterSize)
{
    FILE *fh = fopen(outFile.c_str(), "w");
    if(fh==NULL)
        error("Open cluster output file %s failed!\n", outFile.c_str());

    bool inCluster = false;
    int winSize = 0;
    // 64-bit accumulator: a long, deep run can exceed the range of int.
    long long totalDepth = 0;
    uint32_t cstart = 0; // first index of the run currently being accumulated

    const uint32_t numPositions = depth.size();

    // Walk one step past the last index. The extra step behaves like a
    // below-threshold position, so a run that reaches the end of `depth`
    // is closed and reported instead of being silently dropped.
    for(uint32_t i=0; i<=numPositions; i++)
    {
        const bool covered = (i<numPositions) && (depth[i]>=minDepth);

        if(covered)
        {
            if(!inCluster)
            {
                inCluster = true;
                cstart = i;
            }
            winSize++;
            totalDepth += depth[i];
            continue;
        }

        if(!inCluster)
            continue;

        // A run just ended at index i-1: evaluate it, then reset the state.
        const double avgDepth = double(totalDepth)/winSize;
        const bool rejected = (winSize<minClusterSize || avgDepth<minAvgDepth);

        winSize = 0;
        totalDepth = 0;
        inCluster = false;

        if(rejected)
            continue;

        const uint32_t cend = i-1;

        int atCnt, gcCnt;
        double gc = CalcRegionGCContent(referencegenome, cstart, cend, atCnt, gcCnt);

        // getChromosomeAndIndex() fills chr with "name:position"; split it.
        String chr;
        referencegenome.getChromosomeAndIndex(chr, cstart);
        StringArray tokens;
        tokens.ReplaceTokens(chr, ":");
        chr = tokens[0];
        int start = tokens[1].AsInteger();
        int end = start + (cend-cstart);

        // start/end are ints, so print them with %d.
        fprintf(fh, "%s\t%d\t%d\t%.2f\t%.2f\n", chr.c_str(), start, end, avgDepth, gc);
    }

    fclose(fh);
}
/**
 * Read a whitespace-delimited region list and populate the region tables.
 *
 * Each non-empty line that does not start with '#' must have at least three
 * columns: contig name, start, end. An optional fourth column names a group.
 * For every accepted line the raw text and the (start, end) pair are appended
 * to the per-contig tables and the contig's current index is reset to 0.
 * When a group column is present, the group's segment count and total length
 * are updated and the group is recorded under the "contig:start:end" key.
 *
 * After reading, `contigs` is rebuilt from the keys of genomeRegions, the
 * start coordinates of each contig are checked to be non-decreasing, and
 * `groups` is rebuilt from the keys of groupStats.
 *
 * @param inputList path of the region list file
 */
void GenomeRegionSeqStats::LoadRegionList(String &inputList)
{
    FILE *regionFile = fopen(inputList.c_str(), "r");
    if(regionFile==NULL)
        error("Open region input file %s failed!\n", inputList.c_str());

    StringArray fields;
    String line;

    while(!feof(regionFile))
    {
        line.ReadLine(regionFile);

        // Blank lines and '#' lines carry no region.
        if (line.IsEmpty() || line[0] == '#')
            continue;

        fields.ReplaceTokens(line);
        if(fields.Length()<3)
            error("Too few columns: %s\n", line.c_str());

        const String &contig = fields[0];
        std::pair<int, int> interval(fields[1].AsInteger(), fields[2].AsInteger());

        // positions are 0-based. Otherwise == is valid
        if(interval.first>=interval.second)
            error("Region end is equal or smaller than the start: %s!\n", line.c_str());

        genomeRegions_lines[contig].push_back(line);
        genomeRegions[contig].push_back(interval);
        genomeRegions_currentIndex[contig] = 0;

        if(fields.Length()>3)
        {
            Stats &groupTotals = groupStats[fields[3]];
            groupTotals.segCount++;
            groupTotals.totalLen += (interval.second - interval.first);

            String regionKey = contig+":"+fields[1]+":"+fields[2];
            genomeRegionGroups[regionKey].push_back(fields[3]);
        }
    }
    fclose(regionFile);

    // Chromosome info
    contigs.clear();
    std::map<String, vector<std::pair<int, int> > >::iterator contigIt;
    for(contigIt=genomeRegions.begin(); contigIt!=genomeRegions.end(); ++contigIt)
    {
        contigs.push_back(contigIt->first);

        // Start coordinates within a contig must not go backwards.
        const vector<std::pair<int, int> > &intervals = contigIt->second;
        for(unsigned int idx=1; idx<intervals.size(); idx++)
        {
            if(intervals[idx].first<intervals[idx-1].first)
                error("Input coordinates are not in order: %s %d %d!\n", contigIt->first.c_str(),intervals[idx].first,intervals[idx].second);
        }
    }

    // Group info such as gene names
    groups.clear();
    std::map<String, Stats>::iterator groupIt;
    for(groupIt=groupStats.begin(); groupIt!=groupStats.end(); ++groupIt)
        groups.push_back(groupIt->first);
}
void VerifyBamID::loadSubsetInds(const char* subsetFile) { if ( ( pPile == NULL ) && ( pGenotypes == NULL ) ) { if ( subsetInds.size() > 0 ) { Logger::gLogger->error("VerifyBamID::loadSubsetInds() called multiple times"); } IFILE f = ifopen(subsetFile,"rb"); String line; StringArray tok; while( line.ReadLine(f) > 0 ) { tok.ReplaceTokens(line,"\t \n\r"); subsetInds.push_back(tok[0].c_str()); } } else { Logger::gLogger->error("VerifyBamID::loadSubsetInds() called after VerifyBamID::loadFiles()"); } }
bool MarkovParameters::ReadCrossoverRates(const char * filename) { StringArray tokens; StringArray rec; rec.Read(filename); // Load estimated per marker error rates if (rec.Length() == markers) { printf(" Updating error rates using data in %s ...\n", (const char *) filename); for (int i = 0; i < markers; i++) { tokens.ReplaceTokens(rec[i+1]); if (tokens.Length() >= 2) R[i] = tokens[1].AsDouble(); } return true; } return false; }
void BedFile::openForRead(const char* bedFile, const char* bimFile, const char* famFile, const char* refFile, int nbuf) { StringArray tokens; reset(); iFile = ifopen(bedFile,"rb"); if ( iFile == NULL ) { throw VcfFileException("Failed opening file %s - %s",bedFile,strerror(errno)); } // read magic numbers char magicNumbers[3] = {0x6c,0x1b,0x01}; char firstThreeBytes[3]; ifread( iFile, firstThreeBytes, 3 ); for(int i=0; i < 3; ++i) { if ( firstThreeBytes[i] != magicNumbers[i] ) { throw VcfFileException("The magic numbers do not match in BED file %s",bedFile); } } iBimFile = ifopen(bimFile,"rb"); iFamFile = ifopen(famFile,"rb"); sRefFile = refFile; while( 1 ) { int ret = line.ReadLine(iFamFile); if ( ret <= 0 ) break; tokens.ReplaceTokens(line, " \t\r\n"); if ( tokens.Length() < 5 ) { throw VcfFileException("Less then 5 columns are observed in FAM file"); } VcfInd* p = new VcfInd(tokens[1],tokens[0],tokens[2],tokens[3],tokens[4]); vpVcfInds.push_back(p); } //Logger::gLogger->writeLog("Finished loading %d individuals from FAM file",(int)vpVcfInds.size()); nBytes = (vpVcfInds.size()+3)/4; if ( pBedBuffer != NULL ) { delete[] pBedBuffer; } pBedBuffer = new char[nBytes]; nBuffers = nbuf; nNumMarkers = 0; nHead = 0; bParseGenotypes = true; bParseDosages = false; bParseValues = false; if ( nBuffers == 0 ) { // infinite buffer size // do not set size of markers } else { vpVcfMarkers.resize( nBuffers ); for(int i=0; i < nBuffers; ++i) { VcfMarker* p = new VcfMarker; vpVcfMarkers[i] = p; } } genomeSequence.setReferenceName(sRefFile.c_str()); genomeSequence.useMemoryMap(true); //Logger::gLogger->writeLog("Loading reference file %s",sRefFile.c_str()); if ( genomeSequence.open() ) { // write a message that new index file is being created if ( genomeSequence.create(false) ) { throw VcfFileException("Failed creating index file of the reference. 
Please check the file permission"); } if ( genomeSequence.open() ) { throw VcfFileException("Failed opening index file of the reference."); } } }