/**
 * Load a whitespace-delimited numeric table from a text file into a matrix.
 *
 * Each non-empty line is stored in the row whose index is its 1-based line
 * number minus one. Tokens are converted with atol(), so any fractional part
 * of a value is discarded.
 *
 * @param a         Destination matrix; a.Zero() is called first and the
 *                  matrix is grown as lines are read.
 * @param fileName  Path of the file to read.
 * @return 0 on success; -1 when a line's token count differs from the
 *         current column count (details are printed to stderr/stdout).
 */
int loadMatrix(Matrix& a, String& fileName) {
  a.Zero();
  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;

  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;

    array.Clear();
    array.AddTokens(line);

    if (a.cols != 0 && a.cols != array.Length()) {
      fprintf(stderr, "Wrong column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }
    a.GrowTo(a.rows, array.Length());

    // Grow all the way to the current line number. Growing by a single row
    // was not enough after a skipped blank line and made a[lineNo - 1]
    // index past the end of the matrix.
    // NOTE(review): rows that correspond to blank lines keep whatever
    // GrowTo() leaves in new cells - confirm that is zero.
    if (a.rows < lineNo) {
      a.GrowTo(lineNo, a.cols);
    }

    for (int i = 0; i < array.Length(); i++) {
      a[lineNo - 1][i] = atol(array[i]);
    }
  }
  // a.Print(stdout);
  return 0;
}
// Print two parallel string arrays in full by delegating to the ranged
// overload over [0, length). Both arrays must have the same length;
// otherwise a VcfFileException is thrown.
void VcfHelper::printArrayDoubleJoin(IFILE oFile, const StringArray& arr1, const StringArray& arr2, const char* sep1, const char* sep2, const char* empty) {
  int firstCount = arr1.Length();
  int secondCount = arr2.Length();

  // Guard clause: reject mismatched inputs before doing any output.
  if ( firstCount != secondCount ) {
    throw VcfFileException("Inconsistency between arr1.Length() == %d and arr2.Length() == %d", firstCount, secondCount);
  }

  printArrayDoubleJoin(oFile, arr1, arr2, sep1, sep2, empty, 0, firstCount);
}
// Parse one tab-separated annotation VCF line and register the variant with
// every gene named in its ANNOFULL INFO entry.
//
// buffer: one VCF record; at least 8 tab-separated columns are required
//         (error() is called otherwise).
//
// When func_upper is non-empty, a gene is added only if one of its patterns
// matches (checkPatternMatch) the tokens of that alternative annotation.
void GroupFromAnnotation::addLineFromVcf( String & buffer )
{
    // sample:
    // ANNO=Nonsynonymous:ASB16;
    // ANNOFULL=ASB16/NM_080863:+:Nonsynonymous(CCC/Pro/P->ACC/Thr/T:Base1310/1362:Codon437/454:Exon5/5):Exon
    //   |C17orf65/NM_178542:-:Intron
    StringArray vfield;
    vfield.AddTokens(buffer, "\t");
    if ( vfield.Length() < 8 )
        error("Annotation vcf only has %d columns!\n", vfield.Length());

    StringArray info_semicolon;
    info_semicolon.AddTokens( vfield[7],";" );

    // find ANNOFULL first
    int annofull_index = -1;
    for( int i=0; i<info_semicolon.Length(); i++ ) {
        String iheader = info_semicolon[i].SubStr(0,8);
        if (iheader == "ANNOFULL") {
            annofull_index = i;
            break;
        }
    }
    if (annofull_index == -1) {
        // Report the variant by its CHROM/POS columns. The INFO sub-fields
        // used previously are not a position, and the second one may not
        // even exist.
        printf("warning: no ANNOFULL field at chr%s:%s. Variant won't included in groups!\n", vfield[0].c_str(), vfield[1].c_str());
        return;
    }

    // remove ANNOFULL=
    String anno_full_str = info_semicolon[annofull_index].SubStr(9);

    // check each alternative field
    StringArray alts;
    alts.AddTokens( anno_full_str, "|" );
    for( int a=0; a<alts.Length(); a++ ) {
        StringArray sub;
        sub.AddTokens( alts[a], ":/=");

        // An alternative with no tokens has no gene name to add.
        if ( sub.Length() == 0 )
            continue;

        if (func_upper.Length() != 0) { // match before add
            for(int f =0;f<func_upper.Length();f++) {
                bool pattern_match = checkPatternMatch( sub, func_upper[f] );
                if ( pattern_match ) {
                    addGeneFromVcf( vfield, sub[0] );
                    break;
                }
            }
        }
        else { // no pattern to match: check if intergenic first
            String upper_name = sub[0].ToUpper();
            // NOTE(review): if SlowFind() returns the match offset (or -1),
            // this condition is true only when the name STARTS with
            // "INTERGENIC", i.e. it keeps intergenic entries rather than
            // skipping them. Behaviour left unchanged - confirm the intent.
            if ( !upper_name.SlowFind( "INTERGENIC" ) )
                addGeneFromVcf( vfield, sub[0] );
        }
    }
}
// Fill `values` with exactly `desired` integers parsed from the
// comma-separated list in `input`. When the list is shorter than `desired`
// its entries are reused cyclically; when it is empty, `values` stays all
// zero.
void StringToArray(const String & input, IntArray & values, int desired)
{
    StringArray tokens;
    tokens.AddTokens(input, ',');

    values.Dimension(desired);
    values.Zero();

    int available = tokens.Length();
    if (!available)
        return;

    // `source` walks the token list and wraps around, which is the same as
    // indexing with (slot % available).
    int source = 0;
    for (int slot = 0; slot < desired; slot++)
    {
        values[slot] = tokens[source].AsInteger();
        if (++source == available)
            source = 0;
    }
}
// Read a region list (contig, start, end, optional group name per line) and
// populate the per-contig region tables, the per-group statistics and the
// contig / group name lists. Lines that are empty or start with '#' are
// ignored. error() is called for an unreadable file, a line with fewer than
// three columns, a region whose start is not below its end, or region starts
// that are not in ascending order within a contig.
void GenomeRegionSeqStats::LoadRegionList(String &inputList)
{
    FILE *in = fopen(inputList.c_str(), "r");
    if (in == NULL)
        error("Open region input file %s failed!\n", inputList.c_str());

    StringArray tokens;
    String buffer;
    while (!feof(in))
    {
        buffer.ReadLine(in);
        if (buffer.IsEmpty() || buffer[0] == '#')
            continue;

        tokens.ReplaceTokens(buffer);
        if (tokens.Length() < 3)
            error("Too few columns: %s\n", buffer.c_str());

        // Key used to attach group names to one exact region.
        String CSE = tokens[0] + ":" + tokens[1] + ":" + tokens[2];

        std::pair<int, int> start_end;
        start_end.first = tokens[1].AsInteger();
        start_end.second = tokens[2].AsInteger();

        // positions are 0-based. Otherwise == is valid
        if (start_end.first >= start_end.second)
            error("Region end is equal or smaller than the start: %s!\n", buffer.c_str());

        genomeRegions_lines[tokens[0]].push_back(buffer);
        genomeRegions[tokens[0]].push_back(start_end);
        genomeRegions_currentIndex[tokens[0]] = 0;

        if (tokens.Length() > 3)
        {
            groupStats[tokens[3]].segCount++;
            groupStats[tokens[3]].totalLen += (start_end.second - start_end.first);
            genomeRegionGroups[CSE].push_back(tokens[3]);
        }
    }
    fclose(in);

    // Chromosome info
    contigs.clear();
    std::map<String, vector<std::pair<int, int> > >::iterator contigIt;
    for (contigIt = genomeRegions.begin(); contigIt != genomeRegions.end(); contigIt++)
    {
        contigs.push_back(contigIt->first);

        // Same element as genomeRegions[contigIt->first], without the lookup.
        vector<std::pair<int, int> > & regions = contigIt->second;
        for (unsigned int r = 1; r < regions.size(); r++)
        {
            if (regions[r].first < regions[r-1].first)
                error("Input coordinates are not in order: %s %d %d!\n", contigIt->first.c_str(), regions[r].first, regions[r].second);
        }
    }

    // Group info such as gene names
    groups.clear();
    std::map<String, Stats>::iterator groupIt;
    for (groupIt = groupStats.begin(); groupIt != groupStats.end(); groupIt++)
        groups.push_back(groupIt->first);
}
/**
 * Load a single-column numeric text file into a vector.
 *
 * Each non-blank line is stored at the index given by its 1-based line
 * number minus one. Values are converted with atol(), so any fractional
 * part is discarded.
 *
 * @param a         Destination vector; a.Zero() is called first and the
 *                  vector is grown as lines are read.
 * @param fileName  Path of the file to read.
 * @return 0 on success; -1 when a line holds more than one token (details
 *         are printed to stderr/stdout).
 */
int loadVector(Vector& a, String& fileName) {
  a.Zero();
  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;

  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;

    array.Clear();
    array.AddTokens(line);

    // A line that yields no tokens (e.g. only whitespace) is treated like
    // an empty line; array[0] would not exist.
    if (array.Length() == 0) continue;

    if (array.Length() > 1) {
      fprintf(stderr, "Warning: column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }

    // Grow all the way to the current line number. Growing by one element
    // was not enough after a skipped line and made a[lineNo - 1] index past
    // the end of the vector.
    if (a.dim < lineNo) {
      a.GrowTo(lineNo);
    }
    a[lineNo - 1] = atol(array[0]);
  }
  // a.Print(stdout);
  return 0;
}
void GCContent::LoadRegions(String & regionsFile, GenomeSequence &genome, bool invertRegion) { if(regionsFile.Length()==0) return; if(genome.sequenceLength()==0) error("No reference genome loaded!\n"); IFILE fhRegions; fhRegions = ifopen(regionsFile.c_str(),"r"); if(fhRegions==NULL) error("Open regions file %s failed!\n", regionsFile.c_str()); regionIndicator.resize(genome.sequenceLength()); StringArray tokens; String buffer; int len; fprintf(stderr, "Loading region list..."); while (!ifeof(fhRegions)){ buffer.ReadLine(fhRegions); if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer, WHITESPACE); if(tokens.Length() < 3) continue; genomeIndex_t startGenomeIndex = 0; int chromosomeIndex = tokens[1].AsInteger(); // use chromosome name (token[0]) and position (token[1]) to query genome index. startGenomeIndex = genome.getGenomePosition(tokens[0].c_str(), chromosomeIndex); if(startGenomeIndex >= regionIndicator.size() ) { //fprintf(stderr, "WARNING: region list section %s position %u is not found in the reference and skipped...\n", tokens[0].c_str(), chromosomeIndex); continue; } len = tokens[2].AsInteger() - tokens[1].AsInteger() + 1; for(uint32_t i=startGenomeIndex; i<startGenomeIndex+len; i++) regionIndicator[i] = true; tokens.Clear(); buffer.Clear(); } if (invertRegion) { fprintf(stderr, " invert region..."); for (uint32_t i = 0; i < regionIndicator.size(); i++) { regionIndicator[i] = !regionIndicator[i]; } } ifclose(fhRegions); fprintf(stderr, "DONE!\n"); }
// Write all elements of `arr` to oFile. An empty array prints `empty`, a
// single element is printed on its own, and longer arrays are handed to the
// ranged overload covering [0, length).
void VcfHelper::printArrayJoin(IFILE oFile, const StringArray& arr, const char* sep, const char* empty) {
  int count = arr.Length();

  switch ( count ) {
  case 0:
    ifprintf(oFile,"%s",empty);
    break;
  case 1:
    ifprintf(oFile,"%s",arr[0].c_str());
    break;
  default:
    printArrayJoin(oFile,arr,sep,empty,0,count);
    break;
  }
}
bool MarkovParameters::ReadCrossoverRates(const char * filename) { StringArray tokens; StringArray rec; rec.Read(filename); // Load estimated per marker error rates if (rec.Length() == markers) { printf(" Updating error rates using data in %s ...\n", (const char *) filename); for (int i = 0; i < markers; i++) { tokens.ReplaceTokens(rec[i+1]); if (tokens.Length() >= 2) R[i] = tokens[1].AsDouble(); } return true; } return false; }
// Return true when, for at least one element of `sub` that is no shorter
// than `func`, the upper-cased element gives SlowFind(func) == 0.
// Elements of `sub` are copied before upper-casing.
bool GroupFromAnnotation::checkPatternMatch( StringArray & sub, String & func )
{
    for( int idx = 0; idx < sub.Length(); idx++ ) {
        // Too short to contain the pattern at all.
        if ( sub[idx].Length() < func.Length() )
            continue;

        String candidate = sub[idx];
        String upper_candidate = candidate.ToUpper();
        if ( upper_candidate.SlowFind( func ) == 0 )
            return true;
    }
    return false;
}
// Set the fields from the passed in line. // Return true if successfully set. bool SamHeaderRecord::setFields(const StringArray& tokens) { bool status = true; // Loop through the tags for this type. // The tags start in column 1 since column 0 contains the type. for(int columnIndex = 1; columnIndex < tokens.Length(); columnIndex++) { // Validate that the tag is at least 3 characters. Two for the token, // one for the ':'. if((tokens[columnIndex].Length() < 3) || (tokens[columnIndex][2] != ':')) { // Continue to the next tag, this one is too small/invalid. status = false; std::cerr << "ERROR: Poorly formatted tag in header: " << tokens[columnIndex] << std::endl; continue; } // Get the tag from the token. char tag[3]; tag[0] = tokens[columnIndex][0]; tag[1] = tokens[columnIndex][1]; tag[2] = 0; // The tag value is the rest of the substring. String tagValue = (tokens[columnIndex]).SubStr(3); // Set the tag. status &= setTag(tag, tagValue.c_str()); } status &= isValid(); return(status); }
int VcfMac::execute(int argc, char **argv) { String inputVcf = ""; int minAC = -1; String sampleSubset = ""; String filterList = ""; bool params = false; IntervalTree<int> regions; std::vector<int> intersection; // Read in the parameters. ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inputVcf) LONG_PARAMETER_GROUP("Optional Parameters") LONG_STRINGPARAMETER("sampleSubset", &sampleSubset) LONG_INTPARAMETER("minAC", &minAC) LONG_STRINGPARAMETER("filterList", &filterList) LONG_PARAMETER("params", ¶ms) LONG_PHONEHOME(VERSION) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // Check that all files were specified. if(inputVcf == "") { usage(); inputParameters.Status(); std::cerr << "Missing \"--in\", a required parameter.\n\n"; return(-1); } if(params) { inputParameters.Status(); } // Open the two input files. VcfFileReader inFile; VcfHeader header; VcfRecord record; // Open the file if(sampleSubset.IsEmpty()) { inFile.open(inputVcf, header); } else { inFile.open(inputVcf, header, sampleSubset, NULL, NULL); } // Add the discard rule for minor allele count. if(minAC >= 0) { inFile.addDiscardMinMinorAlleleCount(minAC, NULL); } if(!filterList.IsEmpty()) { // Open the filter list. IFILE regionFile = ifopen(filterList, "r"); String regionLine; StringArray regionColumn; int start; int end; int intervalVal = 1; if(regionFile == NULL) { std::cerr << "Failed to open " << filterList << ", so keeping all positions\n"; filterList.Clear(); } else { while( regionFile->isOpen() && !regionFile->ifeof()) { // Read the next interval regionLine.Clear(); regionLine.ReadLine(regionFile); if(regionLine.IsEmpty()) { // Nothing on this line, continue to the next. 
continue; } regionColumn.ReplaceColumns(regionLine, ' '); if(regionColumn.Length() != 2) { std::cerr << "Improperly formatted region line: " << regionLine << "; skipping to the next line.\n"; continue; } // Convert the columns to integers. if(!regionColumn[0].AsInteger(start)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, start position " << "(1st column) is not an integer: " << regionColumn[0] << "; Skipping to the next line.\n"; continue; } if(!regionColumn[1].AsInteger(end)) { // The start position (1st column) is not an integer. std::cerr << "Improperly formatted region line, end position " << "(2nd column) is not an integer: " << regionColumn[1] << "; Skipping to the next line.\n"; continue; } // Add 1-based inclusive intervals. regions.add(start,end, intervalVal); } } } int numReadRecords = 0; while( inFile.readRecord(record)) { if(!filterList.IsEmpty()) { // Check if the region should be kept. intersection.clear(); regions.get_intersecting_intervals(record.get1BasedPosition(), intersection); if(intersection.empty()) { // not in the interval, so continue to the next record. continue; } } ++numReadRecords; // Loop through the number of possible alternates. unsigned int numAlts = record.getNumAlts(); int minAlleleCount = -1; int curAlleleCount = 0; int totalAlleleCount = 0; for(unsigned int i = 0; i <= numAlts; i++) { curAlleleCount = record.getAlleleCount(i); if((minAlleleCount == -1) || (curAlleleCount < minAlleleCount)) { minAlleleCount = curAlleleCount; } totalAlleleCount += curAlleleCount; } if(totalAlleleCount != 0) { double maf = (double)minAlleleCount/totalAlleleCount; std::cout << record.getIDStr() << "\t" << minAlleleCount << "\t" << maf << "\n"; } } inFile.close(); // std::cerr << "\n\t# Records: " << numReadRecords << "\n"; // return success. return(0); }
// Run the single-file CalcRegionStats overload on every entry of
// `bamFiles`, in order.
void GenomeRegionSeqStats::CalcRegionStats(StringArray &bamFiles)
{
    int fileIdx = 0;
    while (fileIdx < bamFiles.Length())
    {
        CalcRegionStats(bamFiles[fileIdx]);
        ++fileIdx;
    }
}
// Run the single-file CalcClusters overload on every entry of `bamFiles`,
// in order, passing the same minimum mapping quality each time.
void GenomeRegionSeqStats::CalcClusters(StringArray &bamFiles, int minMapQuality)
{
    int fileIdx = 0;
    while (fileIdx < bamFiles.Length())
    {
        CalcClusters(bamFiles[fileIdx], minMapQuality);
        ++fileIdx;
    }
}
void GroupFromAnnotation::GetGroupFromFile(FILE * log) { //Fill in annoGroups. StringArray tmp; FILE * file = fopen(groupFile,"r"); if(file==NULL) { printf("ERROR! Cannot open group file %s.\n",groupFile.c_str()); error("ERROR! Cannot open group file %s.\n",groupFile.c_str()); } String buffer; int line = 0; while (!feof(file)) { buffer.ReadLine(file); tmp.Clear(); tmp.AddTokens(buffer, SEPARATORS); if(tmp.Length()==0) continue; annoGroups.Push(tmp[0]); chrom.Push(tmp[1]); line++; } fclose(file); //Fill in SNPlist. SNPlist = new StringArray [line]; SNPNoAllele = new StringArray [line]; FILE * samefile = fopen(groupFile,"r"); line = 0; Vector pos; while (!feof(samefile)) { buffer.ReadLine(samefile); tmp.Clear(); pos.Clear(); tmp.AddTokens(buffer, "\t "); SNPlist[line].Dimension(0); SNPNoAllele[line].Dimension(0); for(int i=1;i<tmp.Length();i++) { SNPlist[line].Push(tmp[i]); StringArray sub; sub.Clear(); sub.AddTokens(tmp[i],":_/"); if(sub.Length()!=4) { printf("Warning: group %s has a variant %s that has invalid format. The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str()); fprintf(log,"Warning: group %s has a variant %s that has invalid format. 
The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str()); continue; } pos.Push(sub[1].AsInteger()); SNPNoAllele[line].Push(sub[0] + ":" + sub[1]); } //sort SNPlist[line] and SNPNoAllele[line] if(SNPlist[line].Length()>1) { Vector sorted_pos,order; sorted_pos.Copy(pos); sorted_pos.Sort(); order.Dimension(pos.Length()); for(int i=0;i<sorted_pos.Length();i++) { for(int j=0;j<pos.Length();j++) { if(sorted_pos[i]==pos[j]) { order[i]=j; break; } } } StringArray cp_SNPlist,cp_SNPNoAllele; cp_SNPlist.Dimension(SNPlist[line].Length()); cp_SNPNoAllele.Dimension(SNPNoAllele[line].Length()); for(int l=0;l<SNPlist[line].Length();l++) { cp_SNPlist[l] = SNPlist[line][l]; cp_SNPNoAllele[l] = SNPNoAllele[line][l]; } for(int i=0;i<order.Length();i++) { SNPlist[line][i] = cp_SNPlist[order[i]]; //printf("%s\t",SNPlist[line][i].c_str()); SNPNoAllele[line][i] = cp_SNPNoAllele[order[i]] ; } //printf("\n"); } line++; } fclose(samefile); }
bool FilterStat::appendStatVcf(const char* file) { VcfFile vcf; vcf.setSiteOnly(false); vcf.setParseValues(true); vcf.setParseGenotypes(false); vcf.setParseDosages(false); vcf.openForRead(file,1); VcfMarker* pMarker; StringArray tok; for( int i=0, j=0; vcf.iterateMarker(); ++i, ++j ) { pMarker = vcf.getLastMarker(); if ( sChrom.Compare(pMarker->sChrom) != 0 ) { Logger::gLogger->error("Chromosome name does not match - %s vs %s",sChrom.c_str(),pMarker->sChrom.c_str()); } while ( vPos[j] < pMarker->nPos ) { ++j; } if ( vPos[j] > pMarker->nPos ) { Logger::gLogger->error("Position %s:%d is not observed in the anchor VCF",sChrom.c_str(),pMarker->nPos); } std::vector<int> vAlleles; std::vector<int> vStrands; //fprintf(stderr,"%s:%d\t%s\n",pMarker->sChrom.c_str(),pMarker->nPos,pMarker->asFormatKeys[0].c_str()); for(int k=0; k < pMarker->asFormatKeys.Length(); ++k) { if ( pMarker->asFormatKeys[k].Compare("BASE") == 0 ) { tok.ReplaceColumns(pMarker->asSampleValues[k],','); for(int l=0; l < tok.Length(); ++l) { if ( tok[l].Compare(vAl1[j].c_str()) == 0 ) { vAlleles.push_back(0); } else if ( tok[l].Compare(vAl2[j].c_str()) == 0 ) { vAlleles.push_back(1); } else { vAlleles.push_back(2); } } } else if ( pMarker->asFormatKeys[k].Compare("STRAND") == 0 ) { tok.ReplaceColumns(pMarker->asSampleValues[k],','); for(int l=0; l < tok.Length(); ++l) { if ( tok[l].Compare("F") == 0 ) { vStrands.push_back(0); } else { vStrands.push_back(1); } } } } //fprintf(stderr,"%s:%d\t%d",pMarker->sChrom.c_str(),pMarker->nPos,(int)vAlleles.size()); //for(int k=0; k < (int) vAlleles.size(); ++k) { // fprintf(stderr,"\t%d",vAlleles[k]*2+vStrands[k]); //} //fprintf(stderr,"\n"); if ( vAlleles.size() != vStrands.size() ) { Logger::gLogger->error("Alleles and Strands do not match in size at %s:%d, in %s",pMarker->sChrom.c_str(), pMarker->nPos, file); } // updates the counts - needs synchronization { //boost::mutex::scoped_lock lock(mutex); for(int k=0; k < (int) vAlleles.size(); ++k) { 
++(vCounts[FILTER_STAT_COUNTS*j + vAlleles[k]*2 + vStrands[k]]); } } } return true; }
int main(int argc, char ** argv) { setbuf(stdout, NULL); time_t start = time(NULL); printf("MiniMac - Imputation into phased haplotypes\n" "(c) 2011 Goncalo Abecasis\n"); #ifdef __VERSION__ printf("VERSION 5.0\n"); #else printf("UNDOCUMENTED RELEASE\n"); #endif int rounds = 5, states = 200, cpus = 0; bool em = false, gzip = false, phased = false; String referenceHaplotypes, referenceSnps; String haplotypes, snps; String prefix("minimac"); String firstMarker, lastMarker; String recombinationRates, errorRates; BEGIN_LONG_PARAMETERS(longParameters) LONG_PARAMETER_GROUP("Reference Haplotypes") LONG_STRINGPARAMETER("refHaps", &referenceHaplotypes) LONG_STRINGPARAMETER("refSnps", &referenceSnps) LONG_PARAMETER_GROUP("Target Haplotypes") LONG_STRINGPARAMETER("haps", &haplotypes) LONG_STRINGPARAMETER("snps", &snps) LONG_PARAMETER_GROUP("Starting Parameters") LONG_STRINGPARAMETER("rec", &recombinationRates) LONG_STRINGPARAMETER("erate", &errorRates) LONG_PARAMETER_GROUP("Parameter Fitting") LONG_INTPARAMETER("rounds", &rounds) LONG_INTPARAMETER("states", &states) LONG_PARAMETER("em", &em) LONG_PARAMETER_GROUP("Output Files") LONG_STRINGPARAMETER("prefix", &prefix) LONG_PARAMETER("phased", &phased) LONG_PARAMETER("gzip", &gzip) // LONG_PARAMETER_GROUP("Clipping Window") // LONG_STRINGPARAMETER("start", &firstMarker) // LONG_STRINGPARAMETER("stop", &lastMarker) #ifdef _OPENMP LONG_PARAMETER_GROUP("Multi-Threading") LONG_INTPARAMETER("cpus", &cpus) #endif END_LONG_PARAMETERS(); ParameterList pl; pl.Add(new LongParameters("Command Line Options", longParameters)); pl.Read(argc, argv); pl.Status(); #ifdef _OPENMP if (cpus > 0) omp_set_num_threads(cpus); #endif // Read marker list printf("Reading Reference Marker List ...\n"); StringArray refMarkerList; refMarkerList.Read(referenceSnps); // Index markers StringIntHash referenceHash; for (int i = 0; i < refMarkerList.Length(); i++) referenceHash.Add(refMarkerList[i].Trim(), i); printf(" %d Markers in Reference Haplotypes...\n\n", 
refMarkerList.Length()); // Load reference haplotypes printf("Loading reference haplotypes ...\n"); HaplotypeSet reference; reference.markerCount = refMarkerList.Length(); reference.LoadHaplotypes(referenceHaplotypes); printf(" %d Reference Haplotypes Loaded ...\n\n", reference.count); // Read framework marker list printf("Reading Framework Marker List ...\n"); StringArray markerList; markerList.Read(snps); ClipReference(reference, refMarkerList, referenceHash, markerList, firstMarker, lastMarker); // Crossref Marker Names to Reference Panel Positions IntArray markerIndex; markerIndex.Dimension(markerList.Length()); int matches = 0; for (int i = 0; i < markerList.Length(); i++) { markerIndex[i] = referenceHash.Integer(markerList[i].Trim()); if (markerIndex[i] >= 0) matches++; } printf(" %d Markers in Framework Haplotypes Overlap Reference ...\n", matches); if (matches == 0) error("No markers overlap between target and reference\n" "Please check correct reference is being used and markers are named consistently"); printf(" %d Other Markers in Framework Haplotypes Discarded ...\n\n", markerList.Length() - matches); // Check for flips in reference vs. 
target haplotypes int flips = 0; int previous = -1; for (int i = 0; i < markerIndex.Length(); i++) if (markerIndex[i] >= 0) if (markerIndex[i] < previous) { if (flips++ < 10) printf(" -> Marker %s precedes %s in reference, but follows it in target\n", (const char *) refMarkerList[previous], (const char *) markerList[i]); previous = markerIndex[i]; } if (flips > 10) printf(" -> %d Additional Marker Order Changes Not Listed\n", flips - 10); if (flips) printf(" %d Marker Pairs Change Order in Target vs Framework Haplotypes\n", flips); // Load target haplotypes printf("Loading target haplotypes ...\n"); HaplotypeSet target; target.markerCount = markerList.Length(); target.LoadHaplotypes(haplotypes, true); reference.CalculateFrequencies(); target.CalculateFrequencies(); target.CompareFrequencies(reference, markerIndex, markerList); printf(" %d Target Haplotypes Loaded ...\n\n", target.count); int startIndex = firstMarker.IsEmpty() ? 0 : referenceHash.Integer(firstMarker); int stopIndex = lastMarker.IsEmpty() ? reference.markerCount - 1 : referenceHash.Integer(lastMarker); if (startIndex < 0 || stopIndex < 0) error("Clipping requested, but no position available for one of the endpoints"); printf("Setting up Markov Model...\n\n"); // Setup Markov Model MarkovParameters mp; mp.Allocate(reference.markerCount); if (rounds > 0) printf("Initializing Model Parameters (using %s and up to %d haplotypes)\n", em ? 
"E-M" : "MCMC", states); // Simple initial estimates of error and recombination rate for (int i = 0; i < reference.markerCount; i++) mp.E[i] = 0.01; for (int i = 0; i < reference.markerCount - 1; i++) mp.R[i] = 0.001; if (mp.ReadErrorRates(errorRates)) printf(" Updated error rates using data in %s ...\n", (const char *) errorRates); if (mp.ReadCrossoverRates(recombinationRates)) printf(" Updated recombination rates using %s ...\n", (const char *) recombinationRates); // Parameter estimation loop for (int round = 0; round < rounds; round++) { printf(" Round %d of Parameter Refinement ...\n", round + 1); int iterations = states < reference.count ? states : reference.count; MarkovModel original; original.CopyParameters(mp); #pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm; mm.Allocate(reference.markerCount, reference.count - 1); mm.CopyParameters(original); // Reference leave one out (loo) panel char ** reference_loo = new char * [reference.count - 1]; for (int in = 0, out = 0; in < reference.count; in++) if (in != i) reference_loo[out++] = reference.haplotypes[in]; mm.WalkLeft(reference.haplotypes[i], reference_loo, reference.freq); if (em) mm.CountExpected(reference.haplotypes[i], reference_loo, reference.freq); else { #pragma omp critical { mm.ProfileModel(reference.haplotypes[i], reference_loo, reference.freq); } } delete [] reference_loo; #pragma omp critical mp += mm; } if (round >= rounds / 2) { int iterations = states < target.count ? 
states : target.count; #pragma omp parallel for for (int i = 0; i < iterations; i++) { MarkovModel mm; mm.Allocate(reference.markerCount, reference.count); mm.CopyParameters(original); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int k = 0; k < reference.markerCount; k++) padded[k] = 0; // Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[i][j]; mm.WalkLeft(padded, reference.haplotypes, reference.freq); if (em) mm.CountExpected(padded, reference.haplotypes, reference.freq); else { #pragma omp critical { mm.ProfileModel(padded, reference.haplotypes, reference.freq); } } delete [] padded; #pragma omp critical mp += mm; } } mp.UpdateModel(); double crossovers = 0; for (int i = 0; i < reference.markerCount - 1; i++) crossovers += mp.R[i]; double errors = 0; for (int i = 0; i < reference.markerCount; i++) { double heterozygosity = 1.0 - square(reference.freq[1][i]) - square(reference.freq[2][i]) - square(reference.freq[3][i]) - square(reference.freq[4][i]); errors += mp.E[i] * heterozygosity; } errors /= reference.markerCount + 1e-30; printf(" %.0f mosaic crossovers expected per haplotype\n", crossovers); printf(" %.1f%% of crossovers are due to reference flips\n", mp.empiricalFlipRate * 100.); printf(" %.3g errors in mosaic expected per marker\n", errors); } if (rounds > 0) { printf(" Saving estimated parameters for future use ...\n"); mp.WriteParameters(refMarkerList, prefix, gzip); } printf("\n"); // List the major allele at each location reference.ListMajorAlleles(); printf("Generating Draft .info File ...\n\n"); // Output some basic information IFILE info = ifopen(prefix + ".info.draft", "wt"); ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tGenotyped\n"); for (int i = 0, j = 0; i <= stopIndex; i++) if (i >= startIndex) ifprintf(info, "%s\t%s\t%s\t%.4f\t%s\n", (const char *) refMarkerList[i], 
reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i), reference.freq[reference.major[i]][i], j < markerIndex.Length() && i == markerIndex[j] ? (j++, "Genotyped") : "-"); else if (j < markerIndex.Length() && i == markerIndex[j]) j++; ifclose(info); printf("Imputing Genotypes ...\n"); IFILE dosages = ifopen(prefix + ".dose" + (gzip ? ".gz" : ""), "wt"); IFILE hapdose, haps; if (phased) { hapdose = ifopen(prefix + ".hapDose" + (gzip ? ".gz" : ""), "wt"); haps = ifopen(prefix + ".haps" + (gzip ? ".gz" : ""), "wt"); } ImputationStatistics stats(reference.markerCount); // Impute each haplotype #pragma omp parallel for for (int i = 0; i < target.count; i++) { if (i != 0 && target.labels[i] == target.labels[i-1]) continue; MarkovModel mm; mm.Allocate(reference.markerCount, reference.count); mm.ClearImputedDose(); mm.CopyParameters(mp); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int j = 0; j < reference.markerCount; j++) padded[j] = 0; int k = i; do { printf(" Processing Haplotype %d of %d ...\n", k + 1, target.count); // Copy current haplotype into padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = target.haplotypes[k][j]; mm.WalkLeft(padded, reference.haplotypes, reference.freq); mm.Impute(reference.major, padded, reference.haplotypes, reference.freq); #pragma omp critical { stats.Update(mm.imputedHap, mm.leaveOneOut, padded, reference.major); } #pragma omp critical if (phased) { ifprintf(hapdose, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); ifprintf(haps, "%s\tHAPLO%d", (const char *) target.labels[i], k - i + 1); for (int j = startIndex; j <= stopIndex; j++) { ifprintf(hapdose, "\t%.3f", mm.imputedHap[j]); ifprintf(haps, "%s%c", j % 8 == 0 ? 
" " : "", mm.imputedAlleles[j]); } ifprintf(hapdose, "\n"); ifprintf(haps, "\n"); } k++; } while (k < target.count && target.labels[k] == target.labels[i]); printf(" Outputting Individual %s ...\n", (const char *) target.labels[i]); #pragma omp critical { ifprintf(dosages, "%s\tDOSE", (const char *) target.labels[i]); for (int j = startIndex; j <= stopIndex; j++) ifprintf(dosages, "\t%.3f", mm.imputedDose[j]); ifprintf(dosages, "\n"); } delete [] padded; } ifclose(dosages); if (phased) { ifclose(hapdose); ifclose(haps); } // Output some basic information info = ifopen(prefix + ".info" + (gzip ? ".gz" : ""), "wt"); ifprintf(info, "SNP\tAl1\tAl2\tFreq1\tMAF\tAvgCall\tRsq\tGenotyped\tLooRsq\tEmpR\tEmpRsq\tDose1\tDose2\n"); // Padded version of target haplotype, including missing sites char * padded = new char [reference.markerCount]; for (int k = 0; k < reference.markerCount; k++) padded[k] = 0; // Mark genotyped SNPs in padded vector for (int j = 0; j < target.markerCount; j++) if (markerIndex[j] >= 0) padded[markerIndex[j]] = 1; for (int i = startIndex; i <= stopIndex; i++) { ifprintf(info, "%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%.5f\t", (const char *) refMarkerList[i], reference.MajorAlleleLabel(i), reference.MinorAlleleLabel(i), stats.AlleleFrequency(i), stats.AlleleFrequency(i) > 0.5 ? 1.0 - stats.AlleleFrequency(i) : stats.AlleleFrequency(i), stats.AverageCallScore(i), stats.Rsq(i)); if (padded[i]) ifprintf(info, "Genotyped\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n", stats.LooRsq(i), stats.EmpiricalR(i), stats.EmpiricalRsq(i), stats.LooMajorDose(i), stats.LooMinorDose(i)); else ifprintf(info, "-\t-\t-\t-\t-\t-\n"); } ifclose(info); delete [] padded; time_t stop = time(NULL); int seconds = stop - start; printf("\nRun completed in %d hours, %d mins, %d seconds on %s\n\n", seconds / 3600, (seconds % 3600) / 60, seconds % 60, ctime(&stop)); }
void BedFile::openForRead(const char* bedFile, const char* bimFile, const char* famFile, const char* refFile, int nbuf) { StringArray tokens; reset(); iFile = ifopen(bedFile,"rb"); if ( iFile == NULL ) { throw VcfFileException("Failed opening file %s - %s",bedFile,strerror(errno)); } // read magic numbers char magicNumbers[3] = {0x6c,0x1b,0x01}; char firstThreeBytes[3]; ifread( iFile, firstThreeBytes, 3 ); for(int i=0; i < 3; ++i) { if ( firstThreeBytes[i] != magicNumbers[i] ) { throw VcfFileException("The magic numbers do not match in BED file %s",bedFile); } } iBimFile = ifopen(bimFile,"rb"); iFamFile = ifopen(famFile,"rb"); sRefFile = refFile; while( 1 ) { int ret = line.ReadLine(iFamFile); if ( ret <= 0 ) break; tokens.ReplaceTokens(line, " \t\r\n"); if ( tokens.Length() < 5 ) { throw VcfFileException("Less then 5 columns are observed in FAM file"); } VcfInd* p = new VcfInd(tokens[1],tokens[0],tokens[2],tokens[3],tokens[4]); vpVcfInds.push_back(p); } //Logger::gLogger->writeLog("Finished loading %d individuals from FAM file",(int)vpVcfInds.size()); nBytes = (vpVcfInds.size()+3)/4; if ( pBedBuffer != NULL ) { delete[] pBedBuffer; } pBedBuffer = new char[nBytes]; nBuffers = nbuf; nNumMarkers = 0; nHead = 0; bParseGenotypes = true; bParseDosages = false; bParseValues = false; if ( nBuffers == 0 ) { // infinite buffer size // do not set size of markers } else { vpVcfMarkers.resize( nBuffers ); for(int i=0; i < nBuffers; ++i) { VcfMarker* p = new VcfMarker; vpVcfMarkers[i] = p; } } genomeSequence.setReferenceName(sRefFile.c_str()); genomeSequence.useMemoryMap(true); //Logger::gLogger->writeLog("Loading reference file %s",sRefFile.c_str()); if ( genomeSequence.open() ) { // write a message that new index file is being created if ( genomeSequence.create(false) ) { throw VcfFileException("Failed creating index file of the reference. 
Please check the file permission"); } if ( genomeSequence.open() ) { throw VcfFileException("Failed opening index file of the reference."); } } }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
void ClipReference(HaplotypeSet & reference, StringArray & refMarkerList, StringIntHash & referenceHash, StringArray & markerList, String & start, String & stop) { if (start == "start") start.Clear(); if (stop == "stop") stop.Clear(); // If no clipping was requested, then nothing to do if (start.IsEmpty() && stop.IsEmpty()) return; // Find the stretch of target that overlaps with reference int firstMatch = reference.markerCount, lastMatch = -1; bool matchStart = false, matchStop = false; String newStart, newStop; // First we find overlapping markers in target and, at the same time, // keep track of the marker nearest suggested start and stop positions // that overlaps with reference. for (int i = 0; i < markerList.Length(); i++) { String trimmed = markerList[i].Trim(); if (start == trimmed) matchStart = true; if (stop == trimmed) matchStop = true; int index = referenceHash.Integer(trimmed); if (index < 0) continue; if (index < firstMatch) firstMatch = index; if (index > lastMatch) lastMatch = index; if (matchStart) { newStart = trimmed; matchStart = false; } if (matchStop) { newStop = trimmed; matchStop = false; } } // If start and stop are not in the reference, adjust them // according to information in the target list int startIndex = referenceHash.Integer(start); int stopIndex = referenceHash.Integer(stop); if (startIndex < 0 && !start.IsEmpty()) { if (newStart.IsEmpty()) return; start = newStart; startIndex = referenceHash.Integer(start); } firstMatch = firstMatch < startIndex ? firstMatch : startIndex; if (stopIndex < 0 && !stop.IsEmpty()) { if (newStop.IsEmpty()) return; stop = newStop; stopIndex = referenceHash.Integer(stop); } lastMatch = lastMatch > stopIndex ? lastMatch : stopIndex; int clipFrom = !start.IsEmpty() ? firstMatch : 0; int clipTo = !stop.IsEmpty() ? 
lastMatch : reference.markerCount - 1; if (clipFrom > 0 || clipTo < reference.markerCount - 1) { printf(" Clipping reference haplotypes to match target ...\n"); reference.ClipHaplotypes(clipFrom, clipTo); StringArray newMarkerList; newMarkerList.Dimension(reference.markerCount); for (int i = clipFrom; i <= clipTo; i++) newMarkerList[i - clipFrom].Swap(refMarkerList[i]); newMarkerList.Swap(refMarkerList); referenceHash.Clear(); for (int i = 0; i < refMarkerList.Length(); i++) referenceHash.Add(refMarkerList[i].Trim(), i); printf(" %d Markers Remain After Clipping ...\n", reference.markerCount); } }
bool RegressionAnalysis::ReadModelsFromFile() { StringArray models; models.Read(modelsFile); if (models.Length() == 0) return false; regress = new FancyRegression[models.Length()]; printf("Retrieving analysis models from file [%s]...\n", (const char *) modelsFile); modelCount = 0; StringArray tokens; for (int i = 0, line = 0; i < models.Length(); i++) { models[i].Trim(); // Skip comments if (models[i][0] == '#') continue; // Divide each line into tokens tokens.Clear(); tokens.AddTokens(models[i]); // Skip blank lines if (tokens.Length() == 0) continue; // Print message for tracing... printf(" Input: %s\n", (const char *) models[i], line++); // Need a minimum of four tokens per line if (tokens.Length() < 4) { printf(" Skipped: Trait name, mean, variance and heritability required.\n"); continue; } regress[modelCount].trait = ped.LookupTrait(tokens[0]); if (regress[modelCount].trait < 0) { printf(line == 1 ? " Skipped: Appears to be a header line\n" : " Skipped: Trait %s not listed in the data file\n", (const char *) tokens[0]); continue; } // First check that mean, variance and heritability are valid numbers bool fail = false; for (int j = 1; j <= 3; j++) { char * ptr = NULL; strtod(tokens[j], &ptr); fail |= ptr[0] != 0; } // If one of the values is not a valid number, skip if (fail) { printf(line == 1 ? 
" Skipped: Appears to be a header line\n" : " Skipped: Invalid numeric format\n"); continue; } regress[modelCount].mean = tokens[1]; regress[modelCount].variance = tokens[2]; regress[modelCount].heritability = tokens[3]; if (tokens.Length() > 4) { regress[modelCount].label = tokens[4]; for (int j = 5; j < tokens.Length(); j++) { regress[modelCount].label += " "; regress[modelCount].label += tokens[j]; } } else regress[modelCount].label.printf("Model %d", modelCount + 1); regress[modelCount].shortLabel = regress[modelCount].label; regress[modelCount].testRetestCorrel = testRetestCorrel; regress[modelCount].bounded = !unrestricted; printf(" Model loaded and labelled %s\n", (const char *) regress[modelCount].label); modelCount++; } if (modelCount == 0) { printf("No valid models, default model will be used\n\n"); return false; } printf("Table processed. %d models recognized\n\n", modelCount); return true; }
void GroupFromAnnotation::vcfInitialize() { // func_upper if ( function != "" ) { func_upper.AddTokens( function, "/" ); for( int i=0; i<func_upper.Length(); i++ ) func_upper[i] = func_upper[i].ToUpper(); } FILE * inFile; inFile = fopen(vcfInput,"r"); while (!feof(inFile)) { String buffer; buffer.ReadLine( inFile); if ( buffer[0] == '#' ) continue; StringArray vfield; vfield.AddTokens(buffer, "\t"); if ( vfield.Length() < 8 ) error("Annotation vcf only has %d columns!\n", vfield.Length()); StringArray info_semicolon; info_semicolon.AddTokens( vfield[7],";" ); int annofull_index = -1; for( int i=0; i<info_semicolon.Length(); i++ ) { String iheader = info_semicolon[i].SubStr(0,8); if (iheader == "ANNOFULL") { annofull_index = i; break; } } if (annofull_index == -1) continue; String anno_full_str = info_semicolon[annofull_index].SubStr(9); StringArray alts; alts.AddTokens( anno_full_str, "|" ); for( int a=0; a<alts.Length(); a++ ) { StringArray sub; sub.AddTokens( alts[a], ":/="); if (func_upper.Length() != 0) { // match before add for(int f =0;f<func_upper.Length();f++) { bool pattern_match = checkPatternMatch( sub, func_upper[f] ); if ( pattern_match ) { chrom.Push( vfield[0] ); addGeneToGroupHash( sub[0] ); break; } } } else { // no pattern to match chrom.Push( vfield[0] ); addGeneToGroupHash( sub[0] ); } } } // vectors SNPlist = new StringArray [geneCount]; SNPNoAllele = new StringArray [geneCount]; pos = new Vector [geneCount]; }