/**
 * Read a whitespace-delimited table of numbers from a text file into a matrix.
 *
 * Each non-blank line of the file becomes one matrix row and every token on
 * the line becomes one cell.  Lines that are empty or contain no tokens are
 * skipped and do not consume a row.
 *
 * @param a         Destination matrix.  a.Zero() is called first; the matrix
 *                  is then grown as rows are read.  If `a` already has a
 *                  non-zero column count, every line must match it.
 * @param fileName  Path of the file to read.
 * @return 0 on success, -1 if a line's token count differs from the column
 *         count established so far (a diagnostic is printed).
 */
int loadMatrix(Matrix& a, String& fileName) {
  a.Zero();

  // NOTE(review): nothing here verifies that the file was opened successfully.
  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;  // 1-based line number in the file; used for diagnostics only
  int row = 0;     // index of the next matrix row to fill

  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;

    array.Clear();
    array.AddTokens(line);
    // A line that yields no tokens is treated like a blank line.
    if (array.Length() == 0) continue;

    if (a.cols != 0 && a.cols != array.Length()) {
      fprintf(stderr, "Wrong column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }
    a.GrowTo(a.rows, array.Length());

    // Rows are indexed by the number of data lines seen, not by the file line
    // number: skipped blank lines would otherwise push the index past the
    // last allocated row.
    if (a.rows <= row) {
      a.GrowTo(row + 1, a.cols);
    }

    // atol parses integers only, so any fractional part of a token is dropped.
    for (int i = 0; i < array.Length(); i++) {
      a[row][i] = atol(array[i]);
    }
    row++;
  }
  // a.Print(stdout);
  return 0;
}
/**
 * Read a single-column text file of numbers into a vector.
 *
 * Each non-blank line must hold exactly one token, which becomes the next
 * element of the vector.  Lines that are empty or contain no tokens are
 * skipped and do not consume an element.
 *
 * @param a         Destination vector.  a.Zero() is called first; the vector
 *                  is then grown as values are read.
 * @param fileName  Path of the file to read.
 * @return 0 on success, -1 if a line holds more than one token (a diagnostic
 *         is printed).
 */
int loadVector(Vector& a, String& fileName) {
  a.Zero();

  // NOTE(review): nothing here verifies that the file was opened successfully.
  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;  // 1-based line number in the file; used for diagnostics only
  int slot = 0;    // index of the next vector element to fill

  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;

    array.Clear();
    array.AddTokens(line);
    // A line that yields no tokens has no array[0] to read; treat it as blank.
    if (array.Length() == 0) continue;

    if (array.Length() > 1) {
      fprintf(stderr, "Warning: column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }

    // Elements are indexed by the number of data lines seen, not by the file
    // line number: skipped blank lines would otherwise push the index past
    // the end of the vector.
    if (a.dim <= slot) {
      a.GrowTo(slot + 1);
    }

    // atol parses integers only, so any fractional part of the token is dropped.
    a[slot] = atol(array[0]);
    slot++;
  }
  // a.Print(stdout);
  return 0;
}
void GCContent::LoadRegions(String & regionsFile, GenomeSequence &genome, bool invertRegion) { if(regionsFile.Length()==0) return; if(genome.sequenceLength()==0) error("No reference genome loaded!\n"); IFILE fhRegions; fhRegions = ifopen(regionsFile.c_str(),"r"); if(fhRegions==NULL) error("Open regions file %s failed!\n", regionsFile.c_str()); regionIndicator.resize(genome.sequenceLength()); StringArray tokens; String buffer; int len; fprintf(stderr, "Loading region list..."); while (!ifeof(fhRegions)){ buffer.ReadLine(fhRegions); if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer, WHITESPACE); if(tokens.Length() < 3) continue; genomeIndex_t startGenomeIndex = 0; int chromosomeIndex = tokens[1].AsInteger(); // use chromosome name (token[0]) and position (token[1]) to query genome index. startGenomeIndex = genome.getGenomePosition(tokens[0].c_str(), chromosomeIndex); if(startGenomeIndex >= regionIndicator.size() ) { //fprintf(stderr, "WARNING: region list section %s position %u is not found in the reference and skipped...\n", tokens[0].c_str(), chromosomeIndex); continue; } len = tokens[2].AsInteger() - tokens[1].AsInteger() + 1; for(uint32_t i=startGenomeIndex; i<startGenomeIndex+len; i++) regionIndicator[i] = true; tokens.Clear(); buffer.Clear(); } if (invertRegion) { fprintf(stderr, " invert region..."); for (uint32_t i = 0; i < regionIndicator.size(); i++) { regionIndicator[i] = !regionIndicator[i]; } } ifclose(fhRegions); fprintf(stderr, "DONE!\n"); }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
bool RegressionAnalysis::ReadModelsFromFile() { StringArray models; models.Read(modelsFile); if (models.Length() == 0) return false; regress = new FancyRegression[models.Length()]; printf("Retrieving analysis models from file [%s]...\n", (const char *) modelsFile); modelCount = 0; StringArray tokens; for (int i = 0, line = 0; i < models.Length(); i++) { models[i].Trim(); // Skip comments if (models[i][0] == '#') continue; // Divide each line into tokens tokens.Clear(); tokens.AddTokens(models[i]); // Skip blank lines if (tokens.Length() == 0) continue; // Print message for tracing... printf(" Input: %s\n", (const char *) models[i], line++); // Need a minimum of four tokens per line if (tokens.Length() < 4) { printf(" Skipped: Trait name, mean, variance and heritability required.\n"); continue; } regress[modelCount].trait = ped.LookupTrait(tokens[0]); if (regress[modelCount].trait < 0) { printf(line == 1 ? " Skipped: Appears to be a header line\n" : " Skipped: Trait %s not listed in the data file\n", (const char *) tokens[0]); continue; } // First check that mean, variance and heritability are valid numbers bool fail = false; for (int j = 1; j <= 3; j++) { char * ptr = NULL; strtod(tokens[j], &ptr); fail |= ptr[0] != 0; } // If one of the values is not a valid number, skip if (fail) { printf(line == 1 ? 
" Skipped: Appears to be a header line\n" : " Skipped: Invalid numeric format\n"); continue; } regress[modelCount].mean = tokens[1]; regress[modelCount].variance = tokens[2]; regress[modelCount].heritability = tokens[3]; if (tokens.Length() > 4) { regress[modelCount].label = tokens[4]; for (int j = 5; j < tokens.Length(); j++) { regress[modelCount].label += " "; regress[modelCount].label += tokens[j]; } } else regress[modelCount].label.printf("Model %d", modelCount + 1); regress[modelCount].shortLabel = regress[modelCount].label; regress[modelCount].testRetestCorrel = testRetestCorrel; regress[modelCount].bounded = !unrestricted; printf(" Model loaded and labelled %s\n", (const char *) regress[modelCount].label); modelCount++; } if (modelCount == 0) { printf("No valid models, default model will be used\n\n"); return false; } printf("Table processed. %d models recognized\n\n", modelCount); return true; }
void GroupFromAnnotation::GetGroupFromFile(FILE * log) { //Fill in annoGroups. StringArray tmp; FILE * file = fopen(groupFile,"r"); if(file==NULL) { printf("ERROR! Cannot open group file %s.\n",groupFile.c_str()); error("ERROR! Cannot open group file %s.\n",groupFile.c_str()); } String buffer; int line = 0; while (!feof(file)) { buffer.ReadLine(file); tmp.Clear(); tmp.AddTokens(buffer, SEPARATORS); if(tmp.Length()==0) continue; annoGroups.Push(tmp[0]); chrom.Push(tmp[1]); line++; } fclose(file); //Fill in SNPlist. SNPlist = new StringArray [line]; SNPNoAllele = new StringArray [line]; FILE * samefile = fopen(groupFile,"r"); line = 0; Vector pos; while (!feof(samefile)) { buffer.ReadLine(samefile); tmp.Clear(); pos.Clear(); tmp.AddTokens(buffer, "\t "); SNPlist[line].Dimension(0); SNPNoAllele[line].Dimension(0); for(int i=1;i<tmp.Length();i++) { SNPlist[line].Push(tmp[i]); StringArray sub; sub.Clear(); sub.AddTokens(tmp[i],":_/"); if(sub.Length()!=4) { printf("Warning: group %s has a variant %s that has invalid format. The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str()); fprintf(log,"Warning: group %s has a variant %s that has invalid format. 
The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str()); continue; } pos.Push(sub[1].AsInteger()); SNPNoAllele[line].Push(sub[0] + ":" + sub[1]); } //sort SNPlist[line] and SNPNoAllele[line] if(SNPlist[line].Length()>1) { Vector sorted_pos,order; sorted_pos.Copy(pos); sorted_pos.Sort(); order.Dimension(pos.Length()); for(int i=0;i<sorted_pos.Length();i++) { for(int j=0;j<pos.Length();j++) { if(sorted_pos[i]==pos[j]) { order[i]=j; break; } } } StringArray cp_SNPlist,cp_SNPNoAllele; cp_SNPlist.Dimension(SNPlist[line].Length()); cp_SNPNoAllele.Dimension(SNPNoAllele[line].Length()); for(int l=0;l<SNPlist[line].Length();l++) { cp_SNPlist[l] = SNPlist[line][l]; cp_SNPNoAllele[l] = SNPNoAllele[line][l]; } for(int i=0;i<order.Length();i++) { SNPlist[line][i] = cp_SNPlist[order[i]]; //printf("%s\t",SNPlist[line][i].c_str()); SNPNoAllele[line][i] = cp_SNPNoAllele[order[i]] ; } //printf("\n"); } line++; } fclose(samefile); }