// Parse one data line of an annotated VCF and add its variant to every gene
// group whose annotation matches the configured function patterns.
// sample INFO content:
//   ANNO=Nonsynonymous:ASB16;
//   ANNOFULL=ASB16/NM_080863:+:Nonsynonymous(CCC/Pro/P->ACC/Thr/T:Base1310/1362:Codon437/454:Exon5/5):Exon
//            |C17orf65/NM_178542:-:Intron
void GroupFromAnnotation::addLineFromVcf( String & buffer )
{
    StringArray vfield;
    vfield.AddTokens(buffer, "\t");
    if ( vfield.Length() < 8 )
        error("Annotation vcf only has %d columns!\n", vfield.Length());

    // split the INFO column into its ';'-separated subfields
    StringArray info_semicolon;
    info_semicolon.AddTokens( vfield[7],";" );

    // find ANNOFULL first
    int annofull_index = -1;
    for( int i=0; i<info_semicolon.Length(); i++ ) {
        String iheader = info_semicolon[i].SubStr(0,8);
        if (iheader == "ANNOFULL") {
            annofull_index = i;
            break;
        }
    }
    if (annofull_index == -1) {
        // Report CHROM:POS of the variant (VCF columns 0 and 1).  The old
        // message printed info_semicolon[0]/[1], which are INFO subfields,
        // not the chromosome and position.
        printf("warning: no ANNOFULL field at chr%s:%s. Variant won't included in groups!\n",
               vfield[0].c_str(), vfield[1].c_str());
        return;
    }

    // remove "ANNOFULL="
    String anno_full_str = info_semicolon[annofull_index].SubStr(9);

    // check each '|'-separated alternative annotation
    StringArray alts;
    alts.AddTokens( anno_full_str, "|" );
    for( int a=0; a<alts.Length(); a++ ) {
        StringArray sub;
        sub.AddTokens( alts[a], ":/=");
        if (func_upper.Length() != 0) { // match a function pattern before adding
            for(int f=0; f<func_upper.Length(); f++) {
                bool pattern_match = checkPatternMatch( sub, func_upper[f] );
                if ( pattern_match ) {
                    addGeneFromVcf( vfield, sub[0] );
                    break;
                }
            }
        }
        else { // no pattern to match: skip intergenic annotations
            String upper_name = sub[0].ToUpper();
            // SlowFind returns -1 when the substring is absent.  The old test
            // "!SlowFind(...)" was true only for a match at offset 0, i.e. it
            // added exactly the INTERGENIC entries it meant to exclude.
            if ( upper_name.SlowFind( "INTERGENIC" ) == -1 )
                addGeneFromVcf( vfield, sub[0] );
        }
    }
}
// Load a whitespace-delimited numeric matrix from fileName into a.
// Blank lines are skipped.  Returns 0 on success, -1 on a column-count
// mismatch (reported with the physical line number).
int loadMatrix(Matrix& a, String& fileName)
{
    a.Zero();

    IFILE ifile(fileName.c_str(), "r");
    String line;
    StringArray array;

    int lineNo = 0; // physical line number, for error reporting
    int rows = 0;   // number of data (non-blank) lines stored so far
    while (!ifeof(ifile)) {
        line.ReadLine(ifile);
        lineNo++;
        if (line.Length() == 0) continue;

        array.Clear();
        array.AddTokens(line);
        if (a.cols != 0 && a.cols != array.Length() && line.Length() > 0) {
            fprintf(stderr, "Wrong column size at line %d!\n", lineNo);
            array.Print();
            line.Write(stdout);
            return -1;
        } else {
            a.GrowTo(a.rows, array.Length());
        }

        // Grow to the row about to be filled.  The old code indexed by
        // lineNo (which also counts skipped blank lines) while growing one
        // row at a time, so a blank interior line pushed the row index past
        // the allocated rows.
        rows++;
        if (a.rows < rows) {
            a.GrowTo(rows, a.cols);
        }
        for (int i = 0; i < array.Length(); i++) {
            // atof instead of atol so fractional values are not truncated
            a[rows - 1][i] = atof(array[i]);
        }
    }
    return 0;
};
// Load a single-column numeric vector from fileName into a.
// Blank lines are skipped.  Returns 0 on success, -1 if any line has more
// than one column (reported with the physical line number).
int loadVector(Vector& a, String& fileName)
{
    a.Zero();

    IFILE ifile(fileName.c_str(), "r");
    String line;
    StringArray array;

    int lineNo = 0; // physical line number, for error reporting
    int values = 0; // number of values stored so far
    while (!ifeof(ifile)) {
        line.ReadLine(ifile);
        lineNo++;
        if (line.Length() == 0) continue;

        array.Clear();
        array.AddTokens(line);
        if (array.Length() > 1 && line.Length() > 0) {
            fprintf(stderr, "Warning: column size at line %d!\n", lineNo);
            array.Print();
            line.Write(stdout);
            return -1;
        }
        // Grow to the element about to be filled.  The old code indexed by
        // lineNo (which also counts skipped blank lines), so a blank interior
        // line desynchronized the element index from the vector length.
        values++;
        if (a.dim < values) {
            a.GrowTo(values);
        }
        // atof instead of atol so fractional values are not truncated
        a[values - 1] = atof(array[0]);
    }
    return 0;
};
void GCContent::LoadRegions(String & regionsFile, GenomeSequence &genome, bool invertRegion) { if(regionsFile.Length()==0) return; if(genome.sequenceLength()==0) error("No reference genome loaded!\n"); IFILE fhRegions; fhRegions = ifopen(regionsFile.c_str(),"r"); if(fhRegions==NULL) error("Open regions file %s failed!\n", regionsFile.c_str()); regionIndicator.resize(genome.sequenceLength()); StringArray tokens; String buffer; int len; fprintf(stderr, "Loading region list..."); while (!ifeof(fhRegions)){ buffer.ReadLine(fhRegions); if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer, WHITESPACE); if(tokens.Length() < 3) continue; genomeIndex_t startGenomeIndex = 0; int chromosomeIndex = tokens[1].AsInteger(); // use chromosome name (token[0]) and position (token[1]) to query genome index. startGenomeIndex = genome.getGenomePosition(tokens[0].c_str(), chromosomeIndex); if(startGenomeIndex >= regionIndicator.size() ) { //fprintf(stderr, "WARNING: region list section %s position %u is not found in the reference and skipped...\n", tokens[0].c_str(), chromosomeIndex); continue; } len = tokens[2].AsInteger() - tokens[1].AsInteger() + 1; for(uint32_t i=startGenomeIndex; i<startGenomeIndex+len; i++) regionIndicator[i] = true; tokens.Clear(); buffer.Clear(); } if (invertRegion) { fprintf(stderr, " invert region..."); for (uint32_t i = 0; i < regionIndicator.size(); i++) { regionIndicator[i] = !regionIndicator[i]; } } ifclose(fhRegions); fprintf(stderr, "DONE!\n"); }
void StringToArray(const String & input, IntArray & values, int desired) { StringArray tokens; tokens.AddTokens(input, ','); values.Dimension(desired); values.Zero(); if (tokens.Length()) for (int i = 0; i < desired; i++) values[i] = tokens[i % tokens.Length()].AsInteger(); }
// partition 8th column only void GroupFromAnnotation::GetGroupFromVCF() { printf("Parsing annotations from annotated VCF file ...\n"); StringArray func; func.AddTokens(function,"/"); vcfInitialize(); // set size of the tables FILE * inFile; inFile = fopen(vcfInput,"r"); StringIntHash groupHash; int geneCount=0; // add all genes to group hash first while (!feof(inFile)) { String buffer; buffer.ReadLine(inFile); if ( buffer[0] == '#' ) continue; addLineFromVcf( buffer ); } fclose(inFile); // sort SNPlist and SNPNoAllele for( int g=0; g<geneCount; g++ ) { if ( SNPlist[g].Length()>1 ) { Vector order; setOrderFromSortedPositions( g, order); StringArray cp_SNPlist,cp_SNPNoAllele; cp_SNPlist.Dimension(SNPlist[g].Length()); cp_SNPNoAllele.Dimension(SNPNoAllele[g].Length()); for(int l=0;l<SNPlist[g].Length();l++) { cp_SNPlist[l] = SNPlist[g][l]; cp_SNPNoAllele[l] = SNPNoAllele[g][l]; } for(int i=0;i<order.Length();i++) { SNPlist[g][i] = cp_SNPlist[order[i]]; SNPNoAllele[g][i] = cp_SNPNoAllele[order[i]] ; } } } // print test.groupfile printf("done!\n"); String grp_filename = "test.groupfile"; printGroupFile( grp_filename ); }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
bool RegressionAnalysis::ReadModelsFromFile() { StringArray models; models.Read(modelsFile); if (models.Length() == 0) return false; regress = new FancyRegression[models.Length()]; printf("Retrieving analysis models from file [%s]...\n", (const char *) modelsFile); modelCount = 0; StringArray tokens; for (int i = 0, line = 0; i < models.Length(); i++) { models[i].Trim(); // Skip comments if (models[i][0] == '#') continue; // Divide each line into tokens tokens.Clear(); tokens.AddTokens(models[i]); // Skip blank lines if (tokens.Length() == 0) continue; // Print message for tracing... printf(" Input: %s\n", (const char *) models[i], line++); // Need a minimum of four tokens per line if (tokens.Length() < 4) { printf(" Skipped: Trait name, mean, variance and heritability required.\n"); continue; } regress[modelCount].trait = ped.LookupTrait(tokens[0]); if (regress[modelCount].trait < 0) { printf(line == 1 ? " Skipped: Appears to be a header line\n" : " Skipped: Trait %s not listed in the data file\n", (const char *) tokens[0]); continue; } // First check that mean, variance and heritability are valid numbers bool fail = false; for (int j = 1; j <= 3; j++) { char * ptr = NULL; strtod(tokens[j], &ptr); fail |= ptr[0] != 0; } // If one of the values is not a valid number, skip if (fail) { printf(line == 1 ? 
" Skipped: Appears to be a header line\n" : " Skipped: Invalid numeric format\n"); continue; } regress[modelCount].mean = tokens[1]; regress[modelCount].variance = tokens[2]; regress[modelCount].heritability = tokens[3]; if (tokens.Length() > 4) { regress[modelCount].label = tokens[4]; for (int j = 5; j < tokens.Length(); j++) { regress[modelCount].label += " "; regress[modelCount].label += tokens[j]; } } else regress[modelCount].label.printf("Model %d", modelCount + 1); regress[modelCount].shortLabel = regress[modelCount].label; regress[modelCount].testRetestCorrel = testRetestCorrel; regress[modelCount].bounded = !unrestricted; printf(" Model loaded and labelled %s\n", (const char *) regress[modelCount].label); modelCount++; } if (modelCount == 0) { printf("No valid models, default model will be used\n\n"); return false; } printf("Table processed. %d models recognized\n\n", modelCount); return true; }
void GroupFromAnnotation::vcfInitialize() { // func_upper if ( function != "" ) { func_upper.AddTokens( function, "/" ); for( int i=0; i<func_upper.Length(); i++ ) func_upper[i] = func_upper[i].ToUpper(); } FILE * inFile; inFile = fopen(vcfInput,"r"); while (!feof(inFile)) { String buffer; buffer.ReadLine( inFile); if ( buffer[0] == '#' ) continue; StringArray vfield; vfield.AddTokens(buffer, "\t"); if ( vfield.Length() < 8 ) error("Annotation vcf only has %d columns!\n", vfield.Length()); StringArray info_semicolon; info_semicolon.AddTokens( vfield[7],";" ); int annofull_index = -1; for( int i=0; i<info_semicolon.Length(); i++ ) { String iheader = info_semicolon[i].SubStr(0,8); if (iheader == "ANNOFULL") { annofull_index = i; break; } } if (annofull_index == -1) continue; String anno_full_str = info_semicolon[annofull_index].SubStr(9); StringArray alts; alts.AddTokens( anno_full_str, "|" ); for( int a=0; a<alts.Length(); a++ ) { StringArray sub; sub.AddTokens( alts[a], ":/="); if (func_upper.Length() != 0) { // match before add for(int f =0;f<func_upper.Length();f++) { bool pattern_match = checkPatternMatch( sub, func_upper[f] ); if ( pattern_match ) { chrom.Push( vfield[0] ); addGeneToGroupHash( sub[0] ); break; } } } else { // no pattern to match chrom.Push( vfield[0] ); addGeneToGroupHash( sub[0] ); } } } // vectors SNPlist = new StringArray [geneCount]; SNPNoAllele = new StringArray [geneCount]; pos = new Vector [geneCount]; }
// Load the refFlat gene map: resolve the map file (falling back to the
// bundled data directory and .gz variants), record each gene's chromosome
// and min-start/max-end span, then index the genes by chromosome.
void GroupFromAnnotation::GetGeneMap(String path)
{
    IFILE genemap;
    genemap = ifopen(mapFile,"r");
    if(genemap==NULL) {
        if(mapFile=="../data/refFlat_hg19.txt") {
            // try the gzipped copy, then the copy next to the binary
            mapFile += ".gz";
            genemap = ifopen(mapFile,"r");
            if(genemap==NULL) {
                int loc = path.Find("bin");
                if(loc!=-1) {
                    mapFile = path.Left(loc-1);
                    mapFile += "/data/refFlat_hg19.txt";
                }
                else {
                    mapFile += "../data/refFlat_hg19.txt";
                }
                genemap = ifopen(mapFile,"r");
            }
            if(genemap==NULL) {
                mapFile += ".gz";
                genemap = ifopen(mapFile,"r");
            }
            if(genemap==NULL)
                error("Cannot open gene mapping file %s.\n",mapFile.c_str());
        }
        else
            error("Cannot open gene mapping file %s.\n",mapFile.c_str());
    }

    StringIntHash GeneLocHash;
    StringArray strand;
    int gene_idx =0;
    while(!ifeof(genemap)) {
        String buffer;
        buffer.ReadLine(genemap);
        StringArray record;
        record.AddTokens(buffer,"\t");
        // Guard against blank/truncated lines (e.g. the trailing newline):
        // columns 0..5 are accessed below and the old code indexed record[0]
        // unconditionally.
        if(record.Length() < 6)
            continue;
        int loc = GeneLocHash.Integer(record[0]);
        if(loc==-1) {
            // first record for this gene
            GeneLocHash.SetInteger(record[0],gene_idx);
            // save chr, start and end positions
            StringArray gene_chr;
            if(record[2][2]=='r' || record[2][2]=='R') // strip a "chr" prefix
                record[2] = record[2].SubStr(3);
            gene_chr.AddTokens(record[2],"_,;.");
            if(gene_chr[0].Find("Un")!=-1) // skip unplaced contigs
                continue;
            chr.Push(gene_chr[0]);
            start_pos.Push(record[4].AsInteger());
            end_pos.Push(record[5].AsInteger());
            strand.Push(record[3]);
            genename.Push(record[0]);
            gene_idx++;
        }
        else {
            // gene seen before: get the current chr
            StringArray gene_chr;
            if(record[2][2]=='r' || record[2][2]=='R')
                record[2] = record[2].SubStr(3);
            gene_chr.AddTokens(record[2],"_,;.");
            if(gene_chr[0].Find("Un")!=-1)
                continue;
            // check if chr is consistent with the previous record;
            // skip records that map the gene to a different chromosome
            if(chr[loc]!=gene_chr[0])
                continue;
            // update start and end position
            if(record[4].AsInteger()<start_pos[loc])
                start_pos[loc] = record[4].AsInteger();
            if(record[5].AsInteger()>end_pos[loc])
                end_pos[loc] = record[5].AsInteger();
        }
    }
    ifclose(genemap);

    // index genes by chromosome and record each chromosome's span of the
    // sorted order in ChrStartHash/ChrEndHash
    chr_idx.Index(chr);
    String chr_=chr[chr_idx[0]];
    for(int i=1;i<chr.Length();i++) {
        if(chr[chr_idx[i]]!=chr_) {
            ChrStartHash.SetInteger(chr[chr_idx[i]],i);
            ChrEndHash.SetInteger(chr_,i-1);
            chr_ = chr[chr_idx[i]];
        }
    }
}
void GroupFromAnnotation::GetGroupFromFile(FILE * log) { //Fill in annoGroups. StringArray tmp; FILE * file = fopen(groupFile,"r"); if(file==NULL) { printf("ERROR! Cannot open group file %s.\n",groupFile.c_str()); error("ERROR! Cannot open group file %s.\n",groupFile.c_str()); } String buffer; int line = 0; while (!feof(file)) { buffer.ReadLine(file); tmp.Clear(); tmp.AddTokens(buffer, SEPARATORS); if(tmp.Length()==0) continue; annoGroups.Push(tmp[0]); chrom.Push(tmp[1]); line++; } fclose(file); //Fill in SNPlist. SNPlist = new StringArray [line]; SNPNoAllele = new StringArray [line]; FILE * samefile = fopen(groupFile,"r"); line = 0; Vector pos; while (!feof(samefile)) { buffer.ReadLine(samefile); tmp.Clear(); pos.Clear(); tmp.AddTokens(buffer, "\t "); SNPlist[line].Dimension(0); SNPNoAllele[line].Dimension(0); for(int i=1;i<tmp.Length();i++) { SNPlist[line].Push(tmp[i]); StringArray sub; sub.Clear(); sub.AddTokens(tmp[i],":_/"); if(sub.Length()!=4) { printf("Warning: group %s has a variant %s that has invalid format. The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str()); fprintf(log,"Warning: group %s has a variant %s that has invalid format. 
The correct format should be chr:pos:allele1:allele2.\n",tmp[0].c_str(),tmp[i].c_str()); continue; } pos.Push(sub[1].AsInteger()); SNPNoAllele[line].Push(sub[0] + ":" + sub[1]); } //sort SNPlist[line] and SNPNoAllele[line] if(SNPlist[line].Length()>1) { Vector sorted_pos,order; sorted_pos.Copy(pos); sorted_pos.Sort(); order.Dimension(pos.Length()); for(int i=0;i<sorted_pos.Length();i++) { for(int j=0;j<pos.Length();j++) { if(sorted_pos[i]==pos[j]) { order[i]=j; break; } } } StringArray cp_SNPlist,cp_SNPNoAllele; cp_SNPlist.Dimension(SNPlist[line].Length()); cp_SNPNoAllele.Dimension(SNPNoAllele[line].Length()); for(int l=0;l<SNPlist[line].Length();l++) { cp_SNPlist[l] = SNPlist[line][l]; cp_SNPNoAllele[l] = SNPNoAllele[line][l]; } for(int i=0;i<order.Length();i++) { SNPlist[line][i] = cp_SNPlist[order[i]]; //printf("%s\t",SNPlist[line][i].c_str()); SNPNoAllele[line][i] = cp_SNPNoAllele[order[i]] ; } //printf("\n"); } line++; } fclose(samefile); }