void getFstFromVCF() { std::cerr << "Calculating Fst using variants from: " << opt::vcfFile << std::endl; std::cerr << "Between the two 'populations' defined in: " << opt::sampleSets << std::endl; if (opt::windowSize > 0) { std::cerr << "also using a sliding window of size: " << opt::windowSize << " variants and sliding in steps of: " << opt::windowStep << std::endl; } string fileRoot = stripExtension(opt::sampleSets); //std::cerr << "Still alive: " << std::endl; // Open connection to read from the vcf file std::istream* vcfFile = createReader(opt::vcfFile.c_str()); //std::cerr << "Hello: " << std::endl; std::ifstream* setsFile = new std::ifstream(opt::sampleSets.c_str()); std::ifstream* annotFile; std::ofstream* snpCategoryFstFile; std::ofstream* regionsAboveFstFile; bool inRegAbove = false; std::ofstream* fstDxyFixedWindowFile; std::ifstream* ancSetsFile; std::ofstream* ancSetsOutFile; std::vector<string> ancSet1; std::vector<string> ancSet2; Annotation wgAnnotation; if (!opt::annotFile.empty()) { annotFile = new std::ifstream(opt::annotFile.c_str()); Annotation Annot(annotFile, false); // Does not use transcripts annotated as 5' or 3' partial wgAnnotation = Annot; string snpCategoryFstFileName = fileRoot + "_" + opt::runName + "SNPcategory_fst.txt"; snpCategoryFstFile = new std::ofstream(snpCategoryFstFileName.c_str()); *snpCategoryFstFile << "SNPcategory" << "\t" << "thisSNPFst" << "\t" << "thisSNPDxy" << "\t" << "scaffold" << "\t" << "position" << std::endl; } if (!opt::ancSets.empty()) { ancSetsFile = new std::ifstream(opt::ancSets); string ancOutFileName = fileRoot + "_" + opt::runName + "ancestralSNPs_fst.txt"; ancSetsOutFile = new std::ofstream(ancOutFileName); *ancSetsOutFile << "scaffold" << "\t" << "position" << "\t" << "AncAllelePopulation" << "\t" << "Fst" << "\t" << "ancSet1_segregating" << "\t" << "ancSet2_segregating" << std::endl; string ancSet1String; string ancSet2String; getline(*ancSetsFile, ancSet1String); getline(*ancSetsFile, ancSet2String); ancSet1 = split(ancSet1String, ','); ancSet2 = split(ancSet2String, ','); std::sort(ancSet1.begin(),ancSet1.end()); std::sort(ancSet2.begin(),ancSet2.end()); } if (opt::regAbove > 0) { string regionsAboveFstFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst_above" + numToString(opt::regAbove) + ".txt"; regionsAboveFstFile = new std::ofstream(regionsAboveFstFileName.c_str()); } string FstResultsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst.txt"; std::ofstream* pFst = new std::ofstream(FstResultsFileName.c_str()); string fstDxyFixedWindowFileName = fileRoot + "dXY_fixedWindow.txt"; fstDxyFixedWindowFile = new std::ofstream(fstDxyFixedWindowFileName.c_str()); string heterozygositySetsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_heterozygosity.txt"; *fstDxyFixedWindowFile << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy" << "\t" << "Set1_pi" << "\t" << "Set2_pi" << std::endl; std::ofstream* pHetSets = new std::ofstream(heterozygositySetsFileName.c_str()); //std::cerr << "Still alive: " << std::endl; string set1String; string set2String; getline(*setsFile, set1String); getline(*setsFile, set2String); std::vector<string> set1 = split(set1String, ','); std::vector<string> set2 = split(set2String, ','); std::sort(set1.begin(),set1.end()); std::sort(set2.begin(),set2.end()); int numChromosomes; int totalVariantNumber = 0; int countedVariantNumber = 0; string windowMiddleVariant = "first\tWindow"; string windowStartEnd = "scaffold_0\t0"; int windowStart = 0; int windowEnd; int fixedWindowStart = 0; std::vector<double> fixedWindowDxyVector; std::vector<double> fixedWindowFstNumVector; std::vector<double> fixedWindowFstDenomVector; std::vector<double> fixedWindowHet1Vector; std::vector<double> fixedWindowHet2Vector; std::vector<double> fixedWindowPi1Vector; std::vector<double> fixedWindowPi2Vector; std::vector<string> sampleNames; std::vector<string> fields; std::vector<size_t> set1Loci; std::vector<size_t> set2Loci; std::vector<size_t> ancSet1Loci; std::vector<size_t> ancSet2Loci; short n1; short n2; short n1anc; short n2anc; string line; std::map<std::string, double> loc_pval; std::vector<double> fstNumerators; fstNumerators.reserve(30000000); std::vector<double> fstDenominators; fstDenominators.reserve(30000000); std::vector<double> DxyVector; DxyVector.reserve(30000000); std::vector<std::vector<double> > heterozygositiesVector; heterozygositiesVector.reserve(30000000); std::vector<double> set1heterozygositiesSimple; set1heterozygositiesSimple.reserve(30000000); std::vector<double> set2heterozygositiesSimple; set2heterozygositiesSimple.reserve(30000000); std::vector<double> set1heterozygositiesNei; set1heterozygositiesNei.reserve(30000000); std::vector<double> set2heterozygositiesNei; set2heterozygositiesNei.reserve(30000000); std::vector<double> set1heterozygositiesPi; set1heterozygositiesPi.reserve(30000000); std::vector<double> set2heterozygositiesPi; set2heterozygositiesPi.reserve(30000000); while (getline(*vcfFile, line)) { if (line[0] == '#' && line[1] == '#') { } else if (line[0] == '#' && line[1] == 'C') { std::vector<std::string> fields = split(line, '\t'); const std::vector<std::string>::size_type numSamples = fields.size() - NUM_NON_GENOTYPE_COLUMNS; numChromosomes = (int)numSamples * 2; // std::cerr << "Number of chromosomes: " << numChromosomes << std::endl; if (opt::sampleNameFile.empty()) { for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) { sampleNames.push_back(fields[i]); } } else { sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile); } set1Loci = locateSet(sampleNames, set1); set2Loci = locateSet(sampleNames, set2); n1 = set1Loci.size()*2; n2 = set2Loci.size()*2; std::cerr << "Set1 loci: " << std::endl; print_vector_stream(set1Loci, std::cerr); std::cerr << "Set2 loci: " << std::endl; print_vector_stream(set2Loci, std::cerr); if (!opt::ancSets.empty()) { ancSet1Loci = locateSet(sampleNames, ancSet1); ancSet2Loci = locateSet(sampleNames, ancSet2); std::cerr << "Ancestral Set1 loci: " << std::endl; print_vector_stream(ancSet1Loci, std::cerr); std::cerr << "Ancestral Set2 loci: " << std::endl; print_vector_stream(ancSet2Loci, std::cerr); n1anc = ancSet1Loci.size() * 2; n2anc = ancSet2Loci.size() * 2; } if (opt::windowSize > 0) { if (opt::windowSize == opt::windowStep) { *pHetSets << "scaffold" << "\t" << "Start" << "\t" << "End" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl; *pFst << "var_num" << "\t" << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy_onlyVaiants" << "\t" << "Dxy_AllSites" << "\t" << "windowSize" << std::endl; if (opt::regAbove > 0) *regionsAboveFstFile << "scaffold" << "\t" << "Start" << "\t" << "End" << std::endl; } else { *pHetSets << "Middle_SNP_position" << "\t" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl; } } } else { totalVariantNumber++; std::vector<std::string> fields = split(line, '\t'); std::vector<std::string> info = split(fields[7], ';'); if (info[0] != "INDEL") { // Without indels SetCounts counts = getVariantCountsForFst(fields,set1Loci,set2Loci); //std::cerr << "Still here: " << counts.set1HaplotypeVariant.size() << "\t" << counts.set1individualsWithVariant.size() << "\t" << n1 << std::endl; //std::cerr << "Still here: " << counts.set2HaplotypeVariant.size() << "\t" << counts.set2individualsWithVariant.size() << "\t" << n2 << std::endl; //print_vector_stream(counts.set1HaplotypeVariant, std::cerr); //print_vector_stream(counts.set1individualsWithVariant, std::cerr); //print_vector_stream(counts.set2HaplotypeVariant, std::cerr); if ((counts.set1Count > 0 || counts.set2Count > 0) && (counts.set1Count < n1 || counts.set2Count < n2)) { countedVariantNumber++; double FstNumerator = calculateFstNumerator(counts, n1, n2); fstNumerators.push_back(FstNumerator); fixedWindowFstNumVector.push_back(FstNumerator); double FstDenominator = calculateFstDenominator(counts, n1, n2); fstDenominators.push_back(FstDenominator); fixedWindowFstDenomVector.push_back(FstDenominator); assert(FstDenominator != 0); double thisSNPDxy = calculateDxy(counts, n1, n2); DxyVector.push_back(thisSNPDxy); fixedWindowDxyVector.push_back(thisSNPDxy); std::vector<double> thisSNPhet = getSetHeterozygozities(counts, n1, n2); heterozygositiesVector.push_back(thisSNPhet); std::vector<double> thisSNPpis = calculatePiTwoSets(counts, n1, n2); fixedWindowPi1Vector.push_back(thisSNPpis[0]); fixedWindowPi2Vector.push_back(thisSNPpis[1]); set1heterozygositiesPi.push_back(thisSNPpis[0]); set2heterozygositiesPi.push_back(thisSNPpis[1]); // std::cerr << "Still here: " << thisSNPpis[0] << std::endl; set1heterozygositiesSimple.push_back(thisSNPhet[0]); set2heterozygositiesSimple.push_back(thisSNPhet[1]); fixedWindowHet1Vector.push_back(thisSNPhet[0]); set1heterozygositiesNei.push_back(thisSNPhet[2]); set2heterozygositiesNei.push_back(thisSNPhet[3]); fixedWindowHet2Vector.push_back(thisSNPhet[1]); if (!opt::annotFile.empty()) { string scaffold = fields[0]; string loc = fields[1]; // Scaffold string SNPcategory = wgAnnotation.getCategoryOfSNP(scaffold, loc); double thisSNPFst = FstNumerator/FstDenominator; *snpCategoryFstFile << SNPcategory << "\t" << thisSNPFst << "\t" << thisSNPDxy << "\t" << scaffold << "\t" << loc << std::endl; } if (!opt::ancSets.empty()) { double thisSNPFst = FstNumerator/FstDenominator; if (thisSNPFst < 0) { thisSNPFst = 0; } string AA = split(info[info.size()-1],'=')[1]; //std::cerr << "AA=" << " " << AA << std::endl; FourSetCounts c; if (AA == fields[3]) { c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"ref"); *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t"; if (c.set3daAF > 0 & c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; } if (c.set4daAF > 0 & c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; } } else if (AA == fields[4]) { c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"alt"); *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t"; if (c.set3daAF > 0 & c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; } if (c.set4daAF > 0 & c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; } // std::cerr << "AA=alt" << " " << c.set1daAF << " " << c.set2daAF << std::endl; } else { c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"N"); *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << "-888" << "\t" << thisSNPFst << "\t"; if (c.set3AltAF > 0 & c.set3AltAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; } if (c.set4AltAF > 0 & c.set4AltAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; } } } std::vector<string> s = split(windowStartEnd, '\t'); if (s[0] == fields[0]) { if (atoi(fields[1].c_str()) > (fixedWindowStart+10000)) { double thisFixedWindowDxy = vector_average_withRegion(fixedWindowDxyVector, 10000); double thisFixedWindowFst = calculateFst(fixedWindowFstNumVector, fixedWindowFstDenomVector); //double thisFixedWindowHet1 = vector_average_withRegion(fixedWindowHet1Vector, 10000); //double thisFixedWindowHet2 = vector_average_withRegion(fixedWindowHet2Vector, 10000); double thisFixedWindowPi1 = vector_average_withRegion(fixedWindowPi1Vector, 10000); double thisFixedWindowPi2 = vector_average_withRegion(fixedWindowPi2Vector, 10000); *fstDxyFixedWindowFile << fields[0] << "\t" << fixedWindowStart << "\t" << fixedWindowStart+10000 << "\t" << thisFixedWindowFst << "\t" << thisFixedWindowDxy << "\t" << thisFixedWindowPi1 << "\t" << thisFixedWindowPi2 << std::endl; fixedWindowDxyVector.clear(); fixedWindowFstNumVector.clear(); fixedWindowFstDenomVector.clear(); fixedWindowHet1Vector.clear(); fixedWindowHet2Vector.clear(); fixedWindowPi1Vector.clear(); fixedWindowPi2Vector.clear(); fixedWindowStart= fixedWindowStart+10000; } } else { fixedWindowStart = 0; } if (opt::windowSize == 1) { double Fst = FstNumerator/FstDenominator; if (Fst < 0) Fst = 0; *pFst << countedVariantNumber << "\t" << fields[0] + "\t" + fields[1] << "\t" << Fst << "\t" << thisSNPDxy << std::endl; } else if ((opt::windowSize > 0) && (countedVariantNumber % opt::windowStep == 0) && countedVariantNumber >= opt::windowSize) { std::vector<double> windowFstNumerators(fstNumerators.end()-opt::windowSize, fstNumerators.end()); std::vector<double> windowFstDenominators(fstDenominators.end()-opt::windowSize, fstDenominators.end()); double windowFst = calculateFst(windowFstNumerators, windowFstDenominators); if (windowFst < 0) windowFst = 0; std::vector<double> windowDxyVec(DxyVector.end()-opt::windowSize, DxyVector.end()); double windowDxy = vector_average(windowDxyVec); if (opt::windowSize == opt::windowStep) { std::vector<string> s = split(windowStartEnd, '\t'); if (s[0] == fields[0]) { windowStartEnd = windowStartEnd + "\t" + fields[1]; windowEnd = atoi(fields[1].c_str()); double windowDxyIncNonSeg = vector_average_withRegion(windowDxyVec, windowEnd-windowStart); *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowStartEnd << "\t" << windowFst << "\t" << windowDxy << "\t" << windowDxyIncNonSeg << "\t" << windowFstDenominators.size() << std::endl; if (opt::regAbove > 0) { if (windowFst >= opt::regAbove && !inRegAbove) { inRegAbove = true; *regionsAboveFstFile << s[0] << "\t" << s[1] << "\t"; } else if (windowFst < opt::regAbove && inRegAbove) { inRegAbove = false; *regionsAboveFstFile << s[1] << std::endl; } } } } else { *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowMiddleVariant << "\t" << windowFst << "\t" << windowDxy << "\t" << windowFstDenominators.size() << std::endl; } // Now calculate and output expected heterozygosities for this window std::vector<double> windowHetS1Vec(set1heterozygositiesSimple.end()-opt::windowSize, set1heterozygositiesSimple.end()); double windowHetS1 = vector_average(windowHetS1Vec); std::vector<double> windowHetS2Vec(set2heterozygositiesSimple.end()-opt::windowSize, set2heterozygositiesSimple.end()); double windowHetS2 = vector_average(windowHetS2Vec); std::vector<double> windowHetNei1Vec(set1heterozygositiesNei.end()-opt::windowSize, set1heterozygositiesNei.end()); double windowHetNei1 = vector_average(windowHetNei1Vec); std::vector<double> windowHetNei2Vec(set2heterozygositiesNei.end()-opt::windowSize, set2heterozygositiesNei.end()); double windowHetNei2 = vector_average(windowHetNei2Vec); std::vector<double> windowHetPi1Vec(set1heterozygositiesPi.end()-opt::windowSize, set1heterozygositiesPi.end()); double windowHetPi1 = vector_average_withRegion(windowHetPi1Vec, windowEnd-windowStart); std::vector<double> windowHetPi2Vec(set2heterozygositiesPi.end()-opt::windowSize, set2heterozygositiesPi.end()); double windowHetPi2 = vector_average_withRegion(windowHetPi2Vec, windowEnd-windowStart); if (opt::windowSize == opt::windowStep) { std::vector<string> s = split(windowStartEnd, '\t'); if (s[0] == fields[0]) { *pHetSets << windowStartEnd << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << "\t" << windowHetPi1 << "\t" << windowHetPi2 << std::endl; windowStartEnd = fields[0] + "\t" + fields[1]; windowStart = atoi(fields[1].c_str()); } else { windowStartEnd = fields[0] + "\t0"; windowStart = 0; } } else { *pHetSets << windowMiddleVariant << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << std::endl; windowMiddleVariant = fields[0] + "\t" + fields[1]; // works only if STEP is half SIZE for the window } } } } if (totalVariantNumber % 100000 == 0) { double Fst = calculateFst(fstNumerators, fstDenominators); std::cerr << totalVariantNumber << " variants processed... Fst: " << Fst << std::endl; } } } double Fst = calculateFst(fstNumerators, fstDenominators); double overallHetS1 = vector_average(set1heterozygositiesSimple); double overallHetS2 = vector_average(set2heterozygositiesSimple); double overallHetNei1 = vector_average(set1heterozygositiesNei); double overallHetNei2 = vector_average(set2heterozygositiesNei); std::cerr << "Fst: " << Fst << std::endl; std::cerr << "Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2" << overallHetNei2 << std::endl; *pHetSets << "#Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2" << overallHetNei2 << std::endl; }
void getFstFromMs() { std::cerr << "Calculating Fst using variants from: " << opt::msFile << std::endl; std::cerr << "and outputting chi-sq test p-vals < " << opt::msPvalCutoff << std::endl; std::ifstream* msFile = new std::ifstream(opt::msFile.c_str()); string fileRoot = stripExtension(opt::msFile); string PvalFileName = fileRoot + "_" + opt::runName + "_pvals.txt"; std::ofstream* pValFile; if (opt::msPvalCutoff > 0) { pValFile = new std::ofstream(PvalFileName.c_str()); *pValFile << "Fisher p-val" << "\t" << "chi-sq pval" << "\t" << "set1Alt" << "\t" << "set1Ref" << "\t" << "set2Alt" << "\t" << "set2Ref" << "\t" << "Fst" << std::endl; } std::vector<int> set1_loci; std::vector<int> set2_loci; srand((int)time(NULL)); if (opt::msSet1FstSample == 0) { opt::msSet1FstSample = opt::msSet1Size; for (int i = 0; i != opt::msSet1FstSample; i++) { set1_loci.push_back(i); } } else { // Randomly sample individuals from population 1 for Fst calculation for (int i = 0; i != opt::msSet1FstSample; i++) { int rand_sample = (rand()%opt::msSet1Size); while (std::find(set1_loci.begin(),set1_loci.end(),rand_sample) != set1_loci.end()) { rand_sample = (rand()%opt::msSet1Size); } set1_loci.push_back(rand_sample); } } // Do the same for set2 if (opt::msSet2FstSample == 0) { opt::msSet2FstSample = opt::msSet2Size; for (int i = 0; i != opt::msSet2FstSample; i++) { set2_loci.push_back(i+opt::msSet1Size); } } else { // Randomly sample individuals from population 2 for Fst calculation for (int i = 0; i != opt::msSet2FstSample; i++) { int rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size; while (std::find(set2_loci.begin(),set2_loci.end(),rand_sample) != set2_loci.end()) { rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size; } set2_loci.push_back(rand_sample); } } std::cerr << "Selected population 1 individuals: "; print_vector_stream(set1_loci, std::cerr); std::cerr << "Selected population 2 individuals: "; print_vector_stream(set2_loci, std::cerr); if (opt::msSet1Size != opt::msSet1FstSample || opt::msSet2Size != opt::msSet2FstSample) { std::cerr << "Warning: the Fst column is going to contain '-1' values where the site is not a segregating site in the sampled individuals for Fst calcultation" << std::endl; } std::vector<double> fstNumerators; fstNumerators.reserve(500000000); std::vector<double> fstDenominators; fstDenominators.reserve(500000000); string line; int numFixedSites = 0; int numNearlyFixedSites = 0; std::vector<double> nullForChisq; std::vector<int> moreSet1; std::vector<int> lessSet1; std::vector<int> moreSet2; std::vector<int> lessSet2; SetCounts counts; while (getline(*msFile, line)) { counts.reset(); double thisFst = -1; for (std::vector<int>::iterator it = set1_loci.begin(); it != set1_loci.end(); it++) { // std::cerr << line[*it] << std::endl; if (line[*it] == '1') { counts.set1Count++; } } for (std::vector<int>::iterator it = set2_loci.begin(); it != set2_loci.end(); it++) { if (line[*it] == '1') { counts.set2Count++; } } //std::cerr << "counts.set1Count" << counts.set1Count << "\t" << "counts.set2Count" << counts.set2Count << std::endl; if (counts.set1Count > 0 || counts.set2Count > 0) { double FstNum = calculateFstNumerator(counts, opt::msSet1FstSample, opt::msSet2FstSample); double FstDenom = calculateFstDenominator(counts, opt::msSet1FstSample, opt::msSet2FstSample); thisFst = FstNum/FstDenom; if (thisFst < 0) thisFst = 0; fstNumerators.push_back(FstNum); fstDenominators.push_back(FstDenom); } if ((counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 0)) { numFixedSites++; } if ((counts.set1Count == 1 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample-1) || (counts.set1Count == opt::msSet1FstSample-1 && counts.set2Count == 0) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 1)) { numNearlyFixedSites++; } int set1WithoutVariant = opt::msSet1FstSample-counts.set1Count; int set2WithoutVariant = opt::msSet2FstSample-counts.set2Count; if (counts.set1Count >= set1WithoutVariant) { moreSet1.push_back(counts.set1Count); lessSet1.push_back(set1WithoutVariant); moreSet2.push_back(counts.set2Count); lessSet2.push_back(set2WithoutVariant); } else { moreSet1.push_back(set1WithoutVariant); lessSet1.push_back(counts.set1Count); moreSet2.push_back(set2WithoutVariant); lessSet2.push_back(counts.set2Count); } // std::cerr << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << std::endl; if ((counts.set1Count != 0 || counts.set2Count != 0) && (set1WithoutVariant != 0 || set2WithoutVariant != 0)) { if (opt::msSet1FstSample + opt::msSet2FstSample <= 60) { counts.fisher_pval = fisher_exact(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); // std::cerr << "Fisher: " << counts.fisher_pval << std::endl; counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); } else { counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); } } if (counts.fisher_pval < opt::msPvalCutoff || counts.chi_sq_pval < opt::msPvalCutoff) { *pValFile << counts.fisher_pval << "\t" << counts.chi_sq_pval << "\t" << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << "\t" << thisFst << std::endl; } } double Fst = calculateFst(fstNumerators, fstDenominators); std::cerr << "Fst: " << Fst << std::endl; std::cerr << "Fixed sites: " << numFixedSites << std::endl; std::cerr << "Tier2 sites: " << numNearlyFixedSites << std::endl; std::cerr << "Null ChiSq 1:" << vector_average(moreSet1)/opt::msSet1FstSample << "\t" << vector_average(lessSet1)/opt::msSet1FstSample << std::endl; std::cerr << "Null ChiSq 2:" << vector_average(moreSet2)/opt::msSet2FstSample << "\t" << vector_average(lessSet2)/opt::msSet2FstSample << std::endl; }
int main(int argc, char *argv[]){ char **list; SSMAX = 5000000; int isMs = 0; // boolean == 1 if input is simulated ms data char *segSiteString; // used to parse the number of segregating sites out of stdin char temp; // attempts to distinguish between simulated and real data in ms format if (!scanf("%d ", &ss)){ isMs = 1; // if not simulated ms data, try the real thing segSiteString = "segsites: %d\n"; // look at the ms command line while(1){ temp = getchar(); if(temp == 'n'){ fprintf(stderr, "Failed to find the -I flag in your ms command line!\n"); return 3; } else if(temp == '-'){ temp = getchar(); if(temp == 'I') break; } } int numberOfPops; if(scanf("%d %d %d", &numberOfPops, &n1, &n2) != 3){ fprintf(stderr, "Failed to properly parse the ms -I flag!\n"); return 4; } else if(numberOfPops != 2){ fprintf(stderr, "filtFst only works with two populations at a time, not %d\n", numberOfPops); return 5; // ensure that only two populations are specified } n = n1 + n2; } else{ segSiteString = "%d "; if(argc != 3 && argc != 4){ fprintf(stderr, "Usage:\nfiltFst n1 n2 (Sample Size Per Population)\n"); return 1; } n1 = atoi(argv[1]); n2 = atoi(argv[2]); n = n1 + n2; if(argc == 4){ min_sample_size = atoi(argv[3]); // min = 2 to avoid divide by zero if (min_sample_size < 2) min_sample_size = 2; } else min_sample_size = 2; if(n1 <= min_sample_size || n2 <= min_sample_size){ fprintf(stderr, "Illegal population sizes specified!\nMust be greater than %d\n", min_sample_size); fprintf(stderr, "Usage:\nfiltFst n1 n2 (Sample Size Per Population)\n"); return 1; } } // allocate some memory list = (char **) malloc ((unsigned) n * sizeof (char *)); if(list == NULL){ fprintf(stderr, "Failed to malloc in main!\n"); return 2; } for(a=0; a<n; ++a) if( (list[a] = (char *) malloc ((unsigned) SSMAX * sizeof (char)) ) == NULL) fprintf(stderr, "Failed to malloc in main!\n"); if (isMs){ // this parses the ms file itself while((temp = getchar()) != EOF){ while(temp != EOF && temp != '/') // looks for the '//' delimiter temp = getchar(); if(temp == EOF) { fprintf(stderr, "Premature ending of stdin\n"); return 2; } getchar(); // second '/' delimiter getchar(); // newline if(scanf(segSiteString, &ss) != 1){ fprintf(stderr, "Failed to grab the number of segregating sites from the ms input!\n"); return 3; } scanf("positions: "); calculateFst(list, segSiteString); } } else{ // read the 'pseudoMs' (i.e., real data files a la Jeff Wall) do{ calculateFst(list, segSiteString); } while(scanf(segSiteString, &ss) == 1); } return 0; }