Пример #1
0
/**
 * Calculates Fst (together with Dxy, heterozygosity and pi summaries) between
 * two sets of samples, using the variants read from opt::vcfFile.
 *
 * Inputs (all taken from the opt:: namespace):
 *   - opt::vcfFile     VCF input, opened through createReader().
 *   - opt::sampleSets  text file; line 1 and line 2 are comma-separated sample
 *                      names for set 1 and set 2.
 *   - opt::annotFile   optional; when non-empty, a per-SNP category file is written.
 *   - opt::ancSets     optional; when non-empty, its first two lines define two
 *                      further sample sets and a per-SNP ancestral-allele file is written.
 *   - opt::regAbove    optional; when > 0, a "regions above Fst threshold" file is written.
 *   - opt::windowSize / opt::windowStep  sliding-window settings (counted in variants).
 *
 * Outputs are written next to opt::sampleSets (its name with the extension
 * stripped is used as the file-name root). A running Fst is reported on stderr
 * every 100000 variants and overall summaries are printed at the end.
 *
 * Records whose first INFO entry is "INDEL" are skipped. A variant contributes
 * to the statistics only if the alternative allele is present in at least one
 * set and is not fixed in both sets.
 */
void getFstFromVCF() {
    std::cerr << "Calculating Fst using variants from: " << opt::vcfFile << std::endl;
    std::cerr << "Between the two 'populations' defined in: " << opt::sampleSets << std::endl;
    if (opt::windowSize > 0) {
        std::cerr << "also using a sliding window of size: " << opt::windowSize << " variants and sliding in steps of: " << opt::windowStep << std::endl;
    }
    string fileRoot = stripExtension(opt::sampleSets);
    // Open connection to read from the vcf file
    std::istream* vcfFile = createReader(opt::vcfFile.c_str());
    std::ifstream* setsFile = new std::ifstream(opt::sampleSets.c_str());
    // Optional streams start out as nullptr so that the clean-up at the end of
    // the function is well defined even when the corresponding option is unused.
    std::ifstream* annotFile = nullptr;
    std::ofstream* snpCategoryFstFile = nullptr;
    std::ofstream* regionsAboveFstFile = nullptr; bool inRegAbove = false;
    std::ofstream* fstDxyFixedWindowFile = nullptr;
    std::ifstream* ancSetsFile = nullptr; std::ofstream* ancSetsOutFile = nullptr;
    std::vector<string> ancSet1; std::vector<string> ancSet2;
    Annotation wgAnnotation;
    if (!opt::annotFile.empty()) {
        annotFile = new std::ifstream(opt::annotFile.c_str());
        Annotation Annot(annotFile, false); // Does not use transcripts annotated as 5' or 3' partial
        wgAnnotation = Annot;
        string snpCategoryFstFileName = fileRoot + "_" + opt::runName + "SNPcategory_fst.txt";
        snpCategoryFstFile = new std::ofstream(snpCategoryFstFileName.c_str());
        *snpCategoryFstFile << "SNPcategory" << "\t" << "thisSNPFst" << "\t" << "thisSNPDxy" << "\t" << "scaffold" << "\t" << "position" << std::endl;
    }
    if (!opt::ancSets.empty()) {
        ancSetsFile = new std::ifstream(opt::ancSets);
        string ancOutFileName = fileRoot + "_" + opt::runName + "ancestralSNPs_fst.txt";
        ancSetsOutFile = new std::ofstream(ancOutFileName);
        *ancSetsOutFile << "scaffold" << "\t" << "position" << "\t" << "AncAllelePopulation" << "\t" << "Fst" << "\t" << "ancSet1_segregating" << "\t" << "ancSet2_segregating" << std::endl;
        string ancSet1String; string ancSet2String;
        getline(*ancSetsFile, ancSet1String);
        getline(*ancSetsFile, ancSet2String);
        ancSet1 = split(ancSet1String, ','); ancSet2 = split(ancSet2String, ',');
        std::sort(ancSet1.begin(),ancSet1.end()); std::sort(ancSet2.begin(),ancSet2.end());
    }
    
    if (opt::regAbove > 0) {
        string regionsAboveFstFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst_above" + numToString(opt::regAbove) + ".txt";
        regionsAboveFstFile = new std::ofstream(regionsAboveFstFileName.c_str());
    }
    
    string FstResultsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_fst.txt";
    std::ofstream* pFst = new std::ofstream(FstResultsFileName.c_str());
    string fstDxyFixedWindowFileName = fileRoot + "dXY_fixedWindow.txt";
    fstDxyFixedWindowFile = new std::ofstream(fstDxyFixedWindowFileName.c_str());
    string heterozygositySetsFileName = fileRoot + "_w_" + numToString(opt::windowSize) + opt::runName + "_heterozygosity.txt";
    *fstDxyFixedWindowFile << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy" << "\t" << "Set1_pi" << "\t" << "Set2_pi" << std::endl;
    std::ofstream* pHetSets = new std::ofstream(heterozygositySetsFileName.c_str());
    
    // The two sample sets: one comma-separated line each, sorted after reading
    string set1String; string set2String;
    getline(*setsFile, set1String);
    getline(*setsFile, set2String);
    std::vector<string> set1 = split(set1String, ',');
    std::vector<string> set2 = split(set2String, ',');
    std::sort(set1.begin(),set1.end());
    std::sort(set2.begin(),set2.end());
    
    int numChromosomes = 0;
    int totalVariantNumber = 0;
    int countedVariantNumber = 0;
    string windowMiddleVariant = "first\tWindow";
    string windowStartEnd = "scaffold_0\t0";
    // windowEnd is initialised because it is read below even in the mode
    // (windowSize != windowStep) that never assigns it.
    int windowStart = 0; int windowEnd = 0;
    int fixedWindowStart = 0; std::vector<double> fixedWindowDxyVector; std::vector<double> fixedWindowFstNumVector; std::vector<double> fixedWindowFstDenomVector;
    std::vector<double> fixedWindowHet1Vector; std::vector<double> fixedWindowHet2Vector; std::vector<double> fixedWindowPi1Vector; std::vector<double> fixedWindowPi2Vector;
    std::vector<string> sampleNames;
    std::vector<size_t> set1Loci; std::vector<size_t> set2Loci;
    std::vector<size_t> ancSet1Loci; std::vector<size_t> ancSet2Loci;
    // Chromosome counts per set; zero until the #CHROM header line has been parsed
    short n1 = 0; short n2 = 0; short n1anc = 0; short n2anc = 0;
    string line;
    std::vector<double> fstNumerators; fstNumerators.reserve(30000000);
    std::vector<double> fstDenominators; fstDenominators.reserve(30000000);
    std::vector<double> DxyVector; DxyVector.reserve(30000000);
    std::vector<std::vector<double> > heterozygositiesVector; heterozygositiesVector.reserve(30000000);
    std::vector<double> set1heterozygositiesSimple; set1heterozygositiesSimple.reserve(30000000);
    std::vector<double> set2heterozygositiesSimple; set2heterozygositiesSimple.reserve(30000000);
    std::vector<double> set1heterozygositiesNei; set1heterozygositiesNei.reserve(30000000);
    std::vector<double> set2heterozygositiesNei; set2heterozygositiesNei.reserve(30000000);
    std::vector<double> set1heterozygositiesPi; set1heterozygositiesPi.reserve(30000000);
    std::vector<double> set2heterozygositiesPi; set2heterozygositiesPi.reserve(30000000);
    while (getline(*vcfFile, line)) {
        if (line[0] == '#' && line[1] == '#') {
            // "##" meta-information lines are ignored
        } else if (line[0] == '#' && line[1] == 'C') {
            // Column header line: work out which genotype columns belong to which set
            std::vector<std::string> fields = split(line, '\t');
            const std::vector<std::string>::size_type numSamples = fields.size() - NUM_NON_GENOTYPE_COLUMNS;
            numChromosomes = (int)numSamples * 2;
            // std::cerr << "Number of chromosomes: " << numChromosomes << std::endl;
            
            if (opt::sampleNameFile.empty()) {
                for (std::vector<std::string>::size_type i = NUM_NON_GENOTYPE_COLUMNS; i != fields.size(); i++) {
                    sampleNames.push_back(fields[i]);
                }
            } else {
                sampleNames = readSampleNamesFromTextFile(opt::sampleNameFile);
            }
            set1Loci = locateSet(sampleNames, set1);
            set2Loci = locateSet(sampleNames, set2);
            n1 = set1Loci.size()*2; n2 = set2Loci.size()*2;
            std::cerr << "Set1 loci: " << std::endl;
            print_vector_stream(set1Loci, std::cerr);
            std::cerr << "Set2 loci: " << std::endl;
            print_vector_stream(set2Loci, std::cerr);
            
            if (!opt::ancSets.empty()) {
                ancSet1Loci = locateSet(sampleNames, ancSet1);
                ancSet2Loci = locateSet(sampleNames, ancSet2);
                std::cerr << "Ancestral Set1 loci: " << std::endl;
                print_vector_stream(ancSet1Loci, std::cerr);
                std::cerr << "Ancestral Set2 loci: " << std::endl;
                print_vector_stream(ancSet2Loci, std::cerr);
                n1anc = ancSet1Loci.size() * 2; n2anc = ancSet2Loci.size() * 2;
            }
            
            // Column headers of the windowed output files
            if (opt::windowSize > 0) {
                if (opt::windowSize == opt::windowStep) {
                    *pHetSets << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl;
                    *pFst << "var_num" << "\t" << "scaffold" << "\t" << "Start" << "\t" << "End" << "\t" << "Fst" << "\t" << "Dxy_onlyVaiants" << "\t" << "Dxy_AllSites" << "\t" << "windowSize" << std::endl;
                    if (opt::regAbove > 0) *regionsAboveFstFile << "scaffold" << "\t" << "Start" << "\t" << "End" << std::endl;
                } else {
                    *pHetSets << "Middle_SNP_position" << "\t" << "Set1_heterozygosity" << "\t" << "Set2_heterozygosity" << "\t" << "Set1_heterozygosity_Nei" << "\t" << "Set2_heterozygosity_Nei" << "\t" << "Set1_nucleotideDiversity_pi" << "\t" << "Set2_nucleotideDiversity_pi" << std::endl;
                }
            }
        } else {
            totalVariantNumber++;
            
            std::vector<std::string> fields = split(line, '\t');
            std::vector<std::string> info = split(fields[7], ';');
            if (info[0] != "INDEL") {  // Without indels
                SetCounts counts = getVariantCountsForFst(fields,set1Loci,set2Loci);
                // Keep the site only if the variant is present in at least one set and not fixed in both
                if ((counts.set1Count > 0 || counts.set2Count > 0) && (counts.set1Count < n1 || counts.set2Count < n2)) {
                    countedVariantNumber++;
                    double FstNumerator = calculateFstNumerator(counts, n1, n2); fstNumerators.push_back(FstNumerator); fixedWindowFstNumVector.push_back(FstNumerator);
                    double FstDenominator = calculateFstDenominator(counts, n1, n2); fstDenominators.push_back(FstDenominator); fixedWindowFstDenomVector.push_back(FstDenominator);
                    assert(FstDenominator != 0);
                    double thisSNPDxy = calculateDxy(counts, n1, n2); DxyVector.push_back(thisSNPDxy); fixedWindowDxyVector.push_back(thisSNPDxy);
                    std::vector<double> thisSNPhet = getSetHeterozygozities(counts, n1, n2); heterozygositiesVector.push_back(thisSNPhet);
                    std::vector<double> thisSNPpis = calculatePiTwoSets(counts, n1, n2); fixedWindowPi1Vector.push_back(thisSNPpis[0]); fixedWindowPi2Vector.push_back(thisSNPpis[1]);
                    set1heterozygositiesPi.push_back(thisSNPpis[0]); set2heterozygositiesPi.push_back(thisSNPpis[1]);
                    set1heterozygositiesSimple.push_back(thisSNPhet[0]); set2heterozygositiesSimple.push_back(thisSNPhet[1]); fixedWindowHet1Vector.push_back(thisSNPhet[0]);
                    set1heterozygositiesNei.push_back(thisSNPhet[2]); set2heterozygositiesNei.push_back(thisSNPhet[3]); fixedWindowHet2Vector.push_back(thisSNPhet[1]);
                    if (!opt::annotFile.empty()) {
                        string scaffold = fields[0]; string loc = fields[1]; // Scaffold
                        string SNPcategory = wgAnnotation.getCategoryOfSNP(scaffold, loc);
                        double thisSNPFst = FstNumerator/FstDenominator;
                        *snpCategoryFstFile << SNPcategory << "\t" << thisSNPFst << "\t" << thisSNPDxy << "\t" << scaffold << "\t" << loc << std::endl;
                    }
                    if (!opt::ancSets.empty()) {
                        double thisSNPFst = FstNumerator/FstDenominator;
                        if (thisSNPFst < 0) { thisSNPFst = 0; }
                        // The ancestral allele is taken from the last INFO entry ("<key>=<allele>")
                        string AA = split(info[info.size()-1],'=')[1];
                        FourSetCounts c;
                        if (AA == fields[3]) {
                            c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"ref");
                            *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t";
                            if (c.set3daAF > 0 && c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; }
                            if (c.set4daAF > 0 && c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; }
                        } else if (AA == fields[4]) {
                            c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"alt");
                            *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << c.set1daAF-c.set2daAF << "\t" << thisSNPFst << "\t";
                            if (c.set3daAF > 0 && c.set3daAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; }
                            if (c.set4daAF > 0 && c.set4daAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; }
                        } else {
                            // Ancestral allele matches neither REF nor ALT: -888 is written in place of the frequency difference
                            c = getFourSetVariantCounts(fields,set1Loci,set2Loci,ancSet1Loci,ancSet2Loci,"N");
                            *ancSetsOutFile << fields[0] << "\t" << fields[1] << "\t" << "-888" << "\t" << thisSNPFst << "\t";
                            if (c.set3AltAF > 0 && c.set3AltAF < 1) { *ancSetsOutFile << "1" << "\t"; } else { *ancSetsOutFile << "0" << "\t"; }
                            if (c.set4AltAF > 0 && c.set4AltAF < 1) { *ancSetsOutFile << "1" << std::endl; } else { *ancSetsOutFile << "0" << std::endl; }
                        }
                    }
                    
                    // Fixed 10000 bp windows: emit a row once a variant lies beyond the current window.
                    // NOTE(review): the scaffold test relies on windowStartEnd, which is only advanced
                    // inside the windowSize == windowStep branches below - confirm this is intended.
                    std::vector<string> s = split(windowStartEnd, '\t');
                    if (s[0] == fields[0]) {
                        if (atoi(fields[1].c_str()) > (fixedWindowStart+10000)) {
                            double thisFixedWindowDxy = vector_average_withRegion(fixedWindowDxyVector, 10000);
                            double thisFixedWindowFst = calculateFst(fixedWindowFstNumVector, fixedWindowFstDenomVector);
                            double thisFixedWindowPi1 = vector_average_withRegion(fixedWindowPi1Vector, 10000);
                            double thisFixedWindowPi2 = vector_average_withRegion(fixedWindowPi2Vector, 10000);
                            *fstDxyFixedWindowFile << fields[0] << "\t" << fixedWindowStart << "\t" << fixedWindowStart+10000 << "\t" << thisFixedWindowFst << "\t" << thisFixedWindowDxy << "\t" << thisFixedWindowPi1 << "\t" << thisFixedWindowPi2 << std::endl;
                            fixedWindowDxyVector.clear(); fixedWindowFstNumVector.clear(); fixedWindowFstDenomVector.clear();
                            fixedWindowHet1Vector.clear(); fixedWindowHet2Vector.clear(); fixedWindowPi1Vector.clear(); fixedWindowPi2Vector.clear();
                            fixedWindowStart= fixedWindowStart+10000;
                        }
                    } else {
                        fixedWindowStart = 0;
                    }
                    
                    if (opt::windowSize == 1) {
                        // Per-SNP output; negative Fst estimates are reported as 0
                        double Fst = FstNumerator/FstDenominator;
                        if (Fst < 0) Fst = 0;
                        *pFst << countedVariantNumber << "\t" << fields[0] + "\t" + fields[1] << "\t" << Fst << "\t" << thisSNPDxy << std::endl;
                        
                    } else if ((opt::windowSize > 0) && (countedVariantNumber % opt::windowStep == 0) && countedVariantNumber >= opt::windowSize) {
                        // Sliding window over the last opt::windowSize counted variants
                        std::vector<double> windowFstNumerators(fstNumerators.end()-opt::windowSize, fstNumerators.end());
                        std::vector<double> windowFstDenominators(fstDenominators.end()-opt::windowSize, fstDenominators.end());
                        double windowFst = calculateFst(windowFstNumerators, windowFstDenominators); if (windowFst < 0) windowFst = 0;
                        std::vector<double> windowDxyVec(DxyVector.end()-opt::windowSize, DxyVector.end());
                        double windowDxy = vector_average(windowDxyVec);
                        if (opt::windowSize == opt::windowStep) {
                            std::vector<string> s = split(windowStartEnd, '\t');
                            if (s[0] == fields[0]) {
                                windowStartEnd = windowStartEnd + "\t" + fields[1];
                                windowEnd = atoi(fields[1].c_str());
                                double windowDxyIncNonSeg = vector_average_withRegion(windowDxyVec, windowEnd-windowStart);
                                *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowStartEnd << "\t" << windowFst << "\t" << windowDxy << "\t" << windowDxyIncNonSeg << "\t" << windowFstDenominators.size() << std::endl;
                                if (opt::regAbove > 0) {
                                    // Open a region when Fst first reaches the threshold, close it when it drops below
                                    if (windowFst >= opt::regAbove && !inRegAbove) {
                                        inRegAbove = true;
                                        *regionsAboveFstFile << s[0] << "\t" << s[1] << "\t";
                                    } else if (windowFst < opt::regAbove && inRegAbove) {
                                        inRegAbove = false;
                                        *regionsAboveFstFile << s[1] << std::endl;
                                    }
                                }
                            }
                        } else {
                            *pFst << countedVariantNumber-opt::windowSize+1 << "\t" << windowMiddleVariant << "\t" << windowFst << "\t" << windowDxy << "\t" << windowFstDenominators.size() << std::endl;
                        }
                        // Now calculate and output expected heterozygosities for this window
                        std::vector<double> windowHetS1Vec(set1heterozygositiesSimple.end()-opt::windowSize, set1heterozygositiesSimple.end());
                        double windowHetS1 = vector_average(windowHetS1Vec);
                        std::vector<double> windowHetS2Vec(set2heterozygositiesSimple.end()-opt::windowSize, set2heterozygositiesSimple.end());
                        double windowHetS2 = vector_average(windowHetS2Vec);
                        std::vector<double> windowHetNei1Vec(set1heterozygositiesNei.end()-opt::windowSize, set1heterozygositiesNei.end());
                        double windowHetNei1 = vector_average(windowHetNei1Vec);
                        std::vector<double> windowHetNei2Vec(set2heterozygositiesNei.end()-opt::windowSize, set2heterozygositiesNei.end());
                        double windowHetNei2 = vector_average(windowHetNei2Vec);
                        std::vector<double> windowHetPi1Vec(set1heterozygositiesPi.end()-opt::windowSize, set1heterozygositiesPi.end());
                        double windowHetPi1 = vector_average_withRegion(windowHetPi1Vec, windowEnd-windowStart);
                        std::vector<double> windowHetPi2Vec(set2heterozygositiesPi.end()-opt::windowSize, set2heterozygositiesPi.end());
                        double windowHetPi2 = vector_average_withRegion(windowHetPi2Vec, windowEnd-windowStart);
                        if (opt::windowSize == opt::windowStep) {
                            std::vector<string> s = split(windowStartEnd, '\t');
                            if (s[0] == fields[0]) {
                                *pHetSets << windowStartEnd << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << "\t" << windowHetPi1 << "\t" << windowHetPi2 << std::endl;
                                windowStartEnd = fields[0] + "\t" + fields[1];
                                windowStart = atoi(fields[1].c_str());
                            } else {
                                // Scaffold changed: restart the window bookkeeping on the new scaffold
                                windowStartEnd = fields[0] + "\t0";
                                windowStart = 0;
                            }
                        } else {
                            *pHetSets << windowMiddleVariant << "\t" << windowHetS1 << "\t" << windowHetS2 << "\t" << windowHetNei1 << "\t" << windowHetNei2 << std::endl;
                            windowMiddleVariant = fields[0] + "\t" + fields[1];     // works only if STEP is half SIZE for the window
                        }
                    }
                }
            }
            if (totalVariantNumber % 100000 == 0) {
                double Fst = calculateFst(fstNumerators, fstDenominators);
                std::cerr << totalVariantNumber << " variants processed... Fst: " << Fst << std::endl;
            }
                
        }
    }
    double Fst = calculateFst(fstNumerators, fstDenominators);
    double overallHetS1 = vector_average(set1heterozygositiesSimple);
    double overallHetS2 = vector_average(set2heterozygositiesSimple);
    double overallHetNei1 = vector_average(set1heterozygositiesNei);
    double overallHetNei2 = vector_average(set2heterozygositiesNei);
    
    std::cerr << "Fst: " << Fst << std::endl;
    std::cerr << "Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2:" << overallHetNei2 << std::endl;
    *pHetSets << "#Heterozygosities: " << "\tS1:" << overallHetS1 << "\tS2:" << overallHetS2 << "\tNei1:" << overallHetNei1 << "\tNei2:" << overallHetNei2 << std::endl;
    
    // Release the streams that were allocated in this function (deleting a
    // nullptr is a no-op, so unused optional outputs are fine).
    // NOTE(review): vcfFile (returned by createReader) and annotFile (handed to
    // Annotation) are deliberately left alone - their ownership is not visible here.
    delete setsFile;
    delete ancSetsFile;
    delete snpCategoryFstFile;
    delete regionsAboveFstFile;
    delete fstDxyFixedWindowFile;
    delete ancSetsOutFile;
    delete pFst;
    delete pHetSets;
}
Пример #2
0
// Builds the list of haplotype indices used for one population.
// With useAll the indices offset .. offset+sampleSize-1 are returned in order;
// otherwise sampleSize distinct indices are drawn with rand() from
// [offset, offset+populationSize). The caller must ensure
// 0 <= sampleSize <= populationSize, or the rejection loop cannot terminate.
static std::vector<int> chooseMsSetLoci(const bool useAll, const int sampleSize, const int populationSize, const int offset) {
    std::vector<int> loci;
    if (useAll) {
        for (int i = 0; i != sampleSize; i++) {
            loci.push_back(i + offset);
        }
    } else {
        for (int i = 0; i != sampleSize; i++) {
            int candidate = (rand() % populationSize) + offset;
            // Redraw until the index has not been picked yet (sampling without replacement)
            while (std::find(loci.begin(), loci.end(), candidate) != loci.end()) {
                candidate = (rand() % populationSize) + offset;
            }
            loci.push_back(candidate);
        }
    }
    return loci;
}

/**
 * Calculates Fst between two populations from the file opt::msFile, reading it
 * one line at a time and treating character i of a line as the allele ('1' =
 * variant) carried by haplotype i. Population 1 occupies indices
 * 0 .. opt::msSet1Size-1 and population 2 the opt::msSet2Size indices after it.
 *
 * If opt::msSet1FstSample / opt::msSet2FstSample are 0 they are set to the full
 * population size; otherwise that many haplotypes are sampled at random.
 * When opt::msPvalCutoff > 0, lines whose Fisher or chi-square p-value is below
 * the cutoff are written to "<root>_<runName>_pvals.txt".
 * Overall Fst and the counts of fixed / nearly fixed sites go to stderr.
 */
void getFstFromMs() {
    std::cerr << "Calculating Fst using variants from: " << opt::msFile << std::endl;
    std::cerr << "and outputting chi-sq test p-vals < " << opt::msPvalCutoff << std::endl;
    
    // A requested sample larger than the population can never be filled with
    // distinct individuals, so refuse it instead of looping forever.
    const bool useAllSet1 = (opt::msSet1FstSample == 0);
    const bool useAllSet2 = (opt::msSet2FstSample == 0);
    if ((!useAllSet1 && (opt::msSet1FstSample < 0 || opt::msSet1FstSample > opt::msSet1Size)) ||
        (!useAllSet2 && (opt::msSet2FstSample < 0 || opt::msSet2FstSample > opt::msSet2Size))) {
        std::cerr << "Error: the number of individuals sampled for Fst calculation must be between 0 and the population size" << std::endl;
        return;
    }
    
    // Stack-allocated streams are closed automatically when the function returns
    std::ifstream msFile(opt::msFile.c_str());
    string fileRoot = stripExtension(opt::msFile);
    string PvalFileName = fileRoot + "_" + opt::runName + "_pvals.txt";
    std::ofstream pValFile;  // only opened when a positive cutoff was requested
    if (opt::msPvalCutoff > 0) {
        pValFile.open(PvalFileName.c_str());
        pValFile << "Fisher p-val" << "\t" << "chi-sq pval" << "\t" << "set1Alt" << "\t" << "set1Ref" << "\t" << "set2Alt" << "\t" << "set2Ref" << "\t" << "Fst" << std::endl;
    }
    
    srand((int)time(NULL));
    if (useAllSet1) opt::msSet1FstSample = opt::msSet1Size;
    std::vector<int> set1_loci = chooseMsSetLoci(useAllSet1, opt::msSet1FstSample, opt::msSet1Size, 0);
    // Do the same for set2, whose haplotypes follow those of set1 on each line
    if (useAllSet2) opt::msSet2FstSample = opt::msSet2Size;
    std::vector<int> set2_loci = chooseMsSetLoci(useAllSet2, opt::msSet2FstSample, opt::msSet2Size, opt::msSet1Size);

    std::cerr << "Selected population 1 individuals: "; print_vector_stream(set1_loci, std::cerr);
    std::cerr << "Selected population 2 individuals: "; print_vector_stream(set2_loci, std::cerr);
    
    if (opt::msSet1Size != opt::msSet1FstSample || opt::msSet2Size != opt::msSet2FstSample) {
        std::cerr << "Warning: the Fst column is going to contain '-1' values where the site is not a segregating site in the sampled individuals for Fst calcultation" << std::endl;
    }
    
    // NOTE(review): each reserve asks for room for 5e8 doubles up front - confirm this is intended.
    std::vector<double> fstNumerators; fstNumerators.reserve(500000000);
    std::vector<double> fstDenominators; fstDenominators.reserve(500000000);
    
    string line;
    int numFixedSites = 0;
    int numNearlyFixedSites = 0;
    std::vector<int> moreSet1;
    std::vector<int> lessSet1;
    std::vector<int> moreSet2;
    std::vector<int> lessSet2;
    SetCounts counts;
    while (getline(msFile, line)) {
        counts.reset();
        double thisFst = -1;
        // Count variant alleles among the selected haplotypes; indices beyond the
        // end of a short line are treated as "no variant" rather than read out of bounds.
        for (std::vector<int>::const_iterator it = set1_loci.begin(); it != set1_loci.end(); ++it) {
            if (static_cast<std::string::size_type>(*it) < line.size() && line[*it] == '1') {
                counts.set1Count++;
            }
        }
        for (std::vector<int>::const_iterator it = set2_loci.begin(); it != set2_loci.end(); ++it) {
            if (static_cast<std::string::size_type>(*it) < line.size() && line[*it] == '1') {
                counts.set2Count++;
            }
        }
        
        // Only sites where the variant is seen in the sampled haplotypes contribute to Fst
        if (counts.set1Count > 0 || counts.set2Count > 0) {
            double FstNum = calculateFstNumerator(counts, opt::msSet1FstSample, opt::msSet2FstSample);
            double FstDenom = calculateFstDenominator(counts, opt::msSet1FstSample, opt::msSet2FstSample);
            thisFst = FstNum/FstDenom; if (thisFst < 0) thisFst = 0;
            fstNumerators.push_back(FstNum);
            fstDenominators.push_back(FstDenom);
        }
        
        // Fixed difference: the variant is absent in one set and carried by every haplotype of the other
        if ((counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 0)) {
            numFixedSites++;
        }
        
        // "Tier2": one haplotype away from a fixed difference
        if ((counts.set1Count == 1 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample-1) ||
            (counts.set1Count == opt::msSet1FstSample-1 && counts.set2Count == 0) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 1)) {
            numNearlyFixedSites++;
        }
        
        int set1WithoutVariant = opt::msSet1FstSample-counts.set1Count;
        int set2WithoutVariant = opt::msSet2FstSample-counts.set2Count;
        
        // Orient the counts by whichever allele is more common in set 1
        if (counts.set1Count >= set1WithoutVariant) {
            moreSet1.push_back(counts.set1Count);
            lessSet1.push_back(set1WithoutVariant);
            moreSet2.push_back(counts.set2Count);
            lessSet2.push_back(set2WithoutVariant);
        } else {
            moreSet1.push_back(set1WithoutVariant);
            lessSet1.push_back(counts.set1Count);
            moreSet2.push_back(set2WithoutVariant);
            lessSet2.push_back(counts.set2Count);
        }
        
        // Association tests only for sites that are polymorphic in the sampled haplotypes
        if ((counts.set1Count != 0 || counts.set2Count != 0) && (set1WithoutVariant != 0 || set2WithoutVariant != 0)) {
            // Fisher's exact test is only run for small samples (at most 60 haplotypes in total)
            if (opt::msSet1FstSample + opt::msSet2FstSample <= 60) {
                counts.fisher_pval = fisher_exact(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
            }
            counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
        }
        
        if (opt::msPvalCutoff > 0 && (counts.fisher_pval < opt::msPvalCutoff || counts.chi_sq_pval < opt::msPvalCutoff)) {
            pValFile << counts.fisher_pval << "\t" << counts.chi_sq_pval << "\t" << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << "\t" << thisFst << std::endl;
        }
    }
    
    double Fst = calculateFst(fstNumerators, fstDenominators);
    std::cerr << "Fst: " << Fst << std::endl;
    std::cerr << "Fixed sites: " << numFixedSites << std::endl;
    std::cerr << "Tier2 sites: " << numNearlyFixedSites << std::endl;
    std::cerr << "Null ChiSq 1:" << vector_average(moreSet1)/opt::msSet1FstSample << "\t" << vector_average(lessSet1)/opt::msSet1FstSample << std::endl;
    std::cerr << "Null ChiSq 2:" << vector_average(moreSet2)/opt::msSet2FstSample << "\t" << vector_average(lessSet2)/opt::msSet2FstSample << std::endl;
}
Пример #3
0
/*
 * Entry point of filtFst. Reads polymorphism data from stdin and calls
 * calculateFst() once per data set.
 *
 * Two input flavours are told apart by the very first token on stdin:
 *   - it is an integer: the input is the 'pseudoMs' format; the integer is the
 *     number of segregating sites and the sample sizes come from argv
 *     (filtFst n1 n2 [min_sample_size]);
 *   - it is not an integer: the input is taken to be ms output whose first line
 *     is the ms command line; the sample sizes are parsed from its -I flag.
 *
 * Return codes: 0 success, 1 usage error, 2 allocation failure or truncated
 * input, 3 missing -I flag / missing segsites line, 4 unparsable -I flag,
 * 5 number of populations is not 2.
 */
int main(int argc, char *argv[]){
    
    char **list;
    SSMAX = 5000000;
    
    int isMs = 0; // becomes 1 when stdin starts with an ms command line
    char *segSiteString; // scanf format used to read the number of segregating sites
    int temp; // holds getchar() results; must be int so EOF is distinguishable from data
    
    // scanf returns 0 when the first token is not an integer (an EOF return
    // takes the else branch, as before)
    if (!scanf("%d ", &ss)){
        
        isMs = 1;
        segSiteString = "segsites: %d\n";
        
        // scan the ms command line (the first input line) for the -I flag
        while(1){
            
            temp = getchar();
            if(temp == '-'){
                
                temp = getchar();
                if(temp == 'I')
                    break;
            }
            
            // reaching the end of the line (or of the input) means there is no -I flag
            if(temp == '\n' || temp == EOF){
                
                fprintf(stderr, "Failed to find the -I flag in your ms command line!\n");
                return 3;
            }
        }
        
        int numberOfPops;
        if(scanf("%d %d %d", &numberOfPops, &n1, &n2) != 3){
            
            fprintf(stderr, "Failed to properly parse the ms -I flag!\n");
            return 4;            
        }
        else if(numberOfPops != 2){
            
            fprintf(stderr, "filtFst only works with two populations at a time, not %d\n", numberOfPops);
            return 5; // ensure that only two populations are specified
        }
        
        n = n1 + n2;
        // NOTE(review): min_sample_size is not assigned on this path - confirm its default is suitable.
    }
    else{
        
        segSiteString = "%d ";
        
        if(argc != 3 && argc != 4){
            
            fprintf(stderr, "Usage:\nfiltFst n1 n2 (Sample Size Per Population)\n");
            return 1;
        }
        
        n1 = atoi(argv[1]);
        n2 = atoi(argv[2]);
        n = n1 + n2;
        
        if(argc == 4){
            
            min_sample_size = atoi(argv[3]);
            // min = 2 to avoid divide by zero
            if (min_sample_size < 2)
                min_sample_size = 2;
            
        }
        else
            min_sample_size = 2;
        
        if(n1 <= min_sample_size || n2 <= min_sample_size){
            
            fprintf(stderr,
            "Illegal population sizes specified!\nMust be greater than %d\n", min_sample_size);
            fprintf(stderr, "Usage:\nfiltFst n1 n2 (Sample Size Per Population)\n");
            
            return 1;
        }
    }
    
    // allocate one row of SSMAX characters per sampled chromosome
    list = (char **) malloc ((unsigned) n * sizeof (char *));
    if(list == NULL){
        
        fprintf(stderr, "Failed to malloc in main!\n");
        return 2;
    }
    
    for(a=0; a<n; ++a){
        
        list[a] = (char *) malloc ((unsigned) SSMAX * sizeof (char));
        if(list[a] == NULL){
            
            // stop here: continuing would hand a NULL row to calculateFst
            fprintf(stderr, "Failed to malloc in main!\n");
            return 2;
        }
    }
    
    if (isMs){
        
        // this parses the ms file itself, one replicate per '//' delimiter
        while((temp = getchar()) != EOF){
            
            while(temp != EOF && temp != '/') // looks for the '//' delimiter
                temp = getchar();
            
            if(temp == EOF) {
                
                fprintf(stderr, "Premature ending of stdin\n");
                return 2;
            }
            
            getchar(); // second '/' delimiter
            getchar(); // newline
            
            if(scanf(segSiteString, &ss) != 1){
                
                fprintf(stderr, "Failed to grab the number of segregating sites from the ms input!\n");
                return 3;
            }
            
            scanf("positions: ");
            calculateFst(list, segSiteString);
        }
    }
    else{
        
        // read the 'pseudoMs' (i.e., real data files a la Jeff Wall)
        do{
            calculateFst(list, segSiteString);
        }
        while(scanf(segSiteString, &ss) == 1);
    }
    
    return 0;
}