Beispiel #1
0
// Run the VCF cleaner tool.
//
// Parses the command line (skipping argv[0]), then copies records from the
// "--in" VCF to the "--out" VCF, keeping only records for which
// hasAllGenotypeAlleles() and allPhased() are both true.  The INFO field of
// every kept record is cleared before writing, and only the GT genotype
// field is registered for storage.
//
// Parameters:
//   argc, argv - the command-line arguments; argv[0] is skipped.
//
// Returns:
//   -1 if "--in" or "--out" is missing or if any record failed to write,
//   otherwise 0.
//
// Read/written record counts are reported on stderr.  A record is counted
// as written only when the write succeeded.
int VcfCleaner::execute(int argc, char **argv)
{
    String inputVcf = "";
    String outputVcf = "";
    bool uncompress = false;
    bool params = false;

    // Read in the parameters.
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("out", &outputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();

    inputParameters.Add(new LongParameters ("Input Parameters",
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--out\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    VcfFileReader inFile;
    VcfFileWriter outFile;
    VcfHeader header;
    VcfRecord record;

    // Open the input, which fills in the header, then open the output
    // with that same header.
    inFile.open(inputVcf, header);
    if(uncompress)
    {
        // NOTE(review): InputFile::DEFAULT is presumably the uncompressed
        // mode, given the flag name - confirm against InputFile.
        outFile.open(outputVcf, header, InputFile::DEFAULT);
    }
    else
    {
        outFile.open(outputVcf, header);
    }

    int numReadRecords = 0;
    int numWrittenRecords = 0;
    int returnVal = 0;

    // Set to only store/write the GT field.
    VcfRecordGenotype::addStoreField("GT");
    while(inFile.readRecord(record))
    {
        ++numReadRecords;
        // Check if any samples are missing GT or if any are not phased.
        if(!record.hasAllGenotypeAlleles() || !record.allPhased())
        {
            // Missing a GT or not phased, so continue without writing.
            continue;
        }

        // Clear the INFO field.
        record.getInfo().clear();

        // Write the record, counting it only if the write succeeded so the
        // reported total reflects what is actually in the output.
        if(outFile.writeRecord(record))
        {
            ++numWrittenRecords;
        }
        else
        {
            // Write error: remember it for the return code, but keep going.
            std::cerr << "Failed writing a vcf record.\n";
            returnVal = -1;
        }
    }

    inFile.close();
    outFile.close();

    std::cerr << "NumReadRecords: " << numReadRecords
              << "; NumWrittenRecords: " << numWrittenRecords << "\n";
    return(returnVal);
}
Beispiel #2
0
int VcfSplit::execute(int argc, char **argv)
{
    String refFile = "";
    String inputVcf = "";
    String outputVcfBase = "";
    String refName = "";
    bool uncompress = false;
    bool params = false;
    bool noeof = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("obase", &outputVcfBase)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcfBase == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--obase\", a required parameter.\n\n";
        return(-1);
    }
    outputVcfBase += ".";

    if(params)
    {
        inputParameters.Status();
    }

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    VcfFileReader inFile;
    std::map<std::string, VcfFileWriter*> outFiles;
    VcfHeader header;
    
    // Open the file.
    inFile.open(inputVcf, header);

    if(refName != "")
    {
        inFile.setReadSection(refName.c_str());
    }

    VcfRecord record;
    int numRecords = 0;

    std::string prevChr = "";
    std::string chr = "";
    VcfFileWriter* outFilePtr = 0;
    std::string outName = "";
    while(inFile.readRecord(record))
    {
        ++numRecords;

        chr = record.getChromStr();

        if((outFilePtr == 0) || (chr != prevChr))
        {
            outFilePtr = outFiles[chr];
            if(outFilePtr == 0)
            {
                outFilePtr = new VcfFileWriter();
                outFiles[chr] = outFilePtr;
                outName = outputVcfBase.c_str();
                if(chr.substr(0,3) != "chr")
                {
                    outName += "chr";
                }
                outName += chr + ".vcf";
                // chr not in outFile list.
                if(uncompress)
                {
                    outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT);
                }
                else
                {
                    outName += ".gz";
                    outFilePtr->open(outName.c_str(), header);
                }
            }
        }
        outFilePtr->writeRecord(record);
    }
 
    inFile.close();   

    for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin();
         it != outFiles.end(); ++it)
    {
        if(it->second != 0)
        {
            it->second->close();
            it->second = 0;
        }
    }
  

    std::cerr << "NumRecords: " << numRecords << "\n";
    return(0);
}
Beispiel #3
0
// Run the VCF convert tool.
//
// Parses the command line (skipping argv[0]), then copies every record read
// from the "--in" VCF to the "--out" VCF, using the input file's header for
// the output.
//
// Optional parameters: "--uncompress" opens the output with
// InputFile::DEFAULT, "--refName" restricts reading via setReadSection(),
// "--noeof" disables the BGZF EOF block requirement, "--params" prints the
// parameter status.
//
// Parameters:
//   argc, argv - the command-line arguments; argv[0] is skipped.
//
// Returns:
//   -1 if "--in" or "--out" is missing or if any record failed to write,
//   otherwise 0.
int VcfConvert::execute(int argc, char **argv)
{
    String inputVcf = "";
    String outputVcf = "";
    String refName = "";
    bool uncompress = false;
    bool params = false;
    bool noeof = false;

    // Read in the parameters.
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("out", &outputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();

    inputParameters.Add(new LongParameters ("Input Parameters",
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--out\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // If no eof block is required for a bgzf file, set the bgzf file type to
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    VcfFileReader inFile;
    VcfFileWriter outFile;
    VcfHeader header;

    // Open the input, which fills in the header.
    inFile.open(inputVcf, header);

    if(refName != "")
    {
        inFile.setReadSection(refName.c_str());
    }

    // Open the output with the input's header.
    if(uncompress)
    {
        outFile.open(outputVcf, header, InputFile::DEFAULT);
    }
    else
    {
        outFile.open(outputVcf, header);
    }

    VcfRecord record;
    int numRecords = 0;
    int returnVal = 0;

    while(inFile.readRecord(record))
    {
        ++numRecords;

        if(!outFile.writeRecord(record))
        {
            // Write error: remember it for the return code, but keep going.
            std::cerr << "Failed writing a vcf record.\n";
            returnVal = -1;
        }
    }

    inFile.close();
    outFile.close();

    std::cerr << "NumRecords: " << numRecords << "\n";
    return(returnVal);
}
Beispiel #4
0
// Increment the counter matching the genotype string gt: count00 for "0/0",
// count01 for "0/1" or "1/0", count11 for "1/1".  Any other value leaves all
// three counters unchanged.
static void tallyConsensusGenotype(const std::string& gt,
                                   uint64_t& count00,
                                   uint64_t& count01,
                                   uint64_t& count11)
{
    if(gt == "0/0")
    {
        ++count00;
    }
    else if((gt == "0/1") || (gt == "1/0"))
    {
        ++count01;
    }
    else if(gt == "1/1")
    {
        ++count11;
    }
}

// Run the VCF consensus tool.
//
// Parses the command line (skipping argv[0]) and opens the three input VCFs
// ("--in1", "--in2", "--in3").  Samples of the first file that are not
// present in both other files are removed from its header and excluded when
// reading.  For every record of the first file whose position is located in
// the other two files (via findPos) with matching REF/ALT strings, the GT of
// each remaining sample is compared across the three files:
//   - file 1 agrees with file 2 and/or file 3: GT is left unchanged;
//   - only files 2 and 3 agree: GT is replaced by their value;
//   - no two agree: GT is set to "./.".
// The resulting record is written to "--out".  Match statistics and
// (rate-limited) warnings are printed to stderr.
//
// Parameters:
//   argc, argv - the command-line arguments; argv[0] is skipped.
//
// Returns:
//   -1 if a required parameter is missing or if any record failed to write,
//   otherwise 0.
int VcfConsensus::execute(int argc, char ** argv)
{
    String vcfName1;
    String vcfName2;
    String vcfName3;
    String outputFileName;
    bool uncompress = false;
    bool params = false;

    // Read in the parameters.
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in1", &vcfName1)
        LONG_STRINGPARAMETER("in2", &vcfName2)
        LONG_STRINGPARAMETER("in3", &vcfName3)
        LONG_STRINGPARAMETER("out", &outputFileName)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();

    inputParameters.Add(new LongParameters ("Input Parameters",
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    std::string gtField = "GT";

    VcfFileReader vcf1;
    VcfFileReader vcf2;
    VcfFileReader vcf3;
    VcfHeader header1;
    VcfHeader header2;
    VcfHeader header3;
    VcfRecord record1;
    VcfRecord record2;
    VcfRecord record3;
    VcfRecordGenotype* genotypeInfoPtr1 = NULL;
    VcfRecordGenotype* genotypeInfoPtr2 = NULL;
    VcfRecordGenotype* genotypeInfoPtr3 = NULL;

    // Warning counters; only the first myMaxErrors of each kind are printed.
    unsigned int numMissing2 = 0;
    unsigned int numMissing3 = 0;
    unsigned int numMismatchRefAlt = 0;
    unsigned int numMissingGT1 = 0;
    const unsigned int myMaxErrors = 4;

    // Check that the required parameters were set.
    if(vcfName1 == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in1\", a required parameter.\n\n";
        return(-1);
    }
    if(vcfName2 == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in2\", a required parameter.\n\n";
        return(-1);
    }
    if(vcfName3 == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in3\", a required parameter.\n\n";
        return(-1);
    }
    if(outputFileName == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--out\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the files.
    vcf1.open(vcfName1, header1);
    vcf2.open(vcfName2, header2);
    vcf3.open(vcfName3, header3);

    // Setup the sample name maps: for each kept sample of vcf1 (in order),
    // the index of the same-named sample in vcf2 and vcf3.
    int numSamples = header1.getNumSamples();
    std::vector<int> sample2Indices;
    std::vector<int> sample3Indices;
    std::vector<int> removeIndices;
    int numSamplesSkipped1 = 0;
    int numSamplesSkipped2 = 0;
    int numSamplesSkipped3 = 0;
    for(int i = 0; i < numSamples; i++)
    {
        int sm2Index = header2.getSampleIndex(header1.getSampleName(i));
        int sm3Index = header3.getSampleIndex(header1.getSampleName(i));
        // Keep this sample only if it is in both vcf2 and vcf3.
        if((sm2Index != -1) && (sm3Index != -1))
        {
            sample2Indices.push_back(sm2Index);
            sample3Indices.push_back(sm3Index);
        }
        else
        {
            // Sample not found in all three vcfs.
            removeIndices.push_back(i);
            ++numSamplesSkipped1;
        }
    }
    // Remove samples not found in all 3 vcfs from header1.
    // Remove them in reverse order so they are removed from the end of the
    // header first (keeps the remaining indices valid).  Reverse iterators
    // avoid the unsigned wrap of "size() - 1" when nothing is removed.
    VcfSubsetSamples subset1;
    subset1.init(header1, true);
    for(std::vector<int>::reverse_iterator removeIt = removeIndices.rbegin();
        removeIt != removeIndices.rend(); ++removeIt)
    {
        subset1.addExcludeSample(header1.getSampleName(*removeIt));
        header1.removeSample(*removeIt);
    }

    // Set numSamples to the new number of samples in header1.
    numSamples = header1.getNumSamples();

    // Calculate the number of samples skipped for files 2 & 3.
    numSamplesSkipped2 = header2.getNumSamples() - sample2Indices.size();
    numSamplesSkipped3 = header3.getNumSamples() - sample3Indices.size();

    if(numSamplesSkipped1 > 0)
    {
        std::cerr << "Skipping " << numSamplesSkipped1 << " samples from --in1\n";
    }
    if(numSamplesSkipped2 > 0)
    {
        std::cerr << "Skipping " << numSamplesSkipped2 << " samples from --in2\n";
    }
    if(numSamplesSkipped3 > 0)
    {
        std::cerr << "Skipping " << numSamplesSkipped3 << " samples from --in3\n";
    }

    VcfFileWriter outputVcf;
    // Open and write the header
    if(uncompress)
    {
        outputVcf.open(outputFileName, header1, InputFile::DEFAULT);
    }
    else
    {
        outputVcf.open(outputFileName, header1);
    }

    const char* chrom1 = NULL;
    int pos1 = UNSET_POS;

    // Read the first record from vcf2 & vcf3.
    vcf2.readRecord(record2);
    vcf3.readRecord(record3);

    bool newChrom = true;
    // Chromosome of the previous vcf1 record.  Deliberately a plain local so
    // that no state carries over between calls to execute().
    std::string prevChrom = "";
    int returnVal = 0;

    uint64_t numAllMatch = 0;
    uint64_t num1Match2Only = 0;
    uint64_t num1Match3Only = 0;
    uint64_t num2Match3Only = 0;
    uint64_t numNoMatches = 0;

    uint64_t numAllMatch00 = 0;
    uint64_t num1Match2Only00 = 0;
    uint64_t num1Match3Only00 = 0;
    uint64_t num2Match3Only00 = 0;

    uint64_t numAllMatch01 = 0;
    uint64_t num1Match2Only01 = 0;
    uint64_t num1Match3Only01 = 0;
    uint64_t num2Match3Only01 = 0;

    uint64_t numAllMatch11 = 0;
    uint64_t num1Match2Only11 = 0;
    uint64_t num1Match3Only11 = 0;
    uint64_t num2Match3Only11 = 0;

    // Loop through vcf1.
    while(vcf1.readRecord(record1, &subset1))
    {
        chrom1 = record1.getChromStr();
        pos1 = record1.get1BasedPosition();

        if(strcmp(chrom1, prevChrom.c_str()) == 0)
        {
            newChrom = false;
        }
        else
        {
            prevChrom = chrom1;
            newChrom = true;
        }

        // Both files are always searched, even if the first search fails.
        // NOTE(review): findPos presumably advances record2/record3 within
        // their files - confirm against its definition.
        bool found = true;
        if(!findPos(newChrom, chrom1, pos1, record2, vcf2))
        {
            // Failed to find the position, continue to the next position
            if(++numMissing2 <= myMaxErrors)
            {
                std::cerr << "Failed to find " << chrom1 << ":" << pos1
                          << " in " << vcfName2 << ", so skipping this pos\n";
            }
            found = false;
        }

        if(!findPos(newChrom, chrom1, pos1, record3, vcf3))
        {
            // Failed to find the position, continue to the next position
            if(++numMissing3 <= myMaxErrors)
            {
                std::cerr << "Failed to find " << chrom1 << ":" << pos1
                          << " in " << vcfName3 << ", so skipping this pos\n";
            }
            found = false;
        }

        if(found == false)
        {
            continue;
        }

        // Found the position in all files.

        // Validate that the reference & alternate alleles are the same.
        const char* ref1 = record1.getRefStr();
        const char* alt1 = record1.getAltStr();
        if((strcmp(ref1, record2.getRefStr()) != 0) ||
           (strcmp(ref1, record3.getRefStr()) != 0) ||
           (strcmp(alt1, record2.getAltStr()) != 0) ||
           (strcmp(alt1, record3.getAltStr()) != 0))
        {
            if(++numMismatchRefAlt <= myMaxErrors)
            {
                std::cerr << "Mismatching ref/alt found at " << chrom1 << ":" << pos1 << ", so skipping this pos\n";
            }
            continue;
        }

        // Get the genotype information for each.
        genotypeInfoPtr1 = &(record1.getGenotypeInfo());
        genotypeInfoPtr2 = &(record2.getGenotypeInfo());
        genotypeInfoPtr3 = &(record3.getGenotypeInfo());

        // Loop through all the samples in vcf1.
        // Get the Genotype Information.
        for(int i = 0; i < numSamples; i++)
        {
            const std::string* genotypeVal1 = genotypeInfoPtr1->getString(gtField, i);
            const std::string* genotypeVal2 = genotypeInfoPtr2->getString(gtField, sample2Indices[i]);
            const std::string* genotypeVal3 = genotypeInfoPtr3->getString(gtField, sample3Indices[i]);
            // Need to make sure the field was found.
            if(genotypeVal1 == NULL)
            {
                // GT not found in the first record, so just continue.
                if(++numMissingGT1 <= myMaxErrors)
                {
                    std::cerr << "Missing GT for " << header1.getSampleName(i) << " in " << vcfName1 << "\n";
                }
                continue;
            }

            if(isSame(genotypeVal1, genotypeVal2))
            {
                // genotypeVal1 is majority, so make no change.
                if(isSame(genotypeVal1, genotypeVal3))
                {
                    ++numAllMatch;
                    tallyConsensusGenotype(*genotypeVal1, numAllMatch00,
                                           numAllMatch01, numAllMatch11);
                }
                else
                {
                    ++num1Match2Only;
                    tallyConsensusGenotype(*genotypeVal1, num1Match2Only00,
                                           num1Match2Only01, num1Match2Only11);
                }
            }
            else if(isSame(genotypeVal1, genotypeVal3))
            {
                // genotypeVal1 is majority, so make no change.
                ++num1Match3Only;
                tallyConsensusGenotype(*genotypeVal1, num1Match3Only00,
                                       num1Match3Only01, num1Match3Only11);
            }
            else if((genotypeVal2 != NULL) &&
                    isSame(genotypeVal2, genotypeVal3))
            {
                // genotypeVal2 is majority, so change genotypeVal1.
                // The NULL guard above protects the dereferences below
                // regardless of how isSame treats NULL arguments.
                genotypeInfoPtr1->setString(gtField, i, *genotypeVal2);
                ++num2Match3Only;
                tallyConsensusGenotype(*genotypeVal2, num2Match3Only00,
                                       num2Match3Only01, num2Match3Only11);
            }
            else
            {
                // None are the same so set to "./."
                genotypeInfoPtr1->setString(gtField, i, "./.");
                ++numNoMatches;
            }
        } // loop back to vcf1 samples.

        // Write this record.
        if(!outputVcf.writeRecord(record1))
        {
            // Write error: remember it for the return code, but keep going.
            std::cerr << "Failed writing a vcf record.\n";
            returnVal = -1;
        }
    } // loop back to next vcf1 record.

    vcf1.close();
    vcf2.close();
    vcf3.close();
    outputVcf.close();

    // Report how many warnings of each kind were suppressed.
    std::cerr << "\n";
    if(numMissing2 > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMissing2 - myMaxErrors
                  << " errors about skipped positions because they were not in "
                  << vcfName2
                  << "\n";
    }

    if(numMissing3 > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMissing3 - myMaxErrors
                  << " errors about skipped positions because they were not in "
                  << vcfName3
                  << "\n";
    }

    if(numMismatchRefAlt > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMismatchRefAlt - myMaxErrors
                  << " errors about mismatched ref/alt positions\n";
    }

    if(numMissingGT1 > myMaxErrors)
    {
        std::cerr << "Suppressed "
                  << numMissingGT1 - myMaxErrors
                  << " errors about missing GT for "
                  << vcfName1
                  << "\n";
    }
    std::cerr << "\n";
    // Output the stats.
    std::cerr << "File1 = " << vcfName1 << std::endl;
    std::cerr << "File2 = " << vcfName2 << std::endl;
    std::cerr << "File3 = " << vcfName3 << std::endl;
    std::cerr << "\nType\tTotal\t0/0\t0/1|1/0\t1/1\n";
    std::cerr << "AllMatched"
              << "\t" << numAllMatch
              << "\t" << numAllMatch00
              << "\t" << numAllMatch01
              << "\t" << numAllMatch11 << std::endl;
    std::cerr << "1matched2"
              << "\t" << num1Match2Only
              << "\t" << num1Match2Only00
              << "\t" << num1Match2Only01
              << "\t" << num1Match2Only11 << std::endl;
    std::cerr << "1matched3"
              << "\t" << num1Match3Only
              << "\t" << num1Match3Only00
              << "\t" << num1Match3Only01
              << "\t" << num1Match3Only11 << std::endl;
    std::cerr << "2matched3"
              << "\t" << num2Match3Only
              << "\t" << num2Match3Only00
              << "\t" << num2Match3Only01
              << "\t" << num2Match3Only11 << std::endl;
    std::cerr << "NoneMatched\t" << numNoMatches << std::endl;

    return(returnVal);
}