示例#1
0
// Advance vcf2 until its current record reaches (chrom1, pos1).
//
// record2 must already hold a record from vcf2 on entry; it is overwritten
// by every record read while searching.
//   - While record2 is on chrom1, reading continues as long as its position
//     is before pos1.
//   - While record2 is on another chromosome, reading continues only when
//     newChrom is set; otherwise nothing more is read.
// Returns true only if record2 ends up on chrom1 at exactly pos1.  Returns
// false if vcf2 runs out of records, or if the search stops on a different
// chromosome or on a position other than pos1.
bool findPos(bool newChrom, const char* chrom1, int pos1, VcfRecord& record2, VcfFileReader& vcf2)
{
    const char* curChrom = record2.getChromStr();
    int curPos = record2.get1BasedPosition();
    bool onChrom = (strcmp(curChrom, chrom1) == 0);

    for(;;)
    {
        // On the target chromosome: keep going while before pos1.
        // On any other chromosome: keep going only when asked to skip ahead.
        const bool keepReading = onChrom ? (curPos < pos1) : newChrom;
        if(!keepReading)
        {
            break;
        }

        if(!vcf2.readRecord(record2))
        {
            // no more records.
            return(false);
        }

        curChrom = record2.getChromStr();
        curPos = record2.get1BasedPosition();
        onChrom = (strcmp(curChrom, chrom1) == 0);
    }

    // The search stopped either exactly on the requested position or
    // somewhere that is not it; only the former counts as found.
    return(onChrom && (curPos == pos1));
}
示例#2
0
// Write one record to the open file using the writer's site-only setting.
// On success the written-record counter is incremented and true is returned.
// On failure the record's status is copied into this writer's status and
// false is returned.
bool VcfFileWriter::writeRecord(VcfRecord& record)
{
    const bool written = record.write(myFilePtr, mySiteOnly);
    if(written)
    {
        ++myNumRecords;
    }
    else
    {
        // Expose the reason for the failure through this writer's status.
        myStatus = record.getStatus();
    }
    return(written);
}
示例#3
0
    /// Emit a single record for a block that ends at position `end`.
    ///
    /// The INFO "END" key is set to `end` when `end` is beyond the record's
    /// own position and removed otherwise. Records inside the region are
    /// converted with make_record_haploid() before being written.
    void
    process_block(const bool is_in_region,
                  const unsigned end,
                  VcfRecord& vcfr) const {

        const bool extends_past_pos(end>vcfr.GetPos());
        if(! extends_past_pos) {
            vcfr.DeleteInfoKeyVal("END");
        } else {
            vcfr.SetInfoVal("END",_intstr.get32(end));
        }

        if(is_in_region) {
            make_record_haploid(vcfr);
        }

        vcfr.WriteUnaltered(_opt.outfp);
    }
示例#4
0
    /// Rewrite a diploid genotype on `vcfr` as haploid where possible.
    ///
    /// Records without a GT sample value, or whose parsed GT does not have
    /// exactly two alleles, are left untouched. When the two alleles agree,
    /// GT is replaced by that single allele ("." if the allele index is
    /// negative) and any PL value is moved to the configured backup tag.
    /// When the two alleles disagree, the haploid-conflict filter label is
    /// appended instead.
    void
    make_record_haploid(VcfRecord& vcfr) const {
        const char* gt(vcfr.GetSampleVal("GT"));
        if(NULL == gt)  return;
        parse_gt(gt,_gti);

        // only diploid records are modified
        if(_gti.size() != 2) return;

        if(_gti[0] != _gti[1]) {
            // the two alleles conflict, so flag the record and stop
            vcfr.AppendFilter(_shopt.haploid_conflict_label.c_str());
            return;
        }

        // collapse GT to the single shared allele:
        static const char* unknown(".");
        const char* haploid_gt(unknown);
        if(_gti[0]>=0) {
            haploid_gt=_intstr.get32(_gti[0]);
        }
        vcfr.SetSampleVal("GT",haploid_gt);

        // keep the original PL under the backup tag, then drop PL itself:
        const char* pl(vcfr.GetSampleVal("PL"));
        if(NULL == pl) return;
        vcfr.SetSampleVal(_shopt.orig_pl_tag.c_str(),pl);
        vcfr.DeleteSampleKeyVal("PL");
    }
示例#5
0
// Rebuild vcf_rec for the site (chr, center) from the reads in samIn.
//
// Read-type counts around the site are gathered from the bam, stored in a
// temporary single merge cell, passed to rstats.SetRecordGL(), and then
// copied into vcf_rec together with the chromosome name and position.
// When REFINE_BREAK_POINT is enabled and the resulting record has a
// dosage above zero, the breakpoint and its CI are additionally set from
// the bam.
void ResetVcfRecordFromBam( VcfRecord & vcf_rec, RefStats & rstats, vector<RefSeq*> & REF_SEQ, string & chr, int center, SamFile & samIn, SamFileHeader & samHeader )
{
	// 18 zero-initialised read-type counters, filled in from the bam
	vector<int> read_type_counts( 18, 0 );
	setReadCountInSection( read_type_counts, chr, center, samIn, samHeader, REF_SEQ );

	// one-element vector providing storage for the temporary merge cell
	vector<MergeCell> cell_store( 1 );
	MergeCellPtr cell = cell_store.begin();
	cell->dups = 1;
	cell->counts = read_type_counts;
	read_type_counts.clear();
	cell->GL.resize( 3, 0 );
	rstats.SetRecordGL( cell );

	// move the site and the cell contents into the vcf record
	vcf_rec.SetChrName( chr );
	vcf_rec.SetPosition( center );
	vcf_rec.UpdateFromMergeCellPtr( cell );

	// refine the breakpoint only when requested and the record has dosage
	if ( REFINE_BREAK_POINT && vcf_rec.GetDosage() > 0 )
	{
		vcf_rec.SetBreakPointAndCIFromBam( samIn, samHeader );
	}
}
示例#6
0
// Copy the VCF given by "--in" to "--out", keeping only records in which
// every sample has all genotype alleles and is phased.  Only the GT field is
// stored/written and the INFO field of each kept record is cleared.
//
// Parameters are parsed from argv starting at argv[1].
// Returns 0 on success and -1 if a required parameter is missing, a file
// cannot be opened, or any record fails to be written.
// A summary of the read/written record counts is printed to stderr.
int VcfCleaner::execute(int argc, char **argv)
{
    String inputVcf = "";
    String outputVcf = "";
    bool uncompress = false;
    bool params = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("out", &outputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_PARAMETER("params", &params)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--out\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    VcfFileReader inFile;
    VcfFileWriter outFile;
    VcfHeader header;
    VcfRecord record;

    // Open the input file; there is nothing to do if that fails.
    if(!inFile.open(inputVcf, header))
    {
        std::cerr << "Failed to open the input vcf: " << inputVcf << "\n";
        return(-1);
    }

    // Open the output file, uncompressed only if requested.
    bool outOpened = false;
    if(uncompress)
    {
        outOpened = outFile.open(outputVcf, header, InputFile::DEFAULT);
    }
    else
    {
        outOpened = outFile.open(outputVcf, header);
    }
    if(!outOpened)
    {
        std::cerr << "Failed to open the output vcf: " << outputVcf << "\n";
        inFile.close();
        return(-1);
    }

    int numReadRecords = 0;
    int numWrittenRecords = 0;
    int returnVal = 0;

    // Set to only store/write the GT field.
    VcfRecordGenotype::addStoreField("GT");
    while(inFile.readRecord(record))
    {
        ++numReadRecords;
        // Check if any samples are missing GT or if any are not phased.
        if(!record.hasAllGenotypeAlleles() || !record.allPhased())
        {
            // Missing a GT or not phased, so continue without writing.
            continue;
        }
        
        // Clear the INFO field.
        record.getInfo().clear();
        // Write the record.
        if(!outFile.writeRecord(record))
        {
            // Write error: report it, remember the failure for the return
            // value, and do not count the record as written.
            std::cerr << "Failed writing a vcf record.\n";
            returnVal = -1;
            continue;
        }
        ++numWrittenRecords;
    }
 
    inFile.close();   
    outFile.close();   

    std::cerr << "NumReadRecords: " << numReadRecords
              << "; NumWrittenRecords: " << numWrittenRecords << "\n";
    return(returnVal);
}
示例#7
0
int VcfSplit::execute(int argc, char **argv)
{
    String refFile = "";
    String inputVcf = "";
    String outputVcfBase = "";
    String refName = "";
    bool uncompress = false;
    bool params = false;
    bool noeof = false;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_STRINGPARAMETER("obase", &outputVcfBase)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_PARAMETER("uncompress", &uncompress)
        LONG_STRINGPARAMETER("refName", &refName)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }
    if(outputVcfBase == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--obase\", a required parameter.\n\n";
        return(-1);
    }
    outputVcfBase += ".";

    if(params)
    {
        inputParameters.Status();
    }

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    VcfFileReader inFile;
    std::map<std::string, VcfFileWriter*> outFiles;
    VcfHeader header;
    
    // Open the file.
    inFile.open(inputVcf, header);

    if(refName != "")
    {
        inFile.setReadSection(refName.c_str());
    }

    VcfRecord record;
    int numRecords = 0;

    std::string prevChr = "";
    std::string chr = "";
    VcfFileWriter* outFilePtr = 0;
    std::string outName = "";
    while(inFile.readRecord(record))
    {
        ++numRecords;

        chr = record.getChromStr();

        if((outFilePtr == 0) || (chr != prevChr))
        {
            outFilePtr = outFiles[chr];
            if(outFilePtr == 0)
            {
                outFilePtr = new VcfFileWriter();
                outFiles[chr] = outFilePtr;
                outName = outputVcfBase.c_str();
                if(chr.substr(0,3) != "chr")
                {
                    outName += "chr";
                }
                outName += chr + ".vcf";
                // chr not in outFile list.
                if(uncompress)
                {
                    outFilePtr->open(outName.c_str(), header, InputFile::DEFAULT);
                }
                else
                {
                    outName += ".gz";
                    outFilePtr->open(outName.c_str(), header);
                }
            }
        }
        outFilePtr->writeRecord(record);
    }
 
    inFile.close();   

    for (std::map<std::string,VcfFileWriter*>::iterator it = outFiles.begin();
         it != outFiles.end(); ++it)
    {
        if(it->second != 0)
        {
            it->second->close();
            it->second = 0;
        }
    }
  

    std::cerr << "NumRecords: " << numRecords << "\n";
    return(0);
}
示例#8
0
    /// Emit the record(s) for a block that ends at position `end`.
    ///
    /// Outside the region a single record is written, with INFO "END" set
    /// to `end` when `end` is beyond the record's position and removed
    /// otherwise. Inside the region the block is expanded: "END" is
    /// removed, the record is written, and then one further record is
    /// written for every following position up to `end`, each with its
    /// position advanced by one and its REF taken from `_scp`.
    void
    process_block(const bool is_in_region,
                  const unsigned end,
                  VcfRecord& vcfr) const {

        if(is_in_region) {
            // expand the block into one record per position
            vcfr.DeleteInfoKeyVal("END");
            vcfr.WriteUnaltered(_opt.outfp);
            while(vcfr.GetPos()<end) {
                const int pos_after(vcfr.GetPos()+1);
                vcfr.SetPos(pos_after);
                vcfr.SetRef(_scp.get_char(vcfr.GetChrom().c_str(),pos_after));
                vcfr.WriteUnaltered(_opt.outfp);
            }
            return;
        }

        // outside the region: keep the block as a single record
        if(vcfr.GetPos()<end) {
            vcfr.SetInfoVal("END",_intstr.get32(end));
        } else {
            vcfr.DeleteInfoKeyVal("END");
        }
        vcfr.WriteUnaltered(_opt.outfp);
    }
示例#9
0
// Read the next record that lies within the configured read section (if any)
// and is not rejected by any of the configured discard rules.
//
// record: filled in with the record that was read.
// subset: sample subset to apply while reading; when NULL, the reader's own
//         subset is used if one has been enabled (myUseSubset).
//
// Returns true when a record was kept (myNumKeptRecords is incremented).
// Returns false, with myStatus set, when:
//   - setting up a newly specified section fails,
//   - a section was specified without an index after records have already
//     been read (FAIL_ORDER),
//   - the underlying record read fails (status copied from the record), or
//   - the read has moved past the section (NO_MORE_RECS).
bool VcfFileReader::readRecord(VcfRecord& record, VcfSubsetSamples* subset)
{
    myStatus = StatGenStatus::SUCCESS;
    // Subset the read if there are subsets specified.
    // An explicitly passed subset takes precedence over the reader's own.
    VcfSubsetSamples* subsetPtr = subset;
    if((subsetPtr == NULL) && myUseSubset)
    {
        subsetPtr = &mySampleSubset;
    }

    // Check to see if a new region has been set.  If so, setup for that region.
    // searchChrom is set when records must be skipped until the section's
    // chromosome is first reached (the no-index case).
    bool searchChrom = false;
    if(myNewSection)
    {
        if(myVcfIndex != NULL)
        {
            // Have an index file, so use it to set up the new section.
            if(!processNewSection())
            {
                // processNewSection sets the status appropriately on failure.
                return(false);
            }
        }
        else if(myTotalRead == 0)
        {
            // ReadSection without an index only works if no records
            // have been read.
            searchChrom = true;
            myNewSection = false;
        }
        else
        {
            myNewSection = false;
            myStatus.setStatus(StatGenStatus::FAIL_ORDER, 
                               "Cannot set read section with no index after reading records");
            return(false);
        }
    }

    // Keep looping until a desired record is found.
    bool recordFound = false;
    while(!recordFound)
    {
        if(!record.read(myFilePtr, mySiteOnly, myRecordDiscardRules, subsetPtr))
        {
            // Read failed: copy the record's status and account for any
            // records discarded during the read before returning.
            myStatus = record.getStatus();
            myTotalRead += myRecordDiscardRules.getNumDiscarded();
            myNumRecords += myRecordDiscardRules.getNumDiscarded();
            myRecordDiscardRules.clearNumDiscarded();
            return(false);
        }

        // Count this record plus any that were discarded during the read.
        ++myTotalRead;
        myTotalRead += myRecordDiscardRules.getNumDiscarded();

        // Check to see if the record is in the section.
        // First check the chromosome.
        if(!mySectionChrom.empty() && (mySectionChrom != record.getChromStr()))
        {
            if(searchChrom)
            {
                // Still searching for the chromosome, so continue
                // to the next record.
                continue;
            }

            // Record is not within the correct chromosome, so return failure.
            myStatus = StatGenStatus::NO_MORE_RECS;
           return(false);
        }
        // On the section's chromosome (or no chromosome was specified),
        // so the search for it is over.
        searchChrom = false;

        // Check if the record is after the section end if applicable.
        // A record positioned exactly at the end position is also treated
        // as being past the section.
        if((mySection1BasedEndPos != -1) && 
           (record.get1BasedPosition() >= mySection1BasedEndPos))
        {
            myStatus = StatGenStatus::NO_MORE_RECS;
            return(false);
        }
        
        // Check if the record is prior to the section start if applicable.
        // Determine the VCF record end position.
        // If we are not requiring overlap, then we only need to check
        // the start position, but if overlap is required, then it needs
        // to increment the start by the length-1.
        int numIncBases = 0;
        if(mySectionOverlap)
        {
            // The VCF record end position is the start position + length of the
            // reference string - 1.
            numIncBases = record.getNumRefBases() - 1;
        }
        if((mySection1BasedStartPos != -1) &&
           ((record.get1BasedPosition() + numIncBases)
            < mySection1BasedStartPos))
        {
            // This record is prior to the section, so keep reading.
            continue;
        }

        // The record is within the section: count it, together with any
        // records discarded during the read, then reset the discard count.
        ++myNumRecords;
        myNumRecords += myRecordDiscardRules.getNumDiscarded();
        myRecordDiscardRules.clearNumDiscarded();
        
        // Record successfully read, so check to see if it is discarded.
        if((myDiscardRules & DISCARD_NON_PHASED) && !record.allPhased())
        {
            // Not all samples are phased, so discard this record.
            continue;
        }
        if((myDiscardRules & DISCARD_MISSING_GT) &&
           !record.hasAllGenotypeAlleles())
        {
            // discard missing GTs and this record had missing alleles,
            // so keep reading.
            continue;
        }
        if((myDiscardRules & DISCARD_FILTERED) && 
           !(record.getFilter().passedAllFilters()))
        {
            // Record was filtered, so discard it.
            continue;
        }
        if((myDiscardRules & DISCARD_MULTIPLE_ALTS) &&
           (record.getNumAlts() > 1))
        {
            // Record had multiple alternates, so discard.
            continue;
        }

        // Check allele counts for discarding.
        if(myMinAltAlleleCount != UNSET_MIN_ALT_ALLELE_COUNT)
        {
            // Count the number of alternates.
            int32_t altCount = 0;
            for(int sampleNum = 0; sampleNum < record.getNumSamples(); 
                sampleNum++)
            {
                if((myAltAlleleCountSubset != NULL) &&
                   !(myAltAlleleCountSubset->keep(sampleNum)))
                {
                    // Skip this sample.
                    continue;
                }
                for(int gtNum = 0; gtNum < record.getNumGTs(sampleNum); gtNum++)
                {
                    if(record.getGT(sampleNum, gtNum) > 0)
                    {
                        // Alternate, so increment the count.
                        ++altCount;
                    }
                }
            }
            if(altCount < myMinAltAlleleCount)
            {
                // Not enough alternates so continue to the next record.
                continue;
            }
        }

        // Check to see if the minimum minor allele count is met.
        if(myMinMinorAlleleCount != UNSET_MIN_MINOR_ALLELE_COUNT)
        {
            // Get the number of possible alternates.
            unsigned int numAlts = record.getNumAlts();

            // Verify that each allele has the min count.
            // Index 0 through numAlts inclusive is checked, i.e. one more
            // allele than the number of alternates.
            bool failMinorAlleleCount = false;
            for(unsigned int i = 0; i <= numAlts; i++)
            {
                if(record.getAlleleCount(i, myMinorAlleleCountSubset) 
                   < myMinMinorAlleleCount)
                {
                    // Not enough of one gt, so not ok.
                    failMinorAlleleCount = true;
                    break;
                }
            }
            if(failMinorAlleleCount)
            {
                // not enough alleles, so continue to the next record.
                continue;
            }
        }

        // Record was not discarded.
        recordFound = true;
    }

    // Increment the number of kept records.
    ++myNumKeptRecords;
    return(true);
}
示例#10
0
int VcfMac::execute(int argc, char **argv)
{
    String inputVcf = "";
    int minAC = -1;
    String sampleSubset = "";
    String filterList = "";
    bool params = false;

    IntervalTree<int> regions;
    std::vector<int> intersection;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("sampleSubset", &sampleSubset)
        LONG_INTPARAMETER("minAC", &minAC)
        LONG_STRINGPARAMETER("filterList", &filterList)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the two input files.
    VcfFileReader inFile;
    VcfHeader header;
    VcfRecord record;

    // Open the file
    if(sampleSubset.IsEmpty())
    {
        inFile.open(inputVcf, header);        
    }
    else
    {
        inFile.open(inputVcf, header, sampleSubset, NULL, NULL);
    }
    
    // Add the discard rule for minor allele count.
    if(minAC >= 0)
    {
        inFile.addDiscardMinMinorAlleleCount(minAC, NULL);
    }
    
    if(!filterList.IsEmpty())
    {
        // Open the filter list.
        IFILE regionFile = ifopen(filterList, "r");
        String regionLine;
        StringArray regionColumn;
        int start;
        int end;
        int intervalVal = 1;
        if(regionFile == NULL)
        {
            std::cerr << "Failed to open " << filterList 
                      << ", so keeping all positions\n";
            filterList.Clear();
        }
        else
        {
            while( regionFile->isOpen() && !regionFile->ifeof())
            {
                // Read the next interval
                regionLine.Clear();
                regionLine.ReadLine(regionFile);
                if(regionLine.IsEmpty())
                {
                    // Nothing on this line, continue to the next.
                    continue;
                }
                regionColumn.ReplaceColumns(regionLine, ' ');
                if(regionColumn.Length() != 2)
                {
                    std::cerr << "Improperly formatted region line: " 
                              << regionLine << "; skipping to the next line.\n";
                    continue;
                }
                // Convert the columns to integers.
                if(!regionColumn[0].AsInteger(start))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, start position "
                              << "(1st column) is not an integer: "
                              << regionColumn[0]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                if(!regionColumn[1].AsInteger(end))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, end position "
                              << "(2nd column) is not an integer: "
                              << regionColumn[1]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                // Add 1-based inclusive intervals.
                regions.add(start,end, intervalVal);
            }
        }
    }


    int numReadRecords = 0;

    while( inFile.readRecord(record))
    {
        if(!filterList.IsEmpty())
        {
            // Check if the region should be kept.
            intersection.clear();
            regions.get_intersecting_intervals(record.get1BasedPosition(), intersection);
            
            if(intersection.empty())
            {
                // not in the interval, so continue to the next record.
                continue;
            }
        }

        ++numReadRecords;

        // Loop through the number of possible alternates.
        unsigned int numAlts = record.getNumAlts();
        int minAlleleCount = -1;
        int curAlleleCount = 0;
        int totalAlleleCount = 0;
        for(unsigned int i = 0; i <= numAlts; i++)
        {
            curAlleleCount = record.getAlleleCount(i);
            if((minAlleleCount == -1) ||
               (curAlleleCount < minAlleleCount))
            {
                minAlleleCount = curAlleleCount;
            }
            totalAlleleCount += curAlleleCount;
        }
        if(totalAlleleCount != 0)
        {
            double maf = (double)minAlleleCount/totalAlleleCount;
            std::cout << record.getIDStr()
                      << "\t" << minAlleleCount
                      << "\t" << maf << "\n";
        }
    }
    
    inFile.close();

    //    std::cerr << "\n\t# Records: " << numReadRecords << "\n";

    // return success.
    return(0);
}