Beispiel #1
0
// Read the header from the specified file.  Assumes the file is in
// the correct position for reading the header.
//
// Three fields are read in order:
//   1. GLF_MAGIC_LEN bytes of magic,
//   2. a 32-bit length of the header text,
//   3. that many bytes of header text, stored in myText.
//
// Returns true once all three fields have been read in full.  Every failure
// is reported by throwing a GlfException (GlfStatus::FAIL_ORDER when the
// file is not open, GlfStatus::FAIL_IO for a short read or an invalid
// length), so false is never actually returned.
//
// NOTE(review): the magic bytes are read but not compared against the
// expected magic value - confirm whether validation is done elsewhere.
bool GlfHeader::read(IFILE filePtr)
{
    if((filePtr == NULL) || (filePtr->isOpen() == false))
    {
        // File is not open, report failure.
        std::string errorString = 
            "Failed to read the header since the file is not open.";
        throw(GlfException(GlfStatus::FAIL_ORDER, errorString));
    }

    // Read the magic
    int numRead = 0;
    char magic[GLF_MAGIC_LEN];
    numRead = ifread(filePtr, &magic, GLF_MAGIC_LEN);
    if(numRead != GLF_MAGIC_LEN)
    {
        String errorMsg = "Failed to read the magic number (";
        errorMsg += GLF_MAGIC_LEN;
        errorMsg += " bytes).  Only read ";
        errorMsg += numRead;
        errorMsg += " bytes.";
        std::string errorString = errorMsg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, errorString));
    }

    // Read the header length.
    int32_t headerLen = 0;
    int byteLen = sizeof(int32_t);
    numRead = ifread(filePtr, &headerLen, byteLen);
    if(numRead != byteLen)
    {
        String errorMsg = "Failed to read the length of the header text (";
        errorMsg += byteLen;
        errorMsg += " bytes).  Only read ";
        errorMsg += numRead;
        errorMsg += " bytes.";
        std::string errorString = errorMsg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, errorString));
    }

    // The length comes straight from the file, so a corrupt file can hold a
    // negative value.  Reject it here rather than handing a negative size
    // to the buffer read below.
    if(headerLen < 0)
    {
        String errorMsg = "Failed to read the header text, invalid length (";
        errorMsg += headerLen;
        errorMsg += " bytes).";
        std::string errorString = errorMsg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, errorString));
    }
       
    // Read the header from the file.
    numRead = myText.readFromFile(filePtr, headerLen);
    if(numRead != headerLen)
    {
        String errorMsg = "Failed to read the header text (";
        errorMsg += headerLen;
        errorMsg += " bytes).  Only read ";
        errorMsg += numRead;
        errorMsg += " bytes.";
        std::string errorString = errorMsg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, errorString));
    }

    // Successfully read, return success.
    return(true);
}
Beispiel #2
0
// Write this header to the specified file.
//
// The output is the magic bytes (GLF_MAGIC_LEN bytes of GLF_MAGIC), followed
// by the 32-bit length of the header text, followed by the header text.
//
// Returns true when everything was written in full.  Failures are reported
// by throwing a GlfException: GlfStatus::FAIL_ORDER if the file is not open,
// GlfStatus::FAIL_IO if any of the writes came up short.
bool GlfHeader::write(IFILE filePtr) const
{
    // Nothing can be written without an open file.
    if((filePtr == NULL) || !filePtr->isOpen())
    {
        std::string notOpenMsg = 
            "Failed to write the header since the file is not open.";
        throw(GlfException(GlfStatus::FAIL_ORDER, notOpenMsg));
    }

    // Step 1: the magic.
    int bytesWritten = ifwrite(filePtr, GLF_MAGIC.c_str(), GLF_MAGIC_LEN);
    if(bytesWritten != GLF_MAGIC_LEN)
    {
        String msg = "Failed to write the magic number (";
        msg += GLF_MAGIC_LEN;
        msg += " bytes).  Only wrote ";
        msg += bytesWritten;
        msg += " bytes.";
        std::string failure = msg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, failure));
    }

    // Step 2: the length of the header text, as a 32-bit integer.
    int32_t textLen = myText.length();
    int lenFieldBytes = sizeof(int32_t);
    bytesWritten = ifwrite(filePtr, &textLen, lenFieldBytes);
    if(bytesWritten != lenFieldBytes)
    {
        String msg = "Failed to write the length of the header text (";
        msg += lenFieldBytes;
        msg += " bytes).  Only wrote ";
        msg += bytesWritten;
        msg += " bytes.";
        std::string failure = msg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, failure));
    }

    // Step 3: the header text itself.
    bytesWritten = ifwrite(filePtr, myText.c_str(), textLen);
    if(bytesWritten != textLen)
    {
        String msg = "Failed to write the header text (";
        msg += textLen;
        msg += " bytes).  Only wrote ";
        msg += bytesWritten;
        msg += " bytes.";
        std::string failure = msg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, failure));
    }

    // All three pieces were written.
    return(true);
}
Beispiel #3
0
// Write the per-sample family file and the leading bytes of the BED file.
//
// For each sample, one tab-separated line is written to oFamFile:
//   family ID (the individual ID is used when the family ID is empty),
//   individual ID,
//   father ID ("0" when empty),
//   mother ID ("0" when empty),
//   gender code (0 = UNKNOWN, 1 = MALE, 2 = FEMALE),
//   the constant "-9" (presumably the PLINK "missing phenotype" value).
// Afterwards three magic bytes are written to oBedFile.
//
// Throws VcfFileException if a sample has an unrecognized gender value.
void VcfFile::printBEDHeader(IFILE oBedFile, IFILE oFamFile) {
  for(int i=0; i < getSampleCount(); ++i) {
    // Column 1: family ID, falling back to the individual ID.
    if ( vpVcfInds[i]->sFamID.Length() == 0 ) {
      ifprintf(oFamFile,"%s",vpVcfInds[i]->sIndID.c_str());
    }
    else {
      ifprintf(oFamFile,"%s",vpVcfInds[i]->sFamID.c_str());
    }

    // Column 2: individual ID.
    ifprintf(oFamFile,"\t%s",vpVcfInds[i]->sIndID.c_str());

    // Column 3: father ID.  The leading tab is required in both branches,
    // otherwise the father ID is glued onto the individual ID and the line
    // ends up one column short.
    if ( vpVcfInds[i]->sFatID.Length() == 0 ) {
      ifprintf(oFamFile,"\t0");
    }
    else {
      ifprintf(oFamFile,"\t%s",vpVcfInds[i]->sFatID.c_str());
    }

    // Column 4: mother ID.
    if ( vpVcfInds[i]->sMotID.Length() == 0 ) {
      ifprintf(oFamFile,"\t0");
    }
    else {
      ifprintf(oFamFile,"\t%s",vpVcfInds[i]->sMotID.c_str());
    }

    // Column 5: gender code.
    switch( vpVcfInds[i]->gender ) {
    case VcfInd::UNKNOWN:
      ifprintf(oFamFile,"\t0");
      break;
    case VcfInd::MALE:
      ifprintf(oFamFile,"\t1");
      break;
    case VcfInd::FEMALE:
      ifprintf(oFamFile,"\t2");
      break;
    default:
      throw VcfFileException("Unrecognized value for gender");
    }

    // Column 6: constant -9, then end the line.
    ifprintf(oFamFile,"\t-9\n");
  }

  // Leading bytes of the BED file.
  // NOTE(review): these look like the PLINK .bed magic number followed by
  // the SNP-major mode byte - confirm against the PLINK format description.
  char magicNumbers[3] = {0x6c,0x1b,0x01};
  oBedFile->ifwrite(magicNumbers, 3);
}
Beispiel #4
0
// Read the next VCF record that is not discarded by ID from the file.
//
// Fields are read in file order: CHROM, POS, ID, REF, ALT, QUAL, FILTER,
// INFO, and then (unless siteOnly is set) the genotype fields.
//
// filePtr       file to read from; a NULL pointer sets FAIL_ORDER.
// siteOnly      when true, the rest of the line after INFO is consumed
//               without storing the genotype fields.
// discardRules  records whose ID is rejected by discardForID() are skipped
//               and the following record is read instead.
// sampleSubset  passed through to the genotype reader.
//
// Returns true if a record was read.  Returns false at end of file (without
// setting an error status here) or on a parse failure (myStatus is set to
// FAIL_PARSE with a message naming the field).
bool VcfRecord::read(IFILE filePtr, bool siteOnly,
                     VcfRecordDiscardRules& discardRules,
                     VcfSubsetSamples* sampleSubset)
{
    // Keep reading record prefixes (CHROM, POS, ID) until one is found whose
    // ID is not discarded.  This is a loop rather than a recursive call so
    // that a long run of consecutive discarded records cannot grow the call
    // stack without bound.
    for(;;)
    {
        // Clear out any previously set values.
        reset();

        if(filePtr == NULL)
        {
            myStatus.setStatus(StatGenStatus::FAIL_ORDER,
                               "Error reading VCF record before opening the file.");
            return(false);
        }

        if(ifeof(filePtr))
        {
            // End of file, just return false.
            return(false);
        }

        // Read the chromosome.
        if(!readTilTab(filePtr, myChrom))
        {
            if(myChrom.empty())
            {
                // EOF.
                return(false);
            }
            // Not an empty line.
            myStatus.setStatus(StatGenStatus::FAIL_PARSE, 
                               "Error reading VCF Record CHROM.");
            return(false);
        }

        // Read the 1-based Position
        std::string strPos;
        if(!readTilTab(filePtr, strPos))
        {
            myStatus.setStatus(StatGenStatus::FAIL_PARSE, 
                               "Error reading VCF Record POS.");
            return(false);
        }
        // Read the position, so convert to an integer.
        my1BasedPosNum = atoi(strPos.c_str());

        // Read the ID.
        if(!readTilTab(filePtr, myID))
        {
            myStatus.setStatus(StatGenStatus::FAIL_PARSE, 
                               "Error reading VCF Record ID.");
            return(false);
        }

        if(!discardRules.discardForID(myID))
        {
            // Keep this record, go on to read the rest of it.
            break;
        }

        // Do not keep this id, so consume the rest of the record and
        // try the next record.
        filePtr->discardLine();
    }

    // Read the Ref.
    if(!readTilTab(filePtr, myRef))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE, 
                           "Error reading VCF Record REF.");
        return(false);
    }
    // Read the Alt.
    myAltArray.clear();
    if(!readTilTab(filePtr, myAlt))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE, 
                           "Error reading VCF Record ALT.");
        return(false);
    }
    // Read the Qual.
    if(!readTilTab(filePtr, myQual))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE, 
                           "Error reading VCF Record QUAL.");
        return(false);
    }
    if(myQual != ".")
    {
        // Read the quality, so convert to a floating point number.
        myQualNum = atof(myQual.c_str());
    }
    else
    {
        // "." means no quality was given; -1 is stored in that case.
        myQualNum = -1;
    }

    // Read the Filter.
    if(!myFilter.read(filePtr))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE, 
                           "Error reading VCF Record FILTER.");
        return(false);
    }
    // Read the Info (could be the last word in the line or file).
    if(!myInfo.read(filePtr))
    {
        // Found the end of the line after the info field, so return true,
        // successfully read the record.
        return(true);
    }

    if(siteOnly)
    {
        // Do not store genotypes, so just consume the rest of the line.
        filePtr->readTilChar("\n");
    }
    else
    {
        // Not yet at the end of the line, so read the genotype fields
        // (format & samples)
        try
        {
            myGenotype.read(filePtr, sampleSubset);
        }
        catch(std::exception& e)
        {
            // Turn the exception into a parse failure that names the
            // record's chromosome and position.
            myDummyString = "Failed parsing the Genotype Fields of " + myChrom + ":" + 
                std::to_string((long long int)my1BasedPosNum) + " (chr:pos) - " + e.what();
            myStatus.setStatus(StatGenStatus::FAIL_PARSE, myDummyString.c_str());
            return(false);
        }
    }
    // Found the end of the line, return true since all required fields
    // were read.
    return(true);
}
Beispiel #5
0
// Read a BAM file's header.
//
// Reads the 32-bit header text length and the header text, parses the text
// into the passed in header, and then reads the reference table (a count
// followed by name length / name / sequence length for each reference) into
// the header's reference info.
//
// filePtr  file to read from; must be non-NULL and open.
// header   reset and then filled in from the file.
// status   set to FAIL_ORDER, FAIL_IO or FAIL_PARSE with a message when
//          false is returned.
//
// Returns true if the whole header and reference table were read.
bool BamInterface::readHeader(IFILE filePtr, SamFileHeader& header,
                              SamStatus& status)
{
    if(filePtr == NULL)
    {
        // File is not open, return false.
        status.setStatus(SamStatus::FAIL_ORDER,
                         "Cannot read header since the file pointer is null");
        return(false);
    }
    if(filePtr->isOpen() == false)
    {
        status.setStatus(SamStatus::FAIL_ORDER,
                         "Cannot read header since the file is not open");
        return(false);
    }

    // Clear the passed in header.
    header.resetHeader();

    int32_t headerLength = 0;
    int readSize = ifread(filePtr, &headerLength, sizeof(headerLength));

    if(readSize != static_cast<int>(sizeof(headerLength)))
    {
        String errMsg = "Failed to read the BAM header length, read ";
        errMsg += readSize;
        errMsg += " bytes instead of ";
        errMsg += (unsigned int)sizeof(headerLength);
        status.setStatus(SamStatus::FAIL_IO, errMsg.c_str());
        return(false);
    }

    String headerStr;
    if(headerLength > 0)
    {
        // Read the header text directly into the string's buffer; the extra
        // byte holds the terminating null.
        readSize =
            ifread(filePtr, headerStr.LockBuffer(headerLength + 1), headerLength);
        headerStr[headerLength] = 0;
        headerStr.UnlockBuffer();
        if(readSize != headerLength)
        {
            // Failed to read the header.
            status.setStatus(SamStatus::FAIL_IO, "Failed to read the BAM header.");
            return(false);
        }
    }

    // Parse the header that was read.
    if(!header.addHeader(headerStr))
    {
        // Report the parse error recorded by the header.
        status.setStatus(SamStatus::FAIL_PARSE, header.getErrorMessage());
        return(false);
    }

    // Read the number of references sequences.  The read must be checked:
    // on a short read the count would otherwise be an indeterminate value
    // driving the loop below.
    int32_t referenceCount = 0;
    readSize = ifread(filePtr, &referenceCount, sizeof(referenceCount));
    if(readSize != static_cast<int>(sizeof(referenceCount)))
    {
        status.setStatus(SamStatus::FAIL_IO,
                         "Failed to read the BAM reference dictionary.");
        return(false);
    }

    // Get and clear the reference info so it can be set
    // from the bam reference table.
    SamReferenceInfo& refInfo =
        header.getReferenceInfoForBamInterface();
    refInfo.clear();

    CharBuffer refName;

    // Read each reference sequence
    for (int i = 0; i < referenceCount; i++)
    {
        // Read the length of the reference name.
        int32_t nameLength = 0;
        int rc = ifread(filePtr, &nameLength, sizeof(nameLength));
        if((rc != static_cast<int>(sizeof(nameLength))) || (nameLength < 0))
        {
            // Short read, or a corrupt (negative) name length.
            status.setStatus(SamStatus::FAIL_IO,
                             "Failed to read the BAM reference dictionary.");
            return(false);
        }

        // Read the name.
        // NOTE(review): assumes readFromFile returns the number of bytes
        // read, as the GLF header reader relies on - confirm for CharBuffer.
        if(refName.readFromFile(filePtr, nameLength) != nameLength)
        {
            status.setStatus(SamStatus::FAIL_IO,
                             "Failed to read the BAM reference dictionary.");
            return(false);
        }

        // Read the length of the reference sequence.
        int32_t refLen = 0;
        rc = ifread(filePtr, &refLen, sizeof(refLen));
        if(rc != static_cast<int>(sizeof(refLen)))
        {
            status.setStatus(SamStatus::FAIL_IO,
                             "Failed to read the BAM reference dictionary.");
            return(false);
        }

        refInfo.add(refName.c_str(), refLen);
    }

    // Successfully read the file.
    return(true);
}
Beispiel #6
0
// Write a BAM file's header.
//
// Writes the 4 magic bytes ('B', 'A', 'M', 1), the 32-bit header text
// length, the header text, and then the reference table: the number of
// references followed by, for each reference, the name length (including
// the terminating null), the null-terminated name, and the sequence length.
//
// filePtr  file to write to; must be non-NULL and open.
// header   source of the header text and the reference information.
// status   set to FAIL_ORDER or FAIL_IO with a message when false is
//          returned.
//
// Returns true only if every piece was written in full.
bool BamInterface::writeHeader(IFILE filePtr, SamFileHeader& header,
                               SamStatus& status)
{
    if(filePtr == NULL)
    {
        // File is not open, return false.
        status.setStatus(SamStatus::FAIL_ORDER,
                         "Cannot write header since the file pointer is null");
        return(false);
    }
    if(filePtr->isOpen() == false)
    {
        status.setStatus(SamStatus::FAIL_ORDER,
                         "Cannot write header since the file is not open");
        return(false);
    }

    const int int32Size = static_cast<int>(sizeof(int32_t));

    char magic[4];
    magic[0] = 'B';
    magic[1] = 'A';
    magic[2] = 'M';
    magic[3] = 1;

    // Write magic to the file.
    int numWrite = ifwrite(filePtr, magic, 4);
    if(numWrite != 4)
    {
        status.setStatus(SamStatus::FAIL_IO,
                         "Failed to write the BAM magic.");
        return(false);
    }

    ////////////////////////////////
    // Write the header to the file.
    ////////////////////////////////
    // Construct a string containing the entire header.
    std::string headerString = "";
    header.getHeaderString(headerString);

    int32_t headerLen = headerString.length();

    // Write the header length.
    numWrite = ifwrite(filePtr, &headerLen, sizeof(int32_t));
    if(numWrite != int32Size)
    {
        status.setStatus(SamStatus::FAIL_IO,
                         "Failed to write the BAM header length.");
        return(false);
    }

    // Write the header to the file.
    numWrite = ifwrite(filePtr, headerString.c_str(), headerLen);
    if(numWrite != headerLen)
    {
        status.setStatus(SamStatus::FAIL_IO,
                         "Failed to write the BAM header.");
        return(false);
    }

    ////////////////////////////////////////////////////////
    // Write the Reference Information.
    const SamReferenceInfo& refInfo = header.getReferenceInfo();

    // Write the number of sequences.
    int32_t numSeq = refInfo.getNumEntries();
    numWrite = ifwrite(filePtr, &numSeq, sizeof(int32_t));
    if(numWrite != int32Size)
    {
        status.setStatus(SamStatus::FAIL_IO,
                         "Failed to write the BAM reference dictionary.");
        return(false);
    }

    // Write each reference sequence
    for (int i = 0; i < numSeq; i++)
    {
        const char* refName = refInfo.getReferenceName(i);
        // Add one for the null value.
        int32_t nameLength = strlen(refName) + 1;
        int32_t refLen = refInfo.getReferenceLength(i);

        // Write the length of the reference name, the name (with its null
        // terminator), and the length of the reference sequence.  Stop at
        // the first write that comes up short.
        bool wroteEntry =
            (static_cast<int>(ifwrite(filePtr, &nameLength, sizeof(int32_t))) 
             == int32Size) &&
            (static_cast<int>(ifwrite(filePtr, refName, nameLength)) 
             == nameLength) &&
            (static_cast<int>(ifwrite(filePtr, &refLen, sizeof(int32_t))) 
             == int32Size);
        if(!wroteEntry)
        {
            status.setStatus(SamStatus::FAIL_IO,
                             "Failed to write the BAM reference dictionary.");
            return(false);
        }
    }

    return(true);
}
// Read one sample's genotype subfields from the file.
//
// Subfields are read one at a time until the read status is no longer
// MORE_SUBFIELDS.  A subfield is stored only if format.storeIndex() says so;
// otherwise it is read with a NULL destination.  The GT subfield (at
// format.getGTIndex()) is parsed by hand so that the phased ('|'),
// unphased ('/') and missing ('.') markers can be recorded while the text
// is accumulated.
//
// Throws std::runtime_error if more subfields were read than the format
// has.  Returns true if the sample ended with END_OF_FIELD.
bool VcfGenotypeSample::read(IFILE filePtr, VcfGenotypeFormat& format)
{
    // Characters that stop a GT read.  The values below are positions in
    // this string, compared against the value returned by readTilChar().
    static const char* GT_STOP_CHARS = "\n\t:|/.";
    static const int LAST_GT_END_POS = 2; // '\n', '\t', ':' end the GT
    static const int PHASED_POS = 3;      // '|'
    static const int UNPHASED_POS = 4;    // '/'
    static const int MISSING_POS = 5;     // '.'

    // Start from a clean sample.
    reset();

    myFormatPtr = &format;

    const int gtFieldIdx = format.getGTIndex();

    SUBFIELD_READ_STATUS status = MORE_SUBFIELDS;
    int numFieldsRead = 0;
    do
    {
        if(!format.storeIndex(numFieldsRead))
        {
            // This subfield is not stored, read it without a destination.
            status = readGenotypeSubField(filePtr, NULL);
        }
        else
        {
            // Destination string for this subfield.
            std::string* dest = &(myGenotypeSubFields.getNextEmpty());

            if(numFieldsRead != gtFieldIdx)
            {
                // Ordinary subfield.
                status = readGenotypeSubField(filePtr, dest);
            }
            else
            {
                // GT subfield: assume all alleles are present until a
                // missing marker is seen.
                myHasAllGenotypeAlleles = true;

                // Read chunk by chunk.  Each '|', '/' or '.' that stops a
                // chunk is put back into the text and noted; any other
                // stop value ends the GT.
                // TODO  have an option to autoparse the genotypes?
                // todo - store the previous dest len in order to
                // do string conversion to ints...
                int stopPos;
                do
                {
                    stopPos = filePtr->readTilChar(GT_STOP_CHARS, *dest);
                    switch(stopPos)
                    {
                        case PHASED_POS:
                            dest->push_back('|');
                            myPhased = true;
                            break;
                        case UNPHASED_POS:
                            dest->push_back('/');
                            myUnphased = true;
                            break;
                        case MISSING_POS:
                            dest->push_back('.');
                            myHasAllGenotypeAlleles = false;
                            break;
                        default:
                            break;
                    }
                } while(stopPos > LAST_GT_END_POS);

                // Translate what ended the GT into a read status.
                status = getReadStatus(stopPos);
            }
        }
        ++numFieldsRead;
    } while(status == MORE_SUBFIELDS);

    // numFieldsRead is now the number of subfields in this sample.
    if(numFieldsRead > format.getOrigNumFields())
    {
        throw(std::runtime_error("VCF Number of Fields in a Sample does not match the Format."));
    }
    if((numFieldsRead < format.getOrigNumFields()) &&
       (myGenotypeSubFields.size() == 0))
    {
        // Fewer subfields than the format and nothing stored for this
        // sample, so enter the missing value.
        myGenotypeSubFields.getNextEmpty() = MISSING_FIELD;
    }

    // True only when the sample ended with END_OF_FIELD.
    return(status == END_OF_FIELD);
}
Beispiel #8
0
int VcfMac::execute(int argc, char **argv)
{
    String inputVcf = "";
    int minAC = -1;
    String sampleSubset = "";
    String filterList = "";
    bool params = false;

    IntervalTree<int> regions;
    std::vector<int> intersection;
    
    // Read in the parameters.    
    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inputVcf)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_STRINGPARAMETER("sampleSubset", &sampleSubset)
        LONG_INTPARAMETER("minAC", &minAC)
        LONG_STRINGPARAMETER("filterList", &filterList)
        LONG_PARAMETER("params", &params)
        LONG_PHONEHOME(VERSION)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));
    
    inputParameters.Read(argc-1, &(argv[1]));
    
    // Check that all files were specified.
    if(inputVcf == "")
    {
        usage();
        inputParameters.Status();
        std::cerr << "Missing \"--in\", a required parameter.\n\n";
        return(-1);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the two input files.
    VcfFileReader inFile;
    VcfHeader header;
    VcfRecord record;

    // Open the file
    if(sampleSubset.IsEmpty())
    {
        inFile.open(inputVcf, header);        
    }
    else
    {
        inFile.open(inputVcf, header, sampleSubset, NULL, NULL);
    }
    
    // Add the discard rule for minor allele count.
    if(minAC >= 0)
    {
        inFile.addDiscardMinMinorAlleleCount(minAC, NULL);
    }
    
    if(!filterList.IsEmpty())
    {
        // Open the filter list.
        IFILE regionFile = ifopen(filterList, "r");
        String regionLine;
        StringArray regionColumn;
        int start;
        int end;
        int intervalVal = 1;
        if(regionFile == NULL)
        {
            std::cerr << "Failed to open " << filterList 
                      << ", so keeping all positions\n";
            filterList.Clear();
        }
        else
        {
            while( regionFile->isOpen() && !regionFile->ifeof())
            {
                // Read the next interval
                regionLine.Clear();
                regionLine.ReadLine(regionFile);
                if(regionLine.IsEmpty())
                {
                    // Nothing on this line, continue to the next.
                    continue;
                }
                regionColumn.ReplaceColumns(regionLine, ' ');
                if(regionColumn.Length() != 2)
                {
                    std::cerr << "Improperly formatted region line: " 
                              << regionLine << "; skipping to the next line.\n";
                    continue;
                }
                // Convert the columns to integers.
                if(!regionColumn[0].AsInteger(start))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, start position "
                              << "(1st column) is not an integer: "
                              << regionColumn[0]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                if(!regionColumn[1].AsInteger(end))
                {
                    // The start position (1st column) is not an integer.
                    std::cerr << "Improperly formatted region line, end position "
                              << "(2nd column) is not an integer: "
                              << regionColumn[1]
                              << "; Skipping to the next line.\n";
                    continue;
                }
                // Add 1-based inclusive intervals.
                regions.add(start,end, intervalVal);
            }
        }
    }


    int numReadRecords = 0;

    while( inFile.readRecord(record))
    {
        if(!filterList.IsEmpty())
        {
            // Check if the region should be kept.
            intersection.clear();
            regions.get_intersecting_intervals(record.get1BasedPosition(), intersection);
            
            if(intersection.empty())
            {
                // not in the interval, so continue to the next record.
                continue;
            }
        }

        ++numReadRecords;

        // Loop through the number of possible alternates.
        unsigned int numAlts = record.getNumAlts();
        int minAlleleCount = -1;
        int curAlleleCount = 0;
        int totalAlleleCount = 0;
        for(unsigned int i = 0; i <= numAlts; i++)
        {
            curAlleleCount = record.getAlleleCount(i);
            if((minAlleleCount == -1) ||
               (curAlleleCount < minAlleleCount))
            {
                minAlleleCount = curAlleleCount;
            }
            totalAlleleCount += curAlleleCount;
        }
        if(totalAlleleCount != 0)
        {
            double maf = (double)minAlleleCount/totalAlleleCount;
            std::cout << record.getIDStr()
                      << "\t" << minAlleleCount
                      << "\t" << maf << "\n";
        }
    }
    
    inFile.close();

    //    std::cerr << "\n\t# Records: " << numReadRecords << "\n";

    // return success.
    return(0);
}