Ejemplo n.º 1
0
// Read the next VCF record that is not rejected by discardRules from filePtr
// into this object.
//
// Parameters:
//   filePtr      - input file to read from; NULL is reported as FAIL_ORDER.
//   siteOnly     - when true, whatever follows the INFO field on the line is
//                  consumed but not stored.
//   discardRules - records whose ID makes discardForID() return true are
//                  skipped and the following record is read instead.
//   sampleSubset - passed through unchanged to the genotype reader.
//
// Returns true when the fields through INFO were read (plus the genotype
// fields when present and siteOnly is false).  Returns false at end of file,
// or on error, in which case myStatus is set to FAIL_ORDER (NULL file) or
// FAIL_PARSE (a field could not be read / genotype parsing threw).
bool VcfRecord::read(IFILE filePtr, bool siteOnly,
                     VcfRecordDiscardRules& discardRules,
                     VcfSubsetSamples* sampleSubset)
{
    // Clear out any previously set values.
    reset();

    if(filePtr == NULL)
    {
        myStatus.setStatus(StatGenStatus::FAIL_ORDER,
                           "Error reading VCF record before opening the file.");
        return(false);
    }

    // Read CHROM, POS and ID, skipping every record whose ID is discarded.
    // This is a loop rather than a recursive call so that a long run of
    // consecutive discarded records cannot grow the call stack.
    while(true)
    {
        if(ifeof(filePtr))
        {
            // End of file, just return false.
            return(false);
        }

        // Read the chromosome.
        if(!readTilTab(filePtr, myChrom))
        {
            if(myChrom.empty())
            {
                // Nothing was read: treated as EOF, not as an error.
                return(false);
            }
            // Not an empty line.
            myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                               "Error reading VCF Record CHROM.");
            return(false);
        }

        // Read the 1-based Position.
        std::string strPos;
        if(!readTilTab(filePtr, strPos))
        {
            myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                               "Error reading VCF Record POS.");
            return(false);
        }
        // Convert the position text to an integer.  atoi() performs no
        // validation, so non-numeric text yields 0.
        my1BasedPosNum = atoi(strPos.c_str());

        // Read the ID.
        if(!readTilTab(filePtr, myID))
        {
            myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                               "Error reading VCF Record ID.");
            return(false);
        }

        if(!discardRules.discardForID(myID))
        {
            // Keep this record; go on to read its remaining fields.
            break;
        }

        // Do not keep this id, so consume the rest of the record, clear the
        // partially read values, and try the next record.
        filePtr->discardLine();
        reset();
    }

    // Read the Ref.
    if(!readTilTab(filePtr, myRef))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record REF.");
        return(false);
    }

    // Read the Alt.
    myAltArray.clear();
    if(!readTilTab(filePtr, myAlt))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record ALT.");
        return(false);
    }

    // Read the Qual.
    if(!readTilTab(filePtr, myQual))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record QUAL.");
        return(false);
    }
    if(myQual != ".")
    {
        // Convert the quality text to a floating point value.
        myQualNum = atof(myQual.c_str());
    }
    else
    {
        // "." means the quality is missing; -1 is stored as the marker.
        myQualNum = -1;
    }

    // Read the Filter.
    if(!myFilter.read(filePtr))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record FILTER.");
        return(false);
    }

    // Read the Info (could be the last word in the line or file).
    if(!myInfo.read(filePtr))
    {
        // Found the end of the line after the info field, so return true,
        // successfully read the record.
        return(true);
    }

    if(siteOnly)
    {
        // Do not store genotypes, so just consume the rest of the line.
        filePtr->readTilChar("\n");
    }
    else
    {
        // Not yet at the end of the line, so read the genotype fields
        // (format & samples).  Exceptions from the genotype reader are
        // converted into a FAIL_PARSE status rather than propagated.
        try
        {
            myGenotype.read(filePtr, sampleSubset);
        }
        catch(const std::exception& e)
        {
            myDummyString = "Failed parsing the Genotype Fields of " + myChrom + ":" +
                std::to_string(static_cast<long long int>(my1BasedPosNum)) +
                " (chr:pos) - " + e.what();
            myStatus.setStatus(StatGenStatus::FAIL_PARSE, myDummyString.c_str());
            return(false);
        }
    }

    // Found the end of the line, return true since all required fields
    // were read.
    return(true);
}
Ejemplo n.º 2
0
// Read one sample's colon-separated genotype subfields from filePtr, using
// format to decide which subfields are stored and which one is GT.
//
// The GT subfield is scanned by hand so that phasing ('|'), unphasing ('/')
// and missing alleles ('.') can be recorded while the text is stored.
//
// Returns true when the final read status is END_OF_FIELD (per the original
// note, that is the case when the sample ended with a tab).
// Throws std::runtime_error when the sample has more subfields than the
// format declares.
bool VcfGenotypeSample::read(IFILE filePtr, VcfGenotypeFormat& format)
{
    // Stop characters used while scanning GT.  The value returned by
    // readTilChar is compared against the positions of these characters:
    // anything at or below END_GT finishes the GT subfield.
    static const char* GT_DELIM = "\n\t:|/.";
    static const int END_GT = 2; // Ends at index 2 or less
    static const int PHASED_CHAR_POS = 3;
    static const int UNPHASED_CHAR_POS = 4;
    static const int MISSING_GT_POS = 5;

    // Start from a clean sample and remember which format describes it.
    reset();
    myFormatPtr = &format;

    const int gtFieldPos = format.getGTIndex();

    // fieldCount is both the index of the subfield being read and, once the
    // loop finishes, the number of subfields found in this sample.
    SUBFIELD_READ_STATUS status = MORE_SUBFIELDS;
    int fieldCount = 0;
    for(; status == MORE_SUBFIELDS; ++fieldCount)
    {
        if(!format.storeIndex(fieldCount))
        {
            // This subfield is not kept: read past it without a destination.
            status = readGenotypeSubField(filePtr, NULL);
            continue;
        }

        // This subfield is kept, so claim the next storage slot for it.
        std::string* dest = &(myGenotypeSubFields.getNextEmpty());

        if(fieldCount != gtFieldPos)
        {
            // Ordinary subfield.
            status = readGenotypeSubField(filePtr, dest);
            continue;
        }

        // GT subfield: assume every allele is present until a '.' is seen.
        myHasAllGenotypeAlleles = true;

        // Read piece by piece until a character that ends the subfield is
        // hit.  The separator that stopped each read is appended by hand
        // (presumably readTilChar does not store the stop character itself).
        int stopPos;
        do
        {
            stopPos = filePtr->readTilChar(GT_DELIM, *dest);
            switch(stopPos)
            {
                case PHASED_CHAR_POS:
                    dest->push_back('|');
                    myPhased = true;
                    break;
                case UNPHASED_CHAR_POS:
                    dest->push_back('/');
                    myUnphased = true;
                    break;
                case MISSING_GT_POS:
                    dest->push_back('.');
                    myHasAllGenotypeAlleles = false;
                    break;
                default:
                    break;
            }
        }
        while(stopPos > END_GT);

        // Translate the terminating character into a read status.
        status = getReadStatus(stopPos);
    }

    // Too many subfields for the format is an error.
    if(fieldCount > format.getOrigNumFields())
    {
        throw(std::runtime_error("VCF Number of Fields in a Sample does not match the Format."));
    }

    // Fewer subfields than the format is tolerated; if nothing at all was
    // stored for this sample, record the missing value instead.
    if((fieldCount < format.getOrigNumFields()) &&
       (myGenotypeSubFields.size() == 0))
    {
        myGenotypeSubFields.getNextEmpty() = MISSING_FIELD;
    }

    // Only END_OF_FIELD counts as success here.
    return(status == END_OF_FIELD);
}