Exemple #1
0
// Advances the handler to the beginning of the next section.
//
// For a stub handler no file is consulted: the handler is marked as being at
// the end of a section with position set just past a fixed maxPosition, and
// true is returned.
//
// Otherwise any remaining entries of the current section are consumed, then
// the next section header is read from `handle`: an int label length, the
// label bytes, and an int maxPosition.
//
// Returns true when a header was read, maxPosition is positive and the end of
// the file has not been reached; false otherwise.
bool glfHandler::NextSection()
{
    if (isStub)
    {
        endOfSection = true;
        data.recordType = 0;
        maxPosition = 1999999999;
        position = maxPosition + 1;
        return true;
    }

    // Skip whatever is left of the section currently being read.
    while (!endOfSection && !ifeof(handle))
        NextEntry();

    endOfSection = false;

    currentSection++;
    position = 0;

    int labelLength = 0;
    if (ifread(handle, &labelLength, sizeof(int)) != sizeof(int))
        return false;

    // A negative length can only come from a corrupt header; using it as a
    // buffer size / read size would be meaningless, so stop here.
    if (labelLength < 0)
        return false;

    // The buffer is sized for the label plus a terminator.  Write that
    // terminator explicitly after whatever was actually read so the label is
    // always a bounded C string, even if the file did not store one or the
    // read came up short.
    char * labelBuffer = label.LockBuffer(labelLength + 1);
    const int labelBytes = ifread(handle, labelBuffer, labelLength * sizeof(char));
    labelBuffer[(labelBytes > 0) ? labelBytes : 0] = 0;
    label.UnlockBuffer();

    // Default to 0 so that a failed read below is reported as "no section".
    maxPosition = 0;
    ifread(handle, &maxPosition, sizeof(int));

    return ((maxPosition > 0) && !ifeof(handle));
}
Exemple #2
0
// Read the refSection from the specified file.  Assumes the file is in
// the correct position for reading the refSection.
bool GlfRefSection::read(IFILE filePtr)
{
    // Read the reference sequence name length
    int numRead = 0;
    int32_t refNameLen = 0;
    int byteLen = sizeof(int32_t);
    numRead = ifread(filePtr, &refNameLen, byteLen);
    if(numRead != byteLen)
    {
        // If no bytes were read and it is the end of the file, then return
        // false, but do not throw an exception.  This is not an error, just
        // the end of the file.
        if((numRead == 0) && ifeof(filePtr))
        {
            return(false);
        }

        String errorMsg =
            "Failed to read the length of the reference sequence name (";
        errorMsg += byteLen;
        errorMsg += " bytes).  Only read ";
        errorMsg += numRead;
        errorMsg += " bytes.";
        std::string errorString = errorMsg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, errorString));
        return(false);
    }

    // Read the refSection from the file.
    numRead = myRefName.readFromFile(filePtr, refNameLen);
    if(numRead != refNameLen)
    {
        String errorMsg = "Failed to read the reference sequence name (";
        errorMsg += refNameLen;
        errorMsg += " bytes).  Only read ";
        errorMsg += numRead;
        errorMsg += " bytes.";
        std::string errorString = errorMsg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, errorString));
        return(false);
    }

    // Read the ref length.
    byteLen = sizeof(uint32_t);
    numRead = ifread(filePtr, &myRefLen, byteLen);
    if(numRead != byteLen)
    {
        String errorMsg = "Failed to read the reference sequence length (";
        errorMsg += byteLen;
        errorMsg += " bytes).  Only read ";
        errorMsg += numRead;
        errorMsg += " bytes.";
        std::string errorString = errorMsg.c_str();
        throw(GlfException(GlfStatus::FAIL_IO, errorString));
        return(false);
    }

    // Successfully read, return success.
    return(true);
}
Exemple #3
0
// Loads a one-value-per-line text file into the vector `a`.
//
// `a` is zeroed first.  Each non-empty line is tokenized with
// StringArray::AddTokens; the single token is parsed with atol() and stored
// at index (line number - 1), so skipped lines keep their own slot.
//
// a        - vector to fill; grown as needed so that the current line's
//            index is valid.
// fileName - path of the file to read.
// return: 0 on success, -1 if a line contains more than one token (the
//         offending line is printed).
int loadVector(Vector& a, String& fileName) {
  a.Zero();

  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;
  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;
    array.Clear();
    array.AddTokens(line);
    // A line made only of delimiters yields no tokens; there is nothing to
    // store and array[0] would not exist, so treat it like an empty line.
    if (array.Length() == 0) continue;
    if (array.Length() > 1) {
      fprintf(stderr, "Warning: column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    }
    // Skipped lines still advance lineNo, so the vector may be more than one
    // element short; grow straight to the required size instead of by one.
    if (a.dim < lineNo) {
      a.GrowTo(lineNo);
    }
    a[lineNo - 1] = atol(array[0]);
  }

  // a.Print(stdout);

  return 0;
}
Exemple #4
0
// Loads a text file of token-separated rows into the matrix `a`.
//
// `a` is zeroed first.  Each non-empty line is tokenized with
// StringArray::AddTokens; every token is parsed with atol() and stored in
// row (line number - 1), so skipped lines keep their own row.
//
// a        - matrix to fill; its column count is taken from the first
//            parsed line when it is still 0, and rows are grown as needed.
// fileName - path of the file to read.
// return: 0 on success, -1 if a line's token count differs from the
//         matrix's (non-zero) column count (the offending line is printed).
int loadMatrix(Matrix& a, String& fileName) {
  a.Zero();

  IFILE ifile(fileName.c_str(), "r");
  String line;
  StringArray array;
  int lineNo = 0;
  while (!ifeof(ifile)) {
    line.ReadLine(ifile);
    lineNo++;
    if (line.Length() == 0) continue;
    array.Clear();
    array.AddTokens(line);
    if (a.cols != 0 && a.cols != array.Length()) {
      fprintf(stderr, "Wrong column size at line %d!\n", lineNo);
      array.Print();
      line.Write(stdout);
      return -1;
    } else {
      a.GrowTo(a.rows, array.Length());
    }
    // Skipped lines still advance lineNo, so the matrix may be more than one
    // row short; grow straight to the required row count instead of by one.
    if (a.rows < lineNo) {
      a.GrowTo(lineNo, a.cols);
    }
    for (int i = 0; i < array.Length(); i++) {
      a[lineNo - 1][i] = atol(array[i]);
    }
  }

  // a.Print(stdout);
  return 0;
}
Exemple #5
0
// copied from Matthew Flickenger/Snyder
// Keeps reading lines and handing them to parseMetaLine() for as long as a
// line was read, the end of the file has not been reached and the line is a
// meta line; then verifies the collected meta lines.
void VcfFile::parseMeta() {
  // The three conditions are evaluated left to right and stop at the first
  // failure, so isMetaLine() is only consulted for a line that was read.
  while ( readLine() > 0 && !ifeof(iFile) && isMetaLine() ) {
    parseMetaLine();
  }
  verifyMetaLines();
}
Exemple #6
0
// Reads lines from `f` until ifeof(f) reports the end of the file, adding the
// trimmed text of each line to this hash.
void StringHash::ReadLinesFromFile(IFILE & f)
{
    for (String line; !ifeof(f); )
    {
        line.ReadLine(f);
        Add(line.Trim());
    }
}
Exemple #7
0
// Reports whether the end of the file has been reached.
// return: bool - true = EOF, or no file pointer is set; false = not EOF.
bool SamFile::IsEOF()
{
    // With no file pointer set there is nothing to read, so report EOF.
    if (myFilePtr == NULL)
    {
        return true;
    }

    // A file pointer is set, so defer to the file's own EOF state.
    return(ifeof(myFilePtr));
}
Exemple #8
0
// Reports whether the end of the file has been reached.
// return: bool - true = EOF, or no file pointer is set; false = not EOF.
bool VcfFileReader::isEOF()
{
    // A missing file pointer counts as EOF; otherwise ask the file itself.
    return((myFilePtr == NULL) || ifeof(myFilePtr));
}
Exemple #9
0
// Loads a region list and marks the covered genome positions in
// regionIndicator.
//
// Each usable line of the file holds at least three whitespace-separated
// tokens: chromosome name, start position, end position.  Empty lines, lines
// starting with '#', lines with fewer than three tokens, regions whose start
// cannot be mapped inside regionIndicator and regions with a non-positive
// length are skipped.
//
// regionsFile  - path of the region list; nothing is done when it is empty.
// genome       - reference used to translate (chromosome, position) into a
//                genome index; its length sizes regionIndicator.
// invertRegion - when true, every flag in regionIndicator is flipped after
//                loading.
void GCContent::LoadRegions(String & regionsFile, GenomeSequence &genome, bool invertRegion)
{
    if(regionsFile.Length()==0) return;
    if(genome.sequenceLength()==0) error("No reference genome loaded!\n");

    IFILE fhRegions;
    fhRegions = ifopen(regionsFile.c_str(),"r");
    if(fhRegions==NULL)
        error("Open regions file %s failed!\n", regionsFile.c_str());

    regionIndicator.resize(genome.sequenceLength());

    StringArray tokens;
    String buffer;

    fprintf(stderr, "Loading region list...");

    while (!ifeof(fhRegions)){
        buffer.Clear();
        buffer.ReadLine(fhRegions);
        if (buffer.IsEmpty() || buffer[0] == '#') continue;

        // Reset the tokens before every line: the loop has several early
        // `continue` exits, and tokens left over from a skipped line would
        // otherwise be parsed in place of the current line's.
        tokens.Clear();
        tokens.AddTokens(buffer, WHITESPACE);
        if(tokens.Length() < 3) continue;

        int regionStart = tokens[1].AsInteger();
        int regionEnd = tokens[2].AsInteger();

        // use chromosome name (token[0]) and position (token[1]) to query genome index.
        genomeIndex_t startGenomeIndex =
            genome.getGenomePosition(tokens[0].c_str(), regionStart);

        if(startGenomeIndex >= regionIndicator.size() ) {
            //fprintf(stderr, "WARNING: region list section %s position %u is not found in the reference and skipped...\n", tokens[0].c_str(), regionStart);
            continue;
        }

        // An end before the start gives a non-positive length; there is
        // nothing to mark, and using it as an unsigned bound would wrap.
        int len = regionEnd - regionStart + 1;
        if(len <= 0) continue;

        // Clamp the region to the indicator so a region running past the end
        // of the reference cannot write out of bounds.
        size_t endGenomeIndex =
            static_cast<size_t>(startGenomeIndex) + static_cast<size_t>(len);
        if(endGenomeIndex > regionIndicator.size())
            endGenomeIndex = regionIndicator.size();

        for(size_t i = startGenomeIndex; i < endGenomeIndex; i++)
            regionIndicator[i] = true;
    }

    if (invertRegion) {
        fprintf(stderr, " invert region...");
        for (uint32_t i = 0; i < regionIndicator.size(); i++) {
            regionIndicator[i] = !regionIndicator[i];
        }
    }

    ifclose(fhRegions);
    fprintf(stderr, "DONE!\n");
}
Exemple #10
0
// Reports whether the open file is at its end.
// Returns false when no file is open.
bool FastQFile::isEof()
{
   // No file is open, so it cannot be at EOF.
   if(myFile == NULL)
   {
      return false;
   }

   // A file is open: at EOF exactly when the file says so.
   if(ifeof(myFile))
   {
      return true;
   }
   return false;
}
Exemple #11
0
int main() 
{
    String fn = "/home/zhanxw/compareMapSoft/index/mapreads/chr1.fa";
    IFILE file = ifopen(fn.c_str(), "r");

    int totalChar = 0;
    String line;
    int freq[256] = {0};
    while (!ifeof(file)){
        line.ReadLine(file);
        totalChar += line.Length();
        for (int i = 0; i < line.Length(); i++)
            freq[(unsigned int) line[i]]++;
    }
    printf("A frequency: %d (%f)\n", freq[(int)'A'], (float)freq[(int)'A']/totalChar);
    printf("T frequency: %d (%f)\n", freq[(int)'T'], (float)freq[(int)'T']/totalChar);
    printf("G frequency: %d (%f)\n", freq[(int)'G'], (float)freq[(int)'G']/totalChar);
    printf("C frequency: %d (%f)\n", freq[(int)'C'], (float)freq[(int)'C']/totalChar);
}
// Appends characters from the file to `line` up to, but not including, the
// next '\n'.
// return: 0 if a newline ended the line; -1 if the end of the file was hit
// first (either ifgetc() returned EOF or ifeof() was already true).
int InputFile::readLine(std::string& line)
{
    for(int nextChar = 0; !ifeof(); line += nextChar)
    {
        nextChar = ifgetc();
        if(nextChar == '\n')
        {
            // End of the line; the newline itself is not stored.
            return(0);
        }
        if(nextChar == EOF)
        {
            return(-1);
        }
    }
    // Only reached when ifeof() is true before a character is fetched.
    return(-1);
}
// Appends characters from the file to `field` up to, but not including, the
// next tab or newline.
// return: 1 if a tab ended the field; 0 if a newline ended it; -1 if the end
// of the file was hit first.
int InputFile::readTilTab(std::string& field)
{
    while(!ifeof())
    {
        const int nextChar = ifgetc();
        switch(nextChar)
        {
            case EOF:
                return(-1);
            case '\n':
                return(0);
            case '\t':
                return(1);
            default:
                // Ordinary character: part of the field.
                field += nextChar;
                break;
        }
    }
    // ifeof() was true before a character could be fetched.
    return(-1);
}
Exemple #14
0
// Selects the next section of `samIn` to process.
//
// Without a region list the whole input is the single section: the first
// call returns true and every later call returns false (tracked by a
// function-level static flag).
//
// With a region list, lines are read from myRegionList until one describes a
// usable region (at least 3 tab-separated columns, integer start and end,
// and start < end unless end is -1).  That region is set on `samIn` and true
// is returned; false is returned when the list is exhausted first.
bool Stats::getNextSection(SamFile &samIn)
{
    static bool alreadyRead = false;
    if(myRegionList == NULL)
    {
        // No region list: report "more to read" exactly once.
        const bool firstCall = !alreadyRead;
        alreadyRead = true;
        return(firstCall);
    }

    // There is a region list, so look for the next usable line in it.
    myStartPos = 0;
    myEndPos = 0;

    while(!ifeof(myRegionList))
    {
        myRegBuffer.Clear();
        myRegBuffer.ReadLine(myRegionList);
        if(myRegBuffer.IsEmpty())
        {
            // Nothing on this line, try the next one.
            continue;
        }

        // Split the line into its tab-separated columns.
        myRegColumn.ReplaceColumns(myRegBuffer, '\t');
        if(myRegColumn.Length() < 3)
        {
            // Too few columns.
            std::cerr << "Improperly formatted reg line: "
                      << myRegBuffer
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[1].AsInteger(myStartPos))
        {
            // The 2nd column (start position) is not an integer.
            std::cerr << "Improperly formatted region line, start position "
                      << "(2nd column) is not an integer: "
                      << myRegColumn[1]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[2].AsInteger(myEndPos))
        {
            // The 3rd column (end position) is not an integer.
            std::cerr << "Improperly formatted region line, end position "
                      << "(3rd column) is not an integer: "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if((myEndPos != -1) && (myStartPos >= myEndPos))
        {
            // The start does not come before the end.
            std::cerr << "Improperly formatted region line, the start position "
                      << "is >= end position: "
                      << myRegColumn[1]
                      << " >= "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        // Usable region: hand it to the SAM reader and stop searching.
        samIn.SetReadSection(myRegColumn[0].c_str(), myStartPos, myEndPos);
        return(true);
    }

    // Ran out of region lines without finding a usable one.
    return(false);
}
Exemple #15
0
// Reads and validates a single fastq sequence from myFile.
//
// The sequence identifier line, the raw sequence / '+' lines and the quality
// string lines are validated in that order.
//
// return:
//   FASTQ_ORDER_ERROR       - no file is open.
//   FASTQ_NO_SEQUENCE_ERROR - isTimeToQuit() was already true, or the end of
//                             the file was reached with no sequence pending.
//   FASTQ_READ_ERROR        - one of the validation steps set myFileProblem.
//   FASTQ_INVALID           - the sequence is incomplete or failed validation,
//                             or isTimeToQuit() became true part way through.
//   FASTQ_SUCCESS           - the sequence was read and is valid.
FastQStatus::Status FastQFile::readFastQSequence()
{
   // Nothing can be read unless a file is open.
   if(!isOpen())
   {
      const char* notOpenMessage =
         "ERROR: Trying to read a fastq file but no file is open.";
      logMessage(notOpenMessage);
      return(FastQStatus::FASTQ_ORDER_ERROR);
   }

   // Start from a clean per-sequence state.
   resetForEachSequence();

   // Asked to stop before anything was read: no sequence.
   if(isTimeToQuit())
   {
      return(FastQStatus::FASTQ_NO_SEQUENCE_ERROR);
   }

   // Step 1: the sequence identifier line.
   bool sequenceOk = validateSequenceIdentifierLine();

   if(myFileProblem)
   {
      return(FastQStatus::FASTQ_READ_ERROR);
   }

   // At the end of the file this is either a partial sequence or just the
   // end of the input.
   if(ifeof(myFile))
   {
      if(mySequenceIdLine.Length() != 0)
      {
         // An identifier line with nothing after it: incomplete sequence.
         myErrorString = "Incomplete Sequence.\n";
         reportErrorOnLine();
         sequenceOk = false;
      }

      // With no problem found there was simply nothing left to read, which
      // is not reported as an invalid sequence.
      return(sequenceOk ? FastQStatus::FASTQ_NO_SEQUENCE_ERROR
                        : FastQStatus::FASTQ_INVALID);
   }

   // Stopping here means enough errors were seen, so report it as invalid.
   if(isTimeToQuit())
   {
      return(FastQStatus::FASTQ_INVALID);
   }

   // Step 2: the raw sequence line(s) and the "+" line.
   if(!validateRawSequenceAndPlusLines())
   {
      sequenceOk = false;
   }

   if(myFileProblem)
   {
      return(FastQStatus::FASTQ_READ_ERROR);
   }

   if(isTimeToQuit())
   {
      return(FastQStatus::FASTQ_INVALID);
   }

   // End of file at this point means the quality string is missing.
   if(ifeof(myFile))
   {
      myErrorString = "Incomplete Sequence, missing Quality String.";
      reportErrorOnLine();
      return(FastQStatus::FASTQ_INVALID);
   }

   // Step 3: the quality string line(s).
   if(!validateQualityStringLines())
   {
      sequenceOk = false;
   }

   if(myFileProblem)
   {
      return(FastQStatus::FASTQ_READ_ERROR);
   }

   return(sequenceOk ? FastQStatus::FASTQ_SUCCESS
                     : FastQStatus::FASTQ_INVALID);
}
Exemple #16
0
// Reads and validates the sequence identifier line of a fastq sequence.
//
// Reads one line from myFile into mySequenceIdLine and checks that it is at
// least 2 characters long and starts with '@'.  The identifier (the text
// after '@' up to the first space, or to the end of the line when there is
// no space) is stored in mySequenceIdentifier.
//
// When myInterleaved is set and myPrevSeqID is not empty, the identifier is
// compared against myPrevSeqID.  Otherwise, when myCheckSeqID is set, the
// identifier is inserted into myIdentifierMap to detect repeats.
//
// return: true if the line is valid, or if nothing was read because the end
// of the file was reached; false otherwise.  A failed read that is not at
// the end of the file additionally sets myFileProblem.
bool FastQFile::validateSequenceIdentifierLine()
{
   // Read the first line of the sequence.
   int readStatus = mySequenceIdLine.ReadLine(myFile);

   // Check to see if the read was successful.
   if(readStatus <= 0)
   {
      // If EOF, not an error.
      if(ifeof(myFile))
      {
         return true;
      }
      myFileProblem = true;
      myErrorString = "Failure trying to read sequence identifier line";
      reportErrorOnLine();
      return false;
   }

   // If the line is 0 length and it is the end of the file, just
   // return since this is the eof - no error.
   if((mySequenceIdLine.Length() == 0) && (ifeof(myFile)))
   {
      // Not an error, just a new line at the end of the file.
      return true;
   }

   // Increment the line number.  This happens only after the EOF checks
   // above, so an empty read at the end of the file is not counted.
   myLineNum++;
   
   // Verify that the line has at least 2 characters: '@' and at least
   // one character for the sequence identifier.
   if(mySequenceIdLine.Length() < 2)
   {
      // Error. Sequence Identifier line not long enough.
      myErrorString = "The sequence identifier line was too short.";
      reportErrorOnLine();
      return false;
   }
   
   // The sequence identifier line must start with a '@'
   if(mySequenceIdLine[0] != '@')
   {
      // Error - sequence identifier line does not begin with an '@'.
      myErrorString = "First line of a sequence does not begin with @";
      reportErrorOnLine();
      return false;
   }

   // Valid Sequence Identifier Line.

   // The sequence identifier ends at the first space or at the end of the
   // line if there is no space.
   // Use fast find since this is a case insensitive search.
   // Start at 1 since we know that 0 is '@'
   int endSequenceIdentifier = mySequenceIdLine.FastFindChar(' ', 1);
   
   // Check if a " " was found.
   if(endSequenceIdentifier == -1)
   {
      // Did not find a ' ', so the identifier is the rest of the line.
      // It starts at 1 since @ is at offset 0.
      mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
   }
   else
   {
      // Found a ' ', so the identifier ends just before that.
      // The sequence identifier must be at least 1 character long, 
      // therefore the endSequenceIdentifier must be greater than 1.
      if(endSequenceIdentifier <= 1)
      {
         myErrorString = 
            "No Sequence Identifier specified before the comment.";
         reportErrorOnLine();
         return false;
      }

      mySequenceIdentifier = 
         (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
   }

   // If myInterleaved, validate matches the previous seqID.
   // A non-empty myPrevSeqID means the previous call stored an identifier
   // that this one is expected to pair with.
   if(myInterleaved && (myPrevSeqID != ""))
   {
       // Valid if the sequence identifiers are identical or if
       // the only difference is a trailing 1 or 2.
       if(myPrevSeqID.compare(mySequenceIdentifier) != 0)
       {
           // Compare all but the last characters, then check the last characters for 1 or 2.
           // The pair is rejected when the prefixes differ, or when the last
           // characters are neither ('1' then '2') nor equal to each other.
           if((myPrevSeqID.compare(0, myPrevSeqID.length()-1, mySequenceIdentifier.c_str(), mySequenceIdentifier.Length()-1) != 0) || 
              (((myPrevSeqID[myPrevSeqID.length()-1] != '1') || (mySequenceIdentifier[mySequenceIdentifier.Length()-1] != '2')) && 
               (myPrevSeqID[myPrevSeqID.length()-1] != mySequenceIdentifier[mySequenceIdentifier.Length()-1])))
           {
               myErrorString = "Interleaved: consecutive reads do not have matching sequence identifiers: ";
               myErrorString += mySequenceIdentifier.c_str();
               myErrorString += " and ";
               myErrorString += myPrevSeqID.c_str();
               reportErrorOnLine();
               myPrevSeqID.clear();
               return(false);
           }
       }
       // The pair is complete, so forget the stored identifier; the next
       // identifier read is then treated as the first of a new pair.
       myPrevSeqID.clear();
   }
   else
   {
       if(myInterleaved)
       {
           // Remember this identifier so the next call can compare against it.
           myPrevSeqID = mySequenceIdentifier.c_str();
       }

       // Check if sequence identifier should be validated for uniqueness if it is 
       // not the 2nd in an interleaved pair.
       if(myCheckSeqID)
       {
           // Check to see if the sequenceIdentifier is a repeat by adding
           // it to the set and seeing if it already existed.
           // The value stored with each identifier is the line number it was
           // read on, which is used in the error message below.
           std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
           insertResult = 
               myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(), 
                                                     myLineNum));
           
           if(insertResult.second == false)
           {
               // Sequence Identifier is a repeat.
               myErrorString = "Repeated Sequence Identifier: ";
               myErrorString += mySequenceIdentifier.c_str();
               myErrorString += " at Lines ";
               myErrorString += insertResult.first->second;
               myErrorString += " and ";
               myErrorString += myLineNum;
               reportErrorOnLine();
               return(false);
           }
       }
   }

   // Valid, return true.
   return(true);
}
Exemple #17
0
// Reads and validates the raw sequence line(s) and the plus line.  Both are
// included in one method since it is unknown when the raw sequence line
// ends until you find the plus line that divides it from the quality
// string.  Since this method will read the plus line to know when the
// raw sequence ends, it also validates that line.
bool FastQFile::validateRawSequenceAndPlusLines()
{
   // Read the raw sequence.
   int readStatus = myRawSequence.ReadLine(myFile);

   myLineNum++;

   if(readStatus <= 0)
   {
      myFileProblem = true;
      myErrorString = "Failure trying to read sequence line";
      reportErrorOnLine();
      return false;
   }

   // Offset into the raw sequence to be validated.
   int offset = 0;
   
   // Validate the raw sequence.
   bool valid = validateRawSequence(offset);

   // Increment the offset for what was just read.
   offset = myRawSequence.Length();

   // The next line is either a continuation of the raw sequence or it starts
   // with a '+'
   // Keep validating til the '+' line or the end of file is found.
   bool stillRawLine = true;
   while(stillRawLine && 
         !ifeof(myFile))
   {
      // If enough errors, quit before reading any more.
      if(isTimeToQuit())
      {
         return(false);
      }

      // Read the next line.
      // Read into the plus line, but if it isn't a plus line, then
      // it will be copied into the raw sequence line.
      readStatus = myPlusLine.ReadLine(myFile);
      myLineNum++;

      if(readStatus <= 0)
      {
         myFileProblem = true;
         myErrorString = "Failure trying to read sequence/plus line";
         reportErrorOnLine();
         return false;
      }

      // Check if the next line is blank
      if(myPlusLine.Length() == 0)
      {
         // The next line is blank.  Assume it is part of the raw sequence and
         // report an error since there are no valid characters on the line.
         myErrorString = 
            "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
         reportErrorOnLine();
      }
      // Check for the plus line.
      else if(myPlusLine[0] == '+')
      {
         // This is the + line.
         valid &= validateSequencePlus();
         stillRawLine = false;
      }
      else
      {
         // Not a plus line, so assume this is a continuation of the Raw
         // Sequence.
         // Copy from the plus line to the raw sequence line.
         myRawSequence += myPlusLine;
         myPlusLine.SetLength(0);
         valid &= validateRawSequence(offset);
         
         // Increment the offset.
         offset = myRawSequence.Length();
      }
   }
   
   // If enough errors, quit before reading any more.
   if(isTimeToQuit())
   {
      return(false);
   }
   
   // Now that the entire raw sequence has been obtained, check its length
   // against the minimum allowed length.
   if(myRawSequence.Length() < myMinReadLength)
   {
      // Raw sequence is not long enough - error.
      myErrorString = "Raw Sequence is shorter than the min read length: ";
      myErrorString += myRawSequence.Length();
      myErrorString += " < ";
      myErrorString += myMinReadLength;
      reportErrorOnLine();
      valid = false;
   }

   // If enough errors, quit before reading any more.
   if(isTimeToQuit())
   {
      return(false);
   }

   // if the flag still indicates it is processing the raw sequence that means
   // we reached the end of the file without a '+' line.  So report that error.
   if(stillRawLine)
   {
      myErrorString = "Reached the end of the file without a '+' line.";
      reportErrorOnLine();
      valid = false;
   }

   return(valid);
}
Exemple #18
0
// Reads and validates the quality string line(s).
//
// Lines are read and appended to myQualityString until it is at least as
// long as myRawSequence or the file ends; each newly read part is validated.
//
// return: true if every part was valid and the final quality string length
// equals the raw sequence length; false otherwise, including on a failed
// read (which also sets myFileProblem) or when isTimeToQuit() becomes true.
bool FastQFile::validateQualityStringLines()
{
   // The first quality line.
   const int firstStatus = myQualityString.ReadLine(myFile);
   ++myLineNum;

   if(firstStatus <= 0)
   {
      myFileProblem = true;
      myErrorString = "Failure trying to read quality line";
      reportErrorOnLine();
      return false;
   }

   // Number of quality characters already validated; each validation call
   // starts from this offset.
   int validatedLength = 0;
   bool qualityOk = validateQualityString(validatedLength);
   validatedLength = myQualityString.Length();

   // Keep appending lines while the quality string is shorter than the raw
   // sequence and there is still file left to read.
   while(myQualityString.Length() < myRawSequence.Length())
   {
      if(ifeof(myFile))
      {
         break;
      }

      // Stop early once enough errors have been seen.
      if(isTimeToQuit())
      {
         return(false);
      }

      const int nextStatus = myTempPartialQuality.ReadLine(myFile);
      ++myLineNum;

      if(nextStatus <= 0)
      {
         myFileProblem = true;
         myErrorString = "Failure trying to read quality line";
         reportErrorOnLine();
         return false;
      }

      myQualityString += myTempPartialQuality;
      myTempPartialQuality.Clear();

      // Validate only the part that was just appended.
      if(!validateQualityString(validatedLength))
      {
         qualityOk = false;
      }
      validatedLength = myQualityString.Length();
   }

   // Stop early once enough errors have been seen.
   if(isTimeToQuit())
   {
      return(false);
   }

   // The quality string must be exactly as long as the raw sequence.
   if(myQualityString.Length() != myRawSequence.Length()) 
   {
      myErrorString = "Quality string length (";
      myErrorString += myQualityString.Length();
      myErrorString += ") does not equal raw sequence length (";
      myErrorString += myRawSequence.Length();
      myErrorString += ")";
      reportErrorOnLine();
      qualityOk = false;
   }
   return(qualityOk);
}
Exemple #19
0
// Selects the next section of mySamIn to read.
//
// The source of the section is chosen in this order:
//   1. myRefName, if set: used once with myStart/myEnd, then cleared.
//   2. myRefID, if set: used once with myStart/myEnd, then reset.
//   3. myBedFile, if set: lines are read until one yields a usable region
//      or the end of the file is reached.
//   4. Otherwise there is another section only if no region has been
//      written yet (!myWroteReg).
//
// return: true if a section was selected (or, in case 4, a region still has
// to be written); false if there are no more sections.
bool WriteRegion::getNextSection()
{
    bool anotherSection = false;
    // If refName is set, use that.
    if(myRefName.Length() != 0)
    {
        // Use Reference Name for the next section.
        anotherSection = true;
        mySamIn.SetReadSection(myRefName.c_str(), myStart, myEnd, !myWithinReg);
        // Already processed this section, so clear the reference name
        // so it will not be used again.
        myRefName.Clear();
        myStart = UNSPECIFIED_INT;
        myEnd = UNSPECIFIED_INT;
    }
    else if(myRefID != UNSET_REF)
    {
        // Use Reference ID for the next section.
        anotherSection = true;
        mySamIn.SetReadSection(myRefID, myStart, myEnd, !myWithinReg);
        // Already processed this section, so clear the reference id
        // so it will not be used again.
        myRefID = UNSET_REF;
        myStart = UNSPECIFIED_INT;
        myEnd = UNSPECIFIED_INT;
    }
    else if(myBedFile != NULL)
    {
        // There is a bed file, so read the next line.
        // Keep reading until a usable region is found or the file ends.
        while(!anotherSection)
        {
            myBedBuffer.Clear();
            myBedBuffer.ReadLine(myBedFile);
            // A non-empty buffer at EOF is still processed below; the loop
            // only stops once EOF is reached and nothing was read.
            if(ifeof(myBedFile) && myBedBuffer.IsEmpty())
            {
                // End of the file, so break.
                break;
            }
            // Not the end of the file, so parse the line.
            myBedColumn.ReplaceColumns(myBedBuffer, '\t');
            if(myBedColumn.Length() != 3)
            {
                // Incorrectly formatted line.
                std::cerr << "Improperly formatted bed line: "
                          << myBedBuffer
                          << "; Skipping to the next line.\n";
            }
            else
            {
                // Check the reference name.
                if(myPrevRefName != myBedColumn[0])
                {
                    // New reference name (chromosome), so clear the previous
                    // start/end.
                    myPrevStart = UNSPECIFIED_INT;
                    myPrevEnd = UNSPECIFIED_INT;
                    myPrevRefName = myBedColumn[0];

                    // Get the reference ID for the reference name.
                    myBedRefID = mySamHeader.getReferenceID(myPrevRefName);
                    
                    // Check to see if the reference ID is found.
                    if(myBedRefID == SamReferenceInfo::NO_REF_ID)
                    {
                        // The specified Reference ID is not in the file,
                        // so check to see if it has chr.
                        // Check to see if it is the same except for 'chr' appended.
                        // NOTE(review): the first three characters are indexed
                        // without checking the name's length - confirm that
                        // indexing a shorter name is safe.
                        if((myPrevRefName[0] == 'c') && 
                           (myPrevRefName[1] == 'h') && 
                           (myPrevRefName[2] == 'r'))
                        {
                            // It starts with chr, so look up with out the chr
                            myBedRefID = mySamHeader.getReferenceID(myPrevRefName.c_str() + 3);
                        }
                    }
                }
                else
                {
                    // Not a new reference name.
                    // Store the previous positions before overwriting them.
                    myPrevStart = myStart;
                    if(myPrevEnd < myEnd)
                    {
                        // The last section ends later than the previous one,
                        // So update the previous latest end.
                        myPrevEnd = myEnd;
                    }
                }

                // If the refID is still NO_REF_ID, just continue to the next bed line.
                // myBedRefID is only looked up when the reference name
                // changes, so later lines with the same unknown name are
                // skipped here as well.
                if(myBedRefID == SamReferenceInfo::NO_REF_ID)
                {
                    continue;
                }

                // Correct number of columns, check the columns.
                if(!myBedColumn[1].AsInteger(myStart))
                {
                    // The start position (2nd column) is not an integer.
                    std::cerr << "Improperly formatted bed line, start position (2nd column) is not an integer: "
                              << myBedColumn[1]
                              << "; Skipping to the next line.\n";         
                }
                else if(!myBedColumn[2].AsInteger(myEnd))
                {
                    // The end position (3rd column) is not an integer.
                    std::cerr << "Improperly formatted bed line, end position (3rd column) is not an integer: "
                              << myBedColumn[2]
                              << "; Skipping to the next line.\n";         
                }
                else if(myStart >= myEnd)
                {
                    // The start position is >= the end
                    std::cerr << "Improperly formatted bed line, the start position is >= end position: "
                              << myBedColumn[1]
                              << " >= "
                              << myBedColumn[2]
                              << "; Skipping to the next line.\n";         
                }
                else if(myPrevStart > myStart)
                {
                    // Same reference name, but the position goes backwards.
                    // This is against the assumption that the bed is sorted.
                    std::cerr << "Improperly formatted bed, the start position is < the previous start (bed is assumed to be sorted): "
                              << myStart
                              << " < "
                              << myPrevStart
                              << "; Skipping to the next line.\n";
                }
                else
                {
                    // Usable region: select it and leave the loop.
                    anotherSection = true;
                    mySamIn.SetReadSection(myBedRefID, myStart, myEnd, !myWithinReg);
                }
            }
        }
    }
    else
    {
        // If we have no bed file, then we only have another section
        // if we have not already written a region.
        anotherSection = !myWroteReg;
    }
    
    return(anotherSection);
}
Exemple #20
0
// Read the next VCF record from filePtr into this object, skipping any
// record whose ID is rejected by discardRules.
//
// filePtr      - file to read from; NULL is reported as FAIL_ORDER.
// siteOnly     - if true, the genotype columns are consumed but not stored.
// discardRules - consulted with each record's ID; discarded records are
//                skipped and the following record is read instead.
// sampleSubset - passed through to the genotype reader.
//
// Returns true if a record was read.  Returns false at the end of the file
// (myStatus is left as set by reset()) or on a parse failure (myStatus is
// set to FAIL_PARSE).
bool VcfRecord::read(IFILE filePtr, bool siteOnly,
                     VcfRecordDiscardRules& discardRules,
                     VcfSubsetSamples* sampleSubset)
{
    // Loop until a record that should be kept has had its ID read.
    // This is done iteratively rather than by calling read() recursively
    // so that a long run of discarded records cannot exhaust the stack.
    for(;;)
    {
        // Clear out any previously set values.
        reset();

        if(filePtr == NULL)
        {
            myStatus.setStatus(StatGenStatus::FAIL_ORDER,
                               "Error reading VCF record before opening the file.");
            return(false);
        }

        if(ifeof(filePtr))
        {
            // End of file, just return false.
            return(false);
        }

        // Read the chromosome.
        if(!readTilTab(filePtr, myChrom))
        {
            if(myChrom.empty())
            {
                // EOF.
                return(false);
            }
            // Not an empty line.
            myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                               "Error reading VCF Record CHROM.");
            return(false);
        }

        // Read the 1-based Position
        std::string strPos;
        if(!readTilTab(filePtr, strPos))
        {
            myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                               "Error reading VCF Record POS.");
            return(false);
        }
        // Read the position, so convert to an integer.
        my1BasedPosNum = atoi(strPos.c_str());

        // Read the ID.
        if(!readTilTab(filePtr, myID))
        {
            myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                               "Error reading VCF Record ID.");
            return(false);
        }

        if(!discardRules.discardForID(myID))
        {
            // Keep this record, so go on to read the rest of its fields.
            break;
        }

        // Do not keep this id, so consume the rest of the record and
        // go around again to read the next record.
        filePtr->discardLine();
    }

    // Read the Ref.
    if(!readTilTab(filePtr, myRef))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record REF.");
        return(false);
    }

    // Read the Alt.
    myAltArray.clear();
    if(!readTilTab(filePtr, myAlt))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record ALT.");
        return(false);
    }

    // Read the Qual.
    if(!readTilTab(filePtr, myQual))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record QUAL.");
        return(false);
    }
    if(myQual != ".")
    {
        // Read the quality, so convert to a floating point number.
        myQualNum = atof(myQual.c_str());
    }
    else
    {
        // "." means no quality was specified; flag that with -1.
        myQualNum = -1;
    }

    // Read the Filter.
    if(!myFilter.read(filePtr))
    {
        myStatus.setStatus(StatGenStatus::FAIL_PARSE,
                           "Error reading VCF Record FILTER.");
        return(false);
    }

    // Read the Info (could be the last word in the line or file).
    if(!myInfo.read(filePtr))
    {
        // Found the end of the line after the info field, so return true,
        // successfully read the record.
        return(true);
    }

    if(siteOnly)
    {
        // Do not store genotypes, so just consume the rest of the line.
        filePtr->readTilChar("\n");
    }
    else
    {
        // Not yet at the end of the line, so read the genotype fields
        // (format & samples)
        try
        {
            myGenotype.read(filePtr, sampleSubset);
        }
        catch(std::exception& e)
        {
            // Convert the exception into a FAIL_PARSE status that
            // identifies the record by chromosome and position.
            myDummyString = "Failed parsing the Genotype Fields of " + myChrom + ":" + 
                std::to_string((long long int)my1BasedPosNum) + " (chr:pos) - " + e.what();
            myStatus.setStatus(StatGenStatus::FAIL_PARSE, myDummyString.c_str());
            return(false);
        }
    }

    // Found the end of the line, return true since all required fields
    // were read.
    return(true);
}
Exemple #21
0
// Run the "stats" operation: parse the command line, open the input
// SAM/BAM file and, depending on the options, generate basic statistics,
// quality / phred histograms and/or per-base QC pileup statistics.
//
// argc/argv - the command line; parameter parsing starts at argv[1].
//
// Returns -1 if the arguments are invalid (no --in, or both --pBaseQC and
// --cBaseQC), the SamFile status if the file or its header cannot be read,
// and otherwise the final SamFile status with NO_MORE_RECS reported as
// SUCCESS.
int Stats::execute(int argc, char **argv)
{
    // Extract command line arguments.
    String inFile = "";
    String indexFile = "";
    bool basic = false;
    bool noeof = false;
    bool params = false;
    bool qual = false;
    bool phred = false;
    int maxNumReads = -1;
    bool unmapped = false;
    String pBaseQC = "";
    String cBaseQC = "";
    String regionList = "";
    int excludeFlags = 0;
    int requiredFlags = 0;
    bool withinRegion = false;
    int minMapQual = 0;
    String dbsnp = "";
    PosList *dbsnpListPtr = NULL;
    bool baseSum = false;
    int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE;

    ParameterList inputParameters;
    BEGIN_LONG_PARAMETERS(longParameterList)
        LONG_PARAMETER_GROUP("Required Parameters")
        LONG_STRINGPARAMETER("in", &inFile)
        LONG_PARAMETER_GROUP("Types of Statistics")
        LONG_PARAMETER("basic", &basic)
        LONG_PARAMETER("qual", &qual)
        LONG_PARAMETER("phred", &phred)
        LONG_STRINGPARAMETER("pBaseQC", &pBaseQC)
        LONG_STRINGPARAMETER("cBaseQC", &cBaseQC)
        LONG_PARAMETER_GROUP("Optional Parameters")
        LONG_INTPARAMETER("maxNumReads", &maxNumReads)
        LONG_PARAMETER("unmapped", &unmapped)
        LONG_STRINGPARAMETER("bamIndex", &indexFile)
        LONG_STRINGPARAMETER("regionList", &regionList)
        LONG_INTPARAMETER("excludeFlags", &excludeFlags)
        LONG_INTPARAMETER("requiredFlags", &requiredFlags)
        LONG_PARAMETER("noeof", &noeof)
        LONG_PARAMETER("params", &params)
        LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters")
        LONG_PARAMETER("withinRegion", &withinRegion)
        LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters")
        LONG_PARAMETER("baseSum", &baseSum)
        LONG_INTPARAMETER("bufferSize", &bufferSize)
        LONG_INTPARAMETER("minMapQual", &minMapQual)
        LONG_STRINGPARAMETER("dbsnp", &dbsnp)
        END_LONG_PARAMETERS();
   
    inputParameters.Add(new LongParameters ("Input Parameters", 
                                            longParameterList));

    inputParameters.Read(argc-1, &(argv[1]));

    // If no eof block is required for a bgzf file, set the bgzf file type to 
    // not look for it.
    if(noeof)
    {
        // Set that the eof block is not required.
        BgzfFileType::setRequireEofBlock(false);
    }

    // Check to see if the in file was specified, if not, report an error.
    if(inFile == "")
    {
        usage();
        inputParameters.Status();
        // In file was not specified but it is mandatory.
        std::cerr << "--in is a mandatory argument for stats, "
                  << "but was not specified" << std::endl;
        return(-1);
    }

    // Use the index file if unmapped or regionList is not empty.
    bool useIndex = (unmapped|| (!regionList.IsEmpty()));

    // IndexFile is required, so check to see if it has been set.
    if(useIndex && (indexFile == ""))
    {
        // Index file was not specified, so default it to the in file
        // + ".bai"
        indexFile = inFile + ".bai";
    }
    ////////////////////////////////////////
    // Setup in case pileup is used.
    Pileup<PileupElementBaseQCStats> pileup(bufferSize);
    // Initialize start/end positions.
    myStartPos = 0;
    myEndPos = -1;
    
    // Open the output qc file if applicable.
    // NOTE(review): the result of ifopen is not checked here; if the open
    // fails, baseQCPtr stays NULL and baseQC output is silently disabled.
    IFILE baseQCPtr = NULL;
    if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty())
    {
        usage();
        inputParameters.Status();
        // Cannot specify both types of baseQC.
        std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl;
        return(-1);
    }
    else if(!pBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(pBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(true);
    }
    else if(!cBaseQC.IsEmpty())
    {
        baseQCPtr = ifopen(cBaseQC, "w");
        PileupElementBaseQCStats::setPercentStats(false);
    }

    if(baseQCPtr != NULL)
    {
        PileupElementBaseQCStats::setOutputFile(baseQCPtr);
        PileupElementBaseQCStats::printHeader();
    }
    if((baseQCPtr != NULL) || baseSum)
    {
        PileupElementBaseQCStats::setMapQualFilter(minMapQual);
        PileupElementBaseQCStats::setBaseSum(baseSum);
    }

    if(params)
    {
        inputParameters.Status();
    }

    // Open the file for reading.
    SamFile samIn;
    if(!samIn.OpenForRead(inFile))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    samIn.SetReadFlags(requiredFlags, excludeFlags);

    // Set whether or not basic statistics should be generated.
    samIn.GenerateStatistics(basic);

    // Read the sam header.
    SamFileHeader samHeader;
    if(!samIn.ReadHeader(samHeader))
    {
        fprintf(stderr, "%s\n", samIn.GetStatusMessage());
        return(samIn.GetStatus());
    }

    // Open the bam index file for reading if we are
    // doing unmapped reads (also set the read section).
    // NOTE(review): neither the ReadBamIndex result nor the region list
    // ifopen result is checked here - confirm failures are handled later.
    if(useIndex)
    {
        samIn.ReadBamIndex(indexFile);

        if(unmapped)
        {
            samIn.SetReadSection(-1);
        }

        if(!regionList.IsEmpty())
        {
            myRegionList = ifopen(regionList, "r");
        }
    }

    //////////////////////////
    // Read dbsnp if specified and doing baseQC
    if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty()))
    {
        // Read the dbsnp file.
        IFILE fdbSnp;
        fdbSnp = ifopen(dbsnp,"r");
        // Determine how many entries.
        const SamReferenceInfo& refInfo = samHeader.getReferenceInfo();
        int maxRefLen = 0;
        for(int i = 0; i < refInfo.getNumEntries(); i++)
        {
            int refLen = refInfo.getReferenceLength(i);
            if(refLen >= maxRefLen)
            {
                maxRefLen = refLen + 1;
            }
        }
        
        // NOTE(review): this list is never deleted in this function.
        dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen);

        if(fdbSnp==NULL)
        {
            std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n";
        }
        else if(dbsnpListPtr == NULL)
        {
            std::cerr << "Failed to init the memory allocation for the dbsnpList.\n";
        }
        else
        {
            // Read the dbsnp file.
            StringArray tokens;
            String buffer;
            int position = 0;
            int refID = 0;

            // Loop til the end of the file.
            while (!ifeof(fdbSnp))
            {
                // Start every line with empty containers.  This is done
                // here rather than at the bottom of the loop so that the
                // 'continue' paths below cannot leave tokens from a
                // skipped line in front of the next line's tokens.
                tokens.Clear();
                buffer.Clear();

                // Read the next line.
                buffer.ReadLine(fdbSnp);
                // Skip empty lines and comment lines.
                if (buffer.IsEmpty() || buffer[0] == '#') continue;
                tokens.AddTokens(buffer);
                // If it does not have at least 2 columns, 
                // continue to the next line.
                if(tokens.Length() < 2) continue;

                if(!tokens[1].AsInteger(position))
                {
                    std::cerr << "Improperly formatted region line, start position "
                              << "(2nd column) is not an integer: "
                              << tokens[1]
                              << "; Skipping to the next line.\n";         
                    continue;
                }

                // Look up the reference name.
                refID = samHeader.getReferenceID(tokens[0]);
                if(refID != SamReferenceInfo::NO_REF_ID)
                {
                    // Reference id was found, so add it to the dbsnp
                    dbsnpListPtr->addPosition(refID, position);
                }
            }
        }
        ifclose(fdbSnp);
    }

    // Read the sam records.
    SamRecord samRecord;

    int numReads = 0;

    //////////////////////
    // Setup in case doing a quality count.
    // Quality histogram, indexed by the raw quality character value.
    const int MAX_QUAL = 126;
    const int START_QUAL = 33;
    uint64_t qualCount[MAX_QUAL+1];
    for(int i = 0; i <= MAX_QUAL; i++)
    {
        qualCount[i] = 0;
    }
    
    // Phred histogram, indexed by the quality character value minus
    // PHRED_DIFF.
    const int START_PHRED = 0;
    const int PHRED_DIFF = START_QUAL - START_PHRED;
    const int MAX_PHRED = MAX_QUAL - PHRED_DIFF;
    uint64_t phredCount[MAX_PHRED+1];
    for(int i = 0; i <= MAX_PHRED; i++)
    {
        phredCount[i] = 0;
    }
    
    int refPos = 0;
    Cigar* cigarPtr = NULL;
    char cigarChar = '?';
    // Exclude clips from the qual/phred counts if unmapped reads are excluded.
    bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED;

    //////////////////////////////////
    // When not reading by sections, getNextSection returns true
    // the first time, then false the next time.
    while(getNextSection(samIn))
    {
        // Keep reading records from the file until SamFile::ReadRecord
        // indicates to stop (returns false).
        while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord))
        {
            // Another record was read, so increment the number of reads.
            ++numReads;
            // See if the quality histogram should be generated.
            if(qual || phred)
            {
                // Get the quality string.  It is deliberately not named
                // 'qual' so that it does not hide the --qual option flag,
                // which is tested below when reporting invalid qualities.
                const char* qualStr = samRecord.getQuality();
                // Check for no quality ('*').
                if((qualStr[0] == '*') && (qualStr[1] == 0))
                {
                    // This record does not have a quality string, so no 
                    // quality processing is necessary.
                }
                else
                {
                    int index = 0;
                    cigarPtr = samRecord.getCigarInfo();
                    cigarChar = '?';
                    refPos = samRecord.get0BasedPosition();
                    if(!qualExcludeClips && (cigarPtr != NULL))
                    {
                        // Offset the reference position by any soft clips
                        // by subtracting the queryIndex of this start position.
                        // refPos is now the start position of the clips.
                        refPos -= cigarPtr->getQueryIndex(0);
                    }

                    while(qualStr[index] != 0)
                    {
                        // Skip this quality if it is clipped and we are skipping clips.
                        if(cigarPtr != NULL)
                        {
                            cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index);
                        }
                        if(qualExcludeClips && Cigar::isClip(cigarChar))
                        {
                            // Skip a clipped quality.
                            ++index;
                            // The reference position is not advanced for
                            // an excluded clip.
                            continue;
                        }

                        if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos))
                        {
                            // We have hit the end of the region, stop processing this
                            // quality string.
                            break;
                        }

                        if(withinRegion && (refPos < myStartPos))
                        {
                            // This position is not in the target.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }

                        // Check for valid quality.
                        if((qualStr[index] < START_QUAL) || (qualStr[index] > MAX_QUAL))
                        {
                            if(qual)
                            {
                                std::cerr << "Invalid Quality found: " << qualStr[index] 
                                          << ".  Must be between "
                                          << START_QUAL << " and " << MAX_QUAL << ".\n";
                            }
                            if(phred)
                            {
                                // The value is reported on the phred scale,
                                // so report the phred range with it.
                                std::cerr << "Invalid Phred Quality found: " << qualStr[index] - PHRED_DIFF
                                          << ".  Must be between "
                                          << START_PHRED << " and " << MAX_PHRED << ".\n";
                            }
                            // Skip an invalid quality.
                            ++index;
                            // Update the position if this is found in the reference or a clip.
                            if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                            {
                                ++refPos;
                            }
                            continue;
                        }
                        
                        // Increment the count for this quality.
                        ++(qualCount[(int)(qualStr[index])]);
                        ++(phredCount[(int)(qualStr[index]) - PHRED_DIFF]);
                        // Update the position if this is found in the reference or a clip.
                        if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar))
                        {
                            ++refPos;
                        }
                        ++index;
                    }
                }
            }

            // Check the next thing to do for the read.
            if((baseQCPtr != NULL) || baseSum)
            {
                // Pileup the bases for this read.
                pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
            }
        }

        // Done with a section, move on to the next one.

        // New section, so flush the pileup.
        pileup.flushPileup();
    }

    // Flush the rest of the pileup.
    if((baseQCPtr != NULL) || baseSum)
    {
        // Pileup the bases.
        // NOTE(review): samRecord still holds whatever the last ReadRecord
        // call left in it - confirm that processing it again is intended.
        pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr);
        PileupElementBaseQCStats::printSummary();
        ifclose(baseQCPtr);
    }

    std::cerr << "Number of records read = " << 
        samIn.GetCurrentRecordCount() << std::endl;

    if(basic)
    {
        std::cerr << std::endl;
        samIn.PrintStatistics();
    }

    // Print the quality stats.
    if(qual)
    {
        std::cerr << std::endl;
        std::cerr << "Quality\tCount\n";
        for(int i = START_QUAL; i <= MAX_QUAL; i++)
        {
            std::cerr << i << "\t" << qualCount[i] << std::endl;
        }
    }
    // Print the phred quality stats.
    if(phred)
    {
        std::cerr << std::endl;
        std::cerr << "Phred\tCount\n";
        for(int i = START_PHRED; i <= MAX_PHRED; i++)
        {
            std::cerr << i << "\t" << phredCount[i] << std::endl;
        }
    }

    SamStatus::Status status = samIn.GetStatus();
    if(status == SamStatus::NO_MORE_RECS)
    {
        // A status of NO_MORE_RECS means that all reads were successful.
        status = SamStatus::SUCCESS;
    }

    return(status);
}
// Load the gene map file named by mapFile and fill the per-gene arrays
// (genename, chr, start_pos, end_pos) plus the chromosome boundary hashes
// (ChrStartHash, ChrEndHash).
//
// Each tab-separated line is read as: column 0 = gene name, column 2 =
// chromosome, column 3 = strand, column 4 = start, column 5 = end.  When
// a gene has several records on the same chromosome they are merged into
// one entry holding the smallest start and the largest end.
//
// path - used only when the default map file cannot be opened: if it
//        contains "bin", the part of it before that is used as the base
//        directory for "/data/refFlat_hg19.txt".
//
// Calls error() if no gene map file can be opened.
void GroupFromAnnotation::GetGeneMap(String path)
{
   IFILE genemap = ifopen(mapFile,"r");
   if(genemap==NULL)
   {
      if(mapFile=="../data/refFlat_hg19.txt")
      {
         // The default file was not found, so try the gzipped default.
         mapFile += ".gz";
         genemap = ifopen(mapFile,"r");

         if(genemap==NULL)
         {
            // Try relative to the directory given in path.
            int loc = path.Find("bin");
            if(loc!=-1)
            {
               mapFile = path.Left(loc-1);
               mapFile += "/data/refFlat_hg19.txt";
            }
            else
            {
               // Go back to the plain default name.  This is an assignment
               // rather than an append so that the ".gz" name tried above
               // does not end up as a prefix of the path.
               mapFile = "../data/refFlat_hg19.txt";
            }
            genemap = ifopen(mapFile,"r");
         }
         if(genemap==NULL)
         {
            // Last attempt: the gzipped version of the fallback path.
            mapFile += ".gz";
            genemap = ifopen(mapFile,"r");
         }
         if(genemap==NULL)
            error("Cannot open gene mapping file %s.\n",mapFile.c_str());
      }
      else
         error("Cannot open gene mapping file %s.\n",mapFile.c_str());
   }

   // Maps a gene name to its index in the per-gene arrays.
   StringIntHash GeneLocHash;
   StringArray strand;
   int gene_idx =0;

   while(!ifeof(genemap))
   {
      String buffer;
      buffer.ReadLine(genemap);
      StringArray record;
      record.AddTokens(buffer,"\t");

      // Columns 0 through 5 are used below, so skip blank or short lines
      // instead of indexing past the end of the record.
      if(record.Length()<6)
         continue;

      // Get the chromosome of this record.  A 3rd character of 'r'/'R' is
      // taken to mean a "chr" style prefix, which is dropped.
      if(record[2][2]=='r' || record[2][2]=='R')
         record[2] = record[2].SubStr(3);
      StringArray gene_chr;
      gene_chr.AddTokens(record[2],"_,;.");
      if(gene_chr.Length()==0)
         continue;
      // Skip records whose chromosome name contains "Un".
      if(gene_chr[0].Find("Un")!=-1)
         continue;

      int loc = GeneLocHash.Integer(record[0]);
      if(loc==-1)
      {
         // First usable record for this gene.  The gene is only registered
         // in the hash here, after the skip checks above, so that every
         // hash entry refers to an element that is really pushed below.
         GeneLocHash.SetInteger(record[0],gene_idx);
         //save chr, start and end positions
         chr.Push(gene_chr[0]);
         start_pos.Push(record[4].AsInteger());
         end_pos.Push(record[5].AsInteger());
         strand.Push(record[3]);
         genename.Push(record[0]);
         gene_idx++;
      }
      else
      {
         //check if chr is consistent with previous record
         if(chr[loc]!=gene_chr[0])
            continue;
         //update start and end position
         if(record[4].AsInteger()<start_pos[loc])
            start_pos[loc] = record[4].AsInteger();
         if(record[5].AsInteger()>end_pos[loc])
            end_pos[loc] = record[5].AsInteger();
      }
   }
   ifclose(genemap);

   // presumably chr_idx[i] is the position in chr of the i-th chromosome
   // name in sorted order - TODO confirm against the type of chr_idx.
   chr_idx.Index(chr);
   if(chr.Length()==0)
   {
      // No genes were loaded, so there are no boundaries to record.
      return;
   }

   // Walk the genes in chr_idx order and record where each run of equal
   // chromosome names starts and ends.
   // NOTE(review): the start of the first chromosome and the end of the
   // last one are never stored - confirm callers expect that.
   String chr_=chr[chr_idx[0]];
   for(int i=1;i<chr.Length();i++)
   {
      if(chr[chr_idx[i]]!=chr_)
      {
         ChrStartHash.SetInteger(chr[chr_idx[i]],i);
         ChrEndHash.SetInteger(chr_,i-1);
         chr_ = chr[chr_idx[i]];
      }
   }
}