Ejemplo n.º 1
0
    SAMReader *
SAMReader::create(
    DataSupplier* supplier,
    const char *fileName,
    const ReaderContext& context,
    _int64 startingOffset, 
    _int64 amountOfFileToProcess)
{
    //
    // Factory: obtain a DataReader from the supplier (sized with maxLineLen), wrap it in a new
    // SAMReader using the given context, and initialize that reader with the file name, the
    // starting offset and the amount of the file to process.  The new reader is returned to
    // the caller.
    //
    DataReader* lineSource = supplier->getDataReader(maxLineLen);

    SAMReader *samReader = new SAMReader(lineSource, context);
    samReader->init(fileName, startingOffset, amountOfFileToProcess);

    return samReader;
}
Ejemplo n.º 2
0
//
// Worker thread entry point.  Repeatedly takes a range of the input SAM file from rangeSplitter, reads
// the alignments in that range and, for every aligned read, compares the aligned location against the
// offsets parsed out of the read ID (ChrName_OffsetA_OffsetB_?:<more stuff>).  Counts are accumulated in
// the ThreadContext passed via param, indexed by MAPQ and by edit distance.  Misaligned reads with MAPQ
// 69 or 70 are additionally printed to stdout.  The thread that brings nRunningThreads to zero signals
// allThreadsDone.
//
// param is this thread's ThreadContext *.  An out-of-range MAPQ, an unknown genome location, an
// over-long read ID or a read ID that starts to parse but cannot be completed terminates the process.
//
void
WorkerThreadMain(void *param)
{
    ThreadContext *context = (ThreadContext *)param;

    _int64 rangeStart, rangeLength;

    //
    // NOTE(review): the reader is created lazily below and is never released in this function; confirm
    // whether it should be deleted once the ranges are exhausted.
    //
    SAMReader *samReader = NULL;
    ReaderContext rcontext;
    rcontext.clipping = NoClipping;
    rcontext.genome = genome;
    rcontext.paired = false;
    rcontext.defaultReadGroup = "";
    while (rangeSplitter->getNextRange(&rangeStart, &rangeLength)) {
        if (NULL == samReader) {
            samReader = SAMReader::create(DataSupplier::Default[true], inputFileName, rcontext, rangeStart, rangeLength);
        } else {
            ((ReadReader *)samReader)->reinit(rangeStart, rangeLength);
        }

        AlignmentResult alignmentResult;
        unsigned genomeLocation;
        Direction isRC;
        unsigned mapQ;
        unsigned flag;
        const char *cigar;
        Read read;
        LandauVishkinWithCigar lv;
        while (samReader->getNextRead(&read, &alignmentResult, &genomeLocation, &isRC, &mapQ, &flag, &cigar)) {

            //
            // mapQ is unsigned, so only the upper bound can be violated.
            //
            if (mapQ > MaxMAPQ) {
                fprintf(stderr,"Invalid MAPQ: %u\n",mapQ);
                exit(1);
            }

            if (0xffffffff == genomeLocation) {
                context->nUnaligned++;
            } else {
                if (flag & SAM_REVERSE_COMPLEMENT) {
                    read.becomeRC();
                }

                const Genome::Piece *piece = genome->getPieceAtLocation(genomeLocation);
                if (NULL == piece) {
                    fprintf(stderr,"couldn't find genome piece for offset %u\n",genomeLocation);
                    exit(1);
                }
                unsigned offsetA, offsetB;
                bool matched;

                const unsigned cigarBufLen = 1000;
                char cigarForAligned[cigarBufLen];
                const char *alignedGenomeData = genome->getSubstring(genomeLocation, 1); 
                int editDistance = lv.computeEditDistance(alignedGenomeData, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarForAligned, cigarBufLen, false);

                //
                // Failed (-1) and over-limit distances share the last bucket.
                //
                if (editDistance == -1 || editDistance > MaxEditDistance) {
                    editDistance = MaxEditDistance;
                }

                //
                // Parse the read ID.  The format is ChrName_OffsetA_OffsetB_?:<more stuff>.  This would be simple to parse, except that
                // ChrName can include "_".  So, we parse it by looking for the first : and then working backward.
                //
                char idBuffer[10000];

                //
                // Refuse IDs that would not fit (including the terminating null) rather than overrunning the stack buffer.
                //
                if ((size_t)read.getIdLength() >= sizeof(idBuffer)) {
                    fprintf(stderr,"Read ID is too long to parse (%u characters)\n", (unsigned)read.getIdLength());
                    exit(1);
                }

                memcpy(idBuffer,read.getId(),read.getIdLength());
                idBuffer[read.getIdLength()] = 0;

                const char *firstColon = strchr(idBuffer,':');
                bool badParse = true;
                size_t chrNameLen;
                const char *beginningOfSecondNumber;
                const char *beginningOfFirstNumber;
                unsigned offsetOfCorrectChromosome;

                //
                // Reads whose ID fails this initial test are skipped entirely: they are neither counted nor reported.
                //
                if (NULL != firstColon && firstColon - 3 > idBuffer && (*(firstColon-1) == '?' || isADigit(*(firstColon - 1)))) {
                    //
                    // We've parsed backwards to see that we have at least #: or ?: where '#' is a digit and ? is literal.  If it's
                    // a digit, then scan backwards through that number.
                    //
                    const char *underscoreBeforeFirstColon = firstColon - 2;
                    while (underscoreBeforeFirstColon > idBuffer && isADigit(*underscoreBeforeFirstColon)) {
                        underscoreBeforeFirstColon--;
                    }

                    if (*underscoreBeforeFirstColon == '_' && (isADigit(*(underscoreBeforeFirstColon - 1)) || *(underscoreBeforeFirstColon - 1) == '_')) {
                        if (isADigit(*(underscoreBeforeFirstColon - 1))) {
                            beginningOfSecondNumber = firstColon - 3;
                            while (beginningOfSecondNumber > idBuffer && isADigit(*beginningOfSecondNumber)) {
                                beginningOfSecondNumber--;
                            }
                            beginningOfSecondNumber++; // That loop actually moved us back one char before the beginning;
                        } else {
                            //
                            // There's only one number,  we have two consecutive underscores.
                            //
                            beginningOfSecondNumber = underscoreBeforeFirstColon;
                        }
                        if (beginningOfSecondNumber - 2 > idBuffer && *(beginningOfSecondNumber - 1) == '_' && isADigit(*(beginningOfSecondNumber - 2))) {
                            beginningOfFirstNumber = beginningOfSecondNumber - 2;
                            while (beginningOfFirstNumber > idBuffer && isADigit(*beginningOfFirstNumber)) {
                                beginningOfFirstNumber--;
                            }
                            beginningOfFirstNumber++; // Again, we went one too far.

                            //
                            // -1 (all ones once stored in an unsigned) marks "offset not present".
                            //
                            offsetA = -1;
                            offsetB = -1;

                            if (*(beginningOfFirstNumber - 1) == '_' && 1 == sscanf(beginningOfFirstNumber,"%u",&offsetA) &&
                                ('_' == *beginningOfSecondNumber || 1 == sscanf(beginningOfSecondNumber,"%u", &offsetB))) {

                                chrNameLen = (beginningOfFirstNumber - 1) - idBuffer;
                                char correctChromosomeName[1000];

                                if (chrNameLen >= sizeof(correctChromosomeName)) {
                                    //
                                    // Leave badParse set; the name would not fit in the local buffer.
                                    //
                                    fprintf(stderr, "Chromosome name in read id is too long (%u characters)\n", (unsigned)chrNameLen);
                                } else {
                                    memcpy(correctChromosomeName, idBuffer, chrNameLen);
                                    correctChromosomeName[chrNameLen] = '\0';

                                    if (!genome->getOffsetOfPiece(correctChromosomeName, &offsetOfCorrectChromosome)) {
                                        fprintf(stderr, "Couldn't parse chromosome name '%s' from read id\n", correctChromosomeName);
                                    } else {
                                        badParse = false;
                                    }
                                }
                            }
                        }
                    }

                    if (badParse) {
                        fprintf(stderr,"Unable to parse read ID '%s', perhaps this isn't simulated data.  piecelen = %u, pieceName = '%s', piece offset = %u, genome offset = %u\n", idBuffer, (unsigned)strlen(piece->name), piece->name, piece->beginningOffset, genomeLocation);
                        exit(1);
                    }

                    //
                    // The alignment matches when the chromosome name agrees and the aligned offset within the piece is
                    // within 50 of either offset from the read ID.  A missing offset counts as a mismatch.
                    //
                    if (-1 == offsetA || -1 == offsetB) {
                        matched = false;
                    }  else if(strncmp(piece->name, idBuffer, __min(read.getIdLength(), chrNameLen))) {
                        matched = false;
                    } else {
                        matched = isWithin(offsetA, genomeLocation - piece->beginningOffset, 50) ||
                                  isWithin(offsetB, genomeLocation - piece->beginningOffset, 50);
                    }

                    context->countOfReads[mapQ]++;
                    context->countOfReadsByEditDistance[mapQ][editDistance]++;

                    if (!matched) {
                        context->countOfMisalignments[mapQ]++;
                        context->countOfMisalignmentsByEditDistance[mapQ][editDistance]++;

                        if (70 == mapQ || 69 == mapQ) {

                            //
                            // We don't know which offset is correct, because neither one matched.  Just take the one with the lower edit distance.
                            //
                            unsigned correctLocationA = offsetOfCorrectChromosome + offsetA;
                            unsigned correctLocationB = offsetOfCorrectChromosome + offsetB;

                            unsigned correctLocation = 0;

                            const char *dataA = genome->getSubstring(correctLocationA, 1);
                            const char *dataB = genome->getSubstring(correctLocationB, 1);
                            int distanceA, distanceB;
                            char cigarA[cigarBufLen];
                            char cigarB[cigarBufLen];

                            cigarA[0] = '*'; cigarA[1] = '\0';
                            cigarB[0] = '*'; cigarB[1] = '\0';

                            if (dataA == NULL) {
                                distanceA = -1;
                            } else {
                                distanceA = lv.computeEditDistance(dataA, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarA, cigarBufLen, false);
                            }

                            if (dataB == NULL) {
                                distanceB = -1;
                            } else {
                                distanceB = lv.computeEditDistance(dataB, read.getDataLength() + 20, read.getData(), read.getDataLength(), 30, cigarB, cigarBufLen, false);
                            }

                            const char *correctGenomeData;
                            char *cigarForCorrect;

                            if ((distanceA != -1 && distanceA <= distanceB) || distanceB == -1) {
                                correctGenomeData = dataA;
                                correctLocation = correctLocationA;
                                cigarForCorrect = cigarA;
                            } else {
                                correctGenomeData = dataB;
                                correctLocation = correctLocationB;
                                cigarForCorrect = cigarB;
                            }

                            //
                            // When both candidates failed, A is selected and its data may be NULL; never hand NULL to %s.
                            //
                            if (NULL == correctGenomeData) {
                                correctGenomeData = "";
                            }

                            printf("%s\t%u\t%s\t%u\t%u\t%s\t*\t*\t100\t%.*s\t%.*s\tAlignedGenomeLocation:%u\tCorrectGenomeLocation: %u\tCigarForCorrect: %s\tCorrectData: %.*s\tAlignedData: %.*s\n", 
                                idBuffer, flag, piece->name, genomeLocation - piece->beginningOffset, mapQ, cigarForAligned, read.getDataLength(), read.getData(), 
                                read.getDataLength(), read.getQuality(),  genomeLocation, correctLocation, cigarForCorrect, read.getDataLength(),
                                correctGenomeData, read.getDataLength(), alignedGenomeData);
                        }
                    }
                }
            } // if it was mapped
        } // for each read from the sam reader
    }

    if (0 == InterlockedAdd64AndReturnNewValue(&nRunningThreads, -1)) {
        SignalSingleWaiterObject(&allThreadsDone);
    }
}
//
// Loads reads from a FASTA file, then makes two passes over a SAM alignment file: the first pass
// calls MarkMatches on every non-self alignment, the second calls JoinVertices.  Afterwards it
// prints the number of matches, a histogram of parent counts and the number of parents.
//
int main(int argc, char* argv[]) {

  CommandLineParser clp;
  string readsFileName;
  string alignmentsFileName;
  string outputFileName;
  float minMergeIdentity = 0.70;
  clp.RegisterStringOption("reads", &readsFileName, "Reads used for alignments.");
  clp.RegisterStringOption("alignments", &alignmentsFileName, "SAM formatted alignments.");
  clp.RegisterIntOption("k", &vertexSize, "Minimum match length", CommandLineParser::PositiveInteger);
  clp.RegisterStringOption("outfile", &outputFileName, "Alignment output.");
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterFlagOption("v", &verbose, "");
  clp.RegisterFloatOption("minMergeIdentity",
                          &minMergeIdentity,
                          "Minimum identity to merge paths.", CommandLineParser::PositiveFloat);

  clp.ParseCommandLine(argc, argv);

  if (minMergeIdentity < 0 or minMergeIdentity > 1) {
    cout << "ERROR. minMergeIdentity must be between 0 and 1" << endl;
    exit(1);
  }

  // Load every read up front.
  vector<FASTASequence> reads;
  FASTAReader fastaReader;
  fastaReader.Initialize(readsFileName);
  fastaReader.ReadAllSequences(reads);

  // Alignments refer to reads by title, so build a title -> index lookup.
  map<string, int> readNameToIndex;
  BuildReadNameToIndexMap(reads, readNameToIndex);

  ReadWordMatchVector readWordMatches;
  InitializeFromReads(reads, readWordMatches);

  //
  // First pass over the alignments: mark matches.
  //
  SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader;
  samReader.Initialize(alignmentsFileName);
  AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet;
  samReader.ReadHeader(alignmentSet);

  SAMAlignment samAlignment;
  AlignmentCandidate<> alignment;
  int alignmentIndex = 0;
  while (samReader.GetNextAlignment(samAlignment)) {
    vector<AlignmentCandidate<> > candidates;
    SAMAlignmentsToCandidates(samAlignment, reads, readNameToIndex,
                              candidates, false, true);
    ++alignmentIndex;

    for (size_t c = 0; c < candidates.size(); ++c) {
      // Self alignments are ignored.
      if (candidates[c].qName != candidates[c].tName) {
        MarkMatches(candidates[c], readNameToIndex, vertexSize, readWordMatches);
      }
    }

    // Progress indicator.
    if (alignmentIndex % 1000 == 0) {
      cout << alignmentIndex << endl;
    }
  }

  int numMatches = 0;
  for (size_t readIdx = 0; readIdx < readWordMatches.size(); ++readIdx) {
    readWordMatches[readIdx].CreateParents();
    numMatches += readWordMatches[readIdx].pos.size();
  }

  //
  // Indexing starts at 1 so that 0 never needs special treatment.
  //
  vector<int> parentIndices(static_cast<size_t>(2*numMatches + 1), 0);
  int curParentIndex = 1;
  cout << "There are " << numMatches << " matches." << endl;

  //
  // Second pass over the alignments: join vertices.
  //
  samReader.Close();
  samReader.Initialize(alignmentsFileName);
  AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet2;
  samReader.ReadHeader(alignmentSet2);

  alignmentIndex = 0;
  while (samReader.GetNextAlignment(samAlignment)) {
    vector<AlignmentCandidate<> > candidates;
    SAMAlignmentsToCandidates(samAlignment, reads, readNameToIndex,
                              candidates, false, true);
    ++alignmentIndex;

    for (size_t c = 0; c < candidates.size(); ++c) {
      if (candidates[c].qName != candidates[c].tName) {
        JoinVertices(candidates[c], vertexSize, readNameToIndex, readWordMatches, curParentIndex, parentIndices);
      }
    }

    if (alignmentIndex % 1000 == 0) {
      cout << alignmentIndex << endl;
    }
  }

  // Rewrite every stored parent through parentIndices and tally the uses of each.
  vector<int> parentCounts(parentIndices.size(), 0);
  PromoteAll(parentIndices);
  for (size_t readIdx = 0; readIdx < readWordMatches.size(); ++readIdx) {
    for (size_t slot = 0; slot < readWordMatches[readIdx].parents.size(); ++slot) {
      readWordMatches[readIdx].parents[slot] = parentIndices[readWordMatches[readIdx].parents[slot]];
      parentCounts[readWordMatches[readIdx].parents[slot]]++;
    }
  }
  /*
  for (i = 0; i < readWordMatches.size(); i++) {
    readWordMatches[i].PrintPos(cout);
    readWordMatches[i].PrintParents(cout);
  }
  */

  // Histogram of parent counts, stopping at the first zero entry of parentIndices.
  map<int,int> hist;
  int numParents = 0;
  for (size_t p = 1; p < parentCounts.size() && parentIndices[p] != 0; ++p) {
    const int count = parentCounts[p];
    if (count != 0) {
      ++numParents;
    }
    hist[count]++;
  }

  cout << " freq count" << endl;
  for (map<int,int>::iterator histIt = hist.begin(); histIt != hist.end(); ++histIt) {
    cout << histIt->second << " " << histIt->first << endl;
  }

  MatchVertexList vertices;
  vertices.resize(numParents);
  cout << "there are " << numParents << " parents. " << endl;

}
Ejemplo n.º 4
0
//
// samtoh5: converts a SAM file to a cmp.h5 file.
//
// Positional arguments: in.sam, reference.fasta, out.cmp.h5.  Reads all reference sequences, reads the
// SAM header, reorders the header references to match the FASTA order, recomputes reference MD5s,
// stores the reference info in the cmp.h5 file and then stores every aligned SAM record.
// Returns 0 on success; exits with status 1 on invalid options or inconsistent references.
//
int main(int argc, char* argv[]) {
  string program = "samtoh5";
  string versionString = VERSION;
  AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);
  string samFileName, cmpFileName, refFileName;
  bool parseSmrtTitle = false;
  bool useShortRefName = false;
  CommandLineParser clp;
  string readType = "standard";
  int verbosity = 0;

  clp.SetProgramName(program);
  clp.SetProgramSummary("Converts in.sam file to out.cmp.h5 file.");
  clp.SetVersion(versionString);

  clp.RegisterStringOption("in.sam", &samFileName, 
                           "Input SAM file.", true);
  clp.RegisterStringOption("reference.fasta", &refFileName, 
                           "Reference used to generate reads.", true);
  clp.RegisterStringOption("out.cmp.h5", &cmpFileName, 
                           "Output cmp.h5 file.", true);
  clp.RegisterPreviousFlagsAsHidden();
  clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle, 
                         "Use this option when converting alignments "
                         "generated from reads produced by the "
                         "pls2fasta from bas.h5 files by parsing read "
                         "coordinates from the SMRT read title.  The title " 
                         "is in the format /name/hole/coordinates, where "
                         "coordinates are in the format \\d+_\\d+, and "
                         "represent the interval of the read that was "
                         "aligned.");
  clp.RegisterStringOption("readType", &readType, 
                         "Set the read type: 'standard', 'strobe', 'CCS', "
                         "or 'cDNA'");
  clp.RegisterIntOption("verbosity", &verbosity, 
                         "Set desired verbosity.", 
                         CommandLineParser::PositiveInteger);
  clp.RegisterFlagOption("useShortRefName", &useShortRefName, 
                         "Use abbreviated reference names obtained "
                         "from file.sam instead of using full names "
                         "from reference.fasta.");
  string description = ("Because SAM has optional tags that have different "
    "meanings in different programs, careful usage is required in order to "
    "have proper output. The \"xs\" tag in bwa-sw is used to show the "
    "suboptimal score, but in PacBio SAM (blasr) it is defined as the start "
    "in the query sequence of the alignment.\nWhen \"-smrtTitle\" is "
    "specified, the xs tag is ignored, but when it is not specified, the "
    "coordinates given by the xs and xe tags are used to define the interval "
    "of a read that is aligned. The CIGAR string is relative to this interval.");
  clp.SetExamples(description);

  clp.ParseCommandLine(argc, argv);

  if (readType != "standard" and readType != "strobe" and 
      readType != "cDNA" and readType != "CCS") {
    cout << "ERROR. Read type '" << readType 
         << "' must be one of either 'standard', 'strobe', 'cDNA' or 'CCS'." 
         << endl;
    exit(1);
  }
    
  cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started." << endl;

  SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> samReader;
  FASTAReader fastaReader;
  HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > cmpFile;

  //
  // Initialize input/output files.
  //
  samReader.Initialize(samFileName);
  fastaReader.Initialize(refFileName);
  cmpFile.Create(cmpFileName);

  //
  // Configure the file log.
  //
  string command;
  CommandLineParser::CommandLineToString(argc, argv, command);
  string log = "Convert sam to cmp.h5";
  cmpFile.fileLogGroup.AddEntry(command, log, program, GetTimestamp(), versionString);

  //
  // Set the readType
  //
  cmpFile.SetReadType(readType);

  //
  // Read necessary input.
  //

  vector<FASTASequence> references;
  fastaReader.ReadAllSequences(references);
  
  //
  // This should probably be handled by the alignmentSetAdapter, but
  // time constraints...
  //
  AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMPosAlignment> alignmentSet;
  samReader.ReadHeader(alignmentSet);
 
  //
  // The order of references in vector<FASTASequence> references and
  // AlignmentSet<, , >alignmentSet.references can be different.
  // Rearrange alignmentSet.references such that it is ordered in
  // exactly the same way as vector<FASTASequence> references.
  //
  alignmentSet.RearrangeReferences(references);

  //
  // Everything below indexes both reference lists in lock step, so
  // verify that they have the same length before touching either.
  // This is a runtime check so that it also holds in NDEBUG builds.
  //
  if (references.size() != alignmentSet.references.size()) {
    cout << "ERROR, the number of references in " << refFileName
         << " differs from the number of references in the header of "
         << samFileName << "." << endl;
    exit(1);
  }

  //
  // Always recompute the MD5 values even if they exist in the input
  // sam file. Because MD5 is defined differently in sam and cmp.h5 files.
  // The SAM convention uppercases and normalizes before computing the MD5. 
  // For cmp.h5, we compute the MD5 on the sequence 'as is'.
  // 
  for (size_t i = 0; i < alignmentSet.references.size(); i++) {
    MakeMD5((const char*)&references[i].seq[0], 
            (unsigned int)references[i].length, alignmentSet.references[i].md5);
  }
 
  //
  // Map short names for references obtained from file.sam to full names obtained from reference.fasta
  //
  map<string, string> shortRefNameToFull;
  map<string, string>::iterator it;
  if (!useShortRefName) {
    for (size_t i = 0; i < references.size(); i++) {
      string shortRefName = alignmentSet.references[i].GetSequenceName();
      string fullRefName(references[i].title); 
      if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
        cout << "ERROR, Found more than one reference " << shortRefName << " in sam header" << endl;
        exit(1);
      } 
      shortRefNameToFull[shortRefName] = fullRefName;
      alignmentSet.references[i].sequenceName = fullRefName;
    }
  }

  //
  // Start setting up the cmp.h5 file.
  //
  AlignmentSetToCmpH5Adapter<HDFCmpFile<AlignmentCandidate<FASTASequence, FASTASequence> > > alignmentSetAdapter;
  alignmentSetAdapter.Initialize();
  alignmentSetAdapter.StoreReferenceInfo(alignmentSet.references, cmpFile);
  
  //
  // Store the alignments.
  //
  SAMAlignment samAlignment;
  int alignIndex = 0;
  while (samReader.GetNextAlignment(samAlignment)) {
    // Records with reference name "*" are skipped.
    if (samAlignment.rName == "*") {
      continue;
    }
    if (!useShortRefName) {
      //convert shortRefName to fullRefName
      it = shortRefNameToFull.find(samAlignment.rName);
      if (it == shortRefNameToFull.end()) {
        cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl;
        exit(1);
      }
      samAlignment.rName = (*it).second;
    }
    vector<AlignmentCandidate<> > convertedAlignments;
    if (verbosity > 0) {
      cout << "Storing alignment for " << samAlignment.qName << endl;
    }
    SAMAlignmentsToCandidates(samAlignment, 
                              references, alignmentSetAdapter.refNameToIndex,
                              convertedAlignments, parseSmrtTitle, false);

    alignmentSetAdapter.StoreAlignmentCandidateList(convertedAlignments, cmpFile, alignIndex);
    for (size_t a = 0; a < convertedAlignments.size(); a++) {
      convertedAlignments[a].FreeSubsequences();
    }
    ++alignIndex;
    /*    if (alignIndex == 100) {
      return 0;
      }*/
  }

  cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended." << endl;
  return 0;
}
Ejemplo n.º 5
0
int main(int argc, char* argv[]) {
    string program = "samtom4";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    string samFileName, refFileName, outFileName;
    bool printHeader = false;
    bool parseSmrtTitle = false;
    bool useShortRefName = false;

    CommandLineParser clp;
    clp.SetProgramName(program);
    clp.SetVersion(versionString);
    clp.SetProgramSummary("Converts a SAM file generated by blasr to M4 format.");
    clp.RegisterStringOption("in.sam",        &samFileName,
                             "Input SAM file, which is produced by blasr.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate file.sam.");
    clp.RegisterStringOption("out.m4",          &outFileName,
                             "Output in blasr M4 format.");
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterFlagOption("header",            &printHeader,
                           "Print M4 header.");
    clp.RegisterFlagOption("useShortRefName",   &useShortRefName, 
                           "Use abbreviated reference names obtained "
                           "from file.sam instead of using full names "
                           "from reference.fasta.");
    //clp.SetExamples(program + " file.sam reference.fasta out.m4");

    clp.ParseCommandLine(argc, argv);

    ostream * outFilePtr = &cout;
	ofstream outFileStrm;
	if (outFileName != "") {
		CrucialOpen(outFileName, outFileStrm, std::ios::out);
		outFilePtr = &outFileStrm;
	}

    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Configure the file log.
    //
    string command;
    CommandLineParser::CommandLineToString(argc, argv, command);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    samReader.ReadHeader(alignmentSet); 

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that it is ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    //
    // Map short names for references obtained from file.sam to 
    // full names obtained from reference.fasta
    //
    map<string, string> shortRefNameToFull;
    map<string, string>::iterator it;
    assert(references.size() == alignmentSet.references.size());
    if (!useShortRefName) {
        for (size_t i = 0; i < references.size(); i++) {
            string shortRefName = alignmentSet.references[i].GetSequenceName();
            string fullRefName(references[i].title); 
            if (shortRefNameToFull.find(shortRefName) != shortRefNameToFull.end()) {
                cout << "ERROR, Found more than one reference " << shortRefName << "in sam header" << endl;
                exit(1);
            } 
            shortRefNameToFull[shortRefName] = fullRefName;
            alignmentSet.references[i].sequenceName = fullRefName;
        }
    }

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (size_t i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = i;
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;
    size_t alignIndex = 0; 

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequal data in the future.
    //
    if (printHeader)
        IntervalOutput::PrintHeader(*outFilePtr);

    // The socre matrix does not matter because we will use the 
    // aligner's score from SAM file anyway.
    DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn;

    while (samReader.GetNextAlignment(samAlignment)) {
        if (samAlignment.rName == "*") {
            continue;
        }

        if (!useShortRefName) {
            //convert shortRefName to fullRefName
            it = shortRefNameToFull.find(samAlignment.rName);
            if (it == shortRefNameToFull.end()) {
                cout << "ERROR, Could not find " << samAlignment.rName << " in the reference repository." << endl;
                exit(1);
            }
            samAlignment.rName = (*it).second;
        }

        // The padding character 'P' is not supported
        if (samAlignment.cigar.find('P') != string::npos) {
            cout << "WARNING. Could not process sam record with 'P' in its cigar string."
                 << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;

        //
        // Keep reference as forward.
        // So if IsReverseComplement(sam.flag)==true, then qStrand is reverse
        // and tStrand is forward.
        //
        bool keepRefAsForward = false;

        SAMAlignmentsToCandidates(samAlignment, references, refNameToIndex,
                                  convertedAlignments, parseSmrtTitle, 
                                  keepRefAsForward);

        if (convertedAlignments.size() > 1) {
            cout << "WARNING. Ignore an alignment which has multiple segments." << endl;
            continue;
        }

        //all alignments are unique single-ended alignments.
        for (int i = 0; i < 1; i++) {
            AlignmentCandidate<> & alignment = convertedAlignments[i];

            ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                                  alignment.tAlignedSeq.seq, distScoreFn);

            // Use aligner's score from SAM file anyway.
            alignment.score = samAlignment.as;
            alignment.mapQV = samAlignment.mapQV;

            // Since SAM only has the aligned sequence, many info of the 
            // original query (e.g. the full length) is missing. 
            // Overwrite alignment.qLength (which is length of the query
            // in the SAM alignment) with xq (which is the length of the 
            // original query sequence saved by blasr) right before printing 
            // the output so that one can reconstruct a blasr m4 record from 
            // a blasr sam alignment.
            if (samAlignment.xq!=0)
                alignment.qLength = samAlignment.xq;

            IntervalOutput::PrintFromSAM(alignment, *outFilePtr);

            alignment.FreeSubsequences();
        }
        ++alignIndex;
    }

	if (outFileName != "") {
		outFileStrm.close();
	}
    return 0;
}
Ejemplo n.º 6
0
//
// samFilter entry point.
//
// Reads alignments from a SAM file, discards records that are unmapped
// (rName == "*"), outside the requested hole number ranges, contain a 'P'
// cigar operation, consist of multiple segments, map only to adapters
// listed in a GFF file, or fail the filter criteria.  The remaining
// alignments are grouped by query name, the hit policy is applied to each
// group, and the selected records are written, sorted by reference name
// and query name, after the SAM header lines plus an added @PG line.
//
// Output goes to the file named by the "out.sam" argument, or to stdout
// when that name is empty.
//
// Returns 0 on success.  Calls exit(1) when the filter criteria are not
// sane, the hole number ranges cannot be parsed, or a SMRT read title
// cannot be parsed.
//
int main(int argc, char* argv[]) {
#ifdef USE_GOOGLE_PROFILER
    char *profileFileName = getenv("CPUPROFILE");
    if (profileFileName != NULL) {
      ProfilerStart(profileFileName);
    }
    else {
      ProfilerStart("google_profile.txt");
    }
#endif

    // Register inputs and outputs.
    string samFileName, refFileName, outFileName;

    CommandLineParser clp;
    clp.RegisterStringOption("file.sam", &samFileName,
                             "Input SAM file.");
    clp.RegisterStringOption("reference.fasta", &refFileName,
                             "Reference used to generate reads.");
    clp.RegisterStringOption("out.sam", &outFileName,
                             "Output SAM file.");
    clp.RegisterPreviousFlagsAsHidden();

    // Register filter criteria options.
    int minAlnLength = 50;
    float minPctSimilarity = 70, minPctAccuracy = 70;
    string hitPolicyStr = "randombest";
    bool useScoreCutoff = false;
    int  scoreCutoff = INF_INT;
    int  scoreSignInt = -1;
    RegisterFilterOptions(clp, minAlnLength, minPctSimilarity, 
                          minPctAccuracy, hitPolicyStr, useScoreCutoff,
                          scoreSignInt, scoreCutoff);

    int seed = 1; 
    clp.RegisterIntOption("seed", &seed,
            "(1)  Seed for random number generator.\n"
            "If seed is 0, then use current time as seed.",
            CommandLineParser::Integer);

    string holeNumberStr;
    Ranges holeNumberRanges;
    clp.RegisterStringOption("holeNumbers", &holeNumberStr,
            "A string of comma-delimited hole number ranges to output hits, "
            "such as '1,2,10-12'. "
            "This requires hit titles to be in SMRT read title format.");

    bool parseSmrtTitle = false;
    clp.RegisterFlagOption("smrtTitle", &parseSmrtTitle,
            "Use this option when filtering alignments generated by "
            "programs other than blasr, e.g. bwa-sw or gmap. "
            "  Parse read coordinates from the SMRT read title. " 
            "The title is in the format /name/hole/coordinates, where"
            " coordinates are in the format \\d+_\\d+, and represent "
            "the interval of the read that was aligned.");

    /* This experimental option can be useful for metagenomics, in which case
     * there are hundreds of sequences in the target, of which many titles are
     * long and may contain white spaces (e.g., ' ', '\t'). 
     * In order to save disk space and avoid the (possibly) non-unique mapping
     * between full and short reference names, one may call blasr with 
     * -titleTable option to represent all target sequences in the output
     * by their indices in the title table.*/
    string titleTableName = "";
    clp.RegisterStringOption("titleTable", &titleTableName,
            "Use this experimental option when filtering alignments generated by "
            "blasr with -titleTable titleTableName, in which case "
            "reference titles in SAM are represented by their "
            "indices (e.g., 0, 1, 2, ...) in the title table.");

    string adapterGffFileName = "";
    clp.RegisterStringOption("filterAdapterOnly", &adapterGffFileName,
            "Use this option to remove reads which can only map to adapters " 
            "specified in the GFF file.");

    bool verbose = false;
    clp.RegisterFlagOption("v", &verbose, "Be verbose.");

    clp.SetExamples(
            "Because SAM has optional tags that have different meanings"
            " in different programs, careful usage is required in order "
            "to have proper output.  The \"xs\" tag in bwa-sw is used to "
            "show the suboptimal score, but in PacBio SAM (blasr) it is "
            "defined as the start in the query sequence of the alignment.\n"
            "When \"-smrtTitle\" is specified, the xs tag is ignored, but "
            "when it is not specified, the coordinates given by the xs and "
            "xe tags are used to define the interval of a read that is "
            "aligned.  The CIGAR string is relative to this interval.");

    clp.ParseCommandLine(argc, argv);

    // Set random number seed; a seed of 0 means "use the current time".
    if (seed == 0) {
        srand(time(NULL));
    } else {
        srand(seed);
    }
    
    // scoreSign is not declared in this function; it is defined elsewhere
    // in the file.
    scoreSign = (scoreSignInt == -1)?ScoreSign::NEGATIVE:ScoreSign::POSITIVE;
    Score s(static_cast<float>(scoreCutoff), scoreSign);
    FilterCriteria filterCriteria(minAlnLength, minPctSimilarity, 
                                  minPctAccuracy, true, s);
    filterCriteria.Verbose(verbose);
    HitPolicy hitPolicy(hitPolicyStr, scoreSign);
                                  
    string errMsg;
    if (not filterCriteria.MakeSane(errMsg)) {
        cout << errMsg << endl;
        exit(1);
    }

    // Parse hole number ranges. 
    if (holeNumberStr.size() != 0) {
        if (not holeNumberRanges.setRanges(holeNumberStr)) {
            cout << "Could not parse hole number ranges: "
                 << holeNumberStr << "." << endl;
            exit(1);
        } 
    }

    // Open output file; fall back to stdout when no name was given.
    ostream * outFilePtr = &cout;
    ofstream outFileStrm;
    if (outFileName != "") {
        CrucialOpen(outFileName, outFileStrm, std::ios::out);
        outFilePtr = &outFileStrm;
    }

    // Diagnostics have always been printed to stdout.  Keep that when the
    // SAM output goes to a file, but use stderr when the SAM output itself
    // is written to stdout so that messages never get mixed into it.
    ostream & diag = (outFilePtr == &cout) ? cerr : cout;
    
    GFFFile adapterGffFile;
    if (adapterGffFileName != "")
        adapterGffFile.ReadAll(adapterGffFileName);
    
    SAMReader<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> samReader;
    FASTAReader fastaReader;

    //
    // Initialize samReader and fastaReader.
    //
    samReader.Initialize(samFileName);
    fastaReader.Initialize(refFileName);

    //
    // Program name and version used in the @PG header line.
    //
    string program = "samFilter";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    //
    // Read necessary input.
    //
    vector<FASTASequence> references;
    fastaReader.ReadAllSequences(references);

    // If the SAM file is generated by blasr with -titleTable,
    // then references in the SAM are represented by 
    // their corresponding indices in the title table.
    // In that case, we need to convert reference titles in fasta file
    // to their corresponding indices in the title table, such that
    // references in both SAM and fasta files are represented
    // by title table indices and therefore can match.
    if (titleTableName != "") {
        ConvertTitlesToTitleTableIndices(references, titleTableName);
    }
 
    AlignmentSet<SAMFullReferenceSequence, SAMReadGroup, SAMAlignment> alignmentSet;
    vector<string> allHeaders = samReader.ReadHeader(alignmentSet); 

    // Process SAM Header: append a @PG line describing this invocation and
    // write all header lines to the selected output stream.
    string commandLineString;
    clp.CommandLineToString(argc, argv, commandLineString);
    allHeaders.push_back("@PG\tID:SAMFILTER\tVN:" + versionString +
                         "\tCL:" + program + " " + commandLineString);
    for (size_t i = 0; i < allHeaders.size(); i++) {
        *outFilePtr << allHeaders[i] << endl;
    }

    //
    // The order of references in vector<FASTASequence> references and
    // AlignmentSet<, , >alignmentSet.references can be different.
    // Rearrange alignmentSet.references such that they are ordered in
    // exactly the same way as vector<FASTASequence> references.
    //
    alignmentSet.RearrangeReferences(references);

    // Map reference name obtained from SAM file to indices
    map<string, int> refNameToIndex;
    for (size_t i = 0; i < references.size(); i++) {
        string refName = alignmentSet.references[i].GetSequenceName();
        refNameToIndex[refName] = static_cast<int>(i);
    }

    //
    // Store the alignments.
    //
    SAMAlignment samAlignment;

    //
    // For 150K, each chip produces about 300M sequences 
    // (not including quality values and etc.).
    // Let's assume that the sam file and reference data can 
    // fit in the memory. 
    // Need to scale for larger sequel data in the future.
    //
    vector<SAMAlignment> allSAMAlignments;
    while (samReader.GetNextAlignment(samAlignment)) {
        // Skip records without a reference name (unmapped).
        if (samAlignment.rName == "*") {
            continue;
        }

        if (parseSmrtTitle and holeNumberStr.size() != 0) {
            string movieName;
            int thisHoleNumber;
            if (not ParsePBIReadName(samAlignment.qName, 
                                     movieName, 
                                     thisHoleNumber)) {
                diag << "ERROR, could not parse SMRT title: "
                     << samAlignment.qName << "." << endl;
                exit(1);
            }
            if (not holeNumberRanges.contains(UInt(thisHoleNumber))) {
                if (verbose) 
                    diag << thisHoleNumber << " is not in range." << endl; 
                continue;
            }
        }

        if (samAlignment.cigar.find('P') != string::npos) {
            diag << "WARNING. Could not process SAM record with 'P' in "
                 << "its cigar string." << endl;
            continue;
        }

        vector<AlignmentCandidate<> > convertedAlignments;
        SAMAlignmentsToCandidates(samAlignment, 
                references, refNameToIndex,
                convertedAlignments, parseSmrtTitle, false);
        
        if (convertedAlignments.size() > 1) {
            diag << "WARNING. Ignore multiple segments." << endl;
            continue;
        }

        // Guard against an empty conversion result: element 0 is accessed
        // below and would otherwise be out of range.
        if (convertedAlignments.empty()) {
            if (verbose)
                diag << "WARNING. No alignment could be converted for "
                     << samAlignment.qName << "." << endl;
            continue;
        }

        // Exactly one single-ended alignment remains.
        AlignmentCandidate<> & alignment = convertedAlignments[0];

        //score func does not matter
        DistanceMatrixScoreFunction<DNASequence, DNASequence> distFunc; 
        ComputeAlignmentStats(alignment, alignment.qAlignedSeq.seq, 
                              alignment.tAlignedSeq.seq, distFunc);

        // Check whether this alignment can only map to adapters in 
        // the adapter GFF file.
        if (adapterGffFileName != "" and 
            CheckAdapterOnly(adapterGffFile, alignment, refNameToIndex)) {
            if (verbose)
                diag << alignment.qName << " filter adapter only."
                     << endl;
        } else {
            // Assign score to samAlignment.
            samAlignment.score = samAlignment.as;

            if (filterCriteria.Satisfy(static_cast<AlignmentCandidate<> *>(&alignment))) {
                allSAMAlignments.push_back( samAlignment ); 
            }
        }

        // Release the aligned subsequences on every path; previously this
        // was skipped for alignments rejected by the adapter or filter
        // checks.
        alignment.FreeSubsequences();
    }

    // Sort all SAM alignments by qName, score and target position.
    sort(allSAMAlignments.begin(), allSAMAlignments.end(), 
         byQNameScoreTStart);

    unsigned int groupBegin = 0;
    unsigned int groupEnd = -1;
    vector<SAMAlignment> filteredSAMAlignments;
    while(groupBegin < allSAMAlignments.size()) {
        // Get the next group of SAM alignments which have the same qName
        // from allSAMAlignments[groupBegin ... groupEnd)
        GetNextSAMAlignmentGroup(allSAMAlignments, groupBegin, groupEnd);
        vector<unsigned int> hitIndices = ApplyHitPolicy(
                hitPolicy, allSAMAlignments, groupBegin, groupEnd);
        for(size_t i = 0; i < hitIndices.size(); i++) {
            filteredSAMAlignments.push_back(allSAMAlignments[hitIndices[i]]);
        }
        groupBegin = groupEnd;
    }

    // Sort all SAM alignments by reference name and query name
    sort(filteredSAMAlignments.begin(), filteredSAMAlignments.end(), 
         byRNameQName);

    // Write the surviving records to the selected output stream (file or
    // stdout), not unconditionally to the possibly unopened file stream.
    for(size_t i = 0; i < filteredSAMAlignments.size(); i++) {
        filteredSAMAlignments[i].PrintSAMAlignment(*outFilePtr);
    }

    if (outFileName != "") {
        outFileStrm.close();
    }
#ifdef USE_GOOGLE_PROFILER
  ProfilerStop();
#endif
    return 0;
}