コード例 #1
0
ファイル: TestAdvance.cpp プロジェクト: EichlerLab/blasr
//
// testAdvance: small driver that exercises ReaderAgglomerate::Advance.
//
// Usage: testAdvance file.pls.h5 advance
//
// Four times in a row the reader is advanced by 'advance' and the next read
// is fetched; the last read fetched is printed to stdout.
//
int main(int argc, char* argv[]) {

	// Both positional arguments are mandatory.
	if (argc <= 2) {
		cout << "usage: testAdvance file.pls.h5 advance " << endl;
		cout << "move 'advance' reads forward in a file." << endl;
		exit(1);
	}
	string plsFileName = argv[1];
	int advance = atoi(argv[2]);

	ReaderAgglomerate reader;
	reader.Initialize(plsFileName);

	SMRTSequence seq;
	for (int round = 0; round < 4; round++) {
		// Release the previously fetched read before it is overwritten.
		seq.Free();
		reader.Advance(advance);
		reader.GetNext(seq);
	}
	seq.PrintSeq(cout);
	return 0;
}
コード例 #2
0
ファイル: PulseToFastq.cpp プロジェクト: EichlerLab/blasr
int main(int argc, char* argv[]) {

	string plsFileName, fastaOutName;

	if (argc < 2) {
		cout << "usage: pls2fasta  file.pls.h5 file.fasta " << endl;
		cout << "Print reads stored in hdf as fasta." << endl;
		exit(1);
	}
	vector<string> plsFileNames;
	plsFileName = argv[1];
	fastaOutName = argv[2];

	if (FileOfFileNames::IsFOFN(plsFileName)) {
		FileOfFileNames::FOFNToList(plsFileName, plsFileNames);
	}
	else {
		plsFileNames.push_back(plsFileName);
	}

	int plsFileIndex;
	for (plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) {

		ReaderAgglomerate reader;
		reader.IgnoreCCS();
		reader.Initialize(plsFileNames[plsFileIndex]);

		ofstream fastaOut;
		CrucialOpen(fastaOutName, fastaOut);
	
		SMRTSequence seq;
		int seqIndex = 0;
		while (reader.GetNext(seq)) {
			seq.PrintQualSeq(fastaOut);
		}
	}
}
コード例 #3
0
ファイル: BlasrMiscsImpl.hpp プロジェクト: phametus/blasr
//
// Fetch the next read from 'reader', guarding the access with
// semaphores.reader whenever more than one process/thread is configured
// (params.nProc > 1).
//
// On return 'read' and 'associatedRandInt' hold whatever reader.GetNext
// produced and 'readGroupId' is a copy of reader.readGroupId taken while the
// semaphore is still held.
//
// Returns false when reader.GetNext reports 0, true otherwise.
//
bool GetNextReadThroughSemaphore(ReaderAgglomerate &reader,
                                 MappingParameters &params,
                                 T_Sequence &read,
                                 string & readGroupId,
                                 int & associatedRandInt,
                                 MappingSemaphores & semaphores)
{
    // Enter the guarded section (only needed with several workers).
    if (params.nProc > 1) {
#ifdef __APPLE__
        sem_wait(semaphores.reader);
#else
        sem_wait(&semaphores.reader);
#endif
    }

    // A return value of 0 from the reader means no read was produced.
    bool gotRead = (reader.GetNext(read, associatedRandInt) != 0);

    // Copy the read group id while still holding the semaphore: another
    // thread may move the reader on to a different read group before this
    // read's alignment is sent out for printing.
    readGroupId = reader.readGroupId;

    // Leave the guarded section.
    if (params.nProc > 1) {
#ifdef __APPLE__
        sem_post(semaphores.reader);
#else
        sem_post(&semaphores.reader);
#endif
    }
    return gotRead;
}
コード例 #4
0
int main(int argc, char* argv[]) {
	
	string queryFileName, targetFileName;
	if (argc < 3) {
		cout << "Usage: guidedalign query target [sdptuple]" << endl;
		exit(1);
	}
	queryFileName = argv[1];
	targetFileName = argv[2];
	int sdpTupleSize = 4;
	if (argc > 3) {
		sdpTupleSize = atoi(argv[3]);
	}
	
	ReaderAgglomerate reader;
	FASTQSequence query, target;

	reader.Initialize(queryFileName);
	reader.GetNext(query);
	reader.Close();
	reader.Initialize(targetFileName);
	reader.GetNext(target);
	reader.Close();
	
	int alignScore;
	/*
	Alignment sdpAlignment;
	int nSDPHits = 0;
	alignScore = SDPAlign(query, target,
												SMRTDistanceMatrix, 
												4, 4, sdpTupleSize, 4, 0.90,
												sdpAlignment, nSDPHits, Local, false, false);
	int b;
	for (b = 0; b < sdpAlignment.blocks.size(); b++) {
		sdpAlignment.blocks[b].qPos += sdpAlignment.qPos;
		sdpAlignment.blocks[b].tPos += sdpAlignment.tPos;
		}
	Guide guide;
	int bandSize = 16;
	AlignmentToGuide(sdpAlignment, guide, bandSize);
	StoreMatrixOffsets(guide);
	int guideSize = ComputeMatrixNElem(guide);
	int i;
	*/

	vector<int> scoreMat;
	vector<Arrow> pathMat;
	vector<double> probMat, optPathProbMat;
  vector<float> lnSubVect, lnInsVect, lnDelVect, lnMatchVect;
  //	AlignmentCandidate<FASTASequence, FASTASequence> alignment;
  Alignment alignment;
	DistanceMatrixScoreFunction<DNASequence, DNASequence> distScoreFn;
	distScoreFn.del = 3;
	distScoreFn.ins = 3;
	distScoreFn.InitializeScoreMatrix(SMRTDistanceMatrix);

	alignScore = GuidedAlign(query, target, distScoreFn, 10,
                           // in order after edit distance:
                           // pairwise-ins, pairwise-del, k, sdp-ins, sdp-del, sdp-insrate
                           //                           distScoreFn, 

                           5,5,.15,
                           alignment, Local, false, 8);
  //	StickPrintAlignment(alignment, query, target, cout);
}
コード例 #5
0
ファイル: ExtendAlign.cpp プロジェクト: jcombs1/blasr
int main(int argc, char* argv[]) {
	string fileAName, fileBName;
	if (argc < 3) {
		cout << "usage: extendAlign file1 fil2 [pos1 pos2] " << endl;
		exit(0);
	}

	fileAName = argv[1];
	fileBName = argv[2];
	int argi = 3;
	int aPos = 0;
	int bPos = 0;
	if (argc == 5) {
		aPos = atoi(argv[3]);
		bPos = atoi(argv[4]);
	}
	
	ReaderAgglomerate reader;
	reader.Initialize(fileAName);
	
	FASTASequence aSeq, bSeq;
	reader.GetNext(aSeq);
	reader.Initialize(fileBName);
	reader.GetNext(bSeq);
	
	DistanceMatrixScoreFunction<FASTASequence, FASTASequence> scoreFn;
	scoreFn.ins = 3;
	scoreFn.del = 3;
	scoreFn.InitializeScoreMatrix(SMRTDistanceMatrix);

	vector<int>  scoreMat;
	vector<Arrow>pathMat;
	
	AlignmentCandidate<FASTASequence, FASTASequence> extendedAlignment;

	/*	ExtendAlignmentForward(aSeq, aPos,
												 bSeq, bPos,
												 5, //k
												 scoreMat, pathMat,
												 extendedAlignment,
												 scoreFn,
												 1, // don't bother attempting
												 // to extend the alignment
												 // if one of the sequences
												 // is less than 1 base long
												 2);

	extendedAlignment.qAlignedSeq.ReferenceSubstring(aSeq);
	extendedAlignment.tAlignedSeq.ReferenceSubstring(bSeq);

	//	extendedAlignment.qAlignedSeqPos = aPos;
	//	extendedAlignment.tAlignedSeqPos = bPos;

	StickPrintAlignment(extendedAlignment, aSeq, bSeq, cout);
	extendedAlignment.Clear();
	*/
	if (aPos == 0) { aPos = aSeq.length; }
	if (bPos == 0) { bPos = bSeq.length; }

	ExtendAlignmentReverse(aSeq, aPos,
												 bSeq, bPos,
												 5, //k
												 scoreMat, pathMat,
												 extendedAlignment,
												 scoreFn,
												 1, // don't bother attempting
												 // to extend the alignment
												 // if one of the sequences
												 // is less than 1 base long
												 2);

	extendedAlignment.qAlignedSeq.ReferenceSubstring(aSeq);
	extendedAlignment.tAlignedSeq.ReferenceSubstring(bSeq);

	//	extendedAlignment.qAlignedSeqPos = aPos;
	//	extendedAlignment.tAlignedSeqPos = bPos;

	StickPrintAlignment(extendedAlignment, aSeq, bSeq, cout);

	return 0;
}
コード例 #6
0
ファイル: PulseToFasta.cpp プロジェクト: asifemon/blasr
//
// pls2fasta: convert pls.h5/bax.h5 files (or a fofn listing them) into one
// fasta/fastq file.
//
// For every input file the reads are fetched one by one.  Depending on the
// command line flags a read is printed as CCS sequence, as a whole read, or
// split into subread intervals; low quality parts can be trimmed away or
// masked with 'N'.  With -best only the CCS sequence (when one exists) or
// the highest scoring subread of each read is printed.
//
// Exits with status 1 on contradictory options, when an input cannot be
// initialized, or when fewer region files than input files are available.
//
int main(int argc, char* argv[]) {
    string program = "pls2fasta";
    string versionString = VERSION;
    AppendPerforceChangelist(PERFORCE_VERSION_STRING, versionString);

    string plsFileName, fastaOutName;
    vector<string> plsFileNames;
    bool trimByRegion = false;
    bool maskByRegion = false;
    RegionTable regionTable;
    string regionsFOFNName = "";
    vector<string> regionFileNames;
    bool splitSubreads = true;
    int minSubreadLength = 0;
    // No command line option sets this flag in this function, so the
    // simulated-data branches below are currently inactive.
    bool addSimulatedData = false;
    bool printFastq = false;
    bool printCcs   = false;
    int  lineLength = 50;
    int  minReadScore = 0;
    vector<int> holeNumbers;
    CommandLineParser clp;
    bool printOnlyBest = false;

    clp.SetProgramName(program);
    clp.SetVersion(versionString);
    clp.RegisterStringOption("in.pls.h5", &plsFileName, "Input pls.h5/bax.h5/fofn file.", true);
    clp.RegisterStringOption("out.fasta", &fastaOutName, "Output fasta/fastq file.", true);
    clp.RegisterPreviousFlagsAsHidden();
    clp.RegisterFlagOption("trimByRegion", &trimByRegion, "Trim away low quality regions.");
    clp.RegisterFlagOption("maskByRegion", &maskByRegion, "Mask low quality regions with 'N'.");
    clp.RegisterStringOption("regionTable", &regionsFOFNName, "Optional HDF file with a /PulseData/Regions dataset.");
    clp.RegisterIntOption("minSubreadLength", &minSubreadLength, "Do not write subreads less than the specified length.", CommandLineParser::PositiveInteger);
    // NOTE(review): splitSubreads starts out true; this presumably relies on
    // the parser toggling the flag's value -- confirm.
    clp.RegisterFlagOption("noSplitSubreads", &splitSubreads, "Do not split reads on adapter sequences.");
    clp.RegisterIntListOption("holeNumber", &holeNumbers, "Only print this hole number (or list of numbers).");
    clp.RegisterFlagOption("fastq", &printFastq, "Print in FASTQ format with quality.");
    clp.RegisterFlagOption("ccs", &printCcs, "Print de novo CCS sequences");
    clp.RegisterIntOption("lineLength", &lineLength, "Specify fasta/fastq line length", CommandLineParser::PositiveInteger);
    clp.RegisterIntOption("minReadScore", &minReadScore, "Minimum read score to print a read.  The score is "
                          "a number between 0 and 1000 and represents the expected accuracy percentage * 10. "
                          "A typical value would be between 750 and 800.  This does not apply to ccs reads.", CommandLineParser::NonNegativeInteger);
    // The first literal ends in a space so the help text does not read
    // "longestsubread" after concatenation.
    clp.RegisterFlagOption("best", &printOnlyBest, "If a CCS sequence exists, print this.  Otherwise, print the longest "
                           "subread.  This does not support fastq.");
    string description = ("Converts pls.h5/bax.h5/fofn files to fasta or fastq files. Although fasta files are provided"
    " with every run, they are not trimmed nor split into subreads. This program takes "
    "additional annotation information, such as the subread coordinates and high quality regions "
    "and uses them to create fasta sequences that are substrings of all bases called. Most of the time "
    "you will want to trim low quality reads, so you should specify -trimByRegion.");
    clp.SetProgramSummary(description);

    clp.ParseCommandLine(argc, argv);

    cerr << "[INFO] " << GetTimestamp() << " [" << program << "] started."  << endl;
    if (trimByRegion and maskByRegion) {
        cout << "ERROR! You cannot both trim and mask regions. Use one or the other." << endl;
        exit(1);
    }

    if (printFastq) {
        // Setting lineLength to 0 flags to print on one line.
        lineLength = 0;
    }

    // Expand a FOFN into its members; a plain file is a list of one.
    if (FileOfFileNames::IsFOFN(plsFileName)) {
        FileOfFileNames::FOFNToList(plsFileName, plsFileNames);
    }
    else {
        plsFileNames.push_back(plsFileName);
    }
    // Without an explicit region table the regions are read from the input
    // files themselves.
    if (regionsFOFNName == "") {
        regionFileNames = plsFileNames;
    }
    else {
        if (FileOfFileNames::IsFOFN(regionsFOFNName)) {
            FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames);
        }
        else {
            regionFileNames.push_back(regionsFOFNName);
        }
    }

    // Region files are looked up by the index of the input file, so there
    // must be at least one per input file whenever a region table is used.
    const bool needRegionTable = trimByRegion or maskByRegion or splitSubreads;
    if (needRegionTable and regionFileNames.size() < plsFileNames.size()) {
        cout << "ERROR! " << regionFileNames.size() << " region table file(s) were given for "
             << plsFileNames.size() << " input file(s)." << endl;
        exit(1);
    }

    ofstream fastaOut;
    CrucialOpen(fastaOutName, fastaOut);
    HDFRegionTableReader hdfRegionReader;
    // Sorted so that binary_search can be used for the hole number filter.
    sort(holeNumbers.begin(), holeNumbers.end());
    for (size_t plsFileIndex = 0; plsFileIndex < plsFileNames.size(); plsFileIndex++) {
        if (needRegionTable) {
            hdfRegionReader.Initialize(regionFileNames[plsFileIndex]);
            hdfRegionReader.ReadTable(regionTable);
            regionTable.SortTableByHoleNumber();
        }

        ReaderAgglomerate reader;
        // Second reader over the same file, used only by -best to fetch the
        // CCS sequence that belongs to each read.
        HDFBasReader ccsReader;

        if (printOnlyBest) {
            ccsReader.SetReadBasesFromCCS();
            ccsReader.Initialize(plsFileNames[plsFileIndex]);
        }
        if (printCcs == false) {
            reader.IgnoreCCS();
        }
        else {
            reader.hdfBasReader.SetReadBasesFromCCS();
        }
        if (addSimulatedData) {
            reader.hdfBasReader.IncludeField("SimulatedCoordinate");
            reader.hdfBasReader.IncludeField("SimulatedSequenceIndex");
        }

        if (reader.SetReadFileName(plsFileNames[plsFileIndex]) == 0) {
            cout << "ERROR, could not determine file type."
                 << plsFileNames[plsFileIndex] << endl;
            exit(1);
        }
        if (reader.Initialize() == 0) {
            cout << "ERROR, could not initialize file "
                 << plsFileNames[plsFileIndex] << endl;
            exit(1);
        }

        DNALength simulatedCoordinate = 0;
        DNALength simulatedSequenceIndex = 0;
        reader.SkipReadQuality();
        SMRTSequence seq;
        vector<ReadInterval> subreadIntervals;
        SMRTSequence ccsSeq;
        while (reader.GetNext(seq)) {
            // Keep the CCS reader in step with the main reader.
            if (printOnlyBest) {
                ccsReader.GetNext(ccsSeq);
            }

            // Hole number filter (only active when hole numbers were given).
            if (holeNumbers.size() != 0 and
                binary_search(holeNumbers.begin(), holeNumbers.end(), seq.zmwData.holeNumber) == false) {
                continue;
            }

            if (seq.length == 0) {
                continue;
            }

            if (addSimulatedData) {
                reader.hdfBasReader.simulatedCoordinateArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedCoordinate);
                reader.hdfBasReader.simulatedSequenceIndexArray.Read(reader.hdfBasReader.curRead-1, reader.hdfBasReader.curRead, &simulatedSequenceIndex);
            }

            // CCS reads are printed whole; none of the region handling
            // below applies to them.
            if (printCcs == true) {
                if (printFastq == false) {
                    seq.PrintSeq(fastaOut);
                }
                else {
                    seq.PrintFastq(fastaOut, lineLength);
                }
                continue;
            }

            //
            // Determine the high quality boundaries of the read.  This is
            // the full read if no hq region exists, or if regions are to be
            // ignored (neither trimming nor masking requested).
            //
            DNALength hqReadStart, hqReadEnd;
            // Initialized so the minReadScore test below never reads an
            // indeterminate value should the lookup leave it untouched.
            int hqRegionScore = 0;
            if (GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, hqRegionScore) == false or
                (trimByRegion == false and maskByRegion == false)) {
                hqReadStart = 0;
                hqReadEnd   = seq.length;
            }

            //
            // Mask off the low quality portions of the reads.
            //
            if (maskByRegion) {
                if (hqReadStart > 0) {
                    fill(&seq.seq[0], &seq.seq[hqReadStart], 'N');
                }
                if (hqReadEnd != seq.length) {
                    fill(&seq.seq[hqReadEnd], &seq.seq[seq.length], 'N');
                }
            }

            //
            // Build the list of intervals to print: either the whole
            // (possibly masked) read as one interval, or its subreads.
            //
            subreadIntervals.clear(); // clear old, new intervals are appended.
            if (splitSubreads == false) {
                // The set of subread intervals is just the entire read.
                ReadInterval wholeRead(0, seq.length);
                subreadIntervals.push_back(wholeRead);
            }
            else {
                //
                // Print subread coordinates no matter whether or not reads have subreads.
                //
                CollectSubreadIntervals(seq, &regionTable, subreadIntervals);
            }

            //
            // Output all subreads as separate sequences.
            //
            int bestSubreadScore = -1;
            SMRTSequence bestSubread;
            for (size_t intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) {
                SMRTSequence subreadSequence;

                subreadSequence.subreadStart = subreadIntervals[intvIndex].start;
                subreadSequence.subreadEnd   = subreadIntervals[intvIndex].end;

                //
                // When trimming by region, only output the parts of the
                // subread that overlap the hq region.
                //
                if (trimByRegion == true) {
                    subreadSequence.subreadStart = max((DNALength) subreadIntervals[intvIndex].start, hqReadStart);
                    subreadSequence.subreadEnd   = min((DNALength) subreadIntervals[intvIndex].end, hqReadEnd);
                }

                if (subreadSequence.subreadStart >= subreadSequence.subreadEnd or
                    subreadSequence.subreadEnd - subreadSequence.subreadStart <= minSubreadLength) {
                    //
                    // There is no high quality portion of this subread (or
                    // it is not longer than minSubreadLength). Skip it.
                    //
                    continue;
                }

                if (hqRegionScore < minReadScore) {
                    continue;
                }

                //
                // Print the subread, adding the coordinates as part of the title.
                //
                subreadSequence.ReferenceSubstring(seq, subreadSequence.subreadStart,
                                                   subreadSequence.subreadEnd - subreadSequence.subreadStart);
                stringstream titleStream;
                titleStream << seq.title;
                if (splitSubreads) {
                    //
                    // Add the subread coordinates if splitting on subread.
                    //
                    titleStream << "/"
                                << subreadSequence.subreadStart
                                << "_" << subreadSequence.subreadEnd;
                }

                //
                // If running on simulated data, add where the values were simulated from.
                //
                if (addSimulatedData) {
                    titleStream << ((FASTASequence*)&seq)->title << "/chrIndex_"
                                << simulatedSequenceIndex << "/position_"<< simulatedCoordinate;
                    ((FASTASequence*)&seq)->CopyTitle(titleStream.str());
                }

                subreadSequence.CopyTitle(titleStream.str());

                //
                // Eventually replace with WriterAgglomerate.
                //
                if (printOnlyBest == false) {
                    if (subreadSequence.length > 0) {
                        if (printFastq == false) {
                            ((FASTASequence*)&subreadSequence)->PrintSeq(fastaOut);
                        }
                        else {
                            subreadSequence.PrintFastq(fastaOut, lineLength);
                        }
                    }
                    // The bases are only referenced from seq; the title copy
                    // made above is released here.
                    delete[] subreadSequence.title;
                }
                else {
                    // Remember the subread with the largest length * score.
                    // NOTE(review): the title copies of subreads that are
                    // not kept (or are later replaced) are not released on
                    // this path -- confirm against SMRTSequence ownership.
                    int subreadWeightedScore = subreadSequence.length * hqRegionScore;
                    if (subreadWeightedScore > bestSubreadScore) {
                        bestSubread = subreadSequence;
                        bestSubreadScore = subreadWeightedScore;
                    }
                }
            }

            // With -best a CCS sequence wins over the best subread.
            if (printOnlyBest) {
                if (ccsSeq.length > 0) {
                    if (printFastq == false) {
                        ccsSeq.PrintSeq(fastaOut);
                    }
                    else {
                        ccsSeq.PrintFastq(fastaOut, ccsSeq.length);
                    }
                }
                else {
                    if (bestSubreadScore >= 0) {
                        if (printFastq == false) {
                            bestSubread.PrintSeq(fastaOut);
                        }
                        else {
                            bestSubread.PrintFastq(fastaOut, bestSubread.length);
                        }
                        bestSubread.Free();
                    }
                }
                ccsSeq.Free();
            }
            seq.Free();
        }
        reader.Close();
        hdfRegionReader.Close();
    }
    cerr << "[INFO] " << GetTimestamp() << " [" << program << "] ended."  << endl;
    return 0;
}
コード例 #7
0
ファイル: ExhaustiveAlign.cpp プロジェクト: EichlerLab/blasr
int main(int argc, char* argv[]) {
	
	string genomeFileName, readsFileName;
	TupleMetrics tm;
	float insRate = 0.10;
	tm.tupleSize = 8;
	CommandLineParser clp;
	int nProcessors = 1;
	clp.SetProgramName("exhalign");
	clp.SetProgramSummary("Count the number of occurrences of every k-mer in a file.");
	clp.RegisterStringOption("genome", &genomeFileName, "The file of the genome to align to.");
	clp.RegisterStringOption("reads",  &readsFileName,  "The reads to align.");
	clp.RegisterPreviousFlagsAsHidden();
	clp.RegisterIntOption("wordsize", &tm.tupleSize, "Size of words to count", 
												CommandLineParser::NonNegativeInteger);
	clp.RegisterFloatOption("insrate", &insRate, "Roughly the insertion rate (10%)", 
													CommandLineParser::NonNegativeFloat);
	clp.RegisterIntOption("nProc", &nProcessors, "Number of processors to use", CommandLineParser::NonNegativeInteger);
	clp.ParseCommandLine(argc, argv);

	insRate+=1.0;
	//
	// Process the reads into a vector of read keywords
	//
	

	vector<string> readsFileNames;
	vector<FASTQSequence> reads;
	vector<vector<ReadKeyword> > keywords;
	SMRTSequence seq, seqRC;
	ReadKeyword keyword;
	int readIndex = 0;

	if (FileOfFileNames::IsFOFN(readsFileName)) {
		FileOfFileNames::FOFNToList(readsFileName, readsFileNames);
	}
	else {
		readsFileNames.push_back(readsFileName);
	}

	ReaderAgglomerate genomeReader;	
	HDFRegionTableReader regionTableReader;
	genomeReader.Initialize(genomeFileName);
	FASTQSequence genome;
	genomeReader.GetNext(genome);
	SubreadIterator subreadIterator;

	keywords.resize(nProcessors);
	RegionTable  regionTable, *regionTablePtr;

	int readsFileIndex;
	for (readsFileIndex = 0; readsFileIndex < readsFileNames.size(); readsFileIndex++ ) {
		
		ReaderAgglomerate reader;
		reader.Initialize(readsFileNames[readsFileIndex]);
		regionTalePtr = NULL;
		
		if (reader.fileType == HDFPulse or
				reader.fileType == HDFBase) {
			regionTableReader.Initialize(readsFileNames[readsFileIndex]);
			regionTableReader.Read(regionTable);
			regionTablePtr = &regionTable;
		}
		else {
			regionTablePtr = NULL;
		}
		SMRTSequence fullSequence;
		while(reader.GetNext(fullSequence)) {

			subreadIterator.Initialize(&fullSequence, regionTablePtr);
			
			SMRTSequence seq;
			while (subreadIterator.GetNext(seq)) {
				DNALength pos;
				if (seq.length < tm.tupleSize) 
					continue;
				reads.push_back(seq);
				for (pos = 0; pos < seq.length - tm.tupleSize + 1; pos++) {
					keyword.tuple.FromStringLR(&seq.seq[pos], tm);
					keyword.readPos = pos;
					keyword.readIndex = readIndex;
					keywords[(readIndex/2)%nProcessors].push_back(keyword);
				}
				readIndex++;
				seq.MakeRC(seqRC);
				reads.push_back(seqRC);
				for (pos = 0; pos < seqRC.length - tm.tupleSize + 1; pos++) {
					keyword.tuple.FromStringLR(&seqRC.seq[pos], tm);
					keyword.readPos = pos;
					keyword.readIndex = readIndex;
					keywords[(readIndex/2)%nProcessors].push_back(keyword);
				}
				readIndex++;
				//				seq.Free();
				seqRC.Free();
			}
			fullSequence.Free();
		}
	}
	int procIndex;
	for (procIndex = 0; procIndex < nProcessors; procIndex++) {
		std::sort(keywords[procIndex].begin(), keywords[procIndex].end());
	}


  std::vector<int> prevAlignedGenomePos;
  std::vector<int> readOptScore;
  std::vector<FastqAlignment > optAlignment;
	std::vector<int> optGenomeAlignPos;
	std::vector<int> optGenomeAlignLength;

  prevAlignedGenomePos.resize(reads.size());
  readOptScore.resize(reads.size());
  optAlignment.resize(reads.size());
	optGenomeAlignPos.resize(reads.size());
	optGenomeAlignLength.resize(reads.size());
	vector<Data> tdata;
	tdata.resize(nProcessors);
  std::fill(prevAlignedGenomePos.begin(), prevAlignedGenomePos.end(), -1);
	for (procIndex = 0; procIndex < nProcessors; procIndex++) {
		tdata[procIndex].prevAlignedGenomePos = &prevAlignedGenomePos;
		tdata[procIndex].readOptScore         = &readOptScore;
		tdata[procIndex].optAlignment         = &optAlignment;
		tdata[procIndex].optGenomeAlignPos    = &optGenomeAlignPos;
		tdata[procIndex].optGenomeAlignLength = &optGenomeAlignLength;
		tdata[procIndex].keywords             = &keywords[procIndex];
		tdata[procIndex].genome               = &genome;
		tdata[procIndex].insRate              = insRate;
		tdata[procIndex].reads                = &reads;
		tdata[procIndex].tm                   = &tm;
	}
	if (nProcessors == 1) {
		KeywordSeededAlignment(&tdata[0]);
	}
	else {
		pthread_t *threads = new pthread_t[nProcessors];
		pthread_attr_t *threadAttr = new pthread_attr_t[nProcessors];
		for (procIndex = 0; procIndex < nProcessors; procIndex++) {
			pthread_attr_init(&threadAttr[procIndex]);			
			pthread_create(&threads[procIndex], &threadAttr[procIndex], (void*(*)(void*))KeywordSeededAlignment, &tdata[procIndex]);
		}
		for (procIndex = 0; procIndex < nProcessors; procIndex++) {
			pthread_join(threads[procIndex], NULL);
		}

	}
	VectorIndex i;
	//	cout << "printing alignments for " << reads.size() << " reads." << endl;
	for (readIndex = 0; readIndex < readOptScore.size(); readIndex +=2 ){
		int optIndex = readIndex;
		if (readOptScore[readIndex] > readOptScore[readIndex+1]) {
			optIndex= readIndex + 1;
		}
		FASTQSequence genomeSubstring;
		genomeSubstring.seq = &genome.seq[optGenomeAlignPos[optIndex]];
		genomeSubstring.length =  optGenomeAlignLength[optIndex];
		if (prevAlignedGenomePos[optIndex] >= 0) {
			optAlignment[optIndex].qName.assign(reads[optIndex].title, reads[optIndex].titleLength);
			optAlignment[optIndex].tName.assign(genome.GetName());
			ComputeAlignmentStats(optAlignment[optIndex], reads[optIndex].seq, genomeSubstring.seq, SMRTDistanceMatrix, 6, 6);
			if (optAlignment[optIndex].blocks.size() > 0) {
				PrintCompareSequencesAlignment(optAlignment[optIndex], reads[optIndex], genomeSubstring,cout);
			}
			/*			StickPrintAlignment(optAlignment[optIndex],
													reads[optIndex],
													genomeSubstring, cout, 0, optGenomeAlignPos[optIndex]);
			*/
						
		}
	}
	for (readIndex = 0; readIndex < readOptScore.size(); readIndex++ ) {
		reads[readIndex].Free();
	}

	return 0;
}
コード例 #8
0
/// Build the read group (@RG) entries of the SAM header.
///
/// For non-PBBAM input every reads file is opened with a ReaderAgglomerate
/// to obtain its movie name, chemistry triple and read group id.  For PBBAM
/// input the read groups are taken from each BAM file's header.
///
/// \param[in] readsFiles - incoming reads files in BAM or other formats
/// \param[in] readType - read type, must be either SUBREAD or CCS or UNKNOWN
/// \param[in] samQVs - SupplementalQVList, an object that handles which 
///                      QVs to print in SAM/BAM file.
/// \returns the collected read groups, one per reads file for non-PBBAM
///          input.  The process exits with status 1 when a BAM file cannot
///          be opened.
SAMHeaderRGs SAMHeaderPrinter::MakeRGs(const std::vector<std::string> & readsFiles,
        const ReadType::ReadTypeEnum & readType,
        const SupplementalQVList & samQVs) {
    SAMHeaderRGs rgs;

    if (fileType != PBBAM) {
        // Automatic storage: the reader is released on every exit path,
        // including an exception thrown while a file is being processed.
        ReaderAgglomerate reader;
        std::vector<std::string>::const_iterator rfit;
        for(rfit = readsFiles.begin(); rfit != readsFiles.end(); rfit++) {
            std::string rf(*rfit);
            reader.SetReadFileName(rf);
            reader.SetReadType(readType);
            reader.Initialize();

            // Get movie name from ReaderAgglomerate
            std::string movieName;
            reader.GetMovieName(movieName);

            string bindingKit, sequencingKit, baseCallerVersion;
            reader.GetChemistryTriple(bindingKit, sequencingKit, baseCallerVersion);
            reader.Close();

            std::vector<SAMHeaderItem> dsItems;
            dsItems.push_back(SAMHeaderItem("READTYPE", ReadType::ToString(readType)));
            dsItems.push_back(SAMHeaderItem("BINDINGKIT", bindingKit));
            dsItems.push_back(SAMHeaderItem("SEQUENCINGKIT", sequencingKit));
            dsItems.push_back(SAMHeaderItem("BASECALLERVERSION", baseCallerVersion));

            // Add QVs, e.g., InsertionQV=iq;DeletionQV=dq;...
            // useqv is a bit mask: bit i selects qvNames[i]/qvTags[i].
            if (samQVs.useqv) {
                for (int i = 0; i < samQVs.nTags; i++) {
                    if (samQVs.useqv & (1 << i)) {
                        dsItems.push_back(SAMHeaderItem(samQVs.qvNames[i], samQVs.qvTags[i]));
                    }
                }
            }
            // readGroupId is read after Close(), as before.
            rgs.Add(SAMHeaderRG(reader.readGroupId, PACBIOPL, movieName, dsItems));
        }
    } else {
#ifdef USE_PBBAM
        // TODO: use Derek's API to merge bamHeaders from different files when 
        // it is in place. Use the following code for now. 
        std::vector<std::string>::const_iterator rfit;
        for(rfit = readsFiles.begin(); rfit != readsFiles.end(); rfit++) {
            try {
                PacBio::BAM::BamFile bamFile(*rfit);
                PacBio::BAM::BamHeader header = bamFile.Header();
                // Get read groups from bam header.
                std::vector<PacBio::BAM::ReadGroupInfo> vrgs = header.ReadGroups();
                std::vector<PacBio::BAM::ReadGroupInfo>::iterator rgit;
                for (rgit = vrgs.begin(); rgit != vrgs.end(); rgit++) {
                    rgs.Add(SAMHeaderRG((*rgit).ToSam()));
                }
            } catch (const std::exception &) {
                // Caught by const reference so derived exceptions are not
                // sliced (and not copied) on the way in.
                cout << "ERROR, unable to open bam file " << (*rfit) << endl;
                exit(1);
            }
        }
#else
        REQUEST_PBBAM_ERROR();
#endif
    }
    return rgs;
}
コード例 #9
0
ファイル: ToAfg.cpp プロジェクト: jcombs1/blasr
int main(int argc, char* argv[]) {

    string inputFileName, outputFileName;

    if (argc < 2) {
        PrintUsage();
        exit(0);
    }
    vector<string> inputFileNames;
    inputFileName = argv[1];
    outputFileName = argv[2];
    int argi = 3;
    RegionTable regionTable;
    string regionsFOFNName = "";
    vector<string> regionFileNames;
    bool splitSubreads = true;
    bool useCCS = false;
    int minSubreadLength = 1;
    while (argi < argc) {
        if (strcmp(argv[argi], "-regionTable") == 0) {
            regionsFOFNName = argv[++argi];
        }
        else if (strcmp(argv[argi], "-noSplitSubreads") == 0) {
            splitSubreads = false;
        }
        else if (strcmp(argv[argi], "-minSubreadLength") == 0) {
            minSubreadLength = atoi(argv[++argi]);
        }
        else if (strcmp(argv[argi], "-useccsdenovo") == 0) {
            useCCS = true;
        }
        else {
            PrintUsage();
            cout << "ERROR! Option " << argv[argi] << " is not supported." << endl;
        }
        argi++;
    }
         
    if (FileOfFileNames::IsFOFN(inputFileName)) {
        FileOfFileNames::FOFNToList(inputFileName, inputFileNames);
    }
    else {
        inputFileNames.push_back(inputFileName);
    }
    if (regionsFOFNName == "") {
        regionFileNames = inputFileNames;
    }
    else {
        if (FileOfFileNames::IsFOFN(regionsFOFNName)) {
            FileOfFileNames::FOFNToList(regionsFOFNName, regionFileNames);
        }
        else {
            regionFileNames.push_back(regionsFOFNName);
        }
    }


    ofstream fastaOut;
    CrucialOpen(outputFileName, fastaOut);
    int plsFileIndex;
    HDFRegionTableReader hdfRegionReader;
    AfgBasWriter afgWriter;
    afgWriter.Initialize(outputFileName);

    for (plsFileIndex = 0; plsFileIndex < inputFileNames.size(); plsFileIndex++) {
        if (splitSubreads) {
            hdfRegionReader.Initialize(regionFileNames[plsFileIndex]);
            hdfRegionReader.ReadTable(regionTable);
            regionTable.SortTableByHoleNumber();
        }
        
        ReaderAgglomerate reader;
        // reader.SkipReadQuality(); // should have been taken care of by *Filter modules
        if (useCCS){
            reader.UseCCS();
        } else {
            reader.IgnoreCCS();
        }
        reader.Initialize(inputFileNames[plsFileIndex]);
        CCSSequence seq; 
        int seqIndex = 0;
        int numRecords = 0;
        vector<ReadInterval> subreadIntervals;
        while (reader.GetNext(seq)){ 
            ++seqIndex;
            if (splitSubreads == false) {
                if (seq.length >= minSubreadLength) {
                    afgWriter.Write(seq);
                }
                seq.Free();
                continue;
            }

            DNALength hqReadStart, hqReadEnd;
            int score;
            GetReadTrimCoordinates(seq, seq.zmwData, regionTable, hqReadStart, hqReadEnd, score);
            subreadIntervals.clear(); // clear old, new intervals are appended.
            CollectSubreadIntervals(seq,&regionTable, subreadIntervals);

            if (seq.length == 0 and subreadIntervals.size() > 0) {
                cout << "WARNING! A high quality interval region exists for a read of length 0." <<endl;
                cout << "  The offending ZMW number is " << seq.zmwData.holeNumber << endl;
                seq.Free();
                continue;
            }

            for (int intvIndex = 0; intvIndex < subreadIntervals.size(); intvIndex++) {
                SMRTSequence subreadSequence;
                
                int subreadStart = subreadIntervals[intvIndex].start > hqReadStart ? 
                                   subreadIntervals[intvIndex].start : hqReadStart;
                int subreadEnd   = subreadIntervals[intvIndex].end < hqReadEnd ?
                                   subreadIntervals[intvIndex].end : hqReadEnd;
                int subreadLength = subreadEnd - subreadStart;

                if (subreadLength < minSubreadLength) continue;

                subreadSequence.subreadStart = subreadStart;
                subreadSequence.subreadEnd   = subreadEnd;
                subreadSequence.ReferenceSubstring(seq, subreadStart, subreadLength);      
            
                stringstream titleStream;
                titleStream << seq.title << "/" << subreadIntervals[intvIndex].start 
                                         << "_" << subreadIntervals[intvIndex].end;
                subreadSequence.CopyTitle(titleStream.str());
                afgWriter.Write(subreadSequence);
                delete[] subreadSequence.title;
            }
            seq.Free();
        }
        reader.Close();
        hdfRegionReader.Close();
    }
}