Beispiel #1
0
// Builds the reverse complement of a subread and expresses its
// position in the coordinate system of the reverse complement of
// the SMRT read it was taken from.
// Input:
//   smrtRead          - the full SMRT read
//   subreadSequence   - a subread of smrtRead
// Output:
//   subreadSequenceRC - reverse complement of subreadSequence; its
//                       subread start/end are relative to the
//                       reverse complement of smrtRead and its
//                       zmwData is copied from smrtRead.
void MakeSubreadRC(SMRTSequence & subreadSequenceRC,
                   SMRTSequence & subreadSequence,
                   SMRTSequence & smrtRead)
{
    // A subread cannot be longer than the read it belongs to.
    assert(subreadSequence.length <= smrtRead.length);

    // Produce the reverse complement of the subread.
    subreadSequence.MakeRC(subreadSequenceRC);

    // Reversing the read mirrors the interval: (start, end) on the
    // forward read maps to (length - end, length - start) on the
    // reverse complement of the SMRT read.
    subreadSequenceRC.SubreadStart(smrtRead.length - subreadSequence.SubreadEnd());
    subreadSequenceRC.SubreadEnd(smrtRead.length - subreadSequence.SubreadStart());

    // The reverse complement belongs to the same ZMW as the full read.
    subreadSequenceRC.zmwData = smrtRead.zmwData;
}
Beispiel #2
0
int main(int argc, char* argv[]) {
	
	string genomeFileName, readsFileName;
	TupleMetrics tm;
	float insRate = 0.10;
	tm.tupleSize = 8;
	CommandLineParser clp;
	int nProcessors = 1;
	clp.SetProgramName("exhalign");
	clp.SetProgramSummary("Count the number of occurrences of every k-mer in a file.");
	clp.RegisterStringOption("genome", &genomeFileName, "The file of the genome to align to.");
	clp.RegisterStringOption("reads",  &readsFileName,  "The reads to align.");
	clp.RegisterPreviousFlagsAsHidden();
	clp.RegisterIntOption("wordsize", &tm.tupleSize, "Size of words to count", 
												CommandLineParser::NonNegativeInteger);
	clp.RegisterFloatOption("insrate", &insRate, "Roughly the insertion rate (10%)", 
													CommandLineParser::NonNegativeFloat);
	clp.RegisterIntOption("nProc", &nProcessors, "Number of processors to use", CommandLineParser::NonNegativeInteger);
	clp.ParseCommandLine(argc, argv);

	insRate+=1.0;
	//
	// Process the reads into a vector of read keywords
	//
	

	vector<string> readsFileNames;
	vector<FASTQSequence> reads;
	vector<vector<ReadKeyword> > keywords;
	SMRTSequence seq, seqRC;
	ReadKeyword keyword;
	int readIndex = 0;

	if (FileOfFileNames::IsFOFN(readsFileName)) {
		FileOfFileNames::FOFNToList(readsFileName, readsFileNames);
	}
	else {
		readsFileNames.push_back(readsFileName);
	}

	ReaderAgglomerate genomeReader;	
	HDFRegionTableReader regionTableReader;
	genomeReader.Initialize(genomeFileName);
	FASTQSequence genome;
	genomeReader.GetNext(genome);
	SubreadIterator subreadIterator;

	keywords.resize(nProcessors);
	RegionTable  regionTable, *regionTablePtr;

	int readsFileIndex;
	for (readsFileIndex = 0; readsFileIndex < readsFileNames.size(); readsFileIndex++ ) {
		
		ReaderAgglomerate reader;
		reader.Initialize(readsFileNames[readsFileIndex]);
		regionTalePtr = NULL;
		
		if (reader.fileType == HDFPulse or
				reader.fileType == HDFBase) {
			regionTableReader.Initialize(readsFileNames[readsFileIndex]);
			regionTableReader.Read(regionTable);
			regionTablePtr = &regionTable;
		}
		else {
			regionTablePtr = NULL;
		}
		SMRTSequence fullSequence;
		while(reader.GetNext(fullSequence)) {

			subreadIterator.Initialize(&fullSequence, regionTablePtr);
			
			SMRTSequence seq;
			while (subreadIterator.GetNext(seq)) {
				DNALength pos;
				if (seq.length < tm.tupleSize) 
					continue;
				reads.push_back(seq);
				for (pos = 0; pos < seq.length - tm.tupleSize + 1; pos++) {
					keyword.tuple.FromStringLR(&seq.seq[pos], tm);
					keyword.readPos = pos;
					keyword.readIndex = readIndex;
					keywords[(readIndex/2)%nProcessors].push_back(keyword);
				}
				readIndex++;
				seq.MakeRC(seqRC);
				reads.push_back(seqRC);
				for (pos = 0; pos < seqRC.length - tm.tupleSize + 1; pos++) {
					keyword.tuple.FromStringLR(&seqRC.seq[pos], tm);
					keyword.readPos = pos;
					keyword.readIndex = readIndex;
					keywords[(readIndex/2)%nProcessors].push_back(keyword);
				}
				readIndex++;
				//				seq.Free();
				seqRC.Free();
			}
			fullSequence.Free();
		}
	}
	int procIndex;
	for (procIndex = 0; procIndex < nProcessors; procIndex++) {
		std::sort(keywords[procIndex].begin(), keywords[procIndex].end());
	}


  std::vector<int> prevAlignedGenomePos;
  std::vector<int> readOptScore;
  std::vector<FastqAlignment > optAlignment;
	std::vector<int> optGenomeAlignPos;
	std::vector<int> optGenomeAlignLength;

  prevAlignedGenomePos.resize(reads.size());
  readOptScore.resize(reads.size());
  optAlignment.resize(reads.size());
	optGenomeAlignPos.resize(reads.size());
	optGenomeAlignLength.resize(reads.size());
	vector<Data> tdata;
	tdata.resize(nProcessors);
  std::fill(prevAlignedGenomePos.begin(), prevAlignedGenomePos.end(), -1);
	for (procIndex = 0; procIndex < nProcessors; procIndex++) {
		tdata[procIndex].prevAlignedGenomePos = &prevAlignedGenomePos;
		tdata[procIndex].readOptScore         = &readOptScore;
		tdata[procIndex].optAlignment         = &optAlignment;
		tdata[procIndex].optGenomeAlignPos    = &optGenomeAlignPos;
		tdata[procIndex].optGenomeAlignLength = &optGenomeAlignLength;
		tdata[procIndex].keywords             = &keywords[procIndex];
		tdata[procIndex].genome               = &genome;
		tdata[procIndex].insRate              = insRate;
		tdata[procIndex].reads                = &reads;
		tdata[procIndex].tm                   = &tm;
	}
	if (nProcessors == 1) {
		KeywordSeededAlignment(&tdata[0]);
	}
	else {
		pthread_t *threads = new pthread_t[nProcessors];
		pthread_attr_t *threadAttr = new pthread_attr_t[nProcessors];
		for (procIndex = 0; procIndex < nProcessors; procIndex++) {
			pthread_attr_init(&threadAttr[procIndex]);			
			pthread_create(&threads[procIndex], &threadAttr[procIndex], (void*(*)(void*))KeywordSeededAlignment, &tdata[procIndex]);
		}
		for (procIndex = 0; procIndex < nProcessors; procIndex++) {
			pthread_join(threads[procIndex], NULL);
		}

	}
	VectorIndex i;
	//	cout << "printing alignments for " << reads.size() << " reads." << endl;
	for (readIndex = 0; readIndex < readOptScore.size(); readIndex +=2 ){
		int optIndex = readIndex;
		if (readOptScore[readIndex] > readOptScore[readIndex+1]) {
			optIndex= readIndex + 1;
		}
		FASTQSequence genomeSubstring;
		genomeSubstring.seq = &genome.seq[optGenomeAlignPos[optIndex]];
		genomeSubstring.length =  optGenomeAlignLength[optIndex];
		if (prevAlignedGenomePos[optIndex] >= 0) {
			optAlignment[optIndex].qName.assign(reads[optIndex].title, reads[optIndex].titleLength);
			optAlignment[optIndex].tName.assign(genome.GetName());
			ComputeAlignmentStats(optAlignment[optIndex], reads[optIndex].seq, genomeSubstring.seq, SMRTDistanceMatrix, 6, 6);
			if (optAlignment[optIndex].blocks.size() > 0) {
				PrintCompareSequencesAlignment(optAlignment[optIndex], reads[optIndex], genomeSubstring,cout);
			}
			/*			StickPrintAlignment(optAlignment[optIndex],
													reads[optIndex],
													genomeSubstring, cout, 0, optGenomeAlignPos[optIndex]);
			*/
						
		}
	}
	for (readIndex = 0; readIndex < readOptScore.size(); readIndex++ ) {
		reads[readIndex].Free();
	}

	return 0;
}