int main(int argc, char* argv[]) {
	if (argc < 3) {
		cout << "usage: buildLengthHistogram.cpp input.cmp.h5 output.txt" << endl;
		exit(0);
	}

	string cmpH5FileName = argv[1];
	string outputFileName = argv[2];
	
	CmpFile cmpFile;
	HDFCmpReader<CmpAlignment> cmpFileReader;
	if (cmpFileReader.Initialize(cmpH5FileName) == 0) {
		cout << "Could not read the cmp file." << endl;
		exit(0);
	}
	ofstream outFile;
	CrucialOpen(outputFileName, outFile, std::ios::out);

	cmpFileReader.Read(cmpFile);
	vector<int> readLengths;
	int a;
	cout << "Processing " << cmpFile.alnInfo.alignments.size() << " alignments." << endl;
	for (a = 0; a < cmpFile.alnInfo.alignments.size(); a++ ) {
		readLengths.push_back(abs(cmpFile.alnInfo.alignments[a].GetQueryEnd() - cmpFile.alnInfo.alignments[a].GetQueryStart()));
	}
	sort(readLengths.begin(), readLengths.end());
	int r = 0;
	while(r < readLengths.size()) {
		int rn = r;
		while(rn < readLengths.size() and readLengths[rn] == readLengths[r]) { rn++;}
		outFile << readLengths[r] << " " << rn - r << endl;
		r = rn;
	}
}
Exemplo n.º 2
0
int main(int argc, char* argv[]) {

	string cmpFileName;
	string refFileName;
	string readsFileName;
  string mapqvTrackName;
	if (argc < 2) {
		cout << "  printMapqvTrack: print a gff file of the average mapping quality value" << endl;
		exit(1);
	}
	vector<int> refPositions;
	cmpFileName = argv[1];
	refFileName = argv[2];
  mapqvFileName = argv[3];

	CmpFile cmpFile;
	FASTASequence ref;
	FASTAReader reader;

	reader.Initialize(refFileName);
	reader.GetNext(ref);

	HDFBasReader basReader;

	SMRTSequence seq, *seqPtr;

	vector<int> refCoverage;
	refCoverage.resize(ref.length);
	std::fill(refCoverage.begin(), refCoverage.end(), 0);
	/*
	 * These guys pull information from the same pls file.
	 */
	HDFCmpReader<CmpAlignment> cmpReader;


	if (cmpReader.Initialize(cmpFileName) == 0) {
		cout << "ERROR, could not open the cmp file." << endl;
		exit(1);
	}
	
	
	cmpReader.Read(cmpFile);
	UInt alignmentIndex;

	//	movieIndexSets.resize(nMovies);
	for (alignmentIndex = 0; alignmentIndex < cmpFile.alnInfo.alignments.size(); alignmentIndex++) {
		int refSeqId    = cmpFile.alnInfo.alignments[alignmentIndex].GetRefSeqId();
		int readGroupId = cmpFile.alnInfo.alignments[alignmentIndex].GetReadGroupId();
		int refSeqIdIndex;
		if (cmpFile.refSeqTable.GetIndexOfId(refSeqId, refSeqIdIndex) == false) {
			//
			// Sanity check -- we're only looking at alignments to references in the cmp file.
			//
			cout << "ERROR, ref seq id: " << refSeqId << " should exist in the cmp file but it does not." << endl;
			assert(0);
		}

		int readGroupIdIndex;
		cmpFile.readGroupTable.GetIndexOfId(readGroupId, readGroupIdIndex);
		
		string readGroupPath    = cmpFile.readGroupTable.names[readGroupIdIndex];
		string readGroup        = cmpReader.readGroupPathToReadGroup[readGroupPath];
		int readGroupArrayIndex = cmpReader.refAlignGroups[refSeqIdIndex]->experimentNameToIndex[readGroup];
		vector<char> alignedSequence, alignedTarget;

		//
		// This read overlaps one of the ref positions.
		
		UInt offsetEnd, offsetBegin;
				
		offsetEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetEnd();
		offsetBegin = cmpFile.alnInfo.alignments[alignmentIndex].GetOffsetBegin();
		vector<unsigned char> byteAlignment;
		int alignedSequenceLength = offsetEnd - offsetBegin;
		if (alignedSequenceLength >= 0) {
			alignedSequence.resize(alignedSequenceLength);
			alignedTarget.resize(alignedSequenceLength);
			byteAlignment.resize(alignedSequenceLength);
		}

		cmpReader.refAlignGroups[refSeqIdIndex]->readGroups[readGroupArrayIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]);
		UInt refStart = cmpFile.alnInfo.alignments[alignmentIndex].GetRefStart();
		UInt refEnd   = cmpFile.alnInfo.alignments[alignmentIndex].GetRefEnd();
		UInt readStart= cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart();
		UInt readEnd  = cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd();
		//
		// Read the alignment string.
		//
		if (refSeqIdIndex > 0) continue;


		

		
		//
		// Convert to something we can compare easily.
		//
		alignedSequence[alignedSequence.size()-1]= '\0';
		ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]);
		ByteAlignmentToRefString(&byteAlignment[0], byteAlignment.size(), &alignedTarget[0]);
		int gi, i;
		gi = 0;
		int refStrand =  cmpFile.alnInfo.alignments[alignmentIndex].GetRCRefStrand();
		if (refStrand == 1) {
			// revcomp the ref strand
			vector<char> rcAlignedTarget, rcAlignedQuery;
			int t;
			rcAlignedTarget.resize(alignedTarget.size());
			rcAlignedQuery.resize(alignedSequence.size());
			for (t = 0; t < alignedTarget.size(); t++) {
				if (alignedTarget[t] == ' ') {
					rcAlignedTarget[alignedTarget.size() - t - 1] = ' ';
				}
				else {
					rcAlignedTarget[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]];
				}
				if (alignedSequence[t] == ' '){ 
					rcAlignedQuery[alignedTarget.size()  - t - 1] = ' ';
				}
				else {
					rcAlignedQuery[alignedTarget.size() - t - 1] = ReverseComplementNuc[alignedTarget[t]];
				}
			}
			alignedTarget = rcAlignedTarget;
			alignedSequence = rcAlignedQuery;
		}
		
		int holeNumber = cmpFile.alnInfo.alignments[alignmentIndex].GetHoleNumber();
		int ri = readStart;

		gi = refStart;

		for (i = 0; i < alignedTarget.size(); i++, gi++, ri++ ) {
			while(i < alignedTarget.size() and alignedTarget[i] == ' ') { 
				i++; 
			}
			if (alignedSequence[i] != ' ') {
				refCoverage[gi]++;
			}
		}
	} // end looping over regions

// Now compute the number of gaps.
	UInt pos;
	int numNotCovered = 0;
	for (pos = 0; pos < refCoverage.size(); pos++ ){
		if (refCoverage[pos] < 1) { numNotCovered++;}
	}
	if (numNotCovered > 100) {
		cout << "TOO Many!!!" << endl;
	}
	else {
		for (pos = 0; pos < refCoverage.size(); pos++ ){
			//		cout << refCoverage[pos] << endl;
			if (refCoverage[pos] < 1) {
				int left, right;
				left = right = -1;
				if (pos > 0) { left = refCoverage[pos-1];}
				if (pos < refCoverage.size()-1) {right = refCoverage[pos+1];}
				cout << pos << " " << left << " " << right << endl;
			}
		}
	}

}