void ReadAlignmentArray(int alignmentIndex, ByteAlignment &alignmentArray) { CmpAlignment cmpAlignment; alnInfoGroup.ReadCmpAlignment(alignmentIndex, cmpAlignment); // // Cache some stats about the read, and where it was aligned to. // int queryStart = cmpAlignment.GetQueryStart(); int queryEnd = cmpAlignment.GetQueryEnd(); int refGroupId = cmpAlignment.GetRefGroupId(); int alnGroupId = cmpAlignment.GetAlnGroupId(); int refGroupIndex = refGroupIdToArrayIndex[refGroupId]; if (alnGroupIdToReadGroupName.find(alnGroupId) == alnGroupIdToReadGroupName.end()) { cout << "INTERNAL ERROR! Could not find read group name for alignment " << "group with Id " << alnGroupId << "." << endl; assert(0); } string readGroupName = alnGroupIdToReadGroupName[alnGroupId]; if (refAlignGroups[refGroupIndex]->experimentNameToIndex.find(readGroupName) == refAlignGroups[refGroupIndex]->experimentNameToIndex.end()) { cout << "Internal ERROR! The read group name " << readGroupName << " is specified as part of " << " the path in alignment " << alignmentIndex << " though it does not exist in the ref align group specified for this alignment." << endl; assert(0); } int readGroupIndex = refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName]; HDFCmpExperimentGroup* expGroup = refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; int offsetBegin = cmpAlignment.GetOffsetBegin(); int offsetEnd = cmpAlignment.GetOffsetEnd(); int alignedSequenceLength = offsetEnd - offsetBegin; if (alignedSequenceLength >= 0) { alignmentArray.resize(alignedSequenceLength); } else { return; } // // Read the alignment string. All alignments // refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &alignmentArray[0]); }
int main(int argc, char* argv[]) { std::string outFileName; unsigned contextLength = 5; int minSamples = 500; int maxSamples = 1000; if (argc < 3) { PrintUsage(); std::exit(EXIT_FAILURE); } int argi = 1; std::string cmpH5FileName; cmpH5FileName = argv[argi++]; outFileName = argv[argi++]; int minAverageQual = 0; bool onlyMaxLength = false; while (argi < argc) { if (strcmp(argv[argi], "-contextLength") == 0) { contextLength = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-minSamples") == 0) { minSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-maxSamples") == 0) { maxSamples = atoi(argv[++argi]); } else if (strcmp(argv[argi], "-onlyMaxLength") == 0) { onlyMaxLength = true; } else { PrintUsage(); std::cout << "ERROR, bad option: " << argv[argi] << std::endl; std::exit(EXIT_FAILURE); } ++argi; } std::map<std::string, ScoredLength> maxLengthMap; OutputSampleListSet samples(contextLength); SMRTSequence read; std::ofstream sampleOut; CrucialOpen(outFileName, sampleOut, std::ios::out | std::ios::binary); int fileNameIndex; int numContextsReached = 0; int numContexts = 1 << (contextLength * 2); ReaderAgglomerate reader; samples.keyLength = contextLength; HDFCmpFile<CmpAlignment> cmpReader; cmpReader.IncludeField("QualityValue"); cmpReader.IncludeField("DeletionQV"); cmpReader.IncludeField("InsertionQV"); cmpReader.IncludeField("SubstitutionQV"); cmpReader.IncludeField("SubstitutionTag"); cmpReader.IncludeField("DeletionTag"); cmpReader.IncludeField("PulseIndex"); cmpReader.IncludeField("WidthInFrames"); cmpReader.IncludeField("PreBaseFrames"); if (cmpReader.Initialize(cmpH5FileName, H5F_ACC_RDWR) == 0) { std::cout << "ERROR, could not open the cmp file." << std::endl; std::exit(EXIT_FAILURE); } std::cout << "Reading cmp file." << std::endl; CmpFile cmpFile; cmpReader.ReadAlignmentDescriptions(cmpFile); cmpReader.ReadStructure(cmpFile); std::cout << "done reading structure." << std::endl; int alignmentIndex; int nAlignments = cmpReader.alnInfoGroup.GetNAlignments(); std::vector<int> alignmentToBaseMap; for (alignmentIndex = 0; alignmentIndex < nAlignments and !samples.Sufficient(); alignmentIndex++) { // // For ease of use, store the length of the alignment to make another model. // ByteAlignment alignmentArray; cmpReader.ReadAlignmentArray(alignmentIndex, alignmentArray); Alignment alignment; ByteAlignmentToAlignment(alignmentArray, alignment); std::string readSequence, refSequence; readSequence.resize(alignmentArray.size()); refSequence.resize(alignmentArray.size()); DNASequence readDNA, refDNA; ByteAlignmentToQueryString(&alignmentArray[0], alignmentArray.size(), &readSequence[0]); ByteAlignmentToRefString(&alignmentArray[0], alignmentArray.size(), &refSequence[0]); RemoveGaps(readSequence, readSequence); RemoveGaps(refSequence, refSequence); readDNA.seq = (Nucleotide*)readSequence.c_str(); readDNA.length = readSequence.size(); refDNA.seq = (Nucleotide*)refSequence.c_str(); refDNA.length = refSequence.size(); CmpAlignment cmpAlignment; cmpReader.ImportReadFromCmpH5(alignmentIndex, cmpAlignment, read); CreateAlignmentToSequenceMap(alignmentArray, alignmentToBaseMap); if (read.length < contextLength) { continue; } int subreadLength = (cmpFile.alnInfo.alignments[alignmentIndex].GetQueryEnd() - cmpFile.alnInfo.alignments[alignmentIndex].GetQueryStart()); if (onlyMaxLength == false) { samples.lengths.push_back(subreadLength); } else { int score = (cmpAlignment.GetNMatch() - cmpAlignment.GetNMismatch() - cmpAlignment.GetNInsertions() - cmpAlignment.GetNDeletions()); std::stringstream nameStrm; nameStrm << cmpAlignment.GetMovieId() << "_" << cmpAlignment.GetHoleNumber(); std::string nameStr = nameStrm.str(); if (maxLengthMap.find(nameStr) == maxLengthMap.end()) { maxLengthMap[nameStr] = ScoredLength(score, subreadLength); } } int sampleEnd = alignmentArray.size() - contextLength / 2; int a; for (a = contextLength / 2; a < sampleEnd; a++) { // Make sure the context begins on a real nucleotide. while (a < sampleEnd and ((RefChar[alignmentArray[a]] == ' '))) { a++; } // // Move ab back to an index where there are contextLength/2 non-gap // characters, counted by nb // int ab; //num bases int ae; //alignment end ab = a - 1; int nb = 0, ne = 0; while (true) { if (RefChar[alignmentArray[ab]] != ' ') { nb++; } if (ab == 0 or nb == static_cast<int>(contextLength) / 2) break; ab--; } // // Advance ae to an index where there are contextLength/2 non-gap // characters, counted by ne. // ae = a + 1; while (ae < static_cast<int>(alignmentArray.size()) and ne < static_cast<int>(contextLength) / 2) { if (RefChar[alignmentArray[ae]] != ' ') { ne++; } ae++; } // // Make sure there are no edge effects that prevent a context of the correct length from being assigned. // if (nb + ne + 1 != static_cast<int>(contextLength)) { continue; } int ai; std::string context; for (ai = ab; ai < ae; ai++) { if (RefChar[alignmentArray[ai]] != ' ') { context.push_back(RefChar[alignmentArray[ai]]); } } assert(context.size() == contextLength); // // Now create the context. // OutputSample sample; // // This context is a deletion, create that. // sample.type = OutputSample::Deletion; // // This context is either an insertion or substitution // // Look to see if the previous aligned position was an // insertion, and move back as far as the insertion extends. int aq = a - 1; int sampleLength; if (QueryChar[alignmentArray[a]] == ' ') { sample.type = OutputSample::Deletion; sampleLength = 0; } else if (RefChar[alignmentArray[aq]] == ' ') { while (aq > 0 and RefChar[alignmentArray[aq]] == ' ' and QueryChar[alignmentArray[aq]] != ' ') { aq--; } sample.type = OutputSample::Insertion; sampleLength = a - aq; } else if (QueryChar[alignmentArray[a]] == RefChar[alignmentArray[aq]]) { sample.type = OutputSample::Match; sampleLength = 1; } else { sample.type = OutputSample::Substitution; sampleLength = 1; } sample.Resize(sampleLength); if (sampleLength > 0) { int seqPos = alignmentToBaseMap[aq]; if (seqPos < static_cast<int>(read.length)) { sample.CopyFromSeq(read, seqPos, sampleLength); std::string nucs; for (size_t n = 0; n < sample.nucleotides.size(); n++) { char c = sample.nucleotides[n]; assert(c == 'A' or c == 'T' or c == 'G' or c == 'C'); nucs.push_back(sample.nucleotides[n]); } } } samples.AppendOutputSample(context, sample); } read.Free(); } if (onlyMaxLength) { std::map<std::string, ScoredLength>::iterator maxScoreIt; for (maxScoreIt = maxLengthMap.begin(); maxScoreIt != maxLengthMap.end(); ++maxScoreIt) { std::cout << maxScoreIt->second.length << std::endl; samples.lengths.push_back(maxScoreIt->second.length); } } samples.Write(sampleOut); return 0; }
void ImportReadFromCmpH5(int alignmentIndex, SMRTSequence &read) { CmpAlignment cmpAlignment; alnInfoGroup.ReadCmpAlignment(alignmentIndex, cmpAlignment); // // Cache some stats about the read, and where it was aligned to. // int queryStart = cmpAlignment.GetQueryStart(); int queryEnd = cmpAlignment.GetQueryEnd(); read.holeNumber = cmpAlignment.GetHoleNumber(); int refGroupId = cmpAlignment.GetRefGroupId(); int alnGroupId = cmpAlignment.GetAlnGroupId(); int refGroupIndex = refGroupIdToArrayIndex[refGroupId]; if (alnGroupIdToReadGroupName.find(alnGroupId) == alnGroupIdToReadGroupName.end()) { cout << "INTERNAL ERROR! Could not find read group name for alignment " << "group with Id " << alnGroupId << "." << endl; assert(0); } string readGroupName = alnGroupIdToReadGroupName[alnGroupId]; if (refAlignGroups[refGroupIndex]->experimentNameToIndex.find(readGroupName) == refAlignGroups[refGroupIndex]->experimentNameToIndex.end()) { cout << "Internal ERROR! The read group name " << readGroupName << " is specified as part of " << " the path in alignment " << alignmentIndex << " though it does not exist in the ref align group specified for this alignment." << endl; assert(0); } int readGroupIndex = refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName]; HDFCmpExperimentGroup* expGroup = refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; int offsetBegin = cmpAlignment.GetOffsetBegin(); int offsetEnd = cmpAlignment.GetOffsetEnd(); int alignedSequenceLength = offsetEnd - offsetBegin; string alignedSequence; string readSequence; vector<unsigned char> byteAlignment; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the alignment string. All alignments // refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); // // Convert to something we can compare easily. // ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); // // Initialize the sequence of the read. // RemoveGaps(alignedSequence, readSequence); // // Make space for the sequence and all fields. // read.length = readSequence.size(); read.Allocate(read.length); memcpy(read.seq, readSequence.c_str(), readSequence.size() * sizeof(char)); vector<int> baseToAlignmentMap; CreateSequenceToAlignmentMap(byteAlignment, baseToAlignmentMap); // // Read in the quality values // vector<unsigned char> storedQVArray; vector<UChar> qvValues; vector<HalfWord> frameValues; int length = offsetEnd - offsetBegin; qvValues.resize(length); frameValues.resize(length); int i; if (expGroup->experimentGroup.ContainsObject("QualityValue")) { expGroup->qualityValue.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.qual.data[0]); int i; for (i= 0; i < read.length; i++) { assert(read.qual[i] < 100); } } if (expGroup->experimentGroup.ContainsObject("InsertionQV")) { expGroup->insertionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.insertionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("SubstitutionQV")) { expGroup->substitutionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.substitutionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("DeletionQV")) { expGroup->deletionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.deletionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("DeletionTag")) { vector<char> deletionTagValues; deletionTagValues.resize(offsetEnd-offsetBegin); expGroup->deletionTag.Read(offsetBegin, offsetEnd, &deletionTagValues[0]); StoreQualityValueFromAlignment(deletionTagValues, baseToAlignmentMap, read.deletionTag); } if (expGroup->experimentGroup.ContainsObject("SubstitutionTag")) { vector<char> substitutionTagValues; substitutionTagValues.resize(offsetEnd-offsetBegin); expGroup->substitutionTag.Read(offsetBegin, offsetEnd, &substitutionTagValues[0]); StoreQualityValueFromAlignment(substitutionTagValues, baseToAlignmentMap, read.substitutionTag); } if (expGroup->experimentGroup.ContainsObject("PulseIndex")) { vector<uint32_t> pulseIndexValues; pulseIndexValues.resize(offsetEnd-offsetBegin); expGroup->pulseIndex.Read(offsetBegin, offsetEnd, &pulseIndexValues[0]); StoreQualityValueFromAlignment(pulseIndexValues, baseToAlignmentMap, read.pulseIndex); } if (expGroup->experimentGroup.ContainsObject("PreBaseFrames")) { expGroup->preBaseFrames.Read(offsetBegin, offsetEnd, &frameValues[0]); StoreQualityValueFromAlignment(frameValues, baseToAlignmentMap, read.preBaseFrames); } if (expGroup->experimentGroup.ContainsObject("WidthInFrames")) { expGroup->widthInFrames.Read(offsetBegin, offsetEnd, &frameValues[0]); StoreQualityValueFromAlignment(frameValues, baseToAlignmentMap, read.widthInFrames); } }
void ReadAlignment(int alignmentIndex, AlignmentCandidate<FASTASequence, FASTASequence> &alignment) { CmpAlignment cmpAln; ReadAlignment(alignmentIndex, cmpAln); string refSequence; string readSequence; readSequence.resize(cmpAln.alignmentArray.size()); refSequence.resize(cmpAln.alignmentArray.size()); ByteAlignmentToQueryString(&cmpAln.alignmentArray[0], cmpAln.alignmentArray.size(), &readSequence[0]); ByteAlignmentToRefString(&cmpAln.alignmentArray[0], cmpAln.alignmentArray.size(), &refSequence[0]); string ungappedRead, ungappedRef; RemoveGaps(readSequence, ungappedRead); RemoveGaps(refSequence, ungappedRef); GappedStringsToAlignment(readSequence, refSequence, alignment); FASTASequence qAlignedSeq, rAlignedSeq; qAlignedSeq.seq = (Nucleotide*) &ungappedRead[0]; qAlignedSeq.length = ungappedRead.size(); rAlignedSeq.seq = (Nucleotide*) &ungappedRef[0]; rAlignedSeq.length = ungappedRef.size(); alignment.tAlignedSeq.Copy(rAlignedSeq); alignment.qAlignedSeq.Copy(qAlignedSeq); unsigned int qStart = cmpAln.GetQueryStart(); unsigned int tStart = cmpAln.GetRefStart(); alignment.tPos = cmpAln.GetRefStart(); alignment.qPos = cmpAln.GetQueryStart(); alignment.nIns = cmpAln.GetNInsertions(); alignment.nDel = cmpAln.GetNDeletions(); alignment.nMatch = cmpAln.GetNMatch(); alignment.nMismatch=cmpAln.GetNMismatch(); alignment.qStrand= 0; alignment.tStrand = cmpAln.GetTStrand(); alignment.pctSimilarity = ((float)alignment.nMatch) / (alignment.nMatch + alignment.nMismatch + alignment.nIns + alignment.nDel); alignment.mapQV = cmpAln.GetMapQV(); }