void BaseFile::CopyReadAt(uint32_t readIndex, SMRTSequence &read) { assert(holeNumbers.size() > readIndex); read.HoleNumber(holeNumbers[readIndex]); if (holeXY.size() > 0) { assert(holeXY.size() > readIndex); read.HoleXY(holeXY[readIndex].xy[0], holeXY[readIndex].xy[1]); } DSLength startPos = readStartPositions[readIndex]; DNALength readLength = readLengths[readIndex]; read.length = readLength; read.Allocate(readLength); if (baseCalls.size() > 0) { assert(baseCalls.size() >= readLength + startPos); CopyArray(baseCalls, startPos, readLength, read.seq); } if (qualityValues.size() > 0) { assert(qualityValues.size() >= readLength + startPos); CopyArray(qualityValues, startPos, readLength, read.qual.data); } if (basWidthInFrames.size() > 0) { assert(basWidthInFrames.size() >= readLength + startPos); CopyArray(basWidthInFrames, startPos, readLength, read.widthInFrames); } if (deletionQV.size() > 0) { assert(deletionQV.size() >= readLength + startPos); CopyArray(deletionQV, startPos, readLength, read.deletionQV.data); } if (deletionTag.size() > 0) { assert(deletionTag.size() >= readLength + startPos); CopyArray(deletionTag, startPos, readLength, read.deletionTag); } if (insertionQV.size() > 0) { assert(insertionQV.size() >= readLength + startPos); CopyArray(insertionQV, startPos, readLength, read.insertionQV.data); } if (substitutionQV.size() > 0) { assert(substitutionQV.size() >= readLength + startPos); CopyArray(substitutionQV, startPos, readLength, read.substitutionQV.data); } if (mergeQV.size() > 0) { assert(mergeQV.size() >= readLength + startPos); CopyArray(mergeQV, startPos, readLength, read.mergeQV.data); } if (substitutionTag.size() > 0) { assert(substitutionTag.size() >= readLength + startPos); CopyArray(substitutionTag, startPos, readLength, read.substitutionTag); } if (preBaseFrames.size() > 0) { assert(preBaseFrames.size() >= readLength + startPos); CopyArray(preBaseFrames, startPos, readLength, read.preBaseFrames); } }
//
// Build a "virtual" read in smrtRead from a non-empty set of subreads.
//
// The virtual read is allocated with hqEnd bases, where hqEnd is the largest
// SubreadEnd() among the subreads.  It is filled with 'N' and each subread's
// bases are then copied in at that subread's SubreadStart() offset.
//
// lowQualityPrefix is set to the smallest SubreadStart() among the subreads.
// The high quality region score, hole number and title ("<movie>/<hole>")
// are taken from the first subread.
//
// smrtRead is Free()d first, so any previous content is discarded.
//
void MakeVirtualRead(SMRTSequence & smrtRead, const vector<SMRTSequence> & subreads)
{
    assert(subreads.size() > 0);

    //
    // Find the extent covered by the subreads.  hqStart is seeded from the
    // first subread: seeding it with 0 would make the min() below a no-op and
    // pin the low quality prefix at 0 no matter where the subreads begin.
    //
    // NOTE(review): the subreads are iterated by value, as before, because
    // the const-ness of SubreadStart()/SubreadEnd() is not visible here.  If
    // they are const members, iterating by const reference avoids a copy.
    //
    DNALength hqStart = 0, hqEnd = 0;
    bool firstSubread = true;
    for (auto subread : subreads) {
        const DNALength subreadStart = DNALength(subread.SubreadStart());
        const DNALength subreadEnd = DNALength(subread.SubreadEnd());
        hqStart = firstSubread ? subreadStart : min(subreadStart, hqStart);
        hqEnd = max(subreadEnd, hqEnd);
        firstSubread = false;
    }

    // Start from a read of hqEnd unknown bases.
    smrtRead.Free();
    smrtRead.Allocate(hqEnd);
    memset(smrtRead.seq, 'N', sizeof(char) * hqEnd);

    smrtRead.lowQualityPrefix = hqStart;
    smrtRead.lowQualitySuffix = smrtRead.length - hqEnd;
    smrtRead.highQualityRegionScore = subreads[0].highQualityRegionScore;
    smrtRead.HoleNumber(subreads[0].HoleNumber());

    // The title is "<movie name>/<hole number>" of the first subread.
    stringstream ss;
    ss << SMRTTitle(subreads[0].GetTitle()).MovieName() << "/" << subreads[0].HoleNumber();
    smrtRead.CopyTitle(ss.str());

    // Overlay the bases of every subread at its own offset.
    for (auto subread : subreads) {
        // The copy must stay inside the hqEnd bases allocated above.
        assert(DNALength(subread.SubreadStart()) + subread.length <= hqEnd);
        memcpy(&smrtRead.seq[subread.SubreadStart()], &subread.seq[0], sizeof(char) * subread.length);
    }
}
void ImportReadFromCmpH5(int alignmentIndex, SMRTSequence &read) { CmpAlignment cmpAlignment; alnInfoGroup.ReadCmpAlignment(alignmentIndex, cmpAlignment); // // Cache some stats about the read, and where it was aligned to. // int queryStart = cmpAlignment.GetQueryStart(); int queryEnd = cmpAlignment.GetQueryEnd(); read.holeNumber = cmpAlignment.GetHoleNumber(); int refGroupId = cmpAlignment.GetRefGroupId(); int alnGroupId = cmpAlignment.GetAlnGroupId(); int refGroupIndex = refGroupIdToArrayIndex[refGroupId]; if (alnGroupIdToReadGroupName.find(alnGroupId) == alnGroupIdToReadGroupName.end()) { cout << "INTERNAL ERROR! Could not find read group name for alignment " << "group with Id " << alnGroupId << "." << endl; assert(0); } string readGroupName = alnGroupIdToReadGroupName[alnGroupId]; if (refAlignGroups[refGroupIndex]->experimentNameToIndex.find(readGroupName) == refAlignGroups[refGroupIndex]->experimentNameToIndex.end()) { cout << "Internal ERROR! The read group name " << readGroupName << " is specified as part of " << " the path in alignment " << alignmentIndex << " though it does not exist in the ref align group specified for this alignment." << endl; assert(0); } int readGroupIndex = refAlignGroups[refGroupIndex]->experimentNameToIndex[readGroupName]; HDFCmpExperimentGroup* expGroup = refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]; int offsetBegin = cmpAlignment.GetOffsetBegin(); int offsetEnd = cmpAlignment.GetOffsetEnd(); int alignedSequenceLength = offsetEnd - offsetBegin; string alignedSequence; string readSequence; vector<unsigned char> byteAlignment; if (alignedSequenceLength >= 0) { alignedSequence.resize(alignedSequenceLength); byteAlignment.resize(alignedSequenceLength); } // // Read the alignment string. All alignments // refAlignGroups[refGroupIndex]->readGroups[readGroupIndex]->alignmentArray.Read(offsetBegin, offsetEnd, &byteAlignment[0]); // // Convert to something we can compare easily. 
// ByteAlignmentToQueryString(&byteAlignment[0], byteAlignment.size(), &alignedSequence[0]); // // Initialize the sequence of the read. // RemoveGaps(alignedSequence, readSequence); // // Make space for the sequence and all fields. // read.length = readSequence.size(); read.Allocate(read.length); memcpy(read.seq, readSequence.c_str(), readSequence.size() * sizeof(char)); vector<int> baseToAlignmentMap; CreateSequenceToAlignmentMap(byteAlignment, baseToAlignmentMap); // // Read in the quality values // vector<unsigned char> storedQVArray; vector<UChar> qvValues; vector<HalfWord> frameValues; int length = offsetEnd - offsetBegin; qvValues.resize(length); frameValues.resize(length); int i; if (expGroup->experimentGroup.ContainsObject("QualityValue")) { expGroup->qualityValue.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.qual.data[0]); int i; for (i= 0; i < read.length; i++) { assert(read.qual[i] < 100); } } if (expGroup->experimentGroup.ContainsObject("InsertionQV")) { expGroup->insertionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.insertionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("SubstitutionQV")) { expGroup->substitutionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.substitutionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("DeletionQV")) { expGroup->deletionQV.Read(offsetBegin, offsetEnd, &qvValues[0]); StoreQualityValueFromAlignment(qvValues, baseToAlignmentMap, &read.deletionQV.data[0]); } if (expGroup->experimentGroup.ContainsObject("DeletionTag")) { vector<char> deletionTagValues; deletionTagValues.resize(offsetEnd-offsetBegin); expGroup->deletionTag.Read(offsetBegin, offsetEnd, &deletionTagValues[0]); StoreQualityValueFromAlignment(deletionTagValues, baseToAlignmentMap, read.deletionTag); } if 
(expGroup->experimentGroup.ContainsObject("SubstitutionTag")) { vector<char> substitutionTagValues; substitutionTagValues.resize(offsetEnd-offsetBegin); expGroup->substitutionTag.Read(offsetBegin, offsetEnd, &substitutionTagValues[0]); StoreQualityValueFromAlignment(substitutionTagValues, baseToAlignmentMap, read.substitutionTag); } if (expGroup->experimentGroup.ContainsObject("PulseIndex")) { vector<uint32_t> pulseIndexValues; pulseIndexValues.resize(offsetEnd-offsetBegin); expGroup->pulseIndex.Read(offsetBegin, offsetEnd, &pulseIndexValues[0]); StoreQualityValueFromAlignment(pulseIndexValues, baseToAlignmentMap, read.pulseIndex); } if (expGroup->experimentGroup.ContainsObject("PreBaseFrames")) { expGroup->preBaseFrames.Read(offsetBegin, offsetEnd, &frameValues[0]); StoreQualityValueFromAlignment(frameValues, baseToAlignmentMap, read.preBaseFrames); } if (expGroup->experimentGroup.ContainsObject("WidthInFrames")) { expGroup->widthInFrames.Read(offsetBegin, offsetEnd, &frameValues[0]); StoreQualityValueFromAlignment(frameValues, baseToAlignmentMap, read.widthInFrames); } }
int main(int argc, char* argv[]) { string refGenomeFileName = ""; string lengthModelFileName = ""; string outputModelFileName = ""; DNALength numBasesPerFile = 0; string sourceReadsFileName = ""; string titleTableFileName = ""; int numBasH5Files = 1; string basH5BaseFileName = "simulated"; string movieName = "m101211_092754_00114_cSIM_s1_p0"; bool doRandGenInit = true; bool usePosMap = false; bool printPercentRepeat = false; string posMapFileName = ""; vector<string> movieNames; bool useLengthModel = false; bool useFixedLength = false; ofstream posMapFile; int scaledLength = 0; int fixedLength = 0; int nBasFiles = 1; bool useLengthsModel = true; bool printHelp = false; // Look to see if the refAsReads flag is specified anywhere before // parsing the command line. CommandLineParser clp; string commandLine; string helpString; SetHelp(helpString); vector<string> fns; clp.RegisterStringOption("genome", &refGenomeFileName, ""); clp.RegisterIntOption("numBasesPerFile", (int*)&numBasesPerFile, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("sourceReads", &sourceReadsFileName, ""); clp.RegisterStringOption("lengthModel", &lengthModelFileName, ""); clp.RegisterIntOption("fixedLength", &fixedLength, "", CommandLineParser::PositiveInteger); clp.RegisterFlagOption("lengthModel", &useLengthModel, ""); clp.RegisterStringOption("movieName", &movieName, ""); clp.RegisterStringOption("titleTable", &titleTableFileName, ""); clp.RegisterStringOption("baseFileName", &basH5BaseFileName, ""); clp.RegisterIntOption("nFiles", &nBasFiles, "", CommandLineParser::PositiveInteger); clp.RegisterIntOption("meanLength", &scaledLength, "", CommandLineParser::PositiveInteger); clp.RegisterStringOption("posMap", &posMapFileName, ""); clp.RegisterFlagOption("printPercentRepeat", &printPercentRepeat, ""); clp.RegisterFlagOption("h", &printHelp, ""); clp.SetHelp(helpString); clp.ParseCommandLine(argc, argv, fns); clp.CommandLineToString(argc, argv, commandLine); 
clp.SetProgramName("alchemy"); outputModelFileName = fns[0]; if (argc <= 1 or printHelp or outputModelFileName == "") { cout << helpString << endl; exit(0); } if (usePosMap) { CrucialOpen(posMapFileName, posMapFile, std::ios::out); } if (sourceReadsFileName == "" and fixedLength == 0) { useLengthModel = true; } if (useLengthModel and fixedLength != 0) { cout << "ERROR! You must either use a length model or a fixed length." << endl; exit(1); } if (sourceReadsFileName == "" and numBasesPerFile == 0) { cout << "ERROR! You must specify either a set of read to use as " << endl << "original reads for simulation or the total number of bases " << endl << "to simulate in each bas.h5 file." << endl; exit(1); } if (sourceReadsFileName == "" and refGenomeFileName == "") { cout << "ERROR! You must specify a genome to sample reads from or a set of read "<<endl << "to use as original reads for simulation." << endl; exit(1); } if (fixedLength != 0 and refGenomeFileName == "") { cout << "ERROR! You must specify a genome file if using a fixed length." << endl; exit(1); } if ((fixedLength != 0 or scaledLength != 0) and sourceReadsFileName != "") { cout << "ERROR! You cannot specify a fixed length nor mean length with a source " << endl << "reads file. The read lengths are taken from the source reads or the length model." << endl; exit(1); } LengthHistogram lengthHistogram; OutputSampleListSet outputModel(0); TitleTable titleTable; if (doRandGenInit) { InitializeRandomGeneratorWithTime(); } // // Read models. // if (titleTableFileName != "") { titleTable.Read(titleTableFileName); } outputModel.Read(outputModelFileName); if (useLengthModel) { lengthHistogram.BuildFromAlignmentLengths(outputModel.lengths); } vector<int> alignmentLengths; int meanAlignmentLength; if (scaledLength != 0 and useLengthModel) { // // Scale the histogram so that the average length is 'scaledLength'. // // 1. 
Integrate histogram long totalLength = 0; long totalSamples = 0; int hi; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size()-1; hi++) { int ni; ni = lengthHistogram.lengthHistogram.cdf[hi+1] - lengthHistogram.lengthHistogram.cdf[hi]; totalLength += ni * lengthHistogram.lengthHistogram.data[hi]; } totalSamples = lengthHistogram.lengthHistogram.cdf[lengthHistogram.lengthHistogram.cdf.size()-1]; float meanSampleLength = totalLength / (1.0*totalSamples); float fractionIncrease = scaledLength / meanSampleLength; for (hi = 0; hi < lengthHistogram.lengthHistogram.cdf.size(); hi++) { lengthHistogram.lengthHistogram.data[hi] *= fractionIncrease; } } FASTAReader inReader, seqReader; vector<FASTASequence> reference; DNALength refLength = 0; int i; if (refGenomeFileName != "") { inReader.Init(refGenomeFileName); inReader.ReadAllSequences(reference); for (i = 0; i < reference.size(); i++) { refLength += reference[i].length; } } if (sourceReadsFileName != "") { seqReader.Init(sourceReadsFileName); } ofstream readsFile; // // Create and simulate bas.h5 files. // int baseFileIndex; bool readsRemain = true; for (baseFileIndex = 0; ((sourceReadsFileName == "" and baseFileIndex < nBasFiles) // case 1 is reads are generated by file or (sourceReadsFileName != "" and readsRemain)); // case 2 is reads are generated by an input file. baseFileIndex++) { // // Prep the base file for writing. // stringstream fileNameStrm, movieNameStrm; //string movieName = "m000000_000000_00000_cSIMULATED_s"; movieNameStrm << movieName << baseFileIndex << "_p0"; string fullMovieName = movieNameStrm.str(); fileNameStrm << fullMovieName << ".bas.h5"; HDFBasWriter basWriter; HDFRegionTableWriter regionWriter; // // This is mainly used to create the atributes. // RegionTable regionTable; regionTable.CreateDefaultAttributes(); basWriter.SetPlatform(Springfield); // // Use a fixed set of fields for now. // // These are all pulled from the outputModel. 
basWriter.IncludeField("Basecall"); basWriter.IncludeField("QualityValue"); basWriter.IncludeField("SubstitutionQV"); basWriter.IncludeField("SubstitutionTag"); basWriter.IncludeField("InsertionQV"); basWriter.IncludeField("DeletionQV"); basWriter.IncludeField("DeletionTag"); basWriter.IncludeField("WidthInFrames"); basWriter.IncludeField("PreBaseFrames"); basWriter.IncludeField("PulseIndex"); vector<unsigned char> qualityValue, substitutionQV, substitutionTag, insertionQV, deletionQV, deletionTag; vector<HalfWord> widthInFrames, preBaseFrames, pulseIndex; // Just go from 0 .. hole Number basWriter.IncludeField("HoleNumber"); // Fixed to 0. basWriter.IncludeField("HoleXY"); if (usePosMap == false) { basWriter.IncludeField("SimulatedSequenceIndex"); basWriter.IncludeField("SimulatedCoordinate"); } basWriter.SetChangeListID("1.3.0.50.104380"); DNALength numSimulatedBases = 0; FASTASequence sampleSeq; //sampleSeq.length = readLength; int maxRetry = 10000000; int retryNumber = 0; int numReads = 0; int readLength = 0; while (numBasesPerFile == 0 or numSimulatedBases < numBasesPerFile) { DNALength seqIndex, seqPos; if (useLengthModel or fixedLength) { if (useLengthModel) { lengthHistogram.GetRandomLength(readLength); } else { readLength = fixedLength; } } if (refGenomeFileName != "") { FindRandomPos(reference, seqIndex, seqPos, readLength + (outputModel.keyLength - 1)); sampleSeq.seq = &reference[seqIndex].seq[seqPos]; sampleSeq.length = readLength + (outputModel.keyLength - 1); assert(reference[seqIndex].length >= sampleSeq.length); } else if (sourceReadsFileName != "") { if (seqReader.GetNext(sampleSeq) == false) { readsRemain = false; break; } if (sampleSeq.length < outputModel.keyLength) { continue; } // // Now attempt to parse the position from the fasta title. 
// if (useLengthModel) { int tryNumber = 0; readLength = 0; int maxNTries = 1000; int tryBuffer[5] = {-1,-1,-1,-1,-1}; while (tryNumber < maxNTries and readLength < outputModel.keyLength) { lengthHistogram.GetRandomLength(readLength); readLength = sampleSeq.length = min(sampleSeq.length, (unsigned int) readLength); tryBuffer[tryNumber%5] = readLength; tryNumber++; } if (tryNumber >= maxNTries) { cout << "ERROR. Could not generate a read length greater than the " << outputModel.keyLength << " requried " <<endl << "minimum number of bases using the length model specified in the alchemy." <<endl << "model. Something is either wrong with the model or the context length is too large." <<endl; cout << "The last few tries were: " << tryBuffer[0] << " " << tryBuffer[1] << " " << tryBuffer[2] << " " << tryBuffer[3] << " " << tryBuffer[4] << endl; exit(1); } } readLength = sampleSeq.length; vector<string> tokens; Tokenize(sampleSeq.title, "|", tokens); if (tokens.size() == 4) { seqPos = atoi(tokens[2].c_str()); if (titleTableFileName == "") { seqIndex = 0; } else { int index; titleTable.Lookup(tokens[1], index); seqIndex = index; } } else { seqPos = 0; } } // // If this is the first read printed to the base file, initialize it. // if (numSimulatedBases == 0) { basWriter.Initialize(fileNameStrm.str(), movieNameStrm.str(), Springfield); regionWriter.Initialize(basWriter.pulseDataGroup); } numSimulatedBases += readLength; int p; // create the sample sequence int contextLength = outputModel.keyLength; int contextMiddle = contextLength / 2; string outputString; int nDel = 0; int nIns = 0; // // Simulate to beyond the sample length. 
// qualityValue.clear(); substitutionQV.clear(); substitutionTag.clear(); insertionQV.clear(); deletionQV.clear(); deletionTag.clear(); pulseIndex.clear(); widthInFrames.clear(); preBaseFrames.clear(); assert(sampleSeq.length > contextMiddle + 1); for (p = contextMiddle; p < sampleSeq.length - contextMiddle - 1; p++) { string refContext; refContext.assign((const char*) &sampleSeq.seq[p-contextMiddle], contextLength); string outputContext; int contextWasFound; OutputSample sample; int i; for (i = 0; i < refContext.size(); i++) { refContext[i] = toupper(refContext[i]); } outputModel.SampleRandomSample(refContext, sample); if (sample.type == OutputSample::Deletion ) { // // There was a deletion. Advance in reference, then output // the base after the deletion. // p++; ++nDel; } int cp; // // Add the sampled context, possibly multiple characters because of an insertion. // for (i = 0; i < sample.nucleotides.size(); i++) { outputString.push_back(sample.nucleotides[i]); qualityValue.push_back(sample.qualities[i].qv[0]); deletionQV.push_back(sample.qualities[i].qv[1]); insertionQV.push_back(sample.qualities[i].qv[2]); substitutionQV.push_back(sample.qualities[i].qv[3]); deletionTag.push_back(sample.qualities[i].tags[0]); substitutionTag.push_back(sample.qualities[i].tags[1]); pulseIndex.push_back(sample.qualities[i].frameValues[0]); preBaseFrames.push_back(sample.qualities[i].frameValues[1]); widthInFrames.push_back(sample.qualities[i].frameValues[2]); } nIns += sample.qualities.size() - 1; } if (outputString.find('N') != outputString.npos or outputString.find('n') != outputString.npos) { cout << "WARNING! The sampled string " << endl << outputString << endl << "should not contain N's, but it seems to. This is being ignored "<<endl << "for now so that simulation may continue, but this shouldn't happen"<<endl << "and is really a bug." << endl; numSimulatedBases -= readLength; continue; } // // Ok, done creating the read, now time to create some quality values!!!!! 
// SMRTSequence read; read.length = outputString.size(); read.Allocate(read.length); memcpy(read.seq, outputString.c_str(), read.length * sizeof(unsigned char)); assert(qualityValue.size() == read.length * sizeof(unsigned char)); memcpy(read.qual.data, &qualityValue[0], read.length * sizeof(unsigned char)); memcpy(read.deletionQV.data, &deletionQV[0], read.length * sizeof(unsigned char)); memcpy(read.insertionQV.data, &insertionQV[0], read.length * sizeof(unsigned char)); memcpy(read.substitutionQV.data, &substitutionQV[0], read.length * sizeof(unsigned char)); memcpy(read.deletionTag, &deletionTag[0], read.length * sizeof(unsigned char)); memcpy(read.substitutionTag, &substitutionTag[0], read.length * sizeof(unsigned char)); memcpy(read.pulseIndex, &pulseIndex[0], read.length * sizeof(int)); memcpy(read.preBaseFrames, &preBaseFrames[0], read.length * sizeof(HalfWord)); memcpy(read.widthInFrames, &widthInFrames[0], read.length * sizeof(HalfWord)); // // The pulse index for now is just fake data. // int i; for (i = 0; i < read.length; i++) { read.pulseIndex[i] = 1; } read.xy[0] = seqIndex; read.xy[1] = seqPos; read.zmwData.holeNumber = numReads; basWriter.Write(read); // Record where this was simulated from. if (usePosMap == false) { basWriter.WriteSimulatedCoordinate(seqPos); basWriter.WriteSimulatedSequenceIndex(seqIndex); } else { posMapFile << fullMovieName << "/" << numReads << "/0_" << read.length << " " << seqIndex << " "<< seqPos; if (printPercentRepeat) { DNALength nRepeat = sampleSeq.GetRepeatContent(); posMapFile << " " << nRepeat*1.0/sampleSeq.length; } posMapFile << endl; } RegionAnnotation region; region.row[0] = read.zmwData.holeNumber; region.row[1] = 1; region.row[2] = 0; region.row[3] = read.length; region.row[4] = 1000; // Should be enough. regionWriter.Write(region); region.row[1] = 2; // Rewrite for hq region encompassing everything. 
regionWriter.Write(region); if (sourceReadsFileName != "") { sampleSeq.Free(); } read.Free(); ++numReads; } regionWriter.Finalize(regionTable.columnNames, regionTable.regionTypes, regionTable.regionDescriptions, regionTable.regionSources); basWriter.Close(); numReads = 0; // // The bas writer should automatically flush on closing. // } if (usePosMap) { posMapFile.close(); } for (i = 0; i < reference.size(); i++) { reference[i].Free(); } }