/**
 * \brief Stores one fragment of a template into the fragment buffer.
 *
 * Initializes the buffer record addressed by the fragment's cluster id and read index,
 * records the forward-strand reference position, stores BCL and CIGAR data and finally
 * assigns the fragment header. For paired templates the header also receives the storage
 * bin of the mate; that bin is 0 when the fragment is a no-match.
 *
 * \param bamTemplate   template the fragment belongs to (at most two fragments)
 * \param fragmentIndex index of the fragment within bamTemplate
 * \param barcodeIdx    barcode index forwarded to the fragment header
 */
void FragmentCollector::add(
    const alignment::BamTemplate &bamTemplate,
    const unsigned fragmentIndex,
    const unsigned barcodeIdx)
{
    ISAAC_ASSERT_MSG(2 >= bamTemplate.getFragmentCount(), "Expected paired or single-ended data");

    const alignment::FragmentMetadata &fragment = bamTemplate.getFragmentMetadata(fragmentIndex);
    ISAAC_THREAD_CERR_DEV_TRACE_CLUSTER_ID(fragment.getCluster().getId(), "FragmentCollector::add: " << fragment);

    FragmentBuffer::IndexRecord &record =
        buffer_.initialize(fragment.getCluster().getId(), fragment.getReadIndex());
    record.fStrandPos_ = fragment.getFStrandReferencePosition();
    storeBclAndCigar(fragment, record);

    // single-ended data: the header carries no mate information
    if (2 != bamTemplate.getFragmentCount())
    {
        record.fragmentHeader() = io::FragmentHeader(bamTemplate, fragment, barcodeIdx);
        return;
    }

    const alignment::FragmentMetadata &mate = bamTemplate.getMateFragmentMetadata(fragment);

    // a no-match fragment keeps mate bin 0; otherwise the bin is looked up from the mate position
    unsigned mateStorageBin = 0;
    if (fragment.isNoMatch())
    {
        ISAAC_ASSERT_MSG(mate.isNoMatch(), "If mate is not a no-match, fragment must be a shadow. fragment: " << fragment << " mate:" << mate);
    }
    else
    {
        mateStorageBin = binIndexMap_.getBinIndex(mate.getFStrandReferencePosition());
    }

    record.fragmentHeader() = io::FragmentHeader(bamTemplate, fragment, mate, barcodeIdx, mateStorageBin);
}
/**
 * \brief Builds the k-mer position lookup table for an adapter sequence.
 *
 * Every slot of kmerPositions_ starts as UNINITIALIZED_POSITION. Each k-mer produced by
 * the generator either records the offset reported for it (first occurrence) or is
 * marked NON_UNIQUE_KMER_POSITION (any later occurrence).
 *
 * \param adapterMetadata description of the adapter; its sequence must be shorter than
 *                        the maximum value of char and, unless the adapter is unbounded,
 *                        no longer than its clip length
 */
SequencingAdapter::SequencingAdapter(const flowcell::SequencingAdapterMetadata &adapterMetadata) :
    adapterMetadata_(adapterMetadata),
    kmerPositions_(oligo::getMaxKmer<unsigned>(adapterMatchBasesMin_) + 1, char(UNINITIALIZED_POSITION))
{
    ISAAC_ASSERT_MSG(adapterMetadata_.getSequence().size() < unsigned(std::numeric_limits<char>::max()), "Adapter sequence is too long");
    ISAAC_ASSERT_MSG((adapterMetadata_.isUnbounded() || adapterMetadata_.getSequence().size() <= adapterMetadata_.getClipLength()), "Clip length cannot be shorter than the adapter sequence");

    const std::string &adapterSequence = adapterMetadata_.getSequence();

    oligo::KmerGenerator<unsigned short, std::string::const_iterator> kmerGenerator(
        adapterSequence.begin(), adapterSequence.end(), adapterMatchBasesMin_);

    std::string::const_iterator kmerPosition = adapterSequence.begin();
    for (unsigned short kmer = 0; kmerGenerator.next(kmer, kmerPosition);)
    {
        char &slot = kmerPositions_.at(kmer);
        if (UNINITIALIZED_POSITION == slot)
        {
            // first time this k-mer is seen: remember where
            slot = std::distance(adapterSequence.begin(), kmerPosition);
        }
        else
        {
            // seen before (or already flagged): the k-mer cannot be used as a unique anchor
            slot = NON_UNIQUE_KMER_POSITION;
        }
    }
}
/**
 * \brief Formats the path of a per-cycle bcl.bgzf file (or its .bci index) for a lane.
 *
 * The result is assembled from baseCallsPath, a lane folder component formatted as
 * separator + "L" + zero-padded 3-digit lane, and a file name component formatted as
 * separator + zero-padded 4-digit cycle + ".bcl.bgzf" (".bcl.bgzf.bci" when bci is set).
 *
 * \param baseCallsPath path of the BaseCalls folder
 * \param lane          lane number, must not exceed bclBgzf::LANE_NUMBER_MAX
 * \param cycle         cycle number, must not exceed bclBgzf::CYCLE_NUMBER_MAX
 * \param bci           true to produce the .bci file name instead of the .bcl.bgzf one
 * \param result        receives the formatted path
 */
static void getBclBgzfCycleFilePath(
    const boost::filesystem::path &baseCallsPath,
    const unsigned lane,
    const unsigned cycle,
    const bool bci,
    boost::filesystem::path &result)
{
    ISAAC_ASSERT_MSG(lane <= bclBgzf::LANE_NUMBER_MAX, "Lane number " << lane << " must not exceed " << bclBgzf::LANE_NUMBER_MAX);
    ISAAC_ASSERT_MSG(cycle <= bclBgzf::CYCLE_NUMBER_MAX, "Cycle number " << cycle << " must not exceed " << bclBgzf::CYCLE_NUMBER_MAX);

    // Warning: the fixed-size buffers below are there to avoid memory allocations during path formatting.
    // The result is expected to be pre-sized, else allocations will occur as usual.
    // lane and cycle are unsigned, hence the %u conversions.
    char laneFolder[100];
    snprintf(laneFolder, sizeof(laneFolder), "%cL%03u", common::getDirectorySeparatorChar(), lane);

    char bclFileName[100];
    snprintf(bclFileName, sizeof(bclFileName),
             bci ? "%c%04u.bcl.bgzf.bci" : "%c%04u.bcl.bgzf",
             common::getDirectorySeparatorChar(), cycle);

    // An earlier variant wrote directly into the path's internal std::string (via const_cast on
    // result.string()) because the boost 1.46 path append instantiates an std::string. That hack
    // is not used here; plain path assignment and append are.
    result = baseCallsPath.c_str();
    result /= laneFolder;
    result /= bclFileName;
}
/**
 * \brief Counts, per read and per tile, the clusters for which seeds will still be loaded.
 *
 * The outer dimension of the result is indexed by read index and has readMetadataList.size()
 * entries; the inner dimension is indexed by tile index and has (index of the last
 * unprocessed tile + 1) entries. Entries for tiles that are not in unprocessedTiles stay 0.
 *
 * \param unprocessedTiles    non-empty list of tiles, expected to be ordered by index
 * \param barcodeMetadataList barcodes forwarded to willLoadSeeds
 * \param readMetadataList    reads to produce counts for; this list both sizes the result and
 *                            drives the iteration so the two cannot disagree
 * \param foundMatches        per-tile cluster information; each tile entry must have as many
 *                            elements as the tile has clusters
 * \return counts indexed as [readIndex][tileIndex]
 */
const std::vector<std::vector<unsigned > > SeedMemoryManager<KmerT>::getNotFoundMatchesCount(
    const flowcell::TileMetadataList &unprocessedTiles,
    const flowcell::BarcodeMetadataList &barcodeMetadataList,
    const ReadMetadataList &readMetadataList,
    const matchFinder::TileClusterInfo &foundMatches) const
{
    // front()/back() below are undefined on an empty list
    ISAAC_ASSERT_MSG(!unprocessedTiles.empty(), "Expected at least one unprocessed tile");
    ISAAC_ASSERT_MSG(unprocessedTiles.front().getIndex() <= unprocessedTiles.back().getIndex(), "Expected tiles ordered by index");

    std::vector<std::vector<unsigned > > ret(
        readMetadataList.size(),
        std::vector<unsigned>(unprocessedTiles.back().getIndex() + 1));

    // iterate the same list that sized ret (previously the member readMetadataList_ was used here)
    BOOST_FOREACH(const flowcell::ReadMetadata &readMetadata, readMetadataList)
    {
        const unsigned readIndex = readMetadata.getIndex();
        BOOST_FOREACH(const flowcell::TileMetadata &tileMetadata, unprocessedTiles)
        {
            const unsigned tileIndex = tileMetadata.getIndex();
            const std::vector<matchFinder::ClusterInfo> &oneTileInfo = foundMatches.at(tileIndex);
            // match only clusters where no matches were found so far
            ISAAC_ASSERT_MSG(oneTileInfo.size() == tileMetadata.getClusterCount(), "allTiles and foundMatches geometries must match");
            ret.at(readIndex).at(tileIndex) =
                std::count_if(oneTileInfo.begin(), oneTileInfo.end(),
                              boost::bind(&willLoadSeeds, boost::ref(barcodeMetadataList), _1, readIndex));
        }
    }

    return ret;
}
/**
 * \brief Appends the clusters of one tile to bclData: transposed BCL data, pf flags and,
 *        when bclData stores positions, x/y positions.
 *
 * \param tileMetadata     tile being loaded; its cluster count sizes the new cluster block
 * \param bclData          destination cluster storage
 * \param useLocsPositions true to read positions through locsMapper_, false to use clocsMapper_
 */
void BclBaseCallsSource::bclToClusters(
    const flowcell::TileMetadata &tileMetadata,
    alignment::BclClusters &bclData,
    const bool useLocsPositions) const
{
    // --- base calls ---
    ISAAC_THREAD_CERR << "Transposing Bcl data for " << tileMetadata.getClusterCount() << " bcl clusters" << std::endl;
    const clock_t transposeStarted = clock();
    bclMapper_.transpose(bclData.addMoreClusters(tileMetadata.getClusterCount()));
    ISAAC_THREAD_CERR << "Transposing Bcl data done for " << bclData.getClusterCount() << " bcl clusters in " << (clock() - transposeStarted) / 1000 << "ms" << std::endl;

    // --- pass-filter flags ---
    ISAAC_THREAD_CERR << "Extracting Pf values for " << tileMetadata.getClusterCount() << " bcl clusters" << std::endl;
    // gcc 4.4 has trouble figuring out which assignment implementation to use with back insert iterators
    filtersMapper_.getPf(std::back_inserter(bclData.pf()));
    ISAAC_ASSERT_MSG(bclData.pf().size() == bclData.getClusterCount(), "Mismatch between data " << bclData.getClusterCount() << " and pf " << bclData.pf().size() << "counts");
    ISAAC_THREAD_CERR << "Extracting Pf values done for " << bclData.getClusterCount() << " bcl clusters" << std::endl;

    // --- positions (optional) ---
    if (!bclData.storeXy())
    {
        return;
    }

    ISAAC_THREAD_CERR << "Extracting Positions values for " << tileMetadata.getClusterCount() << " bcl clusters" << std::endl;
    if (useLocsPositions)
    {
        locsMapper_.getPositions(std::back_inserter(bclData.xy()));
    }
    else
    {
        clocsMapper_.getPositions(std::back_inserter(bclData.xy()));
    }
    ISAAC_ASSERT_MSG(bclData.xy().size() == bclData.getClusterCount(), "Mismatch between data " << bclData.getClusterCount() << " and position " << bclData.xy().size() << "counts");
    ISAAC_THREAD_CERR << "Extracting Positions values done for " << bclData.getClusterCount() << " bcl clusters" << std::endl;
}
/**
 * \brief Extends the seeds of one tile by the base call of one cycle.
 *
 * Maps the tile's cycle data, then, for every seed in [cycleSeedsBegin, cycleSeedsEnd) and
 * every cluster of the tile, appends the cluster's base to the next destination seed of the
 * cluster's reference. Clusters whose barcode has no mapped reference or whose read is
 * already complete are skipped, as are seeds already marked as N-seeds. A base whose quality
 * is below seedBaseQualityMin_ turns the seed into an N-seed.
 *
 * \param tileClusterBarcode per-tile cluster information; indexed by tile.getIndex()
 * \param threadBclMapper    mapper used to load and read the cycle's base calls
 * \param destinationBegins  per-reference output iterators; advanced for every stored seed
 * \param tile               tile being loaded
 * \param cycle              cycle being loaded
 * \param cycleSeedsBegin    first seed using this cycle (range must not be empty)
 * \param cycleSeedsEnd      end of the seeds using this cycle; all must belong to one read
 */
void ParallelSeedLoader<ReaderT, KmerT>::loadTileCycle(
    const matchFinder::TileClusterInfo &tileClusterBarcode,
    rta::SingleCycleBclMapper<ReaderT> &threadBclMapper,
    std::vector<typename std::vector<Seed<KmerT> >::iterator> &destinationBegins,
    const flowcell::TileMetadata &tile,
    const unsigned cycle,
    std::vector<SeedMetadata>::const_iterator cycleSeedsBegin,
    const std::vector<SeedMetadata>::const_iterator cycleSeedsEnd)
{
    ISAAC_ASSERT_MSG(cycleSeedsEnd > cycleSeedsBegin, "Seed list cannot be empty");
    ISAAC_ASSERT_MSG((cycleSeedsEnd -1)->getReadIndex() == cycleSeedsBegin->getReadIndex(), "All seeds must belong to the same read");

    const unsigned readIndex = cycleSeedsBegin->getReadIndex();
    const std::vector<matchFinder::ClusterInfo> &clustersToDiscard = tileClusterBarcode.at(tile.getIndex());
    ISAAC_ASSERT_MSG(tile.getClusterCount() == clustersToDiscard.size(), "Found matches from a wrong tile/read");

    threadBclMapper.mapTileCycle(BaseT::flowcellLayout_, tile, cycle);

    for (; cycleSeedsEnd != cycleSeedsBegin; ++cycleSeedsBegin)
    {
        for (unsigned int clusterId = 0; tile.getClusterCount() > clusterId; ++clusterId)
        {
            const matchFinder::ClusterInfo &clusterInfo = clustersToDiscard.at(clusterId);
            const unsigned barcodeIndex = clusterInfo.getBarcodeIndex();
            const unsigned referenceIndex = BaseT::barcodeMetadataList_.at(barcodeIndex).getReferenceIndex();

            // the base is read for every cluster, including the ones skipped below
            char base = 0;
            threadBclMapper.get(clusterId, &base);

            if (flowcell::BarcodeMetadata::UNMAPPED_REFERENCE_INDEX == referenceIndex ||
                clusterInfo.isReadComplete(readIndex))
            {
                continue;
            }

            // the destination slot is consumed even if the seed turns out to be an N-seed
            Seed<KmerT> &seed = *destinationBegins[referenceIndex]++;

            // skip those previously found to contain Ns
            if (seed.isNSeed())
            {
                continue;
            }

            if (oligo::getQuality(base) >= seedBaseQualityMin_)
            {
                KmerT kmer = seed.getKmer();
                const KmerT baseValue(base & oligo::BITS_PER_BASE_MASK);
                kmer <<= oligo::BITS_PER_BASE;
                kmer |= baseValue;
                seed = Seed<KmerT>(kmer, SeedId(tile.getIndex(), barcodeIndex, clusterId, cycleSeedsBegin->getIndex(), 0));
            }
            else
            {
                // we can't have holes. The Ns must be stored in such a way that
                // they will be easy to remove later (after sorting)
                seed = makeNSeed<KmerT>(tile.getIndex(), barcodeIndex, clusterId, 0 == cycleSeedsBegin->getIndex());
            }
        }
    }
}
/**
 * \brief Captures the metadata lists the manager works with.
 *
 * Initializes barcodeMetadataList_, readMetadataList_ and seedMetadataList_ from the
 * arguments and default-initializes notFoundMatchesCount_. Asserts that neither the read
 * list nor the seed list is empty.
 *
 * \param barcodeMetadataList barcodes
 * \param readMetadataList    reads; must not be empty
 * \param seedMetadataList    seeds; must not be empty
 * \param allTiles            not used by this constructor body
 */
SeedMemoryManager<KmerT>::SeedMemoryManager(
    const flowcell::BarcodeMetadataList &barcodeMetadataList,
    const ReadMetadataList &readMetadataList,
    const SeedMetadataList &seedMetadataList,
    const flowcell::TileMetadataList &allTiles
    )
    : barcodeMetadataList_(barcodeMetadataList)
    , readMetadataList_(readMetadataList)
    , seedMetadataList_(seedMetadataList)
    , notFoundMatchesCount_()
{
    ISAAC_ASSERT_MSG(!readMetadataList_.empty(), "Empty readMetadataList is not allowed");
    ISAAC_ASSERT_MSG(!seedMetadataList_.empty(), "Empty seedMetadataList is not allowed");
}
/**
 * \brief Produces the lane .bci file path for a bcl-bgzf flowcell.
 *
 * Asserts that the layout format is BclBgzf and delegates to getLaneBciFilePath with the
 * layout's base calls path.
 *
 * \param lane   lane number
 * \param result receives the path
 */
void Layout::getLaneAttribute<Layout::BclBgzf, BciFilePathAttributeTag>(
    const unsigned lane,
    boost::filesystem::path &result) const
{
    ISAAC_ASSERT_MSG(BclBgzf == format_, BciFilePathAttributeTag() << " is only allowed for bcl-bgzf flowcells");

    getLaneBciFilePath(getBaseCallsPath(), lane, result);
}
/**
 * \brief Formats the path of the cluster positions (.locs) file.
 *
 * For patterned flowcells the result is baseCallsPath / ".." / "s.locs" and the lane is not
 * used. Otherwise the result is assembled from baseCallsPath, a lane folder component
 * formatted as separator + ".." + separator + "L" + zero-padded 3-digit lane, and a file
 * name component formatted as separator + "s_" + lane + ".locs".
 *
 * \param baseCallsPath     path of the BaseCalls folder
 * \param patternedFlowcell true to produce the single shared s.locs path
 * \param lane              lane number; must not exceed bclBgzf::LANE_NUMBER_MAX when used
 * \param result            receives the formatted path
 */
static void getPositionsFilePath(
    const boost::filesystem::path &baseCallsPath,
    const bool patternedFlowcell,
    const unsigned lane,
    boost::filesystem::path &result)
{
    if (patternedFlowcell)
    {
        result = baseCallsPath.c_str();
        result /= "..";
        result /= "s.locs";
        return;
    }

    ISAAC_ASSERT_MSG(lane <= bclBgzf::LANE_NUMBER_MAX, "Lane number " << lane << " must not exceed " << bclBgzf::LANE_NUMBER_MAX);

    // Warning: the fixed-size buffers below are there to avoid memory allocations during path formatting.
    // The result is expected to be pre-sized, else allocations will occur as usual.
    // snprintf bounds every write to the buffer size; lane is unsigned, hence %u.
    char laneFolder[100];
    // assuming Intensities folder is one level above BaseCalls folder
    snprintf(laneFolder, sizeof(laneFolder), "%c..%cL%03u",
             common::getDirectorySeparatorChar(), common::getDirectorySeparatorChar(), lane);

    char locsFileName[100];
    snprintf(locsFileName, sizeof(locsFileName), "%cs_%u.locs", common::getDirectorySeparatorChar(), lane);

    // The boost 1.46 path append instantiates an std::string; writing into the path's internal
    // string to avoid that was considered but is not done here.
    result = baseCallsPath.c_str();
    result /= laneFolder;
    result /= locsFileName;
}
unsigned UngappedAligner::alignUngapped( FragmentMetadata &fragmentMetadata, Cigar &cigarBuffer, const flowcell::ReadMetadata &readMetadata, const matchSelector::FragmentSequencingAdapterClipper &adapterClipper, const reference::ContigList &contigList, const isaac::reference::ContigAnnotations &contigAnnotations) const { const unsigned cigarOffset = cigarBuffer.size(); // Don't reset alignment to preserve the seed-based anchors. // fragmentMetadata.resetAlignment(); ISAAC_ASSERT_MSG(!fragmentMetadata.isAligned(), "alignUngapped is expected to be performend on a clean fragment"); fragmentMetadata.resetClipping(); const reference::Contig &contig = contigList[fragmentMetadata.contigId]; const Read &read = fragmentMetadata.getRead(); const bool reverse = fragmentMetadata.reverse; const std::vector<char> &sequence = read.getStrandSequence(reverse); const reference::Contig &reference = contig; std::vector<char>::const_iterator sequenceBegin = sequence.begin(); std::vector<char>::const_iterator sequenceEnd = sequence.end(); adapterClipper.clip(contig, fragmentMetadata, sequenceBegin, sequenceEnd); clipReadMasking(read, fragmentMetadata, sequenceBegin, sequenceEnd); clipReference(reference.size(), fragmentMetadata, sequenceBegin, sequenceEnd); const unsigned firstMappedBaseOffset = std::distance(sequence.begin(), sequenceBegin); if (firstMappedBaseOffset) { cigarBuffer.addOperation(firstMappedBaseOffset, Cigar::SOFT_CLIP); } const unsigned mappedBases = std::distance(sequenceBegin, sequenceEnd); if (mappedBases) { const Cigar::OpCode opCode = Cigar::ALIGN; cigarBuffer.addOperation(mappedBases, opCode); } const unsigned clipEndBases = std::distance(sequenceEnd, sequence.end()); if (clipEndBases) { cigarBuffer.addOperation(clipEndBases, Cigar::SOFT_CLIP); } const unsigned ret = updateFragmentCigar( readMetadata, contigList, contigAnnotations, fragmentMetadata, fragmentMetadata.reverse, fragmentMetadata.contigId, fragmentMetadata.position, cigarBuffer, cigarOffset); if (!ret) 
{ fragmentMetadata.setUnaligned(); } return ret; }
/**
 * \brief Returns the maximum number of tiles per lane of a bcl-bgzf flowcell.
 *
 * Asserts that the layout format is BclBgzf.
 *
 * \param result not written; the returned reference refers to the value held in
 *               formatSpecificData_
 * \return reference to the stored tilesPerLaneMax_
 */
const unsigned& Layout::getAttribute<Layout::BclBgzf, TilesPerLaneMaxAttributeTag>(
    unsigned &result) const
{
    ISAAC_ASSERT_MSG(BclBgzf == format_, TilesPerLaneMaxAttributeTag() << " is only allowed for bcl-bgzf flowcells");

    return boost::get<BclFlowcellData>(formatSpecificData_).tilesPerLaneMax_;
}
/**
 * \brief Produces the positions file path for a lane of a bcl-bgzf flowcell.
 *
 * Asserts that the layout format is BclBgzf and delegates to flowcell::getPositionsFilePath
 * with the layout's base calls path and the stored patterned-flowcell flag.
 *
 * \param lane   lane number
 * \param result receives the path
 */
void Layout::getLaneAttribute<Layout::BclBgzf, PositionsFilePathAttributeTag>(
    const unsigned lane,
    boost::filesystem::path &result) const
{
    ISAAC_ASSERT_MSG(BclBgzf == format_, PositionsFilePathAttributeTag() << " is only allowed for bcl-bgzf flowcells");

    const BclFlowcellData &bclFlowcellData = boost::get<BclFlowcellData>(formatSpecificData_);
    flowcell::getPositionsFilePath(getBaseCallsPath(), bclFlowcellData.patternedFlowcell_, lane, result);
}
/**
 * \brief Tells whether a bcl-bgzf flowcell is patterned.
 *
 * Asserts that the layout format is BclBgzf.
 *
 * \param result not written; the returned reference refers to the value held in
 *               formatSpecificData_
 * \return reference to the stored patternedFlowcell_ flag
 */
const bool &Layout::getAttribute<Layout::BclBgzf, PatternedFlowcellAttributeTag>(
    bool &result) const
{
    ISAAC_ASSERT_MSG(BclBgzf == format_, PatternedFlowcellAttributeTag() << " is only allowed for bcl-bgzf flowcells");

    return boost::get<BclFlowcellData>(formatSpecificData_).patternedFlowcell_;
}
/**
 * \brief Reads sorted reference metadata from xml.
 *
 * Reads the FormatVersion element and throws xml::XmlReaderException when it is outside
 * [OLDEST_SUPPORTED_REFERENCE_FORMAT_VERSION, CURRENT_REFERENCE_FORMAT_VERSION]. Then reads
 * the optional SoftwareVersion, Contigs and Permutations elements; only the permutation
 * named "ABCD" is accepted. defaultMaskWidth_ is taken from the first mask file, or 0 when
 * there are none. On success formatVersion_ is set to the current version.
 *
 * \param reader                  xml reader positioned before the SortedReference element
 * \param sortedReferenceMetadata object to fill
 * \param version                 must equal CURRENT_REFERENCE_FORMAT_VERSION
 */
void serialize<xml::XmlReader>(
    xml::XmlReader &reader,
    SortedReferenceMetadata &sortedReferenceMetadata,
    const unsigned int version)
{
    ISAAC_ASSERT_MSG(version == SortedReferenceMetadata::CURRENT_REFERENCE_FORMAT_VERSION, "Unexpected version requested: " << version);

    SortedReferenceMetadata &metadata = sortedReferenceMetadata;

    metadata.formatVersion_ =
        (reader+="SortedReference").nextChildElement("FormatVersion").readElementText();

    if (SortedReferenceMetadata::CURRENT_REFERENCE_FORMAT_VERSION < metadata.formatVersion_ ||
        SortedReferenceMetadata::OLDEST_SUPPORTED_REFERENCE_FORMAT_VERSION > metadata.formatVersion_)
    {
        BOOST_THROW_EXCEPTION(xml::XmlReaderException(
            (boost::format("Unexpected sorted reference FormatVersion: %s. FormatVersion must be in range [%d,%d]") %
                reader.getValue().string() %
                SortedReferenceMetadata::OLDEST_SUPPORTED_REFERENCE_FORMAT_VERSION %
                SortedReferenceMetadata::CURRENT_REFERENCE_FORMAT_VERSION).str()));
    }

    // SoftwareVersion is optional for older xml files
    if (reader++.checkName("SoftwareVersion"))
    {
        reader++;
    }

    // Contigs may not be present
    if (reader.checkName("Contigs"))
    {
        serialize(reader, metadata.contigs_, version);
        // advance if possible
        ++reader;
    }

    // Permutations may not be present
    if (reader && reader.checkName("Permutations"))
    {
        // only ABCD permutation is supported
        reader += "Permutation";
        if (reader["Name"] != "ABCD")
        {
            BOOST_THROW_EXCEPTION(xml::XmlReaderException(std::string("Only ABCD permutation masks are supported")));
        }
        serialize(reader, metadata.maskFiles_, version);
    }

    if (metadata.maskFiles_.empty())
    {
        metadata.defaultMaskWidth_ = 0;
    }
    else
    {
        metadata.defaultMaskWidth_ = metadata.maskFiles_.begin()->second.at(0).maskWidth;
    }

    // As we were able to successfully read the file, bump format version up to the current to avoid confusion
    // when stored or merged
    metadata.formatVersion_ = SortedReferenceMetadata::CURRENT_REFERENCE_FORMAT_VERSION;
}
/**
 * \brief Returns the lane shared by all tiles of the list.
 *
 * Asserts that no tile reports a lane different from the first tile's lane.
 *
 * \param tiles tiles to inspect
 * \return lane of the first tile
 */
// NOTE(review): tiles.front() is evaluated without an emptiness check - confirm callers never pass an empty list
inline unsigned getLaneNumber(const flowcell::TileMetadataList& tiles)
{
    ISAAC_ASSERT_MSG(
        tiles.end() == std::find_if(
            tiles.begin(), tiles.end(),
            boost::bind(&flowcell::TileMetadata::getLane, _1) != tiles.front().getLane()),
        "Expected all tiles to belong to the same lane");
    return tiles.front().getLane();
}
/**
 * \brief Runs the match-finding transition with the k-mer type matching seedLength_.
 *
 * Seed lengths 16, 32 and 64 select oligo::ShortKmerType, oligo::KmerType and
 * oligo::LongKmerType respectively; any other length fails an assertion.
 *
 * \param foundMatches receives the metadata of the matches found
 */
void AlignWorkflow::findMatches(alignWorkflow::FoundMatchesMetadata &foundMatches) const
{
    alignWorkflow::FindMatchesTransition findMatchesTransition(
        flowcellLayoutList_,
        barcodeMetadataList_,
        allowVariableFastqLength_,
        cleanupIntermediary_,
        ignoreMissingBcls_,
        firstPassSeeds_,
        availableMemory_,
        clustersAtATimeMax_,
        tempDirectory_,
        demultiplexingStatsXmlPath_,
        coresMax_,
        repeatThreshold_,
        neighborhoodSizeThreshold_,
        ignoreNeighbors_,
        ignoreRepeats_,
        inputLoadersMax_,
        tempSaversMax_,
        memoryControl_,
        clusterIdList_,
        sortedReferenceMetadataList_);

    switch (seedLength_)
    {
    case 16:
        findMatchesTransition.perform<isaac::oligo::ShortKmerType>(foundMatches);
        break;
    case 32:
        findMatchesTransition.perform<oligo::KmerType>(foundMatches);
        break;
    case 64:
        findMatchesTransition.perform<oligo::LongKmerType>(foundMatches);
        break;
    default:
        ISAAC_ASSERT_MSG(false, "Unexpected seed length " << seedLength_);
        break;
    }
}
/**
 * \brief Decodes the alignment position encoded in a read name.
 *
 * An empty name yields a TooManyMatch position. A name starting with 'u' fails an assertion
 * (the code after it returns NoMatch). Otherwise the text starting two characters into the
 * name is parsed with atol as the first constructor argument, the text following the first
 * ':' found from there as the second, and the position is marked reverse when the name
 * starts with 'r'.
 *
 * \param readNumber 1-based read number
 * \param fragment   fragment whose cluster name is decoded
 * \return the decoded reference position
 */
reference::ReferencePosition getAlignmentPositionFromName(const std::size_t readNumber, const FragmentMetadata &fragment)
{
    // numbers are 1-based
    const auto name = getReadName(readNumber - 1, fragment);

    if (name.second == name.first)
    {
        return reference::ReferencePosition(reference::ReferencePosition::TooManyMatch);
    }

    if ('u' == *name.first)
    {
        ISAAC_ASSERT_MSG(false, common::makeFastIoString(fragment.getCluster().nameBegin(), fragment.getCluster().nameEnd()) << " " << fragment);
        return reference::ReferencePosition(reference::ReferencePosition::NoMatch);
    }

    // skip the strand character and the character following it
    const auto coordinatesBegin = name.first + 2;
    const auto colon = std::find(coordinatesBegin, name.second, ':');

    const long contigId = std::atol(&*coordinatesBegin);
    const long position = std::atol(&*colon + 1);
    const bool reverse = 'r' == *name.first;

    return reference::ReferencePosition(contigId, position, false, reverse);
}
/**
 * \brief Formats the path of the per-lane .filter file.
 *
 * The result string is baseCallsPath followed by separator + "L" + zero-padded 3-digit lane
 * and separator + "s_" + lane + ".filter".
 *
 * \param baseCallsPath path of the BaseCalls folder
 * \param lane          lane number, must not exceed bclBgzf::LANE_NUMBER_MAX
 * \param result        receives the formatted path
 */
static void getFiltersFilePath(
    const boost::filesystem::path &baseCallsPath,
    const unsigned lane,
    boost::filesystem::path &result)
{
    ISAAC_ASSERT_MSG(lane <= bclBgzf::LANE_NUMBER_MAX, "Lane number " << lane << " must not exceed " << bclBgzf::LANE_NUMBER_MAX);

    // Warning: all this mad code below is to avoid memory allocations during path formatting.
    // The result is expected to be pre-sized, else allocations will occur as usual.
    // snprintf bounds every write to the buffer size; lane is unsigned, hence %u.
    char laneFolder[100];
    snprintf(laneFolder, sizeof(laneFolder), "%cL%03u", common::getDirectorySeparatorChar(), lane);

    char filterFileName[100];
    snprintf(filterFileName, sizeof(filterFileName), "%cs_%u.filter", common::getDirectorySeparatorChar(), lane);

    // boost 1.46 implementation of filesystem::path is coded to instantiate an std::string
    // when doing append. Therefore have to jump through hoops to prevent memory allocations from happening
    // NOTE(review): this relies on result.string() returning a reference to the path's own
    // storage - confirm for the boost version and platform in use.
    std::string &pathInternalStringRef = const_cast<std::string&>(result.string());
    pathInternalStringRef = baseCallsPath.c_str();
    pathInternalStringRef.append(laneFolder).append(filterFileName);
}
void ExtractNeighborsWorkflow::run() { const reference::SortedReferenceMetadata::MaskFiles &maskFiles = xml_.getMaskFileList(oligo::KmerTraits<KmerT>::KMER_BASES); if (maskFiles.empty()) { BOOST_THROW_EXCEPTION(isaac::common::PreConditionException("No mask files in " + sortedReferenceMetadata_.string())); } const reference::SortedReferenceMetadata::Contigs contigs = xml_.getContigs(); const std::vector<uint64_t> contigOffsets = reference::computeContigOffsets(contigs); std::vector<unsigned> karyotypes; karyotypes.reserve(xml_.getContigs().size()); std::transform(xml_.getContigs().begin(), xml_.getContigs().end(), std::back_inserter(karyotypes), boost::bind(&reference::SortedReferenceMetadata::Contig::karyotypeIndex_, _1)); std::vector<bool> neighbors(reference::genomeLength(contigs), false); std::vector<bool> highRepeats(highRepeatsFilePath_.empty() ? 0 : reference::genomeLength(contigs), true); // there could be multiple mask widths in the xml. Just fail if there are. unsigned maskWidth = -1U; BOOST_FOREACH(const reference::SortedReferenceMetadata::MaskFile &maskFile, maskFiles) { if (-1U == maskWidth) { maskWidth = maskFile.maskWidth; } ISAAC_ASSERT_MSG(maskWidth == maskFile.maskWidth, "Mixed mask widths are not supported"); scanMaskFile<KmerT>(maskFile, contigOffsets, karyotypes, neighbors, highRepeats); } dumpResults(neighbors, highRepeats); }
/**
 * \brief Formats a sub-range of this CIGAR as a string.
 *
 * \param offset index of the first operation to format
 * \param length number of operations to format
 * \return the string produced by the iterator-range overload of toString for
 *         [begin() + offset, begin() + offset + length)
 */
std::string Cigar::toString(const unsigned offset, const unsigned length) const
{
    // checked without computing offset + length, which could wrap around in unsigned arithmetic
    ISAAC_ASSERT_MSG(this->size() >= offset && this->size() - offset >= length, "Requested end is outside of cigarBuffer");
    return toString(begin() + offset, begin() + offset + length);
}
/**
 * \brief Formats a sub-range of a CIGAR buffer as a string.
 *
 * \param cigarBuffer buffer holding the encoded operations
 * \param offset      index of the first operation to format
 * \param length      number of operations to format
 * \return the string produced by the iterator-range overload of toString for
 *         [cigarBuffer.begin() + offset, cigarBuffer.begin() + offset + length)
 */
std::string Cigar::toString(const std::vector<uint32_t> &cigarBuffer, unsigned offset, unsigned length)
{
    // checked without computing offset + length, which could wrap around in unsigned arithmetic
    ISAAC_ASSERT_MSG(cigarBuffer.size() >= offset && cigarBuffer.size() - offset >= length, "Requested end is outside of cigarBuffer");
    return toString(cigarBuffer.begin() + offset, cigarBuffer.begin() + offset + length);
}