void LodExtract::writeSequences(const Genome* inParent, const vector<const Genome*>& inChildren) { vector<const Genome*> inGenomes = inChildren; inGenomes.push_back(inParent); const Genome* outParent = _outAlignment->openGenome(inParent->getName()); (void)outParent; assert(outParent != NULL && outParent->getNumBottomSegments() > 0); string buffer; for (hal_size_t i = 0; i < inGenomes.size(); ++i) { const Genome* inGenome = inGenomes[i]; Genome* outGenome = _outAlignment->openGenome(inGenome->getName()); if (inGenome == inParent || outGenome->getNumChildren() == 0) { SequenceIteratorConstPtr inSeqIt = inGenome->getSequenceIterator(); SequenceIteratorConstPtr end = inGenome->getSequenceEndIterator(); for (; inSeqIt != end; inSeqIt->toNext()) { const Sequence* inSequence = inSeqIt->getSequence(); if (inSequence->getSequenceLength() > 0) { Sequence* outSequence = outGenome->getSequence(inSequence->getName()); assert(outSequence != NULL); inSequence->getString(buffer); outSequence->setString(buffer); } } } } }
// Build a (sequence name, number of top segments) entry for each sequence
// of this genome and hand the list to dest->updateTopDimensions().
//
// dest - genome whose top dimensions are updated
void Genome::copyTopDimensions(Genome *dest) const
{
  vector<Sequence::UpdateInfo> topDims;

  SequenceIteratorConstPtr cur = getSequenceIterator();
  SequenceIteratorConstPtr stop = getSequenceEndIterator();
  while (cur != stop)
  {
    const Sequence* seq = cur->getSequence();

    // progressiveCactus creates 0-length sequences in ancestors,
    // which are not usually extractable and aren't important, so an
    // empty sequence that dest does not have is left out
    const bool skip = seq->getSequenceLength() == 0 &&
                      dest->getSequence(seq->getName()) == NULL;
    if (!skip)
    {
      topDims.push_back(Sequence::UpdateInfo(seq->getName(),
                                             seq->getNumTopSegments()));
    }
    cur->toNext();
  }

  dest->updateTopDimensions(topDims);
}
// Describe every sequence of this genome (name, length, top and bottom
// segment counts) and pass the list to dest->setDimensions().  The top
// count is reported as 0 when this genome has no parent name in its
// alignment, and the bottom count as 0 when it has no child names.
//
// dest - genome that receives the dimensions
void Genome::copyDimensions(Genome *dest) const
{
  vector<Sequence::Info> seqInfos;
  const Alignment *alignment = getAlignment();
  SequenceIteratorConstPtr cur = getSequenceIterator();
  SequenceIteratorConstPtr stop = getSequenceEndIterator();

  const bool isRoot = alignment->getParentName(getName()).empty();
  const bool isLeaf = alignment->getChildNames(getName()).empty();

  while (cur != stop)
  {
    const Sequence* seq = cur->getSequence();

    hal_size_t numTop = 0;
    if (!isRoot)
    {
      numTop = seq->getNumTopSegments();
    }
    hal_size_t numBottom = 0;
    if (!isLeaf)
    {
      numBottom = seq->getNumBottomSegments();
    }

    seqInfos.push_back(Sequence::Info(seq->getName(),
                                      seq->getSequenceLength(),
                                      numTop,
                                      numBottom));
    cur->toNext();
  }

  dest->setDimensions(seqInfos);
}
// Write one tab-separated line "<name> 0 <length>" per sequence of the
// named genome to os, followed by an empty line (written with endl).
//
// os         - stream that receives the output
// alignment  - alignment in which the genome is opened
// genomeName - name of the genome to report on
//
// Throws hal_exception when the genome cannot be opened.
void printBedSequenceStats(ostream& os, AlignmentConstPtr alignment,
                           const string& genomeName)
{
  const Genome* genome = alignment->openGenome(genomeName);
  if (genome == NULL)
  {
    throw hal_exception("Genome " + genomeName + " not found.");
  }

  if (genome->getNumSequences() > 0)
  {
    SequenceIteratorConstPtr cur = genome->getSequenceIterator();
    SequenceIteratorConstPtr stop = genome->getSequenceEndIterator();
    while (!cur->equals(stop))
    {
      const Sequence* seq = cur->getSequence();
      os << seq->getName() << "\t" << 0 << "\t";
      os << seq->getSequenceLength() << "\n";
      cur->toNext();
    }
  }

  os << endl;
}
void LodExtract::writeDimensions( const map<const Sequence*, hal_size_t>& segmentCounts, const string& parentName, const vector<string>& childNames) { // initialize a dimensions list for each (input) genome map<const Genome*, vector<Sequence::Info> > dimMap; map<const Genome*, vector<Sequence::Info> >::iterator dimMapIt; vector<string> newGenomeNames = childNames; newGenomeNames.push_back(parentName); for (size_t i = 0; i < newGenomeNames.size(); ++i) { const Genome* inGenome = _inAlignment->openGenome(newGenomeNames[i]); pair<const Genome*, vector<Sequence::Info> > newEntry; newEntry.first = inGenome; // it's important we keep the sequences in the output genome // in the same order as the sequences in the input genome since // we always use global coordinates! SequenceIteratorConstPtr seqIt = inGenome->getSequenceIterator(); SequenceIteratorConstPtr seqEnd = inGenome->getSequenceEndIterator(); for (; seqIt != seqEnd; seqIt->toNext()) { const Sequence* inSequence = seqIt->getSequence(); map<const Sequence*, hal_size_t>::const_iterator segMapIt; segMapIt = segmentCounts.find(inSequence); // we skip empty sequences for now with below check if (segMapIt != segmentCounts.end()) { vector<Sequence::Info>& segDims = newEntry.second; hal_size_t nTop = inGenome->getName() == parentName ? 0 : segMapIt->second; hal_size_t nBot = inGenome->getName() != parentName ? 
0 : segMapIt->second; segDims.push_back(Sequence::Info(inSequence->getName(), inSequence->getSequenceLength(), nTop, nBot)); } } // note potential bug here for genome with no data dimMap.insert(newEntry); } // now that we have the dimensions for each genome, update them in // the output alignment for (dimMapIt = dimMap.begin(); dimMapIt != dimMap.end(); ++dimMapIt) { Genome* newGenome = _outAlignment->openGenome(dimMapIt->first->getName()); assert(newGenome != NULL); vector<Sequence::Info>& segDims = dimMapIt->second; // ROOT if (newGenome->getName() == _outAlignment->getRootName()) { assert(newGenome->getName() == parentName); newGenome->setDimensions(segDims, _keepSequences); } // LEAF else if (newGenome->getName() != parentName) { newGenome->setDimensions(segDims, _keepSequences); } // INTERNAL NODE else { vector<Sequence::UpdateInfo> updateInfo; for (size_t i = 0; i < segDims.size(); ++i) { updateInfo.push_back( Sequence::UpdateInfo(segDims[i]._name, segDims[i]._numBottomSegments)); } newGenome->updateBottomDimensions(updateInfo); } } }
void hal::validateGenome(const Genome* genome) { // first we check the sequence coverage hal_size_t totalTop = 0; hal_size_t totalBottom = 0; hal_size_t totalLength = 0; SequenceIteratorConstPtr seqIt = genome->getSequenceIterator(); SequenceIteratorConstPtr seqEnd = genome->getSequenceEndIterator(); for (; seqIt != seqEnd; seqIt->toNext()) { const Sequence* sequence = seqIt->getSequence(); validateSequence(sequence); totalTop += sequence->getNumTopSegments(); totalBottom += sequence->getNumBottomSegments(); totalLength += sequence->getSequenceLength(); // make sure it doesn't overlap any other sequences; if (sequence->getSequenceLength() > 0) { const Sequence* s1 = genome->getSequenceBySite(sequence->getStartPosition()); if (s1 == NULL || s1->getName() != sequence->getName()) { stringstream ss; ss << "Sequence " << sequence->getName() << " has a bad overlap in " << genome->getName(); throw hal_exception(ss.str()); } const Sequence* s2 = genome->getSequenceBySite(sequence->getStartPosition() + sequence->getSequenceLength() - 1); if (s2 == NULL || s2->getName() != sequence->getName()) { stringstream ss; ss << "Sequence " << sequence->getName() << " has a bad overlap in " << genome->getName(); throw hal_exception(ss.str()); } } } hal_size_t genomeLength = genome->getSequenceLength(); hal_size_t genomeTop = genome->getNumTopSegments(); hal_size_t genomeBottom = genome->getNumBottomSegments(); if (genomeLength != totalLength) { stringstream ss; ss << "Problem: genome has length " << genomeLength << "But sequences total " << totalLength; throw hal_exception(ss.str()); } if (genomeTop != totalTop) { stringstream ss; ss << "Problem: genome has " << genomeTop << " top segments but " << "sequences have " << totalTop << " top segments"; throw ss.str(); } if (genomeBottom != totalBottom) { stringstream ss; ss << "Problem: genome has " << genomeBottom << " bottom segments but " << "sequences have " << totalBottom << " bottom segments"; throw hal_exception(ss.str()); } if 
(genomeLength > 0 && genomeTop == 0 && genomeBottom == 0) { stringstream ss; ss << "Problem: genome " << genome->getName() << " has length " << genomeLength << "but no segments"; throw hal_exception(ss.str()); } validateDuplications(genome); }
// Copy the bottom segments of this genome into dest, sequence by sequence.
// For each segment the start coordinate is translated from this genome's
// sequence to the same-named sequence in dest, the length and top parse
// index are copied, and the child index / reversed flag of every child
// that also exists in dest (matched by name) are copied to that child's
// slot in dest.
//
// dest - genome that receives the segments; asserted to have the same
//        total number of bottom segments as this genome
//
// Throws hal_exception when a sequence that has bottom segments is missing
// from dest, when the per-sequence bottom segment counts differ, or when a
// translated start position does not fall inside the expected destination
// sequence.
void Genome::copyBottomSegments(Genome *dest) const
{
  assert(getNumBottomSegments() == dest->getNumBottomSegments());
  const hal_size_t inNc = getNumChildren();
  const hal_size_t outNc = dest->getNumChildren();

  // The child indices aren't consistent across files--make sure each bottom
  // segment points to the correct children.  inChildToOutChild[i] is the
  // index in dest of this genome's i-th child, or outNc when dest has no
  // child with that name.
  vector<string> outChildNames;
  for (hal_size_t outChild = 0; outChild < outNc; ++outChild)
  {
    outChildNames.push_back(dest->getChild(outChild)->getName());
  }
  vector<hal_size_t> inChildToOutChild(inNc, outNc);
  for (hal_size_t inChild = 0; inChild < inNc; ++inChild)
  {
    const string inChildName = getChild(inChild)->getName();
    for (hal_size_t outChild = 0; outChild < outNc; ++outChild)
    {
      if (inChildName == outChildNames[outChild])
      {
        inChildToOutChild[inChild] = outChild;
        break;
      }
    }
  }

  // Go through each sequence in this genome, find the matching
  // sequence in the dest genome, then copy over the segments for each
  // sequence.
  SequenceIteratorConstPtr seqIt = getSequenceIterator();
  SequenceIteratorConstPtr seqEndIt = getSequenceEndIterator();
  for (; seqIt != seqEndIt; seqIt->toNext())
  {
    const Sequence *inSeq = seqIt->getSequence();
    const Sequence *outSeq = dest->getSequence(inSeq->getName());
    if (outSeq == NULL)
    {
      if (inSeq->getNumBottomSegments() == 0)
      {
        // nothing to copy for this sequence, so its absence is harmless
        continue;
      }
      stringstream ss;
      ss << "When copying bottom segments: sequence " << inSeq->getName()
         << " of genome " << getName() << " was not found in genome "
         << dest->getName();
      throw hal_exception(ss.str());
    }

    BottomSegmentIteratorPtr inBot = inSeq->getBottomSegmentIterator();
    BottomSegmentIteratorPtr outBot = outSeq->getBottomSegmentIterator();

    if (inSeq->getName() != outSeq->getName())
    {
      // This check is important enough that it can't be an assert.
      stringstream ss;
      ss << "When copying bottom segments: segment #"
         << inBot->getArrayIndex() << " of source genome is from sequence "
         << inBot->getSequence()->getName() << ", but segment #"
         << outBot->getArrayIndex() << " is from sequence "
         << outBot->getSequence()->getName();
      throw hal_exception(ss.str());
    }

    if (inSeq->getNumBottomSegments() != outSeq->getNumBottomSegments())
    {
      stringstream ss;
      ss << "When copying bottom segments: sequence " << inSeq->getName()
         << " has " << inSeq->getNumBottomSegments() << " in genome "
         << getName() << ", while it has " << outSeq->getNumBottomSegments()
         << " in genome " << dest->getName();
      throw hal_exception(ss.str());
    }

    // one past the array index of the last bottom segment of inSeq
    const hal_index_t inSegmentEnd =
      inSeq->getBottomSegmentArrayIndex() + inSeq->getNumBottomSegments();

    for (; inBot->getArrayIndex() < inSegmentEnd;
         inBot->toRight(), outBot->toRight())
    {
      // same offset within the sequence, expressed in dest's coordinates
      const hal_index_t outStartPosition = inBot->getStartPosition() -
        inSeq->getStartPosition() + outSeq->getStartPosition();

      const Sequence *siteSeq = dest->getSequenceBySite(outStartPosition);
      if (siteSeq != outSeq)
      {
        stringstream ss;
        ss << "When copying bottom segments from " << getName() << " to "
           << dest->getName() << ": expected destination sequence "
           << outSeq->getName() << " for segment # "
           << inBot->getArrayIndex() << " but got "
           << (siteSeq != NULL ? siteSeq->getName() : string("no sequence"));
        throw hal_exception(ss.str());
      }

      outBot->setCoordinates(outStartPosition, inBot->getLength());
      for (hal_size_t inChild = 0; inChild < inNc; ++inChild)
      {
        const hal_size_t outChild = inChildToOutChild[inChild];
        if (outChild != outNc)
        {
          outBot->setChildIndex(outChild, inBot->getChildIndex(inChild));
          outBot->setChildReversed(outChild,
                                   inBot->getChildReversed(inChild));
        }
      }
      outBot->setTopParseIndex(inBot->getTopParseIndex());
    }
  }
}