void hal::validateDuplications(const Genome* genome) { const Genome* parent = genome->getParent(); if (parent == NULL) { return; } TopSegmentIteratorConstPtr topIt = genome->getTopSegmentIterator(); TopSegmentIteratorConstPtr endIt = genome->getTopSegmentEndIterator(); vector<unsigned char> pcount(parent->getNumBottomSegments(), 0); for (; topIt != endIt; topIt->toRight()) { if (topIt->hasParent()) { if (pcount[topIt->getTopSegment()->getParentIndex()] < 250) { ++pcount[topIt->getTopSegment()->getParentIndex()]; } } } for (topIt = genome->getTopSegmentIterator(); topIt != endIt; topIt->toRight()) { if (topIt->hasParent()) { size_t count = pcount[topIt->getTopSegment()->getParentIndex()]; assert(count > 0); { if (topIt->hasNextParalogy() == false && count > 1) { stringstream ss; ss << "Top Segment " << topIt->getTopSegment()->getArrayIndex() << " in genome " << genome->getName() << " is not marked as a" << " duplication but it shares its parent " << topIt->getTopSegment()->getArrayIndex() << " with at least " << count - 1 << " other segments in the same genome"; throw hal_exception(ss.str()); } } } } }
// Copy the top segments of this genome into `dest`.
//
// `dest` must either have no top segments (in which case nothing is copied)
// or exactly as many as this genome (checked with assert). For every segment
// the coordinates, parent orientation, bottom parse index and next paralogy
// index are copied verbatim. The parent index is re-based: the parent bottom
// segment's sequence is looked up by name in dest's parent, and the segment's
// offset inside that sequence is preserved, so the copied index stays valid
// when the two parents store their sequences at different array offsets.
void Genome::copyTopSegments(Genome *dest) const
{
  const Genome *inParent = getParent();
  const Genome *outParent = dest->getParent();

  TopSegmentIteratorConstPtr inTop = getTopSegmentIterator();
  TopSegmentIteratorPtr outTop = dest->getTopSegmentIterator();
  hal_size_t n = dest->getNumTopSegments();
  assert(n == 0 || n == getNumTopSegments());

  if (n == 0)
  {
    // Nothing to do if there are no top segments.
    return;
  }

  // Both parents are dereferenced below.
  assert(inParent != NULL && outParent != NULL);
  BottomSegmentIteratorConstPtr inParentBottomSegIt =
    inParent->getBottomSegmentIterator();

  for (; (hal_size_t)inTop->getArrayIndex() < n;
       inTop->toRight(), outTop->toRight())
  {
    assert(inTop->getStartPosition() != NULL_INDEX);
    // NOTE(review): sequence names at this site are not compared between the
    // two genomes (that check is disabled), so no per-segment sequence-name
    // lookup is performed here.

    outTop->setCoordinates(inTop->getStartPosition(), inTop->getLength());
    outTop->setParentReversed(inTop->getParentReversed());
    outTop->setBottomParseIndex(inTop->getBottomParseIndex());
    outTop->setNextParalogyIndex(inTop->getNextParalogyIndex());

    // Segments without a parent keep NULL_INDEX. Otherwise translate the
    // index through the parent sequence's name, so that it points into the
    // same-named sequence of dest's parent at the same offset.
    hal_index_t parentIndex = inTop->getParentIndex();
    if (parentIndex != NULL_INDEX)
    {
      inParentBottomSegIt->toParent(inTop);
      const Sequence *inParentSequence = inParentBottomSegIt->getSequence();
      const Sequence *outParentSequence =
        outParent->getSequence(inParentSequence->getName());
      assert(outParentSequence != NULL);
      hal_index_t inParentSegmentOffset =
        parentIndex - inParentSequence->getBottomSegmentArrayIndex();
      parentIndex =
        inParentSegmentOffset + outParentSequence->getBottomSegmentArrayIndex();
    }
    outTop->setParentIndex(parentIndex);
  }
}
// Accumulate gap-insertion and per-base substitution statistics for one
// gapped top segment (which must not be reversed).
//
// If the gapped segment contains gaps, their total base count is added to
// stats._gapInsertionLength, weighted by the number of gaps. Then every top
// segment from the left end to the right end that has a parent is compared
// base-by-base with its parent bottom segment, and each column is tallied as
// a transition, a transversion, another substitution, or a match. Columns
// where either base is missing data and no substitution is detected are not
// counted.
void SummarizeMutations::subsAndGapInserts(
  GappedTopSegmentIteratorConstPtr gappedTop, MutationsStats& stats)
{
  assert(gappedTop->getReversed() == false);

  hal_size_t gapCount = gappedTop->getNumGaps();
  if (gapCount > 0)
  {
    stats._gapInsertionLength.add(gappedTop->getNumGapBases(), gapCount);
  }

  string parentBases;
  string childBases;
  TopSegmentIteratorConstPtr leftEnd = gappedTop->getLeft();
  TopSegmentIteratorConstPtr rightEnd = gappedTop->getRight();
  BottomSegmentIteratorConstPtr parentIt =
    leftEnd->getTopSegment()->getGenome()->getParent()
      ->getBottomSegmentIterator();

  // Walk a private copy so the gapped iterator's own endpoints stay put.
  TopSegmentIteratorConstPtr cur = leftEnd->copy();
  while (cur->getTopSegment()->getArrayIndex() <=
         rightEnd->getTopSegment()->getArrayIndex())
  {
    if (cur->hasParent())
    {
      parentIt->toParent(cur);
      cur->getString(childBases);
      parentIt->getString(parentBases);
      assert(childBases.length() == parentBases.length());

      for (size_t col = 0; col < childBases.length(); ++col)
      {
        const char c = childBases[col];
        const char a = parentBases[col];
        if (isTransition(c, a))
        {
          ++stats._transitions;
          ++stats._subs;
        }
        else if (isTransversion(c, a))
        {
          ++stats._transversions;
          ++stats._subs;
        }
        else if (isSubstitution(c, a))
        {
          ++stats._subs;
        }
        else if (!isMissingData(c) && !isMissingData(a))
        {
          ++stats._matches;
        }
      }
    }
    cur->toRight();
  }
}
// Advance `ts` with toRight() past parentless segments whose length does not
// exceed _gapThreshold. Stops on the first segment that has a parent or is
// longer than the threshold, or when the iterator sits on the end of the
// array in its direction of travel (last segment when not reversed, first
// segment when reversed).
void DefaultGappedBottomSegmentIterator::toRightNextUngapped(
  TopSegmentIteratorConstPtr ts) const
{
  for (;;)
  {
    if (ts->hasParent() || ts->getLength() > _gapThreshold)
    {
      return;
    }

    // Nothing further to step onto in the current orientation.
    const bool atEdge = ts->getReversed() ? ts->getTopSegment()->isFirst()
                                          : ts->getTopSegment()->isLast();
    if (atEdge)
    {
      return;
    }

    ts->toRight();
  }
}
void hal::validateSequence(const Sequence* sequence) { // Verify that the DNA sequence doesn't contain funny characters DNAIteratorConstPtr dnaIt = sequence->getDNAIterator(); hal_size_t length = sequence->getSequenceLength(); for (hal_size_t i = 0; i < length; ++i) { char c = dnaIt->getChar(); if (isNucleotide(c) == false) { stringstream ss; ss << "Non-nucleotide character discoverd at position " << i << " of sequence " << sequence->getName() << ": " << c; throw hal_exception(ss.str()); } } // Check the top segments if (sequence->getGenome()->getParent() != NULL) { hal_size_t totalTopLength = 0; TopSegmentIteratorConstPtr topIt = sequence->getTopSegmentIterator(); hal_size_t numTopSegments = sequence->getNumTopSegments(); for (hal_size_t i = 0; i < numTopSegments; ++i) { const TopSegment* topSegment = topIt->getTopSegment(); validateTopSegment(topSegment); totalTopLength += topSegment->getLength(); topIt->toRight(); } if (totalTopLength != length) { stringstream ss; ss << "Sequence " << sequence->getName() << " has length " << length << " but its top segments add up to " << totalTopLength; throw hal_exception(ss.str()); } } // Check the bottom segments if (sequence->getGenome()->getNumChildren() > 0) { hal_size_t totalBottomLength = 0; BottomSegmentIteratorConstPtr bottomIt = sequence->getBottomSegmentIterator(); hal_size_t numBottomSegments = sequence->getNumBottomSegments(); for (hal_size_t i = 0; i < numBottomSegments; ++i) { const BottomSegment* bottomSegment = bottomIt->getBottomSegment(); validateBottomSegment(bottomSegment); totalBottomLength += bottomSegment->getLength(); bottomIt->toRight(); } if (totalBottomLength != length) { stringstream ss; ss << "Sequence " << sequence->getName() << " has length " << length << " but its bottom segments add up to " << totalBottomLength; throw hal_exception(ss.str()); } } }
// Read back genome "Anc0" and exercise the top segment iterator in six ways:
//  1. left-to-right over whole segments, comparing against _topSegments;
//  2. right-to-left over whole segments, comparing against _topSegments;
//  3. forward strand, one base at a time, walking right from the first base;
//  4. forward strand, one base at a time, walking left from the last base;
//  5. reversed iterator, one base at a time, using toLeft() with ascending
//     positions;
//  6. reversed iterator, one base at a time, using toRight() with descending
//     positions.
// In the single-base walks the iterator is sliced down to length 1 and every
// step asserts both the length and the start position.
void TopSegmentSimpleIteratorTest::checkCallBack(AlignmentConstPtr alignment)
{
  const Genome* anc = alignment->openGenome("Anc0");
  CuAssertTrue(_testCase, anc->getNumTopSegments() == _topSegments.size());

  const hal_index_t lastSeg = anc->getNumTopSegments() - 1;
  const hal_index_t seqLen = (hal_index_t)anc->getSequenceLength();

  // 1. Whole segments, ascending array index.
  TopSegmentIteratorConstPtr it = anc->getTopSegmentIterator(0);
  for (size_t seg = 0; seg < anc->getNumTopSegments(); ++seg)
  {
    CuAssertTrue(_testCase,
                 (size_t)it->getTopSegment()->getArrayIndex() == seg);
    _topSegments[seg].compareTo(it, _testCase);
    it->toRight();
  }

  // 2. Whole segments, descending array index.
  it = anc->getTopSegmentIterator(lastSeg);
  for (hal_index_t seg = lastSeg; seg >= 0; --seg)
  {
    CuAssertTrue(_testCase, it->getTopSegment()->getArrayIndex() == seg);
    _topSegments[seg].compareTo(it, _testCase);
    it->toLeft();
  }

  // 3. Single bases, walking right from the start of the first segment.
  it = anc->getTopSegmentIterator(0);
  it->slice(0, it->getLength() - 1);
  for (hal_index_t pos = 0; pos < seqLen; ++pos)
  {
    CuAssertTrue(_testCase, it->getLength() == 1);
    CuAssertTrue(_testCase, it->getStartPosition() == pos);
    it->toRight(it->getStartPosition() + 1);
  }

  // 4. Single bases, walking left from the end of the last segment.
  it = anc->getTopSegmentIterator(lastSeg);
  it->slice(it->getLength() - 1, 0);
  for (hal_index_t pos = seqLen - 1; pos >= 0; --pos)
  {
    CuAssertTrue(_testCase, it->getLength() == 1);
    CuAssertTrue(_testCase, it->getStartPosition() == pos);
    it->toLeft(it->getStartPosition() - 1);
  }

  // 5. Reversed iterator on the first segment, stepping with toLeft().
  it = anc->getTopSegmentIterator(0);
  it->toReverse();
  CuAssertTrue(_testCase, it->getReversed() == true);
  it->slice(it->getLength() - 1, 0);
  for (hal_index_t pos = 0; pos < seqLen; ++pos)
  {
    CuAssertTrue(_testCase, it->getLength() == 1);
    CuAssertTrue(_testCase, it->getStartPosition() == pos);
    it->toLeft(it->getStartPosition() + 1);
  }

  // 6. Reversed iterator on the last segment, stepping with toRight().
  it = anc->getTopSegmentIterator(lastSeg);
  it->toReverse();
  it->slice(0, it->getLength() - 1);
  for (hal_index_t pos = seqLen - 1; pos >= 0; --pos)
  {
    CuAssertTrue(_testCase, it->getLength() == 1);
    CuAssertTrue(_testCase, it->getStartPosition() == pos);
    it->toRight(it->getStartPosition() - 1);
  }
}