Esempio n. 1
0
// Checks that edgeBetween() reports an edge when one AlignmentRecord (loaded from a
// soft-clip fixture file) is compared against itself.
TEST(edgeBetweenFunctionTest, edgeBetweenSameAlignmentsSoftClip){

    // Load the record under test from the fixture file.
    AlignmentRecord record;
    record.restoreCompleteAlignmentRecord("test/data/simulation/unit_data/alignment_sample01.txt");

    // Constructor arguments for NewEdgeCalculator, declared in the order they are passed.
    double Q = 0.9;
    double cutoff_cliques = 0.99;
    double min_overlap_cliques = 0.9;
    bool merge_frameshifts = false;
    std::unordered_map<int, double> simpson_map;   // left empty for this test
    double cutoff_single = 0.95;
    double min_overlap_single = 0.6;
    double cutoff_mixed = 0.97;
    unsigned int max_position = 0;
    bool no_prob0 = false;

    EdgeCalculator* calculator = new NewEdgeCalculator(Q, cutoff_cliques, min_overlap_cliques, merge_frameshifts, simpson_map, cutoff_single, min_overlap_single, cutoff_mixed, max_position, no_prob0);

    // Compare the record with itself.
    bool has_edge = calculator->edgeBetween(record, record);

    EXPECT_EQ(has_edge, true);

    delete calculator;
}
// Writes to *min / *max the insert-length range a partner of `ap` may have:
// the insert length of `ap` minus / plus allowable_insert_size_diff.
// The lower bound is set to 0 when the tolerance exceeds the insert length.
void GaussianEdgeCalculator::getPartnerLengthRange(const AlignmentRecord& ap, unsigned int *min, unsigned int *max) const {
	const auto insert_length = ap.getInsertLength();
	const bool tolerance_exceeds_length = (unsigned)allowable_insert_size_diff > insert_length;
	if (!tolerance_exceeds_length) {
		*min = insert_length - allowable_insert_size_diff;
	} else {
		*min = 0;
	}
	*max = insert_length + allowable_insert_size_diff;
}
// Writes to *min / *max the insert-length range a partner of `ap` may have:
// the insert length of `ap` minus / plus allowable_insert_size_diff.
// The lower bound is set to 0 when the tolerance exceeds the insert length.
void AnyDistributionEdgeCalculator::getPartnerLengthRange(const AlignmentRecord& ap, unsigned int *min, unsigned int *max) const {
	const auto insert_length = ap.getInsertLength();
	const bool tolerance_exceeds_length = allowable_insert_size_diff > (int)insert_length;
	if (!tolerance_exceeds_length) {
		*min = insert_length - allowable_insert_size_diff;
	} else {
		*min = 0;
	}
	*max = insert_length + allowable_insert_size_diff;
}
// Decides whether two alignment records should be connected by an edge.
// Returns false when their insert lengths differ by more than
// allowable_insert_size_diff or when they do not intersect; otherwise the
// right-tail value of (sum of insert lengths - 2 * intersection length) is
// compared against significance_level.
bool AnyDistributionEdgeCalculator::edgeBetween(const AlignmentRecord & ap1, const AlignmentRecord & ap2) const {
	const auto first_length = ap1.getInsertLength();
	const auto second_length = ap2.getInsertLength();

	// Reject pairs whose insert lengths are too far apart.
	const double length_gap = abs(((int)first_length) - ((int)second_length));
	if (length_gap > allowable_insert_size_diff) {
		return false;
	}

	// Reject pairs that share no positions.
	const size_t shared = ap1.intersectionLength(ap2);
	if (shared == 0) {
		return false;
	}

	const int insert_length_sum = first_length + second_length - 2*shared;
	const double tail_probability = insertSizeSumRightTail(insert_length_sum);
	return tail_probability >= significance_level;
}
Esempio n. 5
0
// Records an undirected edge between two alignment records: each ID is inserted
// into the other's entry of `nodes`, and the name of each record is stored under
// its ID in `vertex_to_read_names`. Asserts that the writer is not yet finished.
void EdgeWriter::addEdge(const AlignmentRecord& node1, const AlignmentRecord& node2) {
	assert(!finished);

	const auto first_id = node1.getID();
	const auto second_id = node2.getID();

	// Store the edge in both directions.
	nodes[first_id].insert(second_id);
	nodes[second_id].insert(first_id);

	// Remember the read name belonging to each vertex.
	vertex_to_read_names[first_id] = node1.getName();
	vertex_to_read_names[second_id] = node2.getName();
}
// Decides whether two alignment records should be connected by an edge.
// Records with identical names are always connected. Otherwise the overlap
// score from computeOverlap() must reach a cutoff chosen by how many of the
// two names contain "Clique": both -> EDGE_QUASI_CUTOFF, exactly one -> 0.97,
// none -> EDGE_QUASI_CUTOFF_SINGLE.
bool QuasispeciesEdgeCalculator::edgeBetween(const AlignmentRecord & ap1, const AlignmentRecord & ap2) const {
    if (ap1.getName() == ap2.getName()) {
        return true;
    }

    const bool first_is_clique = ap1.getName().find("Clique") != string::npos;
    const bool second_is_clique = ap2.getName().find("Clique") != string::npos;

    double cutoff = 0;
    if (first_is_clique && second_is_clique) {
        cutoff = EDGE_QUASI_CUTOFF;
    } else if (first_is_clique || second_is_clique) {
        cutoff = 0.97;
    } else {
        cutoff = EDGE_QUASI_CUTOFF_SINGLE;
    }

    const double score = computeOverlap(ap1, ap2, cutoff);
    return score >= cutoff;
}
// Decides whether two paired-end alignment records should be connected by an edge.
// Throws runtime_error if either record is single-end. Returns false when the
// insert lengths differ by more than allowable_insert_size_diff or when the
// internal segments do not intersect; otherwise a score derived from sf() of the
// normalised (mean insert length - intersection length - insert_size_popmean)
// is compared against significance_level.
// numGCAllowedPos and ct are not used by this implementation.
bool GaussianEdgeCalculator::edgeBetween(const AlignmentRecord & ap1, const AlignmentRecord & ap2, int numGCAllowedPos, int ct) const {
	const bool any_single_end = ap1.isSingleEnd() || ap2.isSingleEnd();
	if (any_single_end) {
		throw runtime_error("Cannot process single-end reads in GaussianEdgeCalculator!");
	}

	const auto first_length = ap1.getInsertLength();
	const auto second_length = ap2.getInsertLength();

	// Reject pairs whose insert lengths are too far apart.
	const double length_gap = abs(((int)first_length) - ((int)second_length));
	if (length_gap > allowable_insert_size_diff) {
		return false;
	}

	// Reject pairs whose internal segments share no positions.
	const size_t shared = ap1.internalSegmentIntersectionLength(ap2);
	if (shared == 0) {
		return false;
	}

	const double mean_length = (first_length + second_length) / 2.0;
	const double tail = 2.0 * sf(sqrt2*(mean_length - shared - insert_size_popmean) / insert_size_popstddev);
	const double interedge = min(1.0, tail);   // capped at 1
	return interedge >= significance_level;
}
// Returns the number of positions shared by the [insert start, insert end]
// ranges of this record and `ap`, with both end points counted; 0 if the
// ranges do not intersect.
size_t AlignmentRecord::internalSegmentIntersectionLength(const AlignmentRecord& ap) const {
	const auto own_start = getInsertStart();
	const auto other_start = ap.getInsertStart();
	const auto own_end = getInsertEnd();
	const auto other_end = ap.getInsertEnd();

	// Later of the two starts, and one past the earlier of the two ends.
	const int left = (own_start < other_start) ? other_start : own_start;
	const int right = ((other_end < own_end) ? other_end : own_end) + 1;

	if (right <= left) {
		return 0;
	}
	return right - left;
}
// Returns the number of positions shared by the [interval start, interval end]
// ranges of this record and `ap`, with both end points counted; 0 if the
// ranges do not intersect. Asserts that both records have the same single_end flag.
size_t AlignmentRecord::intersectionLength(const AlignmentRecord& ap) const {
	assert(single_end == ap.single_end);

	const auto own_start = getIntervalStart();
	const auto other_start = ap.getIntervalStart();
	const auto own_end = getIntervalEnd();
	const auto other_end = ap.getIntervalEnd();

	// Later of the two starts, and one past the earlier of the two ends.
	const int left = (own_start < other_start) ? other_start : own_start;
	const int right = ((other_end < own_end) ? other_end : own_end) + 1;

	if (right <= left) {
		return 0;
	}
	return right - left;
}
// Scores the overlap between one read of ap1 and one read of ap2.
//
// Parameters:
//   ap1, ap2         - the two alignment records.
//   strain1, strain2 - which read of each record to use: 1 selects
//                      getStart1/getEnd1/getSequence1/getCigar1, 2 selects the
//                      "...2" accessors. For any other value the start, end,
//                      sequence and cigar of that side stay at 0 / empty.
//   MIN_OVERLAP      - the computed overlap length must be at least this value.
//   cutoff           - only consulted to enable "perfect" mode (see below).
//
// Returns:
//   0  if the overlap is shorter than MIN_OVERLAP, if an offset lies beyond its
//      sequence, if the two CIGAR operations at a compared position differ and
//      the FRAMESHIFT_MERGE exception does not apply, or (perfect mode) if a
//      compared base differs.
//   1  in perfect mode (both names contain "Clique" and cutoff == 1.0) when no
//      mismatch was found.
//   Otherwise exp(overlap_probability) raised to 1/total_size, where
//   overlap_probability accumulates the log of a per-position agreement value
//   inside the overlap plus the log of SIMPSON_MAP entries for 'M' positions
//   outside the overlap window, and total_size counts the compared positions.
double QuasispeciesEdgeCalculator::singleOverlap(const AlignmentRecord & ap1, const AlignmentRecord & ap2, int strain1, int strain2, double MIN_OVERLAP, const double cutoff) const {

    // Start/end, sequence and per-base CIGAR of the read chosen from ap1.
    int e1 = 0;
    int s1 = 0;
    ShortDnaSequence sequence1;
    vector<char> cigar1;

    // Same for the read chosen from ap2.
    int e2 = 0;
    int s2 = 0;
    ShortDnaSequence sequence2;
    vector<char> cigar2;

    // Each CIGAR operation is expanded into Length copies of its Type character,
    // so cigar1/cigar2 hold one entry per CIGAR unit.
    if (strain1 == 1) {
        s1 = ap1.getStart1();
        e1 = ap1.getEnd1();
        sequence1 = ap1.getSequence1();

        for (vector<BamTools::CigarOp>::const_iterator it = ap1.getCigar1().begin(); it != ap1.getCigar1().end(); ++it) {
            for (int s = 0; s < it->Length; ++s) cigar1.push_back(it->Type);
        }
    } else if (strain1 == 2) {
        s1 = ap1.getStart2();
        e1 = ap1.getEnd2();
        sequence1 = ap1.getSequence2();

        for (vector<BamTools::CigarOp>::const_iterator it = ap1.getCigar2().begin(); it != ap1.getCigar2().end(); ++it) {
            for (int s = 0; s < it->Length; ++s) cigar1.push_back(it->Type);
        }
    }
    if (strain2 == 1) {
        s2 = ap2.getStart1();
        e2 = ap2.getEnd1();
        sequence2 = ap2.getSequence1();

        for (vector<BamTools::CigarOp>::const_iterator it = ap2.getCigar1().begin(); it != ap2.getCigar1().end(); ++it) {
            for (int s = 0; s < it->Length; ++s) cigar2.push_back(it->Type);
        }
    } else if (strain2 == 2) {
        s2 = ap2.getStart2();
        e2 = ap2.getEnd2();
        sequence2 = ap2.getSequence2();

        for (vector<BamTools::CigarOp>::const_iterator it = ap2.getCigar2().begin(); it != ap2.getCigar2().end(); ++it) {
            for (int s = 0; s < it->Length; ++s) cigar2.push_back(it->Type);
        }
    }

        // ====
        // compute overlap
        // x_l1 / x_l2 are the end-minus-start spans of the two reads. offset1 / offset2
        // are how far into each read the overlap begins; overlap is its length.
        // The ASCII sketches below show read 1 on the first line and read 2 on the second.
    int x_l1 = e1 - s1;
    int x_l2 = e2 - s2;

    int offset1 = 0;
    int offset2 = 0;

    int overlap = 0;

    if (s1 == s2 && e1 == e2) {
            // -----
            // -----
        overlap = e1 - s1;
        offset1 = 0;
        offset2 = 0;
    } else if (s1 == s2) {
            // ----  AND -----
            // ----- AND ----
        if (e1 < e2) {
            overlap = e1 - s1;
        } else {
            overlap = e2 - s2;
        }
        offset1 = 0;
        offset2 = 0;
    } else if (e1 == e2) {
            //  ---- AND -----
            // ----- AND  ----
        if (s1 > s2) {
            overlap = e1 - s1;
            offset1 = 0;
            offset2 = x_l2 - overlap;
        } else {
            overlap = e2 - s2;
            offset1 = x_l1 - overlap;
            offset2 = 0;
        }
    } else if (e1 < e2) {
            // ----    AND   --
            //   ----  AND ------
        if (s1 < s2) {
                // ----
                //   ----
            overlap = e1 - s2;
            offset1 = x_l1 - overlap;
            offset2 = 0;
        } else {
                //   --
                // ------
            overlap = e1 - s1;
            offset1 = 0;
            offset2 = e2 - (e2 - e1) - overlap - s2;
        }
    } else {
            //   ---- AND ------
            // ----   AND   --
        if (s2 < s1) {
                //   ----
                // ----
                //cout << "#########" << endl;
            overlap = e2 - s1;
            offset1 = 0;
            offset2 = x_l2 - overlap;
        } else {
                // ------
                //   --
            overlap = e2 - s2;
            offset1 = e1 - (e1 - e2) - overlap - s1;
            offset2 = 0;
        }
    }
        // cerr << "single overlap: " << overlap << endl;
        // NOTE(review): overlap is an absolute length here, while computeOverlap() treats
        // MIN_OVERLAP <= 1 as a fraction before passing the same value in; with a
        // fractional MIN_OVERLAP this check only rejects overlap <= 0 - confirm intended.
    if (overlap < MIN_OVERLAP) {
        return 0;
    }

        // ====
        //Offsets for the deletions and insertions that occurred
        //upstream of the overlap. They cause index errors if not fixed.
        // (offset_deletion1_ / offset_deletion2_ are incremented below but never read
        // in this function.)
    int offset_deletion1_ = 0;
    int offset_deletion2_ = 0;

    double overlap_probability = 0.0;   // running sum of logs
    char alphabet[] = {'A', 'C', 'G', 'T'};
    double hamming = 0;                 // not used in this function
    double total_size = 0;              // number of compared 'M'/'I' positions
    // NOTE(review): compares int offsets against size(); presumably size() is unsigned,
    // so a negative offset would also take this early return - confirm.
    if (offset1 >= sequence1.size() || offset2 >= sequence2.size()) {
            // cerr << "out of here" << endl;
        return 0;
    }
    /*if (offset1 > 0) {
        offset1 -= 1;
    }
    if (offset2 > 0) {
        offset2 -= 1;
    }*/
    //cerr << offset1 << " offset " << offset2 << endl;
    // Perfect mode: both records are named "...Clique..." and the caller asked for a
    // cutoff of exactly 1.0, so a single differing base rejects the pair.
    bool perfect = 0;
    if (ap1.getName().find("Clique") != string::npos
        && ap2.getName().find("Clique") != string::npos
        && cutoff == 1.0) {
        perfect = 1;
    }
    // Walks both reads in lock-step, one CIGAR unit per iteration and per read.
    // All state lives in the int counters declared in the for-initialiser; there is
    // no loop condition or increment, the loop ends only through break/return.
    //   j, j2               - index into sequence1 / sequence2
    //   j_cigar, j_cigar2   - index into cigar1 / cigar2
    //   shift, shift2       - number of 'S' units seen
    //   shift_ins*, shift_del* - number of 'I' / 'D' units seen (the *_prefix variants
    //                         only count while `prefix` is still set)
    //   prefix              - 1 until the first position inside the overlap is compared
    //   run, run2           - whether read 1 / read 2 may advance in this iteration
    //   compute_overlap     - 1 while positions inside the overlap are being compared
    //   j_overlap           - number of positions compared so far
    //   jm, jm2             - number of 'M' units seen outside the overlap window
    // j_compare, j2_compare, insertion_index and insertion_index2 are never read.
    for (int j_compare = 0, j2_compare = 0, prefix = 1, run = 1, run2 = 1, compute_overlap = 0, j_overlap = 0, jm = 0, jm2 = 0,
            shift_ins_prefix = 0, shift_ins_prefix2 = 0,
            shift = 0, shift_del = 0, shift_del_prefix = 0, shift_ins = 0, insertion_index = 0, j = 0, j_cigar = 0,
            shift2 = 0, shift_del2 = 0, shift_del_prefix2 = 0, shift_ins2 = 0, insertion_index2 = 0, j2 = 0, j_cigar2 = 0;;) {
        // Sequence indices used for the base comparison, corrected by the deletions
        // and insertions counted before the overlap started.
        int j_tmp = j - shift_del_prefix + shift_ins_prefix;
        int j2_tmp = j2 - shift_del_prefix2 + shift_ins_prefix2;
        // Set when the other read has an isolated insertion (FRAMESHIFT_MERGE case);
        // the flagged read is then not advanced in this iteration.
        int jump_single = 0;
        int jump_single2 = 0;
        // While still in the prefix, a read that has reached its overlap offset is
        // paused until the other read has reached its own offset too.
        if ((j - shift == offset1 || offset1 == 0) && prefix) {
            run = 0;
        }
        if ((j2 - shift2 == offset2 || offset2 == 0) && prefix) {
            run2 = 0;
        }
        // Both reads are at the start of the overlap: start comparing and resume both.
        if (!run && !run2) {
            compute_overlap = 1;
            run = 1;
            run2 = 1;
        }
        // Stop comparing once `overlap` positions have been compared.
        if (j_overlap >= overlap) {
            compute_overlap = 0;
        }
        // skip stays 1 only if nothing below makes progress; that ends the loop.
        bool skip = 1;
        if (compute_overlap && j < sequence1.size() && j2 < sequence2.size() && j_tmp < sequence1.size() && j2_tmp < sequence2.size()) {
            skip = 0;
            prefix = 0;
            if (cigar1[j_cigar] == cigar2[j_cigar2]) {
                // Same CIGAR operation in both reads: only 'M' and 'I' are scored,
                // every other operation is passed over without scoring.
                switch (cigar1[j_cigar]) {
                    case 'M':
                    case 'I':
                        if (!perfect) {
                            // q_x* come from qualityCorrect() - presumably the probability
                            // that the base call is correct (TODO confirm); the remaining
                            // mass is split evenly over the three other bases.
                            double q_x1 = sequence1.qualityCorrect(j_tmp);
                            double q_x2 = sequence2.qualityCorrect(j2_tmp);
                            double anti_q_x1 = (1.0 - q_x1) / 3.0;
                            double anti_q_x2 = (1.0 - q_x2) / 3.0;

                            assert(q_x1 <= 1 && q_x2 <= 1);
                            // Sum over A, C, G, T of (weight of that base in read 1) *
                            // (weight of that base in read 2).
                            double sum = 0.0;
                            sum += ((sequence1[j_tmp] == alphabet[0] ? q_x1 : anti_q_x1) * (sequence2[j2_tmp] == alphabet[0] ? q_x2 : anti_q_x2));
                            sum += ((sequence1[j_tmp] == alphabet[1] ? q_x1 : anti_q_x1) * (sequence2[j2_tmp] == alphabet[1] ? q_x2 : anti_q_x2));
                            sum += ((sequence1[j_tmp] == alphabet[2] ? q_x1 : anti_q_x1) * (sequence2[j2_tmp] == alphabet[2] ? q_x2 : anti_q_x2));
                            sum += ((sequence1[j_tmp] == alphabet[3] ? q_x1 : anti_q_x1) * (sequence2[j2_tmp] == alphabet[3] ? q_x2 : anti_q_x2));
                            overlap_probability += log(sum);
                        } else {
                            if (sequence1[j_tmp] != sequence2[j2_tmp]) {
                                return 0;
                            }
                        }
                        j_overlap++;
                        total_size++;
                        // The 'M'/'I' case falls through into default, which just breaks.
                        default: break;
                }
            // Differing operations are tolerated only with FRAMESHIFT_MERGE, when one read
            // has a single 'I' or 'D' flanked by 'M' on both sides and the other read has 'M'.
            // NOTE(review): `j_cigar - 1 >= 0` is evaluated after `j_cigar + 1 < cigar1.size()`
            // but before the [j_cigar - 1] access - confirm j_cigar's signedness keeps this safe.
            } else if (this->FRAMESHIFT_MERGE && (cigar1[j_cigar] == 'I' || cigar1[j_cigar] == 'D')  && cigar2[j_cigar2] == 'M' && j_cigar + 1 < cigar1.size() && j_cigar - 1 >= 0 && cigar1[j_cigar - 1] == 'M' && cigar1[j_cigar + 1] == 'M') {
                if (cigar1[j_cigar] == 'I') {
                    jump_single2 = 1;
                }
                //otherwise it's a deletion and just ignore that base
            } else if (this->FRAMESHIFT_MERGE && cigar1[j_cigar] == 'M' && (cigar2[j_cigar2] == 'I' || cigar2[j_cigar2] == 'D') && j_cigar2 + 1 < cigar2.size() && j_cigar2 - 1 >= 0 && cigar2[j_cigar2 - 1] == 'M' && cigar2[j_cigar2 + 1] == 'M') {
                if (cigar2[j_cigar2] == 'I') {
                    jump_single = 1;
                }
                //otherwise it's a deletion and just ignore that base
            } else {
                return 0;
            }
        }
        // Advance read 1 by one CIGAR unit.
        if (j < sequence1.size() && run && !jump_single) {
            skip = 0;
            // Key used for the SIMPSON_MAP lookup; presumably the reference position of
            // the current base (TODO confirm the coordinate convention behind the "- 1").
            int j_global = j - shift - shift_ins + shift_del + s1 - 1;
            //cerr << "jump1 " << j_global << endl;
            if (cigar1[j_cigar] == 'I') {
                if (prefix) shift_ins_prefix++;
                insertion_index++;
                shift_ins++;
                j++;
            } else {
                insertion_index = 0;
                if (cigar1[j_cigar] == 'M') {
                    // Outside the overlap window: add the log of the SIMPSON_MAP entry
                    // for this position, if there is one.
                    if (j - shift < offset1 || j - shift > offset1 + overlap) {
                        if (this->SIMPSON_MAP.begin() != this->SIMPSON_MAP.end()
                            && this->SIMPSON_MAP.find(j_global) != this->SIMPSON_MAP.end()) {
                            overlap_probability += log(this->SIMPSON_MAP.at(j_global));
                        }
                        jm++;
                    }
                    j++;
                } else if (cigar1[j_cigar] == 'D') {
                    // A deletion consumes a CIGAR unit but no base of the sequence.
                    if (prefix) shift_del_prefix++;
                    shift_del++;
                    if (jm < offset1) offset_deletion1_++;
                } else if (cigar1[j_cigar] == 'S') {
                    shift++;
                    j++;
                }
            }
            j_cigar++;
        }
        // Advance read 2 by one CIGAR unit (mirror of the block above).
        if (j2 < sequence2.size() && run2 && !jump_single2) {
            skip = 0;
            int j_global = j2 - shift2 - shift_ins2 + shift_del2 + s2 - 1;
            //cerr << "jump2 " << j_global << endl;
            if (cigar2[j_cigar2] == 'I') {
                if (prefix) shift_ins_prefix2++;
                insertion_index2++;
                shift_ins2++;
                j2++;
            } else {
                insertion_index2 = 0;
                if (cigar2[j_cigar2] == 'M') {
                    if (j2 - shift2 < offset2 || j2 - shift2 > offset2 + overlap) {
                        if (this->SIMPSON_MAP.begin() != this->SIMPSON_MAP.end()
                            && this->SIMPSON_MAP.find(j_global) != this->SIMPSON_MAP.end()) {
                            overlap_probability += log(this->SIMPSON_MAP.at(j_global));
                        }
                        jm2++;
                    }
                    j2++;
                } else if (cigar2[j_cigar2] == 'D') {
                    //cerr << "D2" << endl;
                    if (prefix) shift_del_prefix2++;
                    shift_del2++;
                    if (jm2 < offset2) offset_deletion2_++;
                } else if (cigar2[j_cigar2] == 'S') {
                    shift2++;
                    j2++;
                }
            }
            j_cigar2++;
        }
        // Both sequences fully consumed.
        if (j == sequence1.size() && j2 == sequence2.size()) {
            break;
        }
        // Nothing advanced in this iteration: stop instead of spinning forever.
        if (skip) {
            break;
        }
    }
    // Perfect mode reaches this point only if no compared base differed.
    if (perfect) return 1;
    // exp(sum of logs) ^ (1 / number of compared positions).
    // NOTE(review): total_size can still be 0 here if no 'M'/'I' position was compared,
    // making the exponent 1.0/0 - confirm the resulting value is acceptable.
    return pow(exp(overlap_probability),1.0/total_size);
}
double QuasispeciesEdgeCalculator::computeOverlap(const AlignmentRecord & ap1, const AlignmentRecord & ap2, const double cutoff) const {

    double MIN_OVERLAP = 0;
    if (ap1.getName().find("Clique") != string::npos
        && ap2.getName().find("Clique") != string::npos) {
        MIN_OVERLAP = MIN_OVERLAP_CLIQUES;
    } else {
        MIN_OVERLAP = MIN_OVERLAP_SINGLE;
    }

    if (ap1.isSingleEnd() && ap2.isSingleEnd()) {
        int read_size1 = min(ap1.getEnd1() - ap1.getStart1(), ap2.getEnd1() - ap2.getStart1());
        float overlap_size1 = overlapSize(ap1.getEnd1(), ap2.getEnd1(), ap1.getStart1(), ap2.getStart1());
        if (MIN_OVERLAP <= 1) overlap_size1 /= (float) read_size1;
        if (overlap_size1 < MIN_OVERLAP) return 0;
        // if (!is_disjoint(ap1.getReadNames(),ap2.getReadNames())) return 1;
        return singleOverlap(ap1, ap2, 1, 1, MIN_OVERLAP, cutoff);
    } else if (ap1.isSingleEnd() || ap2.isSingleEnd()) {
        if (ap1.isSingleEnd()) {
            int read_size1 = min(ap1.getEnd1() - ap1.getStart1(), ap2.getEnd1() - ap2.getStart1());
            int read_size2 = min(ap1.getEnd1() - ap1.getStart1(), ap2.getEnd2() - ap2.getStart2());
            float overlap_size1 = overlapSize(ap1.getEnd1(), ap2.getEnd1(), ap1.getStart1(), ap2.getStart1());
            float overlap_size2 = overlapSize(ap1.getEnd1(), ap2.getEnd2(), ap1.getStart1(), ap2.getStart2());

            if (MIN_OVERLAP <= 1) {
                overlap_size1 /= (float) read_size1;
                overlap_size2 /= (float) read_size2;
            }
            if ((overlap_size1 >= MIN_OVERLAP && overlap_size2 >= MIN_OVERLAP) || (overlap_size1+overlap_size2 >= MIN_OVERLAP && overlap_size1 > 0 && overlap_size2 > 0)) {
                // if (!is_disjoint(ap1.getReadNames(),ap2.getReadNames())) return 1;
                return singleOverlap(ap1, ap2, 1, 1, MIN_OVERLAP, cutoff)*singleOverlap(ap1, ap2, 1, 2, MIN_OVERLAP, cutoff);
            }
            return 0;
        } else {
            int read_size1 = min(ap1.getEnd1() - ap1.getStart1(), ap2.getEnd1() - ap2.getStart1());
            int read_size2 = min(ap1.getEnd2() - ap1.getStart2(), ap2.getEnd1() - ap2.getStart1());

            float overlap_size1 = overlapSize(ap1.getEnd1(), ap2.getEnd1(), ap1.getStart1(), ap2.getStart1());
            float overlap_size2 = overlapSize(ap1.getEnd2(), ap2.getEnd1(), ap1.getStart2(), ap2.getStart1());
            if (MIN_OVERLAP <= 1) {
                overlap_size1 /= (float) read_size1;
                overlap_size2 /= (float) read_size2;
            }
            if ((overlap_size1 >= MIN_OVERLAP && overlap_size2 >= MIN_OVERLAP) || (overlap_size1+overlap_size2 >= MIN_OVERLAP && overlap_size1 > 0 && overlap_size2 > 0)) {
                // if (!is_disjoint(ap1.getReadNames(),ap2.getReadNames())) return 1;
                return singleOverlap(ap1, ap2, 1, 1, MIN_OVERLAP, cutoff)*singleOverlap(ap1, ap2, 2, 1, MIN_OVERLAP, cutoff);
            }
            return 0;
        }
    } else {
        int read_size1 = min(ap1.getEnd1() - ap1.getStart1(), ap2.getEnd1() - ap2.getStart1());
        float overlap_size1 = overlapSize(ap1.getEnd1(), ap2.getEnd1(), ap1.getStart1(), ap2.getStart1());
        if (MIN_OVERLAP <= 1) overlap_size1 /= (float) read_size1;
        if (overlap_size1 < MIN_OVERLAP) return 0;

        int read_size2 = min(ap1.getEnd2() - ap1.getStart2(), ap2.getEnd2() - ap2.getStart2());
        float overlap_size2 = overlapSize(ap1.getEnd2(), ap2.getEnd2(), ap1.getStart2(), ap2.getStart2());
        if (MIN_OVERLAP <= 1) overlap_size2 /= (float) read_size2;
        if (overlap_size2 < MIN_OVERLAP) return 0;

        // if (!is_disjoint(ap1.getReadNames(),ap2.getReadNames())) return 1;
        return singleOverlap(ap1, ap2, 1, 1, MIN_OVERLAP, cutoff)*singleOverlap(ap1, ap2, 2, 2, MIN_OVERLAP, cutoff);
    }

    return 0;
}