Example #1
0
// See documentation in header file.
//
// Summary of what this implementation does:
//   1. Groups currentState.discordant_reads into clusters via cluster_reads().
//   2. Skips clusters smaller than userSettings->MIN_DD_CLUSTER_SIZE.
//   3. For every remaining cluster, asks get_breakpoints() for candidate breakpoints.  If more
//      than one comes back, only the candidate with the most associated split reads is kept; if
//      none comes back, get_breakpoint_estimation() is used instead.
//   4. Appends the resulting breakpoint(s) to currentState.MEI_breakpoints and logs them.
void searchMEIBreakpoints(MEI_data& currentState, std::vector<bam_info>& bam_sources, const Chromosome* chromosome,
                          UserDefinedSettings* userSettings) {
    LOG_DEBUG(*logStream << time_log() << "Start searching for breakpoints..." << std::endl);
    std::vector<std::vector<simple_read*> > clusters;
    cluster_reads(currentState.discordant_reads, currentState.current_insert_size, clusters, userSettings);

    // Find breakpoints per cluster.
    int bp_count = 0;
    for (size_t i = 0; i < clusters.size(); i++) {
        // Bind by reference instead of copying the whole vector of read pointers on every
        // iteration.  'clusters' is local to this function and only its outer size is read
        // after the loop, so no caller can observe the difference.
        std::vector<simple_read*>& cluster = clusters.at(i);

        // An empty cluster has no first read to take strand/tid from, so it is skipped as well
        // (this only matters when MIN_DD_CLUSTER_SIZE is configured as zero or less).
        if (cluster.empty() || cluster.size() < static_cast<size_t>(userSettings->MIN_DD_CLUSTER_SIZE)) {
            // Fluke cluster, skip it. (If there are very few reads in the cluster,
            // we likely won't find enough split reads supporting an insertion)
            continue;
        }

        // Find breakpoint for this cluster.  Strand and tid are taken from the first read of
        // the cluster.
        char cluster_strand = cluster.at(0)->strand;
        int cluster_tid = cluster.at(0)->tid;
        std::vector<MEI_breakpoint> MEI_bps;
        std::vector<MEI_breakpoint>::iterator MEI_iter;
        get_breakpoints(cluster, bam_sources, currentState.current_insert_size, cluster_tid, cluster_strand, chromosome,
                        currentState.sample_names, MEI_bps, userSettings);
        if (MEI_bps.size() > 1) {
            // More than one breakpoint found for current cluster.  Select only the one with the
            // most split reads supporting it.  The search is seeded with the first candidate so
            // that a real breakpoint is always kept, even when no candidate has any split-read
            // support; on ties the earliest candidate wins.
            std::vector<MEI_breakpoint>::iterator best_iter = MEI_bps.begin();
            for (MEI_iter = MEI_bps.begin() + 1; MEI_iter != MEI_bps.end(); ++MEI_iter) {
                if (MEI_iter->associated_split_reads.size() > best_iter->associated_split_reads.size()) {
                    best_iter = MEI_iter;
                }
            }
            // Copy the winner out before clear() invalidates the iterator.
            MEI_breakpoint best_bp(*best_iter);
            MEI_bps.clear();
            MEI_bps.push_back(best_bp);
        } else if (MEI_bps.size() == 0) {
            // No breakpoint found with split read support.  Estimate breakpoint from cluster reads.
            get_breakpoint_estimation(cluster, currentState.current_insert_size, cluster_tid, cluster_strand, MEI_bps);
        }
        // Record and report every breakpoint kept for this cluster.
        for (MEI_iter = MEI_bps.begin(); MEI_iter != MEI_bps.end(); ++MEI_iter) {
            currentState.MEI_breakpoints.push_back(*MEI_iter);
            bp_count += 1;
            LOG_INFO(*logStream << "Found potential DD breakpoint: " << MEI_iter->breakpoint_tid << ", " <<
                     MEI_iter->breakpoint_pos << ", " << MEI_iter->cluster_strand << ", " <<
                     MEI_iter->associated_reads.size() << ", " <<
                     MEI_iter->associated_split_reads.size() << std::endl);
        }
    }
    LOG_DEBUG(*logStream << time_log() << "Found " << bp_count << " breakpoints for " << clusters.size() <<
              " clusters." << std::endl);
}
Example #2
0
/*
 * Runs the per-chunk pipeline on readlist, forwarding the bounds s and e
 * unchanged to each step:
 *   - find_matepair(readlist, s, e)
 *   - a diagnostic line with e and s on stderr
 *   - cluster_reads(readlist, s, e, flist, varlist, reflist)
 *   - two newlines on stdout
 *
 * Steps that are currently disabled (kept here for reference):
 *   - init_clusters(readlist, s, e): cluster using a maximum intra-cluster
 *     distance value
 *   - estimate_readdistance_distribution(readlist, s, e, cluster): estimate
 *     distances between start positions of adjacent reads within the same
 *     cluster (cluster size distribution from data | probability of a read
 *     being a singleton read)
 *   - print_clusters(readlist, s, e, flist, varlist): print clusters
 */
void process_chunk(struct alignedread** readlist, int s, int e, FRAGMENT* flist, VARIANT* varlist, REFLIST* reflist)
{
    /* Mate pairing comes first, before the reads are clustered. */
    find_matepair(readlist, s, e);

    /* Diagnostic only: note that the end bound is printed before the start bound. */
    fprintf(stderr, "e %d s %d \n", e, s);

    cluster_reads(readlist, s, e, flist, varlist, reflist);

    /* Blank-line separator on stdout after the chunk. */
    fputs("\n\n", stdout);
}
/*
 * Copies every (read number, sequence) record from reads_rem.txt into
 * reads_err.txt, writing the number and the sequence on separate lines.
 *
 * Copying stops at end of input or at the first record that cannot be parsed
 * (a token that is not a number, or a number without a following sequence).
 *
 * Returns 0 on success, -1 if either file could not be opened (the reason is
 * reported on stderr via perror).
 */
static int copy_remaining_reads(void)
{
        /* One extra byte so a sequence of READ_LENGTH characters still has room for its NUL. */
        char sequence[READ_LENGTH + 1] = {0};
        char sequence_format[32];
        long int read_number;
        FILE *f1, *f2;

        f1 = fopen("reads_rem.txt","r");
        if(f1 == NULL)
        {
                perror("reads_rem.txt");
                return -1;
        }

        f2 = fopen("reads_err.txt","w");
        if(f2 == NULL)
        {
                perror("reads_err.txt");
                fclose(f1);
                return -1;
        }

        /* Build "%<READ_LENGTH>s" so fscanf can never write past the end of 'sequence'. */
        snprintf(sequence_format, sizeof(sequence_format), "%%%ds", (int)(READ_LENGTH));

        /*
         * Compare against 1 rather than EOF: on a non-numeric token fscanf
         * returns 0 without consuming input, which would otherwise loop forever.
         */
        while(fscanf(f1,"%ld",&read_number) == 1)
        {
                /* A read number without a sequence is an incomplete record; do not emit stale data. */
                if(fscanf(f1, sequence_format, sequence) != 1)
                        break;

                fprintf(f2, "%ld\n", read_number);
                fprintf(f2, "%s\n", sequence);
        }

        fclose(f1);
        fclose(f2);
        return 0;
}

/*
 * Driver loop: repeats clustering rounds while yet_to_cluster is positive.
 *
 * Each round copies reads_rem.txt to reads_err.txt, calls
 * pick_cluster_centers(), and, if that increased total_clusters, calls
 * cluster_reads() and print_clusters().  The loop gives up early once
 * 3*log(NO_READS) consecutive rounds satisfy the "no progress" test below.
 *
 * Returns 0 after store_clusters() has been called, or 1 if the read files
 * could not be opened.
 */
int main(int argc, char **argv)
{
        long int prev_cluster = 0, prev_total_cluster=-1, exit_count = 0;

        printf("Yet to cluster = %ld\n", yet_to_cluster);

        while(yet_to_cluster > 0)
        {
                /*
                 * "No progress" test: the drop in yet_to_cluster over the last
                 * round equals the growth in total_clusters over the same round.
                 * NOTE(review): presumably this means each new cluster absorbed
                 * only its own centre - confirm against pick_cluster_centers().
                 */
                if(prev_cluster-yet_to_cluster==total_clusters-prev_total_cluster)
                {
                        exit_count++;
                        if(exit_count >= 3*log(NO_READS))
                        {
                                printf("No more clustering ... exiting ...\n\n");
                                store_clusters();
                                return 0;
                        }
                }
                else
                        exit_count = 0;

                /* Reset the per-round counters and remember the state before this round. */
                hash_comparisions = 0;
                false_collisions = 0;

                prev_cluster = yet_to_cluster;

                prev_total_cluster = total_clusters;

                // copy reads from reads_rem.txt to reads_err.txt
                if(copy_remaining_reads() != 0)
                        return 1;

                // pickup random reads and setup the cluster centers
                pick_cluster_centers();

                /* Only cluster and print when this round actually added cluster centres. */
                if(total_clusters > prev_total_cluster)
                {
                        // cluster_reads
                        cluster_reads();

                        /* NOTE(review): the factor 30 is unexplained here - confirm what it stands for. */
                        base_comparisions = hash_comparisions + (false_collisions * 30) + ((prev_cluster-yet_to_cluster) * 30);

                        // print clusters
                        print_clusters();
                }
        }

        store_clusters();

        return 0;
}