// See documentation in header file. void searchMEIBreakpoints(MEI_data& currentState, std::vector<bam_info>& bam_sources, const Chromosome* chromosome, UserDefinedSettings* userSettings) { LOG_DEBUG(*logStream << time_log() << "Start searching for breakpoints..." << std::endl); std::vector<std::vector<simple_read*> > clusters; cluster_reads(currentState.discordant_reads, currentState.current_insert_size, clusters, userSettings); // Find breakpoints per cluster. int bp_count = 0; for (size_t i = 0; i < clusters.size(); i++) { // print cluster debug info std::vector<simple_read*> cluster = clusters.at(i); if (cluster.size() < ((size_t) userSettings->MIN_DD_CLUSTER_SIZE)) { // Fluke cluster, skip it. (If there are very few reads in the cluster, // we likely won't find enough split reads supporting an insertion) continue; } // Find breakpoint for this cluster char cluster_strand = cluster.at(0)->strand; int cluster_tid = cluster.at(0)->tid; std::vector<MEI_breakpoint> MEI_bps; std::vector<MEI_breakpoint>::iterator MEI_iter; get_breakpoints(cluster, bam_sources, currentState.current_insert_size, cluster_tid, cluster_strand, chromosome, currentState.sample_names, MEI_bps, userSettings); if (MEI_bps.size() > 1) { // More than one breakpoints found for current cluster. Select only the one with the // most split reads supporting it. size_t best_support = 0; MEI_breakpoint best_bp; for (MEI_iter = MEI_bps.begin(); MEI_iter != MEI_bps.end(); ++MEI_iter) { if (MEI_iter->associated_split_reads.size() > best_support) { best_bp = *MEI_iter; best_support = MEI_iter->associated_split_reads.size(); } } MEI_bps.clear(); MEI_bps.push_back(best_bp); } else if (MEI_bps.size() == 0) { // No breakpoint found with split read support. Estimate breakpoint from cluster reads. get_breakpoint_estimation(cluster, currentState.current_insert_size, cluster_tid, cluster_strand, MEI_bps); } // Check breakpoint validity. 
for (MEI_iter = MEI_bps.begin(); MEI_iter != MEI_bps.end(); ++MEI_iter) { currentState.MEI_breakpoints.push_back(*MEI_iter); bp_count += 1; LOG_INFO(*logStream << "Found potential DD breakpoint: " << (*MEI_iter).breakpoint_tid << ", " << (*MEI_iter).breakpoint_pos << ", " << (*MEI_iter).cluster_strand << ", " << (*MEI_iter).associated_reads.size() << ", " << (*MEI_iter).associated_split_reads.size() << std::endl); } } LOG_DEBUG(*logStream << time_log() << "Found " << bp_count << " breakpoints for " << clusters.size() << " clusters." << std::endl); }
// Process one window readlist[s..e) of aligned reads: link mate pairs, then
// cluster the reads using the fragment/variant/reference context.
// readlist : array of aligned reads; only indices [s, e) are touched
// s, e     : half-open index range of the current chunk — TODO confirm against caller
// flist    : fragment list passed through to cluster_reads
// varlist  : variant list passed through to cluster_reads
// reflist  : reference sequence list passed through to cluster_reads
void process_chunk(struct alignedread** readlist, int s, int e, FRAGMENT* flist, VARIANT* varlist, REFLIST* reflist)
{
    // Pair up mates within the window before clustering.
    find_matepair(readlist, s, e);
    // Debug trace of the window bounds (note: prints e before s).
    fprintf(stderr, "e %d s %d \n", e, s);
    //int cl = init_clusters(readlist,s,e); // cluster using a maximum intra-cluster distance value
    //estimate_readdistance_distribution(readlist,s,e,cluster); // estimate distances between start positions of adjacent reads within same cluster
    // cluster size distribution from data | probability of a read being a singleton read
    cluster_reads(readlist, s, e, flist, varlist, reflist);
    // Blank separator between chunks on stdout.
    fprintf(stdout, "\n\n");
    //print_clusters(readlist,s,e,flist,varlist); // print clusters
}
int main(int argc, char **argv) { long int prev_cluster = 0, prev_total_cluster=-1, exit_count = 0; printf("Yet to cluster = %ld\n", yet_to_cluster); while(yet_to_cluster > 0) { if(prev_cluster-yet_to_cluster==total_clusters-prev_total_cluster) { exit_count++; if(exit_count >= 3*log(NO_READS)) { printf("No more clustering ... exiting ...\n\n"); store_clusters(); return 0; } } else exit_count = 0; hash_comparisions = 0; false_collisions = 0; prev_cluster = yet_to_cluster; prev_total_cluster = total_clusters; // copy reads from reads_rem.txt to reads_err.txt char sequence[READ_LENGTH] = {0}; long int read_number; FILE *f1, *f2; f1 = fopen("reads_rem.txt","r"); f2 = fopen("reads_err.txt","w"); while(fscanf(f1,"%ld",&read_number) != EOF) { fscanf(f1, "%s", sequence); fprintf(f2, "%ld\n", read_number); fprintf(f2, "%s\n", sequence); } fclose(f1); fclose(f2); // pickup random reads and setup the cluster centers pick_cluster_centers(); if(total_clusters > prev_total_cluster) { // cluster_reads cluster_reads(); base_comparisions = hash_comparisions + (false_collisions * 30) + ((prev_cluster-yet_to_cluster) * 30); // print clusters print_clusters(); } } store_clusters(); return 0; }