/*
 * Shift the clock by the given millisecond / microsecond / nanosecond deltas.
 *
 * The nanosecond field is kept to its remainder modulo 10^9 and the quotient
 * is carried into the microsecond sum; the microsecond field is kept to its
 * remainder modulo 10^6 and that quotient is carried into the millisecond
 * field. The fields are updated while holding clock->lock.
 *
 * Always returns OS_OK; an all-zero adjustment returns before taking the lock.
 */
int time_adjust(xclock_t * clock, int dmsec, int dusec, int dnsec)
{
    int nsec_total;
    int usec_total;

    time_log(("time_adjust: to [%d:%d:%d]\n", dmsec, dusec, dnsec));

    /* Nothing to do when every delta is zero. */
    if (!(dmsec != 0 || dusec != 0 || dnsec != 0))
        return OS_OK;

    xthr_lock(clock->lock);

    /* Raw sums first; the nanosecond overflow feeds the microsecond sum. */
    nsec_total = clock->nsec + dnsec;
    usec_total = clock->usec + dusec + nsec_total / 1000000000;

    clock->nsec = nsec_total % 1000000000;
    clock->usec = usec_total % 1000000;
    clock->msec += dmsec + usec_total / 1000000;

    xthr_unlock(clock->lock);

    /* NOTE(review): the fields are read here after the lock was released. */
    time_log(("time_adjust: clock adjusted to [%d:%d:%d].\n", clock->msec, clock->usec, clock->nsec));

    return OS_OK;
}
// See documentation in header file. void searchMEIBreakpoints(MEI_data& currentState, std::vector<bam_info>& bam_sources, const Chromosome* chromosome, UserDefinedSettings* userSettings) { LOG_DEBUG(*logStream << time_log() << "Start searching for breakpoints..." << std::endl); std::vector<std::vector<simple_read*> > clusters; cluster_reads(currentState.discordant_reads, currentState.current_insert_size, clusters, userSettings); // Find breakpoints per cluster. int bp_count = 0; for (size_t i = 0; i < clusters.size(); i++) { // print cluster debug info std::vector<simple_read*> cluster = clusters.at(i); if (cluster.size() < ((size_t) userSettings->MIN_DD_CLUSTER_SIZE)) { // Fluke cluster, skip it. (If there are very few reads in the cluster, // we likely won't find enough split reads supporting an insertion) continue; } // Find breakpoint for this cluster char cluster_strand = cluster.at(0)->strand; int cluster_tid = cluster.at(0)->tid; std::vector<MEI_breakpoint> MEI_bps; std::vector<MEI_breakpoint>::iterator MEI_iter; get_breakpoints(cluster, bam_sources, currentState.current_insert_size, cluster_tid, cluster_strand, chromosome, currentState.sample_names, MEI_bps, userSettings); if (MEI_bps.size() > 1) { // More than one breakpoints found for current cluster. Select only the one with the // most split reads supporting it. size_t best_support = 0; MEI_breakpoint best_bp; for (MEI_iter = MEI_bps.begin(); MEI_iter != MEI_bps.end(); ++MEI_iter) { if (MEI_iter->associated_split_reads.size() > best_support) { best_bp = *MEI_iter; best_support = MEI_iter->associated_split_reads.size(); } } MEI_bps.clear(); MEI_bps.push_back(best_bp); } else if (MEI_bps.size() == 0) { // No breakpoint found with split read support. Estimate breakpoint from cluster reads. get_breakpoint_estimation(cluster, currentState.current_insert_size, cluster_tid, cluster_strand, MEI_bps); } // Check breakpoint validity. 
for (MEI_iter = MEI_bps.begin(); MEI_iter != MEI_bps.end(); ++MEI_iter) { currentState.MEI_breakpoints.push_back(*MEI_iter); bp_count += 1; LOG_INFO(*logStream << "Found potential DD breakpoint: " << (*MEI_iter).breakpoint_tid << ", " << (*MEI_iter).breakpoint_pos << ", " << (*MEI_iter).cluster_strand << ", " << (*MEI_iter).associated_reads.size() << ", " << (*MEI_iter).associated_split_reads.size() << std::endl); } } LOG_DEBUG(*logStream << time_log() << "Found " << bp_count << " breakpoints for " << clusters.size() << " clusters." << std::endl); }
static int load_discordant_reads(MEI_data& mei_data, std::vector<bam_info>& bam_sources, const std::string& chr_name, const SearchWindow& window, UserDefinedSettings* userSettings) { // Loop over associated bam files. for (size_t i = 0; i < bam_sources.size(); i++) { // Locate file. bam_info source = bam_sources.at(i); LOG_DEBUG(*logStream << time_log() << "Loading discordant reads from " << source.BamFile << std::endl); // Setup link to bamfile, its index and header. bamFile fp = bam_open(source.BamFile.c_str(), "r"); bam_index_t *idx = bam_index_load(source.BamFile.c_str()); if (idx == NULL) { LOG_WARN(*logStream << time_log() << "Failed to load index for " << source.BamFile.c_str() << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } bam_header_t *header = bam_header_read(fp); bam_init_header_hash(header); int tid = bam_get_tid(header, chr_name.c_str()); if (tid < 0) { LOG_WARN(*logStream << time_log() << "Could not find sequence in alignment file: '" << chr_name << "'" << std::endl); LOG_WARN(*logStream << "Skipping window: " << chr_name << ", " << window.getStart() << "--" << window.getEnd() << " for BAM-file: " << source.BamFile.c_str() << std::endl); continue; } mei_data.sample_names = get_sample_dictionary(header); // Save insert size of current bamfile in data object provided for callback function. // Note: the insert size should ideally be separate from the MEI_data object, tried to do // this using a std::pair object, which did not work. Suggestions are welcome here. mei_data.current_insert_size = source.InsertSize; mei_data.current_chr_name = chr_name; // Set up environment variable for callback function. std::pair<MEI_data*, UserDefinedSettings*> env = std::make_pair(&mei_data, userSettings); // Load discordant reads into mei_data. 
bam_fetch(fp, idx, tid, window.getStart(), window.getEnd(), &env, fetch_disc_read_callback); bam_index_destroy(idx); } return 0; }
// Report MEI events. static void reportMEIevent(MEI_data& mei_data, MEI_event& event, int MEI_count, Genome& genome, std::map<int, std::string>& seq_name_dict, std::ostream& out) { // Set evidence strand for event's reads (they'll be reported). set_evidence_strands(event); // List all read info that needs to be reported. std::vector<simple_read> all_reads; get_event_supporting_reads(event, all_reads); LOG_DEBUG(*logStream << time_log() << "reporting DD: #fwd.disc.: " << event.fwd_cluster_bp.associated_reads.size() << ", #fwd.split: " << event.fwd_cluster_bp.associated_split_reads.size() << ", #rev.disc.: " << event.rev_cluster_bp.associated_reads.size() << ", #rev.split: " << event.rev_cluster_bp.associated_split_reads.size() << std::endl); size_t all_read_count = event.fwd_cluster_bp.associated_reads.size() + event.fwd_cluster_bp.associated_split_reads.size() + event.rev_cluster_bp.associated_reads.size() + event.rev_cluster_bp.associated_split_reads.size(); out << "####################################################################################################" << std::endl; // Print machine summary line. out << MEI_count << "\t" << "DD" << "\t"; out << seq_name_dict.at(event.fwd_cluster_bp.breakpoint_tid) << "\t" << event.fwd_cluster_bp.breakpoint_pos << "\t" << event.rev_cluster_bp.breakpoint_pos; out << "\t" << all_read_count << "\t" << event.fwd_cluster_bp.associated_reads.size() << "\t" << event.fwd_cluster_bp.associated_split_reads.size(); out << "\t" << event.rev_cluster_bp.associated_reads.size() << "\t" << event.rev_cluster_bp.associated_split_reads.size() << std::endl; // Print human-readable summary lines. 
out << COMMENT_PREFIX << "Dispersed Duplication insertion (DD) found on chromosome '" << seq_name_dict.at(event.fwd_cluster_bp.breakpoint_tid) << "', breakpoint at " << event.fwd_cluster_bp.breakpoint_pos << " (estimated from + strand), " << event.rev_cluster_bp.breakpoint_pos << " (estimated from - strand)" << std::endl; out << COMMENT_PREFIX << "Found " << all_read_count << " supporting reads, of which " << event.fwd_cluster_bp.associated_reads.size() << " discordant reads and " << event.fwd_cluster_bp.associated_split_reads.size() << " split reads at 5' end, " << event.rev_cluster_bp.associated_reads.size() << " discordant reads and " << event.rev_cluster_bp.associated_split_reads.size() << " split reads at 3' end." << std::endl; // Print support for breakpoint at 5' end. out << COMMENT_PREFIX << "Supporting reads for insertion location (5' end):" << std::endl; report_split_read_support(genome, event.fwd_cluster_bp, true, seq_name_dict, out); // Print support for breakpoint at 3' end. out << COMMENT_PREFIX << "Supporting reads for insertion location (3' end):" << std::endl; report_split_read_support(genome, event.rev_cluster_bp, false, seq_name_dict, out); // Print all supporting reads and read fragments for the inserted element. report_supporting_reads(all_reads, seq_name_dict, out); }
/*
 * Sleep for 'howlong' low-resolution time units using nanosleep().
 *
 * clock   - not used by this function.
 * howlong - duration to sleep; split into seconds via LRTIME_SECOND_DIVISOR and
 *           the remainder scaled to nanoseconds via LRT_HRT_DIVISOR. A value
 *           <= 0 returns OS_OK immediately and leaves *remain untouched.
 * remain  - optional; receives the unslept time, converted back to the unit of
 *           'howlong'. It is 0 unless nanosleep() reported remaining time.
 *
 * Returns OS_OK when the full sleep completed, OS_EINTR when nanosleep()
 * returned -1.
 */
int lrtime_sleep(xclock_t * clock, rtime_t howlong, rtime_t *remain){

    int r;
    struct timespec tv;
    struct timespec rem;

    if (howlong <= 0) return OS_OK;

    tv.tv_sec = howlong / LRTIME_SECOND_DIVISOR;
    tv.tv_nsec = (howlong % LRTIME_SECOND_DIVISOR) * LRT_HRT_DIVISOR;

    /* nanosleep() only fills 'rem' when the sleep is interrupted; start from
     * zero so a completed sleep reports no remaining time instead of garbage. */
    rem.tv_sec = 0;
    rem.tv_nsec = 0;

    time_log(("lrtime_sleep: sleep %ums = %u sec %u nsec\n", howlong, (uint)tv.tv_sec, (uint)tv.tv_nsec));

    r = nanosleep(&tv, &rem);

    if(remain)
        *remain = rem.tv_sec * LRTIME_SECOND_DIVISOR + rem.tv_nsec / LRT_HRT_DIVISOR;

    if(r == -1) return OS_EINTR;

    return OS_OK;
}
void searchMEI(MEI_data& finalState, Genome& genome, std::map<int, std::string>& seq_name_dict, UserDefinedSettings* userSettings, ControlState& current_state, std::ostream& out) { LOG_INFO(*logStream << time_log() << "Start calling dispersed duplication events from found breakpoints..." << std::endl); std::vector<MEI_event> insertion_events; size_t bp_amount = finalState.MEI_breakpoints.size(); LOG_INFO(*logStream << time_log() << "Examining " << bp_amount << " breakpoints in total." << std::endl); std::sort(finalState.MEI_breakpoints.begin(), finalState.MEI_breakpoints.end(), comp_breakpoint_pos); LOG_DEBUG(*logStream << time_log() << "Sorted breakpoints." << std::endl); for (size_t i = 0; i < (bp_amount-1); i++) { if (finalState.MEI_breakpoints.at(i).cluster_strand == finalState.MEI_breakpoints.at(i+1).cluster_strand || (finalState.MEI_breakpoints.at(i+1).breakpoint_pos - finalState.MEI_breakpoints.at(i).breakpoint_pos) > userSettings->MAX_DD_BREAKPOINT_DISTANCE || finalState.MEI_breakpoints.at(i).breakpoint_tid != finalState.MEI_breakpoints.at(i+1).breakpoint_tid) { // Current two consecutive breakpoints cannot be combined into an event. continue; } MEI_event event; if (finalState.MEI_breakpoints.at(i).cluster_strand == Plus) { event = MEI_event(finalState.MEI_breakpoints.at(i), finalState.MEI_breakpoints.at(i+1)); } else { event = MEI_event(finalState.MEI_breakpoints.at(i+1), finalState.MEI_breakpoints.at(i)); } insertion_events.push_back(event); } LOG_INFO(*logStream << time_log() << "Found " << insertion_events.size() << " dispersed duplication events." << std::endl); if (userSettings->DD_REPORT_DUPLICATION_READS) { // Append information about reads mapping inside DDs. LOG_INFO(*logStream << time_log() << "Collecting discordant read information for dispersed duplication " << "events." 
<< std::endl); append_cluster_connections(insertion_events, current_state, userSettings); } LOG_INFO(*logStream << time_log() << "Reporting " << insertion_events.size() << " dispersed duplication events to " << userSettings->getMEIOutputFilename().c_str() << std::endl); // Report events. for (size_t i = 0; i < insertion_events.size(); i++) { reportMEIevent(finalState, insertion_events.at(i), i + 1, genome, seq_name_dict, out); } }
/*
 * Refresh the clock from gettimeofday().
 *
 * The time elapsed since the previous sample (clock->now) is added to the
 * msec, usec and nsec counters, each scaled to that counter's unit, and
 * clock->ntp_usec is set to the microsecond part of the new sample.
 *
 * Returns the updated clock->nsec.
 */
int posix_time_now(xclock_t * clock)
{
    /* Keep the previous sample so the elapsed interval can be derived. */
    const struct timeval prev = clock->now;
    struct timeval delta;

    gettimeofday(&clock->now, NULL);

    delta.tv_sec = clock->now.tv_sec - prev.tv_sec;
    delta.tv_usec = clock->now.tv_usec - prev.tv_usec;

    /* Milliseconds: the sub-millisecond part of the interval is truncated. */
    clock->msec += delta.tv_sec * 1000;
    clock->msec += delta.tv_usec / 1000;

    /* Microseconds. */
    clock->usec += delta.tv_sec * 1000000;
    clock->usec += delta.tv_usec;

    /* Nanoseconds. */
    clock->nsec += delta.tv_sec * 1000000000;
    clock->nsec += delta.tv_usec * 1000;

    clock->ntp_usec = clock->now.tv_usec;

    time_log(("posix_time_now: msec = %d, nsec = %u\n",clock->msec , clock->nsec));

    return clock->nsec;
}
/*
 * Allocate and initialise a new clock.
 *
 * lrt - initial value of the millisecond counter (clock->msec).
 * hrt - initial value of the nanosecond counter (clock->nsec).
 *
 * The current wall-clock time is sampled into clock->now, posix_time_now is
 * installed as the hrtime_now handler and a lock is created for the clock.
 *
 * Returns the new clock, or NULL when the allocation or the lock creation
 * fails (nothing is leaked in either case).
 */
xclock_t * time_begin(rtime_t lrt, rtime_t hrt){

    xclock_t * clock = (xclock_t *)xmalloc(sizeof(struct xrtp_clock_s));

    /* Bail out before touching (or logging) a clock that was never allocated. */
    if(!clock)
        return NULL;

    gettimeofday(&clock->now, NULL);

    clock->msec = lrt;
    /* The microsecond counter is accumulated by posix_time_now() and read by
     * time_adjust(), so it must not start out indeterminate.
     * NOTE(review): 0 is assumed to be the right starting value - confirm. */
    clock->usec = 0;
    clock->nsec = hrt;

    clock->hrtime_now = posix_time_now;

    clock->lock = xthr_new_lock();
    if(!clock->lock){
        xfree(clock);
        return NULL;
    }

    /* Print the address with %p; casting the pointer to int truncates it on
     * platforms where pointers are wider than int. */
    time_log(("time_begin: new clock[@%p] created.\n", (void *)clock));

    return clock;
}
// This function is based on Pindel's main function. Todo: integrate with pindel's main structure. int searchMEImain(ControlState& current_state, Genome& genome, UserDefinedSettings* userSettings) { // Reset genome before traversal. g_genome.reset(); std::ofstream file_output(userSettings->getMEIOutputFilename().c_str()); MEI_data mei_data; int result; std::string CurrentChrName; std::string PreviousChrName = ""; // Loop over BED-regions defined in control state. for (unsigned bed_index = 0; bed_index < current_state.IncludeBed.size(); bed_index++) { std::string Bed_ChrName = current_state.IncludeBed[bed_index].ChrName; unsigned Bed_start = current_state.IncludeBed[bed_index].Start; unsigned Bed_end = current_state.IncludeBed[bed_index].End; const Chromosome* currentChromosome = g_genome.getChr(Bed_ChrName); if (currentChromosome == NULL) { std::cout << "There is no " << CurrentChrName << " in the reference file." << std::endl; return 1; } LOG_INFO(*logStream << time_log() << "Dispersed Duplication detection current window: " << Bed_ChrName << ", " << Bed_start << "--" << Bed_end << std::endl); CurrentChrMask.resize(currentChromosome->getCompSize()); for (unsigned int i = 0; i < currentChromosome->getCompSize(); i++) { CurrentChrMask[i] = 'N'; } userSettings->getRegion()->SetRegion(Bed_ChrName, Bed_start, Bed_end); LoopingSearchWindow currentWindow( userSettings->getRegion(), currentChromosome, WINDOW_SIZE, Bed_start, Bed_end ); // loop over one bed region do { result = load_discordant_reads(mei_data, current_state.bams_to_parse, currentChromosome->getName(), currentWindow, userSettings); if (result) { // something went wrong loading the reads, return error code. return result; } searchMEIBreakpoints(mei_data, current_state.bams_to_parse, currentChromosome, userSettings); cleanup_reads(mei_data.discordant_reads); currentWindow.next(); } while (!currentWindow.finished()); } // Reset genome for subsequent traversals. 
g_genome.reset(); std::map<int, std::string> seq_name_dictionary = get_sequence_name_dictionary(current_state); searchMEI(mei_data, genome, seq_name_dictionary, userSettings, current_state, file_output); file_output.close(); return 0; }
static int append_cluster_connections(std::vector<MEI_event>& insertion_events, ControlState& current_state, UserDefinedSettings* userSettings) { // Setup maps for base read names of mates we need to collect. Also setup 'exclude_names' holding // the original read names (we don't want those, only their mates, which fall inside the event). std::map<std::string, size_t> fwd_name_links, rev_name_links, exclude_names; std::string tmp_basename; for (size_t i = 0; i < insertion_events.size(); i++) { MEI_event event = insertion_events.at(i); for (size_t j = 0; j < event.fwd_cluster_bp.associated_reads.size(); j++) { tmp_basename = base_read_name(event.fwd_cluster_bp.associated_reads.at(j).name); fwd_name_links.insert(std::make_pair(tmp_basename, i)); exclude_names.insert(std::make_pair(event.fwd_cluster_bp.associated_reads.at(j).name, i)); } for (size_t j = 0; j < event.rev_cluster_bp.associated_reads.size(); j++) { tmp_basename = base_read_name(event.rev_cluster_bp.associated_reads.at(j).name); rev_name_links.insert(std::make_pair(tmp_basename, i)); exclude_names.insert(std::make_pair(event.rev_cluster_bp.associated_reads.at(j).name, i)); } } // Loop over whole genome to find mates of discordant reads near DD breakpoints. g_genome.reset(); MEI_data mei_data; int result; // Make dummy BED-records spanning the whole genome. std::vector<BED> dummy_beds; for (unsigned index = 0; index < g_ChrNameAndSizeAndIndex.size(); index++) { BED OneBedRecord; OneBedRecord.ChrName = g_ChrNameAndSizeAndIndex[index].ChrName; OneBedRecord.Start = 1; OneBedRecord.End = g_ChrNameAndSizeAndIndex[index].ChrSize; dummy_beds.push_back(OneBedRecord); } // Loop over BED-regions. 
for (unsigned bed_index = 0; bed_index < dummy_beds.size(); bed_index++) { std::string Bed_ChrName = dummy_beds[bed_index].ChrName; unsigned Bed_start = dummy_beds[bed_index].Start; unsigned Bed_end = dummy_beds[bed_index].End; const Chromosome* currentChromosome = g_genome.getChr(Bed_ChrName); if (currentChromosome == NULL) { return 1; } LOG_INFO(*logStream << time_log() << "Discordant read collection for current window: " << Bed_ChrName << ", " << Bed_start << "--" << Bed_end << std::endl); CurrentChrMask.resize(currentChromosome->getCompSize()); for (unsigned int i = 0; i < currentChromosome->getCompSize(); i++) { CurrentChrMask[i] = 'N'; } userSettings->getRegion()->SetRegion(Bed_ChrName, Bed_start, Bed_end); LoopingSearchWindow currentWindow( userSettings->getRegion(), currentChromosome, WINDOW_SIZE, Bed_start, Bed_end ); // loop over one bed region do { result = load_discordant_reads(mei_data, current_state.bams_to_parse, currentChromosome->getName(), currentWindow, userSettings); if (result) { // something went wrong loading the reads, return error code. return result; } std::map<std::string, size_t>::iterator name_match; size_t disc_read_count = mei_data.discordant_reads.size(); for (size_t i = 0; i < disc_read_count; i++) { // Determine event and strand for which mate is evidence. tmp_basename = base_read_name(mei_data.discordant_reads.at(i)->name); int event_idx = -1; char strand = Plus; name_match = fwd_name_links.find(tmp_basename); if (name_match != fwd_name_links.end()) { // Current read referenced by a DD event near bp on fwd strand. event_idx = (*name_match).second; } else { name_match = rev_name_links.find(tmp_basename); if (name_match != rev_name_links.end()) { // Current read referenced by a DD event near bp on rev strand. event_idx = (*name_match).second; strand = Minus; } } if (event_idx == -1) { // No match found, this read is not related to an event. 
continue; } if (exclude_names.find(mei_data.discordant_reads.at(i)->name) != exclude_names.end()) { // read name in exlude list, this is one of the reads we used for calling // the breakpoint, skip it! continue; } if (strand == Plus) { insertion_events.at(event_idx).fwd_mapping_reads.push_back(*(mei_data.discordant_reads.at(i))); } else { insertion_events.at(event_idx).rev_mapping_reads.push_back(*(mei_data.discordant_reads.at(i))); } } cleanup_reads(mei_data.discordant_reads); currentWindow.next(); } while (!currentWindow.finished()); } return 0; }
// Returns a breakpoint for a cluster of connected reads. If no viable // breakpoint can be found, it returns a breakpoint with position -1. // Note: returned pointer must be deleted by caller. static void get_breakpoints(std::vector<simple_read*>& cluster, std::vector<bam_info>& bam_sources, int insert_size, int cluster_tid, char cluster_strand, const Chromosome* chromosome, std::map<std::string, std::string>& sample_dict, std::vector<MEI_breakpoint>& breakpoints, UserDefinedSettings* userSettings) { std::vector<SPLIT_READ> split_reads; int outer_read_pos = (cluster_strand == Minus)? cluster.at(cluster.size()-1)->pos : cluster.at(0)->pos; // int inner_read_pos = (cluster_strand == Minus)? cluster.at(0)->pos : cluster.at(cluster.size()-1)->pos; get_split_reads_for_cluster(bam_sources, cluster_strand, outer_read_pos, chromosome, split_reads); // Search for split reads with a mate close to the outer read of the // cluster. Store candidate breakpoints. // Todo: speedup by exploiting the fact that both clusters and split reads are sorted // by mapping location. std::map<int, std::vector<simple_read> > bio_candidate_breakpoints; for (size_t i = 0; i < split_reads.size(); i++) { SPLIT_READ read = split_reads.at(i); char anchor_strand = read.MatchedD; if (cluster_strand != anchor_strand) { continue; } unsigned int comp_candidate_bp = read.getLastAbsLocCloseEnd(); unsigned int bio_candidate_bp = get_bio_chr_index(comp_candidate_bp); if (bio_candidate_breakpoints.find(bio_candidate_bp) == bio_candidate_breakpoints.end()) { // New candidate, look ahead to check whether there are enough supporting split reads. int SR_support = 1; for (size_t j = i + 1; j < split_reads.size(); j++) { if (split_reads.at(j).getLastAbsLocCloseEnd() == comp_candidate_bp && split_reads.at(j).MatchedD == cluster_strand) { SR_support++; } } if (SR_support < userSettings->MIN_DD_BREAKPOINT_SUPPORT) { // Not enough support, skip it. 
continue; } else { std::vector<simple_read> new_bp_split_reads; bio_candidate_breakpoints.insert(std::make_pair(bio_candidate_bp, new_bp_split_reads)); } } // Store the unmatched sequence as it should be matched on the opposite strand of // the mapped mate. std::string whole_sequence; std::string mapped_part; std::string unmapped_part; if (anchor_strand == Plus) { whole_sequence = ReverseComplement(read.getUnmatchedSeq()); mapped_part = whole_sequence.substr(0, read.CloseEndLength); unmapped_part = whole_sequence.substr(read.CloseEndLength, whole_sequence.length()); } else { whole_sequence = read.getUnmatchedSeq(); mapped_part = whole_sequence.substr(whole_sequence.length() - read.CloseEndLength, whole_sequence.length()); unmapped_part = whole_sequence.substr(0, whole_sequence.length() - read.CloseEndLength); } std::string sample_name; get_sample_name(read.read_group, sample_dict, sample_name); simple_read simple_split_read(read.Name, -1, -1, '?', sample_name, whole_sequence, mapped_part, unmapped_part); (*bio_candidate_breakpoints.find(bio_candidate_bp)).second.push_back(simple_split_read); } char SR_mapping_strand = (cluster_strand == Plus)? Minus : Plus; // Remove any split reads for which a far end can be found locally, these are // assumed to contribute to some local variants. // Todo: determine region that is searched for far end. 
std::map<int, std::vector<simple_read> >::iterator map_iter; for (map_iter = bio_candidate_breakpoints.begin(); map_iter != bio_candidate_breakpoints.end(); ++map_iter) { std::string mapped_consensus = get_consensus_unmapped((*map_iter).second, SR_mapping_strand); std::vector<simple_read> sreads = (*map_iter).second; if (mapped_consensus.length() == 0) { LOG_DEBUG(*logStream << time_log() << "Consensus building failed for split read mapping ends (" << map_iter->second.size() << " reads @ " << map_iter->first << ")" << std::endl); continue; } int bio_bp = (*map_iter).first; // If far end consensus is not found in local window, store breakpoint. size_t FE_window_start = std::max(0, get_comp_chr_index(bio_bp) - userSettings->MIN_DD_MAP_DISTANCE); size_t FE_window_size = std::min(chromosome->getCompSize() - (unsigned) FE_window_start, 2 * (unsigned) userSettings->MIN_DD_MAP_DISTANCE); if (!contains_subseq_any_strand(mapped_consensus, chromosome->getSeq().substr(FE_window_start, FE_window_size), MIN_CONSENSUS_LENGTH)) { MEI_breakpoint bp(cluster_tid, bio_bp, cluster_strand); bp.associated_split_reads = (*map_iter).second; // Link associated discordant reads (all reads from cluster) and split reads. std::vector<simple_read*>::iterator read_iter; for (read_iter = cluster.begin(); read_iter != cluster.end(); ++read_iter) { bp.associated_reads.push_back(*(*read_iter)); } breakpoints.push_back(bp); } } }